[PATCH v1 1/1] perf test: Simplify metric value validation test final report

From: weilin . wang
Date: Tue Jan 30 2024 - 13:09:22 EST


From: Weilin Wang <weilin.wang@xxxxxxxxx>

The original test report was too complicated to read with information
that not really useful. This new update simplify the report which should
largely improve the readibility.

Signed-off-by: Weilin Wang <weilin.wang@xxxxxxxxx>
---
.../tests/shell/lib/perf_metric_validation.py | 231 ++++++++++--------
tools/perf/tests/shell/stat_metrics_values.sh | 4 +-
2 files changed, 127 insertions(+), 108 deletions(-)

diff --git a/tools/perf/tests/shell/lib/perf_metric_validation.py b/tools/perf/tests/shell/lib/perf_metric_validation.py
index 50a34a9cc040..a2d235252183 100644
--- a/tools/perf/tests/shell/lib/perf_metric_validation.py
+++ b/tools/perf/tests/shell/lib/perf_metric_validation.py
@@ -1,4 +1,4 @@
-#SPDX-License-Identifier: GPL-2.0
+# SPDX-License-Identifier: GPL-2.0
import re
import csv
import json
@@ -6,36 +6,61 @@ import argparse
from pathlib import Path
import subprocess

+
+class TestError:
+ def __init__(self, metric: list[str], wl: str, value: list[float], low: float, up=float('nan'), description=str()):
+ self.metric: list = metric # multiple metrics in relationship type tests
+ self.workloads = [wl] # multiple workloads possible
+ self.collectedValue: list = value
+ self.valueLowBound = low
+ self.valueUpBound = up
+ self.description = description
+
+ def __repr__(self) -> str:
+ if len(self.metric) > 1:
+ return "\nMetric Relationship Error: \tThe collected value of metric {0}\n\
+ \tis {1} in workload(s): {2} \n\
+ \tbut expected value range is [{3}, {4}]\n\
+ \tRelationship rule description: \'{5}\'".format(self.metric, self.collectedValue, self.workloads,
+ self.valueLowBound, self.valueUpBound, self.description)
+ elif len(self.collectedValue) == 0:
+ return "\nNo Metric Value Error: \tMetric {0} returns with no value \n\
+ \tworkload(s): {1}".format(self.metric, self.workloads)
+ else:
+ return "\nWrong Metric Value Error: \tThe collected value of metric {0}\n\
+ \tis {1} in workload(s): {2}\n\
+ \tbut expected value range is [{3}, {4}]"\
+ .format(self.metric, self.collectedValue, self.workloads,
+ self.valueLowBound, self.valueUpBound)
+
+
class Validator:
def __init__(self, rulefname, reportfname='', t=5, debug=False, datafname='', fullrulefname='', workload='true', metrics=''):
self.rulefname = rulefname
self.reportfname = reportfname
self.rules = None
- self.collectlist:str = metrics
+ self.collectlist: str = metrics
self.metrics = self.__set_metrics(metrics)
self.skiplist = set()
self.tolerance = t

self.workloads = [x for x in workload.split(",") if x]
- self.wlidx = 0 # idx of current workloads
- self.allresults = dict() # metric results of all workload
- self.allignoremetrics = dict() # metrics with no results or negative results
- self.allfailtests = dict()
+ self.wlidx = 0 # idx of current workloads
+ self.allresults = dict() # metric results of all workload
self.alltotalcnt = dict()
self.allpassedcnt = dict()
- self.allerrlist = dict()

- self.results = dict() # metric results of current workload
+ self.results = dict() # metric results of current workload
# vars for test pass/failure statistics
- self.ignoremetrics= set() # metrics with no results or negative results, neg result counts as a failed test
- self.failtests = dict()
+ # metrics with no results or negative results, neg result counts failed tests
+ self.ignoremetrics = set()
self.totalcnt = 0
self.passedcnt = 0
# vars for errors
self.errlist = list()

# vars for Rule Generator
- self.pctgmetrics = set() # Percentage rule
+ self.pctgmetrics = set() # Percentage rule

# vars for debug
self.datafname = datafname
@@ -69,10 +94,10 @@ class Validator:
ensure_ascii=True,
indent=4)

- def get_results(self, idx:int = 0):
+ def get_results(self, idx: int = 0):
return self.results[idx]

- def get_bounds(self, lb, ub, error, alias={}, ridx:int = 0) -> list:
+ def get_bounds(self, lb, ub, error, alias={}, ridx: int = 0) -> list:
"""
Get bounds and tolerance from lb, ub, and error.
If missing lb, use 0.0; missing ub, use float('inf); missing error, use self.tolerance.
@@ -85,7 +110,7 @@ class Validator:
tolerance, denormalized base on upper bound value
"""
# init ubv and lbv to invalid values
- def get_bound_value (bound, initval, ridx):
+ def get_bound_value(bound, initval, ridx):
val = initval
if isinstance(bound, int) or isinstance(bound, float):
val = bound
@@ -113,10 +138,10 @@ class Validator:

return lbv, ubv, denormerr

- def get_value(self, name:str, ridx:int = 0) -> list:
+ def get_value(self, name: str, ridx: int = 0) -> list:
"""
Get value of the metric from self.results.
- If result of this metric is not provided, the metric name will be added into self.ignoremetics and self.errlist.
+ If result of this metric is not provided, the metric name will be added into self.ignoremetics.
All future test(s) on this metric will fail.

@param name: name of the metric
@@ -142,7 +167,7 @@ class Validator:
Check if metrics value are non-negative.
One metric is counted as one test.
Failure: when metric value is negative or not provided.
- Metrics with negative value will be added into the self.failtests['PositiveValueTest'] and self.ignoremetrics.
+ Metrics with negative value will be added into self.ignoremetrics.
"""
negmetric = dict()
pcnt = 0
@@ -155,25 +180,27 @@ class Validator:
else:
pcnt += 1
tcnt += 1
+ # The first round collect_perf() run these metrics with simple workload
+ # "true". We give metrics a second chance with a longer workload if less
+ # than 20 metrics failed positive test.
if len(rerun) > 0 and len(rerun) < 20:
second_results = dict()
self.second_test(rerun, second_results)
for name, val in second_results.items():
- if name not in negmetric: continue
+ if name not in negmetric:
+ continue
if val >= 0:
del negmetric[name]
pcnt += 1

- self.failtests['PositiveValueTest']['Total Tests'] = tcnt
- self.failtests['PositiveValueTest']['Passed Tests'] = pcnt
if len(negmetric.keys()):
self.ignoremetrics.update(negmetric.keys())
- negmessage = ["{0}(={1:.4f})".format(name, val) for name, val in negmetric.items()]
- self.failtests['PositiveValueTest']['Failed Tests'].append({'NegativeValue': negmessage})
+ self.errlist.extend(
+ [TestError([m], self.workloads[self.wlidx], negmetric[m], 0) for m in negmetric.keys()])

return

- def evaluate_formula(self, formula:str, alias:dict, ridx:int = 0):
+ def evaluate_formula(self, formula: str, alias: dict, ridx: int = 0):
"""
Evaluate the value of formula.

@@ -187,10 +214,11 @@ class Validator:
sign = "+"
f = str()

- #TODO: support parenthesis?
+ # TODO: support parenthesis?
for i in range(len(formula)):
if i+1 == len(formula) or formula[i] in ('+', '-', '*', '/'):
- s = alias[formula[b:i]] if i+1 < len(formula) else alias[formula[b:]]
+ s = alias[formula[b:i]] if i + \
+ 1 < len(formula) else alias[formula[b:]]
v = self.get_value(s, ridx)
if not v:
errs.append(s)
@@ -228,49 +256,49 @@ class Validator:
alias = dict()
for m in rule['Metrics']:
alias[m['Alias']] = m['Name']
- lbv, ubv, t = self.get_bounds(rule['RangeLower'], rule['RangeUpper'], rule['ErrorThreshold'], alias, ridx=rule['RuleIndex'])
- val, f = self.evaluate_formula(rule['Formula'], alias, ridx=rule['RuleIndex'])
+ lbv, ubv, t = self.get_bounds(
+ rule['RangeLower'], rule['RangeUpper'], rule['ErrorThreshold'], alias, ridx=rule['RuleIndex'])
+ val, f = self.evaluate_formula(
+ rule['Formula'], alias, ridx=rule['RuleIndex'])
+
+ lb = rule['RangeLower']
+ ub = rule['RangeUpper']
+ if isinstance(lb, str):
+ if lb in alias:
+ lb = alias[lb]
+ if isinstance(ub, str):
+ if ub in alias:
+ ub = alias[ub]
+
if val == -1:
- self.failtests['RelationshipTest']['Failed Tests'].append({'RuleIndex': rule['RuleIndex'], 'Description':f})
+ self.errlist.append(TestError([m['Name'] for m in rule['Metrics']], self.workloads[self.wlidx], [],
+ lb, ub, rule['Description']))
elif not self.check_bound(val, lbv, ubv, t):
- lb = rule['RangeLower']
- ub = rule['RangeUpper']
- if isinstance(lb, str):
- if lb in alias:
- lb = alias[lb]
- if isinstance(ub, str):
- if ub in alias:
- ub = alias[ub]
- self.failtests['RelationshipTest']['Failed Tests'].append({'RuleIndex': rule['RuleIndex'], 'Formula':f,
- 'RangeLower': lb, 'LowerBoundValue': self.get_value(lb),
- 'RangeUpper': ub, 'UpperBoundValue':self.get_value(ub),
- 'ErrorThreshold': t, 'CollectedValue': val})
+ self.errlist.append(TestError([m['Name'] for m in rule['Metrics']], self.workloads[self.wlidx], [val],
+ lb, ub, rule['Description']))
else:
self.passedcnt += 1
- self.failtests['RelationshipTest']['Passed Tests'] += 1
self.totalcnt += 1
- self.failtests['RelationshipTest']['Total Tests'] += 1

return

-
# Single Metric Test
- def single_test(self, rule:dict):
+ def single_test(self, rule: dict):
"""
Validate if the metrics are in the required value range.
eg. lower_bound <= metrics_value <= upper_bound
One metric is counted as one test in this type of test.
One rule may include one or more metrics.
Failure: when the metric value not provided or the value is outside the bounds.
- This test updates self.total_cnt and records failed tests in self.failtest['SingleMetricTest'].
+ This test updates self.total_cnt.

@param rule: dict with metrics to validate and the value range requirement
"""
- lbv, ubv, t = self.get_bounds(rule['RangeLower'], rule['RangeUpper'], rule['ErrorThreshold'])
+ lbv, ubv, t = self.get_bounds(
+ rule['RangeLower'], rule['RangeUpper'], rule['ErrorThreshold'])
metrics = rule['Metrics']
passcnt = 0
totalcnt = 0
- faillist = list()
failures = dict()
rerun = list()
for m in metrics:
@@ -286,25 +314,20 @@ class Validator:
second_results = dict()
self.second_test(rerun, second_results)
for name, val in second_results.items():
- if name not in failures: continue
+ if name not in failures:
+ continue
if self.check_bound(val, lbv, ubv, t):
passcnt += 1
del failures[name]
else:
- failures[name] = val
+ failures[name] = [val]
self.results[0][name] = val

self.totalcnt += totalcnt
self.passedcnt += passcnt
- self.failtests['SingleMetricTest']['Total Tests'] += totalcnt
- self.failtests['SingleMetricTest']['Passed Tests'] += passcnt
if len(failures.keys()) != 0:
- faillist = [{'MetricName':name, 'CollectedValue':val} for name, val in failures.items()]
- self.failtests['SingleMetricTest']['Failed Tests'].append({'RuleIndex':rule['RuleIndex'],
- 'RangeLower': rule['RangeLower'],
- 'RangeUpper': rule['RangeUpper'],
- 'ErrorThreshold':rule['ErrorThreshold'],
- 'Failure':faillist})
+ self.errlist.extend([TestError([name], self.workloads[self.wlidx], val,
+ rule['RangeLower'], rule['RangeUpper']) for name, val in failures.items()])

return

@@ -312,19 +335,11 @@ class Validator:
"""
Create final report and write into a JSON file.
"""
- alldata = list()
- for i in range(0, len(self.workloads)):
- reportstas = {"Total Rule Count": self.alltotalcnt[i], "Passed Rule Count": self.allpassedcnt[i]}
- data = {"Metric Validation Statistics": reportstas, "Tests in Category": self.allfailtests[i],
- "Errors":self.allerrlist[i]}
- alldata.append({"Workload": self.workloads[i], "Report": data})
-
- json_str = json.dumps(alldata, indent=4)
- print("Test validation finished. Final report: ")
- print(json_str)
+ print(self.errlist)

if self.debug:
- allres = [{"Workload": self.workloads[i], "Results": self.allresults[i]} for i in range(0, len(self.workloads))]
+ allres = [{"Workload": self.workloads[i], "Results": self.allresults[i]}
+ for i in range(0, len(self.workloads))]
self.json_dump(allres, self.datafname)

def check_rule(self, testtype, metric_list):
@@ -342,13 +357,13 @@ class Validator:
return True

# Start of Collector and Converter
- def convert(self, data: list, metricvalues:dict):
+ def convert(self, data: list, metricvalues: dict):
"""
Convert collected metric data from the -j output to dict of {metric_name:value}.
"""
for json_string in data:
try:
- result =json.loads(json_string)
+ result = json.loads(json_string)
if "metric-unit" in result and result["metric-unit"] != "(null)" and result["metric-unit"] != "":
name = result["metric-unit"].split(" ")[1] if len(result["metric-unit"].split(" ")) > 1 \
else result["metric-unit"]
@@ -365,9 +380,10 @@ class Validator:
print(" ".join(command))
cmd = subprocess.run(command, stderr=subprocess.PIPE, encoding='utf-8')
data = [x+'}' for x in cmd.stderr.split('}\n') if x]
+ if data[0][0] != '{':
+ data[0] = data[0][data[0].find('{'):]
return data

-
def collect_perf(self, workload: str):
"""
Collect metric data with "perf stat -M" on given workload with -a and -j.
@@ -385,14 +401,18 @@ class Validator:
if rule["TestType"] == "RelationshipTest":
metrics = [m["Name"] for m in rule["Metrics"]]
if not any(m not in collectlist[0] for m in metrics):
- collectlist[rule["RuleIndex"]] = [",".join(list(set(metrics)))]
+ collectlist[rule["RuleIndex"]] = [
+ ",".join(list(set(metrics)))]

for idx, metrics in collectlist.items():
- if idx == 0: wl = "true"
- else: wl = workload
+ if idx == 0:
+ wl = "true"
+ else:
+ wl = workload
for metric in metrics:
data = self._run_perf(metric, wl)
- if idx not in self.results: self.results[idx] = dict()
+ if idx not in self.results:
+ self.results[idx] = dict()
self.convert(data, self.results[idx])
return

@@ -412,7 +432,8 @@ class Validator:
2) create metric name list
"""
command = ['perf', 'list', '-j', '--details', 'metrics']
- cmd = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8')
+ cmd = subprocess.run(command, stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE, encoding='utf-8')
try:
data = json.loads(cmd.stdout)
for m in data:
@@ -453,12 +474,12 @@ class Validator:
rules = data['RelationshipRules']
self.skiplist = set([name.lower() for name in data['SkipList']])
self.rules = self.remove_unsupported_rules(rules)
- pctgrule = {'RuleIndex':0,
- 'TestType':'SingleMetricTest',
- 'RangeLower':'0',
+ pctgrule = {'RuleIndex': 0,
+ 'TestType': 'SingleMetricTest',
+ 'RangeLower': '0',
'RangeUpper': '100',
'ErrorThreshold': self.tolerance,
- 'Description':'Metrics in percent unit have value with in [0, 100]',
+ 'Description': 'Metrics in percent unit have value with in [0, 100]',
'Metrics': [{'Name': m.lower()} for m in self.pctgmetrics]}
self.rules.append(pctgrule)

@@ -469,8 +490,9 @@ class Validator:
idx += 1

if self.debug:
- #TODO: need to test and generate file name correctly
- data = {'RelationshipRules':self.rules, 'SupportedMetrics': [{"MetricName": name} for name in self.metrics]}
+ # TODO: need to test and generate file name correctly
+ data = {'RelationshipRules': self.rules, 'SupportedMetrics': [
+ {"MetricName": name} for name in self.metrics]}
self.json_dump(data, self.fullrulefname)

return
@@ -482,20 +504,17 @@ class Validator:
@param key: key to the dictionaries (index of self.workloads).
'''
self.allresults[key] = self.results
- self.allignoremetrics[key] = self.ignoremetrics
- self.allfailtests[key] = self.failtests
self.alltotalcnt[key] = self.totalcnt
self.allpassedcnt[key] = self.passedcnt
- self.allerrlist[key] = self.errlist

- #Initialize data structures before data validation of each workload
+ # Initialize data structures before data validation of each workload
def _init_data(self):

- testtypes = ['PositiveValueTest', 'RelationshipTest', 'SingleMetricTest']
+ testtypes = ['PositiveValueTest',
+ 'RelationshipTest', 'SingleMetricTest']
self.results = dict()
- self.ignoremetrics= set()
+ self.ignoremetrics = set()
self.errlist = list()
- self.failtests = {k:{'Total Tests':0, 'Passed Tests':0, 'Failed Tests':[]} for k in testtypes}
self.totalcnt = 0
self.passedcnt = 0

@@ -525,32 +544,33 @@ class Validator:
testtype = r['TestType']
if not self.check_rule(testtype, r['Metrics']):
continue
- if testtype == 'RelationshipTest':
+ if testtype == 'RelationshipTest':
self.relationship_test(r)
elif testtype == 'SingleMetricTest':
self.single_test(r)
else:
print("Unsupported Test Type: ", testtype)
- self.errlist.append("Unsupported Test Type from rule: " + r['RuleIndex'])
- self._storewldata(i)
print("Workload: ", self.workloads[i])
- print("Total metrics collected: ", self.failtests['PositiveValueTest']['Total Tests'])
- print("Non-negative metric count: ", self.failtests['PositiveValueTest']['Passed Tests'])
print("Total Test Count: ", self.totalcnt)
print("Passed Test Count: ", self.passedcnt)
-
+ self._storewldata(i)
self.create_report()
- return sum(self.alltotalcnt.values()) != sum(self.allpassedcnt.values())
+ return len(self.errlist) > 0
# End of Class Validator


def main() -> None:
- parser = argparse.ArgumentParser(description="Launch metric value validation")
-
- parser.add_argument("-rule", help="Base validation rule file", required=True)
- parser.add_argument("-output_dir", help="Path for validator output file, report file", required=True)
- parser.add_argument("-debug", help="Debug run, save intermediate data to files", action="store_true", default=False)
- parser.add_argument("-wl", help="Workload to run while data collection", default="true")
+ parser = argparse.ArgumentParser(
+ description="Launch metric value validation")
+
+ parser.add_argument(
+ "-rule", help="Base validation rule file", required=True)
+ parser.add_argument(
+ "-output_dir", help="Path for validator output file, report file", required=True)
+ parser.add_argument("-debug", help="Debug run, save intermediate data to files",
+ action="store_true", default=False)
+ parser.add_argument(
+ "-wl", help="Workload to run while data collection", default="true")
parser.add_argument("-m", help="Metric list to validate", default="")
args = parser.parse_args()
outpath = Path(args.output_dir)
@@ -559,8 +579,8 @@ def main() -> None:
datafile = Path.joinpath(outpath, 'perf_data.json')

validator = Validator(args.rule, reportf, debug=args.debug,
- datafname=datafile, fullrulefname=fullrule, workload=args.wl,
- metrics=args.m)
+ datafname=datafile, fullrulefname=fullrule, workload=args.wl,
+ metrics=args.m)
ret = validator.test()

return ret
@@ -569,6 +589,3 @@ def main() -> None:
if __name__ == "__main__":
import sys
sys.exit(main())
-
-
-
diff --git a/tools/perf/tests/shell/stat_metrics_values.sh b/tools/perf/tests/shell/stat_metrics_values.sh
index 7ca172599aa6..279f19c5919a 100755
--- a/tools/perf/tests/shell/stat_metrics_values.sh
+++ b/tools/perf/tests/shell/stat_metrics_values.sh
@@ -19,6 +19,8 @@ echo "Output will be stored in: $tmpdir"
$PYTHON $pythonvalidator -rule $rulefile -output_dir $tmpdir -wl "${workload}"
ret=$?
rm -rf $tmpdir
-
+if [ $ret -ne 0 ]; then
+ echo "Metric validation return with erros. Please check metrics reported with errors."
+fi
exit $ret

--
2.42.0