Re: [PATCH v1 1/1] perf test: Simplify metric value validation test final report

From: Ian Rogers
Date: Tue Jan 30 2024 - 20:49:43 EST


On Tue, Jan 30, 2024 at 10:09 AM <weilin.wang@xxxxxxxxx> wrote:
>
> From: Weilin Wang <weilin.wang@xxxxxxxxx>
>
> The original test report was too complicated to read with information
> that not really useful. This new update simplify the report which should
> largely improve the readibility.
>
> Signed-off-by: Weilin Wang <weilin.wang@xxxxxxxxx>

Reviewed-by: Ian Rogers <irogers@xxxxxxxxxx>

Thanks,
Ian

> ---
> .../tests/shell/lib/perf_metric_validation.py | 231 ++++++++++--------
> tools/perf/tests/shell/stat_metrics_values.sh | 4 +-
> 2 files changed, 127 insertions(+), 108 deletions(-)
>
> diff --git a/tools/perf/tests/shell/lib/perf_metric_validation.py b/tools/perf/tests/shell/lib/perf_metric_validation.py
> index 50a34a9cc040..a2d235252183 100644
> --- a/tools/perf/tests/shell/lib/perf_metric_validation.py
> +++ b/tools/perf/tests/shell/lib/perf_metric_validation.py
> @@ -1,4 +1,4 @@
> -#SPDX-License-Identifier: GPL-2.0
> +# SPDX-License-Identifier: GPL-2.0
> import re
> import csv
> import json
> @@ -6,36 +6,61 @@ import argparse
> from pathlib import Path
> import subprocess
>
> +
> +class TestError:
> + def __init__(self, metric: list[str], wl: str, value: list[float], low: float, up=float('nan'), description=str()):
> + self.metric: list = metric # multiple metrics in relationship type tests
> + self.workloads = [wl] # multiple workloads possible
> + self.collectedValue: list = value
> + self.valueLowBound = low
> + self.valueUpBound = up
> + self.description = description
> +
> + def __repr__(self) -> str:
> + if len(self.metric) > 1:
> + return "\nMetric Relationship Error: \tThe collected value of metric {0}\n\
> + \tis {1} in workload(s): {2} \n\
> + \tbut expected value range is [{3}, {4}]\n\
> + \tRelationship rule description: \'{5}\'".format(self.metric, self.collectedValue, self.workloads,
> + self.valueLowBound, self.valueUpBound, self.description)
> + elif len(self.collectedValue) == 0:
> + return "\nNo Metric Value Error: \tMetric {0} returns with no value \n\
> + \tworkload(s): {1}".format(self.metric, self.workloads)
> + else:
> + return "\nWrong Metric Value Error: \tThe collected value of metric {0}\n\
> + \tis {1} in workload(s): {2}\n\
> + \tbut expected value range is [{3}, {4}]"\
> + .format(self.metric, self.collectedValue, self.workloads,
> + self.valueLowBound, self.valueUpBound)
> +
> +
> class Validator:
> def __init__(self, rulefname, reportfname='', t=5, debug=False, datafname='', fullrulefname='', workload='true', metrics=''):
> self.rulefname = rulefname
> self.reportfname = reportfname
> self.rules = None
> - self.collectlist:str = metrics
> + self.collectlist: str = metrics
> self.metrics = self.__set_metrics(metrics)
> self.skiplist = set()
> self.tolerance = t
>
> self.workloads = [x for x in workload.split(",") if x]
> - self.wlidx = 0 # idx of current workloads
> - self.allresults = dict() # metric results of all workload
> - self.allignoremetrics = dict() # metrics with no results or negative results
> - self.allfailtests = dict()
> + self.wlidx = 0 # idx of current workloads
> + self.allresults = dict() # metric results of all workload
> self.alltotalcnt = dict()
> self.allpassedcnt = dict()
> - self.allerrlist = dict()
>
> - self.results = dict() # metric results of current workload
> + self.results = dict() # metric results of current workload
> # vars for test pass/failure statistics
> - self.ignoremetrics= set() # metrics with no results or negative results, neg result counts as a failed test
> - self.failtests = dict()
> + # metrics with no results or negative results, neg result counts failed tests
> + self.ignoremetrics = set()
> self.totalcnt = 0
> self.passedcnt = 0
> # vars for errors
> self.errlist = list()
>
> # vars for Rule Generator
> - self.pctgmetrics = set() # Percentage rule
> + self.pctgmetrics = set() # Percentage rule
>
> # vars for debug
> self.datafname = datafname
> @@ -69,10 +94,10 @@ class Validator:
> ensure_ascii=True,
> indent=4)
>
> - def get_results(self, idx:int = 0):
> + def get_results(self, idx: int = 0):
> return self.results[idx]
>
> - def get_bounds(self, lb, ub, error, alias={}, ridx:int = 0) -> list:
> + def get_bounds(self, lb, ub, error, alias={}, ridx: int = 0) -> list:
> """
> Get bounds and tolerance from lb, ub, and error.
> If missing lb, use 0.0; missing ub, use float('inf); missing error, use self.tolerance.
> @@ -85,7 +110,7 @@ class Validator:
> tolerance, denormalized base on upper bound value
> """
> # init ubv and lbv to invalid values
> - def get_bound_value (bound, initval, ridx):
> + def get_bound_value(bound, initval, ridx):
> val = initval
> if isinstance(bound, int) or isinstance(bound, float):
> val = bound
> @@ -113,10 +138,10 @@ class Validator:
>
> return lbv, ubv, denormerr
>
> - def get_value(self, name:str, ridx:int = 0) -> list:
> + def get_value(self, name: str, ridx: int = 0) -> list:
> """
> Get value of the metric from self.results.
> - If result of this metric is not provided, the metric name will be added into self.ignoremetics and self.errlist.
> + If result of this metric is not provided, the metric name will be added into self.ignoremetics.
> All future test(s) on this metric will fail.
>
> @param name: name of the metric
> @@ -142,7 +167,7 @@ class Validator:
> Check if metrics value are non-negative.
> One metric is counted as one test.
> Failure: when metric value is negative or not provided.
> - Metrics with negative value will be added into the self.failtests['PositiveValueTest'] and self.ignoremetrics.
> + Metrics with negative value will be added into self.ignoremetrics.
> """
> negmetric = dict()
> pcnt = 0
> @@ -155,25 +180,27 @@ class Validator:
> else:
> pcnt += 1
> tcnt += 1
> + # The first round collect_perf() run these metrics with simple workload
> + # "true". We give metrics a second chance with a longer workload if less
> + # than 20 metrics failed positive test.
> if len(rerun) > 0 and len(rerun) < 20:
> second_results = dict()
> self.second_test(rerun, second_results)
> for name, val in second_results.items():
> - if name not in negmetric: continue
> + if name not in negmetric:
> + continue
> if val >= 0:
> del negmetric[name]
> pcnt += 1
>
> - self.failtests['PositiveValueTest']['Total Tests'] = tcnt
> - self.failtests['PositiveValueTest']['Passed Tests'] = pcnt
> if len(negmetric.keys()):
> self.ignoremetrics.update(negmetric.keys())
> - negmessage = ["{0}(={1:.4f})".format(name, val) for name, val in negmetric.items()]
> - self.failtests['PositiveValueTest']['Failed Tests'].append({'NegativeValue': negmessage})
> + self.errlist.extend(
> + [TestError([m], self.workloads[self.wlidx], negmetric[m], 0) for m in negmetric.keys()])
>
> return
>
> - def evaluate_formula(self, formula:str, alias:dict, ridx:int = 0):
> + def evaluate_formula(self, formula: str, alias: dict, ridx: int = 0):
> """
> Evaluate the value of formula.
>
> @@ -187,10 +214,11 @@ class Validator:
> sign = "+"
> f = str()
>
> - #TODO: support parenthesis?
> + # TODO: support parenthesis?
> for i in range(len(formula)):
> if i+1 == len(formula) or formula[i] in ('+', '-', '*', '/'):
> - s = alias[formula[b:i]] if i+1 < len(formula) else alias[formula[b:]]
> + s = alias[formula[b:i]] if i + \
> + 1 < len(formula) else alias[formula[b:]]
> v = self.get_value(s, ridx)
> if not v:
> errs.append(s)
> @@ -228,49 +256,49 @@ class Validator:
> alias = dict()
> for m in rule['Metrics']:
> alias[m['Alias']] = m['Name']
> - lbv, ubv, t = self.get_bounds(rule['RangeLower'], rule['RangeUpper'], rule['ErrorThreshold'], alias, ridx=rule['RuleIndex'])
> - val, f = self.evaluate_formula(rule['Formula'], alias, ridx=rule['RuleIndex'])
> + lbv, ubv, t = self.get_bounds(
> + rule['RangeLower'], rule['RangeUpper'], rule['ErrorThreshold'], alias, ridx=rule['RuleIndex'])
> + val, f = self.evaluate_formula(
> + rule['Formula'], alias, ridx=rule['RuleIndex'])
> +
> + lb = rule['RangeLower']
> + ub = rule['RangeUpper']
> + if isinstance(lb, str):
> + if lb in alias:
> + lb = alias[lb]
> + if isinstance(ub, str):
> + if ub in alias:
> + ub = alias[ub]
> +
> if val == -1:
> - self.failtests['RelationshipTest']['Failed Tests'].append({'RuleIndex': rule['RuleIndex'], 'Description':f})
> + self.errlist.append(TestError([m['Name'] for m in rule['Metrics']], self.workloads[self.wlidx], [],
> + lb, ub, rule['Description']))
> elif not self.check_bound(val, lbv, ubv, t):
> - lb = rule['RangeLower']
> - ub = rule['RangeUpper']
> - if isinstance(lb, str):
> - if lb in alias:
> - lb = alias[lb]
> - if isinstance(ub, str):
> - if ub in alias:
> - ub = alias[ub]
> - self.failtests['RelationshipTest']['Failed Tests'].append({'RuleIndex': rule['RuleIndex'], 'Formula':f,
> - 'RangeLower': lb, 'LowerBoundValue': self.get_value(lb),
> - 'RangeUpper': ub, 'UpperBoundValue':self.get_value(ub),
> - 'ErrorThreshold': t, 'CollectedValue': val})
> + self.errlist.append(TestError([m['Name'] for m in rule['Metrics']], self.workloads[self.wlidx], [val],
> + lb, ub, rule['Description']))
> else:
> self.passedcnt += 1
> - self.failtests['RelationshipTest']['Passed Tests'] += 1
> self.totalcnt += 1
> - self.failtests['RelationshipTest']['Total Tests'] += 1
>
> return
>
> -
> # Single Metric Test
> - def single_test(self, rule:dict):
> + def single_test(self, rule: dict):
> """
> Validate if the metrics are in the required value range.
> eg. lower_bound <= metrics_value <= upper_bound
> One metric is counted as one test in this type of test.
> One rule may include one or more metrics.
> Failure: when the metric value not provided or the value is outside the bounds.
> - This test updates self.total_cnt and records failed tests in self.failtest['SingleMetricTest'].
> + This test updates self.total_cnt.
>
> @param rule: dict with metrics to validate and the value range requirement
> """
> - lbv, ubv, t = self.get_bounds(rule['RangeLower'], rule['RangeUpper'], rule['ErrorThreshold'])
> + lbv, ubv, t = self.get_bounds(
> + rule['RangeLower'], rule['RangeUpper'], rule['ErrorThreshold'])
> metrics = rule['Metrics']
> passcnt = 0
> totalcnt = 0
> - faillist = list()
> failures = dict()
> rerun = list()
> for m in metrics:
> @@ -286,25 +314,20 @@ class Validator:
> second_results = dict()
> self.second_test(rerun, second_results)
> for name, val in second_results.items():
> - if name not in failures: continue
> + if name not in failures:
> + continue
> if self.check_bound(val, lbv, ubv, t):
> passcnt += 1
> del failures[name]
> else:
> - failures[name] = val
> + failures[name] = [val]
> self.results[0][name] = val
>
> self.totalcnt += totalcnt
> self.passedcnt += passcnt
> - self.failtests['SingleMetricTest']['Total Tests'] += totalcnt
> - self.failtests['SingleMetricTest']['Passed Tests'] += passcnt
> if len(failures.keys()) != 0:
> - faillist = [{'MetricName':name, 'CollectedValue':val} for name, val in failures.items()]
> - self.failtests['SingleMetricTest']['Failed Tests'].append({'RuleIndex':rule['RuleIndex'],
> - 'RangeLower': rule['RangeLower'],
> - 'RangeUpper': rule['RangeUpper'],
> - 'ErrorThreshold':rule['ErrorThreshold'],
> - 'Failure':faillist})
> + self.errlist.extend([TestError([name], self.workloads[self.wlidx], val,
> + rule['RangeLower'], rule['RangeUpper']) for name, val in failures.items()])
>
> return
>
> @@ -312,19 +335,11 @@ class Validator:
> """
> Create final report and write into a JSON file.
> """
> - alldata = list()
> - for i in range(0, len(self.workloads)):
> - reportstas = {"Total Rule Count": self.alltotalcnt[i], "Passed Rule Count": self.allpassedcnt[i]}
> - data = {"Metric Validation Statistics": reportstas, "Tests in Category": self.allfailtests[i],
> - "Errors":self.allerrlist[i]}
> - alldata.append({"Workload": self.workloads[i], "Report": data})
> -
> - json_str = json.dumps(alldata, indent=4)
> - print("Test validation finished. Final report: ")
> - print(json_str)
> + print(self.errlist)
>
> if self.debug:
> - allres = [{"Workload": self.workloads[i], "Results": self.allresults[i]} for i in range(0, len(self.workloads))]
> + allres = [{"Workload": self.workloads[i], "Results": self.allresults[i]}
> + for i in range(0, len(self.workloads))]
> self.json_dump(allres, self.datafname)
>
> def check_rule(self, testtype, metric_list):
> @@ -342,13 +357,13 @@ class Validator:
> return True
>
> # Start of Collector and Converter
> - def convert(self, data: list, metricvalues:dict):
> + def convert(self, data: list, metricvalues: dict):
> """
> Convert collected metric data from the -j output to dict of {metric_name:value}.
> """
> for json_string in data:
> try:
> - result =json.loads(json_string)
> + result = json.loads(json_string)
> if "metric-unit" in result and result["metric-unit"] != "(null)" and result["metric-unit"] != "":
> name = result["metric-unit"].split(" ")[1] if len(result["metric-unit"].split(" ")) > 1 \
> else result["metric-unit"]
> @@ -365,9 +380,10 @@ class Validator:
> print(" ".join(command))
> cmd = subprocess.run(command, stderr=subprocess.PIPE, encoding='utf-8')
> data = [x+'}' for x in cmd.stderr.split('}\n') if x]
> + if data[0][0] != '{':
> + data[0] = data[0][data[0].find('{'):]
> return data
>
> -
> def collect_perf(self, workload: str):
> """
> Collect metric data with "perf stat -M" on given workload with -a and -j.
> @@ -385,14 +401,18 @@ class Validator:
> if rule["TestType"] == "RelationshipTest":
> metrics = [m["Name"] for m in rule["Metrics"]]
> if not any(m not in collectlist[0] for m in metrics):
> - collectlist[rule["RuleIndex"]] = [",".join(list(set(metrics)))]
> + collectlist[rule["RuleIndex"]] = [
> + ",".join(list(set(metrics)))]
>
> for idx, metrics in collectlist.items():
> - if idx == 0: wl = "true"
> - else: wl = workload
> + if idx == 0:
> + wl = "true"
> + else:
> + wl = workload
> for metric in metrics:
> data = self._run_perf(metric, wl)
> - if idx not in self.results: self.results[idx] = dict()
> + if idx not in self.results:
> + self.results[idx] = dict()
> self.convert(data, self.results[idx])
> return
>
> @@ -412,7 +432,8 @@ class Validator:
> 2) create metric name list
> """
> command = ['perf', 'list', '-j', '--details', 'metrics']
> - cmd = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8')
> + cmd = subprocess.run(command, stdout=subprocess.PIPE,
> + stderr=subprocess.PIPE, encoding='utf-8')
> try:
> data = json.loads(cmd.stdout)
> for m in data:
> @@ -453,12 +474,12 @@ class Validator:
> rules = data['RelationshipRules']
> self.skiplist = set([name.lower() for name in data['SkipList']])
> self.rules = self.remove_unsupported_rules(rules)
> - pctgrule = {'RuleIndex':0,
> - 'TestType':'SingleMetricTest',
> - 'RangeLower':'0',
> + pctgrule = {'RuleIndex': 0,
> + 'TestType': 'SingleMetricTest',
> + 'RangeLower': '0',
> 'RangeUpper': '100',
> 'ErrorThreshold': self.tolerance,
> - 'Description':'Metrics in percent unit have value with in [0, 100]',
> + 'Description': 'Metrics in percent unit have value with in [0, 100]',
> 'Metrics': [{'Name': m.lower()} for m in self.pctgmetrics]}
> self.rules.append(pctgrule)
>
> @@ -469,8 +490,9 @@ class Validator:
> idx += 1
>
> if self.debug:
> - #TODO: need to test and generate file name correctly
> - data = {'RelationshipRules':self.rules, 'SupportedMetrics': [{"MetricName": name} for name in self.metrics]}
> + # TODO: need to test and generate file name correctly
> + data = {'RelationshipRules': self.rules, 'SupportedMetrics': [
> + {"MetricName": name} for name in self.metrics]}
> self.json_dump(data, self.fullrulefname)
>
> return
> @@ -482,20 +504,17 @@ class Validator:
> @param key: key to the dictionaries (index of self.workloads).
> '''
> self.allresults[key] = self.results
> - self.allignoremetrics[key] = self.ignoremetrics
> - self.allfailtests[key] = self.failtests
> self.alltotalcnt[key] = self.totalcnt
> self.allpassedcnt[key] = self.passedcnt
> - self.allerrlist[key] = self.errlist
>
> - #Initialize data structures before data validation of each workload
> + # Initialize data structures before data validation of each workload
> def _init_data(self):
>
> - testtypes = ['PositiveValueTest', 'RelationshipTest', 'SingleMetricTest']
> + testtypes = ['PositiveValueTest',
> + 'RelationshipTest', 'SingleMetricTest']
> self.results = dict()
> - self.ignoremetrics= set()
> + self.ignoremetrics = set()
> self.errlist = list()
> - self.failtests = {k:{'Total Tests':0, 'Passed Tests':0, 'Failed Tests':[]} for k in testtypes}
> self.totalcnt = 0
> self.passedcnt = 0
>
> @@ -525,32 +544,33 @@ class Validator:
> testtype = r['TestType']
> if not self.check_rule(testtype, r['Metrics']):
> continue
> - if testtype == 'RelationshipTest':
> + if testtype == 'RelationshipTest':
> self.relationship_test(r)
> elif testtype == 'SingleMetricTest':
> self.single_test(r)
> else:
> print("Unsupported Test Type: ", testtype)
> - self.errlist.append("Unsupported Test Type from rule: " + r['RuleIndex'])
> - self._storewldata(i)
> print("Workload: ", self.workloads[i])
> - print("Total metrics collected: ", self.failtests['PositiveValueTest']['Total Tests'])
> - print("Non-negative metric count: ", self.failtests['PositiveValueTest']['Passed Tests'])
> print("Total Test Count: ", self.totalcnt)
> print("Passed Test Count: ", self.passedcnt)
> -
> + self._storewldata(i)
> self.create_report()
> - return sum(self.alltotalcnt.values()) != sum(self.allpassedcntvalues())
> + return len(self.errlist) > 0
> # End of Class Validator
>
>
> def main() -> None:
> - parser = argparse.ArgumentParser(description="Launch metric value validation")
> -
> - parser.add_argument("-rule", help="Base validation rule file", required=True)
> - parser.add_argument("-output_dir", help="Path for validator output file, report file", required=True)
> - parser.add_argument("-debug", help="Debug run, save intermediate data to files", action="store_true", default=False)
> - parser.add_argument("-wl", help="Workload to run while data collection", default="true")
> + parser = argparse.ArgumentParser(
> + description="Launch metric value validation")
> +
> + parser.add_argument(
> + "-rule", help="Base validation rule file", required=True)
> + parser.add_argument(
> + "-output_dir", help="Path for validator output file, report file", required=True)
> + parser.add_argument("-debug", help="Debug run, save intermediate data to files",
> + action="store_true", default=False)
> + parser.add_argument(
> + "-wl", help="Workload to run while data collection", default="true")
> parser.add_argument("-m", help="Metric list to validate", default="")
> args = parser.parse_args()
> outpath = Path(args.output_dir)
> @@ -559,8 +579,8 @@ def main() -> None:
> datafile = Path.joinpath(outpath, 'perf_data.json')
>
> validator = Validator(args.rule, reportf, debug=args.debug,
> - datafname=datafile, fullrulefname=fullrule, workload=args.wl,
> - metrics=args.m)
> + datafname=datafile, fullrulefname=fullrule, workload=args.wl,
> + metrics=args.m)
> ret = validator.test()
>
> return ret
> @@ -569,6 +589,3 @@ def main() -> None:
> if __name__ == "__main__":
> import sys
> sys.exit(main())
> -
> -
> -
> diff --git a/tools/perf/tests/shell/stat_metrics_values.sh b/tools/perf/tests/shell/stat_metrics_values.sh
> index 7ca172599aa6..279f19c5919a 100755
> --- a/tools/perf/tests/shell/stat_metrics_values.sh
> +++ b/tools/perf/tests/shell/stat_metrics_values.sh
> @@ -19,6 +19,8 @@ echo "Output will be stored in: $tmpdir"
> $PYTHON $pythonvalidator -rule $rulefile -output_dir $tmpdir -wl "${workload}"
> ret=$?
> rm -rf $tmpdir
> -
> +if [ $ret -ne 0 ]; then
> + echo "Metric validation return with erros. Please check metrics reported with errors."
> +fi
> exit $ret
>
> --
> 2.42.0
>