#!/usr/bin/env python3
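"""
Compare the results of multiple sets of benchmarks in LNT format.

Example invocations (the .lnt file names are illustrative placeholders):

    compare-benchmarks baseline.lnt candidate.lnt
    compare-benchmarks baseline.lnt candidate.lnt --format=chart -o comparison.html --open
    compare-benchmarks baseline.lnt cand1.lnt cand2.lnt --format=kpi --noise-threshold=0.05 --meta-candidate=MyPatch
"""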
import argparse
import functools
import pathlib
import re
import statistics
import sys
import tempfile

import numpy
import pandas
import plotly.express
import plotly.io
import tabulate

def parse_lnt(lines, aggregate=statistics.median):
| """ |
| Parse lines in LNT format and return a list of dictionnaries of the form: |
| |
| [ |
| { |
| 'benchmark': <benchmark1>, |
| <metric1>: float, |
| <metric2>: float, |
| ... |
| }, |
| { |
| 'benchmark': <benchmark2>, |
| <metric1>: float, |
| <metric2>: float, |
| ... |
| }, |
| ... |
| ] |
| |
| If a metric has multiple values associated to it, they are aggregated into a single |
| value using the provided aggregation function. |
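
    For example (with illustrative benchmark names), the following LNT input:

        BM_Sort.execution_time 12.0
        BM_Sort.execution_time 14.0
        BM_Find.execution_time 3.5

    is parsed, with the default median aggregation, into:

        [
            {'benchmark': 'BM_Sort', 'execution_time': 13.0},
            {'benchmark': 'BM_Find', 'execution_time': 3.5}
        ]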
| """ |
| results = {} |
| for line in lines: |
| line = line.strip() |
| if not line: |
| continue |
| |
| (identifier, value) = line.split(' ') |
| (benchmark, metric) = identifier.split('.') |
| if benchmark not in results: |
| results[benchmark] = {'benchmark': benchmark} |
| |
| entry = results[benchmark] |
| if metric not in entry: |
| entry[metric] = [] |
| entry[metric].append(float(value)) |
| |
| for (bm, entry) in results.items(): |
| for metric in entry: |
| if isinstance(entry[metric], list): |
| entry[metric] = aggregate(entry[metric]) |
| |
| return list(results.values()) |
| |
| def plain_text_comparison(data, metric, baseline_name=None, candidate_name=None): |
| """ |
| Create a tabulated comparison of the baseline and the candidate for the given metric. |
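
    The given data frame is expected to contain the columns `benchmark`, `<metric>_0` (the
    baseline), `<metric>_1` (the candidate), `difference` and `percent`, as computed in `main()`.
    A final `Geomean` row comparing the geometric means of the two series is appended to the table.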
| """ |
| data = data.replace(numpy.nan, None) # avoid NaNs in tabulate output |
| headers = ['Benchmark', baseline_name, candidate_name, 'Difference', '% Difference'] |
| fmt = (None, '.2f', '.2f', '.2f', '.2%') |
| table = data[['benchmark', f'{metric}_0', f'{metric}_1', 'difference', 'percent']] |
| |
| # Compute the geomean and report on their difference |
| geomean_0 = statistics.geometric_mean(data[f'{metric}_0'].dropna()) |
| geomean_1 = statistics.geometric_mean(data[f'{metric}_1'].dropna()) |
| geomean_row = ['Geomean', geomean_0, geomean_1, (geomean_1 - geomean_0), (geomean_1 - geomean_0) / geomean_0] |
| table.loc[table.index.max() + 1] = geomean_row |
| |
| return tabulate.tabulate(table.set_index('benchmark'), headers=headers, floatfmt=fmt, numalign='right') |
| |
| def create_chart(data, metric, subtitle=None, series_names=None): |
| """ |
| Create a bar chart comparing the given metric across the provided series. |
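
    The given data frame is expected to contain a `benchmark` column and one `<metric>_<i>` column
    per series; those columns are renamed to the corresponding `series_names` so that the legend
    of the chart displays human-readable series names.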
| """ |
| data = data.rename(columns={f'{metric}_{i}': series_names[i] for i in range(len(series_names))}) |
| title = ' vs '.join(series_names) |
| figure = plotly.express.bar(data, title=title, subtitle=subtitle, x='benchmark', y=series_names, barmode='group') |
| figure.update_layout(xaxis_title='', yaxis_title='', legend_title='') |
| return figure |
| |
| def produce_kpis(data, noise, extrema, series, series_names, meta_candidate, title): |
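    """
    Print a table of KPIs (as rows) for each candidate series (as columns), comparing it against
    the baseline series.

    For each candidate, the KPIs are the fraction of benchmarks that are faster, neutral or
    slower than the baseline given the noise threshold, and the fraction of worst/best
    performers, i.e. benchmarks whose relative difference is at least `extrema`. Starting with
    the second candidate, an additional `rel Δ` column reports the relative change of each KPI
    with respect to the previous candidate. The table is printed to stdout.
    """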
| addendum = f"{noise}% noise threshold, based on {len(data)} benchmarks" |
| top_addendum = f"by >= {extrema}%, {noise}% noise threshold, based on {len(data)} benchmarks" |
    headers = [title if title else '']
    columns = [[
        f'Benchmarks where {meta_candidate} is faster than {series_names[0]} ({addendum})',
        f'Neutral benchmarks ({addendum})',
        f'Benchmarks where {meta_candidate} is slower than {series_names[0]} ({addendum})',
        f'Worst performers ({top_addendum})',
        f'Best performers ({top_addendum})',
    ]]
    fmt = [None]

    def compute_kpis(base, cand):
        diff = data[cand] - data[base]
        pct = diff / data[base]
        faster = data[(data[base] > data[cand]) & (pct.abs() > noise)]
        neutral = data[pct.abs() <= noise]
        slower = data[(data[base] < data[cand]) & (pct.abs() > noise)]
        worst = data[(data[base] < data[cand]) & (pct.abs() >= extrema)]
        best = data[(data[base] > data[cand]) & (pct.abs() >= extrema)]
        return list(map(lambda k: len(k) / len(data), [faster, neutral, slower, worst, best]))

    baseline = series[0]
    for (i, candidate) in enumerate(series[1:], start=1):
        kpis = compute_kpis(baseline, candidate)
        if i > 1: # after the first candidate, also report the relative difference with the previous candidate
            previous_kpis = columns[-1]
            rel_deltas = [(kpis[k] - previous_kpis[k]) / previous_kpis[k] for k in range(len(kpis))]
            headers.append('rel Δ')
            columns.append(rel_deltas)
            fmt.append('+.2%')

        headers.append(series_names[i])
        columns.append(kpis)
        fmt.append('.2%')

    rows = list(zip(*columns))
    print(tabulate.tabulate(rows, headers=headers, floatfmt=fmt))

def main(argv):
    parser = argparse.ArgumentParser(
        prog='compare-benchmarks',
        description='Compare the results of multiple sets of benchmarks in LNT format.',
        epilog='This script depends on the modules listed in `libcxx/utils/requirements.txt`.')
    parser.add_argument('files', type=argparse.FileType('r'), nargs='+',
        help='Paths to LNT-format files containing the benchmark results to compare. With the `text` '
             'output format, exactly two files must be compared.')
    parser.add_argument('--output', '-o', type=pathlib.Path, required=False,
        help='Path of the file where the resulting comparison is written. If the output format is `text`, '
             'this defaults to stdout. If the output format is `chart`, this defaults to a temporary file '
             'which is opened automatically once generated, but not removed after creation.')
    parser.add_argument('--metric', type=str, default='execution_time',
        help='The metric to compare. LNT data may contain multiple metrics (e.g. code size, execution time, etc.) -- '
             'this option allows selecting which metric is being analyzed. The default is `execution_time`.')
    parser.add_argument('--filter', type=str, required=False,
        help='An optional regular expression used to filter the benchmarks included in the comparison. '
             'Only benchmarks whose names match the regular expression will be included.')
    parser.add_argument('--sort', type=str, required=False, default='benchmark',
        choices=['benchmark', 'baseline', 'candidate', 'percent_diff'],
        help='Optional sorting criterion for displaying the results. By default, results are displayed in '
             'alphabetical order of the benchmark name. Supported sorting criteria are: '
             '`benchmark` (sort by the alphabetical name of the benchmark), '
             '`baseline` (sort by the absolute value of the baseline run), '
             '`candidate` (sort by the absolute value of the candidate run), '
             'and `percent_diff` (sort by the percent difference between the baseline and the candidate). '
             'Note that when more than two input files are compared, the only valid sorting order is `benchmark`.')
    parser.add_argument('--format', type=str, choices=['text', 'chart', 'kpi'], default='text',
        help='Select the output format. `text` generates a plain-text comparison in tabular form, `chart` '
             'generates a self-contained HTML graph that can be opened in a browser, and `kpi` generates a '
             'summary report based on a few KPIs. The default is `text`.')
    parser.add_argument('--open', action='store_true',
        help='Whether to automatically open the generated HTML file when finished. This option only makes sense '
             'when the output format is `chart`.')
    parser.add_argument('--series-names', type=str, required=False,
        help='Optional comma-delimited list of names to use for the various series. By default, the series are '
             'named `Baseline` and `Candidate` for two input files, and `Baseline`, `Candidate1`, `Candidate2`, '
             'etc. when more than two input files are provided.')
    parser.add_argument('--subtitle', type=str, required=False,
        help='Optional subtitle to use for the chart. This can be used to help identify the contents of the chart. '
             'This option cannot be used with the plain text output.')
    parser.add_argument('--noise-threshold', type=float, required=False,
        help='Noise threshold used by the KPI report to determine which results are noise. This is a floating '
             'point number between 0 and 1 representing the relative difference required between two results '
             'for them not to be considered "within the noise" of each other. For example, a threshold of 0.05 '
             'means that two results within 5%% of each other are considered to be within the noise.')
    parser.add_argument('--top-performer-threshold', type=float, required=False, default=0.5,
        help='Threshold used by the KPI report to determine top (and worst) performers. This is a floating '
             'point number between 0 and 1 that represents the relative difference required to consider '
             'a benchmark a top/worst performer. For example, if this number is 0.5, we consider top/worst '
             'performers in the data to be benchmarks that have at least 50%% of difference between the baseline '
             'and the candidate.')
    parser.add_argument('--meta-candidate', type=str, required=False,
        help='The name to use for the candidate when producing a KPI report. Required for --format=kpi.')
    parser.add_argument('--discard-benchmarks-introduced-after', type=str, required=False,
        help='Discard benchmarks introduced after the given candidate. This is useful to stabilize reports '
             'when new benchmarks are introduced as time goes on, which would otherwise change the total number '
             'of benchmarks and hence appear to retroactively change the report for previous candidates. '
             'If used, the name passed here must correspond to the name of a series (as passed to, or '
             'defaulted by, `--series-names`).')
    args = parser.parse_args(argv)

    # Validate the arguments (the values admissible for various arguments depend on other
    # arguments, the number of inputs, etc.)
    if args.format == 'text':
        if len(args.files) != 2:
            parser.error('--format=text requires exactly two input files to compare')
        if args.subtitle is not None:
            parser.error('Passing --subtitle makes no sense with --format=text')
        if args.open:
            parser.error('Passing --open makes no sense with --format=text')

    if args.format == 'kpi':
        if args.open:
            parser.error('Passing --open makes no sense with --format=kpi')
        if args.noise_threshold is None:
            parser.error('--format=kpi requires passing a --noise-threshold')
        if args.meta_candidate is None:
            parser.error('--format=kpi requires passing a --meta-candidate')

    if len(args.files) != 2 and args.sort != 'benchmark':
        parser.error('Using any sort order other than `benchmark` requires exactly two input files.')

    if args.series_names is None:
        args.series_names = ['Baseline']
        if len(args.files) == 2:
            args.series_names += ['Candidate']
        elif len(args.files) > 2:
            args.series_names.extend(f'Candidate{n}' for n in range(1, len(args.files)))
    else:
        args.series_names = args.series_names.split(',')
        if len(args.series_names) != len(args.files):
            parser.error(f'Passed an incorrect number of series names: got {len(args.series_names)} series names but {len(args.files)} inputs to compare')

    # Parse the raw LNT data and store each input in a dataframe
    lnt_inputs = [parse_lnt(file.readlines()) for file in args.files]
    inputs = [pandas.DataFrame(lnt).rename(columns={args.metric: f'{args.metric}_{i}'}) for (i, lnt) in enumerate(lnt_inputs)]

    # Join the inputs into a single dataframe
    data = functools.reduce(lambda a, b: a.merge(b, how='outer', on='benchmark'), inputs)
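    # At this point, `data` has one row per benchmark and one metric column per input. For example,
    # with two inputs and the default metric, the columns are: benchmark, execution_time_0 and
    # execution_time_1.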

    # If we have exactly two data sets, compute additional info in new columns
    if len(lnt_inputs) == 2:
        data['difference'] = data[f'{args.metric}_1'] - data[f'{args.metric}_0']
        data['percent'] = data['difference'] / data[f'{args.metric}_0']

    if args.filter is not None:
        keeplist = [b for b in data['benchmark'] if re.search(args.filter, b) is not None]
        data = data[data['benchmark'].isin(keeplist)]

    # Sort the data by the appropriate criteria
    if args.sort == 'benchmark':
        data = data.sort_values(by='benchmark')
    elif args.sort == 'baseline':
        data = data.sort_values(by=f'{args.metric}_0')
    elif args.sort == 'candidate':
        data = data.sort_values(by=f'{args.metric}_1')
    elif args.sort == 'percent_diff':
        data = data.sort_values(by='percent')

    if args.format == 'chart':
        figure = create_chart(data, args.metric, subtitle=args.subtitle, series_names=args.series_names)
        do_open = args.output is None or args.open
        # Use delete=False so the temporary file is not removed when the file object is garbage
        # collected -- the generated HTML must outlive this script.
        output = args.output or tempfile.NamedTemporaryFile(suffix='.html', delete=False).name
        plotly.io.write_html(figure, file=output, auto_open=do_open)
    elif args.format == 'kpi':
        if args.discard_benchmarks_introduced_after is not None:
            # A benchmark was introduced after the given series if it has no result for that
            # series but does have a result in a later series: drop such rows from the data.
            index = args.series_names.index(args.discard_benchmarks_introduced_after)
            reference = f'{args.metric}_{index}'
            series_to_filter = [f'{args.metric}_{i}' for i in range(index + 1, len(lnt_inputs))]
            for candidate in series_to_filter:
                data = data[~(data[reference].isna() & data[candidate].notna())]
        produce_kpis(data, noise=args.noise_threshold,
                     extrema=args.top_performer_threshold,
                     series=[f'{args.metric}_{i}' for i in range(len(lnt_inputs))],
                     series_names=args.series_names,
                     meta_candidate=args.meta_candidate,
                     title=args.subtitle)
    else:
        diff = plain_text_comparison(data, args.metric, baseline_name=args.series_names[0],
                                     candidate_name=args.series_names[1])
        diff += '\n'
        if args.output is not None:
            with open(args.output, 'w') as out:
                out.write(diff)
        else:
            sys.stdout.write(diff)

if __name__ == '__main__':
    main(sys.argv[1:])