From 5f76fbaf41109faffdf7df266a948ecda7cfea88 Mon Sep 17 00:00:00 2001 From: Yanan Long Date: Wed, 29 Apr 2026 00:55:42 -0300 Subject: [PATCH 01/15] add stats helper --- every_eval_ever/helpers/dataset_statistics.py | 671 ++++++++++++++++++ tests/test_dataset_statistics.py | 154 ++++ 2 files changed, 825 insertions(+) create mode 100644 every_eval_ever/helpers/dataset_statistics.py create mode 100644 tests/test_dataset_statistics.py diff --git a/every_eval_ever/helpers/dataset_statistics.py b/every_eval_ever/helpers/dataset_statistics.py new file mode 100644 index 000000000..e766edc85 --- /dev/null +++ b/every_eval_ever/helpers/dataset_statistics.py @@ -0,0 +1,671 @@ +"""Descriptive and uncertainty-aware summaries for Every Eval Ever.""" + +from __future__ import annotations + +import argparse +import json +import math +import random +import re +import statistics +import sys +from collections import Counter, defaultdict +from pathlib import Path +from typing import Any, Iterable + +SEP = '=' * 72 +SUB = '-' * 72 + +REPO_ID = 'evaleval/EEE_datastore' +FOLDER_PATH = 'viewer_parquets' +HUGGING_FACE_DATASTORE = f'datasets/{REPO_ID}/{FOLDER_PATH}/**/*.parquet' + +CONTINUOUS_SCORE_TYPE = 'continuous' +STABILIZATION_WEIGHT = 5.0 +BOOTSTRAP_ITERATIONS = 400 +RANDOM_SEED = 20260429 + + +def read_data(datastore: str) -> list[str]: + from huggingface_hub import HfFileSystem + + hffs = HfFileSystem() + files = hffs.glob(datastore) + return [f'hf://{f}' for f in files if f.endswith('dataset.parquet')] + + +def load_schema_table(con: Any, table: str) -> None: + schema_urls = read_data(HUGGING_FACE_DATASTORE) + if not schema_urls: + raise RuntimeError('No schema parquet files found') + con.execute( + f""" + CREATE OR REPLACE TABLE {table} AS + SELECT * FROM read_parquet(?, union_by_name=true, filename=true) + """, + [schema_urls], + ) + + +def extract_result_rows( + con: Any, schema_table: str +) -> list[dict[str, Any]]: + rows = con.execute( + f""" + SELECT + schema_version, + 
evaluation_id, + model_info.id AS model_id, + model_info.developer AS model_developer, + er.evaluation_name AS evaluation_name, + er.source_data.dataset_name AS benchmark, + er.metric_config.score_type AS score_type, + er.metric_config.lower_is_better AS lower_is_better, + TRY_CAST(er.metric_config.min_score AS DOUBLE) AS min_score, + TRY_CAST(er.metric_config.max_score AS DOUBLE) AS max_score, + TRY_CAST(er.score_details.score AS DOUBLE) AS score, + er.score_details.uncertainty IS NOT NULL AS has_uncertainty, + er.metric_config.metric_id AS metric_id, + er.metric_config.metric_name AS metric_name, + er.metric_config.metric_kind AS metric_kind, + er.metric_config.metric_unit AS metric_unit, + source_metadata.source_organization_name AS source_organization + FROM {schema_table}, + LATERAL UNNEST(evaluation_results) AS t(er) + """ + ).fetchall() + columns = [ + 'schema_version', + 'evaluation_id', + 'model_id', + 'model_developer', + 'evaluation_name', + 'benchmark', + 'score_type', + 'lower_is_better', + 'min_score', + 'max_score', + 'score', + 'has_uncertainty', + 'metric_id', + 'metric_name', + 'metric_kind', + 'metric_unit', + 'source_organization', + ] + return [dict(zip(columns, row)) for row in rows] + + +def normalize_score( + score: float, + min_score: float, + max_score: float, + lower_is_better: bool, +) -> float: + normalized = (score - min_score) / (max_score - min_score) + if lower_is_better: + normalized = 1.0 - normalized + return normalized + + +def percentile(values: list[float], pct: float) -> float | None: + if not values: + return None + ordered = sorted(values) + index = (len(ordered) - 1) * pct + lower = math.floor(index) + upper = math.ceil(index) + if lower == upper: + return ordered[int(index)] + weight = index - lower + return ordered[lower] * (1.0 - weight) + ordered[upper] * weight + + +def numeric_summary(values: Iterable[float]) -> dict[str, float | int | None]: + vals = [v for v in values if v is not None and math.isfinite(v)] + if not 
vals: + return { + 'count': 0, + 'min': None, + 'median': None, + 'mean': None, + 'max': None, + 'stddev': None, + } + return { + 'count': len(vals), + 'min': min(vals), + 'median': statistics.median(vals), + 'mean': statistics.mean(vals), + 'max': max(vals), + 'stddev': statistics.stdev(vals) if len(vals) > 1 else 0.0, + } + + +def shared_evaluation_key(row: dict[str, Any]) -> str: + parts = [ + row.get('benchmark'), + row.get('evaluation_name'), + row.get('score_type'), + row.get('min_score'), + row.get('max_score'), + bool(row.get('lower_is_better')), + ] + return json.dumps(parts, sort_keys=False, separators=(',', ':')) + + +def quality_counts(rows: list[dict[str, Any]]) -> dict[str, int]: + counts = { + 'total_result_rows': len(rows), + 'missing_score': 0, + 'missing_bounds': 0, + 'zero_width_bounds': 0, + 'incompatible_score_type': 0, + 'out_of_range': 0, + 'missing_metadata': 0, + 'has_uncertainty': 0, + } + for row in rows: + score = row.get('score') + min_score = row.get('min_score') + max_score = row.get('max_score') + if score is None: + counts['missing_score'] += 1 + if min_score is None or max_score is None: + counts['missing_bounds'] += 1 + elif min_score == max_score: + counts['zero_width_bounds'] += 1 + elif score is not None and not min_score <= score <= max_score: + counts['out_of_range'] += 1 + if row.get('score_type') != CONTINUOUS_SCORE_TYPE: + counts['incompatible_score_type'] += 1 + if not row.get('model_id') or not row.get('benchmark'): + counts['missing_metadata'] += 1 + if row.get('has_uncertainty'): + counts['has_uncertainty'] += 1 + return counts + + +def valid_normalized_rows( + rows: list[dict[str, Any]], +) -> tuple[list[dict[str, Any]], dict[str, int]]: + valid = [] + exclusions = { + 'missing_score': 0, + 'missing_bounds': 0, + 'zero_width_bounds': 0, + 'incompatible_score_type': 0, + 'out_of_range': 0, + } + for row in rows: + score = row.get('score') + min_score = row.get('min_score') + max_score = row.get('max_score') + if score 
is None: + exclusions['missing_score'] += 1 + continue + if min_score is None or max_score is None: + exclusions['missing_bounds'] += 1 + continue + if min_score == max_score: + exclusions['zero_width_bounds'] += 1 + continue + if row.get('score_type') != CONTINUOUS_SCORE_TYPE: + exclusions['incompatible_score_type'] += 1 + continue + if not min_score <= score <= max_score: + exclusions['out_of_range'] += 1 + continue + normalized = normalize_score( + float(score), + float(min_score), + float(max_score), + bool(row.get('lower_is_better')), + ) + valid_row = dict(row) + valid_row['normalized_score'] = normalized + valid_row['shared_evaluation_key'] = shared_evaluation_key(row) + valid.append(valid_row) + return valid, exclusions + + +def distinct_count(rows: list[dict[str, Any]], key: str) -> int: + return len({row.get(key) for row in rows if row.get(key) is not None}) + + +def count_values( + rows: list[dict[str, Any]], key: str +) -> list[dict[str, int | str]]: + counts = Counter(str(row.get(key)) for row in rows if row.get(key) is not None) + return [ + {'value': value, 'count': count} + for value, count in counts.most_common() + ] + + +def grouped_summaries( + rows: list[dict[str, Any]], + value_key: str, + group_keys: tuple[str, ...], + limit: int, +) -> list[dict[str, Any]]: + grouped: dict[tuple[Any, ...], list[float]] = defaultdict(list) + for row in rows: + value = row.get(value_key) + if value is None: + continue + grouped[tuple(row.get(key) for key in group_keys)].append(float(value)) + + summaries = [] + for group, values in grouped.items(): + item = {key: group[index] for index, key in enumerate(group_keys)} + item.update(numeric_summary(values)) + summaries.append(item) + summaries.sort(key=lambda item: (-int(item['count']), str(item))) + return summaries[:limit] + + +def bootstrap_interval_and_support( + values: list[float], + threshold: float, + iterations: int = BOOTSTRAP_ITERATIONS, +) -> tuple[list[float | None], float | None]: + if not values: + 
return [None, None], None + rng = random.Random(RANDOM_SEED + len(values)) + estimates = [] + for _ in range(iterations): + sample = [values[rng.randrange(len(values))] for _ in values] + estimates.append(statistics.mean(sample)) + return [ + percentile(estimates, 0.025), + percentile(estimates, 0.975), + ], sum(estimate > threshold for estimate in estimates) / len(estimates) + + +def stabilized_estimate( + mean_score: float, + count: int, + corpus_mean: float, + weight: float = STABILIZATION_WEIGHT, +) -> float: + return (count * mean_score + weight * corpus_mean) / (count + weight) + + +def coverage_aware_model_summaries( + rows: list[dict[str, Any]], limit: int +) -> list[dict[str, Any]]: + if not rows: + return [] + corpus_mean = statistics.mean(row['normalized_score'] for row in rows) + key_means = { + key: statistics.mean(item['normalized_score'] for item in items) + for key, items in _group_rows(rows, 'shared_evaluation_key').items() + } + + summaries = [] + for model_id, items in _group_rows(rows, 'model_id').items(): + scores = [item['normalized_score'] for item in items] + centered_scores = [ + item['normalized_score'] + - key_means[item['shared_evaluation_key']] + + corpus_mean + for item in items + ] + raw_mean = statistics.mean(scores) + centered_mean = statistics.mean(centered_scores) + stabilized = stabilized_estimate(raw_mean, len(scores), corpus_mean) + rng = random.Random(RANDOM_SEED + len(scores)) + bootstrap_scores = [] + for _ in range(BOOTSTRAP_ITERATIONS): + sample = [scores[rng.randrange(len(scores))] for _ in scores] + bootstrap_scores.append( + stabilized_estimate( + statistics.mean(sample), len(sample), corpus_mean + ) + ) + interval = [ + percentile(bootstrap_scores, 0.025), + percentile(bootstrap_scores, 0.975), + ] + support = ( + sum(score > corpus_mean for score in bootstrap_scores) + / len(bootstrap_scores) + ) + summaries.append( + { + 'model_id': model_id, + 'result_count': len(items), + 'benchmark_count': distinct_count(items, 
'benchmark'), + 'evaluation_count': distinct_count(items, 'shared_evaluation_key'), + 'mean_normalized_score': raw_mean, + 'benchmark_centered_score': centered_mean, + 'stabilized_score': stabilized, + 'uncertainty_interval': interval, + 'support_above_corpus_average': support, + } + ) + summaries.sort( + key=lambda item: ( + -float(item['stabilized_score']), + -int(item['evaluation_count']), + str(item['model_id']), + ) + ) + return summaries[:limit] + + +def pairwise_model_comparisons( + rows: list[dict[str, Any]], + min_shared_evals: int, + top_model_limit: int, + comparison_limit: int, +) -> list[dict[str, Any]]: + by_model_key: dict[str, dict[str, list[float]]] = defaultdict( + lambda: defaultdict(list) + ) + model_counts = Counter(row['model_id'] for row in rows if row.get('model_id')) + top_models = { + model + for model, _ in model_counts.most_common(top_model_limit) + if model is not None + } + for row in rows: + model_id = row.get('model_id') + if model_id not in top_models: + continue + by_model_key[model_id][row['shared_evaluation_key']].append( + row['normalized_score'] + ) + + model_scores = { + model: { + key: statistics.mean(values) + for key, values in scores_by_key.items() + } + for model, scores_by_key in by_model_key.items() + } + models = sorted(model_scores) + comparisons = [] + rng = random.Random(RANDOM_SEED) + for index, model_a in enumerate(models): + for model_b in models[index + 1 :]: + shared_keys = sorted( + set(model_scores[model_a]) & set(model_scores[model_b]) + ) + if len(shared_keys) < min_shared_evals: + continue + diffs = [ + model_scores[model_a][key] - model_scores[model_b][key] + for key in shared_keys + ] + boot_means = [] + for _ in range(BOOTSTRAP_ITERATIONS): + sample = [diffs[rng.randrange(len(diffs))] for _ in diffs] + boot_means.append(statistics.mean(sample)) + comparisons.append( + { + 'model_a': model_a, + 'model_b': model_b, + 'shared_evaluation_count': len(shared_keys), + 'mean_paired_difference': 
statistics.mean(diffs), + 'uncertainty_interval': [ + percentile(boot_means, 0.025), + percentile(boot_means, 0.975), + ], + 'support_model_a_higher': sum( + value > 0 for value in boot_means + ) + / len(boot_means), + } + ) + comparisons.sort( + key=lambda item: ( + -int(item['shared_evaluation_count']), + -abs(float(item['mean_paired_difference'])), + str(item['model_a']), + str(item['model_b']), + ) + ) + return comparisons[:comparison_limit] + + +def descriptive_statistics( + rows: list[dict[str, Any]], summary_limit: int +) -> dict[str, Any]: + valid_rows, exclusions = valid_normalized_rows(rows) + return { + 'counts': { + 'result_rows': len(rows), + 'unique_models': distinct_count(rows, 'model_id'), + 'unique_developers': distinct_count(rows, 'model_developer'), + 'unique_benchmarks': distinct_count(rows, 'benchmark'), + 'unique_evaluations': distinct_count(rows, 'evaluation_name'), + }, + 'schema_versions': count_values(rows, 'schema_version'), + 'quality': quality_counts(rows), + 'normalization_exclusions': exclusions, + 'score_summaries': grouped_summaries( + rows, + 'score', + ('benchmark', 'evaluation_name'), + summary_limit, + ), + 'normalized_score_summaries': grouped_summaries( + valid_rows, + 'normalized_score', + ('benchmark', 'evaluation_name'), + summary_limit, + ), + } + + +def build_statistics_report( + rows: list[dict[str, Any]], + summary_limit: int, + comparison_limit: int, + top_model_limit: int, + min_shared_evals: int, + descriptive_only: bool, +) -> dict[str, Any]: + valid_rows, exclusions = valid_normalized_rows(rows) + report = { + 'descriptive': descriptive_statistics(rows, summary_limit), + 'observational': { + 'valid_normalized_rows': len(valid_rows), + 'exclusions': exclusions, + }, + } + if descriptive_only: + return report + report['observational'].update( + { + 'coverage_aware_model_summaries': coverage_aware_model_summaries( + valid_rows, top_model_limit + ), + 'pairwise_model_comparisons': pairwise_model_comparisons( + 
valid_rows, + min_shared_evals, + top_model_limit, + comparison_limit, + ), + } + ) + return report + + +def _group_rows( + rows: list[dict[str, Any]], key: str +) -> dict[Any, list[dict[str, Any]]]: + grouped: dict[Any, list[dict[str, Any]]] = defaultdict(list) + for row in rows: + grouped[row.get(key)].append(row) + return grouped + + +def section(title: str) -> None: + print(f'\n{SEP}') + print(f' {title.upper()}') + print(SUB) + + +def print_table(items: list[dict[str, Any]], columns: list[str]) -> None: + for item in items: + parts = [] + for column in columns: + value = item.get(column) + if isinstance(value, float): + value = f'{value:.4f}' + parts.append(f'{column}={value}') + print(' ' + ' '.join(parts)) + + +def print_report(report: dict[str, Any], descriptive_only: bool) -> None: + descriptive = report['descriptive'] + section('dataset counts') + for key, value in descriptive['counts'].items(): + print(f' {key:<32} {value:>10,}') + + section('quality diagnostics') + for key, value in descriptive['quality'].items(): + print(f' {key:<32} {value:>10,}') + + section('normalization exclusions') + for key, value in report['observational']['exclusions'].items(): + print(f' {key:<32} {value:>10,}') + + section('score summaries') + print_table( + descriptive['score_summaries'], + ['benchmark', 'evaluation_name', 'count', 'mean', 'median', 'stddev'], + ) + + section('normalized score summaries') + print_table( + descriptive['normalized_score_summaries'], + ['benchmark', 'evaluation_name', 'count', 'mean', 'median', 'stddev'], + ) + + if descriptive_only: + return + + section('coverage-aware model summaries') + print_table( + report['observational']['coverage_aware_model_summaries'], + [ + 'model_id', + 'evaluation_count', + 'stabilized_score', + 'benchmark_centered_score', + 'support_above_corpus_average', + ], + ) + + section('pairwise model comparisons') + print_table( + report['observational']['pairwise_model_comparisons'], + [ + 'model_a', + 'model_b', + 
'shared_evaluation_count', + 'mean_paired_difference', + 'support_model_a_higher', + ], + ) + + +def parse_args(argv: list[str] | None = None) -> argparse.Namespace: + parser = argparse.ArgumentParser( + description='Generate Every Eval Ever dataset statistics.' + ) + parser.add_argument( + '--table', default='eee', help='Table name for the in-memory database' + ) + parser.add_argument( + '--stats-output', + type=Path, + help='Optional JSON output path for the statistics report', + ) + parser.add_argument( + '--summary-limit', + default=10, + type=int, + help='Number of descriptive summary rows to print', + ) + parser.add_argument( + '--comparison-limit', + default=50, + type=int, + help='Number of pairwise comparison rows to print', + ) + parser.add_argument( + '--top-model-limit', + default=50, + type=int, + help='Number of most-covered models to include in comparisons', + ) + parser.add_argument( + '--min-shared-evals', + default=5, + type=int, + help='Minimum shared evaluation keys for pairwise comparisons', + ) + parser.add_argument( + '--descriptive-only', + action='store_true', + help='Skip observational comparison summaries', + ) + args = parser.parse_args(argv) + if not re.fullmatch(r'[A-Za-z_][A-Za-z0-9_]*', args.table): + parser.error('--table must be a valid SQL identifier') + if args.summary_limit < 1: + parser.error('--summary-limit must be at least 1') + if args.comparison_limit < 1: + parser.error('--comparison-limit must be at least 1') + if args.top_model_limit < 1: + parser.error('--top-model-limit must be at least 1') + if args.min_shared_evals < 1: + parser.error('--min-shared-evals must be at least 1') + return args + + +def main(argv: list[str] | None = None) -> None: + args = parse_args(argv) + import duckdb + + schema_table = f'{args.table}_schema' + with duckdb.connect(':memory:') as con: + try: + con.execute('LOAD httpfs;') + except duckdb.Error: + con.execute('INSTALL httpfs;') + con.execute('LOAD httpfs;') + + load_schema_table(con, 
schema_table) + rows = extract_result_rows(con, schema_table) + + report = build_statistics_report( + rows, + summary_limit=args.summary_limit, + comparison_limit=args.comparison_limit, + top_model_limit=args.top_model_limit, + min_shared_evals=args.min_shared_evals, + descriptive_only=args.descriptive_only, + ) + print_report(report, args.descriptive_only) + + if args.stats_output: + args.stats_output.parent.mkdir(parents=True, exist_ok=True) + args.stats_output.write_text( + json.dumps(report, indent=2, sort_keys=True) + '\n', + encoding='utf-8', + ) + print(f'\nWrote statistics JSON to {args.stats_output}') + + +if __name__ == '__main__': + try: + main() + except RuntimeError as exc: + print(str(exc), file=sys.stderr) + sys.exit(1) diff --git a/tests/test_dataset_statistics.py b/tests/test_dataset_statistics.py new file mode 100644 index 000000000..9a4ebfc24 --- /dev/null +++ b/tests/test_dataset_statistics.py @@ -0,0 +1,154 @@ +from __future__ import annotations + +from every_eval_ever.helpers import dataset_statistics as stats + + +def row( + model_id: str, + benchmark: str, + evaluation_name: str, + score: float | None, + *, + min_score: float | None = 0.0, + max_score: float | None = 1.0, + lower_is_better: bool = False, + score_type: str | None = 'continuous', +) -> dict: + return { + 'schema_version': '0.2.2', + 'evaluation_id': f'{model_id}/{benchmark}/{evaluation_name}', + 'model_id': model_id, + 'model_developer': model_id.split('/')[0], + 'benchmark': benchmark, + 'evaluation_name': evaluation_name, + 'score': score, + 'min_score': min_score, + 'max_score': max_score, + 'lower_is_better': lower_is_better, + 'score_type': score_type, + 'has_uncertainty': False, + } + + +def test_normalization_respects_lower_is_better(): + assert stats.normalize_score(0.8, 0.0, 1.0, False) == 0.8 + assert stats.normalize_score(0.2, 0.0, 1.0, True) == 0.8 + + +def test_invalid_rows_are_excluded_and_counted(): + rows = [ + row('a', 'bench', 'eval', None), + row('a', 
'bench', 'eval', 0.5, min_score=None), + row('a', 'bench', 'eval', 0.5, min_score=1.0, max_score=1.0), + row('a', 'bench', 'eval', 0.5, score_type='binary'), + row('a', 'bench', 'eval', 2.0), + row('a', 'bench', 'eval', 0.8), + ] + + valid, exclusions = stats.valid_normalized_rows(rows) + + assert len(valid) == 1 + assert exclusions == { + 'missing_score': 1, + 'missing_bounds': 1, + 'zero_width_bounds': 1, + 'incompatible_score_type': 1, + 'out_of_range': 1, + } + + +def test_shared_evaluation_key_includes_score_scale_and_direction(): + base = row('a', 'bench', 'eval', 0.8) + different_scale = row('a', 'bench', 'eval', 80.0, max_score=100.0) + different_direction = row('a', 'bench', 'eval', 0.2, lower_is_better=True) + + assert stats.shared_evaluation_key(base) != stats.shared_evaluation_key( + different_scale + ) + assert stats.shared_evaluation_key(base) != stats.shared_evaluation_key( + different_direction + ) + + +def test_min_shared_evals_filters_pairwise_comparisons(): + rows = [ + row('model/a', 'b1', 'e1', 0.9), + row('model/b', 'b1', 'e1', 0.7), + row('model/a', 'b2', 'e2', 0.8), + row('model/b', 'b2', 'e2', 0.6), + ] + valid, _ = stats.valid_normalized_rows(rows) + + assert ( + stats.pairwise_model_comparisons( + valid, min_shared_evals=3, top_model_limit=10, comparison_limit=10 + ) + == [] + ) + comparisons = stats.pairwise_model_comparisons( + valid, min_shared_evals=2, top_model_limit=10, comparison_limit=10 + ) + assert len(comparisons) == 1 + assert comparisons[0]['shared_evaluation_count'] == 2 + + +def test_stabilized_estimates_move_sparse_models_toward_corpus_mean(): + estimate = stats.stabilized_estimate( + mean_score=1.0, count=1, corpus_mean=0.5, weight=5.0 + ) + + assert 0.5 < estimate < 1.0 + + +def test_probability_style_support_outputs_are_bounded(): + rows = [ + row('model/a', 'b1', 'e1', 0.9), + row('model/b', 'b1', 'e1', 0.7), + row('model/a', 'b2', 'e2', 0.8), + row('model/b', 'b2', 'e2', 0.6), + row('model/a', 'b3', 'e3', 0.7), + 
row('model/b', 'b3', 'e3', 0.6), + ] + valid, _ = stats.valid_normalized_rows(rows) + + model_summaries = stats.coverage_aware_model_summaries(valid, limit=10) + comparisons = stats.pairwise_model_comparisons( + valid, min_shared_evals=2, top_model_limit=10, comparison_limit=10 + ) + + for summary in model_summaries: + support = summary['support_above_corpus_average'] + assert 0.0 <= support <= 1.0 + assert 0.0 <= comparisons[0]['support_model_a_higher'] <= 1.0 + + +def test_json_report_shape(): + rows = [ + row('model/a', 'b1', 'e1', 0.9), + row('model/b', 'b1', 'e1', 0.7), + ] + + report = stats.build_statistics_report( + rows, + summary_limit=5, + comparison_limit=5, + top_model_limit=5, + min_shared_evals=1, + descriptive_only=False, + ) + + assert set(report) == {'descriptive', 'observational'} + assert report['descriptive']['counts']['result_rows'] == 2 + assert 'coverage_aware_model_summaries' in report['observational'] + assert 'pairwise_model_comparisons' in report['observational'] + + +def test_cli_help_uses_summary_limit_not_top_n(capsys): + try: + stats.parse_args(['--help']) + except SystemExit: + pass + + output = capsys.readouterr().out + assert '--summary-limit' in output + assert '--top-n' not in output From 3dcf199dd4d321528bfa553e14b64a3828f27a4b Mon Sep 17 00:00:00 2001 From: Yanan Long Date: Wed, 29 Apr 2026 13:30:46 -0300 Subject: [PATCH 02/15] Add dataset statistics plotting script --- scripts/plot_dataset_statistics.py | 432 +++++++++++++++++++++++++++++ 1 file changed, 432 insertions(+) create mode 100644 scripts/plot_dataset_statistics.py diff --git a/scripts/plot_dataset_statistics.py b/scripts/plot_dataset_statistics.py new file mode 100644 index 000000000..4d670a031 --- /dev/null +++ b/scripts/plot_dataset_statistics.py @@ -0,0 +1,432 @@ +"""Generate PDF plots and a narrative summary from dataset statistics JSON.""" + +from __future__ import annotations + +import argparse +import json +import math +import textwrap +from pathlib import 
Path +from typing import Any + +PLOT_FILES = { + 'coverage': 'coverage_counts.pdf', + 'quality': 'normalization_quality.pdf', + 'top_coverage': 'top_evaluation_coverage.pdf', + 'mean': 'normalized_score_mean_by_eval.pdf', + 'variability': 'normalized_score_variability.pdf', + 'range': 'score_range_by_eval.pdf', +} + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description='Generate dataset-statistics PDF plots and summary.' + ) + parser.add_argument( + '--input', + type=Path, + default=Path('audit/dataset_statistics.json'), + help='Path to dataset_statistics.json.', + ) + parser.add_argument( + '--output-dir', + type=Path, + default=Path('audit/dataset_statistics_plots'), + help='Directory for generated PDF plots.', + ) + parser.add_argument( + '--summary-output', + type=Path, + default=Path('audit/dataset_statistics_summary.md'), + help='Path for generated Markdown summary.', + ) + parser.add_argument( + '--top-n', + type=int, + default=25, + help='Number of evaluation rows to show in ranked plots.', + ) + return parser.parse_args() + + +def load_statistics(path: Path) -> dict[str, Any]: + with path.open(encoding='utf-8') as handle: + return json.load(handle) + + +def import_plotting() -> tuple[Any, Any | None]: + try: + import matplotlib.pyplot as plt + except ModuleNotFoundError as exc: + raise SystemExit( + 'matplotlib is required to generate plots. Install matplotlib ' + 'or seaborn in the active environment and rerun this script.' 
+ ) from exc + + try: + import seaborn as sns + except ModuleNotFoundError: + sns = None + + if sns is not None: + sns.set_theme(style='whitegrid', context='talk') + else: + plt.style.use('ggplot') + return plt, sns + + +def label(row: dict[str, Any]) -> str: + benchmark = str(row['benchmark']) + evaluation = str(row['evaluation_name']) + if benchmark == evaluation: + return benchmark + return f'{benchmark}: {evaluation}' + + +def short_label(value: str, width: int = 46) -> str: + return textwrap.shorten(value, width=width, placeholder='...') + + +def columns( + rows: list[dict[str, Any]], keys: tuple[str, ...] +) -> dict[str, list[Any]]: + return {key: [row[key] for row in rows] for key in keys} + + +def save(fig: Any, path: Path) -> None: + fig.tight_layout() + fig.savefig(path, format='pdf', bbox_inches='tight') + + +def plot_coverage_counts( + stats: dict[str, Any], output_dir: Path, plt: Any, sns: Any | None +) -> Path: + counts = stats['descriptive']['counts'] + order = [ + 'result_rows', + 'unique_models', + 'unique_developers', + 'unique_evaluations', + 'unique_benchmarks', + ] + rows = [ + {'metric': key.replace('_', ' ').title(), 'count': counts[key]} + for key in order + ] + + fig, ax = plt.subplots(figsize=(10, 5.5)) + if sns is not None: + sns.barplot( + data=columns(rows, ('metric', 'count')), + x='metric', + y='count', + hue='metric', + ax=ax, + legend=False, + ) + else: + ax.bar([row['metric'] for row in rows], [row['count'] for row in rows]) + ax.set_yscale('log') + ax.set_xlabel('') + ax.set_ylabel('Count, log scale') + ax.set_title('Dataset Coverage') + ax.tick_params(axis='x', rotation=25) + for index, row in enumerate(rows): + ax.text( + index, row['count'], f'{row["count"]:,}', ha='center', va='bottom' + ) + + path = output_dir / PLOT_FILES['coverage'] + save(fig, path) + plt.close(fig) + return path + + +def plot_normalization_quality( + stats: dict[str, Any], output_dir: Path, plt: Any, sns: Any | None +) -> Path: + valid = 
stats['observational']['valid_normalized_rows'] + exclusions = stats['observational']['exclusions'] + rows = [{'category': 'valid normalized rows', 'count': valid}] + [ + {'category': key.replace('_', ' '), 'count': value} + for key, value in exclusions.items() + ] + + fig, ax = plt.subplots(figsize=(11, 5.5)) + if sns is not None: + sns.barplot( + data=columns(rows, ('category', 'count')), + x='category', + y='count', + hue='category', + ax=ax, + legend=False, + ) + else: + ax.bar( + [row['category'] for row in rows], [row['count'] for row in rows] + ) + ax.set_yscale('symlog', linthresh=1) + ax.set_xlabel('') + ax.set_ylabel('Rows, symmetric log scale') + ax.set_title('Normalization Quality') + ax.tick_params(axis='x', rotation=25) + for index, row in enumerate(rows): + ax.text( + index, + max(row['count'], 1), + f'{row["count"]:,}', + ha='center', + va='bottom', + ) + + path = output_dir / PLOT_FILES['quality'] + save(fig, path) + plt.close(fig) + return path + + +def top_rows( + rows: list[dict[str, Any]], key: str, limit: int +) -> list[dict[str, Any]]: + return sorted(rows, key=lambda row: (-float(row[key]), label(row)))[:limit] + + +def plot_top_evaluation_coverage( + rows: list[dict[str, Any]], + output_dir: Path, + plt: Any, + sns: Any | None, + top_n: int, +) -> Path: + selected = list(reversed(top_rows(rows, 'count', top_n))) + labels = [short_label(label(row), 58) for row in selected] + counts = [row['count'] for row in selected] + + fig, ax = plt.subplots(figsize=(11, max(6, top_n * 0.35))) + if sns is not None: + sns.barplot(x=counts, y=labels, hue=labels, ax=ax, legend=False) + else: + ax.barh(labels, counts) + ax.set_xlabel('Normalized result rows') + ax.set_ylabel('') + ax.set_title(f'Top {len(selected)} Evaluations By Coverage') + + path = output_dir / PLOT_FILES['top_coverage'] + save(fig, path) + plt.close(fig) + return path + + +def plot_normalized_score_means( + rows: list[dict[str, Any]], + output_dir: Path, + plt: Any, + sns: Any | None, + 
top_n: int, +) -> Path: + selected = sorted(rows, key=lambda row: (float(row['mean']), label(row)))[ + :top_n + ] + labels = [short_label(label(row), 58) for row in selected] + means = [row['mean'] for row in selected] + + fig, ax = plt.subplots(figsize=(11, max(6, top_n * 0.35))) + if sns is not None: + sns.barplot(x=means, y=labels, hue=labels, ax=ax, legend=False) + else: + ax.barh(labels, means) + ax.set_xlim(0, 1) + ax.set_xlabel('Mean normalized score') + ax.set_ylabel('') + ax.set_title(f'Lowest {len(selected)} Mean Normalized Scores') + + path = output_dir / PLOT_FILES['mean'] + save(fig, path) + plt.close(fig) + return path + + +def plot_score_variability( + rows: list[dict[str, Any]], output_dir: Path, plt: Any, sns: Any | None +) -> Path: + plot_rows = [ + { + 'mean': row['mean'], + 'stddev': row['stddev'] or 0.0, + 'count': row['count'], + 'label': label(row), + } + for row in rows + ] + max_count = max(row['count'] for row in plot_rows) + sizes = [ + 45 + 455 * math.sqrt(row['count'] / max_count) for row in plot_rows + ] + + fig, ax = plt.subplots(figsize=(10, 7)) + if sns is not None: + sns.scatterplot( + data=columns(plot_rows, ('mean', 'stddev', 'count')), + x='mean', + y='stddev', + size='count', + sizes=(45, 500), + alpha=0.75, + legend=False, + ax=ax, + ) + else: + ax.scatter( + [row['mean'] for row in plot_rows], + [row['stddev'] for row in plot_rows], + s=sizes, + alpha=0.75, + ) + ax.set_xlim(0, 1) + ax.set_xlabel('Mean normalized score') + ax.set_ylabel('Standard deviation') + ax.set_title('Normalized Score Level vs. 
Variability') + + notable = sorted( + plot_rows, + key=lambda row: (row['stddev'], abs(row['mean'] - 0.5)), + reverse=True, + )[:8] + for row in notable: + ax.annotate( + short_label(row['label'], 24), + (row['mean'], row['stddev']), + xytext=(5, 4), + textcoords='offset points', + fontsize=8, + ) + + path = output_dir / PLOT_FILES['variability'] + save(fig, path) + plt.close(fig) + return path + + +def plot_score_ranges( + rows: list[dict[str, Any]], + output_dir: Path, + plt: Any, + top_n: int, +) -> Path: + selected = sorted( + rows, + key=lambda row: (float(row['max']) - float(row['min']), label(row)), + )[-top_n:] + labels = [short_label(label(row), 58) for row in selected] + mins = [row['min'] for row in selected] + maxes = [row['max'] for row in selected] + means = [row['mean'] for row in selected] + ypos = list(range(len(selected))) + + fig, ax = plt.subplots(figsize=(11, max(6, top_n * 0.35))) + for y, low, high in zip(ypos, mins, maxes, strict=True): + ax.hlines(y, low, high, color='#5b6770', linewidth=2.0, alpha=0.9) + ax.scatter(means, ypos, color='#0072b2', s=32, zorder=3, label='mean') + ax.set_yticks(ypos) + ax.set_yticklabels(labels) + ax.set_xlim(0, 1) + ax.set_xlabel('Normalized score range') + ax.set_ylabel('') + ax.set_title(f'Widest {len(selected)} Normalized Score Ranges') + ax.legend(loc='lower right') + + path = output_dir / PLOT_FILES['range'] + save(fig, path) + plt.close(fig) + return path + + +def pct(part: int, total: int) -> float: + return 100.0 * part / total if total else 0.0 + + +def write_summary( + stats: dict[str, Any], + rows: list[dict[str, Any]], + plot_paths: dict[str, Path], + output_path: Path, +) -> None: + counts = stats['descriptive']['counts'] + quality = stats['descriptive']['quality'] + valid = stats['observational']['valid_normalized_rows'] + exclusions = stats['observational']['exclusions'] + out_of_range = exclusions.get('out_of_range', 0) + most_covered = top_rows(rows, 'count', 6) + highest_variance = sorted( + 
rows, key=lambda row: float(row['stddev'] or 0.0), reverse=True + )[:4] + hardest = sorted(rows, key=lambda row: float(row['mean']))[:4] + easiest = sorted(rows, key=lambda row: float(row['mean']), reverse=True)[:4] + + def names(items: list[dict[str, Any]]) -> str: + return ', '.join(label(item) for item in items) + + relative_plots = { + name: path.relative_to(output_path.parent) + if path.is_relative_to(output_path.parent) + else path + for name, path in plot_paths.items() + } + text = f"""# Dataset Statistics Summary + +This report summarizes the latest Every Eval Ever datastore snapshot represented by `dataset_statistics.json`. The corpus contains {counts['result_rows']:,} result rows across {counts['unique_benchmarks']:,} benchmarks, {counts['unique_evaluations']:,} evaluation names, {counts['unique_developers']:,} developers, and {counts['unique_models']:,} models. The coverage plot (`{relative_plots['coverage']}`) shows that the datastore is broad in model count but still highly concentrated in a small number of repeated evaluation families. + +Normalization quality is strong for this snapshot. Of {quality['total_result_rows']:,} result rows, {valid:,} rows can be converted onto the shared zero-to-one scale, or {pct(valid, quality['total_result_rows']):.1f}% of the dataset. The only observed normalization exclusion is {out_of_range:,} out-of-range rows; missing scores, missing bounds, zero-width bounds, and incompatible score types are all zero. This makes the normalized summaries a useful cross-benchmark view, while still leaving the raw score summaries available for scale-specific inspection. + +Coverage is uneven by design. The most-covered normalized summaries are {names(most_covered)}. These heavily represented evaluations dominate aggregate descriptive patterns, so the top-coverage chart (`{relative_plots['top_coverage']}`) should be read alongside any mean-score chart. 
A benchmark with thousands of rows provides a much steadier estimate than a niche evaluation with dozens or hundreds of rows, even if both appear as one row in the summary table. + +Mean normalized scores vary sharply across tasks. The lowest means include {names(hardest)}, while the highest means include {names(easiest)}. These values should not be interpreted as a leaderboard: they summarize all available submitted model results within each benchmark/evaluation pair, not matched model cohorts. They are best used to spot which evaluations are generally difficult, saturated, or mixed across the collected model population. + +The variability plots add the most diagnostic texture. High-standard-deviation evaluations such as {names(highest_variance)} indicate tasks where model results span a wide range, often because the benchmark separates weak and strong systems clearly or because the source data combines distinct regimes. The range plot (`{relative_plots['range']}`) highlights the same issue from min-to-max spread, while the mean-versus-standard-deviation scatter (`{relative_plots['variability']}`) separates broad, high-confidence coverage from sparse or volatile summaries. + +The PDF figures are meant to be inspected together rather than as standalone claims. The count and quality charts answer whether the data is large and clean enough to trust; the mean, variability, and range charts answer where the benchmark landscape is concentrated, sparse, easy, hard, or discriminative. That division keeps coverage questions separate from score interpretation. + +Overall, the datastore is large, mostly normalization-ready, and informative for benchmark-level descriptive analysis. The main caveat is comparability: normalized scores put different metrics on a common scale, but they do not control for which models appear in each benchmark. 
Use these figures as a map of datastore coverage and score distribution, then rely on paired or coverage-aware analyses for direct model comparisons. +""" + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(text, encoding='utf-8') + + +def main() -> None: + args = parse_args() + if args.top_n < 1: + raise SystemExit('--top-n must be at least 1') + + stats = load_statistics(args.input) + rows = stats['descriptive']['normalized_score_summaries'] + if not rows: + raise SystemExit( + 'No normalized_score_summaries found in statistics JSON.' + ) + + output_dir = args.output_dir + output_dir.mkdir(parents=True, exist_ok=True) + plt, sns = import_plotting() + + plot_paths = { + 'coverage': plot_coverage_counts(stats, output_dir, plt, sns), + 'quality': plot_normalization_quality(stats, output_dir, plt, sns), + 'top_coverage': plot_top_evaluation_coverage( + rows, output_dir, plt, sns, args.top_n + ), + 'mean': plot_normalized_score_means( + rows, output_dir, plt, sns, args.top_n + ), + 'variability': plot_score_variability(rows, output_dir, plt, sns), + 'range': plot_score_ranges(rows, output_dir, plt, args.top_n), + } + write_summary(stats, rows, plot_paths, args.summary_output) + + print(f'Wrote {len(plot_paths)} PDF plots to {output_dir}') + print(f'Wrote Markdown summary to {args.summary_output}') + + +if __name__ == '__main__': + main() From 5a0858cfd5c1c88e1fe9a234a1ba3e29b104de66 Mon Sep 17 00:00:00 2001 From: Yanan Long Date: Wed, 29 Apr 2026 13:52:26 -0300 Subject: [PATCH 03/15] Add dataset model and runtime coverage plots --- every_eval_ever/helpers/dataset_statistics.py | 79 ++++++++-- scripts/plot_dataset_statistics.py | 147 +++++++++++++++++- tests/test_dataset_statistics.py | 44 ++++++ 3 files changed, 254 insertions(+), 16 deletions(-) diff --git a/every_eval_ever/helpers/dataset_statistics.py b/every_eval_ever/helpers/dataset_statistics.py index e766edc85..8b2f2e0e8 100644 --- a/every_eval_ever/helpers/dataset_statistics.py +++ 
b/every_eval_ever/helpers/dataset_statistics.py @@ -47,9 +47,7 @@ def load_schema_table(con: Any, table: str) -> None: ) -def extract_result_rows( - con: Any, schema_table: str -) -> list[dict[str, Any]]: +def extract_result_rows(con: Any, schema_table: str) -> list[dict[str, Any]]: rows = con.execute( f""" SELECT @@ -57,6 +55,7 @@ def extract_result_rows( evaluation_id, model_info.id AS model_id, model_info.developer AS model_developer, + model_info.inference_platform AS inference_engine, er.evaluation_name AS evaluation_name, er.source_data.dataset_name AS benchmark, er.metric_config.score_type AS score_type, @@ -79,6 +78,7 @@ def extract_result_rows( 'evaluation_id', 'model_id', 'model_developer', + 'inference_engine', 'evaluation_name', 'benchmark', 'score_type', @@ -236,13 +236,55 @@ def distinct_count(rows: list[dict[str, Any]], key: str) -> int: def count_values( rows: list[dict[str, Any]], key: str ) -> list[dict[str, int | str]]: - counts = Counter(str(row.get(key)) for row in rows if row.get(key) is not None) + counts = Counter( + str(row.get(key)) for row in rows if row.get(key) is not None + ) + return [ + {'value': value, 'count': count} + for value, count in counts.most_common() + ] + + +def count_values_with_unknown( + rows: list[dict[str, Any]], key: str, unknown: str = 'unknown' +) -> list[dict[str, int | str]]: + counts = Counter() + for row in rows: + value = row.get(key) + normalized = unknown if value is None else str(value).strip() + counts[normalized or unknown] += 1 return [ {'value': value, 'count': count} for value, count in counts.most_common() ] +def models_per_benchmark(rows: list[dict[str, Any]]) -> list[dict[str, Any]]: + grouped: dict[str, list[dict[str, Any]]] = defaultdict(list) + for row in rows: + benchmark = row.get('benchmark') + if benchmark is not None: + grouped[str(benchmark)].append(row) + + summaries = [] + for benchmark, items in grouped.items(): + summaries.append( + { + 'benchmark': benchmark, + 'unique_models': 
distinct_count(items, 'model_id'), + 'result_rows': len(items), + } + ) + summaries.sort( + key=lambda item: ( + -int(item['unique_models']), + -int(item['result_rows']), + str(item['benchmark']), + ) + ) + return summaries + + def grouped_summaries( rows: list[dict[str, Any]], value_key: str, @@ -328,16 +370,17 @@ def coverage_aware_model_summaries( percentile(bootstrap_scores, 0.025), percentile(bootstrap_scores, 0.975), ] - support = ( - sum(score > corpus_mean for score in bootstrap_scores) - / len(bootstrap_scores) + support = sum(score > corpus_mean for score in bootstrap_scores) / len( + bootstrap_scores ) summaries.append( { 'model_id': model_id, 'result_count': len(items), 'benchmark_count': distinct_count(items, 'benchmark'), - 'evaluation_count': distinct_count(items, 'shared_evaluation_key'), + 'evaluation_count': distinct_count( + items, 'shared_evaluation_key' + ), 'mean_normalized_score': raw_mean, 'benchmark_centered_score': centered_mean, 'stabilized_score': stabilized, @@ -364,7 +407,9 @@ def pairwise_model_comparisons( by_model_key: dict[str, dict[str, list[float]]] = defaultdict( lambda: defaultdict(list) ) - model_counts = Counter(row['model_id'] for row in rows if row.get('model_id')) + model_counts = Counter( + row['model_id'] for row in rows if row.get('model_id') + ) top_models = { model for model, _ in model_counts.most_common(top_model_limit) @@ -443,6 +488,10 @@ def descriptive_statistics( 'unique_evaluations': distinct_count(rows, 'evaluation_name'), }, 'schema_versions': count_values(rows, 'schema_version'), + 'inference_engines': count_values_with_unknown( + rows, 'inference_engine' + ), + 'models_per_benchmark': models_per_benchmark(rows), 'quality': quality_counts(rows), 'normalization_exclusions': exclusions, 'score_summaries': grouped_summaries( @@ -534,6 +583,18 @@ def print_report(report: dict[str, Any], descriptive_only: bool) -> None: for key, value in report['observational']['exclusions'].items(): print(f' {key:<32} 
{value:>10,}') + section('inference engines') + print_table( + descriptive['inference_engines'][:10], + ['value', 'count'], + ) + + section('models per benchmark') + print_table( + descriptive['models_per_benchmark'][:10], + ['benchmark', 'unique_models', 'result_rows'], + ) + section('score summaries') print_table( descriptive['score_summaries'], diff --git a/scripts/plot_dataset_statistics.py b/scripts/plot_dataset_statistics.py index 4d670a031..e274bcc69 100644 --- a/scripts/plot_dataset_statistics.py +++ b/scripts/plot_dataset_statistics.py @@ -5,6 +5,7 @@ import argparse import json import math +import statistics import textwrap from pathlib import Path from typing import Any @@ -16,6 +17,8 @@ 'mean': 'normalized_score_mean_by_eval.pdf', 'variability': 'normalized_score_variability.pdf', 'range': 'score_range_by_eval.pdf', + 'models_per_dataset': 'models_per_dataset_histogram.pdf', + 'engine_spread': 'inference_engine_spread.pdf', } @@ -343,6 +346,93 @@ def plot_score_ranges( return path +def plot_models_per_dataset_histogram( + stats: dict[str, Any], output_dir: Path, plt: Any, sns: Any | None +) -> Path: + rows = stats['descriptive'].get('models_per_benchmark', []) + values = [row['unique_models'] for row in rows if row['unique_models'] > 0] + + fig, ax = plt.subplots(figsize=(10, 6)) + if values: + bins = min(30, max(8, len(set(values)))) + if sns is not None: + sns.histplot(values, bins=bins, ax=ax, color='#0072b2') + else: + ax.hist(values, bins=bins, color='#0072b2', edgecolor='white') + median_value = statistics.median(values) + ax.axvline( + median_value, + color='#d55e00', + linestyle='--', + linewidth=2, + label=f'median={median_value:g}', + ) + ax.legend() + else: + ax.text( + 0.5, + 0.5, + 'No model-per-dataset summary available', + ha='center', + va='center', + transform=ax.transAxes, + ) + ax.set_xlabel('Unique models per dataset') + ax.set_ylabel('Datasets') + ax.set_title('Distribution Of Unique Models Per Dataset') + + path = output_dir / 
PLOT_FILES['models_per_dataset'] + save(fig, path) + plt.close(fig) + return path + + +def plot_inference_engine_spread( + stats: dict[str, Any], + output_dir: Path, + plt: Any, + sns: Any | None, + top_n: int, +) -> Path: + rows = stats['descriptive'].get('inference_engines', []) + selected = rows[:top_n] + remaining = rows[top_n:] + if remaining: + selected = selected + [ + { + 'value': 'other', + 'count': sum(int(row['count']) for row in remaining), + } + ] + selected = list(reversed(selected)) + labels = [short_label(str(row['value']), 48) for row in selected] + counts = [row['count'] for row in selected] + + fig, ax = plt.subplots(figsize=(10, max(5, len(selected) * 0.45))) + if selected: + if sns is not None: + sns.barplot(x=counts, y=labels, hue=labels, ax=ax, legend=False) + else: + ax.barh(labels, counts) + else: + ax.text( + 0.5, + 0.5, + 'No inference-engine summary available', + ha='center', + va='center', + transform=ax.transAxes, + ) + ax.set_xlabel('Result rows') + ax.set_ylabel('') + ax.set_title('Recorded Inference Engine/Platform Spread') + + path = output_dir / PLOT_FILES['engine_spread'] + save(fig, path) + plt.close(fig) + return path + + def pct(part: int, total: int) -> float: return 100.0 * part / total if total else 0.0 @@ -353,21 +443,54 @@ def write_summary( plot_paths: dict[str, Path], output_path: Path, ) -> None: + descriptive = stats['descriptive'] counts = stats['descriptive']['counts'] quality = stats['descriptive']['quality'] valid = stats['observational']['valid_normalized_rows'] exclusions = stats['observational']['exclusions'] out_of_range = exclusions.get('out_of_range', 0) + models_per_benchmark = descriptive.get('models_per_benchmark', []) + inference_engines = descriptive.get('inference_engines', []) most_covered = top_rows(rows, 'count', 6) highest_variance = sorted( rows, key=lambda row: float(row['stddev'] or 0.0), reverse=True )[:4] hardest = sorted(rows, key=lambda row: float(row['mean']))[:4] easiest = sorted(rows, 
key=lambda row: float(row['mean']), reverse=True)[:4] + model_counts = [ + int(row['unique_models']) + for row in models_per_benchmark + if int(row['unique_models']) > 0 + ] + median_models = statistics.median(model_counts) if model_counts else 0 + max_models = max(model_counts) if model_counts else 0 + top_model_datasets = models_per_benchmark[:6] + known_engine_rows = sum( + int(row['count']) + for row in inference_engines + if str(row['value']).strip().lower() != 'unknown' + ) + unknown_engine_rows = sum( + int(row['count']) + for row in inference_engines + if str(row['value']).strip().lower() == 'unknown' + ) + top_engines = inference_engines[:6] def names(items: list[dict[str, Any]]) -> str: return ', '.join(label(item) for item in items) + def benchmark_model_names(items: list[dict[str, Any]]) -> str: + return ', '.join( + f'{item["benchmark"]} ({int(item["unique_models"]):,})' + for item in items + ) + + def engine_names(items: list[dict[str, Any]]) -> str: + return ', '.join( + f'{item["value"]} ({int(item["count"]):,})' for item in items + ) + relative_plots = { name: path.relative_to(output_path.parent) if path.is_relative_to(output_path.parent) @@ -376,19 +499,23 @@ def names(items: list[dict[str, Any]]) -> str: } text = f"""# Dataset Statistics Summary -This report summarizes the latest Every Eval Ever datastore snapshot represented by `dataset_statistics.json`. The corpus contains {counts['result_rows']:,} result rows across {counts['unique_benchmarks']:,} benchmarks, {counts['unique_evaluations']:,} evaluation names, {counts['unique_developers']:,} developers, and {counts['unique_models']:,} models. The coverage plot (`{relative_plots['coverage']}`) shows that the datastore is broad in model count but still highly concentrated in a small number of repeated evaluation families. +This report summarizes the latest Every Eval Ever datastore snapshot represented by `dataset_statistics.json`. 
In the statistics file, “dataset” is represented by the `benchmark` field, which comes from `evaluation_results[].source_data.dataset_name`. That naming is worth keeping in mind when reading the figures: a benchmark is the dataset or leaderboard family that supplied the result rows, while an evaluation name is the finer slice or metric label inside that benchmark. The corpus contains {counts['result_rows']:,} result rows across {counts['unique_benchmarks']:,} datasets, {counts['unique_evaluations']:,} evaluation names, {counts['unique_developers']:,} developers, and {counts['unique_models']:,} models. The coverage plot (`{relative_plots['coverage']}`) gives the first scale check: the datastore is broad in model count, but its row-level mass is still concentrated in a smaller number of repeated evaluation families. + +Normalization quality is strong for this snapshot. Of {quality['total_result_rows']:,} result rows, {valid:,} rows can be converted onto the shared zero-to-one scale, or {pct(valid, quality['total_result_rows']):.1f}% of the dataset. The only observed normalization exclusion is {out_of_range:,} out-of-range rows; missing scores, missing bounds, zero-width bounds, and incompatible score types are all zero. This means the normalized score summaries are a reasonable map of cross-benchmark score distributions. It does not make all metrics semantically identical, but it does put the numeric ranges on a common axis so that difficulty, saturation, and spread are easier to compare. The normalization quality plot (`{relative_plots['quality']}`) is therefore a guardrail figure: it says whether the rest of the normalized-score visuals are based on most of the corpus or on a narrow filtered subset. + +Coverage is uneven by design. The most-covered normalized summaries are {names(most_covered)}. 
These heavily represented evaluations dominate aggregate descriptive patterns, so the top-coverage chart (`{relative_plots['top_coverage']}`) should be read alongside any mean-score chart. A benchmark with thousands of rows provides a much steadier estimate than a niche evaluation with dozens or hundreds of rows, even if both appear as one row in the summary table. High row coverage can mean a benchmark has broad model participation, multiple reported submetrics, repeated submissions, or some combination of the three. The plot is intentionally row-count oriented, because the descriptive JSON is primarily row-oriented; it should not be read as a direct measure of benchmark popularity without checking model coverage separately. -Normalization quality is strong for this snapshot. Of {quality['total_result_rows']:,} result rows, {valid:,} rows can be converted onto the shared zero-to-one scale, or {pct(valid, quality['total_result_rows']):.1f}% of the dataset. The only observed normalization exclusion is {out_of_range:,} out-of-range rows; missing scores, missing bounds, zero-width bounds, and incompatible score types are all zero. This makes the normalized summaries a useful cross-benchmark view, while still leaving the raw score summaries available for scale-specific inspection. +The new model-per-dataset histogram (`{relative_plots['models_per_dataset']}`) adds that missing model-coverage view. Across datasets, the median number of unique models is {median_models:g}, and the largest dataset-level model count is {max_models:,}. The highest-coverage datasets by unique model count are {benchmark_model_names(top_model_datasets)}. This distribution is important because a dataset with many models tells us more about the breadth of the ecosystem than a dataset with many rows from a smaller model set. A heavy right tail in this histogram means a few datasets act as common comparison hubs, while many others remain specialized or sparsely covered. 
That is not necessarily bad; specialized datasets are often where the datastore gets its texture. But it does mean corpus-wide summaries should avoid treating every benchmark as equally well sampled. -Coverage is uneven by design. The most-covered normalized summaries are {names(most_covered)}. These heavily represented evaluations dominate aggregate descriptive patterns, so the top-coverage chart (`{relative_plots['top_coverage']}`) should be read alongside any mean-score chart. A benchmark with thousands of rows provides a much steadier estimate than a niche evaluation with dozens or hundreds of rows, even if both appear as one row in the summary table. +The inference-engine spread plot (`{relative_plots['engine_spread']}`) describes how result rows are distributed across recorded running engines or inference platforms, depending on which runtime metadata is present in the datastore export. The leading runtime labels are {engine_names(top_engines)}. In this snapshot, {known_engine_rows:,} rows have a named runtime field and {unknown_engine_rows:,} rows fall under `unknown`. The `unknown` bucket is expected whenever source records report model identity but not the serving/runtime layer. Runtime spread should therefore be read as an observability diagnostic, not just as a usage ranking. A large `unknown` bucket says that many results are still useful for model and benchmark analysis, but they cannot support claims about vLLM, Ollama, hosted APIs, or other runtime-specific execution paths. Where runtime names are present, the chart gives a quick view of which execution backends are represented strongly enough for follow-up slicing. -Mean normalized scores vary sharply across tasks. The lowest means include {names(hardest)}, while the highest means include {names(easiest)}. These values should not be interpreted as a leaderboard: they summarize all available submitted model results within each benchmark/evaluation pair, not matched model cohorts. 
They are best used to spot which evaluations are generally difficult, saturated, or mixed across the collected model population. +Mean normalized scores vary sharply across tasks. The lowest means include {names(hardest)}, while the highest means include {names(easiest)}. These values should not be interpreted as a leaderboard: they summarize all available submitted model results within each benchmark/evaluation pair, not matched model cohorts. They are best used to spot which evaluations are generally difficult, saturated, or mixed across the collected model population. A low mean can indicate a hard benchmark, a benchmark with many older or weaker systems, or a metric whose upper range is rarely reached. A high mean can indicate an easier task, a saturated benchmark, a curated set of strong submissions, or a metric where the lower-performing tail is missing. The summary plots do not decide among those explanations, but they point to where a closer paired analysis would be valuable. -The variability plots add the most diagnostic texture. High-standard-deviation evaluations such as {names(highest_variance)} indicate tasks where model results span a wide range, often because the benchmark separates weak and strong systems clearly or because the source data combines distinct regimes. The range plot (`{relative_plots['range']}`) highlights the same issue from min-to-max spread, while the mean-versus-standard-deviation scatter (`{relative_plots['variability']}`) separates broad, high-confidence coverage from sparse or volatile summaries. +The variability plots add the most diagnostic texture. High-standard-deviation evaluations such as {names(highest_variance)} indicate tasks where model results span a wide range, often because the benchmark separates weak and strong systems clearly or because the source data combines distinct regimes. 
The range plot (`{relative_plots['range']}`) highlights the same issue from min-to-max spread, while the mean-versus-standard-deviation scatter (`{relative_plots['variability']}`) separates broad, high-confidence coverage from sparse or volatile summaries. Evaluations with both substantial coverage and high spread are especially useful for model comparison because they appear to discriminate among systems rather than clustering everyone near the same score. Evaluations with low spread can still matter, but they may be better suited for pass/fail checks, regression testing, or detecting severe failures than for fine-grained ranking. -The PDF figures are meant to be inspected together rather than as standalone claims. The count and quality charts answer whether the data is large and clean enough to trust; the mean, variability, and range charts answer where the benchmark landscape is concentrated, sparse, easy, hard, or discriminative. That division keeps coverage questions separate from score interpretation. +The PDF figures are meant to be inspected together rather than as standalone claims. The count and quality charts answer whether the data is large and clean enough to trust. The top-coverage and model-per-dataset charts separate result-row volume from unique-model breadth. The engine chart shows whether runtime metadata is available and how concentrated it is. The mean, variability, and range charts then answer where the benchmark landscape is concentrated, sparse, easy, hard, or discriminative. Keeping those questions separate avoids a common mistake: treating a high row count as evidence of broad participation, or treating a normalized mean as a direct model-quality claim. -Overall, the datastore is large, mostly normalization-ready, and informative for benchmark-level descriptive analysis. The main caveat is comparability: normalized scores put different metrics on a common scale, but they do not control for which models appear in each benchmark. 
Use these figures as a map of datastore coverage and score distribution, then rely on paired or coverage-aware analyses for direct model comparisons. +Overall, the datastore is large, mostly normalization-ready, and informative for benchmark-level descriptive analysis. The main caveat is comparability: normalized scores put different metrics on a common scale, but they do not control for which models appear in each benchmark. Use these figures as a map of datastore coverage, runtime observability, and score distribution, then rely on paired or coverage-aware analyses for direct model comparisons. The descriptive plots are best thought of as a scouting layer: they reveal where the datastore is rich, where metadata is thin, and where more careful model-by-model analysis is likely to pay off. """ output_path.parent.mkdir(parents=True, exist_ok=True) output_path.write_text(text, encoding='utf-8') @@ -421,6 +548,12 @@ def main() -> None: ), 'variability': plot_score_variability(rows, output_dir, plt, sns), 'range': plot_score_ranges(rows, output_dir, plt, args.top_n), + 'models_per_dataset': plot_models_per_dataset_histogram( + stats, output_dir, plt, sns + ), + 'engine_spread': plot_inference_engine_spread( + stats, output_dir, plt, sns, args.top_n + ), } write_summary(stats, rows, plot_paths, args.summary_output) diff --git a/tests/test_dataset_statistics.py b/tests/test_dataset_statistics.py index 9a4ebfc24..b9f2cbd6d 100644 --- a/tests/test_dataset_statistics.py +++ b/tests/test_dataset_statistics.py @@ -13,12 +13,14 @@ def row( max_score: float | None = 1.0, lower_is_better: bool = False, score_type: str | None = 'continuous', + inference_engine: str | None = None, ) -> dict: return { 'schema_version': '0.2.2', 'evaluation_id': f'{model_id}/{benchmark}/{evaluation_name}', 'model_id': model_id, 'model_developer': model_id.split('/')[0], + 'inference_engine': inference_engine, 'benchmark': benchmark, 'evaluation_name': evaluation_name, 'score': score, @@ -139,10 
+141,52 @@ def test_json_report_shape(): assert set(report) == {'descriptive', 'observational'} assert report['descriptive']['counts']['result_rows'] == 2 + assert 'inference_engines' in report['descriptive'] + assert 'models_per_benchmark' in report['descriptive'] assert 'coverage_aware_model_summaries' in report['observational'] assert 'pairwise_model_comparisons' in report['observational'] +def test_models_per_benchmark_dedupes_model_counts(): + rows = [ + row('model/a', 'bench-one', 'eval-a', 0.9), + row('model/a', 'bench-one', 'eval-b', 0.8), + row('model/b', 'bench-one', 'eval-a', 0.7), + row('model/c', 'bench-two', 'eval-a', 0.6), + ] + + summaries = stats.models_per_benchmark(rows) + + assert summaries == [ + { + 'benchmark': 'bench-one', + 'unique_models': 2, + 'result_rows': 3, + }, + { + 'benchmark': 'bench-two', + 'unique_models': 1, + 'result_rows': 1, + }, + ] + + +def test_inference_engine_counts_group_missing_as_unknown(): + rows = [ + row('model/a', 'bench', 'eval', 0.9, inference_engine='vllm'), + row('model/b', 'bench', 'eval', 0.8, inference_engine=''), + row('model/c', 'bench', 'eval', 0.7, inference_engine=None), + row('model/d', 'bench', 'eval', 0.6, inference_engine='ollama'), + row('model/e', 'bench', 'eval', 0.5, inference_engine='vllm'), + ] + + assert stats.count_values_with_unknown(rows, 'inference_engine') == [ + {'value': 'vllm', 'count': 2}, + {'value': 'unknown', 'count': 2}, + {'value': 'ollama', 'count': 1}, + ] + + def test_cli_help_uses_summary_limit_not_top_n(capsys): try: stats.parse_args(['--help']) From c78b85595062691e8b20d7c9381db52870046f72 Mon Sep 17 00:00:00 2001 From: Yanan Long Date: Wed, 29 Apr 2026 14:03:27 -0300 Subject: [PATCH 04/15] Remove generated dataset summary writer --- scripts/plot_dataset_statistics.py | 101 +---------------------------- 1 file changed, 2 insertions(+), 99 deletions(-) diff --git a/scripts/plot_dataset_statistics.py b/scripts/plot_dataset_statistics.py index e274bcc69..461f70c22 100644 
--- a/scripts/plot_dataset_statistics.py +++ b/scripts/plot_dataset_statistics.py @@ -1,4 +1,4 @@ -"""Generate PDF plots and a narrative summary from dataset statistics JSON.""" +"""Generate PDF plots from dataset statistics JSON.""" from __future__ import annotations @@ -24,7 +24,7 @@ def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( - description='Generate dataset-statistics PDF plots and summary.' + description='Generate dataset-statistics PDF plots.' ) parser.add_argument( '--input', @@ -38,12 +38,6 @@ def parse_args() -> argparse.Namespace: default=Path('audit/dataset_statistics_plots'), help='Directory for generated PDF plots.', ) - parser.add_argument( - '--summary-output', - type=Path, - default=Path('audit/dataset_statistics_summary.md'), - help='Path for generated Markdown summary.', - ) parser.add_argument( '--top-n', type=int, @@ -433,94 +427,6 @@ def plot_inference_engine_spread( return path -def pct(part: int, total: int) -> float: - return 100.0 * part / total if total else 0.0 - - -def write_summary( - stats: dict[str, Any], - rows: list[dict[str, Any]], - plot_paths: dict[str, Path], - output_path: Path, -) -> None: - descriptive = stats['descriptive'] - counts = stats['descriptive']['counts'] - quality = stats['descriptive']['quality'] - valid = stats['observational']['valid_normalized_rows'] - exclusions = stats['observational']['exclusions'] - out_of_range = exclusions.get('out_of_range', 0) - models_per_benchmark = descriptive.get('models_per_benchmark', []) - inference_engines = descriptive.get('inference_engines', []) - most_covered = top_rows(rows, 'count', 6) - highest_variance = sorted( - rows, key=lambda row: float(row['stddev'] or 0.0), reverse=True - )[:4] - hardest = sorted(rows, key=lambda row: float(row['mean']))[:4] - easiest = sorted(rows, key=lambda row: float(row['mean']), reverse=True)[:4] - model_counts = [ - int(row['unique_models']) - for row in models_per_benchmark - if int(row['unique_models']) > 0 - 
] - median_models = statistics.median(model_counts) if model_counts else 0 - max_models = max(model_counts) if model_counts else 0 - top_model_datasets = models_per_benchmark[:6] - known_engine_rows = sum( - int(row['count']) - for row in inference_engines - if str(row['value']).strip().lower() != 'unknown' - ) - unknown_engine_rows = sum( - int(row['count']) - for row in inference_engines - if str(row['value']).strip().lower() == 'unknown' - ) - top_engines = inference_engines[:6] - - def names(items: list[dict[str, Any]]) -> str: - return ', '.join(label(item) for item in items) - - def benchmark_model_names(items: list[dict[str, Any]]) -> str: - return ', '.join( - f'{item["benchmark"]} ({int(item["unique_models"]):,})' - for item in items - ) - - def engine_names(items: list[dict[str, Any]]) -> str: - return ', '.join( - f'{item["value"]} ({int(item["count"]):,})' for item in items - ) - - relative_plots = { - name: path.relative_to(output_path.parent) - if path.is_relative_to(output_path.parent) - else path - for name, path in plot_paths.items() - } - text = f"""# Dataset Statistics Summary - -This report summarizes the latest Every Eval Ever datastore snapshot represented by `dataset_statistics.json`. In the statistics file, “dataset” is represented by the `benchmark` field, which comes from `evaluation_results[].source_data.dataset_name`. That naming is worth keeping in mind when reading the figures: a benchmark is the dataset or leaderboard family that supplied the result rows, while an evaluation name is the finer slice or metric label inside that benchmark. The corpus contains {counts['result_rows']:,} result rows across {counts['unique_benchmarks']:,} datasets, {counts['unique_evaluations']:,} evaluation names, {counts['unique_developers']:,} developers, and {counts['unique_models']:,} models. 
The coverage plot (`{relative_plots['coverage']}`) gives the first scale check: the datastore is broad in model count, but its row-level mass is still concentrated in a smaller number of repeated evaluation families. - -Normalization quality is strong for this snapshot. Of {quality['total_result_rows']:,} result rows, {valid:,} rows can be converted onto the shared zero-to-one scale, or {pct(valid, quality['total_result_rows']):.1f}% of the dataset. The only observed normalization exclusion is {out_of_range:,} out-of-range rows; missing scores, missing bounds, zero-width bounds, and incompatible score types are all zero. This means the normalized score summaries are a reasonable map of cross-benchmark score distributions. It does not make all metrics semantically identical, but it does put the numeric ranges on a common axis so that difficulty, saturation, and spread are easier to compare. The normalization quality plot (`{relative_plots['quality']}`) is therefore a guardrail figure: it says whether the rest of the normalized-score visuals are based on most of the corpus or on a narrow filtered subset. - -Coverage is uneven by design. The most-covered normalized summaries are {names(most_covered)}. These heavily represented evaluations dominate aggregate descriptive patterns, so the top-coverage chart (`{relative_plots['top_coverage']}`) should be read alongside any mean-score chart. A benchmark with thousands of rows provides a much steadier estimate than a niche evaluation with dozens or hundreds of rows, even if both appear as one row in the summary table. High row coverage can mean a benchmark has broad model participation, multiple reported submetrics, repeated submissions, or some combination of the three. The plot is intentionally row-count oriented, because the descriptive JSON is primarily row-oriented; it should not be read as a direct measure of benchmark popularity without checking model coverage separately. 
- -The new model-per-dataset histogram (`{relative_plots['models_per_dataset']}`) adds that missing model-coverage view. Across datasets, the median number of unique models is {median_models:g}, and the largest dataset-level model count is {max_models:,}. The highest-coverage datasets by unique model count are {benchmark_model_names(top_model_datasets)}. This distribution is important because a dataset with many models tells us more about the breadth of the ecosystem than a dataset with many rows from a smaller model set. A heavy right tail in this histogram means a few datasets act as common comparison hubs, while many others remain specialized or sparsely covered. That is not necessarily bad; specialized datasets are often where the datastore gets its texture. But it does mean corpus-wide summaries should avoid treating every benchmark as equally well sampled. - -The inference-engine spread plot (`{relative_plots['engine_spread']}`) describes how result rows are distributed across recorded running engines or inference platforms, depending on which runtime metadata is present in the datastore export. The leading runtime labels are {engine_names(top_engines)}. In this snapshot, {known_engine_rows:,} rows have a named runtime field and {unknown_engine_rows:,} rows fall under `unknown`. The `unknown` bucket is expected whenever source records report model identity but not the serving/runtime layer. Runtime spread should therefore be read as an observability diagnostic, not just as a usage ranking. A large `unknown` bucket says that many results are still useful for model and benchmark analysis, but they cannot support claims about vLLM, Ollama, hosted APIs, or other runtime-specific execution paths. Where runtime names are present, the chart gives a quick view of which execution backends are represented strongly enough for follow-up slicing. - -Mean normalized scores vary sharply across tasks. 
The lowest means include {names(hardest)}, while the highest means include {names(easiest)}. These values should not be interpreted as a leaderboard: they summarize all available submitted model results within each benchmark/evaluation pair, not matched model cohorts. They are best used to spot which evaluations are generally difficult, saturated, or mixed across the collected model population. A low mean can indicate a hard benchmark, a benchmark with many older or weaker systems, or a metric whose upper range is rarely reached. A high mean can indicate an easier task, a saturated benchmark, a curated set of strong submissions, or a metric where the lower-performing tail is missing. The summary plots do not decide among those explanations, but they point to where a closer paired analysis would be valuable. - -The variability plots add the most diagnostic texture. High-standard-deviation evaluations such as {names(highest_variance)} indicate tasks where model results span a wide range, often because the benchmark separates weak and strong systems clearly or because the source data combines distinct regimes. The range plot (`{relative_plots['range']}`) highlights the same issue from min-to-max spread, while the mean-versus-standard-deviation scatter (`{relative_plots['variability']}`) separates broad, high-confidence coverage from sparse or volatile summaries. Evaluations with both substantial coverage and high spread are especially useful for model comparison because they appear to discriminate among systems rather than clustering everyone near the same score. Evaluations with low spread can still matter, but they may be better suited for pass/fail checks, regression testing, or detecting severe failures than for fine-grained ranking. - -The PDF figures are meant to be inspected together rather than as standalone claims. The count and quality charts answer whether the data is large and clean enough to trust. 
The top-coverage and model-per-dataset charts separate result-row volume from unique-model breadth. The engine chart shows whether runtime metadata is available and how concentrated it is. The mean, variability, and range charts then answer where the benchmark landscape is concentrated, sparse, easy, hard, or discriminative. Keeping those questions separate avoids a common mistake: treating a high row count as evidence of broad participation, or treating a normalized mean as a direct model-quality claim. - -Overall, the datastore is large, mostly normalization-ready, and informative for benchmark-level descriptive analysis. The main caveat is comparability: normalized scores put different metrics on a common scale, but they do not control for which models appear in each benchmark. Use these figures as a map of datastore coverage, runtime observability, and score distribution, then rely on paired or coverage-aware analyses for direct model comparisons. The descriptive plots are best thought of as a scouting layer: they reveal where the datastore is rich, where metadata is thin, and where more careful model-by-model analysis is likely to pay off. 
-""" - output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.write_text(text, encoding='utf-8') - - def main() -> None: args = parse_args() if args.top_n < 1: @@ -555,10 +461,7 @@ def main() -> None: stats, output_dir, plt, sns, args.top_n ), } - write_summary(stats, rows, plot_paths, args.summary_output) - print(f'Wrote {len(plot_paths)} PDF plots to {output_dir}') - print(f'Wrote Markdown summary to {args.summary_output}') if __name__ == '__main__': From 7277f21670e20b2c92b64cf88480d60285e12d9f Mon Sep 17 00:00:00 2001 From: Yanan Long Date: Wed, 29 Apr 2026 14:07:58 -0300 Subject: [PATCH 05/15] Use log scale for inference engine plot --- scripts/plot_dataset_statistics.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/plot_dataset_statistics.py b/scripts/plot_dataset_statistics.py index 461f70c22..8e13b7625 100644 --- a/scripts/plot_dataset_statistics.py +++ b/scripts/plot_dataset_statistics.py @@ -398,7 +398,6 @@ def plot_inference_engine_spread( 'count': sum(int(row['count']) for row in remaining), } ] - selected = list(reversed(selected)) labels = [short_label(str(row['value']), 48) for row in selected] counts = [row['count'] for row in selected] @@ -408,6 +407,10 @@ def plot_inference_engine_spread( sns.barplot(x=counts, y=labels, hue=labels, ax=ax, legend=False) else: ax.barh(labels, counts) + ax.set_xscale('log') + ax.set_xlim(left=1) + if ax.get_ylim()[0] < ax.get_ylim()[1]: + ax.invert_yaxis() else: ax.text( 0.5, @@ -417,7 +420,7 @@ def plot_inference_engine_spread( va='center', transform=ax.transAxes, ) - ax.set_xlabel('Result rows') + ax.set_xlabel('Result rows (log scale)') ax.set_ylabel('') ax.set_title('Recorded Inference Engine/Platform Spread') From f0e5dcd8ec09d85beb9831265127dc7e37bc0d26 Mon Sep 17 00:00:00 2001 From: Yanan Long Date: Wed, 29 Apr 2026 14:15:59 -0300 Subject: [PATCH 06/15] Fix top coverage plot ordering --- scripts/plot_dataset_statistics.py | 4 +++- 1 file changed, 3 insertions(+), 
1 deletion(-) diff --git a/scripts/plot_dataset_statistics.py b/scripts/plot_dataset_statistics.py index 8e13b7625..b2479699b 100644 --- a/scripts/plot_dataset_statistics.py +++ b/scripts/plot_dataset_statistics.py @@ -197,7 +197,7 @@ def plot_top_evaluation_coverage( sns: Any | None, top_n: int, ) -> Path: - selected = list(reversed(top_rows(rows, 'count', top_n))) + selected = top_rows(rows, 'count', top_n) labels = [short_label(label(row), 58) for row in selected] counts = [row['count'] for row in selected] @@ -206,6 +206,8 @@ def plot_top_evaluation_coverage( sns.barplot(x=counts, y=labels, hue=labels, ax=ax, legend=False) else: ax.barh(labels, counts) + if ax.get_ylim()[0] < ax.get_ylim()[1]: + ax.invert_yaxis() ax.set_xlabel('Normalized result rows') ax.set_ylabel('') ax.set_title(f'Top {len(selected)} Evaluations By Coverage') From b460da8230635e64c82b5bf932ea7a522b73237d Mon Sep 17 00:00:00 2001 From: Yanan Long Date: Thu, 30 Apr 2026 09:45:12 -0300 Subject: [PATCH 07/15] Group score summaries by metric identity --- audit/audit_after.json | 262 ++ audit/audit_before.json | 426 ++ audit/dataset_statistics.json | 2407 ++++++++++ .../coverage_counts.pdf | Bin 0 -> 16555 bytes .../inference_engine_spread.pdf | Bin 0 -> 16123 bytes .../models_per_dataset_histogram.pdf | Bin 0 -> 13934 bytes .../normalization_quality.pdf | Bin 0 -> 15186 bytes .../normalized_score_mean_by_eval.pdf | Bin 0 -> 22251 bytes .../normalized_score_variability.pdf | Bin 0 -> 34031 bytes .../score_range_by_eval.pdf | Bin 0 -> 21724 bytes .../top_evaluation_coverage.pdf | Bin 0 -> 21142 bytes audit/dataset_statistics_summary.md | 37 + every_eval_ever/helpers/dataset_statistics.py | 35 +- misc/dataset_statistics_summary_writer.py | 109 + misc/eval_hierarchy.json | 4187 +++++++++++++++++ misc/eval_hierarchy.md | 459 ++ plan/backend-canonical-identity-plan.md | 115 + scripts/plot_dataset_statistics.py | 9 +- tests/test_dataset_statistics.py | 64 +- 19 files changed, 8101 insertions(+), 9 
deletions(-) create mode 100644 audit/audit_after.json create mode 100644 audit/audit_before.json create mode 100644 audit/dataset_statistics.json create mode 100644 audit/dataset_statistics_plots/coverage_counts.pdf create mode 100644 audit/dataset_statistics_plots/inference_engine_spread.pdf create mode 100644 audit/dataset_statistics_plots/models_per_dataset_histogram.pdf create mode 100644 audit/dataset_statistics_plots/normalization_quality.pdf create mode 100644 audit/dataset_statistics_plots/normalized_score_mean_by_eval.pdf create mode 100644 audit/dataset_statistics_plots/normalized_score_variability.pdf create mode 100644 audit/dataset_statistics_plots/score_range_by_eval.pdf create mode 100644 audit/dataset_statistics_plots/top_evaluation_coverage.pdf create mode 100644 audit/dataset_statistics_summary.md create mode 100644 misc/dataset_statistics_summary_writer.py create mode 100644 misc/eval_hierarchy.json create mode 100644 misc/eval_hierarchy.md create mode 100644 plan/backend-canonical-identity-plan.md diff --git a/audit/audit_after.json b/audit/audit_after.json new file mode 100644 index 000000000..0dda99be0 --- /dev/null +++ b/audit/audit_after.json @@ -0,0 +1,262 @@ +{ + "files_scanned": 6448, + "results_scanned": 49659, + "missing": { + "metric_id": 1021, + "metric_name": 1021, + "metric_kind": 1021, + "metric_unit": 1021 + }, + "malformed": {}, + "top_missing_by_benchmark": { + "evaluation_result_id": [], + "metric_id": [ + [ + "fibble_arena", + 336 + ], + [ + "helm_classic", + 201 + ], + [ + "helm_lite", + 182 + ], + [ + "livecodebenchpro", + 87 + ], + [ + "helm_capabilities", + 68 + ], + [ + "ace", + 32 + ], + [ + "apex-v1", + 19 + ], + [ + "appworld_test_normal", + 15 + ], + [ + "browsecompplus", + 15 + ], + [ + "swe-bench", + 15 + ], + [ + "tau-bench-2_airline", + 15 + ], + [ + "tau-bench-2_retail", + 15 + ], + [ + "tau-bench-2_telecom", + 15 + ], + [ + "la_leaderboard", + 5 + ], + [ + "theory_of_mind", + 1 + ] + ], + "metric_name": [ + [ + 
"fibble_arena", + 336 + ], + [ + "helm_classic", + 201 + ], + [ + "helm_lite", + 182 + ], + [ + "livecodebenchpro", + 87 + ], + [ + "helm_capabilities", + 68 + ], + [ + "ace", + 32 + ], + [ + "apex-v1", + 19 + ], + [ + "appworld_test_normal", + 15 + ], + [ + "browsecompplus", + 15 + ], + [ + "swe-bench", + 15 + ], + [ + "tau-bench-2_airline", + 15 + ], + [ + "tau-bench-2_retail", + 15 + ], + [ + "tau-bench-2_telecom", + 15 + ], + [ + "la_leaderboard", + 5 + ], + [ + "theory_of_mind", + 1 + ] + ], + "metric_kind": [ + [ + "fibble_arena", + 336 + ], + [ + "helm_classic", + 201 + ], + [ + "helm_lite", + 182 + ], + [ + "livecodebenchpro", + 87 + ], + [ + "helm_capabilities", + 68 + ], + [ + "ace", + 32 + ], + [ + "apex-v1", + 19 + ], + [ + "appworld_test_normal", + 15 + ], + [ + "browsecompplus", + 15 + ], + [ + "swe-bench", + 15 + ], + [ + "tau-bench-2_airline", + 15 + ], + [ + "tau-bench-2_retail", + 15 + ], + [ + "tau-bench-2_telecom", + 15 + ], + [ + "la_leaderboard", + 5 + ], + [ + "theory_of_mind", + 1 + ] + ], + "metric_unit": [ + [ + "fibble_arena", + 336 + ], + [ + "helm_classic", + 201 + ], + [ + "helm_lite", + 182 + ], + [ + "livecodebenchpro", + 87 + ], + [ + "helm_capabilities", + 68 + ], + [ + "ace", + 32 + ], + [ + "apex-v1", + 19 + ], + [ + "appworld_test_normal", + 15 + ], + [ + "browsecompplus", + 15 + ], + [ + "swe-bench", + 15 + ], + [ + "tau-bench-2_airline", + 15 + ], + [ + "tau-bench-2_retail", + 15 + ], + [ + "tau-bench-2_telecom", + 15 + ], + [ + "la_leaderboard", + 5 + ], + [ + "theory_of_mind", + 1 + ] + ] + } +} \ No newline at end of file diff --git a/audit/audit_before.json b/audit/audit_before.json new file mode 100644 index 000000000..7b432cb88 --- /dev/null +++ b/audit/audit_before.json @@ -0,0 +1,426 @@ +{ + "files_scanned": 6448, + "results_scanned": 49659, + "missing": { + "evaluation_result_id": 37071, + "metric_id": 37071, + "metric_name": 37071, + "metric_kind": 37071, + "metric_unit": 37071 + }, + "malformed": { + 
"evaluation_result_id_pattern": 12588 + }, + "top_missing_by_benchmark": { + "evaluation_result_id": [ + [ + "hfopenllm_v2", + 27444 + ], + [ + "helm_mmlu", + 2844 + ], + [ + "reward-bench", + 2404 + ], + [ + "helm_classic", + 1005 + ], + [ + "global-mmlu-lite", + 912 + ], + [ + "helm_lite", + 910 + ], + [ + "fibble_arena", + 559 + ], + [ + "helm_capabilities", + 408 + ], + [ + "wordle_arena", + 134 + ], + [ + "terminal-bench-2.0", + 115 + ], + [ + "livecodebenchpro", + 87 + ], + [ + "apex-agents", + 74 + ], + [ + "ace", + 32 + ], + [ + "helm_instruct", + 28 + ], + [ + "apex-v1", + 19 + ], + [ + "appworld_test_normal", + 15 + ], + [ + "browsecompplus", + 15 + ], + [ + "swe-bench", + 15 + ], + [ + "tau-bench-2_airline", + 15 + ], + [ + "tau-bench-2_retail", + 15 + ] + ], + "metric_id": [ + [ + "hfopenllm_v2", + 27444 + ], + [ + "helm_mmlu", + 2844 + ], + [ + "reward-bench", + 2404 + ], + [ + "helm_classic", + 1005 + ], + [ + "global-mmlu-lite", + 912 + ], + [ + "helm_lite", + 910 + ], + [ + "fibble_arena", + 559 + ], + [ + "helm_capabilities", + 408 + ], + [ + "wordle_arena", + 134 + ], + [ + "terminal-bench-2.0", + 115 + ], + [ + "livecodebenchpro", + 87 + ], + [ + "apex-agents", + 74 + ], + [ + "ace", + 32 + ], + [ + "helm_instruct", + 28 + ], + [ + "apex-v1", + 19 + ], + [ + "appworld_test_normal", + 15 + ], + [ + "browsecompplus", + 15 + ], + [ + "swe-bench", + 15 + ], + [ + "tau-bench-2_airline", + 15 + ], + [ + "tau-bench-2_retail", + 15 + ] + ], + "metric_name": [ + [ + "hfopenllm_v2", + 27444 + ], + [ + "helm_mmlu", + 2844 + ], + [ + "reward-bench", + 2404 + ], + [ + "helm_classic", + 1005 + ], + [ + "global-mmlu-lite", + 912 + ], + [ + "helm_lite", + 910 + ], + [ + "fibble_arena", + 559 + ], + [ + "helm_capabilities", + 408 + ], + [ + "wordle_arena", + 134 + ], + [ + "terminal-bench-2.0", + 115 + ], + [ + "livecodebenchpro", + 87 + ], + [ + "apex-agents", + 74 + ], + [ + "ace", + 32 + ], + [ + "helm_instruct", + 28 + ], + [ + "apex-v1", + 19 + ], + [ + 
"appworld_test_normal", + 15 + ], + [ + "browsecompplus", + 15 + ], + [ + "swe-bench", + 15 + ], + [ + "tau-bench-2_airline", + 15 + ], + [ + "tau-bench-2_retail", + 15 + ] + ], + "metric_kind": [ + [ + "hfopenllm_v2", + 27444 + ], + [ + "helm_mmlu", + 2844 + ], + [ + "reward-bench", + 2404 + ], + [ + "helm_classic", + 1005 + ], + [ + "global-mmlu-lite", + 912 + ], + [ + "helm_lite", + 910 + ], + [ + "fibble_arena", + 559 + ], + [ + "helm_capabilities", + 408 + ], + [ + "wordle_arena", + 134 + ], + [ + "terminal-bench-2.0", + 115 + ], + [ + "livecodebenchpro", + 87 + ], + [ + "apex-agents", + 74 + ], + [ + "ace", + 32 + ], + [ + "helm_instruct", + 28 + ], + [ + "apex-v1", + 19 + ], + [ + "appworld_test_normal", + 15 + ], + [ + "browsecompplus", + 15 + ], + [ + "swe-bench", + 15 + ], + [ + "tau-bench-2_airline", + 15 + ], + [ + "tau-bench-2_retail", + 15 + ] + ], + "metric_unit": [ + [ + "hfopenllm_v2", + 27444 + ], + [ + "helm_mmlu", + 2844 + ], + [ + "reward-bench", + 2404 + ], + [ + "helm_classic", + 1005 + ], + [ + "global-mmlu-lite", + 912 + ], + [ + "helm_lite", + 910 + ], + [ + "fibble_arena", + 559 + ], + [ + "helm_capabilities", + 408 + ], + [ + "wordle_arena", + 134 + ], + [ + "terminal-bench-2.0", + 115 + ], + [ + "livecodebenchpro", + 87 + ], + [ + "apex-agents", + 74 + ], + [ + "ace", + 32 + ], + [ + "helm_instruct", + 28 + ], + [ + "apex-v1", + 19 + ], + [ + "appworld_test_normal", + 15 + ], + [ + "browsecompplus", + 15 + ], + [ + "swe-bench", + 15 + ], + [ + "tau-bench-2_airline", + 15 + ], + [ + "tau-bench-2_retail", + 15 + ] + ] + } +} \ No newline at end of file diff --git a/audit/dataset_statistics.json b/audit/dataset_statistics.json new file mode 100644 index 000000000..dffd83d39 --- /dev/null +++ b/audit/dataset_statistics.json @@ -0,0 +1,2407 @@ +{ + "descriptive": { + "counts": { + "result_rows": 40495, + "unique_benchmarks": 59, + "unique_developers": 794, + "unique_evaluations": 178, + "unique_models": 5299 + }, + "inference_engines": [ + { 
+ "count": 39618, + "value": "unknown" + }, + { + "count": 450, + "value": "ollama" + }, + { + "count": 150, + "value": "openai" + }, + { + "count": 54, + "value": "google" + }, + { + "count": 47, + "value": "anthropic" + }, + { + "count": 33, + "value": "gemini" + }, + { + "count": 30, + "value": "openrouter" + }, + { + "count": 26, + "value": "deepseek" + }, + { + "count": 18, + "value": "minimax" + }, + { + "count": 15, + "value": "moonshot" + }, + { + "count": 15, + "value": "ark" + }, + { + "count": 12, + "value": "zhipu" + }, + { + "count": 12, + "value": "qwen" + }, + { + "count": 12, + "value": "aliyun" + }, + { + "count": 3, + "value": "kuaishou" + } + ], + "models_per_benchmark": [ + { + "benchmark": "GPQA", + "result_rows": 4635, + "unique_models": 4557 + }, + { + "benchmark": "IFEval", + "result_rows": 4635, + "unique_models": 4557 + }, + { + "benchmark": "BBH", + "result_rows": 4574, + "unique_models": 4496 + }, + { + "benchmark": "MATH Level 5", + "result_rows": 4574, + "unique_models": 4496 + }, + { + "benchmark": "MMLU-PRO", + "result_rows": 4574, + "unique_models": 4496 + }, + { + "benchmark": "MUSR", + "result_rows": 4574, + "unique_models": 4496 + }, + { + "benchmark": "RewardBench 2", + "result_rows": 1379, + "unique_models": 197 + }, + { + "benchmark": "RewardBench", + "result_rows": 1025, + "unique_models": 179 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "result_rows": 1020, + "unique_models": 139 + }, + { + "benchmark": "BFCL leaderboard CSV", + "result_rows": 3350, + "unique_models": 109 + }, + { + "benchmark": "GSM8K", + "result_rows": 91, + "unique_models": 91 + }, + { + "benchmark": "LegalBench", + "result_rows": 91, + "unique_models": 91 + }, + { + "benchmark": "MATH", + "result_rows": 91, + "unique_models": 91 + }, + { + "benchmark": "MMLU", + "result_rows": 91, + "unique_models": 91 + }, + { + "benchmark": "MedQA", + "result_rows": 91, + "unique_models": 91 + }, + { + "benchmark": "NarrativeQA", + "result_rows": 
91, + "unique_models": 91 + }, + { + "benchmark": "NaturalQuestions (closed-book)", + "result_rows": 91, + "unique_models": 91 + }, + { + "benchmark": "OpenbookQA", + "result_rows": 91, + "unique_models": 91 + }, + { + "benchmark": "WMT 2014", + "result_rows": 91, + "unique_models": 91 + }, + { + "benchmark": "helm_lite", + "result_rows": 91, + "unique_models": 91 + }, + { + "benchmark": "helm_mmlu", + "result_rows": 2844, + "unique_models": 79 + }, + { + "benchmark": "MMLU-Pro", + "result_rows": 61, + "unique_models": 61 + }, + { + "benchmark": "Omni-MATH", + "result_rows": 61, + "unique_models": 61 + }, + { + "benchmark": "WildBench", + "result_rows": 61, + "unique_models": 61 + }, + { + "benchmark": "helm_capabilities", + "result_rows": 61, + "unique_models": 61 + }, + { + "benchmark": "Wordle Arena Word Set", + "result_rows": 75, + "unique_models": 43 + }, + { + "benchmark": "Fibble Arena (1 lie) Word Set", + "result_rows": 64, + "unique_models": 40 + }, + { + "benchmark": "SciArena leaderboard API", + "result_rows": 114, + "unique_models": 38 + }, + { + "benchmark": "Fibble2 Arena (2 lies) Word Set", + "result_rows": 46, + "unique_models": 38 + }, + { + "benchmark": "Fibble5 Arena (5 lies) Word Set", + "result_rows": 50, + "unique_models": 37 + }, + { + "benchmark": "Fibble3 Arena (3 lies) Word Set", + "result_rows": 40, + "unique_models": 37 + }, + { + "benchmark": "Fibble4 Arena (4 lies) Word Set", + "result_rows": 38, + "unique_models": 36 + }, + { + "benchmark": "wordle_arena_daily", + "result_rows": 92, + "unique_models": 32 + }, + { + "benchmark": "fibble4_arena_daily", + "result_rows": 84, + "unique_models": 28 + }, + { + "benchmark": "fibble5_arena_daily", + "result_rows": 84, + "unique_models": 28 + }, + { + "benchmark": "fibble_arena_daily", + "result_rows": 82, + "unique_models": 28 + }, + { + "benchmark": "global-mmlu-lite", + "result_rows": 912, + "unique_models": 27 + }, + { + "benchmark": "Easy Problems", + "result_rows": 29, + "unique_models": 
27 + }, + { + "benchmark": "Hard Problems", + "result_rows": 29, + "unique_models": 27 + }, + { + "benchmark": "Medium Problems", + "result_rows": 29, + "unique_models": 27 + }, + { + "benchmark": "fibble3_arena_daily", + "result_rows": 75, + "unique_models": 25 + }, + { + "benchmark": "fibble2_arena_daily", + "result_rows": 66, + "unique_models": 22 + }, + { + "benchmark": "apex-agents", + "result_rows": 74, + "unique_models": 20 + }, + { + "benchmark": "ace", + "result_rows": 32, + "unique_models": 12 + }, + { + "benchmark": "apex-v1", + "result_rows": 19, + "unique_models": 10 + }, + { + "benchmark": "La Leaderboard composite dataset", + "result_rows": 5, + "unique_models": 5 + }, + { + "benchmark": "Anthropic RLHF dataset", + "result_rows": 4, + "unique_models": 4 + }, + { + "benchmark": "Best ChatGPT Prompts", + "result_rows": 4, + "unique_models": 4 + }, + { + "benchmark": "Koala test dataset", + "result_rows": 4, + "unique_models": 4 + }, + { + "benchmark": "Open Assistant", + "result_rows": 4, + "unique_models": 4 + }, + { + "benchmark": "Self Instruct", + "result_rows": 4, + "unique_models": 4 + }, + { + "benchmark": "Vicuna", + "result_rows": 4, + "unique_models": 4 + }, + { + "benchmark": "helm_instruct", + "result_rows": 4, + "unique_models": 4 + }, + { + "benchmark": "appworld/test_normal", + "result_rows": 15, + "unique_models": 3 + }, + { + "benchmark": "browsecompplus", + "result_rows": 15, + "unique_models": 3 + }, + { + "benchmark": "swe-bench", + "result_rows": 15, + "unique_models": 3 + }, + { + "benchmark": "tau-bench-2/airline", + "result_rows": 15, + "unique_models": 3 + }, + { + "benchmark": "tau-bench-2/retail", + "result_rows": 15, + "unique_models": 3 + }, + { + "benchmark": "tau-bench-2/telecom", + "result_rows": 15, + "unique_models": 3 + } + ], + "normalization_exclusions": { + "incompatible_score_type": 0, + "missing_bounds": 0, + "missing_score": 0, + "out_of_range": 100, + "zero_width_bounds": 0 + }, + "normalized_score_summaries": 
[ + { + "benchmark": "GPQA", + "count": 4635, + "evaluation_name": "GPQA", + "max": 0.791, + "mean": 0.30281846817691477, + "median": 0.2953, + "min": 0.168, + "stddev": 0.04912650528590854 + }, + { + "benchmark": "IFEval", + "count": 4635, + "evaluation_name": "IFEval", + "max": 0.951, + "mean": 0.46067240560949296, + "median": 0.4545, + "min": 0.0, + "stddev": 0.20767533842318336 + }, + { + "benchmark": "BBH", + "count": 4574, + "evaluation_name": "BBH", + "max": 0.8269, + "mean": 0.4867208351552252, + "median": 0.5038, + "min": 0.2178, + "stddev": 0.11398463853942328 + }, + { + "benchmark": "MATH Level 5", + "count": 4574, + "evaluation_name": "MATH Level 5", + "max": 0.7145, + "mean": 0.1555723874070835, + "median": 0.108, + "min": 0.0, + "stddev": 0.14625658002062183 + }, + { + "benchmark": "MMLU-PRO", + "count": 4574, + "evaluation_name": "MMLU-PRO", + "max": 0.7303, + "mean": 0.32874433756012245, + "median": 0.34475, + "min": 0.1026, + "stddev": 0.12833971558059434 + }, + { + "benchmark": "MUSR", + "count": 4574, + "evaluation_name": "MUSR", + "max": 0.6024, + "mean": 0.40635732400524704, + "median": 0.4091, + "min": 0.2929, + "stddev": 0.04536121071938266 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 266, + "evaluation_name": "v2_Semi_Private", + "max": 1.0, + "mean": 0.5578856391307715, + "median": 0.75515, + "min": 0.0, + "stddev": 0.44976366617156693 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 262, + "evaluation_name": "v1_Semi_Private", + "max": 0.9999805606556713, + "mean": 0.7136057730617251, + "median": 0.92835, + "min": 0.0, + "stddev": 0.3413295062389333 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 250, + "evaluation_name": "v2_Public_Eval", + "max": 1.0, + "mean": 0.5578486693149027, + "median": 0.8591871038330539, + "min": 0.0, + "stddev": 0.46020565690537485 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 228, + 
"evaluation_name": "v1_Public_Eval", + "max": 0.999984448524537, + "mean": 0.750460640659595, + "median": 0.9602438445183996, + "min": 0.0175, + "stddev": 0.3138616973551216 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Factuality", + "max": 0.8716, + "mean": 0.6400781725888325, + "median": 0.6779, + "min": 0.0274, + "stddev": 0.14060436598989037 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Focus", + "max": 0.9838, + "mean": 0.6965137055837564, + "median": 0.7293, + "min": 0.0646, + "stddev": 0.1999740938960993 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Math", + "max": 0.898, + "mean": 0.6002578680203046, + "median": 0.6175, + "min": 0.0546, + "stddev": 0.11530869084864068 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Precise IF", + "max": 0.6625, + "mean": 0.3724553299492386, + "median": 0.375, + "min": 0.1313, + "stddev": 0.06683254610514013 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Safety", + "max": 0.9756, + "mean": 0.770956345177665, + "median": 0.8044, + "min": 0.0378, + "stddev": 0.16859961817216138 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Score", + "max": 0.8413, + "mean": 0.602605076142132, + "median": 0.6194, + "min": 0.0576, + "stddev": 0.13540270878209892 + }, + { + "benchmark": "RewardBench 2", + "count": 191, + "evaluation_name": "Ties", + "max": 0.9063, + "mean": 0.5524884816753927, + "median": 0.5604, + "min": 0.008, + "stddev": 0.19526001389051642 + }, + { + "benchmark": "RewardBench", + "count": 187, + "evaluation_name": "Chat Hard", + "max": 0.9145, + "mean": 0.6117941176470588, + "median": 0.6053, + "min": 0.2654, + "stddev": 0.1713479724227396 + }, + { + "benchmark": "RewardBench", + "count": 187, + "evaluation_name": "Chat", + "max": 0.9944, + "mean": 0.8923390374331551, + "median": 0.9413, + "min": 0.3547, + "stddev": 
0.12437365150350695 + }, + { + "benchmark": "RewardBench", + "count": 187, + "evaluation_name": "Safety", + "max": 0.9514, + "mean": 0.75624064171123, + "median": 0.7946, + "min": 0.3743, + "stddev": 0.14897429003710377 + }, + { + "benchmark": "RewardBench", + "count": 187, + "evaluation_name": "Score", + "max": 0.9511, + "mean": 0.7524326203208556, + "median": 0.7455, + "min": 0.4727, + "stddev": 0.12766260032441618 + }, + { + "benchmark": "RewardBench", + "count": 172, + "evaluation_name": "Reasoning", + "max": 0.9912, + "mean": 0.779306976744186, + "median": 0.80125, + "min": 0.2821, + "stddev": 0.16510278548710738 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_accuracy", + "max": 0.9312, + "mean": 0.6721155963302752, + "median": 0.7076, + "min": 0.0, + "stddev": 0.16692855101327364 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_multiple_ast_accuracy", + "max": 0.9401999999999999, + "mean": 0.6615788990825688, + "median": 0.7104, + "min": 0.0, + "stddev": 0.17084967242914786 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_parallel_ast_accuracy", + "max": 0.9375, + "mean": 0.6427752293577982, + "median": 0.75, + "min": 0.0, + "stddev": 0.24460198666555008 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_parallel_multiple_ast_accuracy", + "max": 0.9582999999999999, + "mean": 0.5703339449541285, + "median": 0.625, + "min": 0.0, + "stddev": 0.2059801726435246 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_simple_ast_accuracy", + "max": 0.9031, + "mean": 0.726408256880734, + "median": 0.7636, + "min": 0.0, + "stddev": 0.1625125032958663 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.memory.accuracy", + "max": 0.7376, + "mean": 0.20235045871559632, + "median": 
0.157, + "min": 0.0, + "stddev": 0.1699218603771948 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.memory.kv_accuracy", + "max": 0.7097, + "mean": 0.13904036697247707, + "median": 0.0839, + "min": 0.0, + "stddev": 0.1515138492137527 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.memory.recursive_summarization_accuracy", + "max": 0.8323, + "mean": 0.2820403669724771, + "median": 0.271, + "min": 0.0, + "stddev": 0.208463795648454 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.memory.vector_accuracy", + "max": 0.7290000000000001, + "mean": 0.18597155963302753, + "median": 0.1161, + "min": 0.0, + "stddev": 0.18379301567138523 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.accuracy", + "max": 0.7737999999999999, + "mean": 0.23962385321100918, + "median": 0.165, + "min": 0.0, + "stddev": 0.21479676048452157 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.base_accuracy", + "max": 0.825, + "mean": 0.29009174311926605, + "median": 0.2, + "min": 0.0, + "stddev": 0.24897845144318115 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.long_context_accuracy", + "max": 0.76, + "mean": 0.24009174311926607, + "median": 0.175, + "min": 0.0, + "stddev": 0.2138372755020874 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.miss_function_accuracy", + "max": 0.77, + "mean": 0.21591743119266055, + "median": 0.14, + "min": 0.0, + "stddev": 0.2171396175036615 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.miss_parameter_accuracy", + "max": 0.74, + "mean": 0.21238532110091743, + "median": 0.15, + "min": 0.0, + "stddev": 0.194452693868985 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, 
+ "evaluation_name": "bfcl.non_live.ast_accuracy", + "max": 0.9065000000000001, + "mean": 0.7661733944954129, + "median": 0.83, + "min": 0.0, + "stddev": 0.18657086363085557 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.non_live.multiple_ast_accuracy", + "max": 0.97, + "mean": 0.8535779816513761, + "median": 0.92, + "min": 0.0, + "stddev": 0.182740318362281 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.non_live.parallel_ast_accuracy", + "max": 0.96, + "mean": 0.7979816513761467, + "median": 0.88, + "min": 0.0, + "stddev": 0.2273336991546167 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.non_live.parallel_multiple_ast_accuracy", + "max": 0.925, + "mean": 0.7347706422018349, + "median": 0.825, + "min": 0.0, + "stddev": 0.24427840192832814 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.non_live.simple_ast_accuracy", + "max": 0.8067, + "mean": 0.6783633027522936, + "median": 0.7258, + "min": 0.0, + "stddev": 0.14843039998882532 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.latency_mean_s", + "max": 0.9959969388355802, + "mean": 0.910949171600733, + "median": 0.9723906516748102, + "min": 0.0, + "stddev": 0.16788751393048792 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.latency_p95_s", + "max": 0.9983116129372659, + "mean": 0.9052860681766953, + "median": 0.9794227826729278, + "min": 0.0, + "stddev": 0.17750828285090742 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.latency_std_s", + "max": 0.9978872247523358, + "mean": 0.8712378709255851, + "median": 0.9528616366965585, + "min": 0.0, + "stddev": 0.18715211182331667 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.overall_accuracy", + "max": 
0.7746999999999999, + "mean": 0.3809394495412844, + "median": 0.3552, + "min": 0.0717, + "stddev": 0.1568359888890471 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.rank", + "max": 1.0, + "mean": 0.5, + "median": 0.5, + "min": 0.0, + "stddev": 0.2926814601721238 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.total_cost_usd", + "max": 0.9987048455669116, + "mean": 0.8673404362764129, + "median": 0.9486161556437762, + "min": 0.0, + "stddev": 0.2029161256124978 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.relevance.irrelevance_detection_accuracy", + "max": 1.0, + "mean": 0.7561073394495413, + "median": 0.8079000000000001, + "min": 0.06280000000000001, + "stddev": 0.16896574532662487 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.relevance.relevance_detection_accuracy", + "max": 1.0, + "mean": 0.7637614678899083, + "median": 0.8125, + "min": 0.0, + "stddev": 0.19862042242738473 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.web_search.accuracy", + "max": 0.845, + "mean": 0.24573394495412845, + "median": 0.105, + "min": 0.0, + "stddev": 0.28751797503234583 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.web_search.base_accuracy", + "max": 0.87, + "mean": 0.2646788990825688, + "median": 0.13, + "min": 0.0, + "stddev": 0.29552705211555524 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.web_search.no_snippet_accuracy", + "max": 0.85, + "mean": 0.22678899082568807, + "median": 0.09, + "min": 0.0, + "stddev": 0.28410639873751836 + }, + { + "benchmark": "RewardBench", + "count": 105, + "evaluation_name": "Prior Sets (0.5 weight)", + "max": 0.782, + "mean": 0.5625428571428571, + "median": 0.5757, + "min": 0.0, + "stddev": 0.17788750218625798 + }, + { + 
"benchmark": "LegalBench", + "count": 91, + "evaluation_name": "LegalBench", + "max": 0.757, + "mean": 0.5902087912087912, + "median": 0.629, + "min": 0.331, + "stddev": 0.11619442676283923 + }, + { + "benchmark": "MATH", + "count": 91, + "evaluation_name": "MATH", + "max": 0.92, + "mean": 0.5574065934065934, + "median": 0.656, + "min": 0.026, + "stddev": 0.2685588691111619 + }, + { + "benchmark": "MMLU", + "count": 91, + "evaluation_name": "MMLU", + "max": 0.809, + "mean": 0.6220989010989011, + "median": 0.643, + "min": 0.243, + "stddev": 0.12023218786489331 + }, + { + "benchmark": "MedQA", + "count": 91, + "evaluation_name": "MedQA", + "max": 0.863, + "mean": 0.6103296703296703, + "median": 0.64, + "min": 0.229, + "stddev": 0.15792234765120447 + }, + { + "benchmark": "NarrativeQA", + "count": 91, + "evaluation_name": "NarrativeQA", + "max": 0.804, + "mean": 0.6938461538461539, + "median": 0.742, + "min": 0.111, + "stddev": 0.1228501275789075 + }, + { + "benchmark": "NaturalQuestions (closed-book)", + "count": 91, + "evaluation_name": "NaturalQuestions (closed-book)", + "max": 0.502, + "mean": 0.3627912087912088, + "median": 0.378, + "min": 0.028, + "stddev": 0.08850543190907255 + }, + { + "benchmark": "OpenbookQA", + "count": 91, + "evaluation_name": "OpenbookQA", + "max": 0.972, + "mean": 0.8312527472527472, + "median": 0.882, + "min": 0.222, + "stddev": 0.16911788087383792 + }, + { + "benchmark": "WMT 2014", + "count": 91, + "evaluation_name": "WMT 2014", + "max": 0.262, + "mean": 0.18178021978021977, + "median": 0.191, + "min": 0.023, + "stddev": 0.04641450975187302 + }, + { + "benchmark": "helm_lite", + "count": 91, + "evaluation_name": "Mean win rate", + "max": 0.938, + "mean": 0.499967032967033, + "median": 0.488, + "min": 0.041, + "stddev": 0.24004497034928224 + }, + { + "benchmark": "GSM8K", + "count": 90, + "evaluation_name": "GSM8K", + "max": 0.956, + "mean": 0.6740333333333334, + "median": 0.765, + "min": 0.028, + "stddev": 0.24790177694247365 + }, + { 
+ "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Abstract Algebra", + "max": 0.84, + "mean": 0.4692405063291139, + "median": 0.44, + "min": 0.21, + "stddev": 0.1566784405169303 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Anatomy", + "max": 0.911, + "mean": 0.7049620253164557, + "median": 0.719, + "min": 0.222, + "stddev": 0.12203524533321435 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Astronomy", + "max": 0.974, + "mean": 0.8196835443037974, + "median": 0.855, + "min": 0.342, + "stddev": 0.12503810130124515 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Business Ethics", + "max": 0.89, + "mean": 0.7354430379746836, + "median": 0.77, + "min": 0.24, + "stddev": 0.1177001565076888 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Clinical Knowledge", + "max": 0.928, + "mean": 0.7806329113924051, + "median": 0.8, + "min": 0.26, + "stddev": 0.10518545005348215 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "College Physics", + "max": 0.863, + "mean": 0.5205189873417722, + "median": 0.51, + "min": 0.196, + "stddev": 0.13341576241396605 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Computer Security", + "max": 0.89, + "mean": 0.7888607594936708, + "median": 0.8, + "min": 0.3, + "stddev": 0.07740978772295665 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Conceptual Physics", + "max": 0.949, + "mean": 0.7394050632911392, + "median": 0.774, + "min": 0.319, + "stddev": 0.1436847973853721 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Econometrics", + "max": 0.807, + "mean": 0.5924556962025317, + "median": 0.614, + "min": 0.307, + "stddev": 0.12405156056525753 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Electrical Engineering", + "max": 0.869, + "mean": 0.7012531645569621, + "median": 0.724, + "min": 0.29, + 
"stddev": 0.10967007262512768 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Elementary Mathematics", + "max": 0.942, + "mean": 0.6168481012658228, + "median": 0.622, + "min": 0.254, + "stddev": 0.17076712953141734 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Formal Logic", + "max": 0.786, + "mean": 0.5559240506329114, + "median": 0.571, + "min": 0.27, + "stddev": 0.11667484646986527 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Global Facts", + "max": 0.8, + "mean": 0.49860759493670886, + "median": 0.5, + "min": 0.25, + "stddev": 0.11856767165669667 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "High School World History", + "max": 0.958, + "mean": 0.8590253164556962, + "median": 0.89, + "min": 0.253, + "stddev": 0.1104488482004626 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Human Sexuality", + "max": 0.939, + "mean": 0.7969367088607595, + "median": 0.84, + "min": 0.267, + "stddev": 0.14067149783040647 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "International Law", + "max": 0.959, + "mean": 0.8525189873417721, + "median": 0.884, + "min": 0.306, + "stddev": 0.09770414010589916 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Jurisprudence", + "max": 0.907, + "mean": 0.8231518987341773, + "median": 0.852, + "min": 0.25, + "stddev": 0.09722219971870344 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Logical Fallacies", + "max": 0.926, + "mean": 0.8139873417721519, + "median": 0.834, + "min": 0.264, + "stddev": 0.0972786763034739 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "MMLU All Subjects", + "max": 0.873, + "mean": 0.7308227848101266, + "median": 0.757, + "min": 0.295, + "stddev": 0.10005918242229046 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Machine Learning", + "max": 0.839, + "mean": 
0.592126582278481, + "median": 0.616, + "min": 0.286, + "stddev": 0.12807703682255595 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Management", + "max": 0.942, + "mean": 0.8453037974683544, + "median": 0.864, + "min": 0.272, + "stddev": 0.09395052631917909 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Marketing", + "max": 0.962, + "mean": 0.9024556962025316, + "median": 0.923, + "min": 0.269, + "stddev": 0.08556236254220637 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Mean win rate", + "max": 1.0, + "mean": 0.5000506329113924, + "median": 0.517, + "min": 0.014, + "stddev": 0.2741845671999428 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Medical Genetics", + "max": 0.98, + "mean": 0.8162025316455697, + "median": 0.84, + "min": 0.28, + "stddev": 0.11717074761250226 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Miscellaneous", + "max": 0.964, + "mean": 0.8688607594936709, + "median": 0.893, + "min": 0.292, + "stddev": 0.09859535722376811 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Moral Scenarios", + "max": 0.902, + "mean": 0.5793924050632911, + "median": 0.575, + "min": 0.231, + "stddev": 0.19478445797799818 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Nutrition", + "max": 0.928, + "mean": 0.7968987341772152, + "median": 0.82, + "min": 0.34, + "stddev": 0.1008295839442827 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Philosophy", + "max": 0.9, + "mean": 0.7844303797468355, + "median": 0.807, + "min": 0.325, + "stddev": 0.09312807331625374 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Prehistory", + "max": 0.951, + "mean": 0.824746835443038, + "median": 0.858, + "min": 0.318, + "stddev": 0.10757030716441658 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Professional Psychology", + "max": 
0.922, + "mean": 0.7793291139240506, + "median": 0.812, + "min": 0.232, + "stddev": 0.1177310844427953 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Public Relations", + "max": 0.855, + "mean": 0.724873417721519, + "median": 0.736, + "min": 0.345, + "stddev": 0.0757594653625247 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Security Studies", + "max": 0.886, + "mean": 0.778126582278481, + "median": 0.804, + "min": 0.408, + "stddev": 0.09570378540441088 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Sociology", + "max": 0.96, + "mean": 0.8729493670886076, + "median": 0.9, + "min": 0.383, + "stddev": 0.08587676004752948 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Us Foreign Policy", + "max": 0.97, + "mean": 0.8918987341772152, + "median": 0.92, + "min": 0.26, + "stddev": 0.09360413026947771 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Virology", + "max": 0.602, + "mean": 0.5457215189873418, + "median": 0.56, + "min": 0.392, + "stddev": 0.047070851318166546 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "World Religions", + "max": 0.924, + "mean": 0.8426455696202532, + "median": 0.865, + "min": 0.234, + "stddev": 0.08472202480187987 + }, + { + "benchmark": "MMLU-Pro", + "count": 61, + "evaluation_name": "MMLU-Pro", + "max": 0.875, + "mean": 0.6609344262295082, + "median": 0.723, + "min": 0.169, + "stddev": 0.1866150109050233 + } + ], + "quality": { + "has_uncertainty": 1603, + "incompatible_score_type": 0, + "missing_bounds": 0, + "missing_metadata": 0, + "missing_score": 0, + "out_of_range": 100, + "total_result_rows": 40495, + "zero_width_bounds": 0 + }, + "schema_versions": [ + { + "count": 40495, + "value": "0.2.2" + } + ], + "score_summaries": [ + { + "benchmark": "GPQA", + "count": 4635, + "evaluation_name": "GPQA", + "max": 0.791, + "mean": 0.30281846817691477, + "median": 0.2953, + "min": 0.168, 
+ "stddev": 0.04912650528590854 + }, + { + "benchmark": "IFEval", + "count": 4635, + "evaluation_name": "IFEval", + "max": 0.951, + "mean": 0.46067240560949296, + "median": 0.4545, + "min": 0.0, + "stddev": 0.20767533842318336 + }, + { + "benchmark": "BBH", + "count": 4574, + "evaluation_name": "BBH", + "max": 0.8269, + "mean": 0.4867208351552252, + "median": 0.5038, + "min": 0.2178, + "stddev": 0.11398463853942328 + }, + { + "benchmark": "MATH Level 5", + "count": 4574, + "evaluation_name": "MATH Level 5", + "max": 0.7145, + "mean": 0.1555723874070835, + "median": 0.108, + "min": 0.0, + "stddev": 0.14625658002062183 + }, + { + "benchmark": "MMLU-PRO", + "count": 4574, + "evaluation_name": "MMLU-PRO", + "max": 0.7303, + "mean": 0.32874433756012245, + "median": 0.34475, + "min": 0.1026, + "stddev": 0.12833971558059434 + }, + { + "benchmark": "MUSR", + "count": 4574, + "evaluation_name": "MUSR", + "max": 0.6024, + "mean": 0.40635732400524704, + "median": 0.4091, + "min": 0.2929, + "stddev": 0.04536121071938266 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 266, + "evaluation_name": "v2_Semi_Private", + "max": 77.16309638, + "mean": 1.3257351367669172, + "median": 0.09789999999999999, + "min": 0.0, + "stddev": 6.199066844791538 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 262, + "evaluation_name": "v1_Semi_Private", + "max": 44.25900135, + "mean": 0.8916221425572519, + "median": 0.30084999999999995, + "min": 0.0, + "stddev": 3.2441508923523688 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 250, + "evaluation_name": "v2_Public_Eval", + "max": 17.6, + "mean": 0.6595584, + "median": 0.08, + "min": 0.0, + "stddev": 2.152394923590425 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 228, + "evaluation_name": "v1_Public_Eval", + "max": 7.7201, + "mean": 0.5021848684210526, + "median": 0.3319, + "min": 0.0012, + "stddev": 0.8240755952564907 + }, + { + 
"benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Factuality", + "max": 0.8716, + "mean": 0.6400781725888325, + "median": 0.6779, + "min": 0.0274, + "stddev": 0.14060436598989037 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Focus", + "max": 0.9838, + "mean": 0.6965137055837564, + "median": 0.7293, + "min": 0.0646, + "stddev": 0.1999740938960993 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Math", + "max": 0.898, + "mean": 0.6002578680203046, + "median": 0.6175, + "min": 0.0546, + "stddev": 0.11530869084864068 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Precise IF", + "max": 0.6625, + "mean": 0.3724553299492386, + "median": 0.375, + "min": 0.1313, + "stddev": 0.06683254610514013 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Safety", + "max": 0.9756, + "mean": 0.770956345177665, + "median": 0.8044, + "min": 0.0378, + "stddev": 0.16859961817216138 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Score", + "max": 0.8413, + "mean": 0.602605076142132, + "median": 0.6194, + "min": 0.0576, + "stddev": 0.13540270878209892 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Ties", + "max": 0.9063, + "mean": 0.5353568527918782, + "median": 0.5529, + "min": -0.01, + "stddev": 0.21529016446306679 + }, + { + "benchmark": "RewardBench", + "count": 187, + "evaluation_name": "Chat Hard", + "max": 0.9145, + "mean": 0.6117941176470588, + "median": 0.6053, + "min": 0.2654, + "stddev": 0.1713479724227396 + }, + { + "benchmark": "RewardBench", + "count": 187, + "evaluation_name": "Chat", + "max": 0.9944, + "mean": 0.8923390374331551, + "median": 0.9413, + "min": 0.3547, + "stddev": 0.12437365150350695 + }, + { + "benchmark": "RewardBench", + "count": 187, + "evaluation_name": "Safety", + "max": 0.9514, + "mean": 0.75624064171123, + "median": 0.7946, + "min": 0.3743, + "stddev": 
0.14897429003710377 + }, + { + "benchmark": "RewardBench", + "count": 187, + "evaluation_name": "Score", + "max": 0.9511, + "mean": 0.7524326203208556, + "median": 0.7455, + "min": 0.4727, + "stddev": 0.12766260032441618 + }, + { + "benchmark": "RewardBench", + "count": 172, + "evaluation_name": "Reasoning", + "max": 0.9912, + "mean": 0.779306976744186, + "median": 0.80125, + "min": 0.2821, + "stddev": 0.16510278548710738 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_accuracy", + "max": 93.12, + "mean": 67.21155963302752, + "median": 70.76, + "min": 0.0, + "stddev": 16.692855101327364 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_multiple_ast_accuracy", + "max": 94.02, + "mean": 66.15788990825688, + "median": 71.04, + "min": 0.0, + "stddev": 17.084967242914786 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_parallel_ast_accuracy", + "max": 93.75, + "mean": 64.27752293577981, + "median": 75.0, + "min": 0.0, + "stddev": 24.46019866655501 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_parallel_multiple_ast_accuracy", + "max": 95.83, + "mean": 57.03339449541284, + "median": 62.5, + "min": 0.0, + "stddev": 20.59801726435246 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_simple_ast_accuracy", + "max": 90.31, + "mean": 72.64082568807339, + "median": 76.36, + "min": 0.0, + "stddev": 16.25125032958663 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.memory.accuracy", + "max": 73.76, + "mean": 20.235045871559635, + "median": 15.7, + "min": 0.0, + "stddev": 16.99218603771948 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.memory.kv_accuracy", + "max": 70.97, + "mean": 13.904036697247706, + "median": 8.39, + "min": 0.0, + 
"stddev": 15.15138492137527 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.memory.recursive_summarization_accuracy", + "max": 83.23, + "mean": 28.204036697247705, + "median": 27.1, + "min": 0.0, + "stddev": 20.8463795648454 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.memory.vector_accuracy", + "max": 72.9, + "mean": 18.597155963302754, + "median": 11.61, + "min": 0.0, + "stddev": 18.379301567138523 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.accuracy", + "max": 77.38, + "mean": 23.962385321100918, + "median": 16.5, + "min": 0.0, + "stddev": 21.479676048452156 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.base_accuracy", + "max": 82.5, + "mean": 29.009174311926607, + "median": 20.0, + "min": 0.0, + "stddev": 24.897845144318115 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.long_context_accuracy", + "max": 76.0, + "mean": 24.009174311926607, + "median": 17.5, + "min": 0.0, + "stddev": 21.38372755020874 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.miss_function_accuracy", + "max": 77.0, + "mean": 21.591743119266056, + "median": 14.0, + "min": 0.0, + "stddev": 21.713961750366153 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.miss_parameter_accuracy", + "max": 74.0, + "mean": 21.238532110091743, + "median": 15.0, + "min": 0.0, + "stddev": 19.445269386898502 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.non_live.ast_accuracy", + "max": 90.65, + "mean": 76.61733944954129, + "median": 83.0, + "min": 0.0, + "stddev": 18.657086363085554 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.non_live.multiple_ast_accuracy", + 
"max": 97.0, + "mean": 85.35779816513761, + "median": 92.0, + "min": 0.0, + "stddev": 18.274031836228097 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.non_live.parallel_ast_accuracy", + "max": 96.0, + "mean": 79.79816513761467, + "median": 88.0, + "min": 0.0, + "stddev": 22.733369915461672 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.non_live.parallel_multiple_ast_accuracy", + "max": 92.5, + "mean": 73.4770642201835, + "median": 82.5, + "min": 0.0, + "stddev": 24.427840192832814 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.non_live.simple_ast_accuracy", + "max": 80.67, + "mean": 67.83633027522936, + "median": 72.58, + "min": 0.0, + "stddev": 14.843039998882533 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.latency_mean_s", + "max": 169.87, + "mean": 15.127064220183486, + "median": 4.69, + "min": 0.68, + "stddev": 28.519051991371985 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.latency_p95_s", + "max": 568.59, + "mean": 53.85339449541284, + "median": 11.7, + "min": 0.96, + "stddev": 100.92943454619746 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.latency_std_s", + "max": 212.99, + "mean": 27.425045871559632, + "median": 10.04, + "min": 0.45, + "stddev": 39.86152829724822 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.overall_accuracy", + "max": 77.47, + "mean": 38.09394495412844, + "median": 35.52, + "min": 7.17, + "stddev": 15.683598888904708 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.rank", + "max": 109.0, + "mean": 55.0, + "median": 55.0, + "min": 1.0, + "stddev": 31.609597698589376 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": 
"bfcl.overall.total_cost_usd", + "max": 355.17, + "mean": 47.11669724770642, + "median": 18.25, + "min": 0.46, + "stddev": 72.06972033379084 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.relevance.irrelevance_detection_accuracy", + "max": 100.0, + "mean": 75.61073394495413, + "median": 80.79, + "min": 6.28, + "stddev": 16.896574532662488 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.relevance.relevance_detection_accuracy", + "max": 100.0, + "mean": 76.37614678899082, + "median": 81.25, + "min": 0.0, + "stddev": 19.86204224273847 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.web_search.accuracy", + "max": 84.5, + "mean": 24.573394495412845, + "median": 10.5, + "min": 0.0, + "stddev": 28.751797503234584 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.web_search.base_accuracy", + "max": 87.0, + "mean": 26.46788990825688, + "median": 13.0, + "min": 0.0, + "stddev": 29.552705211555523 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.web_search.no_snippet_accuracy", + "max": 85.0, + "mean": 22.678899082568808, + "median": 9.0, + "min": 0.0, + "stddev": 28.410639873751833 + }, + { + "benchmark": "RewardBench", + "count": 105, + "evaluation_name": "Prior Sets (0.5 weight)", + "max": 0.782, + "mean": 0.5625428571428571, + "median": 0.5757, + "min": 0.0, + "stddev": 0.17788750218625798 + }, + { + "benchmark": "GSM8K", + "count": 91, + "evaluation_name": "GSM8K", + "max": 0.956, + "mean": 0.6556373626373626, + "median": 0.762, + "min": -1.0, + "stddev": 0.30260192099278316 + }, + { + "benchmark": "LegalBench", + "count": 91, + "evaluation_name": "LegalBench", + "max": 0.757, + "mean": 0.5902087912087912, + "median": 0.629, + "min": 0.331, + "stddev": 0.11619442676283923 + }, + { + "benchmark": "MATH", + "count": 91, + "evaluation_name": "MATH", + "max": 0.92, 
+ "mean": 0.5574065934065934, + "median": 0.656, + "min": 0.026, + "stddev": 0.2685588691111619 + }, + { + "benchmark": "MMLU", + "count": 91, + "evaluation_name": "MMLU", + "max": 0.809, + "mean": 0.6220989010989011, + "median": 0.643, + "min": 0.243, + "stddev": 0.12023218786489331 + }, + { + "benchmark": "MedQA", + "count": 91, + "evaluation_name": "MedQA", + "max": 0.863, + "mean": 0.6103296703296703, + "median": 0.64, + "min": 0.229, + "stddev": 0.15792234765120447 + }, + { + "benchmark": "NarrativeQA", + "count": 91, + "evaluation_name": "NarrativeQA", + "max": 0.804, + "mean": 0.6938461538461539, + "median": 0.742, + "min": 0.111, + "stddev": 0.1228501275789075 + }, + { + "benchmark": "NaturalQuestions (closed-book)", + "count": 91, + "evaluation_name": "NaturalQuestions (closed-book)", + "max": 0.502, + "mean": 0.3627912087912088, + "median": 0.378, + "min": 0.028, + "stddev": 0.08850543190907255 + }, + { + "benchmark": "OpenbookQA", + "count": 91, + "evaluation_name": "OpenbookQA", + "max": 0.972, + "mean": 0.8312527472527472, + "median": 0.882, + "min": 0.222, + "stddev": 0.16911788087383792 + }, + { + "benchmark": "WMT 2014", + "count": 91, + "evaluation_name": "WMT 2014", + "max": 0.262, + "mean": 0.18178021978021977, + "median": 0.191, + "min": 0.023, + "stddev": 0.04641450975187302 + }, + { + "benchmark": "helm_lite", + "count": 91, + "evaluation_name": "Mean win rate", + "max": 0.938, + "mean": 0.499967032967033, + "median": 0.488, + "min": 0.041, + "stddev": 0.24004497034928224 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Abstract Algebra", + "max": 0.84, + "mean": 0.4692405063291139, + "median": 0.44, + "min": 0.21, + "stddev": 0.1566784405169303 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Anatomy", + "max": 0.911, + "mean": 0.7049620253164557, + "median": 0.719, + "min": 0.222, + "stddev": 0.12203524533321435 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": 
"Astronomy", + "max": 0.974, + "mean": 0.8196835443037974, + "median": 0.855, + "min": 0.342, + "stddev": 0.12503810130124515 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Business Ethics", + "max": 0.89, + "mean": 0.7354430379746836, + "median": 0.77, + "min": 0.24, + "stddev": 0.1177001565076888 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Clinical Knowledge", + "max": 0.928, + "mean": 0.7806329113924051, + "median": 0.8, + "min": 0.26, + "stddev": 0.10518545005348215 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "College Physics", + "max": 0.863, + "mean": 0.5205189873417722, + "median": 0.51, + "min": 0.196, + "stddev": 0.13341576241396605 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Computer Security", + "max": 0.89, + "mean": 0.7888607594936708, + "median": 0.8, + "min": 0.3, + "stddev": 0.07740978772295665 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Conceptual Physics", + "max": 0.949, + "mean": 0.7394050632911392, + "median": 0.774, + "min": 0.319, + "stddev": 0.1436847973853721 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Econometrics", + "max": 0.807, + "mean": 0.5924556962025317, + "median": 0.614, + "min": 0.307, + "stddev": 0.12405156056525753 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Electrical Engineering", + "max": 0.869, + "mean": 0.7012531645569621, + "median": 0.724, + "min": 0.29, + "stddev": 0.10967007262512768 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Elementary Mathematics", + "max": 0.942, + "mean": 0.6168481012658228, + "median": 0.622, + "min": 0.254, + "stddev": 0.17076712953141734 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Formal Logic", + "max": 0.786, + "mean": 0.5559240506329114, + "median": 0.571, + "min": 0.27, + "stddev": 0.11667484646986527 + }, + { + "benchmark": 
"helm_mmlu", + "count": 79, + "evaluation_name": "Global Facts", + "max": 0.8, + "mean": 0.49860759493670886, + "median": 0.5, + "min": 0.25, + "stddev": 0.11856767165669667 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "High School World History", + "max": 0.958, + "mean": 0.8590253164556962, + "median": 0.89, + "min": 0.253, + "stddev": 0.1104488482004626 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Human Sexuality", + "max": 0.939, + "mean": 0.7969367088607595, + "median": 0.84, + "min": 0.267, + "stddev": 0.14067149783040647 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "International Law", + "max": 0.959, + "mean": 0.8525189873417721, + "median": 0.884, + "min": 0.306, + "stddev": 0.09770414010589916 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Jurisprudence", + "max": 0.907, + "mean": 0.8231518987341773, + "median": 0.852, + "min": 0.25, + "stddev": 0.09722219971870344 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Logical Fallacies", + "max": 0.926, + "mean": 0.8139873417721519, + "median": 0.834, + "min": 0.264, + "stddev": 0.0972786763034739 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "MMLU All Subjects", + "max": 0.873, + "mean": 0.7308227848101266, + "median": 0.757, + "min": 0.295, + "stddev": 0.10005918242229046 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Machine Learning", + "max": 0.839, + "mean": 0.592126582278481, + "median": 0.616, + "min": 0.286, + "stddev": 0.12807703682255595 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Management", + "max": 0.942, + "mean": 0.8453037974683544, + "median": 0.864, + "min": 0.272, + "stddev": 0.09395052631917909 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Marketing", + "max": 0.962, + "mean": 0.9024556962025316, + "median": 0.923, + "min": 0.269, + "stddev": 
0.08556236254220637 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Mean win rate", + "max": 1.0, + "mean": 0.5000506329113924, + "median": 0.517, + "min": 0.014, + "stddev": 0.2741845671999428 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Medical Genetics", + "max": 0.98, + "mean": 0.8162025316455697, + "median": 0.84, + "min": 0.28, + "stddev": 0.11717074761250226 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Miscellaneous", + "max": 0.964, + "mean": 0.8688607594936709, + "median": 0.893, + "min": 0.292, + "stddev": 0.09859535722376811 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Moral Scenarios", + "max": 0.902, + "mean": 0.5793924050632911, + "median": 0.575, + "min": 0.231, + "stddev": 0.19478445797799818 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Nutrition", + "max": 0.928, + "mean": 0.7968987341772152, + "median": 0.82, + "min": 0.34, + "stddev": 0.1008295839442827 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Philosophy", + "max": 0.9, + "mean": 0.7844303797468355, + "median": 0.807, + "min": 0.325, + "stddev": 0.09312807331625374 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Prehistory", + "max": 0.951, + "mean": 0.824746835443038, + "median": 0.858, + "min": 0.318, + "stddev": 0.10757030716441658 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Professional Psychology", + "max": 0.922, + "mean": 0.7793291139240506, + "median": 0.812, + "min": 0.232, + "stddev": 0.1177310844427953 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Public Relations", + "max": 0.855, + "mean": 0.724873417721519, + "median": 0.736, + "min": 0.345, + "stddev": 0.0757594653625247 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Security Studies", + "max": 0.886, + "mean": 0.778126582278481, + "median": 0.804, + 
"min": 0.408, + "stddev": 0.09570378540441088 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Sociology", + "max": 0.96, + "mean": 0.8729493670886076, + "median": 0.9, + "min": 0.383, + "stddev": 0.08587676004752948 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Us Foreign Policy", + "max": 0.97, + "mean": 0.8918987341772152, + "median": 0.92, + "min": 0.26, + "stddev": 0.09360413026947771 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Virology", + "max": 0.602, + "mean": 0.5457215189873418, + "median": 0.56, + "min": 0.392, + "stddev": 0.047070851318166546 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "World Religions", + "max": 0.924, + "mean": 0.8426455696202532, + "median": 0.865, + "min": 0.234, + "stddev": 0.08472202480187987 + }, + { + "benchmark": "MMLU-Pro", + "count": 61, + "evaluation_name": "MMLU-Pro", + "max": 0.875, + "mean": 0.6609344262295082, + "median": 0.723, + "min": 0.169, + "stddev": 0.1866150109050233 + } + ] + }, + "observational": { + "exclusions": { + "incompatible_score_type": 0, + "missing_bounds": 0, + "missing_score": 0, + "out_of_range": 100, + "zero_width_bounds": 0 + }, + "valid_normalized_rows": 40395 + } +} diff --git a/audit/dataset_statistics_plots/coverage_counts.pdf b/audit/dataset_statistics_plots/coverage_counts.pdf new file mode 100644 index 0000000000000000000000000000000000000000..6ac4008ecd57c03cc296ec0e43acf30bc6503d4a GIT binary patch literal 16555 zcmb_^1yohf6R-#pzb^Qu!|P1j_xjIzI)Fn%;$F&;?u}xv0oJBwLg9D6^T7+0lr|I<&fGM_I}5DZ`ZHr#%A@P z*L>2wA(QJ2#u`iw%j6svxAUisRAXkANls3@-7xV-3D-ao$+%_fw$sz6TwNtc>J^gk0@j5t_o=jc-epWN*Dde7B*x%ewrj zt|=qR;RS}B1?{+G=|C3K@w!%{e?N|?w#4%*!mTe_JjiJB`PJ3jI}Pr>WmVPQXZ<7O z9qZ;FtgPSt_DbmSB_92U$~4q>3NJstq@Ut8F%!|#A+WmH^X0YgXHS6(4vM+Yi!8^_ zv*=>__>y;29H1-H5g!>=-!@e;Eo3Xa)vBU8QQu5C&+VEwHqkNWBa%LGJkg?W$4h-CV7)7^HfV7&k%cK~<_K@T zW&N}HXP>84V{bpbbYy{aALV^Eraf>Yos$y%1qRp)kK)MiS_BOh}u~KSZPY 
zwS7$^SLK`j*_#~Iq~3_>Pq*A6h~*q2DD#RM8jZuXFETa{H_)0%f7XoCaC&%8v8Av1 z{34SZM%!0=VED48y_Na!JUA4hiPW^OkquT?(%~8|eM7ArCva~%bN4K@4sG*YZhnH$ ztRpH-Pi)npRk%z0*CRX{8>_F}?mi)zb#A@eO++&X`m_i$+MZZTvDTPWbwMpGpd@Z5 zd-h3WV_V9noZiB|eVd-MoX1Y;-yN21l7lJF-Bdr-J=^U%ph~GNw-hnpkwf(8m54+C z#Ix6D!5M?1wI#WBMWy}u3mkfj-Q%|W6ZSu_M!L^K7u~D!(`^*xV4Wjs7y3uO@r(Jd z*bk!j+G(#AUo*zIgbX{YzI0j^VP6iiBzvs)Sr6t}FdQ(~kvoaA~xo>i3`pb1$ zge)C(xYyFnEO{y^Rm$1Nu?Z!|i6#Q>s3#AoKi{Gs5s~?tp^H^r*9>{3a{C3*H{+m(j3fqv_=53C;{>_V9enU!p-ne`u2qP5yxa1C1I2!H@Xi z2M#PB2-Jrf{)Bag*;j<1cXT9?Pde7uQqqys3H=wIsP7rmGN2E%6#xE_NSJQ8+{UZXs{=SUBn={;PS z@JY-r^VnL5K@;C%@}?Q5E{ll8i5dE_?K$ozk+$TC^?5IRXhiM~N_=4vnwlp}=zU+TPxAvWQ3;EMooDl^^;lq!9XlON-^AVl*CtpmK(C@0EfH+j6#s zElJP*%Th#8CSj~uPSQ?wFHZ}=1WuUW_n@Zl|^Wqa%PTlhj5&Z+s4Qhu1zy&sI* z0Sk2>-Q*scvHV0Bk`|2r_B0b?<_?}=Z0cvNC(TY>;Wkeq9Yw6HW_iGT?vo-833m7! zR&F89EOK*rYdW=}y~ET8cb?k{C+xQQ$?_>I-c)S%Zl@|^m@ zB)ZuB)6C?^&A)cba)c}ae&J(+Mv8h_yPv8{hcjO_oz3w}VgZSBeB}A{r1uP&%&rh| z&&jk!M4cJi|DfwXzlO7P$!Zk~zp=yyNHpL7<~Lfs*%?ogU|Wp>5^!h9v+v^51J?q%PG2Do1C9~xesnd#zB`#?DC5>8l?9vk6GdZm1lbyh7! 
zLQu0>fl+LL{cUk-RF=G0y*b$s;fStK5hl_%qJ&WAX2&8sqU>w42CY=NfN(u>`6&I}G} zoTE}jF*Ca7w#%g2HEH&6!yTs;W|@Nv?>!%HF4(vg7ed&y!bDC|XdCD9HeWNYc<-9O zJF$7s@$g|ka)Z#ilEu@LraDEcGzL;S;j5>qWag^M7V7rQBU{(5Z&+l%4TcXx-3 zn-p&Z;f0zUlicDIiz5;p@Yugn5LC&r3a7=nrQ0f73r#y_^Gf?XLNs-r-J&iX9k(KW zkM3ho^NZCJ+enVcP>J;y)WSOM*K$?ZSqgHTwrCxLOU3v{=rn_zi?G7 zuF@G7K_s)9dz9t+X8hWlNe!nZi8Wr;2j}a~p1vzz&?fWnZ5s@@Czsx; z`DrCaUl`GhEzJ5*leZ8%$oo)G^3s`1d%>G&GO8?0^m64+_cEgsLxOtWZ6Q2zc`|rj z=nZQ!MVC9P8b`yX7*1ctRScuIAg<79XoX*GPA^ieW6TZ|k87g7r#t&*+ivPQ)5TVW zF4L~G8#K1~F%gSoP5gEhUM#FPn8tZ`s)7lkYim6WB&+#9`A*F1?LP6LII|_`-66@o zu{iSJx`i+&nF2LsHA~WJe7VbquJTu}jIRj0oP|QX1QoxEh<(;wWp`^)=f*FI$Yph+ z6DRxBy+Y98)_DB(r@b$6Ya0xdVi*xD90t!xe{dM_FVzcMZqI$n{xap2K7BQTQ`TKbF7qU7#`8<``wwAvX65-85*09-j@;C^B_^nwT31?<20WTYiqmZg zGDx27Fns%Z8gXv)0*#e?T5d@0fUlxbmsSbu)OiNZks{2X0TQXX$T_7#s#kt_!C}4q zt!%{X9Svc6pY^q?4NaS8jb4sBOW(%A3-C<%2QT>e{$UTJ;}zrS5d<LA6 z8Ou(dT{ZiCuW7*47txI=tI1Wpfs&klinj}4*JRJ+u{mGl7|bn&Q&UvPQ+3_s{z!V8=!_prT=Ce+$44K8p9+%aQJkLC zxqsJckdDkB#lRRDkox({yW-4V&FJ!GCBAnNG7fEDR|#I5XW`FO_>%kE#>0lnGz>t9bmAA)~SE>(6cO z&ZtE&bXqO;RT|RhLoKW&q$2J$GAib;lPKbxoh+aVt;C-@x z?mCqXv2`D>+sa7t7hqR$gGlN+nyAan_)w_ujc!<>`>TdIgT-t{$zPnm9vdn|+lx15 zzjc3$cyVn1NaOrt0W2B-UIYB00Z2Z<|82$58cGiM2!f#*ikkBeuPZ9Fqk1(@FYcD1 zt2_`$!deE)wlKmgzPB0kOq?frN4mIC7t(Jkn!?t)D2~Na*5UMR!3$bv+jVi5bt{y z19sLLSM}v&ObtefIdfrtdzGQ-VX3}Yt!zV4=$5d4~!)NTZwB#vBIwcZPkKV49?2@*am!jR(DWqkM%`81IwJug5$VQ*Jl*eSC1Ei^3+H z&gylse{vI)vBh#zH)na{`hd`M-%r zt0Tb3bAA;f?sW0h`S^_mjC;ev{t!w=e?Yp?ScJ&}VsK5Xq3msUq7l7#)T)|UuqdvV zj3E#a>pg9J50A2}prZUlirXYy=K1E*xTuPjz+>Oue)kSbzWVXoc)|@Tuc{TLA2(`M zxUL3mx@fFyoAVfP*4~uUFm^dXGZ%a+Tv8=a?!1b1HT291cWvH1cP30heuDObmG6mi zJ1UQp9hwDFEXaXi`Zo5`k^UUL)4`yz+0 zL{U9tKW0Y1gBy8;<|U0x@=+SsOBk=Xu0`a-#-XGLBYnQ`hSN01tNk8SQP1z5bg(fp z$HGo*fjnC1U)b4fkB1=Iu7vQDo_`$U&8jn1IJZxGVboZZ)I=%kO)Fsl%r~0}9ml)b zcj{5+ixrWJ#4ULnq0jG`hO*oacf##J?Feql@`Wwx-!&o!8pw+TN$)*Y?w_njh)43hsp4f6FqvwnU5~_%uK0HhL|7{H(Auvg`9p z)9jafM;ZqWGqCUyOOVco_y=B!UDS5~RCD{Trc=O$@$@vS+#Cs?{jtGTyN7Lcct=Ya 
zpPMT_@y))Fqc|VuSY52|>*!@f6AGi$PA*_!V`2K}&dB_U;F@z|*73Z#h4_(+9x^-+ z!{_vPqlANn?M&0njFNp~nyyuMwm}?qxK@lxSxO&R5BEi#@^mJhqo#=4eFc%YfeJmn zyJHd<`uW7SGZWdK5pSRhZfNMEBUCe`5xujRg44HY_26M8Nhe6_UwgldqlS0-K8j^z-W5hU!O7A;c26+z7K~E7qcWd7V&6w( z!;8a)3}zgcZen!VR_`pa^rpSz^)_?uO|j+VERHbBZjz%Tb2Ed21a&dZJb^l5I*}~H zlp*E*GiJyK(?Ncj^TfHU_Z2E6{8M`#M*b~p6qb_&lJ6g6EvupEP!A`n+@{<)k7HZ1 zo568ROnKj*Ck1bo@syd;xdtsXYxif1;G)LVC`Ve+!s^c=++-e*yCy;=46x$1i zy?2V{z9IjBEiLvVx=srE#Tac$-#;v>R?~3n?9@uGccMd(ha~-YSU+7p>1Cmpn>vi` znQy8%8|JU!`*k&TpS}Dk^Tr8&mp@(=_rcuvEgCVUw9m^!4aWrs0bH%pyV>!#ID&Z zV$1m8s8uA2X(FdpprL4Gr}F*gnPrL2QeA5-9K>?u5&DM{txA+6$tx&u-%}OHR<2&W z_ZTDEzi-4v{owTN1>UE+)AgMQ?h%97Z#dcID-CiuSL<+0k>^X~s(SG0_ze9^F&Pz= z9N%JQ=iUX1N{Bq`SGP6PF1kQlEm2X+M2v+>EQcEu`X7P~v3QZ_vk0Oa+d?;a;n%a1 z7gu*AW0_v~S7ue;xe#O@^+rvzm!Et13d4|#eJ=I-2{mG$p2e{n>^`{*u-?+EwXsjB zUw2j=9Tq>?m-W>m`cWw6idVzT=UtxH+cn_^66pvbOj*Ok2+;|#_IGX5%pO^C+h$r8w^6pFr?n%>u!Qyw&>()0q z(Ttk`O5PFQ#Oy!0XLq&>GMUctQ{(kpIonNzW+}bUUaQgiN=_g+^hQLlRaMPki*c(# z*N`0=@x66pj(2a)_kmv2CT*s8Wi&zr&TCIgt3SV+et~_A8rv<)PLMdi3Xr! 
zH{5c2LsIvRn$#b5(UH;W<0OQtC|APrGZR%Dbv5_-ik)}}_nSmjpHEU$Df($rCwxBR z%d=W_iwvUu#Y1qsoaI{pnQTVjjl#2w6k6GYCSDHjP^yfrC28MI4teR&3`m|PadLaN zWO={my}B3PTwLw@?p^lLgb1zJH*~7YlP%}MD4w3%Ry}$gL%jd4^3K-1zABt~2BxH!pr3YExO;#Y72 z-2IfmQ`hlA7Bsc}vbOzCU^v%qoGh?!t;egk#(jtHrl7l>9v`AxKN!R5qkc2KnZSnr zPO*r#icykt#c;As7G)`I;>*09@scYaeUF6n&|bnqA(ki@{SOrWU>FX-Fu5NLGrqVY;dQs5@h`%f6DSZRWHJ241CrEGY z+e&iIeKo#8r9fI>m{ZJgv@B9O?@oFbeVY@v~A~rjW(`YV@pGFI4v+k_G zmx@iDB^R~W?uvcr$ua>YX$RR0Q}Rt8rY+S=*kqfIYp-frykc0lZ=%y`*p6D?Mi$sT zSGzDuQoowPPOU`NsA?aqrZ)N3LRE_7T>E7T=htZrx>BaO)Gm$pd{#<7Y%Liq+&+=xcfO%b(9EsCtNn(l(Y-wgccKObe)k>SC@0W5JSO7I`fcRcZm zMRbQ(Bp(P67lhSB==@E_{O6DN=E^|dn7`rHC1ZW5#nNj&e`{>ZJp5AgOZnY!2jC}A zxv+{!qs)j+U6)edv-(P(!gUE5-}YSJmZ7cjOfIJ%I9}Ei7g717pj1$`gwTMJh5D^A zHzCEDaB($i3o~X^L-a{uVUa+KMZ%SuSsqTRwE1ub=Hnu@pXdahX01!Q3YMO{cD5Rk zbWA5e{IT3+t_QDLIr`4_OPj_!KTMOu;I_0-4coutDO@9YXLC|2Pdo|rVTA1d(+7rc zqwEHQbFBqQGeknqX?L$bWxH+tV3}$-DQ-l92tWP7fRLf=fKow0mGorWgp8j@_X&Yb znlbv)-T}V6<2TFIY~$-X*k1~K_PUnQ8KnForZm30Ft(7=q_qPaYR8V0#IWhlea@~=t9j=k|D6g?2$wjUOA-j6*(OV%` z&0X~DFQpn>m@~i(fo8c{uXg)_IIaG@A7iLzTQK zeX!zyo%D%#e>wM~c za|Df*rz^ag@4dO7*>3At>9a$0vPz}F*Od|05~e)OiyE`A)iu|Vu;j@%CySpVcS*fm zJ@1r;OOVtrNKKy_YcYF@+G5-qeoM`fh)P);@3s!BVDm@LN2G)5PvsxndokPc;mpas z_ggzSwIP=uVc{B<(DI)yIMAv>B)F19p8yl^VQ6y0j=~=C&7XFe9Ya)iJyZc62n>!RM6^)qRUA9T9Mcvvid5^;(4SPLflOvax*)@fsNk z-vB4gwBJ$hJ=wLY8a1Tcqx_z^3DYme_sP2p^+NsF%!nvlxGFwG6j-MXm3tVANgkWG zAq|%-nAFN$?=)QQBBy_pFDhoB_>_B5x&D;yW2TJBC~LOKCQ~|nyv&sjmyd zzY>4^8l4F_1Csl_rjJ0>E3pM@zY>esAYIc*8rJB`=R3x)pHJqyB8|Kgc07n#vmkM; zpEN1^+&0DRm+t#T+~uwTeQI~L#S>VrrQ0paB5E)ZU+!4WNy`ouH=5PZrCXMU_OMgD zQ}77yZ50###4w+=;Xmu+OWZXJU$%^ICvD!XUpx0CG@$yvcLAq2F0m7F*uXI!+R0|_ zw@F{{jEy|=rHHo?mMw8vi}rp;{ViKD8Mm5WT*-`jufAp}rR}I6$>DB-7`+w3HTfuP zjQzBk-!)u+4}*_bxQHd*MDYEKn~kYFg$pN;_ED=PfWErx76103)4C(`0tJU%bDx-V zdF7L@Qy&rM3_oHPt0Y)3N*>QcB~IV}C_FN2N*imOX$i6e8=!VzeM0kP;_CIB#;GvVYnA+{$@_pMbI1kg<+W)rloGq z;(w$KVW(43J^Uqkj}1;d2wVggsW$VB%NUW;u#Ergt~{xZQ``dsT^5% 
zx26w!+b<|Q21cpA>fBT)OodufKTBYR$XmqIShzj8ooUEYr2j}+NV}?f;oQIU+H0xt_o8yLBbu?J=RQSxw%tc84eRNiG+2xp5XY93W3ei_9Ys*p=gAcS)!h?iSt%#52pLQCkNBsbkz(Ew16EdgIU|sH*0y;yi`_+G<7JX+ zY=lZ&BYCO*{WXPj9^a9(!nl{QV8#|z3;e^T0E{LH zE+bLpD#YLToM}?*hO+ir=ia_vNcN-Cb_0z3ou@hN#&6%Ft-2)o)QQ{kgXTq4)rsv_ zUFS>1bbab{TT|*QCOeIDb+Tz(A7U<}WTe&HiCd#(Wu`DUILQ38@U8b)UrWjKsXRJr z-qw!~^GF%^+Hs9*YA1Fy!Vcp_; zZMiv6;70fM&cy_+ThA_En|~dV!8I1MMW)Z`&k)6u+EONq4$C!2?l9xxAYV;n^2+DG z+cXzEsZk`Irgr&pQ)XFiQ1|0?9sM%q#fDqZj%dy24Qvyw?pq>fRD&1aBYovoCQjcm zg}UAiWvM#*^<@61g5#chS{n!)vv zUeDBm$#ZvzJJOhv&pdl~%m3<**Pf`61fg35mdg;}9RJrZTuA zXye^QuPYi~W0(AS9^N}o$JrH=#Y{%){Vm*{{l#W{@3`}hYQbmjA>5~$0cQ-h6l<++ zju&Y#ekwm+7k0Oo!37e3O#AG3v#}hEZ8qRVZMcqDu21C1Suun08KPiL6uEQWNQ7X$ z3-8#-sU3PF#Tg|oybs;wtS{TfX3eUI-S?}$yYF+YuVZTucTvUU z!r$8gjV-Je`iHrVQwzYUw)j`bX*CK>$Cwx0@-J` zxBP*`;B_4%VS<;Qdvopb{gcd!!D34U-I@3UvAS?seBXE#>2vE+RDr}3=X_8;)>2g@ z?;N)HU5TA{le3!>6id?(-tl@#JaiuG5bbjL3$@Q3?)tdwuxDH0VyIH6{fcZXua366}1KI+SjML35bO1IJ9hIFYi1Jyq zAS*zqT1t`z!?d%wiyeIDU(>2JEuR~2BYP>sW6po2!<3iyf)>8n%CKhVsI^DxxACHO za-*2}>d)44H?Q9JC?X@1act3H?>;UPF2tq%rmZEj0#eVPb%u+BqIAldq&-zI{Be=7 zERi#73-5y(zkaeiUu)uIG#aRElcluZunNHiJ(0|&@hxSF|O zoZK8;pa@WK7ojjscT=~63=wbwz+$MH*aLYMzvKNrSn7LX1rI+z8p?x0qM=AQ91i44 z!1=*c3J|@DiJJ@NAo+q94(vJ5)Wd()3?VR(Fm)g&1jw*B$Pj_RWF3Hn2{2>izf6@M zAP7tbV`*stY#9{Nxe5Yo6hnV12&C%|t}W{&puCQ$I(5^oC^M<~qE0ifFLDo}H? 
zcCi5JmOy3*@Gr*00*VII(bdAk!T|!az*t$kL17LUpgTWs?*^oy02wGUP?#(fyyjJb z3IelSgo2ln+E5_?VgeNcoRb+8W)6i}0E-1U1SHW|Ltq%-1!%X0!t9_hdjKUs0~F>2 zg*iiEz{IWqAyAk*6y^biT?M|S22w&`moesU*1+1qP5*0l$$=<-eD(34oPQD40e7)U z2H?A$iIpqBoPUv#q^lX2JtG823@R^86%)^c6#S?*)LN1^K@B1z_AF z;0!n!SOk#BgL3c=WH<=`gn=+%Fa!GfI3Ly0ZtGv zP*5m)&j z6tn=KP2iaSU=S2w7(v{E03&`Ba1}p=1PbbxazH)E#b1Fy{Xl{s(Lc(8HURw&GzNhG z`aIw?2m{-HDTg>AAOSxX{O1$g3I`Gdi2k+h$GktDhiyN#;n%((W3bNgL&(7Mzf_W6 z&-PMK1b8UE2%2EvD11?)wi-)jfH6i8kG!T%X_u5;*gmCfZ#%ZQ25=!eoupiDPhdP z*aFxi;F1qALfrwUctGS~o)q9Z{u?X}WOahk6FU&qa5j!#IOFMJVF>~36F&qntA~G}{Ae_wnU>I>FfiT%|3DoM zVF)w=)Y#u)d_tfS{s|KRe1G#9Ed*%KA7lAJpZq5b^_M*0{DPoW|9xHoI1r-z2?Jx%KVhi9 z^ban;C-fK10)l_Z7cPkSTb6=o{y%jCE-3hyKEr_^`7c_8`2N;sAuK!<68cLw07^h( z{u>Vvz*hK6|A58)4HE+Ha{d^L-~+wT@Awb`f5{R7B)?1efw`Z*;{(F~ zU-<0iVuG==a5?x-1vD|@4Pb^^ zmgn$R9#HWgpGw{yD!LvP9(XAF2SD4x!vjxnhGM`^2u#<`!rI>17K+`jCAe7W;ys`y z09i$4fEB#A2Nb5}1h62#{Z`n1t3l0vV+ZlK03hEW_XIq^-4FI*I(T;%Pl6R756C}2 zUjuJ#Zz1pE4N!yw|5zMG42eS^F;FxXE{2AqF(~lE3{VFkBY?;qiu=K-oU^kF@Pz<- z{jL|V`llk*@y@m$c2LANabjdb@*^k?F#*gY6F#zbuAd{h7+G1`Su&RzO?0-XEVKH?Z{=^*CZ_R?R7jo zT*jN-urfw^>c9$@GnsviwZFJX_xtEgeb`caX#vfZ#kOz$o!+}To(NcEDxe?TNf1wYnUB9>$b4u_gu8x#UZBdY#Ah>%d8vY9 zy+d2C!B@2%XZs);cX@Y1(>euTT8oJb%Trd(RUQeW7v72E{kv@O7syg+f;n8)AKO*H zi|dg8P4 zHFwi2V_nj1?h_MLBZmirkpx|pi(t>%Q?m=k1+$6AtK5)=)5zoMU3?jq(QX67_ng>n zM48duVwBjImHG*B{u7s{*EAAA#lmI(!@OV3>@ zH5~9aEG!VyTp3bOv$lL<&pdmT?qPYA;cSW`er! 
z^_J^!z^=BG(a8#X>6u6=7k@U_m?Gqnu@dq1CQ}$&;OxN+M^2rWx7{4=T^7v>PQRX_ zzkidHi(KcmjTtrjBl^?JFF#pUJJd1FH3&LVMRBaW{Pgj#Sb_6O#VzX>xQBkc^9yE= z9Of)l?V0_PHRjW;U8Y5Qqnc26SlVEcgV0UZZG47c@R(USzNs;WgGxEs z)ji9wukRw~0U0ATwI(Usk^9ci^+(=uQ^m$soo`o~WsnGaL_Wr69BcLpe^l)1fyIMn z4xOr~%jV=Ie90&Bhl}WMMHgU8uO~C>Fg2)KM4G?x5+VypW4?Ln$bJeT#)qX>f>}GL zahaj2x)jyQT?a95)frstmdIbzt_Y>OSu*w@YXvH>ZQ2 zNgt^7gqO?mZ}Sa%$~qNNR!X`9GP6%$1hAD>>u z>kjbsd*0xZy1-u<(6HKH>%O;*Q}X{*894d@V|i=JnG zJ@A{p(uT+9QN$dp`&!r5g*)bC4vdwagIrsMT@&X5-ltAvB+lLRd?e|#NdI~v3h%yn zva-zQsqxv#QiCZQH>`K%gU#J;P8#`Orm-Cxw{s3+AO(XfLIRHanTQZ0FbFh(n*D-- z*~gA3EP((b?|(#Kjh14@AUGA}oAe{>yUFAqUL7;yTPu}s)jhaL9_ccwI?fwfocU<5 zv1sM`wJ_?2c^+oEVux#l*M+**N;WS9deT~V?2YUXV0JF!dM(V+O+td-)28k800aMT z#!$#oZMb$3dJj`5D=TC))CO`S)YSZfSU4H=h-+;QRD>K#d-~e*>$TVS?8=1E)RUl0 zA~8Y2(f>_aN*fpV98NQ}1d)~-+I?b9P2kbxuUlSnz5D=PZqRjNFt-OVUw%3wFzOwj~?L&4`JIWo3NF%#sE_}F%{7j_c$rXQ# zvqwku>OG2|9|Utc(+p1*H<_beU?W_{z!09h5R`0(n)D~1CH0#gfmFTVX)*D<0t;)JLz8@{j z2o!sxZuz%TkYM(QxN$@qi5#Vx%WKIJ+qvm~Z@OSX=SEkmWK zMKPCtYf-f@F2l2B}#z6y@Z$(A9Q1jb3WA+Ovm<$A4e!em7p=KR- zUa<7F2KG#&@mFh(BSAbm&8qE|?WvazI#6DXnqg|dI#&4b3tZwE6kD$frHZMo^)i*K z#!mVVO&M?8^~l~ zsv%AGNq3K~oS4r#7#BTimSLQo7dP7BGlU**E}My*?R61Z)9B~lV}7YOE1B74IU2(| z{-$qY_}q})Wig7IIikWRChBKq#Sf;M9zJvNrx!4@rFCvy&Cf&yhTadvD`nUn5gSV0qciPtOi;1{! zHu|(<^2I(A$4rew8OI!0mC~~LlVAGg+iSMo!}6B1A$%0p0?p-)hGJ`k(g{2&Y~$-4K{w)looc^4Qy-4P2GQ5QaZkd)yl0} zh@;PrLEo#ezSCjFsaUMM!SbVK^eu`pmK$T2noV*B<5l(sjdYjfh<^9^p3VDpe7$#} z_rMl!*iH9~$^)ABB_f+Ga!bdxb%nA;&&fV?2^EpwUGuP5v?|CV>qArxh0-1se$PEwcP32SGNdU#*wG79CEGT)EGdW(IBn)jZB?s3zenL#_O$VX+qL^HTijly~Eo)&Pt z-j-d=Yt-Y06YJtP6%wd%*Pgp!Ut@ogH+y<&ZQ*c3?h_NKTBZ6N@4T+&aWygSe4l#W zJ(?QAkz-DJddda@wu}R{bxlzdFZ3EPV=jvyIQ1V7`C2@_lJDa6urjvt(u}bki|QGT z{WQ%TegnT2MnqRXP zJu6&$S^1PXVc)@tP{v3(jbLS24ZCXSfq9YIf=o{y`=Y{l{b^hOePxcUUi59>IL@^@ z%d{-QnXn5?r~`%0<6j~jUNTIF*W>jm_CR-g3sN>Rjm|z7p;7Ti_MeGn%@x{X#ko#? 
z<;=mS2NiFS9dti!?{lqv2ATV^FEQ$@2|BW#?cm<(fLm4UQycWo=gwM_P$#(gv{RiJ z+&|c(cB2yof@ZA}f@P4sbJbVCaHM!*i{r$Axg>+d$(&cs)E8m?c|4eFV&C2~-hNX) ze_V&QsbD4iQKn@$e^R6?c^hh7;+qmWV#XxH;vt8(ZWx8TX2gPN0r|*Vl8*40j5q$lb%qnEHABxqh0+`xr?ZgPWKRp-bYshb_&HXmFZD z4=6O|>7CNNw{W&Gz{Uh3QoxiZYn1l6)06cs)#yc3so95`Hd=MzU9-#7O9x~+&3y^% zuH8yBhT@XdYXR==t!x6ipE>o4ZbmwN&#}5NON*kKoa*%$xDYqUb<7Rf{^hA<-qX!p zFMG|>N$3*Xx!a*j1Qzw*y429pco+=K?G+Xrnf8SyACE(9zFYYd24mk@(?a$z6&W+2 zKXyhuyj&Th|KKBsBt>C+AbYPOL`LU=YLomv!m(r5vdU=pt2~tJNt4Pd5X#HXL1<(- z8Qjw-pT~*}tv{&NO0AaA%AFXBcYBf`Vy+elODf#gW956{MR;EZraIX$^*!Zh>2^=} zg3h|%RsD6VvGM%Pq*N<$$%bKay$6yfNXSZJS%Le96j~!%j_w(hiej!RSg=gHWb=-_ zWcSwDqwKfXlBUHT7>&MogAn;(8vA{BUfF!T^6v9>yDgEjkCm%BC=a2{0-xBYS5zMP zesz4JGgwmQcz(CGgPH!r6CBkt6}3FHB)BB8#z0~I!P`p39gpEc&|F%>T^54}<=vQB zSeJ|CDG#j7slIt4#3}lfmhL;ONdK9`eFUfb?92PKX#F~7K3)>?yFU$k_vCzS>;v}a zZ>q@prRm@2EaGEshufd=sUQ2YA^Lo+Cel+F=ckAJNk>XPRyZnTGpW==Zu|N`z38oex>0Ru~!U@fB9{O z`lt{G9hZTD-MM)^VY>&)?ZXd6mQ?p+i#hLZ?e91v7d+B3-4c>^LuidIS@oo{fBSH7 z?XlsK{cjwv_=u18H#2&a51+6e*GEJwckr|uYuP;y5iPW>%ouAUr?>3sE$M^?*{9JeCt!c8YFWH^-6ioRj7HOEr3n%v2J6uZ|lxYigOUaeu#;~WOsbHhI%EeFgroR#YlGxUE(TAz11M8`DmD>N-aQ_ zJ^sr9f6;}i>r4>+uU-;^W&GbSGAX48Un=IBVbRN@w(xQ8L}_w2KTiEl-{)g^ut$!K z&efxH)+Vc?OWTKH;#zIj+YO~;E+Tn{cpPmKNPkbP?+*hh5cW4 z50biRZjhi%DueqcBh^tO0J{g3Q~Ujc%XH_|?!+ssZaK&aPb`{WVpU}*GP_rDgsk+6 zLcz@%@0JCvOAA?DmJ5N`jXd_NC#svudnH+0`3p)V3XLkN@~CK+(nu%v)(9p|(MgT> z?|x~mKzW1pz-V)jZ%Q8do?b6~%Pteqp?gY?rW1n8&+VOXov`IGLdh--bi4~)6WUa$ z@2`}<{fZ}l7Y|$NN_x7;%40%Yr3B&zjIWC{dp)KcGT%)Dr@oL07%GaA&;Y574D}CA5Th+ecmb!WSlhoUYb!$C_{HyS&qVIl zQ`Xc)iinV129Gsfm%z>+{dgKT)3ZN%@SB7cQ-Hc!BOZ7EXB%Jf@8@gz{t1f34F`+j*&ultc=l8<@ zo`kvjXoXK>=BupiRtJ|#zjYe=Dvf%7cWziyg!-r$>LcF4$l6fGMzqkHDQ<+BkYZ3h z<(f)PmDnx8&Y2R6A7^C858A=6Yq`*{s!LNO845@=eipvX(5wAG$_{i3~rOq8mZh1f&Fj0!D!;>$mM7z0lWFsH@xD=;)M7f$T25mPg55Zt_lzFE4ZGO z^M%P*Xjq}b()I8aNfFvXn#ZSJJ`kCAHsv##KNrp~xsl-eZQ%gx`}jn|j$4BJH%dfu zdc8{y9oTK9ToA05j$qrco%wiAL%jMa;VsV_63mlGrjQc<;%~2=bOv0C;)*mSj6rt& 
z#4)YNOmJsNT3Dm~vWBl>3r#~Zzrwwd+kR^2)K+6*)zWY7S9XfFog5Bi%ln|O+I*x! zcA2H4wmEhU&lNXni4W^ozPWb6=Ss)-{$??1&skPMMrLfyZMBeKjH;WzIz zAAgmeD9~4l$lkwC{Qa?0CyQ8LQHrUA8@(o1h`M-F_{qTj=SxxM>xr)S)Xkl0XqTAi z&^^KjM+3-wH+v(b83tps#{M_6SO^;G?H_Kc#>S} zYTYjyKqbCp*JjhN97qVggqAZlr3j~rbZ1d_K~OH4o7A&1xP{2tTk@mJ?S_KN-=7V= ze~bLs)S;)Henpi1wllqjH1@SR#?=tDG;Nq!iJ4w$AwOSKantAWF?Utolq>JK1S&ri zcx?847?vu4y-I?U*wRLk%AEd1Kq)OXI$-y z!-Fg&=z6Ksnd2|t6t}%lVz5@;Hd^hvxXY<7nxjX{)TuaA4WWpu+JiKUX;XYp-Brm& z&Q&)=kmz)*$e|>>d(>3;p*jh26hR2mvl|h6&3QXj&s&2o&Fotm&uB+2(vEU}8~<^) zW<0gjzOBP&X4+?65+5SM{vJUzi3ACW`G>WYlC~NFObr3A_)IUk1v@vPCmfc(b(VKr zM6l63MDayk2<4f$y%!EyFds&Wbm{tuM?aIAwhMTSj}0-h96l%zV<6i$7$kcGeMSL! zGh%NDuWnJoQa3|l9^V?v_}8~t4@Jt{FTU5x(3g(qzmVoQqlBojkNSGkW}bwn(foCHPNm?dRZ9hZmf;ikI@fjC zg?F0eh-2e>DYvr9yNe^A5qxH!?PscF`0Y=6Jqnj$%ldrm!?-1L+&!{%^s8I7h9L~z zGM8W7S0D9s)>~IThgf31dq!mW7{&9vD0EER@|3yS5(=SRqJg|^X`NcN#4Vj2m$NUN zpE*qhG0pe*drjr+vU4F!U=X>Rgk&W0K{W6q*?<1dB^s|r;FY6N0(Vw3!>wOUND?G` z!JXAPUYMS?HGkl)7KEc=MRk9EVqdex*-h;)xS26GnbE24tp?s^)OXQ^S{JV44Fgv* zEc^_;*=a4kyk*>us}teZHe zUPled?XL#oDtNr;BaU0Gq_0M3r4ouz6QeV&iBHkd%QO%+i233qwPZnc$l>Z0%caSg zH;V~sylEwFdOd=M@(N;so9uIA9~UGiH*CU=gGrej zRi`B%xQcjx($zs#?OS`+F8f5v$gj?*Ir&A!@Eh~{hIt3wbL|aKiV9kuw9PR}iX--y zjxYu2QQB<^JeOB|uW_5qx}}>E=9S#D*mmLQ$Y%0~^U;&zzRsL=G2QqNjBl@Jhd$A3 z)rq0TJqX?{Ea%EfVD_q8qVEz<`>daSwt!t~jfBu75;hd}AGT&NaVLST8LE&UzlKRV zWsF@_HnN_5nnr)js3f!H{ypd;Yqg|xop`hXr z;=bvPHmjpYm=_Xwd}YWUh3b+3NvvC%ujpk`&`*fZk??|Pp7GVwO&TFzT;zuX7*lnGy%xKpkd zF6G%@WgfrM-5Dg3&Aq3j?#kt*x6@|b$*Ni(Z3$A9bViQnM}iY@0glSnB4?RL%X~M0 zQ+WO6%l>0ytvYL}Yc;*gAKrPV6b-8 z?7!?7pL*?753GOvATj0}Gd^cdEym6Uisoj@9ujW9n#0S);rl((NvQl=+`B=ybZyD_i`w!c_iGlZ~Cif9$yzo zRmMr3kK(RtE-1dGLCCd}v3ko{%-TA0@riB&A10;E_(MthV}e+&r?Q=aLu}5thE!HS zb;g$8h4*c%o8&VZ79{plfuDf>-aE$qLw=*7{oC(CyDmQSF_!W~RBtUOH%sR2v$mGT zmhBa=96U*Jl+6?(gM9C}IroXq^m!dOb-a(UXJhSL7Co<8sMIXg+ic37SR=R+rGK1; z0^hPcYcTB)pC8K4PQHq+(|Hx^PV2UDBd;-D?MW)aH_kXw^pMvwM88bsMC~Kz3_rrU zP@XNhJ!>+1z{7Jk=S#o9QlI48CiA!7+gGovU?Hi-9=&m;;Dyz~Lc{T!(PL>{-#81! 
z1EUq!NjL^lsVwpz_QEhRCxKL!N?|26nD@ncT#eY*wBpy`fRB$d32M%CGm zW1q@}GX>^|nwHG$vMEA0CS55`VI@!T6@FsW8JBi8Cp`X9m2^o{eyWvq0lL(!CUn|; z#U;MLAX$Bn@095S*=vF`&cnR}y6f4n0B5%cNBsVl0N986dn}RvFqW8@<4m9gei`yA zpB$za_~Lsk-M6rHY7kjv7i%4gV9R>19A`<-#RVx3wg(g|aH&?Q&aEcBL5jN~gP-9r z?>!H=ZpR`pTfkxjs_MK3UdYWwCoI@3n$eWrn&pJFga8Z*kM}3dgvMrFxFpc&>}GXhTJMCEzBjzSLTg{# z)y#=9LfB~0M-!Q*vpxa491>2%kx&hZR1*Oly8chqXv84sfTakz->_9m>1vOeV=|Qu7=)PE;r}f zH0-O<1+zUf*YlpFyQW=qJ?;7;q+PQ-rACHQ|HTvQYOM?R?l}8yIcdBVY@uQ*LTK8^F@(bOb3{s<{imLDXf@374Yn{nRTQx074vx5{O%t~NYwY+m@5-LL95KNSlpyN5Gir8{oXjEiVU znKX3oYe?ayN)L4%;uJU1{_bIIs$Nh_qh>Vds{Dcb%Awf{E_NpmWJGHvj!`Ehj`hSp zdGhH|(}Sf=@)QLd5}S62-&43Bmj;M80bueL?)V=i6)j~WZKI=d1bYid4L#6W2<{#V zb`}J{S-{jSw#!Hi1_A>+DB|6%2==ZXE(9n7%sh3VFkMee58{0UkO1ILG%cKfQ~iJ9 z{n1(e$1#3UEEWS5MFFSwk#IO1IG_*5g6H=^^cof(1bgC9e=#^5oa?~R(5%72j%m%og0Ng@= z!jz!mfV65*a7RoVDght|l>x;87%ZSTpn(+>W(|b_-EE*S;Fbn}(jE{NsCIzD9HB5L z2+SG450Hn#+@LT5Ko}qbg?T~|2%z_QD9jrQ^8xq(PE7)r5MT!O)*g0%;NX1u{XPS6 z9De@3`@cc{ZKQ~*B{eL7(j6^q-2u(~%K*u_TY(ofaKLcEKlb?ziTT*`9N_@pyEKt z`v34i{$Ft*#U-HNgTi2-7z`GCAQBQdC>ARYMc@!n6bkj@0bm#j^pCax>}S92K0o{J zd_#WL|7s850PzqXfF4mZATjU}MOXLa1J(rg`N0wRhM)ki2;!Ci6!G%`r63Lvz{k!vk$X^!J1U4H2r~Wi zAzA~_j%YCe{P#Ce(;y72|Lxhv2>}`SDe%rWI2DK^2r&J7-A}qd-`jP+tl{^zpkheU z_&LbH`+tYVe_xy{K@s4R5AeBaPz1P40ZQ7ywER)h0b zk^WKgfnv5P`a%KgCe{)|C9vfMe*DjqgWo^nk}?8HOh$gM2U+@AFNgR&fd9IZ2vd*) z8o{hApa>8#Ob+>fZ3bkz+s%Gs;kW$fe|-IBMgIf-wtetFASm8X4u3?}A6G45C+)4l zi~^V(pvc6_hn_$*Br>^u6$10ScnZJ;LNI3%0umfoM;8xAdrPPnN(=!P!-+ucJUm?8 zkHKJ0zv{$X2)0KcFd);h_Ot>nB>vvb)!GJXX<_96kok!MXi6kt@8YZoUgQ)~Jcfh= z?gI-~?1TZQA9upASU_Pr+F^mrX(vn^4%p5g?SK!!J??~|Kx6*19TEu|>Yp$; z5)RCzKVd*T08IN27y<dN7a$uVegJ;f-NS<5L3Bk3G!Bh|@bfEaDMS7bgh8$o literal 0 HcmV?d00001 diff --git a/audit/dataset_statistics_plots/models_per_dataset_histogram.pdf b/audit/dataset_statistics_plots/models_per_dataset_histogram.pdf new file mode 100644 index 0000000000000000000000000000000000000000..bb2102a3c78924d2a0507f8d0dcbffa6d8984a04 GIT binary patch literal 13934 
zcmb_D2{@Ep)Cr}rB%~5G_TB8p$ZqU}5VB-27`w3~loGO5C}kHSN=k|>l|4(*LS$dE zBuj)s{O>!_*C+q;_59C2p6lNCyyu>C?z#7#bKd)&mtPOBDvppqL-_A@L+_MA5KuVO z&Ds$nBLjsQo^~ffVafzAg0q`F6sAY8CwfAW0Kze-oE(JcY6~Jt{nkO%&D9Hvq9DLb zbgdnUHeOKd&r20wFEs-%f)^2r{t3_{czF@Yu22kk34s|n5Nt`V_E5?7yJR;T1ELqy z3}A~_1w$UQFtpT<8O&r9Z0|4a)r6&^s>3)h2J4W<$^CsH>;{oLd z`0EgDNdzS~Uw|VV_>;t8B+v*X28xDDNnnB60#F1nI)H>H6!%j`MORlh;0gia|E>}6 z_J#_ziLUlu4p788TUC-XFlZ=D)fvzPo@nD{O9W;2^db`pE)d`3+C(h_PApr<;?mv| z0`&A2vIXp>3hKAXMj{WQMd`2aquH^&wtvQbcG1(%FVxjrt6VX%@@;)$*QmjS2mgHg z)L`G+>e~7WuJ3b#4>0Cjmh`dq-;&kETV}rN#&%ba;vLMzRn#qU$V2-^t3%%22G$E;!_ z-XxxTW=#J*r~HapSN5FJtgM`U?A4uV#~IvCbHUFJCkWXd--+>r%7yQqC9|4~&fcyt zS~2o6pUOA#2v2OAO{ojy-man?!@FHWOY_LV3K{c=lrsb5TqL-7Y{)~j(@P(!A z{qeEZGH#h=3cIPX1x!;xMw~3TS1*7b=|@JdVSBU39)<5#E-tt;KE{0gaOjl4Sf;zcb5d(Az2b@57Zzutf;5mnQJ}e2#w9 z+l{h)x`+I=u~1jezlL}Idn*I;F7Fny>2|?)k%^du5D@$OEDmQhaPxcBGqT z#`32LsT~|vpbJ@0!F@4t)jqDY?SMo_)_#|x$77o2TW{PQ-fR27>~)>1bQ7}(`9KN6 zv_Z7phawtJs2bW^%Wh}a8+dzp$=OJ*k?p*48d z$iW47wcob)2~F3HykR!;1|E_Xlf=&3FNocBw)_QJgqWMmNb+_0kO*@sSFPh;KTIz1 z-6F-tB^!T4y4w`S8eieQsE*dU z`q3t+Xvj7R{6dy#C-}wdyc=F8RhyoiN@T-q$eP<_mpIHue8A9k!*R+>815sQY5yIs zB2o4DmWVX%^*PNPQ%U`}rjqCeO-n`g3{BE+r1b6v4}D}tJL zTummj{$eC2-15-bnosc9c_p;O&O=R?`OT)~(ln^&PV%+iXSu`keJ`_`g6)J3*+h%R zs^fVjezc6S_p(2-FD-P}JLIvsD3ac^R57||r1+C$*n6|S+10qRL8pCmW>T-sj`wye zjMpdcVxRI=vt`WQIbH6AJ~qsMhO@WkiB0j!*QJq9Ukj$OIeikt)Ai&t8B1n?8B6tp znp3ZI4T~RJtleEMc2E^$yn`jd-QyxAsLxF?CQcpP7Go0p>LoN|QssDZB)7mdKjHLL z0hUYw~PPA%*LbXPCX1lcxMVte!|o%a(j=sQc`owz6Bd^KCEaxz;W2x(}n6XV7FL zP*SK3$40U1AkYkI@e2levrV=gi9rJf{XYylMz6txMv-n=mkn}0Y09_#qgzON2G4snifG5jc2(v<2T6hiTw>n+s=fL#(qAx z^xLSDhg3p%dZcp8!)!#kjz?M^Nzi;zbKT?4F{=-#FQGoMIyV)&lnf7Zk<+r^?=L_! 
zrbu^!96dM%&|dvZF*-|4w#JsFYgdmcu8{u%zvv<_UMB>)Cjr>*$`EQbCs!r zPrCP|{phl)*SHW&6GqsfxFjMQ!ywb?vu1fexJ-Bsevs;#X|qZtGPb0Y%uSODj0(&PN3wAeZ3J7i?`xfpU ziIw_q*)jTME_ejPL?4?u^xXU-*%sQ1=+0-1xylgbV6oSH6^oThhXN7^YF5j&#j>CA zmMaijx8#sdv`zTs%84)Dt;PMPULG}1TJkGgNH45=+~2Emcu1!#v*82Rm{B*@(9yn2 z#quz%#@ROLZnv-%z67T38ZW}6wP=QlY&nU=o97+O460S@8!OTFo^^Fx>ST_q&N$Vb zZ07xK9BoV$d@*k-*rL;f5t@OAngn{ARbSw8Xv^M_YJ-)=?>DJ_V>#6=92M#kWW6k~ zcabKqIGLu1g{>VkZ*x*eqbs)RbgjUJA`K&zxY{SuG7$SHf$G!>D>xg+`y7!V&S7gF zA5~+sK2p-*=5MybcV^s|+vw zXW34d&?7r`IV*C8r)V2q#7;iS}fVurf9$}in*I{N`R6+DdnJ{&p5 zI?g1@lW*mBk~fFob{Bc~`5;yCn+L4;+gYQ&vzOlYhbN7xNq)YnPBL&4+fV(7fC@3R zY>e;pX*g1p;zXCql)rNL``3L4zFtiZd$r^{7w>ciXlT7Qd?YZea9E_Lko3VEi8PoL z8P;JwQDXerb>UT$O5|9QzBHHr!u+Y)`frC$J{#~*zOjWaz)*VA0zsn@f3g2!HDb9C zbofPYh(zli>sGJSX?Ne(PuJRExeFN(J{ntr<|oc363gV`aU$yCkJ%j6PLbv1VV{tP zp}4w>I?FO1p=Wd=KcwtWb&AUbXk=9jSt4A5+w~ zJ7;@l#GYU^agC87lbdPwA2?Y8QHQys&L@7EeOHvxW^k#bK@0sZ67Tx*>l|IRZ5Hil zX#i`WQ>;o4ODLnLNIUnZD$9#l>)^)xAZH&;xkAhIf{Y+S;JAg~E}KfDnH|a;(FAnF z^H|#57kjS=Ej)Bg8`X_G+-g7BUS`2z1|>SkD@CT)acdMTFlkU78hYH(9ff{Rmp*k% zvf|qF1@Cj3G@o1^dJgA8oV*9BmQv#bu|hgt5Jgirf*yX|f(#<8t$-cF1brU(*Emi>dgt=)2{FM{diP3VcQ2reEP9)En9Nd zW8o+bQ31=QuMH;9yqJ_4=JP{?!`?3#`v~OE1Gj@74(#_!;~7;jig5{HWgudlIF|7* z&%V`^l)e^|-dG`I&%i-963UbwuU#JR7#(||CD)_Ux1wZvo}M-X-EGB|DWMl8OH`g1 zcl_+b(BC=q^&6_q(9ERY@nEaTC)%Fsb2%M(skb{XI=0n+f+S(L?WFUgz0YYnNzKK8 zV?Dp^MMM87uaKhZhwOeIT=SAoxm~3wg&1#fH{>4VpLtLT(obVh( z&i&YPh1kW z?max(u9SW6CuZt;>0;j(#|LC|46!)@Z5`e%cIcXc8#L0jI?pRKlym9~OFid;zmfH4 zmTkpPid2TE>RXZNImScxL@4S6sVe9=R6q~Rh*jQ8_vRtpFNibxY#%^h;>_-|yXD(K z$Jeg%FTcWBB$F=%OH}>3OSek1Wbk>jISNk?6s0%CTL~7Xx?=m7Ldz>iIMo$z9+l& z$;^>sjE(me!ycwvhw3sA7wrr!9=yBHN}GcFiAG^7(fWb#D!~01QF5k0k&Q39+=srz+@ajDqJ8($a1Nr*PGwa-E ztJ~^6Sfp;DOYjtElPWxbDWNGP`nxTUp88?86C@Cy^(B# zm8l=1HoPmNxy4wwh|iJezgWvE`Wmh^aE7wwy(>c2MR)#g`*uEIyDl0*-Jihw!<`MJoKg`z5gpYMFw`S3Y8@l%TU# z7;CPMZW-ODwbu>}&)o65>Kg2$$Tbk&!CAm;jC=OYg!@&-i*k|L@vqW1Go`#v1)V5G z%%gjpB;8)R<8Ft9ow?mU3dJ*+*Nm+#oLMP* 
z|LwrEeCuOVhb=U?#Zv=jhr~;r8Or-k@2rq7ueOA?_1;*32!fsQofh;)c56_?uEIV@2%$oubF!7)sELT zSMs@GYZD+S8!t41SLadFD`t?3|L|1s#srh>ST|LjtupO(_5&l$=ll|Lsds$vF|vMd zCO(*_^6>N3peK$y$KA*6c}!6X^Sy7|LYIYBm211pl(Jv&}9H%OBLF$#Zq_FB~({;sEK$~iHYykLY-yi%iaitkVoi~)R7k7 z{L!UVTG=yBd$!Q-76BVV>Te!j^boP&EfNX>9y%#^8MMLAXBOcKp3p>Ji z&5j;Uy-xGE+gs;HAM=S>^G{!@lMkCJS>M@Du1oiydECD=W&ZgFecl;`+Lux`UiVMG zy5O2~0@t8f3-igpJzKYM97&Vld05}=T>^KPc2FKnX{dTEbWn^d29wpK$K?)R`dW%f$6Xm~~2Tpw1E?|Ne-$Dah z#agJpSYH@DMKWMeOPBX9DcFncYWU*+rgQw(+B5d#uQcJo+l(LSG!?;4iS`}GO?K^E z!@Df+Z4lvAD|(VcxYbkb{f*xWlD-$ z?n?GE$CDlXx(3hNZmY@~LW~l z08egMW4QK+1gekdWNK?7Zzo=0%Mv@xN=`Jcm~c;~ri<^8+Rv4Eg*dinKXJeTeofbn zfn8gU=7x!YRKrJ+Y~~Mochxe}pNuv3AK1P6erbiO@}hC}7FydPq(UJ6A`ZYD#W7JU zGJFEoO@G(WqBENd;(Or1X(19r)wwWrc(DJyYd5pLu=oiLDVJKiAC8g{V;o?3q+fSe zY{u1`&veEyj9=!*Rljd@2iV)=;!WOW3hw<;B$oBTw}|rqm5u7XAdOVSz906JeR(?A zif80EJgr*@zeNm#l==&620o<#CI!DJM+;+CSkaW$jYy9iUhR?-(Rm%Bb%bOlx+*LtVyop_7ZP~g7Y9wYT;;ZlVKwzrkdSC)x~Vn?ir7vC-9WfS(wnBA{--Uw)?HB%$GmaMXF{Oyd593 zp0&Ek`sTrjurq=-3~Xf4(*DT%4#{05K31}dJ0={NBNXos8QxiFwU~a*%9ULpBWtd4 zS8P(dW{+tOPwG&#gWyoTHK*B$A6g1CAMIvMKfXF~`S9+mcI{gzc#D7xiTR6wOhr$F z3_QO8LCVr|!-SWT)8mFB7gN_x@{WlKHdqDYYpQ~2En|0{R@W)0!m;LN; z<`MBqu&MPBhrlIcg_ePH3fIw=%E+Yfox!{Y_pi=(FvsWeEwhcyzPVW_R^oZST{q20 zE{^|fiu0rj;wdR|Hpy;WS*5F}&gLm+irwR|w?b_1)O{k_nq{Rw9iGTq3>@ILxi7f!er6MPZfKYFC@H)Bb@(HK&*CF#vRs~@q~-H4 zOn%?ZkJ3G3)~vC4+f&gmGAm7jnSJF$Ufj_h@pd&_QFTPjv);B8Tac!y&W%K0id~qn z(wIjf^on$l+19qn<@4Nf8L?UPVf?HqYKYH#kG?-Gn_6%!fC&sBskRu|79kk$O838g zx_F!hnOBic1w1H953_wSE<={`0}qO(d0~cnw)}zHULu@LN-MhaqQl{?+9tY(Aa6>|aen)E^{5?e^!# z2|(0{u^dFN+cz>S_zTUlwU0D?3iC1TVG+Bh-8)>1FD_6lciCaF{j*1xR*XT-nqnu|peF+Uz&Ym3MB`#pvk+%&D@< z;)F@n#=RcixZCy)S%V?=r!z)F?0iH`s7aTDaHTvxyTgyzET%4n>n4-$qsB)jU&TK| zM=vly_Cc(^e%d8-uOj$x#iI56r^(i@S2cK3iaZQE1x=KcB?4FXPmlJ^{h%+-N}&I9 zlF?g4u!z4p4ZPgM2v}L=>C_R3~7++l+;c|yVgUhb)O z-&`K#JCzS)W@C+-@e4|0N++I+?%5+1TK3%j`SGd~gj=mf+G1RvXz8hSc-0*ydin=@ 
ztaxA2^6Q&f6^0800PneCSD33kr>)bj(hB0nmp+xPd|4gj82z?NRk0`Ak1>ckk(;Upjrvvu9Lmp_GJex%qOAqYs*>D;njqmQ_-Ks z{9L)JE_D!l_Pn{45&GEh)v-&vaCd{KL=@fG$*ewA^Sj?;Q$8A{p1ikTc6p0IZxMf@ zB>zI6m{_e#hY)nZlmo*C8JD!t%c`cfQ_oU%OPdy@zr2$NeQ2w3W93+!;kA6@vlG>k zsiJ+Cmsre10uM*?CpH$VV8ZX1UvIGy6=t2g%5%CvGOd0*bV$EYIa${@r#_?jPVk$Y z1rxJk-pSf)(3VRE4{HSno4uEg95@;}`5qadIy1O0$r|dJ7RFyeKP>r{%=~oW62JX> z-+SWQg$DNTk=?_gU(V?`<@C19PR1RZiQlFh_a;~Ij^kChn1fl$gXS;3C$Hq8zOEF! zF85N58M`OM@c5lFP2)Jz%9{o$mGM2ac9ZNTO}41K+KES|Z6@cMRZm>89OJm{ee%or zo2;0uH{AZW?viM)&a!YPUMrZ-C0%A6i|TlHPAr3aM^RN&$o!kn799!dx_$O!*)k?m z=TpN$S8->YRc*yivW}Ga{Qxpwx~&!hMn_*ATUK9w`eC7`%{TG>5ML5wOES;(1NrZ+ z1)f?ty()83ewSAy{}vOsRoE=~7v~Td{U(=M;IYgfa>?q)(C>SMi_nvAv26n$D@X5t5$mG9YjFO6`I1JZeaJweKKG}RomJs!ZHLK_*d0cP z1{$nXVS;1lpHxPe$lmdf>NzB9J}}AlxZdVv1ApQ<_kjdf>a} z%)(F1#ZSHM4zk|8%Z$6+visphDq~r3kxvGkZ(E2+q~g=!2W9T{9}W9K1^cE ziJqd=M`Vdx-l`79PFJ$VbYyRysYqM%Kil52v`RgxL)hYU2k`EHoUS2oe-Sw8=xMpq z0w-CcY)=(#Z9D(`^a)w;Wzs@IvrI0%t*xA7$xZ?50WBKQedZ8(WV`d~bU%}Mbrtun zxYH-R8!D%7?&j49m7St{lR?{g#T2eW8xX6b%(tM#9>h4v=a2GtP%3A7=ei{6$>{Op zdTv9U#^Yp!U+jr^aZaBFh*61}X5~ZIG=K7nP_8{Si9KQenL#6e9% zTAP_}gVtULpTzG2g|ApoUY@A<;-DIG>ZVU23k%+@(M0IY&La^xQKJ_x8#78FHIi8e zM1|QN4?8fuN|cJoDYQ~y@DON}$b5RHgC*(f(-^$|ttL7r+V_v-1fu%G3+jjM5?4~n zIE=oeiaAPWC%l?jdm?{%ss2$z^W3+LZgu~ei7SwjJe)B*(-A^5F1$YRlZj_neIhqq zYN-1l7uHnoyO*uG_Pt654buU4)gJCB&Z)C*4q6A&qIKg(cU_Gi?TmZ;xc_0}-T77O zL}j}z9(jS+`Twl4pP7V|^gx&r!ISv2rKYQDs%I*yNG1`SbqvA4jO^*9>_8v`Jz?5} z^)?cNfxy5Hc%r8bndI)}MusBRlL!sGt-UBoiXa1E1s^520I7ceru(zA($CyJaY;!G zR2&5)^daGJIFP#smjqM$K=L{SFEWXe|0e;5gAN$@`_;05z(B$DfV4y)#R5i97f5P^g2yOE zP#gdxKw;M4fS@p2pa(z)xBzaFfC^yg428KsVXgpID9jxS^MJqrR-OQBD9jtkZvm1J zVa6m|F9%?BKNC3r$?&8od*d|zpPBq^Iw<%%l#ovar5e=9@vB2r#|3iWNUul4OBVa)R(~ZGMf(0Tag@Z~;Vxb5e0*XSReii_R zkwX8(1z;Qf*86Pq-CRR9?*GCEXn=Gm1>i@~49E;DSR|5CDAzz5Efx?AgaMr)7+_g} zaFC!BAOHmyxKHV~j`yn{&P(}((kSMxa8UQ@XiUHui*Az{IF!26wWt}Dj6kub(n`>|@P-YMi`uBSqd^hU# zd%sr0@3^31w&bxf$w2*2df)Hy03M0}PneE@<`#JPqO^2@ZTYjM2RM+QEh8ur^cDaf 
z0aye-Th>q{#fm}zTZ~-yRYWM@btrg0%{h|dML;`^T<1fv{K$1y6lZ{>aDxCwobsv$ z;Nc8Kt#fk$oeG7XD=^JJTW(NbZYeO(xuDkhxkFLw9?Jt*L_hD7p{R96fOTFU(@%#( zd87t7t-GsrZeCC{g&SnuS)tcCg8mEG=oFsbP)cYC^rbjB^v?nQV|h0WFX%2+5l~8m z@O$DsNM^&&C?bCMs=q!SfGH~iNH7}$6akuam?H9jMFafiKWM1`K!ZyFVIPGc`hUfQ zO8|ZpAPLrTa=&?$-y)rV2K(Dm`UeOqe|?1i071QO4Ev{h{rRN@Op9a-#ty&^0o6K+_iIkDAXRe}uqX<(5z9ImlBOxG^ad&p}awb_reNYkzxCBlN>fq(&?kNp}x%|2( z;YPL>hQNTB#@5>g_$uM|Ztk{rP-}vX6ToJJ1b|9mKyq`%gWquo;iZvqz(%2wI0PCl ziN=V-QG#%|pzv>+@g)=OAaEe&fdEEz{RfrAV1Sir2mJ*D<0E5Sz;VEf;LR|s)Sq}LaQB11UwjcrIJm?91p{OOck;hsNW`DxK%#)d@y&QxAhmKc z3?~IFxPQ|_fp7me!>~a1!zLISjsvXXzx$%mpxxU9!(uj%7mmgO;p!$l40x8e8HWCo zFGlhYUBNL};9P5SU#UNI50^xO_6z*|(lf9M;D7X$l=@>`!LgJlg@5b8`dD5ykjxxY_}p05nGwlkVw7AbU}^KLQCT9m3DAqN@t| EANzzB?f?J) literal 0 HcmV?d00001 diff --git a/audit/dataset_statistics_plots/normalization_quality.pdf b/audit/dataset_statistics_plots/normalization_quality.pdf new file mode 100644 index 0000000000000000000000000000000000000000..c6865465f2d27054d4c835addc9be4356a167b70 GIT binary patch literal 15186 zcmb_D1yod9v>>1`5=yHe14@X%Ff+^y-6<^~tqk2Ugrp!!DoCmzAtee*Nte$(2=xQ_SCzUS;bckg}9-IrBOT9zBm!v|q~It0xwhrpq{PzPgc zh_Emers?5m28BssU9h$e7EqWP*22sgiU0;&go=tn%CKG05zcTKQ!9H5D0=@c&cQ^(%mu0s zkd>ANSTS>Vfx;B*02U zg?^_mS+~dSyf{J(FTm^Z$`;AR=$*~Nef{Hzx>dr|U86WMEnC=|>d!e9osK$BO?h>) z#IB6cRiY>J&_<(@+%_*G=B+?z@L~1wvMXe&OzBh6OF8Z|$rn|Q9X8Ln&EB8Z{&=T^ zBmAtw?UMy3FJ`6+Xb=a?H`FI2#7&GgOP*fn*CH6~&)9fd{nh)LP<7$kesBMKxoL_* z$-}VA125+LZIZ%~QfZ=0(18^VIW-j6}o!jg1vgUzlbe2`)Z!SU2bkPK$ug@a$+Ioy2wjNVIv=}Y)$&F--(x~FFpCne8G%qiV*JJ#!b=#rXVULdOjesQE2zD{_jI<+%3$xe!2 zmor{tYEB*v_CEc(RoI^ODK~0%86guc<*3nJAGUgXRW@MT7`QzhF?Q_yrhya%}_hPkD zu3mLl`h;#Pfx70-zkg+l&}(m^TXVYPmAg)y$kn%VKL-uU2ZO@1ne5AOFWN89lugKsbB+&6YslP{~b z?kyLOT7R7NwJ)e%wu5x`1eH_H%c=H38Ob=t@XM8kXZ602&sp}*Hd6#irw5h4gVsEI zGyCq!X2f@XsC&qckdEX~k&H{PJYVqKlR!|uj0AbCTK?-tP=bVwGEpiQAM1H*5m(8c>KRrXD>EP@2FgPdp zh)ay~(CB4N*|)dgGUQ`A3~Cn?$puQ<{7Yl;h3Gc~c^B*(dNi<&a7bDCGMIN15 
zvaT_GMlxWxxE_=}y!^o(UF7iTbM^L{l$|LDD~_2FFtF@9!Tk`k=P^Mq3P)i0e#D(U zR|^95p@u(UV6gek-D1#yi~SE*i@IGw_L7&F$m@|2p~0q>nQOxsM2; zU4-f;@lL4t4_%cGCYm%zM?`3b=4>&+iYlGh;?hpi1++{N!GxK@!s&btVE)$txgP?M zd;)*LRFnj+o|l+#@mhoTOD#XVlyzIq*t)6RcckvlhZK29xfczOeXV7@z7T46gXx$l&<3s$}Jr^-=C}JP8XwC~P+*R<24^^C{`Gz3R_J(-xAX0YzhZ_x#M_-V(!Hw(ZO9R&*dQArc8Jx!M^m-kvagJO3@J~t}a_4PUv z0mY|Rsy>=A;TMKABJ$J6s&nQe26!^~B`%#vwc`&+kXB)3VU#U(e3%*@6LhnudlT-K z&7H(urZ=R)5?<=0ViXRWWTMk1PzYf(BP-WwXyd)wl31Wp%lyP&G_r~Qq3%rcmhGe; z%f&YNF5|9*8z*gu?%Z0SY(m?Xd$6+IU>WDxt_&m&uc>h}kf=g`^nO35xAWMG`oyM$ zXNLsG`oeIUpP3*hr93@N73pBw-@$gEfsQ>(3GvM%vOhI3s*90h78*Ik}2EIxwSRv7Vtb!pfN~! zOV}yUNBP!3B7Ksb*qx4&7%!bZlC+`exNcESXAR$u*BJ>kmxM*5t7%P~syI#!S{~DK z%WdekS+~pQDQhwwRS8QYoTk1veWOi3eLPz3h~H#iVLJDQ$42UzFEiT%%L6BN&jdYi zzAihWlFc95hRrCMQPbc^<+c_ta0uj*#IG*M=dSd_rVJ;(7M}KDjVv6c$UKx5di}CE~dpQVWe6#hLzpWdpQoLhvO_&|dcFE9_#H2>+^I@rzVOYMR%jjUUs`YMMw14XPysLI6sB7y}@ z^$?EQZKiCJEsXBsHO^&X4p;%drsn&e5P{=U9U-+4FPLHGTsjTb~gv!6N}tIp0#c@&>(piW3HK6hK; zJlo}0D=j+EtjNUrW7j9%Pr5dfjbd^09VQUqK1?frRsX+3Fca8(<6c#|xO$W*m2b$1I*msySj0?fZq|C< zjdZMkV&yBcM^j&C%=%QP&PSr*I={@mtfa^NLDoG@A0ZDgct?mmiSih97YG)V4sz6)gk7nj@T~lkio|T)Cle$etVD*eEk2dMwGguJrSIEaGJVW zJi|{xz8%@4LAS6|f~s_bBS>nP%wLC)T=u@toP%|G|5momZ#X1b{oY-=(KF$k6~VFc za(z!~EYyaH-o)2CqeSc2D+>B}gJw&^KdoyrqHOvHcqF?FI)jhRxy?&^xHswsvpl}U zW*gg)ntw*O-${Vy9jgHcTeY*=;ytTsE5$RZ^K)CvOidZZ`XV(l4e9Pr-nGpr@Gw8~ zXgG6NMVT{n+D=nbR-4V@_()A{>#f;(%_h{e!^$wDR?&M;Y|-s!4sHb%5fwKU^en06 zFDuie_ri{~td@(;FQ^R^ecZjWemi$*lgKYxSL^@MQsyquQi<1vCJ!4mi8OXI@InwU)Qo<7$O zbH4iV`-FlG$}g)Fq%s>d%bk~Re#NPO-ZJIB!dVj_t8RoldU7`Kc&LQ3zpS{jWfke)CPSip<^{I}FRL&bQbUBPRgX6FX<2J$V%O_u)l)iW9r1K>!kI1eCL`LJ_-CI}m z`9d4$P9CZ9Nvouv+o7FW%|9}~~KcWVT@G-1ndUG4eb(r@P7E~n9*WTlgI?Fx_T_kJGSr2~x&^Va&eyAfs2XdSLs|;Vr zf_^f#;FOhP_}TZ(ueXHj)l=!f?P=`q~_cI<( z_^@54mB`f_E^My-0QHvR1_pn^*zlY(qkh&BK&=ddK)iyF&^BD#Ep%j-C=G%WViN=74BAkm*Fl9I#m-#W7q) z3cj%Wxkvj=Jg?OU4<@6k;&c*vi9U#Sx$-Vw-1fSowQVvz^X%LG1QU#K(*%L$Q{f8- z$a=s|1MxRmy^OkoeLXK}#TL!BIKEBMPSUx&o*eH*xoFe1r{{M 
zhdG=Q&aYY$;-YkeBx41z8JXDmLOcGDcaGs)H{{aT6CysKY9*mxj8MjmeM7>k)eW(y zCqHL<#@OF0`(}rY-Q|s}6H}YI7xL!|aM+N*ce*Re!y5_$uG94cg(gqrl^U z;AgShm%k#TCU+kXyDAPG>5*+b`^8G$E-uV~C^I38t2e+@)JpJH;v;C(Ihl1D2VuCE zH8->Vd}fsTxm&u13cNyTiny634viim8|Jh_7PpZs@3Y!44F#XKE8cxQu_)GAtZR9I z1`pU53H(hm5GFzX5(;dKD*f3@)e665S_${geJ7l2*dSs9|4HfR z23h>8wfM#e@gmvEw?qtlhQ7sCN#zyiHtx*KcKZv92|epmvoX{vxNxdUth|PW>;Ntg z*cTvCe-Ug&qJ+Xv!%1&!2?X%)`aQX~u)Hl1!BXa1kzV!S!cDudW>t+IG}q8&ra_!t zHvQUBRWh%)3!^tUyt3zEJ;hgRBA(LMbygl45~b)(Uoi`R6l`_bqhWe!hr4d8I@CZc z5iVd=((rzm^r%RC_v@)MZt1e)F_#4$?pF}`_wEozTsIAHNa{O{MwMD#dOMV-dSMZH z@%1*8%UaK_eMs0`@eAo0E-UH4=*F)LBqyMe=k90flk@Kw&r3&+7fXzruoqBGa zPMJDL@K19g57=bDF_^z_L23$MbgY=B5x$O!Sa-?p35wsn(xjHr#XxyVA3r)+S*Ze+ zn;N6+psTUVSLn!1vfCuA@_d52Qo%=qK6>edH}`U7EG0ziiyQxVDeJ~{N|_}88~LXf zs5PIEU_I=+kt)n>MF|@egC063`z7ee9bLLVnWwycr{+O88(H)2%?`&%^exSZW(JkT ziPp0r)KAZDsT?|DMYh{r@nG{|ZzcYm((;ven>igYPk%Ai10+8nxyJm(?h1)OT;x^b zMG{a3K4ODR1)4+71saG~$E7@x3n=0tCM((Y_z-wuya6?isJomJN#e=A7DrJ2RlaBc!Cd+Ogh%2L%9f`WS1iAVrya# z|fR1F}2BUoC-)3OFFUtC*wl_Ak{~RLG2$+T68~;GA7Cx{O=qjXovK+^5Lsfjnc$X$*pX?YeowGvX%y93qi_vgq( zW`^(^O{IwL(Vm!U^Ye^fb#WbgR=S3JaS}C-M5K+(JpfAC5^M` zag7hXJ{ON|elnQ9e>BTSyx}#!iA$bG`wja{J%N@B4KTN7kH0joT|yAXIWws{bjL9d zD*0!@Bq!=-0^f5nMxqk>2kz+#KF}AEOgN<`;c$0iY~0xFMv0klrioxnk&aErObWsI z3`nlTtC2!oi+HgO1K+u&Lu@+I+y`jjfQ%8z{};Ol?kI%xGaX=t-)GnXzU+Y3$B9@Dwl(M{9POD!+tc0%p%y|^PAQa*7d_10HiT(S3QeZ`NV zx+Ki6-ui7xpQ?7hSIXFbq@*eGR>kAIVt$n(5(64m`d3C=B-AHDMOEp|OwJ%1!YKp= zh5XGHNIqB3aC6co%!M+YIU-c^kpc5GeNDodznJ3M=_>f$!#dYRGi9~Ur@d@D*L%88 z$~ej?BSF@RptY5D$gZ0^f0ex3nnE&1^e%F2m@?&Qn&GQ3+kwDrOa5a?LcwRX-mE=k zzi*kgNIP^la#)O%C^4;Hz)+@NF)y!DYU1^KX&<*YM=@VdjxrYa^z-E$2`E*yiK^}3 zc!62+xR%s;Q>pAuanzgqh5`cOE$K(=|iGO6LpbUYgMDXu2gbT!fS z6`USpF$R< zLjE9nYcRpnKpJQ5Ir#p%DPC>*1bkNW(`PD!samgk?<{^JrKxg{mWRzLsqHon6<*t< z6qU*i-p;%QPcbnVzH9^{s4$#^GsqbHHpHV6T z)7C4Gs~lVd`v+}M%nwlEjjIBAi3 z<5RBB_6BElKU)X03h%^teqBC6+Z%mX=WQB0%}ya#`ha^O!wGy7*&Kg`BskrU#lmQo zGN$SU?hQ-l0n8tebt3rxA~1nxP(XB$UKb^T9TVRMp5D-hw`$T7tO3@e5*&HpokW`wZm|j73#XQ<( 
zA|AP5-B$!sx2`|nEFPh$TfJqpeb+Hd$;hsnY?YFnub=bel+Pj0Z!)Wu)v5^DN4alj z-y451N}+m_uNUmYZbC|pJ6}F_E6*}vu++^+MB?zA^|4Tiya~+L9tfBs^ zgW&LtF;!I7p(MWMv>Kh8$K1sNnzNOrTRF5xntqBs=J-Njf90joeAbdg-{+lNr0Yi#Z7?Up;sS+-E@a3xpI_7CL5$nawRJ54jUetcN$cT)TqC+(-O64 z@$yJl`T23lsY!cBBT22i&eZ7p)P|6@+J@~ue!5;(X9d!Z^FcD#3ddu2$lPe64pD_# z$(YHN-kxbI;fzP^JP&GlF4;O0c(|c*gxz#&jr=UtYHe$e6o>4YU`AIikgzR z1=&^_Cs%>T7MAJn11vmJrvuF0&g&3Z-SHPFXK|wl5i(g%+6+-mz~v!lrxx1pzTgX6 zBZbgGj8;C92Uj4Yxxe}vrCb7^-v!7i@Qi{j-8~x(+=#)GHMFs1k9ur%Mf3qv&MHL)|IqY zJHPGf1wDC0XWP$=?xf?i9l!tZROKb%r;c3iV;UEcl}EQ;c8M2@=z7)aw#C(#Pjnh( z>pVH>oMEMnl$KI;C2I?pk)E`=agNeQlgRQLTb-nIukxcqrmy>mU~X~!D;?L)PkxIV zz8dy%#?zj$HoVVl`1qUH)WBlR_KV>p0#E(%IVBuvaa3-#s}%1ri669*uH?{*Y#ku< z0m(Q}L-XIy3@SP1|hIr-T2 z%F^tc^vLu#%w8E!t%zd2P%_8I=B_@mxGq3T@>|sdYr{S6@p%(xACPqbA!>;VWV!F1HTYPVya)=OfU6b1tg>%B}&}MR`HOjdm71E~~FZeDdYacqq=m z*>xxV4CN`$jZiy|vaeA+<4)Tuc}rY_1Wz@tpD@@|sIdqbFHmRxSbC&3B)NwP2Z=hY zb$YzTNEXIEbG@u4R7WJ+>-O+z5rgq*(m+lmP&_ewi@zSnGdes$XGI<@=_xrU^C{nb z*Orpi?1}WJw?qEM6eRl0m6wN}2jOdraC3f4)#osjhdG-@N;^Fay{$bUQ_pshO5g~q zRXJ4?bUG`YsEUQ(N$r8RoQlvkTY{{B$kkiSwQV{1Y09_^OEHr-jQOg<{%tv}iOvtCFqOUX*|Zu?ENGnv!Qy(cZv3dIR< z&nUgS+zf7O5UoJ8`pi}MOp7@Qn9s3GvWJkIV%RWK=1 zqGuPm(qszNd!AFZ9VQP@Ql$ z0-xNF)KK#+MUmBW^66x@_Vn6b62rBDWdcER2CW0QJt*Zx{0+C^isZnqS86@M|4jXM zR6?d!Ps7T$L3}ntXp-%1VHx+lDJ6O#^b3h#O5b#DgTe(o^E|$m`?vGsX$j&zIg53> zqY|&A2S-1z6fJChmSAF@)L$o;5rY@T^=7nPovP!DfXO%JLl!|IW# z&XJsw_eyQc3su^JNrn1WzB;7!a;^!T@ z)$DCaO4OqO3R(O_$LlVVDtZs$k9F9`EAYN%`OS0p#A7RDn1(tOSl%5+vq#(Aa=j;a zI9Ew9k&$OgJL$gZ<2rp?)YiCW5zYODz=)J0c~1xA#9sKNK^br9p}NOcxK|H zK*#rt7+tju7gGbJoElOE-El|RVdmQmpRPGrDxOFVQ@uM)5_5ODKf1Vh?0M_c)o%px zQsxI7^1y#})ZfdX_iCGAl2~W6?>#wHSzR^V^Ab2KtgW&pxLR<|E>f0Q9AFig5_Z3j zK%pQoaDud%vkA`1(ZvA=h3{8DYq%P_>{VEU1b`r*g0%zcl>d$Q`((-Qg~{A#Gz!X% zM4+GuUS3|H*q9d$)*^%Gm9Z{3tG(i79$vtWf=h$7Q7S4bU{>Bs%&L&`y zxd31o7>FJG!HobniBeccc{3{uOQ0Ygj)uT=_J5$@Ku)Cq6xdstNZ4E0nt_W)!^O<* z64+73y6;H@C;U;908J3*6tEw?FRZJYWvk7XX&~9t^|};M->c0$%xmWEjK+3iyLb 
zkj9TbIOb;`-@o@$z&$+R{U7syKFH%9P7p89kVrs*=Eczux;n_s4br-p|$W>sU}R2h#XqWZ?Qw1@y0ZwF(ptCaxDjrwbfn?DZ}J zkLC9s7KkF>d&W?}kM7+y0Tuz+gY0{G#CJ~$xM;+F&@zJp;b(8GIj|P@d!VaF?33E_ z{)jzl5Wu4&_NiGy_ukC{yjVez`_yc~$g+pa4lwigo;?(>?H+6|ZU7(O?6KnjMeY*= z!ws-3_J%k@k^5rdpvZkf&cJH=es@1)?Gtl>^6iWEJxuZK69gmI-fkCQ%@w-0Lj!tl zzytmr%T*}fJ~a<8=MT8@Xugqm4!q1vWZ{s|GK%Bfd3kS ze|{welac_C!c4GGIEWY~f%spC0WolY*e@*nlED1K*Ds6XA0Q~*er)^)2&(Ge0rUIU zUob^0Q!pg});uWk-j`6WK#<#Ga=%<31>XBHSqk`?2~2u8fDFaa*1^Tr${6Z~hi)82JWk2_eX54XR<`BT&ARvI7-2a23Q7B-knL~fV zz%&W`gWB)I;7EROA^ZkIqX1j~4g=nY{tm+c9`AP;KkuL8_`zrDU-*C@fY#&g|Az6xF+gSa?=a9A z{su!J!G#3={lo{D?@xJ=aLgZbfe`hFu8{no8~QgsKHfiU$;)?u{`k=Rf5^cn0D8-R z<3l0-lnuql`v+YVKj@tPJr~W(|0fJk!XM+1f2=EBG=}#N{?G#GKjvciK)?QPx)==T zeSd=;ARC~*-|frG&nNK5Tz&yyqxl;?AnE-Rh6F#;`8Qo)>hC-QaT+)U`}a6FFA}K3 z{2hh@87`^0>9eL1&6h=HN)+F`>J8(32a{wz-Gn4 m0d#MBeqX`f+yT1h-uAq?vkMmIvgb}fXM={YvdXB+LjDI_ct4x~ literal 0 HcmV?d00001 diff --git a/audit/dataset_statistics_plots/normalized_score_mean_by_eval.pdf b/audit/dataset_statistics_plots/normalized_score_mean_by_eval.pdf new file mode 100644 index 0000000000000000000000000000000000000000..14dea7df2f137c9e8488875c442b033217bc6d3c GIT binary patch literal 22251 zcmb_^1yoes7cU4X-Jt?9bTbUYP}1GqCEZ<;A}ApxB}z9UC?#DA2nZsf0@5i+S%4z> z?nV6mpEz%QthXMn>)hG*oU`NXd-m_dY({lySxy)iFE-=jNl0lUHVncIakX&378Zs; zwf)?zAy6rEFLNhXTL@I$+}7F?!UGi0gouh_Tf10+1qJ?9LDtp93j#kb0M%EsaIm)Y zg7E+PD&y-Vr{!htWewr|MNl{Q^0M}Dfgr%I*ibDyb1QonTL|CDQx8{5Eo(1`5x`bj z77)eS*9!twa0Wz>Jo%J5`BZ?I{7W3z-wJ?ogVK9g1JeBxAF65X>FVua3FrgL5Aav9 zwz4;uboB)|as&VP`S`iuFm6FOKLWzX&Buk{=Ho#CjG$70iacO1o(GuE(iD|5@^49_1p!vk$uC+M!8`RiG_f?F!}8W@C+wX6y75Q@0W>$w$PDv4r8oU1bjS_$kHhl>ddagc38m7UyxU zW<_8d#Wa6u5GfD+c+~TUWJjsjHm5`~MKty=G>dvNQfVs^%^3tR>NYfd+Z(&!)Ax)6 zS1Ffjr2IWsio4&=%_^wAQO`A{X6qng#jz@RSuA_J^|c-}%KYO~XJ~1z^ zQyeN{H*^FqFpE{Vs=asr*;~#zc8Oivfz?!8jbgo;T;gFj=Ewk-ztdA}MNtgC#R$lh_P85W8SK;6gJnMsbEznY2wtw=E56vhoHAKjq{m*m z!rj(}o;y!(@pwF}REPLW7VG(BQd;RkqaurmK@CLX(rD7kaB=Itt9QTcLQl1!rJ(|+ 
z72ky??ecFcCw+J*XT)}vHt_jP8S>{3Xo5S}9K1|b2AJ|!1p9lBwnV>#X9rP@&X+W% zDIZOmw?%$@-9Zz~c;lG}b!<2P+o2t`ogUpC`(~LM1wX8eJMDsN3*pQj$R$LRd|sN#wtThkKz?3<4MI;=LY;J1l0 zx(h$88tupKb@{(Y^c(3ooVf>~oP0;iQCm@S-pJka{rN`L+xirGxeYoBA8ifkYxVP+ zR@c3BV~ZAW+(iU4O|Bh&l|#&2lpik0l+fTGNcM?M(&OLW|K@cA-ZD+vM5>6vkLtY( zHEzT+739?J_qR+B{c^wFn=ef#-C|{-oiV>sEh}J1o~7<<|7xJu(uOUA|5wi1`MIWr z7Vm8V>D*}1*ee>Y8gX~yrVqDvr}l!&Va2%`)1T2|FFH2S)>iB+V2;aee^o)GUsEl! z-5=Fk@aHM*zm~1S^z5kd>8Et=trZV5`&cwboY5J*>zJK4yhU1PckzPxpLE@S8WkK| zQ8v#o7ae5p4iWVvS9W3Nklg&-#UajEU8tsES2)iXvbWN)hmhplrAbl|tgn@k3=00T zu~BSFL1BA4jPsVK1x_7T$?odbYDueK$MyQBd4eAh!8ODay<5)$## zZzgz~9E%4h0mG)%}O zBoJ(J`ZwC;*JjOo<6I~D$#V6FF^g~8`$I?#jFs6$Q@3*7-wmdi?OXPwN034}%A`RF z{~^`L+~8HEmmT$kr5gD8$)cLgE^}Sx176#jZWYtUUODx#_IaB_i}&B(oXltZBwL4pKe84d-MdyG)gZn=UGZ|vn=*fd zZOg+CajWi z;_})#w_bB<{GsTILZ({s4RQ4mkd3h*Q9xzktXYe0_(x+8roB%!TjX-BtYzth@%gRH z*AZdYS)@4?V+w{kf0i~E`l`4mRmcBQX45o@XU8me(9%g-E%FA&=Le9WFB%G?_nH<# zKK+`mt!sJrhh+RbG|ia_0c*P%c;`=?i&)#X3o7iNvccDp&I#;SdMEMDI61>K)39fj z`utGTIm@*7Li%rD3~FB+!|3~pD2%X0CD@xc`=@LRVK0XwZgj}5X-iss?<~s$4J~uW zUByZxBSyy!^o4;AOK25oV5p&&RDW-BfNZ&iq+2|^5am)b)h)v}_nT%<3_!e){xQ!I z-=HuL&?mF`3#iYW|1CUIAsX$WXrTq-^D|Y9o<_xpC@($Nj=AIxAu3s89Xh_RVPdFg zk#|VTjeG~=r-zl==KSgTvL%?4>)4|n<6#;JN{Sc;S>3NX*x@_z9N4>uOs%w52`uwK zrTR|p!DuMPg*8OOlJAokF8>7c@$qD}d$Uix&bLUr?)~xusja-hVy4!gazva-{9r8~ z$?x+ZD@$JN9zU8ZUVA&wSSjJ*HxF5I=s&GE3xLiewY8c-8f(fM$Li_A0AEkx&WB-N6ZvBtdMJ2u27 zq2c8IrWu=w6$*9!V{MryJX`7*n4A)UW_QQs}H(KYXO~#pD)f|%jve` zbvIl--AY*_dS4n(6PA4G3RbjMk=P_ZX=~gY-Do-z<@^zx1(97RKsYMP^3RVS!$tFxWvEA3<$~!JD$(xDNCmPAz4^^wbeoIQmoQAvw z6@%iT0;>cneN@8x$d)9#p5t!H(<}MV!dA|`V(u3Mbne&U-MSK%@8~tC_quWtZEA|) zsGG{YL1gDql-YI5)~ZzFn}ZVTUmsi`zBdiW zARxeL)<5hv>{5<1#oxRr=gLsc_|))ubrMzQT5!P&idWE-YaGw?55=nX#fMVqEbOje zEp}wvOdo!2kmD-gEewR1r?D0c z0RXK309uHcS9oX=m{Om$!jxtQ)CXiW@D+|mx$X4icS1Pj80?rV=mo#bd-u;IzVi#b zUo_(-`EIo>KLu;fX?~JlCsyXEn9@sHYZJTL2-VznMSQ_FlbjMv2QrC^3x>Ve&Cxs}14UhX)52#@{paAyM+*@c9HM!V&!cExSmfMB*YhCi>?a7;1#)Xjep@R>C5t 
zkFN1YXRSoHLFRhfbtb|$W;L>ETh{N29UQ7jgpHC220t~*wDxeMQtE%Fvgbd-QX1sk zS#3Bm;!)Z#%vy3ks9nwib6= zMT%?m%9r=tr3;~f(U&sKDe8OY*|6{Q$6V=5?+QtfS0qnZD?w9tDY=kdY2>bwdp%_p zm-KByrQF3kbL|6LU8>r8Gj{^~@ld5gNIy|OGc2zfmHP?Bpbxou7d?FH&UtE$`pmZ| zZ@KA4xrMNb?;g<4Y)2S(^KRVVv1Bt~ylRcJOg*!=!ciRKNRs@btRdj?)$DPxZ;T=f zgcD5MQfA}Np6}$(qmk28*zDXmJiI_09dj8;Zy`bTfOGzLZ!u~r#G%75o0iEApUayw ze}1Ili2wHJS*-$%A|a&9r$oDwh*FI1C0{b%4Qm;GUi8lhnc?=MUfxI=2DKnUve^}b z+;Fr3Xr%Y_4#&i@%_>$zMmWYR3Od@neRPxfw6EI5J#IrJBr$uDt#i8awhHc~L?SGW z36k#_KFhOguD9uh-oL1PPy$m--Ql?F`IJyLG1|%Mbt0+D1I}bB(s&uI`jEBK;wn}# z7Vei!9D-T}Bv#ygnHLqDT^47(IUmU5IL#=hQ|Bl2d8)~U$#W9=wBHhXX-klHyW>`w z+sGZUA(r3_OIZ@Dtv52P5uYXmwLTVFHtHu`+s(i2%>&M=PQBRzu>Ur<%Iiv^o7|WfgITN>QO`+M z-CZ_d*sGK5*SdU&7UjAm|B*htGUw@BPsNXW2@zP`U+75iD;*O&UX^PlR3F|5_Qtb% zdm(Bvh{UD-$}2X?_ecc$_bl_|wdLpiZ^7VjUTtpm3hr~nMK(p)OW`)ytl`FHH@G6t zVJ*6~6+k%PmQs*cgOgNutS#(N5)r(z7MCbfC9R1#;;G(mQI+AA@uI9?UOXNS{~RrO zz{%w)Q^9$8{#zuOM5QPO7_RgWKWvknw%v*4N4cpouMg@D7DKPRl&%sbq8*zuN-21C zzofdT+WBX`TJas{d%>fdihg2Aho4sreRbq|ceI5WjAWEd)jSS)u=>%c{L6wwuI%~O zN*VVVPGYS+uP-L6XM7lv>We_=&G4EDTh5xaOTO$9)yi}~ANJiaVwzJXH&toFzQUy_ zUt}cHE&TGH|GBKMnmrJBQs3oI!rV@T7w(nxhIQR#<9YWE)`Pv8beP&Cj+w_{6Czk2 zVVqad_Q_ni;#<6glD+BDuHBfg^esl6Egiv>nqlVovby&E$3Z+@#8EvQgC48>8JS5E z=2fgyZa;~`MD!+|JKD-0-M4(~$#ly$hvm8hti{RwNwjRTU?2Zz^5lgBir25Lo{qEG zf0D7p?O5~<<@^?x*qJ$$jz{C~XzpBe^FzPDv~a_Z&kF%!!P%c_yqJPokT?vuDsrlO z_|GuR!})nML5D#hwqcSz?J$?p(&;LdZ~A-GE35P+=o1<|ARVW3vJWQAx)pBSMvpWUdpxW$ zkz(H)NE}>QQ)&}Fyy#y_oeG}ba409}Rb*mDB0Cbq!_W8M=0n6Z;XMb&#egC+bjusx zM0Jzfr=P{P70gjb_)cHt^Sauxh;m6abn?1%B*vm~0Z+VkRM{R4w5HjeDK-BxMOe=g z22_|PHjW|)iA+e43=hA+8HCbka+ZeSuFa4dLvC)@hzwvv@QyvhdnAP|b(>>|q2))L z{ZVIIZ8 zc$DODMc~>)7nK#%(jvl(!9u_#O=w)=H5-Ktj5kxA-o5E90=jbZA@ zYBA#`y*&DmEtG1}g3d=)&uGRzou+Tcina8Mt7iC2Fj=E%4C;%5b>r(xfYd?fst!tfqYprEm8Rhg>aH;(^t`QU)FxSe`Q%qNSM--VNe zK)#>8e9kr@_4YSXzj_>=_gGF6RChAZ-7;gNsX#N0l2-Z=Xnu`-NxRdlvIAIF@-@+l z ziT6ncV_Ns?56h9CQf*~TFxAfM)-nW^+WXyZbC&h*4}P+jX9sf)DqH+*6gU!m0Ovi! 
zID$x&@OW{Io;w4d>)b@i{o%xRuVd+G6xY}DoDw%K=Vuebop$PwjVR{}*Zd2yzIT1T zxANr$dBjic;#qI!Aj4@6^-1;w;b;7G7_an+67}7nlO_5eNUn{(fu~l+r&U#kc3+ND zX7RqKNTz;6)25?uJXgm@UqjetUU&9MHpaAcbv>KTqPRWhEfLL^hRv{Xr?;gP3vtmR z)U0HD1|PMECIw1|HTmBV+$*NjUKhM!6O`SSekG%Yn#MA7Yz}L*8dc8XVZeBVhpue- zqaDZ%6S4b_6lCrJ9$$IVh{)fPGI}``U35inK$pD}H}JCG*b_Z-P}6D`>+ZG$Zd;{B zU`7|be71=!RGdO@@>ag(Gc{#Cxcqc6=>#QL+wCEO;B(8K-hotJHq9H@j47WqXx6po z+E&{Lwy2h>d#wiFqEnRZ3Wt5W>qbwo_N+*B;)ujQY2*iu6h;UEl3qdrmUzwr<3tjg z+|dANvRQhL@#aqA_R9qgw@+f*T&nrv?N=zW^ z#;wba7`?vl=Vqot9#SWdh0jO z`?0TnBkI$*E61M7B;$FrTjRN@B(C3M5%eDiCsr05=Iz3`(9;Vz*%VhgH$L%S&M>Ai z4NORb$NwqTT48twdu+sk61D+2K+hRE!tQ(GvnX7 zIMN%sD{kWG{hiijz%f~4;hk+8PTZXv*J5>@QbQ(;oN`pCvV|PUWis;^Q(pxHaX$%i zd}i;XzrOmZ@&lS>O26w|1C!)Vb1)X2vI%(TqL}@(-etElLp&G3Iu*aVn1Qz|cmuy-E7Ptu0wf zpHRV4i-aKsRbKFY!LQkfXgg^v5^0biOW0W^$|~^^F48besUL!2P<7Ue=X(5Z=y=0g zj%t?%l!wWn=rOtS%J14DF7ZTGy?7W)=DjfB%S^-nb20iO$qGI@-4iqaYxEDzT_5v2 zem;*{_ok9Wy0~E3_gmcP?49(Fa(wH_^7dM;92e1Q%;90$roAa+K0Q~fGaWIr@Sp6{ z{M@2|F-$97ww22$jVK)pR8Shyu3=gfr(v6_vL83*;nCV)TT~%5tk+w2`QHCZCi-K# zh7fhY_wDPQ-8)yVJ)3ivdVs_hNU$!9`wU%!h^@ry5XVIeP-!E|8n{pTdI91#w>~3# zLm&)g9{mbEFonfMaP;VeYqa}RxNRqKa%;>Jjl?%&< zTG0W-4At~ca;9Lkz4@=z6MbI|HiXUOwBECPxt{)rqj=Ot){??idd5L&>4l>mc>%Fu z#_DaZkTj2Z&FiWUCOKZ4&!x*qy-lb&h&5KT_&y((voLOOc~D+8MdXg!k!3RD8vJft z>~tK2*)qv4grZKKgzQmEy1T1adp==9q=)}b+9&vj-DJza?qvd(BB3`(^hJWCd3n#U z#DGXtNTdcwPH$|kK_ZJmudL5*zC*1?BhW|R^ROAvt(f{6obi4ggp%x?21s?6Lt>|Qi&eVyqM)Ic6XpIIY9qCR{rF2>N`<3Pp#mA*M%8my27GK zt>l*OEi9B1c7!@U<(=;we>hiiHFR6dsK(}_FO%M1zSU<`Fqb5E;pXD|>H^N6en0c* zzkS>v-yFYmL?4mv86rEaS}G9LXI@zMQC*8AkJCZC$~Bxr61BCelC$}ydG1u^YvJV} z#)Rq_f`{kwqlj+HaVjh==;vnJj#Ci_!)a(^Lejo|dsm$|tQA+^qs03zTH59H)+T0$ zRRP9wV<1VeW1`FyaX213+X(GyyJ-Z!#qHiFp-w)CX7Pc~--VfBOu8oiSe9)%U(QQi zjy30vdY*_u5HTIk^8Kkp_Oe^i@^0nN1{@wegBKoTHQ_sam*pA+F?Ymhi!Tc;LUf2=_uIB2}%K@Y+ywg8y5$~HP z*}_E}rb!yaZ^lOoQVyffIPd1};%&?vp>(Z1FD+@y)M&IjF5mmf?ia5b2x{P_$cV&&6dR|~_slwU1tCuv4O zEo+PdJCo_FtsZ?yNkZle!jOgeMemn*Gv*%abH$-g=PvqZ(=ChZBshnW;94UbFYige 
zzVTL(Pv~AkPHzi~E$(H^rEvV56y@d=huFkh1CQL>d|T>2Z{uL(@lKkN=5wh>idajn ztvIav;LeUMZ0*2@wT<*=9ViF&S23nKZa#cpl2tqw;V|62ik;4nasksHqk_s{1C1J2 zkR$}f=8B(56N>j*W9k4jB(d60Z5IOZ)kD8AQ?*>oPgr z8_27qHyCplB4Z+Iyo{UTAkCmR+}|sC8=Q{mpA)p1Ar{HN`jG+Ay>ul{Zx( zD;R+i_G~wbu5YN1*Q_30`w>(Abr<7il7ZOY2INIDmpRM8`+G44g*aUN&AUNO+Rk{K zmprg)^_L$)u|K$7%Bv7h6f|~|Jx}o-rNToB4Xv0C{S`{ydz8n#cqkh~K?d0Q z&ajqYd1a2nKd29fn-QUT<*|^KBBtFO|D|{2JxhMYtZ>X|02P9)jx2jL}29^r?db zS1yoe(lnO5kmbB3^*&mR>E&marSG4+?5S(-moT)!Q`j|;5`E2xH>#83@`aY5{N{^m z2Lvt-*Q}736A3!!L7Zils@~&_4#VAR!sa6se|XoQNq@0&<%m*o+Dw?xT&dt?A65u7 z@DUv%foo@ksA#a^%T-Oh-m)K&Pjf6H86QNsp$)+I1$Jb3?`#-ln^#fVyTvigzkIzX z{6ZtI(Qtw%+9sjn`KbCz+()0-*kr7v;fXOJ6G08AH0>BQQO1Y;4xLPnkvL%+y>!G| zmRtO&3l=7)s2@j6Bx;E}LeIs%1 zijX_c(AQ@ckDeW(bd8&2A+aSAn9B=0gDpigjlh}CUbdE7h~ivkhHYtySisTDc%M_@ z>vr^WwX{#I6p8~MDV8X#CAhX!8wI-hUAr6!CDXZI!N|-=_raT%eiieEdso4QvX%A3 zDNP?~&cdh_L#|k%a3Lp)Ov`Kc1MYU;Xc>Hs?W)iINUcY+ zL461zX$U#ypr6al+m~MG_KHfU?_Ay*{x%1n+#mTMl9iV&1b5>m&iwFZo{p1$+@2tN3KuPCahsZ@mmA>{|D z;5qdhy?$O)6#hp~R`iyCl4ssNPh4S0$UF1yPE}Y_oX+DJN@4W!q2P<-(%52}H{^RI zaXf^C5_0SD$mFUd#xg~6%UB*gDS)ZuI_s6FG<@OXnBRZgqL$GjpjNmtpX6Sf%weVw z41G|JGiK?3<3;2|Hlig}KVt;rgXoYq_on84z+Ihv%jJ(x4j*J#@(Xt_plLrAMzR#o zi)50(bC#7X84U%O7u>i_du03KsE#!U!1*;1<)dKERP>LuM3zbno!SVdH(#y8tGd!+ zT`7etTfSc9Aojt|HWxH6d}zL2?JRI-)GdzVmRvq_M*IgvyCmeB8N!14{iLv3Yv;Wy zi(g9plU;86NKnt+c~4bNs3-VrN1wJo?`1Pv=gO8)ZoYum^-#k`*f#H!BcJPQH^Jht z$S3#qO?Tjli$}#%-b&*ahGn}LzS+w=r^Xs%Jj^KJ7zwiywHJ!cEP^Dm%KRX66@~>k zaMBvBKTOnMjW#e*;J%u#vV7RyT1jD$vZ$7%evAHi1PuCFKRM3*Hc-uMlrJB|-AIGFA&4h<@ z>Ba9jYIp%}H)d|J1eC5rhik95#Xr8-G1z==Qj}n%V9PqLDAL~4uXFk90cXcvYm~8A zCQQ)2u5*407e}Q3-RmWKp90yrWK$v62Td5EBM0d5Ay#3oS?{m#A?odP-%eJjDSn1) zzTSs$I2bzjPYT;8eZ!^au$K-`>e|`BVrQYmzoMsS=kP_F&F-=6&_Wf*wmcbMC3W!; z*;`YI(8bs5uWx7GXW7F~l~UWTvcDDu*v^yb6{?R{0oL9Xc4fYGrS*-9~M1ODV?z9+L zpwytV-KxDsf+;ZZ@~UB^#x-xH{{<7IL+W-=4gAYJ=G zp+XuVgrvKWOb%dY*|$?s|GnWa9D<1d;Z-^uk#=;gTfK0IikQ*}H7Qa>xd~dHm#pGy zpmoGs?Z%09)Ge&~bb+*4AxP_D($`CYoSV(}h_Q9P`3TI_GyV)AmdOgeRe5EDRQnN@ 
zxu45BxGHU5O~y}x2|xYIV-ghjZeH&`+2p<*Rrf<*NoX5=bHFm46s;Zql1laSLNCJ| z(#H&Ys^>1);~l+gO5e>HX+~XB-n=%tTQ&go4;5oXqB#;gk2nJag!A#~0y}hkn%q3R zI^2BR=PWG4NTs-n<+&nJiF@Zl6(sc`JyZ`$eytuID&S-M%^TQ*{A+8D{|s|-I1jj4 z$_+;&4liQ*y*nu0nwtAa@^9{36{uz(io#RWWiV4vr7KbO?JuKzW+LZCA`TKbEO>_X z8LQU43Bo+vM(!_zp#lc` zYj^q7M(%_6xc8B8MuNw9&h&z4DtG|fQcY`QfuZ$UpA{Y^N$nmvO0cbLncX6nC#*0j zsb)P_7b8`c-s<~$Q|;Df?x@9P@I3=BGUXIyV@aO}R+fRxA}K6O((-h2>UFrHDdVlo z57zKSK2D-`SxI5sC%?4Rchf)Z5!(52A03NPBhL8}nWyW?p$!ffR@_!>=?viF+tY7{ z!}nMYr8*~@B#U0sJwc(P$oP?UQ&B^_%|DyAAr`Ie4z+Ll75;CHJNlnI)Z*VX9zh7| z_?4twWE2hzjvADzn+eQvABmfWGxN&K3U#9#1IK`Btl|cWC%#$4Cjqh)M1I> z9`ylsmKmY%oJfyV*~^1xv5+L?s2X&hs`rufMyfPf_t9%7y;Tlo2o1j(uYT>wpfQlN zD81;>l@su#c6RrZ@%jUtk|6QU*8-Ma6@LA-w(CgP!V1qv854 zm3Bfov;-=-(D5;Ro`X68kuf%Y-#{qc=&ED}rMiS`%EIiNh4rmEYvG5MLOnJ5jsqWa z(bx;I%Oze-R~y)-iTyMVUi*5ENne^1i3O11BKVosBf%X$+{Qh!U2$6ute&p{Z^l*% zkDifdY@y$|jiOhh@~Rqooqa}EaAV@ak+kz3Sq~e%Ty?`k^TMePNB0>{;W#E;nr1l* z>TrcIC#i;9MB5St^d)m8fh<%NKRs!0BE#QU+QY$rVm`zQrM+MQwxa#%3yJYkd%%mp z5&<*irAEJ=oR_(I{f@3p0sFWF%_^ONp0v>3JIYI3@EL1I11o(o8_seo;>1M~k2JlO zHMa~j%#`;67pc?Ytv?c7w4Sr$zNhAjORg-6{y?8epyvZy5#hM{W4ZjChL63omk16= zclS}-BJ_%oSPcmlg2B#mKtNRxA58*x74VzciG?3dq|UIP!IYm_>~Y&~M#^*F4k){q zTkw_GpJhR+(ZY=;MwkO{4!1_5>oLa{7h?v4FAkB6!UxIzJDZotN0L(X-{v!u9aM7^ zjQdtoT|%{#Eelo1f>9jUZp@UZ@V7kkctbacM0g|+iAUfp<1Mft2Mj6FKSVL0gyQ>( zLTXVt;AWO6o61m_iof)0-0oCHsglK_00jqy-FRq==wNBnJI(>6g0S&sEiB`l*GS$u=!^2q2KT> z4MDQa2oeJ$12t!f?J{bD_<(^Y4zO=PZhF7^tU%-?fAmb(L*0v5CAENs?txo-n`ej~ zMBr;)TOPrWOUdE4cqI&t(IYXVJV}*ZVHn$HMxEq@?zhG5Ef{$l?B;Jaj9d#Z%|{bj zqk8r(paNskc4NF8*S<~DumxKoQypqjZK7RQ&d3m5+54ek*;AfAEoS5jQ`1zL*Wtv} zf=C(k?{h3s43tB8_tc(WPxT3lvnR8i)iVHR3AM9j=y1{x5Y=^ zG#9`h$I2o}3QOm=y-}^V*D$cu;}1j{y=~*M!yp@Ayz1U( zjU|F}L}R)!^vN})3KsEq5Aq=aNW5p5>-=6TLs@(HW8t5V@d_6wP8Y@XZt0-Ds=nqO21CZ>eMqL;$GiW zytViN#$fWnexq58(O${tX`~oM?gyc%j}|0}CFinuU*@;z-zM}G3wv3ryyWeoy)Wwk z+a@VC<@hdy-tj1!H!kt}nwi2j9Hw5a!c$~nmC?LSE1H*BfD_3`k|_sUXQ=tv+VttW zOF5Kjjt3QqW=PNzFL3(&KQ-e_Qt+Ucz?1>^fN~640K%eQM+Uiz}!6>g` 
zPWp{4lTT76`pmB#s(%&SSaug%T6=%gP2UIq$+_9)dgW^O^xbUp0Da$!cosgsV(wQJ z%APnHnA>WqI*1m#TI_c!X0jOBH=vA1u)p2M&&2$$dC4AzfAKAy*F1$ZQHhMVSNH7U zUIVGm*fjZ;ewch^8pP(}lO!f=IqSsB)?URA@=O@3jEa=6zFLj+F_&4xJp2W|#&o2x$vd>8_ExNdwN$OM#<|REKHNoy{@7#YcKrzDT zsen7c9wBq1dhXr+-cC`Sc-G+0&4{fIdjTF^iHlwrq* z_e-Aqd)6**1hoogf$eBCvV=q7Hcwj}DNy-Bw{_@&qO?+xr=3K_} z`<~|m>N0xy2A@Cm8GMUt#3tZu^y3OCS>Q^^4^OW#?#di2QGtzzFDr_(nqHw3-a(%` zQjXVzMYiY{S-3{Ptumx1h|{F&CAx*MU-f^JZ4a-q9>WpFT}s30scfsg?pnmmdKDj? zsOKqL+@R7GeW{GkNuq0%MA?Wm8c>BD@Ib5!I zrF==sV5|1M&$An-(I2D8kaQPPC=!ntPB#*{m!4*p0(?R!bA!jI;NA zNW50WAfVl#FZD&^!k}5H{-evDh4y-IX(=^tyuLUY=|%fnti(au7s#E4wJNheT;4fRKmp49n&SjaSZ{z%(rYJI?H2!_QP>EB;e3XJL*w72D9= z0(xRf|DRFLEDbw}!*lNYsuf>3CeR*hg+ekXgF#sfR?yv~)m z9y3`e^T&{ewkUm((twz$D}sCl_2e&6BVO3F7tv(lSJ?WqkSVu zH)Oe_0HcLYz+F8(P)_ygK2wIQpvd)T+V;M(%6t`%LOU_bH`JBn{Tm^*THOqYv;o7Z z>Z}?Mu0n5FJ3Ytvf{!X9xj`-2M*%lR26hk8HdM@!EZPFwbARIu!Luw-tN$KTfIUx? zkmr7eBH$sH@2P#lk8rH4MEUA3Fj>qgp|evMV~g>OI30eT#W(I~r^QP0GxY9h`KHsr*>g5#Mp1NcQcu*%lU(2rT3Be7ZON$mOXU%@pr1(MV}L%sm2DyNCEWVL+ZG2jd!aWLJGL+{Se?>d^{t95W>z z72gEcMEfm4jr~*n7^>4_g^!9ES|u;;j%D3t+k&A4RUdR z%o>1110+z82l!>)e*z{%{Hs_1UrT*1lf=77_@R7NfpDF6;s{ZYnehVFtyU1x)L?Fi zr?!SgDIbv6R}rPW2bGEnZhm!6=lS=M_4BTJ!q{}Jw$2LiV_HAr1igpun>P$kW-zQX zI$kf`mn23Mg%QZ2X1axVNvay=z>4*m=bBJ)MJUI=Q8ZeE}@ zy7DBq&B4zJcWS3NV^5DbDp_>rL;SnhRH({+1sdq}mRV4U!qxPjWkWTV#LjX*j9ByuBxM?668i~p1xq48@d8(<9#QQTMP z-z<6P;(z3<@`m{}CUFH!)kcCa9I8{mQSB1A)FL|t$(xi9__7v{U0))oM~;8hyif2>ciO7H=R|iJEoN4@ z+dMVDf%;D`D`VxdHe3aRIXBrU+8C-&H(c$ME@j86r7UA5r!0>p)z;2F?R~s`h?XX0 zgJca0_)*FK!o;tfD#wXZppxdE*1zuL)MO3R4cH|-?9H82v_Y$TczQ|MnR@^=p~~hb z_dEy$HWaKNZS864VejVU>H&d)KuHq<)$+FRI!@ODG5|u5s<|_e^yJTWe^r+JmF0w! 
zj}HOigacVkc({Snh`@F6*EAD^pOt7J#VCq1E79i~j zh%12XDKaiV785Xi%m32G{3c;TrR{BOtbyb!V73(_2poV~o^IxrKq44GH#aaVi<7mD z*U7Diy{(-W1e{jd`MKFyy8vWs4|`WD2zZpp-`c|!0(Et<#)f)bcZGm6dK(~%3-G_a z51_0kATt73PyyP7tUw1PUkti0uY}0euApr1gYA0d>70P#*~NI*=6x z7#;io<$xbF16g38diGXcc7SEU7xLc;WsW=g_wSYd=SBZ=h2y?M*6)D6PUg0rfa%ZT z0TP~;V4@sBz#pJs>);no9&kP^W$q^b%eee7K5VG|$rl88{z3sr5@c^F;bQA#4G0d^ z^0Ic;1#eW$eUCK)Yy!Lo|4SQmtRJY35riMus`&qKVgJ9j0scw}yZ|r)K>!W}T-X8v zf)G9c-@yc75I7wE>jG#90p4F_0ow0+Cv|?;J^hLO`}sfRfi}Q)jxT`Uv1LGJ;KdL8 zkQ2O)zXO4T9}o=u4phcQ0PiVC2OAUs1UN1WJU^~?QtqF6Cv6A<>q5sguzjGt6Ctp{ zPhQ|{0NVl;{4FHNCQ2*Ft z0Q$c_k1Y+-z~}$EP8eZ>0{m9^^e6Z#9QPm~^uM3|=KK5e&o@b9vqVUY6p-O0e` z|MEHfd$KA4IZp2YjK`7?7&yxS?xcX%^4FaV5WIff$wGi}^!TY9-~_+!TNguqXD{0daZM?eAnfM|EZ2MCfU ze146)yeInDKmhMO=5-wUj!zE(6_2AK?+Hg+2=9qjc3|{8E`1VIf!;V4#UAL`Uk@B1 zh!d7hU|c;e>Yk_ugkdn+9=GEL0a8{R*ZE&O2jcJlD8It`@wz-9 z=1E9DUd0B!dH}BRs~t}WaLV+!78uKcqh7!6kI@3(i3DC?JU^y`7=Z6YS#JpNW&l*| ze+Oy46A^sCuzy_TI^=lv1u*r404EBLY5u?)adHRZ3;thR{}-vi@4w@&EDUmt=KdX+ zLAicEmw^2n0sWKm4k{%96oOisLttPCgi7%IUyA`?;H22Uf+eW%35);91#12m5j4VI zL{Poo68=ICzf$=@mF%rRNCv$8pe2s8+j#@g{#eM9EG$4)ga1{Q0y5Nrppykah;B}< zUQYHF5Fa=fjGIf417hdp<>n~_g*yN9jLX%-mK7TcKt(HWOCaOjzpJ@f*+4AJEgbP<%Gt=%?o<@DH;#xv!`jmNsrSsI4~lfrtyNAX-?JU1&512 zXuucHdr#5e;LH3cjfWo`LjI(2^KhS*kDG@d#D#y9g9(7)=rj!ujuC&B;n zeB7sL01xn;|Dzl?k03Y{{Yitv!FT5rjUOD!PSFH_b9tv|yxid6_h&g?-oN?s0`Blv zIeu^;KUG%%9FhK{A%IT*iv|PM>rR(L{M{c20WiG$*&ZLy-!|Yw05gMA<@jM>Dx*K^ z^7DWr>M0rm3|yyZe89xxG)?et-w@yh$Miqz3IH8^nkLBow|s&;V37K=t{?&&9#7E( z|MCqOH~cU8VBCCv`8|+H6O1K)v#oVULG*^ohk$=c&MDXo^hzcn!415#fqL; literal 0 HcmV?d00001 diff --git a/audit/dataset_statistics_plots/normalized_score_variability.pdf b/audit/dataset_statistics_plots/normalized_score_variability.pdf new file mode 100644 index 0000000000000000000000000000000000000000..9af42030016afaea7577f2332ebc577db172ddbb GIT binary patch literal 34031 zcmZVlbyOSS)(4ChcXy|_1a~b`pv5T`T!RLe;>9UmTHK|$B{-BKK??y26qn*o2@Zvp zo1SyNd*Aill|N=pGRf@O>)HF+KMA|;OI1E0eoe@dE?gmIL1qtI7hG9cW&wjh 
zPY|3|*u#-oK-b0*jJX# zWtRBwsY-yan!c}%FNj(6zi;T;`1*pp-I>Myo#G1UJK5MfyE`(A|L;?84?BI3FS8l) zwl7tYp@0H>nFU_EAwy95-%qqpuUQ(EskN>I_CcH?x2$7@3ThAUh9x(7*6~e7!+7 zZny#Y6QLSYZX;f6j}H{VTttu1pQ&)RB@SjuMSGHOeTeX$ZX+&40&)51tFX}dn)S-y zxy>{;lXPMVudI}mk_IZ2~_=)=Wc-AcMS}xVd4E8Rp?8!tm;H(-I|KjZF`sPN!qv#JcdRXF% zyT{Y|-CIHBfhXW`aB9XA>eBih7UN&l%{Sx0OLrVw)(_C1=8a*cLyzm;UoYs-{`~&6 zef^@*%Btv(WLjguAN;4MTHYSxQMz@kyDRB?G|k?x;0;6Vh#FznOHaw&{!^;$d4dm z4$A!_m-IOD-tzBbDHY^nPza^P29v@SDd({6o!v{1cVW}66K6gtj?)-_pH~FEJ`=lr zp?Y<`PxWV)^zo30zyC>U+N}1hI^(J1U|PUO`fVR@(s)pNR!uVnj(GX^ctqe`=%3#! zXY!%<6AMqb+r|qXFG3#HMtZ|lC5EyQh8`mF&IX95E7F;t3o|r!p#mcC^;!DMpENq> z0^FgnTq!(`hpQR9znmIHN<$cB z3DM!sAZ;YN3;w9poL;Mza-w_gEb?`7oj)M(F7JCH+H2pJ5FAu2#z(4Gh<5S0v~|Z{ z#JbesthC;Y&a`NG3n)!e^qMi!GTtTHN_m^EP+(-qH)M8P=gmQ(%Yu@T*F&CBmA-jx&QUq-g*iT*yPLv9CY1w+Ax zlfso*x_WbQTWY~dCZc>Hl3%bFY*o^90e`(iPNrwKf*YH~r6qSbMkf1b1z#xFhT`T6 zX)1Ci6AJdGnmSb_39Ui0>@@B0eK?YBylqjhh#`|`OlK$$a=z~aW2I7x$$PV)OnRli zrXN!_Cnh#0`WI8xd9v-)se}50NU9ZV&@Sr@!Skxh*~8Jt*c*H0TN#lwco=#g%Mzo! 
zuae&e)r)mC3PtYsRcWec_hT6$go-*gW+>fubqjeZLrw_4O5T%GfVe_LIipp^VX@8e z2mmGd{otYi=CIw5myas-bLqHkm;o)#n2%Zn+;Lo<(G4Q`i~%Xoid$Xf_|-GItxuw~ zrKl$uBzEmyT8VCbBJS805$$i`oRaZ(w8L&+Z%}+$$hpyY>q_r-aSE1UN1ltkM0AQu zGcItltxDjvv` zNmpt~t?@&xjl+wcm;$Wn@%QAs#AD z_l`n?ao$|hJ4lp+zOKUa2vJzftgR1Em8GM1ET&F*pHHo8`-P7+TG7zPA^asinl0s{ z%{cJ+n|(y;qKI3jgJcj;kzwpR`HO<&=#9l1HK>~gw(<>iwb%%zn3(I>9uRx>hihCO zx?W01#(Z|O5~<#>;-SSf3G69%ZNWk21AmHO`$8-NAPT|ZPDbOj^N8VfvpkCxY1>2> zsZcr#gNt`+vAXwp=R*MB6KR3NZWcpFnb2|T=YzN?ICL*|D}Hib;kl;7`vpi6za*iY zvw!!_x^A=VjfGN+j}`2fMMgx#SDX19c(o*)WhT-CYt52nz7M@p;#pe?E+ICmC-oS; z8~z`0QGA__Mg3|Rib7RC8TAMXs<)<7lGCnuUqnoEq%*%aPL(wA-NCW0T!I|FBptoO zkfj=+^(0XbLCu2EqubwwCX8VekMox%Y6wk~^q2%OEOi37omeY?@V zh-O6zXau}b{nnHqhBr~ek&#o}hv`El&~lKn+l+yzEPb0>-9m*uye_YIERka9ESfMX}k8RLBe+Siz zN@jeZ$28aP-g-+?XuKH11@i}di4`t=MP2e8D>LeQ$D?7&EzWEt=2aZGFqV9*?;@=f zQ}Gl2>~<015VOwWTwWTntdFX7pLvSl-*_iq)e%f%-Kvq0P!MgcXLth3y(B8?F7Zpr ziX%xba(NP;aH2XyWK_XzKaNbguwEU?oywG2&L6qklo-9vSmRhjUJ+SR90s1>o_F(w zWskZoO(jNp6{!S<3az)b)0;;5-(fqSU!NhMwnw)K`n0j$dig%>*6(i}xG1}GR&o!4 zic%O{@|4}d(X2;UmDDxcVF91t(74NAPrMqjgoY6u{@MTUQ%Np zs;_ZM7?{Wy(Tp~O`KtonSR4#DqTWRRwev@u79<4Ml8LH1>;(9DKJNwJZg! 
z+u1aL`RE7UMe_P2OzB7^yd$Nk@f1b;AKIU5W9U&}K17BsK6w2#q_ed#uoJHY{V)G9)Z6bp(6ePB*?HOzM~rD zIEo*DIAW(#98a=nXt^xq6$`}Csn&2n)6Iy@I`0l6GDe)a)yCmrMkz9-0oSA@ZG1ay z*;la99O_)5&*J29f1!JhGK3A((Tm}oT2U_HOfVfNgd-8NGtUA4P~J?0{z4OeCL9Tu|Yc=7b2!#5xl7r6(~%Pz2qdS8&WQ>~N779be6EY?daL9lIrk0Q{%hLzO%OGhU#zUAz$O>JEa_^MTRL=KnxGcJ zrrxe4=D%~~`$8e#wY*GI z%VwEiN^N9jwXD-C<+#7TZdHCI(*sdjx?CvGwaDd+zGvE}=WR8xJ*YV)t3WY^cpe<7 zSa`p&$yd55NWbu{LBlLjTS>)`DS_#fd$-JaSNXKmJdr{L=5}_LniM(~7q~zi^J8>H z%0cQSzc^wjU_i-nvk!5cqc!r3Xj{bpq(QT7i)XQ4ykBTDXr*L+=OYeNlO=U4!8$hJ zCB!;T^={-)ex8klcm=RJ1awCwlDzzk7W}>3AW&A%?H2=Kwge60=gn=NN%ZsQ_2s+o zF^I>S1K%2WDv&<=iIZ0KKq)M)`7Vj51ec;eI!Qbor8s*p`(lhYGlC>|!-Jt3-GBf) zq=nre)shHp5;M-Eb%9cY@4=QFOhjEqFNC2yH~KkT2sQPRheIVFC9klL;*wMR4k(*6 z;a>e7PA|)09rjFix{%nC6@k5Ib3P^6v6Y-6^h@^_1wN62 zk`0%D-!F9afIf|Zx`e{yer(j)@hyLw+!62283BWPOn5GdHmsTgM4v7SVQ!aPnOy*m zZv+XVC~-RmHTi|561jt7l*hcZQl%@%zeYzUNR!KVk-0?@ROUL zpKBAF*_umGvtH8Pe&X9fJGO96Xq+g%#82B_L(!{H30frUeATFrK=}#o4CS?imoH9W ztlmg=sie8hgTqAUyK#mq)B?XtA@wkJU!Z5vjUOIc!uQKr4cqh9r246YmI$Gg2WK_4}E>k1+8& z+GX5pwF4j?UpyadQ|B05@+Qf2T-|-!WKNne2ly9v&B=1d_U9oZd*^%>R79X_;#6cV*$|5|@@H^A%cMq;WERiK) zLUB+))8YWnR3HOWrg+RXi}7wQSLvletoniOblgDu4gGGt^T_`huJBWu53*m#9V*+P zByrnLx2j+rTei#+^#=|c65o0?5z+l@MeOi;?^fEylv8jVjT>SmWz*1gZZ?0g2uY%# zYA`mVBM$3WD#0Cax&>7~yxHUQ0++DQ7qg3*8eg z3V23H#dg7S-_M^(fAT^sxRczFk1Nc#!oEEy1FJR06Oc)^IXL>0iD+M!y63rr(4{A> zs>M`y^KP}E?CT`O!eR#cXt#b_Bh>FpBF0LktjX3`PLKgE+4|29F)yO^giW1v#PN~Ha^_`BTD8(F4;+Of6 z^N?N?d6IAMUP1L{!r#45P7^WjZAem84Jl0$BFy5JZ;oKV;CJRKbUt`Cd3A-#Q*(su z+B)1!DtL4*6uTy1u8a7uR>>(M{vCd{fv(~So8d;|sJjBzWqd-slU4_;FUOtAJf@Q8 z!%E7N8s$DWLWp|@DA5o z{uMvUsgb_nusqFx!qs_Y;9FL+o^L92#|yhH&}opcbY{}GD^~y(vqT{mC!@=v55aC^ z`z!xW5n6}lA6AexCzPm zFtb`XnPESasHZX#!PFwHgLkeX=9^h~xc`)Vr;{;6bR@U z5Qc|1$0_~PtcFUJN>VQ18r>(x2`kThNJ7}yeIFU4w!qJ3Wai8$98IP?2)1wS5Kdars=EEU%Cr9y zu_b{Cy)Tfl+cTR|8oU&c9yt*$#aS8`^)Y-`+9vSU>Qang?*!p1)E=m2UX;_fI_yvK zaXb540q-Y6d!J&it&N&CWn;sT4g{u@Z0&tYh|8gInQdy0YBMur>iATce 
z0MYdE8)Jk)>^sYkuhrWVR-)o{KxVMIU^S3zD>-*R&lXn5r>U9$CICBSp%Z!xR4`$U

W`_7!{S8ZJws18FRH7p2$JN$EEXJtS98DYNDghWwjnl2bizcI7@W8Z z0$C1kyF91490z5cycRR~RYTgh(Gk>vKXds!mKtIL+k8ptn=v0~YlBs}5YdzYk5WIv zkkxm))h~gUBJ3OUBqe*^uiM$ki#j6eUniRFdWybdf-ms8K%9}8yKe}7LvGx=>xE7E ze(Rg*2FYorMb1X6IIK&OemsI-JS1UfP~RW}neBo%lwx>HlU%TCXz?iLaG02=wl)`L zP^ZO*mx`a{^sB6~3ImS`8`+*o!GfN_42Zx9V#3_H#>wa<^R^P7qoL1zKAVXpQ*v$_ z;4TEEjz?(&v7mS|Y9CGY>!+<2K>D0hD^ z>axq;gh@%?Y;%E^t!*yS-!oFhdV-DOL5M(AXTrFw8%Bq&v+odv+}e>Lsw3v&cKrJCSUiS&wNXaa{bkP*x{ ztHLSU0tu_bSK>}ox)zh;%0yV~7(nKXU6k>K)n`H3I_ZgD18J{w5@8UR`&V%w`2mf< zAFR1bH5fywlj%p(jTgo9#3^(BNThX9D+E=;pQ5sb3=;IHIz~IPV&(}NiNyLuUQQ-0 zmtaH)3rW}A8jCD?N$FIhrefaVb-#_Wdvh6tA4kkq=|?AV`6`KUbLRO+2ikDu&J`mW zS5Jsr`UT~iOS*a3QzOXM2E86ngU*5j z^xSD)v`Jmi*O>%{C$)lEH2oy?z>S(NTbOi*)+)i!b)GWJ$g-q)fL8Pf`rWsWZdt?b zB=@@bRTd)xaTp>Pc%RieyOu@vkUJjr6T|vxH~p^7NKvS?IHY5*kneou96S{v*ul^F zb0^uOApKLs&Yl2YcjG0quH%j}Rq@RT3DX@@{^Arj9vo`u*;or#3! z3Fc4Pw+p1;y^jWsY40;k6;OqRf(pTh0bi2{8Z5La=-6?3n&1T^R=>qOFiki0EuVL% zj2wIj-C)b$_IQ~nWjA+bLkd91joQz5H;*{@ah9IN)WdWN*i{XUiKZZ~D z(Fr+WXdx5M4{HVLu^A-x(lP~K30^Y4ls1-C4L8ZnFj0>Qynsg;=X1@6_MovRtYt05 z0dgY4b7ntakyp25&<=kudYx(>8qr*e$!fJu!lv=GFEoOk(Wy_Ed_@iU`cqVVvT{vB zwge8YBy8`Mk>`;gJ#I0|aY&K3`kJAEVefCaJ!s?x{y+}ENhqh@F+0h=qnbTS2fWq9 zS9dWB+(Tzj{k825h==6Xo4h*JAl;JBlcj`NvF$XTe> z)3lC@v}VXEIrVhkIqUpl7>_g6$L1g2lE>Zlg~yDRd#^`|o7p+HeCRL`hDLbgX$;MHNTmX~)y|oTxFBJJd9dL_ zU27cmtg>cc<>kdqr%bPuIA1%sW>PUT` zmr8b&M>QbEvLe7$ZEtGX{dMT&+mxwmF)7>`x|i%5vGc{ z*l(9Or`>~t)E!6`3CG~($DE!6;)l#QDLp^6)5H&B7T8|)%z)dern&SCNo$bNHK_%5 zP%n=)!1B)me3n7h%_sd{vHg?O6jsqDtSCEWKU&AO;_UMUcg?b{xi50*P9YifvAtyz zPmP1RKQ7>tG?Bac)(ZkYRcHQ8!}k$(Q~oRxFZ+3a&PDeVOi%jRZDF_$y$oOXHs6%~ z*}38e$cE7;7(K?116aO^^Os`l)#McXb+rwPQWha&TWEz0swnD@yQu3B6&=*O7+ABx^cPP{b#5U625`hgf>tD--z4DI=s*SF>hX;6CU0%=pIgpM|lxeQch&)XDao<8O40}8uKlQrEn9(S->2Q_XN2381Ixb zD{kNrDzbTivuhKb!C@vUo>CP)6+FDlrPFixGUV{C3O-a+QYW)8AmYgzs`NS~MAr-) zPgvoyUCgAZ0VD#>!$T*=L7L;gxb0*PAXx zypAg)PphGykugzfF7mG1=c5z~O>nYQyO_HM_fH=*iT-~9M4|9sky zRBJ^M()O1XsVDoTa-=PWF-SA6RP&=)38 
z@#qMY$n+}7X&Rq1ydj;YSE}3hnER7s6qB+EEw^m(F`=eKywxaMh?WE{Ou=vUKbpp9 z?Ggybm(9*V43k6YI$s3JWq|=HyR!65SD31;&JCJ&+gKKiI5pv!VGFQCx|~ zXzycy!$?ItG|6({H!tkz0|VreqaGJf>0^jsUUvKW!8v z-uB%RcbG~;WIwo-&&#As04|jBNG5sP)_{`xVN7Z!Xl@T+AY4O{rQDjLL zS)5B(pSp~*;QpUVed`*iZJA3e9Fzw?5Vfgm_m~`p)E|AH zG_R&Wk<5R)5tUY7CTpUdz1C89q70 zv<-}fD6amjTc-?G^rL*LN#IOyjA`Nf!sa^;%GdO=G0KZT%q%StW1>6AN4Gd(k)`Dgp}LFy?sn0Il6v6@fE3{Gi5)Yl z>9rj%kHGTcMoqP_iL&Syk9wDK0&Dj1s3DcU^FC(&rO)~)CYvIQE%Gx`2ELn!da3e| z76ABPmxKOH)7JXqVx3?5+h1<@qo?y?i7AxRFsBiX_$VzV%1rLT_N(aorXk(7gu9r- zh1w4q-%?F`D~d?7sfB^6tkVyiVkDQib20^ZXAalCk7FC(%`(tLM-uot6=f zU`Y_IGnr_GAgNjeD+Jb!J9?=%kKSoOM`v#nMp>c>pPBWjzReBtvY+spT*IB?Oz?XzHAd2qiR>RV)xDe^Em=R_d@hK%>$j|6gS}Zg{D-CPVS~Sc#D=kAH^KtYFh+ z;n1_e0=mq$8%mQ;#34ltYM2VuViGJpwa$G|+}s3%C-FO^K@OJNU~jj0m@sF=2)h^Rld8;o1~flwTQ|YPKh$Bt|!lC*IoY>-W-9 z|MsL;hOSGk=Qt9dhDkEQq!VQ&_0ZAS0apz+Q~I85pst_syRt!n`H;$%NtXKt{9|Q{ z{c~d8^~MQ>h7?ZUIm2X$c!Eo2Tr?+!at|2Xzj+IPY#3tRZb+e%hq*{f zh~&8yQo>w68-}v&9P~QaQ}ONUb5qH@r`gU)1+%mNqSeiFT`!S1Zhub7!*is2cc6{K zB!@>g$pdRyMqvME`b?xwGlhEd2UFJj zi^|$qrBK(-+MHP;@W);4npo1!jVE9vnQ@g|GHbGribLE77mP82$I=37`W^3+^cdZ% z5peyc&U|26YNP6J?*J7#EDL4ozn?+dqYLqL-ykdcCGg0D_ituGwgw9O7F_DH2p2+7 ziM}{+7KPW~N2KnhpG-FiZfddsYEe=HTuzLYw?FSf=nd=I4%c+_`R*ktc?CySW1@Tt zev!Dc>Fsr0g}q=YMH`okY&BK*>94%!6%CHNJUsVpGxNtH+dp-cX~p-oA+KL8GNVSy z!y*K4hh<0t^?zv^^V}xZOki1sTrC;9qyInRZW;pt04|eLcVJ!YhHrmgpCJJYXga*c3%N&!9!h*i|(1 zW*4_le?rzMQ#9)v=_kY21DLBFXq;xU+&T4BzE437OJ!H&vjsiXk{U4lnvF~_}G588#T>_F-$Ww+v?R`^N!_a4B7hyW&zVcSNj`;K(1 zS7o^H_Hycbp?VLJofRL)V~eC|%Tt7~Qi+{huZ5xKxU>x8bD_1|1MCn}C#+(GNH*#r zQUk8;DEx*unPXra>Qn+Z85w&&ao%r@7V){Y*`Oif)_tW7_kAFv#?CC&+_0i?C00e= z-ml)h0-jpk;IVG|7GuIfqRbEcOFWf03-rf}fRpsoS6o%I|8J8Ig8Ff8W05s3#eZA6 z=ZH<>!4RLJYnip_DQaWL{8Y5whK7mS*8!d66ky-cMaQx1G3VOW?8WHPWbR`N@S<{X zHxh*>$hOHcj6U>IK6K;AZrrS^Z>mJSX+4%^G?t`1`7?8*xk2r~nLblrl7by~C=I6a zsX@w2?Rqx+o4-WPm>r!JOt$op1j`Gx!%^X$xxk5KBdm%i<#oBa{iAP9)WO9e( zj2ni+SxTrM=DvTQ6ts?dkkyD!9$Wb^M~$gLM%98wReN9wOpf6?<{zBnt1wOGESBZT 
zz3Miy${utu8|oU15w5S|5vka>UQ#X;a}()g%5fP}CON_w zjoBzObu&%K!`&FZ21H7Ou=#6b(hVo#h&<8iA&W%jd=P#3X0W&oM1nEX8|UsE{xG|K!LOA!Y>n!I+?o5YN*okYC(C#G6$xncAN z11|Nr`e;K+RnMMX1w@~_gHp~!3iPNMwFjZM*es?d<89bb?{Cs%fSv|{bQ0nfTl_1& z;VxSN9V?MIcBo8#8QdXce*8Y3sONFgEvRBiKY?iHYdF^R5wUrCJ5FsO5!E_RoMY=| zn$GDT7vFrI)`Hd*lf=GXVx;NLu7tel7tynufM2EuuT}+G>o;2F8VKf$!nEm~JSi|R z;h(i~@n`W1!+Lk|PdHcSE%487KB__QE|i$pyOt$aN?5?}v&!&u4&P9M%Bb)&gTE4I zn(W(r4s3@6ymt$jAyHvVZ6Iq{DA^JPPwcHR0>dUjb8eYkj z7{TVS1`g^NNBw!np|O#*Yp}^k(6x(Qry~5w5VcjrM&9IY4xIm>wv8tfc>JowsUi7y0G>s?v`SWJEDa~CE|?fJ|-ClX2ANB!6PtcLpb5=MhW z-#_h1OKk8=nLe+x7^p4Z(!1f^#EW(Ht6uD2|0)Qw;J#dr`_?jXPU5B%=f}sVd8x!* z=|XaymWuP)M{b+ypI(jI7+)uv5H>L<2W^cl%Rf|^yn?IRmhHie(Z@u;v_WQasdRNK z3Hwz7~wOB!xp^7#a}bs>8*k!X(~ph(sO(ti{U@a|e=QypqO1jX9-oTJVnNNCWsY zCs!h!HSZ2u(WJ{x|5(oLe*~FB{ zY&(dood!itxEm_S$87zX+5G{Xmf5d}sI6>_yKdU~`#e48E+*lFBCT zzVz!;&WG5WKKbCxGe*VtI4~$tOqAsSeOzO?5ymFyrzd0Uq=cQ8z~|#y`W}&fu9119cC*=$K9sTQ?rO? zhPQxwwOmQjAM*nekan*BK#{$He$cr(cm&oi=xrE=vD)@c>nCgm7mZg)&?A?YYZ=ou zoN_xGolHCn?OSa{2|<790Rgn;HnObeFpRIBgf# zassTLQRCd_2EO^$6@g9_FC6cvrC-Vj`_~8PoS-nEg}r+SQkndLX+ER zkd5ZP)5a`TxPD%L1?3p}M_s5D8y<@dt!z}fPx=~LG5A8b3tm7tyi^PUbF)U z%c42n#_W%C)vuOl2Ot{J1!}n3=_b6ZqB&^xIGghoF|S1w>~5|J`|1oHTT78hibeDvM|M# zH|!9DO8eB@3>-&t8xu{gt@uu)@%fZmgy_8#mNkQ zCWA4xJ*Du~BELCHU0Gcybz(>?%Nq2%lCJ#)XS=^N?LWVUNzkFeX(fq zvoxx=6lJ0!Da77BamN1in^)(BT`TSMj~eAb4C7(xKa}|dHhtk@pv7o2`*(Vll)O~Q zCK`(nqoGtJQG0XUaR25$H$%p5+V3A$!APf36+-5?wx%9{6uGhTLi{e^SjR`u>C}Dc zOI2Y%fuEQK!szn3WKE?Y8(1C^TVcv>{nMV3KP``>H`zPK!p)9P$X%;L3u?K>5!jP0 zT`>GPzYGf>`+KTmk7S52T&?X@B5zT$%2;W+vv77&_nRilg>2G$!mo?(c=1y&fPjWY z!%a9^w*skpw&EycZ)gnDqtdZ+#e=CLLgHq($3XsO%O>}2mjNBtA8VWs@WQd7cNQHQ zgOUuJFh5Vj-M8il5K66MKp_clRJ_U8kjMmKa`{|+nA-SQY%|hp%lAK8dH*ps@A1@5 zJUG!!p`}9r>J4wxmcv{LrSkc;(d5Xx*~dznsrqV<+14tW*JGcf zTQX<-+P?SdL?MMDIMqv^n4b5tZ+O*#tUlIHJOjv%;V=ss`>m=I0-2hwb3@i=8x})HjHVLYGUANw@_Zaw6+;Wi0k=JQez(gKrl z>rbuR{5uBWpBRLXwQEGxoz9|3GYUQ$5R_NQd5Oc?)@(~yk>XnKAWv0*Re2= 
zU`PnAxlO|uddB!ad+o39ay&+iKbXfL*=N|CnScKlOVSLQkAN4Z>NfPh$Q@Z6mT{XJ zO%3y%W(v9Xa#E5FJTTp*Zn;$4sU|C3vc!KKq^TQ@E`RjKNu|?>rDsJEyE7mM9+BGOVOo3**Tl$>b z5X2Q`P??^Zi9{jq%q+((?BA@%98H3Fjn{c@6Mq_Pxwv>wF>k=g&X&*X zpp7JJL0tr_%?ZgD5!K`-IxdD7(&lC{lf$UYJkNc+OSa?%kUG0bL&n#{MV^UJx$KuT z$EQcKTq~^Q#EwB*QB&y%PN-Cai3B=tYNxH*KP&V9l~acjSV~ME=kC5QHv=2$#M?9M zhHjS5mYeHr`+B7VEaFjb^{CHgtCc92VCfvbawKJ>)5cm^br2f<7BsPueuqZ%?$YaqacpkV9HcrsAy?73GiS*thCZD=QM2PZuGG z#Y&SXjosW4dEo2jEJU&8zeFRax&KON^+vM)7217hRhH768E@Wp4FwSDxUE235-lZXBez2>@X+-s4K zNNApT)5`lrIg=m;^1MslM|M+ZMV?kW55 zE$dGIkH z?{z%8{HaUf${F9|j#Q=^jjB@PVdiCo*(JSZt>58Ou_oP>!>(g=>dT`3U&p@&Bl3>z zzwzc9u`pBNS>o+$w0h}&+$hjHl!*V+iXrV5)CJdp8}0w}>My`670xpbXz0dE)^sqw zNB3LSeLT?SLQ*nJ>pXeRVn|A60;yr=Ct~&tL@+_y2SLk>M%RCWP-SnjuApB zZ?}Jy*yHAq`FWd#qE_BwWjo;S9*rb$Z-6}`HThWZJ&ece?|0x*1ZUN>Vr%sV3igQwuGTAkYMv zG`?BcFkA?o{y?(bRwp@iLgs2#f5OA{q1->7kP>J~<4)ji%X6!uOwi9i5ZKSOP=G#u z8hstfhmpB}TpgIe{Cy6H7Y7Z3rk)ritfZx?AZdMsnkQAq8#~dkD<$}1CVqHHI(xG| zXmO*0#ryLa(W&Zj*6OWn`g4@1 zDF@;`ZFi_xw*rO$_JWxj{l3PUs6JqzDWOlq3s2AACUxw?OxCsI;Gy4#pa?Z15Q}sd zgDc;eD=^c)#l3lm3&^l$)3_DTWzDC>u)NNfebO$_<#p%xi5pMd9}-^MdZEV{z$k32 z^Lj++c+gH>pvXsKL`^M?A}OCA=^8Pr>RuPUS&)6Sy3bO|$+#RIoAV9vb!cgb8$iy? 
zJ0`4}BszlsH9`5+0!=*kh+~HWSn{cV&p{-wr%jsQVFnk*J&#M1`7);#r?#FaM<(Bq z0q0MSKbuY%hPI?+(8jqs#7~_}8mhOfI&Xbq_ssG7Zc>zUUX|fLVmo%7hUB!Go8-#K zSxrOUSjWjJg9Qc|B%jPxMlS1r@el=|o`GGwRIf`Uo za{n?r(Z|;GK+OX*@6(*fiu89lG<8SW<-9eAuUu^5)QlnSl>L-UegHxO%IF*BqSX|{ zZfpBD$0+ulljZJDx|u1L<`PzT2G7Ime|dnrr=Bk=Z%So+HK_?Tys!qFEM(NmdFF~h zF!@~r!{c(!)ZTpZ$TqpP69mvO?sA*r_%YDpO+IjO7+R$WvnL4QPhq((t_G^4xLN{0 zys3Z-y|v8Iopk6l(Xq8$r4!qZt2~2t?cgFuh(wylVSM7Sue}CbuW-q{Fo%5TTV<6C zQVf|^#Q{MxT6Wt7IwQz~zQbSB zW~JIOET6UPrMDx#dB@)>`{zNg+4Je&<7*bkiw8M=+P~;XUA8i_DfOIPOwR#+0$7Y* zk^ptQ%3bglA}HIEo@XFwI--7+|My>bT|)9@upELw!3W{H@!!IEi5)|oUec-9WM(@o zWp??4z-(pfe1sbz^<+Ic(3Qp&)Iys^C05^6V-ok$g-Awe~;@iueDha(Z*aw_LTk$R5Xq>~R}uI>~=r>`$x{v=xLo z`*MB#kBy?760kBnTlDq2*^Z5=>&qQKqfIR=04Rq2t-Y>?H}3yGtvv-)R87<%pmZZr zvVertvb!uL-5@31(%s!5ol+toC4v%4mz0EnAP87=OP7jt{clnH{FMKEp8x!N&O2}R z&di-VcV_Or6L0QdU8pH~*B#Q~Wel#(O-+{Q;rF)p3{AY1iZB@R;XKkyx4LX*ef*n~ znPSa)_OPaV2n5nIWSPB+Ssfj4>5v!gWNx?PAAsAKRz&FxQ-csqjpd|CU8FKyBpoOXJ z2s!nARgiz@Cb!*=0QT(zKEl##b*?v1B|1oQnr-D-u1xC@9-BNBt54HZbRCzHU@p5ev-Zh8{V1pVAd{cP%>g0HI&;pw__lNzJT-GmSf z`-HY~^3UNp##cv^+H&_@yQBs?kBhcF;emC}npvMp5(f&a;7i|mTJ6T7H2ywK z8kMfLo9@CNUW!Oqi>h(5^7A;Ow;AyhC}1C1X;Sv3xN;EWKl67sWmgn) ziNsz|UN+B*=#j5P@oDN{3EI1uWus|y=P`u1hTBRry{*srm+2lc!RjwwV(elUDUEzD_S`#;``Ze><$4|0&ytsL&}NS0`ZnYpGV?KBDnyHl}3d zbuS^tD^7io;bGxX5bWdC<%hSX>`Un)pc2&aPJZ@FRh|pXzx5WwO;(N6tX-r`bv?;B za-$fnRpQ&%zAX+Q_5)@2EVTEOfRFyI+UZVB=AlPn`V%J50kj>@^2}IU6~oURnb*Hw zn(w_>GZ@ca-S;Nr?sJc_mQ_`opbDMFokI(5>*!IIj1KliPXX!n^+=v@JTEhgj~#A7 zJ_m_w0)0nJ+sGCC)H(NjFth}@W->;{F?Qwy7c~Rl)4+46l-4=PoR$`5WN?|x8+jikb(aY`7A`)di#X8#2`Z?vnd0dTF{p7L| zW2T~`V5>Rkc&bF4t#FygthaQkgcwnC*~9%GrDOTo21Bj8EhBiZrW8~ z&XO3H(60oL@y-s4aNX`4-J*81dHeC&JrT=U4B{rUs#41J$oQvu36G`rw2PlK@i|^i zzxNsIfxDh8S&>pWJw;60rG9m-$Dg#v`Kfi=@^1P-D8M9fiYvoVzbgLvdydU`Zf@<2 zYyh(%D;n<|6NiF_EAljFRZN4-Y1MOhnBT4BUUPj-I*d5$=di1usVo?&F8H3cdkc~y zTMKLuTBLhlFZOG}Y#u0>he%g=UoV;wxT(=FaDQHYgLyP!x~r2^e2@$a87f*B*XA98>ZsN2l6QSX@eDtOgT|ipc@U(1ooX=v`X@D>1cGZN3&xB9QZv~ 
zbk<~La*YR!s!19EBdI%HYVg|K2OYE4+d+}ow<%4l{e#2>_}GVrkdiY#IQIBHyktG@ z7_6cMpZ*dyk@Q);>}&S6QtaI=RlYJ{J+!POw|1ebwMS*bTs+x&<{t3@UMnjMBmIe5 zgQ=d{MTiak=<2hJV`%gKpC6;+jxbYPWtIIj%!?)Wfn1Y{S2Jk{xrh(%y`pMSh9ByCjIL1+TpDbrN()Uzsy(Da^sIo-{-1xjso3awWQ191E}8d0Az zM>QWO-eDm4Rm-ucIm{7d@`1L{*qkkcF7I_bGD_Gd9DPh&QyYB z$80^1CIr(ZN3NJe1}^fkIa94BcTkiFPt% zVV3IcL7Ucg`vk!KA)zx;u3@FXk~y$fp9$>MXORO2(@@Q&-nYqY?J?XW4N(>DDKWl` ziLL^Z4olH(%rN00$~4-hSW! zjX~nZ2H$$v+sI8EMKf-Fsgx(yF_5QDEfc)%_JW4PUe0Yx?Hzu(bm4Okte@qJD#aOy z&1OE-Jb~LAT~M*5n!Z6bpNhlqwA!RsAjEvT9=!oa=?ki=9=Z?l6O)?Qdzav^rZu0r z=c?QUMD|Yh-rm-Ol?v)7EB9YCzTK)|DR*kyDj_{QS=n5vc(J#8P$J}W7}n?S$QVhC~Ws&9k6ntF6?%KKqFYzb_woxRsA4s3~{7{lDy!>JNDC>oXlac!3Y1 zd#u#x=h?oZv3ugLce1w^({wb?ra!&E;viuyrKaf)s&KArbd(?4_T;}9w)<{WvB~3L zbz=rz3_eB+Zrf_tus`uQTI@Xe{BGri|IXIQk%#Z$(f!H84KVPp=Ed>a!NI|8y%QJT zFBLT}j%L;tCJ#sib&YO3$I1jV?AiqKn#^4~AhhMOvB*xUruk^Oz z2S#zC29~=^-5PrAG2GC6H< z$)Mm8*9A>^jtW=agUu7cNnstjx}KqkwF~Mu8(LY@UIZV?O~mLvn(`Ac+uj@B{nmD} zc6z@NPUuICL9ig=cB7RIq2uFO?!gWJ1HWnRRBu#&!Qm4kyPrt; zA1&CQ;K81cWg~77`*H2nkGsT9@5n;jn+4&8@Nk^nFm`&o76PgR()$TRTr2kb4OevN}7f-bh95^{5@PpMDx7Ou?6Acuv9^ll)V)HMqRBRbyCGp=tY6W)9J z+)^koRf4ws8$L(M+IQM_kE|4IS@QDK*b;A!$5&r_sl0Zcn zBy8g3Hg{C!QdA3Qy1P|vG<0q1R%UhM>Rq9uV?~jWK_XuN3Y`p7Cu_=^Z{Euvczt9l z^0jTR)E@olRP_AA^}_p+byX7(Q7ThoWikQFXC_VsM5Aw)_A96CoS?Bqu!8r~tL0Nq z&4O1HAeNaQK(LJ)Iq|ru8YsfY? 
zy4xg>U!_s9aNsDO2lj~~%P=IZ?Ve%AjCd1$wf#XyV7#>K<@l9C6eYXDOApF)9OWOn zCof|Yzl$%Ex)L$n+V`zPK~-Zi!rKc6SuBuvo$7^lN%^4E9vF=-FyJm~=y=3Ms#~=g z?~vYc&<=9&VdUREqMO_b)9vJ3yT5D9tVMsr6l;NM@?eQIKiZlo>1FYAZ}J;iBSK&4 z1!nO_892lYMqV`UKAJ(fOkHZW>vw#7i7+af{G8T0=PoxW?_cyZdS#hdR0w+g0?Bc+ zv?1e0!7XdtcPB5ZWvFHGK^<;|s$~QeLbR{AlDPa#CAc|JH{cQO++`r?yJ zEonUrMd<^FyH4z~jxLxjV}zxLqP-@irO7!&)r(8}qMG03FhE2UeGuL>tuAdY<48;( zz*HA6I&^JAnrU;jMdQx>D{@DL5ZTmS*1OIX_>u`xHYU9Z#C9ocNtDEK5-PQUD@FO` z*M*okUNNxps^k)xaP(wck+HS=H08>cB8_D;DffWtQGAcHqEv`98@^j>5Z^0vytLr9 zpkjxQ5kdn@{^`b&L?++fN?Z?HALUhb^jP;gsG7QQUaV)(uj;ARn(M#HI8!V8)Y$e zF(=Xz`W5&&6#aUo;E(YN=lu(c;BXZVB@XV3gpVadf2W8nO$$`UDi?qqXT{ToBptN% z$-&ei4;d-*7CzlMBp@yi{EaD)zY%u=3S|T+5cCy+5J};HmQ2RyW29L}qXwNaLBXiu zh8IQlnFZpp`G?(3^h<;5uWxctAm7vKk!S&>U$kmg6Nd1otuUFkX29b=3uaT!1$RB) z#5#apj}8~weoka%e?7T6e(vR)`P6a$i8ZSdylz|TPChjHb!Sv39_&bkEJg!-2qm5-b*y5W-7w7|a z6SiK>-f~zM+G1CDB;0zHG)t!Jnu} zHBe+Cs7!I{G~0VP+`BT$68eu1w zO`Rt_LlEsW`(Z;UdTdLJo32PB*Rs#diuO^yH!<11h*zHo)6UxXqX1JrWPFF#go;)(&ZU}tQ+GKL+R$@h~iYj7opcEure)}1{&!yv1%JT8o z_^lmk5&_E_=WrNtyXkKn=H!I_H;360Z`uL6M102=lO4ay$2QA6hbijQ@hE@F(`;K^ zRT7sE$@KAGA2R1%AC2`{Zj0DUn3NKD6?+&*Rc4ywD9&AW=dgQaz*pi~t?Piqz58QR zt;?^pX0UWg?=FA#I@Dx*$bKLl+-;eX&$1)?`5}2i%)DNfc1}UU{5#JX&c&YEwaE1m zd)5Q_G5U)JK_d@SiR`yy;B<@qqf4_^GZrE2s1I`4m}QsR*Veho({-u!18=(Htqlg% zHH%8OXT(i(W;jH@xo~qy-7W@YlG$NO)BMo|EcH$JdQGe2U{5m}!_3EtM2Rd57TRLT z*Cg531hC%Ir%H9ZO*XT%yos{b+R*KU=+BSzRA88k9x6FU(g!3KJ2Fj2iS4S}+~kM%kNCR?5|i_iM9L`DvM>T7Rbj%U8sFD%_OSC`kz z=1`22-`Gz-XAs?O@FzBYMQov1Y9`=SpXyvs`=o+h@$#^@foj^}i85gG?Cqm;Yvk6S z(*@2mTsb)Yy9>Z$%5d6*u~EF`TL?1y?i2UUf*ht-Cnf!OLXc)quhJ2s6oDJ!up2Bl z;S@f$whm-cf|kk6b%|*Mq@0(e!^BOdIoP;?7n0+tXB5n z?G92LokQ$7mMSObUq}Raf=mJxqQAOgviB1h^}A$lICu;+oqPlrZ0BHJLp5ROd2XZ0 ze&AHIWC{yA`uaq2nEBw~lUqndC~rrY`~kmXsIPp~NXC^+TcNvsHA&u@Ll}?77E<2} zGCQmI_x0wblc@6xCN^C*c5Gy#7`4Dxb1P|oZ@pt%#{Rt1Xi_2O5$Xc*{e_?&o!sd} zsY?N$hAMN}_B{7;=)Nu@m!%SyWPC%3-q-F1HV56-90; zFJo&6FnlswruxghZpVJ7{D| zwsB>Uug+EvjYV?~qGzw?aW&p+-gXU;MP0V7aQ;+)Y3(}Qx}TZs&&?$7f+?b9Z>S`` 
z;7u&gIk967A8Ht54i&JPAbKtw5Esr%@d0(xcK_i%&f4S&Qpd^@?sIy;dDbudFBGMG z{7t);!apQdgg^I!BeCi9nE!(debN=BOt&~cJBSmiT=b1llI~B=wbz=V*n5E%wzS^g z(RgiD3^=OVPMn@weLY%+g+c%Iw=PXkVSL6*{J`0nPp+?UCJmidr}OVrOkeTJqFoSH zi?}fE%>3ans}G&am=Cy- zE}?s&l~QW0p-^G-5(Of?>Y8$Q38`$Wxa>7zSD5Z@pM<3;bmDYDcll2?!p<>jp`1K_ zG3syKv_pfSk1h~53FigKNWX!7P$6AAs)0ASL7*5d)MmXA82UabG{uIFGw&pO0>&e< zZrx8NourFpt`EN_Ej3ioVy-lf_BQRMGhDEZvA%qWBW$TQc6~>c3T{0-!Y(?X+aHd< z;CvGTPMfdJU~{i8r;JW(*pZigkY1OGp~+ckdw4s9f1~! z_FRvG!Jb7KcA96N?Q|CvvKLMoMB&E3JAb0Etp6;9yQb&$MBI)qf}B!+^LPmB5* zDQ5`jDJKrn+Q4P}b4WSI7Kd{Eg%XD=K@bzdl}$`R2jynrgq>AO*Y?$uQJA>Suvmw| z4O&x7T|d?Kn%8fWZc_=yY$_Uu@}qc)>j4v0?WY}s95H5fmr0iKQ9Ai*JYK0!^UJGp zKk@l6Qx$K75w@YC)U?Au(z*kG zHN4$a4fP@j*@F?Si*SCujTKwU2Rf!7b2*RcqA}GWO0+)t3v%)M7s#E}Ej{B0)}VPE zqsdXXbvPs2Ny#rY`aWv7vT}rHXLZ};9Cn^FJ%Ikr^a`%jWs3^IKB&j!!WVvW*NZ{( zQ`yo9h3td@KfdA3+*dsqfnc8kT6jEgV2t2#|MSl`lySO?cfu>Ojl$_uA{|irV23=r z5}XlhI$4J06qXLL*JfVz9`L`sl~boZ3XL+0Z)+Y@T8dqCi-}3XNc=E5%%{hD3oK4E zOhu4Bci5(u!TK?l-%R5H{2fyeH}b5J-gS8@otO!+t^(Ct3Wb}uyL`=bFj@^+GjGg^5hQPl0Tq%u2j{W2KIdz2|N6a4?D5{Ze{H zdfGWx8oFh4KgW*TOT|m83FFFc;%s@5OWN!)e4%_cMj6Jp?|a|v^lR+z#kALC`Fy*Y zzWR~H*vA+G4@dl^E5z|f&6q+#uyE3&L&H1aU$FMbW(qu_UV)@t;GoAymlvv|J}g?6 zlBQ5-b3{}nW8q81IDD+})$@^1G^Q)wP5V~v*?W*(_JQ>RV+(pOz61_;d1zYZ^<~^z z`hCTzowvC?y=DC<=(b8BN!|Nh3y&?Puic>tn#+#P;SCsYo=lvs%H(dcE!;(>jC2ze zy&DGE?eL=%@2>%^2~;~ZXw?Sd69-;K@AtF|2qt@}-YqEpV3+*chp*GHvBQsZ0+~5k zd@u(N!i>4MGqmib6U6T?o6I8Fe#Ew7M`nbE(oBnW(l{L`^;emBQ9ST`owN0-(rk7S znGa?&d%Z7bHY&0`k8Xm|^Ew-|)XMzR$=(2FQ^-Bj?w}B@_^jdVJfxmU|bG zQiibapTmfAOlc_hUuZCSWf>=6#i4$M#OF?}%7)C7M6vx7YZ2z9Zw5h^rSVJk3M;Q) zsEHOUe$eF6yQvto`Ebx^)Bm293yEB^oUW)_iixogqd+p#ytp*2lu`}0VDd;4W6BDy zz~UHkhlv>4{mW$YJpo>61t=Fs+|-N)b=YPKB`Q{v?mV};wB)d4POAkI-kNy#A@qRh zSgd`lUi9%R+Gj|#r0F}E0kXG*TfDMpp2wiHL{NFOUgiE;x2w7Cq!{7O%>B->TTOjO4as*nC$~-sZE8V>;&%1%}k~%HwilwY(o_l zXJ4Bth~i#*v)!TpsCOa~!sHnGJafL!0kJ-I zgqdl8?=daVWm5d==tT@DQ7*Cy6n$~@b4^de1 zFiS*UPgH7|rwQ%p`mcPsz@RD4_9q88&$BRr?*88!B1$4oFFCO54oLQe%~>(JzIeYK 
zUdlUpaXI}PYD6%SMwR^QO0YZ2q&n}~=%o{J+XIp=W;&_L=T8jt#@nnNC)xO88Pus8 zq>QLSWrl6UoCU%$ zzw{~OF_4?D^X$rg^)Tm+wSB$!AvRuve7lb`4Y)f(Zk`=BX=<%yqA6s?R$@Y!@QKJN zO`~zeAsq!hd5GrlWNQh^s2MFe{p=r>cNvpY7#wvau4iVY@9?ACyAX^vp; z;(HHszYuyc&5G3-IZ#LQv*Ju+SKaD(%KF((_nOvct8jY$qa?50O|r`$6O%RHJz^v| zs$|U_@u;LELpGKyz9W+fAw4o*n=F*)ZhYbNmbU*K;?FTeIsZZfoY};v2%EUSSr}mx z_w)hw@TXIIm@}z+N)`p_z5{mfIc1(>twA{dW}w}zCqUsq7xPwZK?gNwc_zHhbJ(`0 zTP413)Adonv9><{+ow6mHN82@wFV)2%bRW$;X-M|QZ$t0jIX9xudN#|9l( zRmp_jAE_5&qgmHD_AGg{$ME*C(ih&f1xKOzm7$YPx*pgb+h&XX)f=4?ce4z=H9fB2 z7`b@}Io^;derBy@Xs)baC75q-bl5JN!K7pP9BCws<=r7}2Ku%#nI#1G$~#(@8B%eA zLJ3!w&Z*;r=frNq6PLefhDevCfy}NvOJu;5GEE>ib;(c3(W5Wdc`SG1^>VnI);J++ zvE0O`cJZ1Lkp|m~dKXq5M{mZfyu5qVm9Synd}*MLWm3c^wZX^j8h$1Z|Or>h07kg@$)8I!HVOJn7OfNqFCO& z!&^t|h8JCMl-Dw`C0e;>KX=#vP-r<+c8ANiuX9 zm5!>zjGCfi?Ec4BHWnr~kFaWTQ?brzKt!IUmfm9GyK_1z-JQuJTRi8osit4rE;UAcm0lLiCh8sivtS!ZaB7+zTA^hCy zwjw+}C2=9W`M&c-@0#>(uKwlGf5X8B}@~q73t)|eEsIDWNT=-=`a>Q_Iw&vS6NH7 zyZvLv>o;&w3A!qnWBYGj)f7wLNEBS5Ajm4)uOq=7xpyl1F)sydqdf8F6#txZ|6TH2b6I!kMMmZLP_vsyE zW)X+WPDF04TX=)q8FOlxw~Ma`9Gp|>b8Ju;*I)1xo^UhvDg-_F`}PCBZ5D7df|kkp ziwrzIt;+1)qCyZbKTJ7PPE@`3Ov7)bEh>{`^6oyN4zoXX41HR6jRZWRNcVo9G0SzL z%_LgS60WSyrO?@16H!D^wCQP%Ut<91vj?Au|SrWB%Jh7%oxt=QP|o<}H-#FXr{|Td!@~ zfysk6W~{-{ny3D)Z*lAXYh>IzgY{cR_VM3PwUejugt7+V$XAxw9v4Q%UzTgc<9@hb>i*!v2Xtv zWli2t_D`+?6Td%p;~>1S|2CQBmClp_uqcZZ*zBn-;0kFx*-q`@FTgS}5#*}9#9%ai z6P1Nj7gGrO(dKw#3RkzSl?EfxQ`@zxW#b_polK~}I{MojwBa}{4hb}$1bMM*+oG56 z;LKd}hIw0vHsHRu+vjq|aXh+T(3L1toeuFz&`xHfblb*MtCf;%sj$oPb~bzs!}Gqv+z7#+#d+Hy*?+WDM?7m2msVh##KA ziSx{0j=z|u!($-0z`7BNxBfI^08#lTff{Yw_yd3M*d!LD*UdcR)8UUl7R5mv!ywfM z@V!y3%b8LT_I_&?g_j>%*aIR9mZ+$WljD$a^|19+-{DnRR^TpVF}%rb{fMErJ+g_y zhfAq;4z=gly-?tg*ndxT;$Fu9`_y6^#iBS@@aIaFW$;&NMo-|m8R;(}or<|(_0J8p zaTHfDD5Sk}dP*bZ4!~lO{td4$s5NhYT)k+o!H-GXWNs@HH>|P~%j-ID-|+c|vGi-J z^w#c0hoXdV;F4QOWJ{j6U0xrfx+bGxj zQWs0)_%f*2=QT1@O!C`wY~Tmes#PQ@YoT!u{el?Y+c_G`uByrksCjU-*C}Ep+|6F9 zbqbp=ozxNPzU}FYWSu0Na1LwEGtU8+nE%Nd`PhFN5>-_6YolQ$;;hc1%K7@)4RBKub?{9 
zAtTU1-Qi{MfWq_K5&W;hT8dASbejCvd}w*jq5B-;8_N9`(+c>H{gJ-wK7q1_GH*5u zpV)bw*vh|U>_sOmg(#Sb;D>_Ma#<_wd{){h6gxMrP4^PM5N9*t((f~3r;t@eGyXiL z(m!G0mbN!t{)XuG-Ic~K7Lp;GeaF_ zsJOM1_FPPK<%(OdWwfRpB*$r@DcAk%EpC!!#V95(U?e3k3@27sPgQh3-8x1|6Eiz! zWdJxP^pBYP?$Y(tPb*l|(Ao5RNJ>#sOG%4G#L3doMqU-sTb!I-#4HS*fSh1C!yjQN z9F7S_WDqxXHg>XfaItp+K@bh63<9gT8o8W$i$xRwh)@MXTfpb)zwv(0EGl~HWR;DJ z3l3s~0d7^H9KiNBgoA?%;b0YkUf$5f$@0_@D?0}ugb-4i<7Y?@6O15C3GfdK_+35q z7K;g%umjwvBD}}`;*dkWdr1Zz^)*$8{kG37|oslxxiUiz-=s8!_vgX z0^kH<;Q#7@_OxNnj+*?_>c2GIDX-6KC!h{ChUU%ytNy0NM4XKgesy_)_5&lZBmS^K zfs?#|<6sBrQ*ME{AzYYX%^!c@hz$uDz^AaKv51|yjVVAlSjENERvi(LH}p8I5x@$d z-vSjpI~jZ?*mOYLfZqOpcrgF3IH24-AjAWM!$EMMM*t5d4-YSh3)qhY&MkvrFxdA8 zfWdh#Th{YdvSACLzT<^>jzPGJcA0KOkYU?Sd}fW?Wxg{a^ek%%(R z!iY3K!w4FH2N92WBg+2~_p^ehco6Yt`G7Ej##5XKyubsT>I6Z#zP}L+Kr|Wx58@5r zJ;jB<1IUkm2>A&>)a2(ML>5Gq&e|P87UDr903J-(57eiqe&+g41niVCr-cLMAkZR+ zMkG3`@Yx$6_9s!?!1^l;kO`-80P%?Q2pAj=1#u$i<>Wmp{W~r|a=;OJzEgyFW5NLW zg2>MUFyib%)Z%9&AsXtJ=ahN`7k>o;;nNmG5PkNX_68u`X^#QmzrIg7jesHIe|di3 z#6%EqR^i{@h@o)Wf&kIK#+{XW_Wlv~vp4*j7QvWv%Q$Of;Qi010KYC0h=3r7nLQwE zMM1zy!D&bg7?$5dl7O=L9+CqAa`ZG-9_R$$Lkb}1sT9QoBsTPiUQ-4E8tOEi3eXR~ zhtvTceh(Rfpg+nm0zrS2VGKGw>;bqG|E?ckKguydsGd_yra;5~2qAO{2x!?;_^GOa z{V2{71pA@fY=A!WJ%KF<_Me2T(HR5` z@Y5s+eFiMteh;51#M6H+AkH5Je%FD3b_C#-HZ}x75QxDd z(El|VpfLYQ2K!%=0Z!flB@CoI4G0SULhmmY)IXK@rQ7`jL@@3L<^KQ?Tsf=GcU6pV zNe?LXo0cXBn*zV@DY!&x|jeH$-&0n#m3SIm0Jc>Nd7csQAwVFHqj3lk8)KYoBo1P8jY z8R#brVW%N}AiEzh7?ca~tj0>;?e@_PiMC@-cVEy5D z7(feRup@pyX@T(aAY{$6vq8W8v#WCmicdG04HF6 z?^jSx_@8`%b0Bobzv+WRc>c%>2YT-x={ONm^0&O)f6~mw0jR~_(s9AK{)E911OMOn z0PXunyKy1Z?QiJ-6XOp)bMyQ$jsQgre8%)!ULIhi{{e#o9|8QHju)XT5x<{(5|9#q z(!vYl`ZFCDF!lO9FAqX%{tF+#Wne-6C(Omk(9*`#2@^5JQ?c|iJ)Mjx+uI|=<*Cw< iu`>fy4N#-+s>a#H(8=XgJi(wmFbF0+y@aA9=Kla7vv-*Q literal 0 HcmV?d00001 diff --git a/audit/dataset_statistics_plots/score_range_by_eval.pdf b/audit/dataset_statistics_plots/score_range_by_eval.pdf new file mode 100644 index 
0000000000000000000000000000000000000000..d668868c26ed88bb78d8a1a364055374c837f3fb GIT binary patch literal 21724 zcmb_^1z1!~)V~M_(hU;2bO`J&%hDm;A&r1iOLs|kmz0!%fGDL1NJ=Om-6E3GN`s05 zO8xJmzOS+0_xgO#AD`>aJ$Gi#nKLuz{ATXl!>leX%LV1;#bJIp4Jv(#0|h}qj;1y^ zA|fELmX{L(1eP*!GqH2D1cB8}ED^3C9w37TNK6a|;b4x;DDYDOSw{yq5bQJqSWnH= z24Us~;{SFl1N`F0P%h!sGGRCAzU0laO5oxSkubH+}gns#P|KFi=&w)!VP2q zu$7htL_v7Efxrs(fC!S`f2F?vDu9fB5(oNc1CVkfrFTI9(tQ&jtbuTKbaycW+JlrI z;ID!(w>FV<^aMCUfPega{M=BeAcT(x4&sCGaSQSYK!KzYpbnsoKt-+~!EcgEI5;>0 zcTi-hpY?)#{6mq-2nS0yD-iTMb6IOUpy?p6tR0{aX@r@hIRe=NS2q`gi9L>I)*AyA z*C_##rXAh6bU}=cy$DBVi-pMz3k%Z*4+2e14jBs)1Uq|$T_Mji|4VF97|iK4Y%vet zD6%6&gW|TZa)(z~RQ(6G3zH8Q4r3de`&8E)lMx-=l)hrbvdwcl*-kF;gN3p4Q=Csa zGzLUrBsXIAOHIPP=j(=^>+h^Da~`B;HwLFOUlbnXK~QNchu^4vkRx0g;`FxYWmymf z{wt`>YbQ>I;NG(4L@lgV!5-c-c}2|pf?q=Vy1OqES1l}YiQT|1q~?RIPs(J{m z5{)djyKCc|Wb4n}yML%5)SUBpc!Or#Vu`s;4JXx`S@A-WW}foCEl=-2S!FVRmJ|AB znOLfnvwQZD!gWZGKSMxbY3&49cjBOOaJkmdRiuA3^)1x9{4Mnp8Qc1_xA|DTj~>Xb z*!MDia_WuCpDt?R)a|UDmRAVlHAwATX-TDNDX#Vg_o@)4`sDUF7+lPYFmMpPJ+OhO z7jAybz9;O{b>#76T5$tsX3Ip-(Yr2u(CL+RQTv6@_x%%T$5KA7FIez#KL}VJGZ)kk zOuzHtP}AR6kui$O)z*XU(NI~ei`o~P&hn%+d+JQLb1VZhjmKkZOnB&xn`@W6B%*zN zvuAzvVjatfi4T2BC3Y85eD?*|t)NYVy(hB0s~fh7w>o;KhCO-e8!o8%2lhYY=W8mI z?Y8H;Ty=058gQ9ep^I0J*|j^|kI50%;#9w`nNCe7m-eH*(pF2wxgmdh|gF_E$`<)#0Hk(_9w zn>uYq7m~${*TtCLZhcsF_5U((^yJ3+e9f1#gW~ytnXHK(@5@EvmAEBGUm(X=$C=EB zYMA0}l{>~C&U3E>m`|7#7CMYw9d%lcVr5s|xX&iRwQf7iWA!vGG)WeRmGu6r2X zupNX`_~Le@h6qLG=2mdp;;k>6xBM;BJc^4-!W&%X=et3&w4x76EMISw)nyY<=i{tv z3~S@xR3(KrPcUKdMHNcxop|o_9@VYfm!vgEBN%pHv2RE z4hlQbmTkX*mDzLwYux+8j8*n5X|-d1r0n(_+@z+xH3dto!c*W+1MG4Ui1)4nCg;W}yb(w+L2JGnEnxmm; zx9ynT_r8}ra620y&%9V5;BlcduClwp^F+}mw$yZhrLzBJk$Fvh05iG&*uZx3DxFIm z&8n^v^N4bn)>ro{%y;Tlh6)=~i(kD<5aN3op6cEEsJggnK0(oe`EB}$@)s=)2YKe3 zAB`jncZ{SszATB11S*P&cESph)CL~m6)!cf;u0q@@*Y;I##MFux4P%~-DsX&J~yhf zqc~&+-rK$dF%5DIOU`qZw22M<-~gwv`occI45v<#m5SA9$Z~fn;zY|#uyc}KFJ~T` zNbHWyYBXC~mKm*lXcy)h+is3sKjZc!E%>TM73s9V4%8V5yUioG&hKBU 
z?pJCQW$QOoL^z@!L7N8BY6ZfQSFR5syud3A)3`Q4oa(92Ew-gbI zBL2dRO*wYAd#!ywur+#I6MOX9GN{Hz%Tv4Ps&3CQZmZ(>Bog&~$0^_O-zlg;0v#S6 z2>b^qJVirDq5;V0AKK4I@dqM;!T}icA8-gB+l!Abfi1PI8KyKhs6Hrrl|bQS3}U4# zzaPXU$7ID~$|(4L!F^yh>6Mpn+Jjj)$yaMF_fpT#+bvAM;KYvM5yMq zDiR2`80D5=+mK68Ene%v0Y_-CJ_Jq8nQw~36&@X7O^!45p=cxuGzWw8|F`U-NfJp* z5NwQXe@r!^v-GQ?Pb#1hGbcCrV{%qwT0rwXt=d!Jn{!ulYMM4~i60%SNra7&3I;qe z$VRx>(kczSQaSK`%~tAX-(GcX>a|O0{V01$TJ+QE8K@*3!c60WsP!XrmtxYXm#bf^ z<{Vsj;!9!0ujV%@=N?*wuB$?=b4Ee1ZQ=Yxf{g7d$$M>;TfBQZO}Cs8b*3|<(%mc9@6jk%C*LRr_dz=0Dr5PPxxPDfCpydp*7 zdI`F^LkZTM3Ik`Ayql?Ocw{4q6>?Nj^R0uQI#jiEXQO<)@zJD$$hPR7U3*+PCiex5 z$ru!P3nTnZ)H%AVb=f0lMj#Ah5TWx$w~iQRKSvmL@@}T>n{ntf3nOrs>1GdBIg4U# zNmE{w)%#Ej-<=fy$Sk@@G{pjuGM;?)eE;48I>n`Oi+%s&V=R)GSn5A}iwB9tf5YB9 zf`2j6m^BpQF`(ED%jCz;rF>KZRm|Auyyg0YNH!_P&b&W~ z%psjCg_bNqMzbzxy|k#3U7QWl#lk75SwLzI>C2{4uy=9d2f|_rWuV$#%4*03M3_9~R z(b{0?hz6ys@{&G&L6mF6V0Imkb5*)OCho%Q$((+``e(E)9m^dQ`u=&%^?hawK>vGk zsPtD52STthhjQ4d(9TL$-dfgYI;fQ#(4;;_k9K?~zrq+^k^5x6yL|6vL`FFCaofH+}hQlQ?UY1$}+snwYzIIpbf1)2;=1TkjOqzLmwReOQTDe3#F#FLM=SFd?$%k}JQi7*++C|y@` zIp#S(fKKaO<|lgbz`I7uuJ_vb zXD#_Squ~m6KBq;HU|ob^etFB5iE{bJ1PLYU>+il=O?XO&Fl&F(7EG-kWnCz%Z5?>% z$J0R)-OV}VvNn*Foh)He$^ORa3rU!$?zDYd%j4oSvxlxMA(pvpH*KIzcFvDtWK#tD z_{UPFv5qMF`^}$Ba#(N4nBlc8xd(H7j8AIM9=?Nr$=lY%{z2f|0f8Bj`n~N%AMt>@ z+n3x}{F+b<806-}Z`}ij{CDrbcxu&w0mFNaB%Fc*)qp_<;p-%}-EO&k?o2KiuN?(v zo~ds;HGOy<(o0!g*G++(bhj^i?2t@!EJ>Hs;gP zd5SrP=@+j;EYu1e%*OVw`Ern+X!S{2QCEwk6vzAPMBR3paE;s9ov!-&c=zqiwcV*A z+S0A)tgc;JI!4-tuQ5gRCnqlPcRzeVCpi{6g#*9M3FMy7Z%r5Y3*V{|SAqW=@U1=% zj4?v`r_g1t;`p5K1xBWXi+g$ocE?%nouyJkyCFGoH-XBITzl=?X)l1I>GVUWY_(tJ`Ek6jR~Z=j-qFfJ6OMo2N(SBj)B49nhzvJV?JH1`%yl+I zYc@Rby;v^o+tBX%9oz#R_Si`A-Fi|RNA}d3#J4X7-etTAnAx;>OxUBy!iqw66wryE z@4wXtk8MEqE+(>f&HO)#>7=yIJd0~7n5T>IoDt@86K-2Nb3rwD`lfUw=8|CnPl8r- z*})}nb)z#&#yx7vui25XamxD{Kw3SNfo3{e$~Z?cB|NRl_@9 zy(@QfD?4iDW@Iicsx;i|o}^pRp61uGv7C~*ehE{-e-uRM7;!z-`>NU7a3iF#FtHEad+9FiUAjUS!ViV 
zy95g{8(vwrhf1!N_sZ^*+?-~Q4YvRf&7Kv6;&5ZYJ=ddf5s7$SU^+UR$zG&_VL6+!kZ-wKb`UnI=g1&b@epEggxq` zBd(%Wx_9CpIrRJvT%?5Z1B{R7EfX_BKazRX;qr{1x09fa%03%n%yFq4{d%;t(w?u$ zjmz(}+TAMJ*hBR^jWi><>=F0lWq6xA6zx6ksF*I-Ek4GTh0U5I_HVtfz9Z;q4kOw0 zijl*F-A;1xG^?7oh$WiF-;%+c(fP1v_WFxdOIZU{wf!bShR{swu+wSYtZw}1%jL&+ z1GtBjO;IQX1r`A|mH&Gph$c2TqXV9Qr}Qjy;C|BQuEncPTjHO&RqtJCy-0aiuDoCR zKHX`<1G}X_28}-XVbkHP5Ncb@TQQp?oqTpry_i`-80NVT8^f{VTUtB}C7Srw zd>7WQ9ToYIUHB^DJt)DpxB2E?AVP?PM4pPgi8*yAsm^6?xbFFLLwl^#i!Sde zzR#mhihE~t_ga2&(z_9_1>Tjuy3OdVNk`5Dm1*X4#vzk=8KjQ8ad5_!p{dnHn+2;d zZj3txTpWt4?VDTt)LDj?t_LaIB-|VedHP&ZzCAl(rZd|qcHpeis*Xb(`YeajoSx;q zv$#4t@U7Zb=dr%$Tr3N(XOd;^T3u!+lkFwXWfH}G#hfA6<1zdEa?3!Bt^T%QC-nNe z$-XD&EhP`tU89);lgpgh=3}Jxb?kyinix6vGW#|(h+!Wz?$VvvY_83}ulDI3Pe#oh z<)C7=EYm@6&R0toruM7^Aylt>g1=rdvUT63cNnxyxw`nuvIRFj%Kt{3j$KC3l!0BY z3hiAXTMC)%eCCW^A3w+=Kig;49(o&VTNQ87%~A&(=j&M{_ZtJq%(}>|)XFVHy&E!I z8yE_oaH~;F_!w(t9-e3b%hNAk(R*|Y8c+-Y6!;Aa`3nFC2e3rj6+Cnwl@{Wh!8Ee| zMUd0{#;mNrK-ifDjEjsU=%wJs1iVJ;9XO4zy}i=~IWgUD#*w87r+OuUQdipPt;p1$y_dO za7*+e*7C7tj1LJ@72~7aH&FVXd;RK3p1ni$VY9i-x6QgX?i6ztjakT=Q94S`+DN^7 zVQWQEKyoc>EtESb(`7;9rfT{$XTQn(9T};S#Ok9sLp9Ugh4|dXNqy=edDT?WD76!_ z6y|lv_-uszbu(w}R%zQhS2^}5?y+qMo z6gZEU_b)6UJV_ym4mm&Fakz(wO^4iaww*kOn@?VYi+A&Jm@v#)`s(fUxDQ-vR}rv? 
zqmPBM6C4KzOCDiQ!Ua1bR1QR(!~IlZCbOw>?8R>l)~5LAji0|ay`1q%jKftkV6gvw z7P*dySaLIknR62x&6E|9w#Vc4SGIfh72Nfmrn9PX_b`^p(w0N|3<~Cx<*)*m#;Xdr zzIc7fXZ*NwIJq-<;e;{bj%$$YjB2Ssbf3xn+7)$8wtOy|E0vDnoRVnGl@(l#fhKux zvin7r{g@N0W(f<=-isyu=KBWeQ>!b)5+CbxrL;qfkk0@}sJ zn(^sa-ZAXlt^0gUx1aC22P$H$**|ezD#o#OpKtw|lN!L!rs9Spq3>v-F1_qSroub3 zX9*u~oaP7@wV5HUzY>@bDM&MlF>C)d?<@Z1?8%vq^+J9WBLD@I=7IkOFi=TUa(Hp& zyM>Ml;9qtoL(ESH+&Mca|D>GlD(`zIX%fwIeo^Z3L#YLh`U^B)o?%9;ZVh!bF>OeB z)v&jdWfj!0$0}TAx!(J!TMtx{nEiq%XmMf5y$gTV#ARc?DEP@dmG@nS+E^PZ#~~4`b@5w!T6A#RiAiqBVZ)(FqIHiAX)jOU z)c`}04vXFG!TbtF{RwA5?lERVHkM{r_3bq4W@{zJ{EhX4olBkfYYaqNWZDZni^uv_ z6u9Xhd9^d1Q&r)JUbffLlGSCgB%Wz$?TJ}^q16drcKq~)PP=-++oU@7k)uatLqbEy z<~1ua`Rgj=1+T$G-JhR|ZEUJfRN{ zM1P9Hi)tyt_ZJ&NaCInhuX23{N6bm%`IV%-4Qu!Ijgu*uw84Z_hp{jN0>{u_tG%}O zWy%dYvA7*Ivv3h~FKHuS5B1u^jxk7_#ZwCMH6rv*p<2%_op})zE&f8^(Q)@d3*Hy= z=@>%oD$kn~qzXH!le_+magQF!(_NuW9F7;|SakPX=u+C2z2nY9RY=#}g`#(WQ_;yHO7alvTe2BK~Alis*M`&Z51EG1c zVs?|f+iK&)lk?@7z8*($pY5C(-64AHb?Rr-(rIU@U3ILz5{EZ=?srV3#@sOAjc%u; z#%l7r*GRQ~MCf31!yJV=Q7t|GX0;Ai@3zN);vF>L@DW`pyyeZJw^XruLZdiiEJ9?W zRM6FTJ_zhv%m7d1-hWN}V5t7Rum*ll*MA^k^saLD-?dz6@{Fromm4N*ZgY zc&3G}{sWO0SM#4~@az1%+NBib~fkh zco7TTJMa;<5Pq~pQzLd2I)k_wsqSK}tEweCH@f{S3~)HhNV2c!XTN>rPEmyYE(lg@ z^rm?bUzy{~*6#Vw7sOu~d%I9MjmzNa@ryJa__?|cP_k&grPC_E6K(&cz|4OOABMfQ zKIt~&pEQ3_$eCyO!!y(3XUAtcCXI4X=n}<>g%|o4J0GGN21q=8a97hQNO3+p%d)gY z+~;I=vd`{*e=Ek>8u};Z3PrxfiX{r`iH=QG2ELA7H>e}Qy zNG{Ilb*-O~uZ)@*W7&S@r%JE>gWQTy&aS_zuCTIAB{HJLrn5k1a!Z{pH_OP z7PKi^K7equBM1eizQAE|=V8-_uiuL1%Yph(wS_m$9k1T}&s|j(iz{eCFuO(e!TkLk zG6I-JmU%K&%)1nl9Mek~d5;)z=1eE~q_5zY?&Qfo6%WW9xgU$7uTiX>pnS0ZUY)9_ zsZ<67E@h8Q@T|IjkCz)QrT59BRo&$;6xpHYNXoAf@y@=Astjw0*M2xlBZBdGIDl$W z8b@5iU%p2Y*F{JuF|Q7vT&_}LB3m@CjIH=l0aPW=UbjT0{yiV(!r{XvwX7xqwfn0J z$<8$?oW=?P;Pl716K3B2FCwSz!kaSmvR-4p6&rSk>}VYN+|oWYTV8o|oStRIFVeY) zuJuqv5k*^3EE0Hl|6*LrTvc#*0l{lHAV0i4PvQi3iSI6G4;vg(e$S_5x93PVt-}a`#xX65hgFB|#sJ;ih!s z(;{lk?YA#3y)X4naR~L0pqq~xr+rMMEBI_*kA5J(tC6F9^^;KEJps3y!PlNbKl8q^ 
z<#X(J5-bXfe02Np`aUda>7?k5yV4}qsB8z*M{9Zej5tHg!mJX`*J0*j)0&1*to;I+5xwW$TDp*8Z_|dq!t&w)6B2AO{ z(-sDN6jGvCZ@}Px0h2_NgySzl@j?y+!?+=V#c7*6hY|@4^#Kh9O?MPS?c=)CG)MV3 zr>|d{a+4?a%O8%o z+76nd4aKveg4VU|3vcjnMF(E>zhm?$keyGtF65ZrfEoPy2qPiLJj^j?{30K`&Pr!w zx?D|h8>Z2J2;#K4W@+`xh@gB)W6Xmnw`dWK#(D?BK$O5H$?a$3xpGdIkHwOmmc+m1SNr!+p8^$ypb_=FO1d1ma&&dY%|F*S_xgW zs@sb_Oi^SHnGRIXVp-##ylS}fHTQKR+Pd=2jj^v~gJADqab^@PM}g=0|6*MXNx~nn{CLA~r8y(7Sdu#ujihHjSV2+`)J>aS0=$ysE=Mzd`yf13T?RW9O}YY2 z&zkzoHVXv=g*YhCvEX0qp{T^RAb0H1Pcnju12G~tG`0N-4gzqkIX;IFmfQEez<6PW z{tDAgUjHCFDMJ6nC(sX(ok?lg)jeVkZLHn09)G)>*T({ zb(-4>g~?K1Pi!SPRzDesP{3RoPI| zBi-E0msK>C?VYqdgPeLTo>=N+Gi&-ff#}LKT8Ft5W*WtXcYT50nZ@YmCOx!G#|*d@ zN@SjFqy*R7V68f>S~BRvu6&*u84W*RJCFeXr zTcYSZTQBl|e7di<<)W7G^63ePu$Es*%0WhPNv?D5orQWet4t@B_Kqgv`K4X^PFk(@ zgSg!To^rbvP@>dOlkI zdWl}ksi&-9{3+SK9jp%-qVo20|PP zuL*W5wu3z$6@Bd3)#1U&aJ_WZ@l^)>lyYzhSaPv#C47OCE)kwJF`1?>bjLtgGK)rC z!ZCGmZr&6TQi~8NG!yEs*0UX4$wR+5xlm{T1rCD= z{Ke}st|Wy@+V6X0bJ$QtAvIyz0F&8(b*#}+X;7DW7pFc6%QG$JQSpz@e$2kUOO5YE1EsZS{qyv_BY_9fG|irAb#xP0wVjDE4MpWO0;<*L z4au3QdW|{HlU;}wQ=>wdF~Zv835A4&gAtqO-#4#taZqHfM_*#Z5^hF~=Y8z2FOEx@COO~FlC?(f>7kgmf5lbtnc$TTp(JqX8#ecb zB=6xpquw~X$?#Gu0iqn?NG9!35{YNg zxzs3UN*As$VJB6e2XA>t3GQO8Ae05G65L5Xn9A}tly~YpKhs*U2wm0MdQWQj zuGQy-Z;61h^1G*A-ML+P`2)6&4L*l>gpDfgzOMA(o+#ya+^|`Mt-iUQxCPf^bCRSb zQkP8Krgf(*bnMh|0V=x81jGt46=L2Ba$C(2k3v}tBVCV0p!+Sy1ENXwhjRCF>sNZ_ zE)X7%eLX~LiO_w3LTf0XB^3G>69BF%NPsSZw+5Jixv9lHJ2HFdmjIe=Hfy}rz({#W zs88ALyn+uT-fW9fPfeXJ#fos^&*N2J?Rd!f-ocPb|GiBlv&d13_x{cWir2}hdL#E( z$&acy3no3QXfL3d$(98xp8i>&$*it@fE!d#vcfFE&?>p@}GH#Hu+jYMdb*^hD`n2>G%4GD` zdb3fS*;>itNu)Ss-dmwJE2gALC1-PZyY98*@Tl)#ojzL3d{H1Ap!aS`xFzL42w1Z%0AGY6dMhuY~qZJK_R zI@M=#<5>NJ;O4Tk_`CJ-lTOAy*cYyy7RQUioin%Yn)v8>QsJ9=c#1m&ax^_`SIlNJu&zHd8F6{!kRTg-SL1>;lz?i4!EJ$3nz%&9-K}%(_@JK5V`7Hl zlU|s7MJC9C>QOQajvOM18sS!yo^QllY4AWQ36We2z8z^zyvqyCnxFV@d7uq5R}@G9#S6*7n130m|3Rt_paAmm_5S ztLASX;d_uLoh6O7mO;qX#jf<#a%94foxwU6hicOYr zXW3r6xdMK(p`|usQ?`fP*PzKlzZLy8#k>{lq2PjnxhG8f9=z}K<;U6I 
zujw21>nvb@#PDAC3<%2122rrK5yVkHYD+h#$tL50lDzNtSikaSPUdXZ^EB6UKDAjr zd_&Klcnpo;8E^>L8|+;qBlle`*>iQ9fK=q37Zcbl>?$wHY3Riuio%#bQBKf+MmFg^ zFm;T8nP*8)kz`8ONpuR~^bUNKZ4IwPOyG*(z01Vyu4t*b>G*(^U6=raxcdo5{E*T` zJ*lkiWU+M`;=2`JpOOrG+G-zI+iaL;!IK~zqtLn|caMJD?D3#m{!LuZIaTjRUVu*?;pnmdt^*} z8uY)sbe!?#X589}w*y^k{5awb@ypx!;Wb(V8u8}^9|ogwNH|frkb1O!CLH6>eyg2x zql`-Q07auw;AtMdzu4=8U+uMT2i9voIPu0un_kzieoEL1;JTlCg_dLZRskakjrW&m zd$#)hq|tfjL)G#RoKxrzHG?h~epP6(44bdKO21Zz)f#no^pXos(mCym^WDa>VAhqO z`j%)t(Ndq-Hy1?>=a=!qIbfvDWp82xUbt}2zFDNSCP^GX<(DGhPNd0yKcZ_g1JwzblqPxiXz-rcgDN64rUF1m&N zG9PmyK_4Q6>6@e?#k4C)5sbgU1tAF!C``h8rAviCZsS+-xCd?JoHZF}sCEV$_h_hj? zb#olX%DKU`p_AWElb5m+siWYI0&hYgf3b4}kAo5bZ)nhcuFtaul2$H>)?TwuJP7cK zPq}=i_qo8E`N#*aOB0~Z5zv|g_?IbtiX1s8_mJ&n)fb~J+=0=>t8|ylQWMY!jPQ)K zMhL5|pAanHWf>@FeSO|&cX9_^h=5ibg|sNZ8Wc109N->ds)GZFAkHk3bjYu zKOIk@u^54J>|#Y^Jj>rHQwj4ulZYelfs-8z#X69wQwa(4q8?Jkf$z{43%ICb-4If@fWZiUFoSe}-%m@29I6q|fLD2#f zP>=`s*6BY16Fi|e4miUnwOuBOPet_h@tOkB2L03tydXR41+-JGAfln(kRf3<7`3&5r^e@$mlzx`%&1OEJjq8>D!sFtAfn=-_=~ zukw<$ADg5cs%jxY6b{xd;H+}+U2mgN>)d9V?Wpjb-qhS$eqdHT#W|-1*d;D1G&QFSae_ShW~S>|E62&LxY?!|VoX?GHJeHbM_F z2Hv04i{JX%S>4^Yv!6dL@AG~=0jI7+P?v&0*rZP|sxxy<&vmLZlO8)K+-ZT1Utj%; zo4KKKSqq+m{=Ady8~Rw<7NCH;ee&BEi4c~)Cpvm2?G!efLg9jCT2hc3}8FM3E^VxXbwW2+VMuX zID){A4uIHhHyuH6AfE*goC5gY+5@1u0usT25(sNcD>odlgEcavnWLSf0|;mcSONqF zB4S8^kf25eXa>-B5I>*?1rS&f1XcpH2m-4BY5{@OfFKwk0ibpbKx-fYJ{+(P2&@Z2 zei-NOV2*HcHFI=9fCPaZDiaXc6yN~@n*(eCIY3|_{D&0?Yz=4&cx(#-+kwFLAg}`< zHqaCtC=^(O0|L7OEdVM5fjvOrn?Se=VETIj?E}661;Sf^b*;_atN_&^!NbpyGERH) z#|IYw4wIky`m}RUbu7?cI}=M+K(~L>#S*S&$PgevR1XjsSv&HEiwC*jl`?UXKh-aY z9}4J2@B0lLdD=k%2!CR2CgEUdhX4czYq}xqb&v@a6VKBY0XhL%@NYnr)Ak{oV*ug@ zUR3;lxN!cjb?^Y+G$1bkZGhmw7y>RF0Rce}A8^Mc6aWM` z%?ms~E%!a&KjnZj$h07^@pDQ;)(_P8T?icHFE20~kaZy&_(Mn}j~{7do`2Fv8GsA< z9Qljn|NYrN4Lq#}`TR#YAdQsqv`%Ebzy*9X2;u=&+sI#}0+5}CtOxlE)O%VNvL3+T z{42;m1Y}G8xj_~|Ht9#dBgH~q$PB=R1N&a}X;uG}`X&VIR57Q_0Y1oTkwPOg{b=xy 
zUqIM@gyIJ_reT0DJbeZTkIavx!2xT;iZj{edZZSACIacx9z+WL<2oG;K)%xv1JHl|J=HXlhJ60h^}S9UqyRq} z{QEC*DxCHpAoS1Ae(?SA`~BHJqv7YgNX4M!@uQQ0-_vj2kQU+8Yy7Zu5+Gn*f9k>{ zfywwSB?b7TZz&lNU_DQt$^u>gEhPs!{d^QK-wGg}Q!9!C*m%H^o#s;lM&GxTGN8F{ zDHRaUcOI%Bp6@)=K&PKS0kx=uV5dBA0FMB8r_)+AfiC@)(g6YecuLa+49K^X2?+L` zhbb^zzR}D;uu~p5-<=ZdJ0HNgeQyik@xHg^n~ep&$vCao0>t~B*Quui<}*OX`R4q9 z4c5~YHl@KJN!^aR9#toEbTN;0gjx5`IeoKKXQG4bb8@pU(Hag>E3= z@aAbAq%Q|fP@UG{4mw>j1G#_lw7_&Z&FBG4fN!arAYgi&(!4-?-kksN*uDOo7!6m$Lj9NE+#&n2KgX$G{Ri|6?}5qyL)? z_CIEWa08C>-;BKfOGW?y0U06O0s!luyeLvP-^KpdwIbE@2N9{U9}QFi@Z7gv{DxRa-ExOWBsXyMLnQ?s#$V_*nZPyAt~2_h^GVvaPUe!r)%|PRM)Jg>a_p zF3jGP2hg|=NgAwIy$LM)aIFa*4Rf<8FxMN*EFEws+-jk0luSJvQ#3ExAzeiz;#Org zycIsX1&I%tM~fDPWQBCgx_2n@4x{;Ds!II1AVQAbU$ylu?i*Oi+8haE0egxZp{J41 z+yMY}I%K}b0s#)K{99NGh-QWa>TCe2cCvGHv$Hk@dBC^<^$Buwif4LO65e;|kNG^n5;a0cOb z8VuO)`kls$w8hB3pUUz`G6I{uVsNlSAWz8<%c8f`Y$}7e1Zahv`v5?SU~)m58#V5aKDs=@CYD{@UJxF zocn{uhqU3pU&q3aIq=vQP*w($D^u zPXIWA_8SdYcK%7@5%{A%DDNM01`35EUG1+tfF;o%G@yHb*C~_-=%nAv@*tNZztsnW zBC*u3JYc-QX`bI{e30M!7YY+VBEetF^1}WY_fTN${!t&V01^rQS{4rZQ}=LQq;LE! 
zA0Pi8x`+Qc-=J{8KlAZHfsKdX>H{qCAF}ZA{n2m0R?N?RMlb~wi?r!1g2tk5Bv7s8)CgwJdRuG<(r>;(B8W!#l z1AwihG$4wFmpcS1=KzQxesY&MxywO}eiH}wX9GaFLFrvB0O@{;4^_8tbMkOC1KI=1 z5AavAFt;%gck%)_asmH%d3ZUwc=-{$NC*!X4<|1i32=c*0PO=Raf9&xlv2#m(Fynk z155m_6Y%jbB`R7tTDe<8U?+^FZR~+&L!i?3fG#90%$&?EK4$}c&1x}>j@)k@$!kIPpDpBS;f6p zo4%j>Ek~10L+fynX1ukhV$`>><>vNq_Oz{cC-G6#qy*=#)xqtB2$A-6+I^*&jfZi} zw5$}aHLGtGpbe0%jvB@*3kWYfb=UXm3AsEwfj%{}Q z@=-o~VZQMz4UcJ#q}7XFZTuS^2R9Bb`hSzmOK1H-_%h=KA?~AGtVJ6kPNMPj<@2Lf zPP^67-mHl&59diM{0xO(N)GczRnL)g2RmNNtSawFRTH1*eQWmI?bQ__4MT7D^(lG< z4VN#0hbbblc)R{PO$uAz0vg_3nrLv&yI(eOjficUQSe4hf;3ApE5=ohamBj$4V^c5 z1y&(BC7;N?P`}w;q-b3v7VvK=7=&XGnvgBz$BR2y5Ms-_+#QOlJJokK1Qpt^!2`n-WAnv zC!bEq)VJWILEpzHH>Peh8r)jcURNP4!aMl%Bg0>Co@{UY_RU-0a5EZoQ#}T&qNIc3 zZw^+}%DrX7W^|hiQ^v4wzsYj&Zg>BO_3md1vMc(`k)JHq)ep7?swBLyR}NY(6GTsZ z+`GO_t+eWXBPpZic2@>RlGOqhla;>;Qk1;c!;nW`@}*zXJp(?ybQMfy>Q&1xiG*>_ zOON8d4n{sx(Rj}BMj%w3IycP4SCss+|Fh*JnLO&F!{NCh$ebb1bat4!Q_8;D#r?>8 z<4Agy=F&XvQI?#ME97nS515Mzq!QX*m(-P%rz{zCuSk?&Wf!jP1rP1UQsUuhnjFsH z#ng6$tJHnW+EK=X@ut--tf5pMs$6}gPuccjI!n9}|! 
za+5rt2>ADEAF0}0%QjHjO}Zc>?4FDT!6Su_*|yPaHaNQYY7m#5MzU|_S&jOHF|cgErT^MLSl3Iv^<~AhYH`0LV|BXQd=aZsz4dW9 zvtEdyT09PnBGkV|lZ&<4XRfH_E%EhF2R@I#uwolf4=@uD-)-u=ZnZsdfh@O2bH*>& zmpi|~;XR%F0g>4?kpc=EN_DT*M&W{x6un@PyGOc_aI~f3D6?Vo&<9jJGf~RMvgFkI zg*)%>z4 zTZ7upE7U+u7%bj*XnTZrt11oMh|Rm2;5R0)K~Or^@F>dtBc(%I;dxvrzy0e1t*fC^ zxs?6rhK0}b89r=Fdjxl@u7z`H#k*^-eP!~Y?`5!cQoZD@;`!8P3gknjy*-FMyQb@b zrc)Tl#9vXoOqh|aZxYGNf+wM_^f94AEi`9jI6yxk$5!)(i&uvUy>c-{Za(c~{*@{D zs+LCl5ijj&gyt-fzW0SZxx(`M3i&;(Om>u}5R3;*QFv6>U5COniahlnI$!Q)_YQm< zy7n|+pOA;Otg7c}^Fn?Sp_rOp234N|>m|5Qa(1$Yt`q6-?d-jX76n{N7u)`G-sy>y z{kVELF3JzQZcKMJ=N#QjIkyQ=W8?X01}#S(1;^Hw9e8 zRdE<6bogu@I@0S0clSg$yb`c7?CPa9ippVmBFD&(q54UGT<2%YR<_Xkqb3)nm8J~2 z8ei{Y-<0lVW&6@aP*)T7!4?48K60inqU2CJAJjiMrcq2&~}mtcEu?B*={9$CWm?C zXC^p)G7Hmv;XTdvsl4F%{Wqf18i_89L1UNfM5_zW*Q-Y}7`@u2>?aYHL1LN@$bS-V zE}9}?%|N)Wv)e{%e!Hi2Lv4rH;;BH|*sw8WKj!tKErwh6WL6NkS{Mi zT)1?bp*)rjX>nY?bQN!N8&4LASMi9Ct{tc$Rfn8#Yf@iJZ3mp?$a(|;p|ag{U34fO zv9Oelra?4&UYHdppS=&D(;DRlC$KBjl-lI_!p)!AiWg4%$)ZujMD&e3`cSj;YML=L zny<<+?ftn3FH6POw^f00rtoc`@+)8~=6Ov+t8*|uUQ+{{g|sd`t#XR4_}Tq zRj}gbg;cm|jyq}Uc;Drx-2qDc^9l5;uLugnG`w+7dfQWsZ{fHjq)Fr(qisoq7Zf{u ziPZ$=dr2bI&)S4(u@YQBb3xJ_Sr2UgYK%m_Ug?ZJoA^_yQtbLt=QOAH)wK$@VjK@3 z!(Hbu4G?T_+?^F1$EskP@w@6PG0hkp%~G}Oo<+r@_HLF=D0y0;9{QCWvy~_R{+Fk2 z8s>wEj?#1LLH6#1yV4uxId9Q=DN?G0e%e*8z!UVd<6*$aL|}dy3D9nS#(n<2<3@gE zN$-o;*A~~yY`#u$&}kKUw;U__zGX$T!l*Tn{TY2=bfyrTV=7~y-rJFx+M+mPtf$f2 zePKf{)pvCJ`MSg8Xk)dX)BeO(!`cz==mX+3mZdMon-F-${`WgHw=-wFHZGDFk9|(L zOy6w0GQz{)KCv^3&yrXr$8*ru_D!SfJf50sj=`OCnEbZ|{hc@N-`S}2Vsl+^YV>vO zDJBi=H&b#NK03o8zbgdd+!J(mf_RUi90>2=+(_6j7<`NYL81Y~=pP!0eNW>+7!m+~ z{{aG#QExAyi(yHuYXmDyb*pwutKrHW4RTrQ$nFGk$S_zlnbPxrneli(8Q%o&r zck#aErtG8()AloCyjsyx&qNg7&{`N-KMz&TY?jC6Z!&sZh-FJ6Mn0?Gi46@^XMPG9 zoibk&jLtheJU=qT(1oIrzvKE742wYW{hpRU?1GWHmi&dc?WtxWGV3guVn zlrHSMNajLsMP5!bA*<<}VZ#o8A9bZQr7bW{R-QC&r4U`!vG9CKxq*vP=FOyK9OB-% zavAdQ>E`ZlZOWQDli|KTm(U~vi9b`n)Gw_Vl-Y%1(gz0I!w4AXT2BXj{{wNcd zgz?DBS3B7==%h4dmOHn9{5Ve-8AXAjw@}PD;QW7KX^iS}F&Hqcx&@LSuVhV_*9+9_ 
zaC?tlK9{4B$A`3e7HXCgP>RsK;YsAVZ6U?Wjj@iD>Tf>ku4Z(a%Mn{`K zu=uB%NfViW`zXa4`gxRJ<5PlKvTF5vKV{c$7w!fs+w{pAw*P!@;9^o`EyGS2W{`lE+N7?rMxX=sjU+9Q%%kAP^-iVkUYK)k z%7L&W%p@RhKa5d@TbNp(iAQovn_i|&7PlZ^kEI&=Mpc4S!hL25iyVnUB5>k+N>vDM z_`7{_3p6xa4zV|8_5b`0@~qmgQT3d4+<-vR66Wt ztK^3}-1i^cl=l`%__4mE@1-TvxuYq_U?8P%UB&eW{K9*5DxVTRp)317&m|1I^v5o& zysF73sbS0uO!h(|btbuu1%Rv5`P{XXUrq3pd zWgD?#3O3j0_pFD#B=0b4ebeGks_JK+DXDIL|I`oOMi|k-{=s$meQH{Qm`MfexbrSy zu#nD}LrYU>!2`3WZcKNq9<$uEg*Di_6h}%Y@^|qLCXSsyBzyPH{MiVb&1WeyoR&F{ zAdZcg_|~+ZluI-|b|wx_0zSRxn-F}ly*}$J;-9fjEJ& z2My*7p(T^_X@j}^D21+EF~G;QMp3CNP8VPAApw38nkKF^C1r|)l4zNztMRC74ZP;# znZQs=c@#o{Yec7nLUR4L$spj>s(?`3=LkbcNlyy}S}{80l6G9n{igfp&OC4CCedZ`Mdvv6$CfCyo?`FFx7ZV~sa{bYrmcnmPl~Y(N^4naO@uimt0H*~@JR zEhT1B>|E5v2Zb1JLFyXvZD;eQsT5_x^P71dmQPcWx=d=6nW)P`_V^yp>JPP~Vb&H+ zlTOi%z3#hXsgmbtHn^jIi;d_=vrF8XqDnBaAm+Aq_7zz`>=rQ%y(m|98s~NTddrDi`lL z!Y9%prSB)|Rj3jO?B7bIumdxoBt7)$r+2X#f8eFbOG_6QIMeR)m73 zMFX1BnO}^NWgLzgx69qRixFmWPHc}&C=N$(*z@Rm+1)zUO)g5b`+8kcO_0=cwy(5A zVSg_ zqmUg1VB+QZZ}lOg>Mp$kLW1uTV+^y~8^YR&%@Z%9n{uY9L%k-h@wi`WnLBe?IcV&r zWEke0VGcZ2Gooal23l3`!jzm%K^ELGj|ml|iH;%jLm?9?SnyZYtXAhB3By^LBsPQu ze6JGf#th{ietD@t0$buPdk;gyc9Zy(TgfIe#`~?+LaUP353tRhQiGQ<-%9Q`t$g+P zP(5|?ow8xdo=?TrT4TJLwpOE3*J&{2Zudj*oWc$ml5xjg zyPK?;Ud@scs<$EZpzDJgKonZN;yL>ViGD2c-N%(_0C*1O_MWpAZ-2yc$DMuj*8neH=@Y-1Ewr(#qR5)UB1o84fM z8;x)FZe_YtDW@fs(E36^5Zfw>sU^L^n2VTVFpn*WYR;6-Q(DJh(k6wbYsZ|m=!=tj z$fQ3>hUL>g)r&xZ0^nSK5p9K3`Qux-a8|x!3yaO5$*;>XJqzAA@)Yx73cmD^>UtSzgmE}n|-^(09xaJ)xgNe8w@9Je%Ic*t*KX&P}6y}8X*KMRiwPRJK`kmN8%_{`;F*HRsPF-#L}DMVqgX zWyqAhlg!0C`X*Z%pW*Rtn+6^4^IH`^&BPe_agEsWoTb7=q-RwxK zP;RCz2ojEKCx5KB_-5aJE`UzGOSZ?fC-n}69p=5rHNtit`&w^CraN@goCozGSTRja zo`zx#Jj=IcR`d_^eTgsciTQMkv23r6X9rjauo23VlQb|UZN}HQPW9BhdS!fF!1W3Q z+nZ1B+clxDdYdfnovQ4ZRgpzZ&Q!vL%OAgBb-TBnf4KZ(Bkub)4T+G=H53{~0j#+H z2KRC#C^!OBGRbW}Y)<@+TMijkSuBxrKSl)~K7H(9s7d19pTroqk;#_JIvR6pxg~rz zeo{v0P0T?ob-6{BizIJ(&_U-)kDpX=jYp5v{Rd-H&C74}W-c0%-CO?RbD+zd$+<6k zx6|ff{?%>yFPRka(ep+b`dJ0>^S$0P+>2c`YZ0GEoY?o3#u(2T-xJttpXZF(PR 
zr?+m{4!b@-()H|umH2_GTLfc3LWv8@bfmoGK4C*s$jn!GX zD&OYe$0oycy;CQr4uuJzfU&%P17nXTV_a|mQ6>F@ zsHC4Hi3S>SIau1~$F^WiTdS;fAl(97_!ir0D3Bp$Qf zUd%MSyK|9?L`%3=>57egZqVnMI6Z|weKmtt{jr=#GCya*Ya?cGDm-OThG#WV)<(mL zogBT&1QD!xy)$Xpv*TK2njKa;Zt(%l?l&?R!-PDAl}u_;XwmR3IfWk0DyBJ68n*EY zn-N1eTw{%GP6=PXMrYM=>-}4)$i);j0czi^?>Af9cdp!cIqf3x5QQ#KU|ks3Uvv#J zy8Kd$C=R->QWHUX_XFa0vk>R$)k*2we8FdCFs{%Oq8CByV{z(jHj$LK92}f4%Lv;f zy{b)k)I-L7UN%(He42}cM>ap>TR@iQCoVjpM@QOrwFt+r>Gi9X-FQ2Ll!qvUMS+UB zx&LCd0U0kBPYrq!4{Xj41g0O{)7PE7dK!-gpaol5*bErvOkFSRbvgH4tCuVgp@$oJ z(!*@~`*WVb&qDayLY4LfT|)enB1h85(;YJDAV9$QH66J~SM@b7+?n@XZB zD4ftpYUa|wLOE)UujN_V+Gn@zP|o?H-E>kpIvZnw_`$-RE`yxu1ex;zb3>Im9J}7T zS@au=2P2y!myhT}Q``cjCzOl$BDzd+s~1%@Sh6^5MJt>_*u~KrE6O?Q157f<)7}X# z_%X&+PU7XA&5j_rE5ji-KdYOWVKqWU=#QYGjS779b)&B`t6w9grbB_dFH+L+-M3Aw z7V{j;h1y$0{&w+FE^xkB!6BOC3kEwJtq@Y$pv3xCGLrBE96l9 z7+Z*t?F7*a(SX=6e#(A~Nr%16y-RD8M`zkr@_12L0E)Q<9QhYOQz=ft@wMnlED87L zU2q|`m>%;_IomD!tc*pC`-`(Ap~gACa8=n4NjXk>GnBh8F+-O=e`sr9Se5X8&e}?x zn)94BO71Gt^|#+TbRmUtX|M4EXJ_U--dvhAab2Cx4|+CD?vp{cAgUGT5KM$)fwZI8 zmwb1-SDr`Ue%#~E1{NzE3at4M+{a0Z^+~qT@prllT$;QZYSzDB#LVIzGbYaFR1Fid zkXTu=UG>D78lL^OgXq^Z(4DfS{GhvxIo=YGH&mFOKOAb?-@c5U!h7aCmJenbmEIaU zH4Z;f;F&iaQ#aj%D_fqC`iwXhrrvZ)yr?U3(a#c0&rcFxOQKafL3`#C+b?q(<5ek8q0j7=8%DU-4X(^!94g3!}@| zX0pHEomEb+H|)aCIml?p!qn)dy8ghX(MEwjYjtIRlcqiQxq(oVRBMh`!C=>-94BqD zcPsrlWhJ(V1qV${X&ojjf{CW)&d8ZRw=ww~y3}J}q$G#8h zw5_oh-@sv&s&768VSjSIoK+?o&u{1~eU5DCOqr_$I(j}G#@jPlLua0HUplkaLyC_= z6cp18IL}`!Diq|RPxw^ z&Zew1L2K3O*2NloQct|@9ZK6CN0E2mxCnVR2@#h|9T&ew*tOxWhP7H~VVr}Yc`;*l z5YB&YVaJiV1s}T}O`6Mc&W!p1J?c8eOA5&cXDQsYZM@@p*5J8qqe+oB47ej&$tcb@ z_+{6VuN>kz+TJinp-xoufxlV(L8^8*V8C$p>#%w7Mf2|YFzL>fFC9_JPZ$g0n<(VG z>ADaIy;VSmjN{xHAb9fO#g}X9mpV(f!=61h4P$&5;f&smIN;ln;tpRk$S|p(v~iAM zn0fPVU+}eBR;~UhJkm0*<<+3-Qp}=fbadi{g#OWC0V94js3h$$H9_j9gBGnc_JJ5d zOPv&CFUuWXv{_RlRwZhK=n09A0!=mL!p$2UewGH<>?MS0qIzkc`aDSUvE~C2)kfou z-Io;E&V1gw@cpt#pRtcCx$}?|jxMiY!@i%J+j}x5jZf5?WhoI3yE$gJKVL#%Ew7BY 
zPuz~5z9QfP@A>-DwBY5BGi@VA=_quG0{O#u5dXatq@Y*_~mB!ZlSLVJH`6N2t^%SF? z4j-Z_#n<6~xuZ+_KI=_ATkFy{fy`_^_nSfbwXpBp<90kw@0|JbgTspNA6(x-#Lpe& zk9#PLobQ)zW7x2fbx4jj#LP=AWFH7N7q$_IOnU-}XO-F}aT0|2+H%kutmeh5u}10{ z$#GrFR&ZTxXVK`tW=ETS)+!1?H&fWf+gkDEpl)#I^18@}=X%yCtOW%u=l`3~9xaCZ z3IZ&b)CV!ws8;^Svk@FRx^a~}o9y8#=Tp7;*B@Z)I)JSxmG-+YG05byl7ok%q{#wix z7|xx2{$Nh7fPx2Wn+IaCbT9ntavDa=kQPJqkF8aTv=4KxpSN*}tygRHUE?j0*rkzYrszdL}i+v+>K9}~1x z*ubG@w~-7vbLD%LG`IZCbSG8Veg z@l1zDD*GS-)^|Q2X?Fq@d?I&+9F{!_KD_6nGhN~##~8A5v7ZaeQFx*Cy;1WU5faYD^!9Zl}wtG>3=%b?>?W(ywRD_fUXbE9TignP^tVAUzJ&hyoN@tD>N9}^j z&t{41<@_|r6TV))#j#m`pAcJX!;^2ihH*ELP%1s>PWhEJV$FgJCf<&H2xZ!?s?=S) zQEy#}VKFjXXZOC(mYKbSs@@n&aZQ6C4_PJ>A~j>*P${p^b~1z$KV{fgK6~Eg(otVs z%HHFFdbAbA%^QPzCEZY;AQ46sEk^;Bk$(XT5Ij8Ez%B-lIv1Q@zn zEN2)RVdr#^oVYHegX&@7&l&a)Ib2K>yiwpw-oKcqA>iO%8W#ecFysl-uN@T8#^lTb z@jtn9mZ2GcdJ>YPDutDVCRvWAYeR8nor#p|PjP@9vp+X<{$h^%OB^&noIkeqPVJ&S z9Kv~kf-@>`_E+YoF6Rm?jMuG@+zP7ESeMI7kk~u26JuNYW_*WK7Qf7>u#)v`b(BO& zN~71iO_e*FnS-XA{`d9VNfeV54aGelnw#BX7D{57mz1TGQLV-iP8w-sez<}wv^a*= zW-fvGfb{ZwSAfr>0`zkuo?51Z1{^bmQqNWsgI?I4UvgfuqSHf&exK;=57}q=A<;Tk zC;sFOUGW(@vefPL0C_dhCZ7!27t!cV;nZHuS9mvScXU6ys>FV*J%Zp>^D0O4Aw)%G_N|4o zI4;Bct+wmg?`-_)7lvH8y;4axAcDgaX z*R)WFdC8l@m(h>}#fT~lc*Rf{y@4`K`U8yTV;)L}llb}@hRa`DQfc(WO^e7~+aCLV zc|NuG*>LsY#X>*P)^~hn?q%NZ?>Oe^^LNO%LOqM~H`=ze;TXwoG-^(L$+V-2L4{E9 z*_OqS8FuP8Wa{w913iHhgKOfcl&WG*NwZVarWSXqEd=w-1Ujm8?Yb8;(XZxWmx{fe zsMNE1B(iJhzw-4glddEO3Jst@Sa8H&OhkZPT40X>r*@xYPt=P2LdRF%kHbs3M=wcJ zzhQ*mJ)={l^tKXu^XjBF|JvyJBT0vSk`6X{naUS=Cb{D+b}o|~f-y|mH1#s3)FE=i z_7X2Lkxlbt&^OHGc+ya1+?0g<(NrHpNmpB+(V0N=GtD`(uqDmUUx*Cnn|)v3D&#X( zoUirnc>E?a>%E;*o$mn-UcFN5EjL5?G1bQsy8)!JmC5^uvx*N=1`Pza9#Cc*js!kd9hAb?Ae+!n@!;)@>03#ddq z2A9)?*_3*Mm3$=M;j||+N)*mL@s+cc+lz%Z2!AN5>*MHFnDr+s7}t{RVyzY3BJOSK ziru%k5;tl4zcUi+NsDg>(tR2Ph%mmb1Sy^5n^St znnhb5jMjUIsGpPy3K_~hWnWW#O`w-Ymp&V9%{<#~N@bvbs37{~ljVlqr}z5zXz&s( z2T&*&1;PW+{eQlz`0Jeu3~+A$;gu*jx9{o(=Jj*O^Lp;Y&MA5HGY_0`?xNs{3Z`)X 
zjVHTTM}W?SCE=^mgavt(;T``r*Lll{ewCQjzGFbhrKT?b+uSD@gV86OwR#ao8wJm2 zVIpLip9IDiO^M0Qgwf;M;a8`WI#V|&qG#S_`L)C6$-RCXGQYh0j91TUwP{0sw;KkAZ zR3}G*oGZN;mK3;V`#8+}&61!ipAWcZyG{?)R5fSxKl2V|uUp$NR-822Wpd+(>R0}? z1s9R|m7$|{`YyyS$7Yk$m22%2_cBa;b-l%kBo*#M)P98UMuP^sIM)AtWAd6LAR zC5o_-vXH5XTI{N3dxSiE7TWPlymK++TxztWjEAOm@U`x-dy686dg7|CcJn_P>nQf=BWZNwM3C zTHinXIMNR-ctU1BOw03ujLm-f;bY2rZNaC`>|RqE>WKP_`>%RLp9|^vHtTgIzpkDA zU|ghIK;f2aql1u?Q1Q6b6(c1%XLE;@&`%T7`Ug{sxa5G+le6aUhA^R?$;02eZ(p7J zkvx7gdU?^uk-9l%$YPw}B<&?hop0hnWA-Tk>^D(H~0a8Fv#{PW8+JORDR$M1>?AGIgSc3C-OZ4iD-`pXPiiYde`BaE@(;v7$}D}Liv z45QVcR|&^imT7VVAp#1udMew`cD;3$g3i3zl4n#BJ{E`-*(P$aTN|X6cYXD`5nG7( zc2L?=?=BX%Si*wyyiWq7a|!q0V+Wtq5&@>=8maDEb3|e5N|j!}Za|l`e(d!{ihPe7 zUza}S#N~XX_04^1gPFKNNc-r1>GuMgdqj&-Lwy14S+wUWo1=oie_S;hN|sfbv~m@y z!_~9DITw`3?`JP<&VGYvzQ*Se_&jmUcZAq4w6H6@dG`*fGY3Yu?w@zQum_=g565 zO2yW5FNdCx(q}iqf#tC zpVPWB-C-;ZWnK(?(G;O8ROA~qenrS|dI2Yd4MF5mG9JnI+Ld#1e3r}xH$mJ-d`0SW zxz~|BA)`fsxh11~?SLs&nqTNzR1;av?&Wu|8&D4#(a`>R2BSG$Y zD$FNdKZ%3Na|>ImMn=BrnPY~p>j6uF6}k;+GPz%ct>w+?uh7)z#a79TJ%txrH6%;~ z<;bHCqe@D4O z62vE*4=-Ar4R))UVl!5J97J_@VmGK=-{q%^W~f zF(*{5?+~}|?;DeN^~~E>eB;w$PX>x&VJ@Mt=ljUrQ9aUh85rjWyQ|8t`0lM$wpzAnEo|f>K^(^NQrxcwUNy5&GwF7*?*Gu`J{6coV+#ii&M?aOSr3D8bemyww)46bdnX%`K%Z|6^&zFbDSNS&^X9~ z`Z>BhW%WhT0u-PS4t!qopU?>z`!*VQHA!NIH_n{II{`GIqYQ5STSUu;YBZy7cXyqUmJFKxC!|&1az~n{$ zSSrIRqutG-193v+lVCh)v^3{HcX4I?$FO`|=IOdKJB0zO9Q}_>>ZEAKy5;Bw33!+j z93nj)$eb%x6iB1yoYzTzXrA9z6sQsgeDVIDs!@vhpDAL9i*KY0r9GeeF`3@M zV-Rqj*5FEUW|OVAJ~s_H~BocZ4q`Im~&kcbjrj z%4=epeHqjEFk$WaXm`5eJ<*MOD-B<*rGsx~dR7n;N;-AwvV1&$ErS25)|+>oS+&@& zd2%jaWhH(-XN~**5nn`Jg|QTl3sWa&cB9`AVam70I7zkKw^(+VgH^&zQB&ci?Q@oo z4$|rG=!vDh_*{5qt#*G#(j!`mM%>$;7$ zKcdA-4{@HM=G9Z(bvHLuENQ}#)0=jd9;c0>`h45TTH$g=v`W&#g~X(V;e_YUr=E2_ z{r&^}k%T3RwGiNYvj2sNpC>jPAHx6@H*vH0nUYbF)>GBHD&}fqVy~nLYTeb%UBcSL z6(|W+G&xDbkw|PPSU}Rk&CJ!t+1<$%0s}p-Is~fWVd{Q-7zD@w2tmpw4#1HL|E~9E zVey}5C~)xbAR!zG;MfE>7w}#U@NNwccz^;}y^@K$tIhE_3!Gel8wHUr*FPyEY$zz0 
zDscP*aIOOA%Yd^Pq#S``62QYD{&y(FFA_FX(#F!#0yx?MJj=lVf&ehGo3n`-aLfc? zUe?~u))tNssI!HujgvV9-0JtSaCL$}og4x2-ERUZcWYM*AZ-bp1_AtU<7ojw@?k^G zoa~((AwUzLVh|{B;DiJOgfmh=1AtaTpt8Wx6A-98a7qNAKL}I_PzVI70-P}cfvQ2E zpuT_;DTfhQkg)YrVywZ1Zoa}g1jM6;B*aZ2-F79 z6Ck2J1nK~R0%8N=IYVGTA3y=w+_0hUKur*+Cj@#EI8Fi>_1-{pfUgaK<0GItHshLv>GnAYp(;<$$9_Y|O+Qt?VrT!J!)N77p5ALdnGIxJ7_M z0Q>&mVI#-w1Dj(2;RTkr|36&V|F3nxf%oXa3ji4qB$5Zbu=)7-Aw0kf5iouj1c5;O zyZ{=KkNam{fcC50Nts_|Pv5bBJ^v>^PzPAg@dfZZ)(prDym;a8_KjnZjV45FT`8lS6^#k>t2!RdWxq+bo)&(~3myjTj zUuiJUKWR_~-~yk6caZ`CV3U6JJ17=-ff;}c8*x(gaaI46`Y8nBSTV=U0X|^0 zpwM8ZUk(0s2Za4cC|+P?8UeibdHf6z9?TEYkifnaHz+T#Uk=iKT}Z&KAi*;KqhTWe zUj&xt0~GM<0$cGD}`L7{(L$JPMkJGK~r{`>t{(;y9e{@Zm@ zCpIX+uLhsKgG1rC2LYjffA)*-ulvcff2`s6yr5!G^7z%s!2S3`CeR}s2a8{xP7DGA zQ6%7r#Ua3o%yCKr7?wX%Qb17pnUaP8e)RaM3}6I5Q?d~Fu@}V#d_4Rl{>ejtNOqi0 z5wOFbDJ4LMKU2yO_z4dc2>gVHD&+X<5FmP~K@cZA)WPs{T#E+Kxj$2y5THkoXS*9sWXKU3Ba?voas1Wq7e9%r$E06lRmhCR@!KWPpSh6(@aEAm-5of6=_GX=r+JQ9 z0+A31-N&mAKw~^0$1`Ox^}jI7bHdvb4C2RC-2{fh&y+U=7$^V@8;G93^3w6sV^{z< zHt@&)!VTD%UnV6DgB(Mx-=Bj_emxh1{SHt5IqeE6AqHfEnwdagV8u`|`2RH<5YbPv z{U!n^{RxZz$ptp#4m7c#>r6bVq!TJB>i{Rovty3-p67-7yl7$Pw2fW5~ipI?adYFIb z;|3SCPSbe)nR>^-al*z2ImI@1Niqpzkxu2Sm6|n8;sAVXn+qoP2-0B z(I0?c`@>FQ+<=k)QI`J?dx0V0f7lBQfRm@k8w|++$)K8a6%_fPV&N#7<6NXgQ9wa*mcxkmJyD97)~W XOEjs1TBrW5VE literal 0 HcmV?d00001 diff --git a/audit/dataset_statistics_summary.md b/audit/dataset_statistics_summary.md new file mode 100644 index 000000000..11ff1811b --- /dev/null +++ b/audit/dataset_statistics_summary.md @@ -0,0 +1,37 @@ +# Dataset Statistics Summary + +This report summarizes the latest Every Eval Ever datastore snapshot represented by `dataset_statistics.json`. In the statistics file, “dataset” is represented by the `benchmark` field, which comes from `evaluation_results[].source_data.dataset_name`. 
That naming is worth keeping in mind when reading the figures: a benchmark is the dataset or leaderboard family that supplied the result rows, while an evaluation name is the finer slice or metric label inside that benchmark. The corpus contains 40,495 result rows across 59 datasets, 178 evaluation names, 794 developers, and 5,299 models. The coverage plot (`dataset_statistics_plots/coverage_counts.pdf`) gives the first scale check: the datastore is broad in model count, but its row-level mass is still concentrated in a smaller number of repeated evaluation families. + +Normalization quality is strong for this snapshot. Of 40,495 result rows, 40,395 (99.8%) can be converted onto the shared zero-to-one scale. The only observed normalization exclusions are 100 out-of-range rows; the counts for missing scores, missing bounds, zero-width bounds, and incompatible score types are all zero. This means the normalized score summaries are a reasonable map of cross-benchmark score distributions. It does not make all metrics semantically identical, but it does put the numeric ranges on a common axis so that difficulty, saturation, and spread are easier to compare. The normalization quality plot (`dataset_statistics_plots/normalization_quality.pdf`) is therefore a guardrail figure: it says whether the rest of the normalized-score visuals are based on most of the corpus or on a narrow filtered subset. + +Coverage is uneven by design. The most-covered normalized summaries are GPQA, IFEval, BBH, MATH Level 5, MMLU-PRO, and MUSR. These heavily represented evaluations dominate aggregate descriptive patterns, so the top-coverage chart (`dataset_statistics_plots/top_evaluation_coverage.pdf`) should be read alongside any mean-score chart. A benchmark with thousands of rows provides a much steadier estimate than a niche evaluation with dozens or hundreds of rows, even if both appear as one row in the summary table.
High row coverage can mean a benchmark has broad model participation, multiple reported submetrics, repeated submissions, or some combination of the three. The plot is intentionally row-count oriented, because the descriptive JSON is primarily row-oriented; it should not be read as a direct measure of benchmark popularity without checking model coverage separately. + +The new model-per-dataset histogram (`dataset_statistics_plots/models_per_dataset_histogram.pdf`) adds that missing model-coverage view. Across datasets, the median number of unique models is 37, and the largest dataset-level model count is 4,557. The highest-coverage datasets by unique model count are GPQA (4,557), IFEval (4,557), BBH (4,496), MATH Level 5 (4,496), MMLU-PRO (4,496), and MUSR (4,496). This distribution is important because a dataset with many models tells us more about the breadth of the ecosystem than a dataset with many rows from a smaller model set. A heavy right tail in this histogram means a few datasets act as common comparison hubs, while many others remain specialized or sparsely covered. That is not necessarily bad; specialized datasets are often where the datastore gets its texture. But it does mean corpus-wide summaries should avoid treating every benchmark as equally well sampled. + +The inference-engine spread plot (`dataset_statistics_plots/inference_engine_spread.pdf`) describes how result rows are distributed across recorded inference engines or serving platforms, depending on which runtime metadata is present in the datastore export. The leading runtime labels are unknown (39,618), ollama (450), openai (150), google (54), anthropic (47), and gemini (33). In this snapshot, 877 rows have a named runtime field and 39,618 rows fall under `unknown`. The `unknown` bucket is expected whenever source records report model identity but not the serving/runtime layer. Runtime spread should therefore be read as an observability diagnostic, not just as a usage ranking.
A large `unknown` bucket says that many results are still useful for model and benchmark analysis, but they cannot support claims about vLLM, Ollama, hosted APIs, or other runtime-specific execution paths. Where runtime names are present, the chart gives a quick view of which execution backends are represented strongly enough for follow-up slicing. + +Mean normalized scores vary sharply across tasks. The lowest means include BFCL leaderboard CSV: bfcl.memory.kv_accuracy, MATH Level 5, WMT 2014, BFCL leaderboard CSV: bfcl.memory.vector_accuracy, while the highest means include BFCL leaderboard CSV: bfcl.overall.latency_mean_s, BFCL leaderboard CSV: bfcl.overall.latency_p95_s, helm_mmlu: Marketing, RewardBench: Chat. These values should not be interpreted as a leaderboard: they summarize all available submitted model results within each benchmark/evaluation pair, not matched model cohorts. They are best used to spot which evaluations are generally difficult, saturated, or mixed across the collected model population. A low mean can indicate a hard benchmark, a benchmark with many older or weaker systems, or a metric whose upper range is rarely reached. A high mean can indicate an easier task, a saturated benchmark, a curated set of strong submissions, or a metric where the lower-performing tail is missing. The summary plots do not decide among those explanations, but they point to where a closer paired analysis would be valuable. + +The variability plots add the most diagnostic texture. High-standard-deviation evaluations such as ARC Prize evaluations leaderboard JSON: v2_Public_Eval, ARC Prize evaluations leaderboard JSON: v2_Semi_Private, ARC Prize evaluations leaderboard JSON: v1_Semi_Private, ARC Prize evaluations leaderboard JSON: v1_Public_Eval indicate tasks where model results span a wide range, often because the benchmark separates weak and strong systems clearly or because the source data combines distinct regimes. 
The range plot (`dataset_statistics_plots/score_range_by_eval.pdf`) highlights the same issue from min-to-max spread, while the mean-versus-standard-deviation scatter (`dataset_statistics_plots/normalized_score_variability.pdf`) separates broad, high-confidence coverage from sparse or volatile summaries. Evaluations with both substantial coverage and high spread are especially useful for model comparison because they appear to discriminate among systems rather than clustering everyone near the same score. Evaluations with low spread can still matter, but they may be better suited for pass/fail checks, regression testing, or detecting severe failures than for fine-grained ranking. + +The PDF figures are meant to be inspected together rather than as standalone claims. The count and quality charts answer whether the data is large and clean enough to trust. The top-coverage and model-per-dataset charts separate result-row volume from unique-model breadth. The engine chart shows whether runtime metadata is available and how concentrated it is. The mean, variability, and range charts then answer where the benchmark landscape is concentrated, sparse, easy, hard, or discriminative. Keeping those questions separate avoids a common mistake: treating a high row count as evidence of broad participation, or treating a normalized mean as a direct model-quality claim. + +## Plot Notes + +`coverage_counts.pdf` is the high-level inventory plot. It compares the major corpus counts on a log-scaled axis: result rows, normalized rows, unique models, unique developers, unique datasets, and unique evaluation names. Its purpose is to make the shape of the datastore visible at a glance. The figure shows that the snapshot is much larger in row count than in benchmark or evaluation count, and that model coverage is broad relative to the number of datasets. Because the axis is logarithmic, the smaller categories remain readable instead of disappearing beside the result-row total. 
+ +`normalization_quality.pdf` explains how many rows can safely enter normalized-score analyses. It separates normalized rows from the different exclusion categories, including out-of-range scores, missing scores, missing bounds, zero-width bounds, and incompatible score types. This plot is a data-quality checkpoint rather than a performance result. When the normalized bar dominates, as it does here, the downstream normalized mean, range, and variability figures are based on almost the entire datastore. Any nonzero exclusion bar points to a specific normalization failure mode that may deserve follow-up. + +`top_evaluation_coverage.pdf` ranks the most-covered benchmark/evaluation pairs by normalized result-row count. It answers a different question from model breadth: which evaluation slices contribute the most rows to the descriptive statistics. The chart is useful for spotting which benchmarks can dominate aggregate impressions and which ones have enough observations to support more stable summaries. A long bar does not necessarily mean the benchmark covers many unique models, because repeated rows, submetrics, or source-specific reporting patterns can also increase row count. + +`models_per_dataset_histogram.pdf` shows the distribution of unique model counts across datasets, where dataset corresponds to the `benchmark` field in the JSON. Instead of listing individual benchmarks, it bins datasets by how many distinct model identifiers appear in them. This makes the coverage imbalance visible: a small number of datasets cover thousands of models, while many datasets cover far fewer. The median reference line helps distinguish the ordinary dataset from the high-coverage comparison hubs that dominate broad ecosystem-level coverage. + +`inference_engine_spread.pdf` is a horizontal bar chart of result rows by recorded inference engine or platform. The x-axis uses a log scale so smaller nonzero runtime categories remain visible next to the very large `unknown` bucket. 
The y-axis is ordered with the largest categories at the top, making the plot readable as a ranked metadata-coverage view. This chart should be interpreted primarily as runtime observability: it shows where execution-platform metadata exists and where it is absent, not a definitive ranking of which engines were actually used most often across the ecosystem. + +`normalized_score_mean_by_eval.pdf` ranks the evaluation slices with the lowest mean normalized score. It is a quick way to find benchmarks or metrics where the collected model population tends to score poorly on the zero-to-one scale. This should not be read as a model leaderboard, because the model set is not matched across evaluations. Instead, it is a difficulty and saturation diagnostic: low means may indicate hard tasks, older model coverage, sparse high-performing submissions, or metrics where good performance is rare in the collected data. + +`normalized_score_variability.pdf` plots each evaluation’s mean normalized score against its standard deviation, with point size reflecting normalized row coverage. This figure is designed to identify discriminative evaluations. Points with high coverage and high variability are especially interesting because they have enough rows to be credible and enough spread to separate stronger and weaker systems. Low-variability points may still be useful, but they are less likely to support fine-grained comparisons unless the goal is detecting failures, regressions, or saturation. + +`score_range_by_eval.pdf` ranks evaluation slices by their min-to-max normalized score range. It complements the standard-deviation plot by showing the full observed spread rather than the typical spread around the mean. A wide range can indicate that an evaluation distinguishes sharply between weak and strong models, but it can also reflect outliers, mixed subpopulations, or uneven source coverage. 
This plot is most useful as a triage tool for finding evaluations where a closer row-level or paired-model analysis may reveal meaningful structure. + +Overall, the datastore is large, mostly normalization-ready, and informative for benchmark-level descriptive analysis. The main caveat is comparability: normalized scores put different metrics on a common scale, but they do not control for which models appear in each benchmark. Use these figures as a map of datastore coverage, runtime observability, and score distribution, then rely on paired or coverage-aware analyses for direct model comparisons. The descriptive plots are best thought of as a scouting layer: they reveal where the datastore is rich, where metadata is thin, and where more careful model-by-model analysis is likely to pay off. diff --git a/every_eval_ever/helpers/dataset_statistics.py b/every_eval_ever/helpers/dataset_statistics.py index 8b2f2e0e8..69ac29aaf 100644 --- a/every_eval_ever/helpers/dataset_statistics.py +++ b/every_eval_ever/helpers/dataset_statistics.py @@ -24,6 +24,14 @@ STABILIZATION_WEIGHT = 5.0 BOOTSTRAP_ITERATIONS = 400 RANDOM_SEED = 20260429 +SCORE_GROUP_KEYS = ( + 'benchmark', + 'evaluation_name', + 'metric_id', + 'metric_name', + 'metric_kind', + 'metric_unit', +) def read_data(datastore: str) -> list[str]: @@ -144,8 +152,7 @@ def numeric_summary(values: Iterable[float]) -> dict[str, float | int | None]: def shared_evaluation_key(row: dict[str, Any]) -> str: parts = [ - row.get('benchmark'), - row.get('evaluation_name'), + *(row.get(key) for key in SCORE_GROUP_KEYS), row.get('score_type'), row.get('min_score'), row.get('max_score'), @@ -497,13 +504,13 @@ def descriptive_statistics( 'score_summaries': grouped_summaries( rows, 'score', - ('benchmark', 'evaluation_name'), + SCORE_GROUP_KEYS, summary_limit, ), 'normalized_score_summaries': grouped_summaries( valid_rows, 'normalized_score', - ('benchmark', 'evaluation_name'), + SCORE_GROUP_KEYS, summary_limit, ), } @@ -598,13 +605,29 
@@ def print_report(report: dict[str, Any], descriptive_only: bool) -> None: section('score summaries') print_table( descriptive['score_summaries'], - ['benchmark', 'evaluation_name', 'count', 'mean', 'median', 'stddev'], + [ + 'benchmark', + 'evaluation_name', + 'metric_id', + 'count', + 'mean', + 'median', + 'stddev', + ], ) section('normalized score summaries') print_table( descriptive['normalized_score_summaries'], - ['benchmark', 'evaluation_name', 'count', 'mean', 'median', 'stddev'], + [ + 'benchmark', + 'evaluation_name', + 'metric_id', + 'count', + 'mean', + 'median', + 'stddev', + ], ) if descriptive_only: diff --git a/misc/dataset_statistics_summary_writer.py b/misc/dataset_statistics_summary_writer.py new file mode 100644 index 000000000..ca9e19c51 --- /dev/null +++ b/misc/dataset_statistics_summary_writer.py @@ -0,0 +1,109 @@ +"""Preserved Markdown summary writer removed from plot_dataset_statistics.py.""" + +from __future__ import annotations + +import statistics +from pathlib import Path +from typing import Any + + +def label(row: dict[str, Any]) -> str: + benchmark = str(row['benchmark']) + evaluation = str(row['evaluation_name']) + if benchmark == evaluation: + return benchmark + return f'{benchmark}: {evaluation}' + + +def top_rows( + rows: list[dict[str, Any]], key: str, limit: int +) -> list[dict[str, Any]]: + return sorted(rows, key=lambda row: (-float(row[key]), label(row)))[:limit] + + +def pct(part: int, total: int) -> float: + return 100.0 * part / total if total else 0.0 + + +def write_summary( + stats: dict[str, Any], + rows: list[dict[str, Any]], + plot_paths: dict[str, Path], + output_path: Path, +) -> None: + descriptive = stats['descriptive'] + counts = stats['descriptive']['counts'] + quality = stats['descriptive']['quality'] + valid = stats['observational']['valid_normalized_rows'] + exclusions = stats['observational']['exclusions'] + out_of_range = exclusions.get('out_of_range', 0) + models_per_benchmark = 
descriptive.get('models_per_benchmark', []) + inference_engines = descriptive.get('inference_engines', []) + most_covered = top_rows(rows, 'count', 6) + highest_variance = sorted( + rows, key=lambda row: float(row['stddev'] or 0.0), reverse=True + )[:4] + hardest = sorted(rows, key=lambda row: float(row['mean']))[:4] + easiest = sorted(rows, key=lambda row: float(row['mean']), reverse=True)[:4] + model_counts = [ + int(row['unique_models']) + for row in models_per_benchmark + if int(row['unique_models']) > 0 + ] + median_models = statistics.median(model_counts) if model_counts else 0 + max_models = max(model_counts) if model_counts else 0 + top_model_datasets = models_per_benchmark[:6] + known_engine_rows = sum( + int(row['count']) + for row in inference_engines + if str(row['value']).strip().lower() != 'unknown' + ) + unknown_engine_rows = sum( + int(row['count']) + for row in inference_engines + if str(row['value']).strip().lower() == 'unknown' + ) + top_engines = inference_engines[:6] + + def names(items: list[dict[str, Any]]) -> str: + return ', '.join(label(item) for item in items) + + def benchmark_model_names(items: list[dict[str, Any]]) -> str: + return ', '.join( + f'{item["benchmark"]} ({int(item["unique_models"]):,})' + for item in items + ) + + def engine_names(items: list[dict[str, Any]]) -> str: + return ', '.join( + f'{item["value"]} ({int(item["count"]):,})' for item in items + ) + + relative_plots = { + name: path.relative_to(output_path.parent) + if path.is_relative_to(output_path.parent) + else path + for name, path in plot_paths.items() + } + text = f"""# Dataset Statistics Summary + +This report summarizes the latest Every Eval Ever datastore snapshot represented by `dataset_statistics.json`. In the statistics file, “dataset” is represented by the `benchmark` field, which comes from `evaluation_results[].source_data.dataset_name`. 
That naming is worth keeping in mind when reading the figures: a benchmark is the dataset or leaderboard family that supplied the result rows, while an evaluation name is the finer slice or metric label inside that benchmark. The corpus contains {counts['result_rows']:,} result rows across {counts['unique_benchmarks']:,} datasets, {counts['unique_evaluations']:,} evaluation names, {counts['unique_developers']:,} developers, and {counts['unique_models']:,} models. The coverage plot (`{relative_plots['coverage']}`) gives the first scale check: the datastore is broad in model count, but its row-level mass is still concentrated in a smaller number of repeated evaluation families. + +Normalization quality is strong for this snapshot. Of {quality['total_result_rows']:,} result rows, {valid:,} rows can be converted onto the shared zero-to-one scale, or {pct(valid, quality['total_result_rows']):.1f}% of the dataset. The only observed normalization exclusion is {out_of_range:,} out-of-range rows; missing scores, missing bounds, zero-width bounds, and incompatible score types are all zero. This means the normalized score summaries are a reasonable map of cross-benchmark score distributions. It does not make all metrics semantically identical, but it does put the numeric ranges on a common axis so that difficulty, saturation, and spread are easier to compare. The normalization quality plot (`{relative_plots['quality']}`) is therefore a guardrail figure: it says whether the rest of the normalized-score visuals are based on most of the corpus or on a narrow filtered subset. + +Coverage is uneven by design. The most-covered normalized summaries are {names(most_covered)}. These heavily represented evaluations dominate aggregate descriptive patterns, so the top-coverage chart (`{relative_plots['top_coverage']}`) should be read alongside any mean-score chart. 
A benchmark with thousands of rows provides a much steadier estimate than a niche evaluation with dozens or hundreds of rows, even if both appear as one row in the summary table. High row coverage can mean a benchmark has broad model participation, multiple reported submetrics, repeated submissions, or some combination of the three. The plot is intentionally row-count oriented, because the descriptive JSON is primarily row-oriented; it should not be read as a direct measure of benchmark popularity without checking model coverage separately. + +The new model-per-dataset histogram (`{relative_plots['models_per_dataset']}`) adds that missing model-coverage view. Across datasets, the median number of unique models is {median_models:g}, and the largest dataset-level model count is {max_models:,}. The highest-coverage datasets by unique model count are {benchmark_model_names(top_model_datasets)}. This distribution is important because a dataset with many models tells us more about the breadth of the ecosystem than a dataset with many rows from a smaller model set. A heavy right tail in this histogram means a few datasets act as common comparison hubs, while many others remain specialized or sparsely covered. That is not necessarily bad; specialized datasets are often where the datastore gets its texture. But it does mean corpus-wide summaries should avoid treating every benchmark as equally well sampled. + +The inference-engine spread plot (`{relative_plots['engine_spread']}`) describes how result rows are distributed across recorded running engines or inference platforms, depending on which runtime metadata is present in the datastore export. The leading runtime labels are {engine_names(top_engines)}. In this snapshot, {known_engine_rows:,} rows have a named runtime field and {unknown_engine_rows:,} rows fall under `unknown`. The `unknown` bucket is expected whenever source records report model identity but not the serving/runtime layer. 
Runtime spread should therefore be read as an observability diagnostic, not just as a usage ranking. A large `unknown` bucket says that many results are still useful for model and benchmark analysis, but they cannot support claims about vLLM, Ollama, hosted APIs, or other runtime-specific execution paths. Where runtime names are present, the chart gives a quick view of which execution backends are represented strongly enough for follow-up slicing. + +Mean normalized scores vary sharply across tasks. The lowest means include {names(hardest)}, while the highest means include {names(easiest)}. These values should not be interpreted as a leaderboard: they summarize all available submitted model results within each benchmark/evaluation pair, not matched model cohorts. They are best used to spot which evaluations are generally difficult, saturated, or mixed across the collected model population. A low mean can indicate a hard benchmark, a benchmark with many older or weaker systems, or a metric whose upper range is rarely reached. A high mean can indicate an easier task, a saturated benchmark, a curated set of strong submissions, or a metric where the lower-performing tail is missing. The summary plots do not decide among those explanations, but they point to where a closer paired analysis would be valuable. + +The variability plots add the most diagnostic texture. High-standard-deviation evaluations such as {names(highest_variance)} indicate tasks where model results span a wide range, often because the benchmark separates weak and strong systems clearly or because the source data combines distinct regimes. The range plot (`{relative_plots['range']}`) highlights the same issue from min-to-max spread, while the mean-versus-standard-deviation scatter (`{relative_plots['variability']}`) separates broad, high-confidence coverage from sparse or volatile summaries. 
Evaluations with both substantial coverage and high spread are especially useful for model comparison because they appear to discriminate among systems rather than clustering everyone near the same score. Evaluations with low spread can still matter, but they may be better suited for pass/fail checks, regression testing, or detecting severe failures than for fine-grained ranking. + +The PDF figures are meant to be inspected together rather than as standalone claims. The count and quality charts answer whether the data is large and clean enough to trust. The top-coverage and model-per-dataset charts separate result-row volume from unique-model breadth. The engine chart shows whether runtime metadata is available and how concentrated it is. The mean, variability, and range charts then answer where the benchmark landscape is concentrated, sparse, easy, hard, or discriminative. Keeping those questions separate avoids a common mistake: treating a high row count as evidence of broad participation, or treating a normalized mean as a direct model-quality claim. + +Overall, the datastore is large, mostly normalization-ready, and informative for benchmark-level descriptive analysis. The main caveat is comparability: normalized scores put different metrics on a common scale, but they do not control for which models appear in each benchmark. Use these figures as a map of datastore coverage, runtime observability, and score distribution, then rely on paired or coverage-aware analyses for direct model comparisons. The descriptive plots are best thought of as a scouting layer: they reveal where the datastore is rich, where metadata is thin, and where more careful model-by-model analysis is likely to pay off. 
+""" + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(text, encoding='utf-8') diff --git a/misc/eval_hierarchy.json b/misc/eval_hierarchy.json new file mode 100644 index 000000000..5ad8650ba --- /dev/null +++ b/misc/eval_hierarchy.json @@ -0,0 +1,4187 @@ +{ + "stats": { + "family_count": 20, + "composite_count": 20, + "standalone_benchmark_count": 10, + "single_benchmark_count": 108, + "slice_count": 58, + "metric_count": 208, + "metric_rows_scanned": 41616 + }, + "qa": { + "fallback_metric_count": 2231, + "fallback_metrics": [ + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": 
"appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": 
"browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "la_leaderboard", + "single_benchmark": "La leaderboard", + "metric_name": "Score" + }, + { + "composite_benchmark": "la_leaderboard", + "single_benchmark": "La leaderboard", + "metric_name": "Score" + }, + { + "composite_benchmark": "la_leaderboard", + "single_benchmark": "La leaderboard", + "metric_name": "Score" + }, + { + "composite_benchmark": "la_leaderboard", + "single_benchmark": "La leaderboard", + "metric_name": "Score" + }, + { + "composite_benchmark": "la_leaderboard", + "single_benchmark": "La leaderboard", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Chat", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Chat Hard", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Safety", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Reasoning", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Prior Sets (0.5 weight)", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Chat", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Chat Hard", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Safety", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Reasoning", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Prior Sets (0.5 weight)", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Chat", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Chat Hard", + "metric_name": "Score" + }, 
+ { + "composite_benchmark": "reward-bench", + "single_benchmark": "Safety", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Reasoning", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Prior Sets (0.5 weight)", + "metric_name": "Score" + } + ], + "metric_like_single_benchmark_count": 0, + "metric_like_single_benchmarks": [], + "single_equals_only_metric_count": 0, + "single_equals_only_metric": [] + }, + "families": [ + { + "key": "ace", + "display_name": "Ace", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "other", + "standalone_benchmarks": [], + "composites": [], + "benchmarks": [ + { + "key": "diy", + "display_name": "DIY", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "food", + "display_name": "Food", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "gaming", + "display_name": "Gaming", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "overall", + "display_name": "Overall", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "shopping", + "display_name": "Shopping", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + 
"slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + } + ] + }, + { + "key": "apex", + "display_name": "Apex", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "agentic", + "standalone_benchmarks": [], + "composites": [ + { + "key": "apex-agents", + "display_name": "Apex Agents", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "agentic", + "benchmarks": [ + { + "key": "corporate_law", + "display_name": "Corporate Law", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "pass_at_1", + "display_name": "Pass@1", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "corporate_lawyer", + "display_name": "Corporate Lawyer", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "mean_score", + "display_name": "Mean Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "investment_banking", + "display_name": "Investment Banking", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "pass_at_1", + "display_name": "Pass@1", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "management_consulting", + "display_name": "Management Consulting", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "pass_at_1", + "display_name": "Pass@1", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "overall", + "display_name": "Overall", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "mean_score", + "display_name": "Mean 
Score", + "sources": [ + "evaluation_name_suffix" + ] + }, + { + "key": "pass_at_1", + "display_name": "Pass@1", + "sources": [ + "evaluation_name_suffix" + ] + }, + { + "key": "pass_at_8", + "display_name": "Pass@8", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + } + ] + }, + { + "key": "apex-v1", + "display_name": "Apex V1", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "other", + "benchmarks": [ + { + "key": "big_law", + "display_name": "Big Law", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "consulting", + "display_name": "Consulting", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "investment_banking", + "display_name": "Investment Banking", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "medicine_md", + "display_name": "Medicine (MD)", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "overall", + "display_name": "Overall", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + } + ] + } + ] + }, + { + "key": "appworld", + "display_name": "Appworld", + "has_card": false, + 
"tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "agentic", + "standalone_benchmarks": [], + "composites": [], + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "arc-agi", + "display_name": "Arc Agi", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "other", + "standalone_benchmarks": [], + "composites": [], + "benchmarks": [ + { + "key": "v1_public_eval", + "display_name": "v1_Public_Eval", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "cost_per_task", + "display_name": "Cost per Task", + "sources": [ + "metric_config" + ] + }, + { + "key": "score", + "display_name": "Score", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "v1_semi_private", + "display_name": "v1_Semi_Private", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "cost_per_task", + "display_name": "Cost per Task", + "sources": [ + "metric_config" + ] + }, + { + "key": "score", + "display_name": "Score", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "v2_private_eval", + "display_name": "v2_Private_Eval", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "cost_per_task", + "display_name": "Cost per Task", + "sources": [ + "metric_config" + ] + }, + { + "key": "score", + "display_name": "Score", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "v2_public_eval", + "display_name": "v2_Public_Eval", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "cost_per_task", + "display_name": "Cost per Task", + "sources": [ + "metric_config" + ] + }, + { + "key": "score", + 
"display_name": "Score", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "v2_semi_private", + "display_name": "v2_Semi_Private", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "cost_per_task", + "display_name": "Cost per Task", + "sources": [ + "metric_config" + ] + }, + { + "key": "score", + "display_name": "Score", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "v3_semi_private", + "display_name": "v3_Semi_Private", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "cost", + "display_name": "Cost", + "sources": [ + "metric_config" + ] + }, + { + "key": "score", + "display_name": "Score", + "sources": [ + "metric_config" + ] + } + ] + } + ] + }, + { + "key": "bfcl", + "display_name": "Bfcl", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "other", + "standalone_benchmarks": [], + "composites": [], + "benchmarks": [ + { + "key": "format_sensitivity", + "display_name": "Format sensitivity", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "format_sensitivity_max_delta", + "display_name": "Format Sensitivity Max Delta", + "sources": [ + "metric_config" + ] + }, + { + "key": "format_sensitivity_stddev", + "display_name": "Format Sensitivity Standard Deviation", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "live", + "display_name": "Live", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "live_accuracy", + "display_name": "Live accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "live_multiple_ast_accuracy", + "display_name": "Live multiple AST accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": 
"live_parallel_ast_accuracy", + "display_name": "Live parallel AST accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "live_parallel_multiple_ast_accuracy", + "display_name": "Live parallel multiple AST accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "live_simple_ast_accuracy", + "display_name": "Live simple AST accuracy", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "memory", + "display_name": "Memory", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "kv_accuracy", + "display_name": "Memory KV accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "recursive_summarization_accuracy", + "display_name": "Memory recursive summarization accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "vector_accuracy", + "display_name": "Memory vector accuracy", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "multi_turn", + "display_name": "Multi turn", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "base_accuracy", + "display_name": "Multi-turn base accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "long_context_accuracy", + "display_name": "Multi-turn long-context accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "miss_function_accuracy", + "display_name": "Multi-turn missing function accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "miss_parameter_accuracy", + "display_name": "Multi-turn missing parameter accuracy", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "non_live", + "display_name": "Non live", + "has_card": false, + "tags": { + "domains": [], + "languages": 
[], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "ast_accuracy", + "display_name": "Non-live AST accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "multiple_ast_accuracy", + "display_name": "Non-live multiple AST accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "parallel_ast_accuracy", + "display_name": "Non-live parallel AST accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "parallel_multiple_ast_accuracy", + "display_name": "Non-live parallel multiple AST accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "simple_ast_accuracy", + "display_name": "Non-live simple AST accuracy", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "overall", + "display_name": "Overall", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "latency_p95", + "display_name": "Latency 95th Percentile", + "sources": [ + "metric_config" + ] + }, + { + "key": "latency_mean", + "display_name": "Latency Mean", + "sources": [ + "metric_config" + ] + }, + { + "key": "latency_std", + "display_name": "Latency Standard Deviation", + "sources": [ + "metric_config" + ] + }, + { + "key": "overall_accuracy", + "display_name": "Overall Accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "rank", + "display_name": "Rank", + "sources": [ + "metric_config" + ] + }, + { + "key": "total_cost", + "display_name": "Total Cost", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "relevance", + "display_name": "Relevance", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "irrelevance_detection_accuracy", + "display_name": "Irrelevance detection accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "relevance_detection_accuracy", + "display_name": "Relevance detection accuracy", + "sources": [ + "metric_config" + ] + } + ] + 
}, + { + "key": "web_search", + "display_name": "Web search", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "base_accuracy", + "display_name": "Multi-turn base accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "no_snippet_accuracy", + "display_name": "Web-search no-snippet accuracy", + "sources": [ + "metric_config" + ] + } + ] + } + ] + }, + { + "key": "browsecompplus", + "display_name": "browsecompplus", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "agentic", + "standalone_benchmarks": [], + "composites": [], + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "fibble", + "display_name": "Fibble", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "other", + "standalone_benchmarks": [ + { + "key": "fibble_arena", + "display_name": "Fibble arena", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "average_attempts", + "display_name": "Average Attempts", + "sources": [ + "evaluation_name_suffix" + ] + }, + { + "key": "average_latency_ms", + "display_name": "Average Latency (ms)", + "sources": [ + "evaluation_name_suffix" + ] + }, + { + "key": "win_rate", + "display_name": "Win Rate", + "sources": [ + "evaluation_name_suffix" + ] + } + ], + "category": "other" + }, + { + "key": "fibble1_arena", + "display_name": "Fibble1 arena", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "average_attempts", + "display_name": "Average Attempts", + "sources": [ + "metric_config" + ] + }, + { + "key": "win_rate", + "display_name": 
"Win Rate", + "sources": [ + "metric_config" + ] + } + ], + "category": "other" + }, + { + "key": "fibble2_arena", + "display_name": "Fibble2 arena", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "average_attempts", + "display_name": "Average Attempts", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + }, + { + "key": "average_latency_ms", + "display_name": "Average Latency (ms)", + "sources": [ + "evaluation_name_suffix" + ] + }, + { + "key": "win_rate", + "display_name": "Win Rate", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + } + ], + "category": "other" + }, + { + "key": "fibble3_arena", + "display_name": "Fibble3 arena", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "average_attempts", + "display_name": "Average Attempts", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + }, + { + "key": "average_latency_ms", + "display_name": "Average Latency (ms)", + "sources": [ + "evaluation_name_suffix" + ] + }, + { + "key": "win_rate", + "display_name": "Win Rate", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + } + ], + "category": "other" + }, + { + "key": "fibble4_arena", + "display_name": "Fibble4 arena", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "average_attempts", + "display_name": "Average Attempts", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + }, + { + "key": "average_latency_ms", + "display_name": "Average Latency (ms)", + "sources": [ + "evaluation_name_suffix" + ] + }, + { + "key": "win_rate", + "display_name": "Win Rate", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + } + ], + "category": "other" + }, + { + "key": "fibble5_arena", + "display_name": "Fibble5 arena", + "has_card": false, + 
"tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "average_attempts", + "display_name": "Average Attempts", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + }, + { + "key": "average_latency_ms", + "display_name": "Average Latency (ms)", + "sources": [ + "evaluation_name_suffix" + ] + }, + { + "key": "win_rate", + "display_name": "Win Rate", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + } + ], + "category": "other" + } + ], + "composites": [] + }, + { + "key": "global_mmlu_lite", + "display_name": "Global MMLU Lite", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "reasoning", + "standalone_benchmarks": [], + "composites": [], + "slices": [ + { + "key": "arabic", + "display_name": "Arabic", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "bengali", + "display_name": "Bengali", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "burmese", + "display_name": "Burmese", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "chinese", + "display_name": "Chinese", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "culturally_agnostic", + "display_name": "Culturally Agnostic", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "culturally_sensitive", + "display_name": "Culturally Sensitive", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "english", + "display_name": "English", + "metrics": [ + { + "key": 
"accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "french", + "display_name": "French", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "german", + "display_name": "German", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "hindi", + "display_name": "Hindi", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "indonesian", + "display_name": "Indonesian", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "italian", + "display_name": "Italian", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "japanese", + "display_name": "Japanese", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "korean", + "display_name": "Korean", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "portuguese", + "display_name": "Portuguese", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "spanish", + "display_name": "Spanish", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "swahili", + "display_name": "Swahili", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "yoruba", + "display_name": "Yoruba", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + 
"sources": [ + "benchmark_default" + ] + } + ] + } + ], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "helm", + "display_name": "HELM", + "has_card": true, + "tags": { + "domains": [ + "biology", + "physics", + "chemistry", + "open domain qa", + "open book qa", + "multiple choice qa", + "instruction following", + "STEM", + "humanities", + "social sciences", + "math", + "law", + "engineering", + "economics", + "health", + "psychology", + "business", + "philosophy", + "computer science", + "history", + "olympiads", + "Info Seeking", + "Math & Data", + "Reasoning & Planning", + "Creative Tasks", + "natural language understanding", + "reading comprehension", + "natural language inference", + "machine learning fairness", + "bias measurement", + "toxic comment classification", + "text classification", + "summarization", + "journalism", + "news media", + "commonsense reasoning", + "question answering", + "dialogue modeling", + "text generation", + "grade school mathematics", + "math word problems", + "legal", + "finance", + "medical knowledge", + "professional medical exams" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering", + "Question answering", + "Text generation", + "Following verifiable instructions (e.g., word counts, formatting rules, keyword mentions)", + "Multiple-choice question answering across a broad range of subjects", + "Solving Olympiad-level mathematical problems", + "Solving competition-level mathematical problems", + "Rule-based evaluation on a filtered subset of problems (Omni-MATH-Rule)", + "Open-ended text generation in response to diverse user queries", + "Yes/no question answering", + "Text-pair classification", + "Binary toxicity classification (toxic vs. 
non-toxic)", + "Analysis of performance across identity subgroups", + "Summarization", + "Four-way multiple-choice selection for event continuation", + "Commonsense inference", + "Extractive question answering", + "Fill mask", + "Solving grade school math word problems", + "Text generation for question answering", + "Text classification", + "Rule-application tasks", + "Free-form multiple-choice question answering", + "Open-domain question answering" + ] + }, + "category": "general", + "standalone_benchmarks": [], + "composites": [ + { + "key": "helm_capabilities", + "display_name": "Helm capabilities", + "has_card": true, + "tags": { + "domains": [ + "biology", + "physics", + "chemistry", + "open domain qa", + "open book qa", + "multiple choice qa", + "instruction following", + "STEM", + "humanities", + "social sciences", + "math", + "law", + "engineering", + "economics", + "health", + "psychology", + "business", + "philosophy", + "computer science", + "history", + "olympiads", + "Info Seeking", + "Math & Data", + "Reasoning & Planning", + "Creative Tasks" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering", + "Question answering", + "Text generation", + "Following verifiable instructions (e.g., word counts, formatting rules, keyword mentions)", + "Multiple-choice question answering across a broad range of subjects", + "Solving Olympiad-level mathematical problems", + "Solving competition-level mathematical problems", + "Rule-based evaluation on a filtered subset of problems (Omni-MATH-Rule)", + "Open-ended text generation in response to diverse user queries" + ] + }, + "category": "general", + "benchmarks": [ + { + "key": "helm_capabilities", + "display_name": "Capabilities", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [ + { + "key": "mean", + "display_name": "Mean", + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" 
+ ] + } + ] + } + ], + "metrics": [] + }, + { + "key": "gpqa", + "display_name": "GPQA", + "has_card": true, + "tags": { + "domains": [ + "biology", + "physics", + "chemistry", + "open domain qa", + "open book qa", + "multiple choice qa" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering", + "Question answering", + "Text generation" + ] + }, + "slices": [], + "metrics": [ + { + "key": "cot_correct", + "display_name": "COT correct", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "ifeval", + "display_name": "IFEval", + "has_card": true, + "tags": { + "domains": [ + "instruction following" + ], + "languages": [ + "English" + ], + "tasks": [ + "Text generation", + "Following verifiable instructions (e.g., word counts, formatting rules, keyword mentions)" + ] + }, + "slices": [], + "metrics": [ + { + "key": "ifeval_strict_acc", + "display_name": "IFEval Strict Acc", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "mmlu_pro", + "display_name": "MMLU-Pro", + "has_card": true, + "tags": { + "domains": [ + "STEM", + "humanities", + "social sciences", + "math", + "physics", + "chemistry", + "law", + "engineering", + "economics", + "health", + "psychology", + "business", + "biology", + "philosophy", + "computer science", + "history" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering across a broad range of subjects" + ] + }, + "slices": [], + "metrics": [ + { + "key": "cot_correct", + "display_name": "COT correct", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "omni_math", + "display_name": "Omni-MATH", + "has_card": true, + "tags": { + "domains": [ + "math", + "olympiads" + ], + "languages": [ + "English" + ], + "tasks": [ + "Solving Olympiad-level mathematical problems", + "Solving competition-level mathematical problems", + "Rule-based evaluation on a filtered subset of problems (Omni-MATH-Rule)" + ] + }, + "slices": [], + 
"metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "wildbench", + "display_name": "WildBench", + "has_card": true, + "tags": { + "domains": [ + "Info Seeking", + "Math & Data", + "Reasoning & Planning", + "Creative Tasks" + ], + "languages": [ + "English" + ], + "tasks": [ + "Open-ended text generation in response to diverse user queries" + ] + }, + "slices": [], + "metrics": [ + { + "key": "wb_score", + "display_name": "WB Score", + "sources": [ + "evaluation_description" + ] + } + ] + } + ] + }, + { + "key": "helm_classic", + "display_name": "Helm classic", + "has_card": true, + "tags": { + "domains": [ + "natural language understanding", + "reading comprehension", + "natural language inference", + "machine learning fairness", + "bias measurement", + "toxic comment classification", + "text classification", + "summarization", + "journalism", + "news media", + "commonsense reasoning", + "STEM", + "humanities", + "social sciences", + "question answering", + "dialogue modeling", + "text generation" + ], + "languages": [ + "English" + ], + "tasks": [ + "Yes/no question answering", + "Text-pair classification", + "Binary toxicity classification (toxic vs. 
non-toxic)", + "Analysis of performance across identity subgroups", + "Summarization", + "Four-way multiple-choice selection for event continuation", + "Commonsense inference", + "Multiple-choice question answering", + "Extractive question answering", + "Text generation", + "Fill mask" + ] + }, + "category": "general", + "benchmarks": [ + { + "key": "boolq", + "display_name": "BoolQ", + "has_card": true, + "tags": { + "domains": [ + "natural language understanding", + "reading comprehension", + "natural language inference" + ], + "languages": [ + "English" + ], + "tasks": [ + "Yes/no question answering", + "Text-pair classification" + ] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "civilcomments", + "display_name": "CivilComments", + "has_card": true, + "tags": { + "domains": [ + "machine learning fairness", + "bias measurement", + "toxic comment classification", + "text classification" + ], + "languages": [ + "English" + ], + "tasks": [ + "Binary toxicity classification (toxic vs. 
non-toxic)", + "Analysis of performance across identity subgroups" + ] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "helm_classic", + "display_name": "Classic", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [ + { + "key": "mean", + "display_name": "Mean", + "metrics": [ + { + "key": "win_rate", + "display_name": "Win Rate", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + } + ], + "metrics": [] + }, + { + "key": "cnn_dailymail", + "display_name": "CNN/DailyMail", + "has_card": true, + "tags": { + "domains": [ + "summarization", + "journalism", + "news media" + ], + "languages": [ + "English" + ], + "tasks": [ + "Summarization" + ] + }, + "slices": [], + "metrics": [ + { + "key": "rouge_2", + "display_name": "ROUGE-2", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "hellaswag", + "display_name": "HellaSwag", + "has_card": true, + "tags": { + "domains": [ + "commonsense reasoning", + "natural language inference" + ], + "languages": [ + "English" + ], + "tasks": [ + "Four-way multiple-choice selection for event continuation", + "Commonsense inference" + ] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "imdb", + "display_name": "IMDB", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "mmlu", + "display_name": "MMLU", + "has_card": true, + "tags": { + "domains": [ + "STEM", + "humanities", + "social sciences" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering" + ] + }, + "slices": [], + "metrics": [ + { + 
"key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "ms_marco_trec", + "display_name": "MS MARCO (TREC)", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "ndcg_10", + "display_name": "NDCG@10", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "narrativeqa", + "display_name": "NarrativeQA", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "f1", + "display_name": "F1", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "naturalquestions_open_book", + "display_name": "NaturalQuestions (open-book)", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "f1", + "display_name": "F1", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "openbookqa", + "display_name": "OpenbookQA", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "quac", + "display_name": "QuAC", + "has_card": true, + "tags": { + "domains": [ + "question answering", + "dialogue modeling", + "text generation" + ], + "languages": [ + "English" + ], + "tasks": [ + "Extractive question answering", + "Text generation", + "Fill mask" + ] + }, + "slices": [], + "metrics": [ + { + "key": "f1", + "display_name": "F1", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "raft", + "display_name": "RAFT", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" 
+ ] + } + ] + }, + { + "key": "truthfulqa", + "display_name": "TruthfulQA", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "xsum", + "display_name": "XSUM", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "rouge_2", + "display_name": "ROUGE-2", + "sources": [ + "evaluation_description" + ] + } + ] + } + ] + }, + { + "key": "helm_instruct", + "display_name": "Helm instruct", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "general", + "benchmarks": [ + { + "key": "anthropic_rlhf_dataset", + "display_name": "Anthropic RLHF dataset", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "harmlessness", + "display_name": "Harmlessness", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "best_chatgpt_prompts", + "display_name": "Best ChatGPT Prompts", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "harmlessness", + "display_name": "Harmlessness", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "helm_instruct", + "display_name": "Instruct", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [ + { + "key": "mean", + "display_name": "Mean", + "metrics": [ + { + "key": "win_rate", + "display_name": "Win Rate", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + } + ], + "metrics": [] + }, + { + "key": "koala_test_dataset", + "display_name": "Koala test dataset", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + 
{ + "key": "harmlessness", + "display_name": "Harmlessness", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "open_assistant", + "display_name": "Open Assistant", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "harmlessness", + "display_name": "Harmlessness", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "self_instruct", + "display_name": "Self Instruct", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "harmlessness", + "display_name": "Harmlessness", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "vicuna", + "display_name": "Vicuna", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "harmlessness", + "display_name": "Harmlessness", + "sources": [ + "evaluation_description" + ] + } + ] + } + ] + }, + { + "key": "helm_lite", + "display_name": "Helm lite", + "has_card": true, + "tags": { + "domains": [ + "grade school mathematics", + "math word problems", + "legal", + "law", + "finance", + "medical knowledge", + "professional medical exams", + "STEM", + "humanities", + "social sciences" + ], + "languages": [ + "English" + ], + "tasks": [ + "Solving grade school math word problems", + "Text generation for question answering", + "Text classification", + "Question answering", + "Text generation", + "Rule-application tasks", + "Free-form multiple-choice question answering", + "Open-domain question answering", + "Multiple-choice question answering" + ] + }, + "category": "general", + "benchmarks": [ + { + "key": "gsm8k", + "display_name": "GSM8K", + "has_card": true, + "tags": { + "domains": [ + "grade school mathematics", + "math word problems" + ], + "languages": [ + "English" + ], + "tasks": [ + "Solving grade school math word problems", + "Text 
generation for question answering" + ] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "legalbench", + "display_name": "LegalBench", + "has_card": true, + "tags": { + "domains": [ + "legal", + "law", + "finance" + ], + "languages": [ + "English" + ], + "tasks": [ + "Text classification", + "Question answering", + "Text generation", + "Rule-application tasks" + ] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "helm_lite", + "display_name": "Lite", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [ + { + "key": "mean", + "display_name": "Mean", + "metrics": [ + { + "key": "win_rate", + "display_name": "Win Rate", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + } + ], + "metrics": [] + }, + { + "key": "math", + "display_name": "MATH", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "score_details" + ] + } + ] + }, + { + "key": "medqa", + "display_name": "MedQA", + "has_card": true, + "tags": { + "domains": [ + "medical knowledge", + "professional medical exams" + ], + "languages": [ + "English" + ], + "tasks": [ + "Free-form multiple-choice question answering", + "Open-domain question answering" + ] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "mmlu", + "display_name": "MMLU", + "has_card": true, + "tags": { + "domains": [ + "STEM", + "humanities", + "social sciences" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering" + ] + }, + "slices": [], + "metrics": [ + { + "key": 
"exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "narrativeqa", + "display_name": "NarrativeQA", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "f1", + "display_name": "F1", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "naturalquestions_closed_book", + "display_name": "NaturalQuestions (closed-book)", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "f1", + "display_name": "F1", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "openbookqa", + "display_name": "OpenbookQA", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "wmt_2014", + "display_name": "WMT 2014", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "bleu_4", + "display_name": "BLEU-4", + "sources": [ + "evaluation_description" + ] + } + ] + } + ] + }, + { + "key": "helm_mmlu", + "display_name": "Helm mmlu", + "has_card": true, + "tags": { + "domains": [ + "STEM", + "humanities", + "social sciences" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering" + ] + }, + "category": "reasoning", + "slices": [ + { + "key": "abstract_algebra", + "display_name": "Abstract Algebra", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "anatomy", + "display_name": "Anatomy", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": 
"astronomy", + "display_name": "Astronomy", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "business_ethics", + "display_name": "Business Ethics", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "clinical_knowledge", + "display_name": "Clinical Knowledge", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "college_physics", + "display_name": "College Physics", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "computer_security", + "display_name": "Computer Security", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "conceptual_physics", + "display_name": "Conceptual Physics", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "econometrics", + "display_name": "Econometrics", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "electrical_engineering", + "display_name": "Electrical Engineering", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "elementary_mathematics", + "display_name": "Elementary Mathematics", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "formal_logic", + "display_name": "Formal Logic", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + 
"sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "global_facts", + "display_name": "Global Facts", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "high_school_world_history", + "display_name": "High School World History", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "human_sexuality", + "display_name": "Human Sexuality", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "international_law", + "display_name": "International Law", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "jurisprudence", + "display_name": "Jurisprudence", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "logical_fallacies", + "display_name": "Logical Fallacies", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "machine_learning", + "display_name": "Machine Learning", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "management", + "display_name": "Management", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "marketing", + "display_name": "Marketing", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "mean", + "display_name": "Mean", + "metrics": [ + { + "key": "win_rate", + "display_name": 
"Win Rate", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "medical_genetics", + "display_name": "Medical Genetics", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "miscellaneous", + "display_name": "Miscellaneous", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "mmlu_all_subjects", + "display_name": "MMLU All Subjects", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "moral_scenarios", + "display_name": "Moral Scenarios", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "nutrition", + "display_name": "Nutrition", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "philosophy", + "display_name": "Philosophy", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "prehistory", + "display_name": "Prehistory", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "professional_psychology", + "display_name": "Professional Psychology", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "public_relations", + "display_name": "Public Relations", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "security_studies", + "display_name": "Security Studies", + "metrics": [ + { + "key": 
"exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "sociology", + "display_name": "Sociology", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "us_foreign_policy", + "display_name": "Us Foreign Policy", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "virology", + "display_name": "Virology", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "world_religions", + "display_name": "World Religions", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + } + ], + "metrics": [] + } + ] + }, + { + "key": "hfopenllm_v2", + "display_name": "Hfopenllm v2", + "has_card": true, + "tags": { + "domains": [ + "biology", + "physics", + "chemistry", + "open domain qa", + "open book qa", + "multiple choice qa", + "instruction following", + "mathematics", + "explanation generation", + "STEM", + "humanities", + "social sciences", + "math", + "law", + "engineering", + "economics", + "health", + "psychology", + "business", + "philosophy", + "computer science", + "history", + "reasoning", + "commonsense reasoning", + "planning" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering", + "Question answering", + "Text generation", + "Following verifiable instructions (e.g., word counts, formatting rules, keyword mentions)", + "Mathematical problem solving", + "Step-by-step solution generation", + "Final answer generation", + "Multiple-choice question answering across a broad range of subjects", + "Solving murder mysteries", + "Solving object placement problems", + "Solving team allocation problems" + ] + }, + "category": 
"instruction_following", + "standalone_benchmarks": [], + "composites": [], + "benchmarks": [ + { + "key": "bbh", + "display_name": "BBH", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "gpqa", + "display_name": "GPQA", + "has_card": true, + "tags": { + "domains": [ + "biology", + "physics", + "chemistry", + "open domain qa", + "open book qa", + "multiple choice qa" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering", + "Question answering", + "Text generation" + ] + }, + "slices": [], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "ifeval", + "display_name": "IFEval", + "has_card": true, + "tags": { + "domains": [ + "instruction following" + ], + "languages": [ + "English" + ], + "tasks": [ + "Text generation", + "Following verifiable instructions (e.g., word counts, formatting rules, keyword mentions)" + ] + }, + "slices": [], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "math_level_5", + "display_name": "MATH Level 5", + "has_card": true, + "tags": { + "domains": [ + "mathematics", + "explanation generation" + ], + "languages": [ + "English" + ], + "tasks": [ + "Mathematical problem solving", + "Step-by-step solution generation", + "Final answer generation" + ] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "mmlu_pro", + "display_name": "MMLU-PRO", + "has_card": true, + "tags": { + "domains": [ + "STEM", + "humanities", + "social sciences", + "math", + "physics", + "chemistry", + "law", + "engineering", + "economics", + 
"health", + "psychology", + "business", + "biology", + "philosophy", + "computer science", + "history" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering across a broad range of subjects" + ] + }, + "slices": [], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "musr", + "display_name": "MUSR", + "has_card": true, + "tags": { + "domains": [ + "reasoning", + "commonsense reasoning", + "planning" + ], + "languages": [ + "English" + ], + "tasks": [ + "Question answering", + "Solving murder mysteries", + "Solving object placement problems", + "Solving team allocation problems" + ] + }, + "slices": [], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "evaluation_description" + ] + } + ] + } + ] + }, + { + "key": "la_leaderboard", + "display_name": "La leaderboard", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "other", + "standalone_benchmarks": [], + "composites": [], + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "livecodebenchpro", + "display_name": "Livecodebenchpro", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "agentic", + "standalone_benchmarks": [], + "composites": [], + "benchmarks": [ + { + "key": "easy_problems", + "display_name": "Easy Problems", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "pass_at_1", + "display_name": "Pass@1", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "hard_problems", + "display_name": "Hard Problems", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "pass_at_1", + 
"display_name": "Pass@1", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "medium_problems", + "display_name": "Medium Problems", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "pass_at_1", + "display_name": "Pass@1", + "sources": [ + "evaluation_description" + ] + } + ] + } + ] + }, + { + "key": "reward-bench", + "display_name": "Reward Bench", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "safety", + "standalone_benchmarks": [], + "composites": [], + "benchmarks": [ + { + "key": "chat", + "display_name": "Chat", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "chat_hard", + "display_name": "Chat Hard", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "prior_sets_0_5_weight", + "display_name": "Prior Sets (0.5 weight)", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "reasoning", + "display_name": "Reasoning", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "reward_bench", + "display_name": "Reward bench", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name" 
+ ] + } + ] + }, + { + "key": "safety", + "display_name": "Safety", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + } + ] + }, + { + "key": "reward-bench-2", + "display_name": "Reward Bench 2", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "safety", + "standalone_benchmarks": [], + "composites": [], + "benchmarks": [ + { + "key": "factuality", + "display_name": "Factuality", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "focus", + "display_name": "Focus", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "math", + "display_name": "Math", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "precise_if", + "display_name": "Precise IF", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "reward_bench_2", + "display_name": "Reward bench 2", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name" + ] + } + ] + }, + { + "key": "safety", + "display_name": "Safety", + "has_card": false, + "tags": { + "domains": [], + 
"languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "ties", + "display_name": "Ties", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + } + ] + }, + { + "key": "sciarena", + "display_name": "Sciarena", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "other", + "standalone_benchmarks": [], + "composites": [], + "slices": [], + "metrics": [ + { + "key": "cost_per_100_calls", + "display_name": "Cost per 100 Calls", + "sources": [ + "metric_config" + ] + }, + { + "key": "elo", + "display_name": "Elo Rating", + "sources": [ + "metric_config" + ] + }, + { + "key": "rank", + "display_name": "Rank", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "swe-bench", + "display_name": "Swe Bench", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "agentic", + "standalone_benchmarks": [], + "composites": [], + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "tau-bench-2", + "display_name": "Tau Bench 2", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "agentic", + "standalone_benchmarks": [], + "composites": [], + "benchmarks": [ + { + "key": "tau_bench_2_airline", + "display_name": "tau-bench-2/airline", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "tau_bench_2_retail", + "display_name": "tau-bench-2/retail", + "has_card": false, + "tags": { + "domains": [], + 
"languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "tau_bench_2_telecom", + "display_name": "tau-bench-2/telecom", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + } + ] + }, + { + "key": "terminal-bench-2.0", + "display_name": "Terminal Bench 2 0", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "agentic", + "standalone_benchmarks": [], + "composites": [], + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "theory_of_mind", + "display_name": "Theory of mind", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "other", + "standalone_benchmarks": [], + "composites": [], + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "wordle_arena", + "display_name": "Wordle arena", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "other", + "standalone_benchmarks": [], + "composites": [], + "slices": [], + "metrics": [ + { + "key": "average_attempts", + "display_name": "Average Attempts", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + }, + { + "key": "average_latency_ms", + "display_name": "Average Latency (ms)", + "sources": [ + "evaluation_name_suffix" + ] + }, + { + "key": "win_rate", + "display_name": "Win Rate", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/misc/eval_hierarchy.md b/misc/eval_hierarchy.md new file mode 100644 index 000000000..d6005f3c8 --- /dev/null +++ 
b/misc/eval_hierarchy.md @@ -0,0 +1,459 @@ +# EEE Eval Hierarchy + +## QA Summary +- Families: `20` +- Composite benchmarks: `20` +- Standalone benchmarks: `10` +- Benchmarks: `108` +- Slices: `58` +- Unique metrics: `208` +- Metric rows scanned: `41616` +- Fallback metrics: `2231` +- Benchmarks that still look metric-like: `0` +- Benchmarks where name matches the only metric: `0` + +### Fallback Metrics +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `la_leaderboard` -> `La leaderboard` -> `Score` +- 
`la_leaderboard` -> `La leaderboard` -> `Score` +- `la_leaderboard` -> `La leaderboard` -> `Score` +- `la_leaderboard` -> `La leaderboard` -> `Score` +- `la_leaderboard` -> `La leaderboard` -> `Score` +- `reward-bench` -> `Chat` -> `Score` +- `reward-bench` -> `Chat Hard` -> `Score` +- `reward-bench` -> `Safety` -> `Score` +- `reward-bench` -> `Reasoning` -> `Score` +- `reward-bench` -> `Prior Sets (0.5 weight)` -> `Score` +- `reward-bench` -> `Chat` -> `Score` +- `reward-bench` -> `Chat Hard` -> `Score` +- `reward-bench` -> `Safety` -> `Score` +- `reward-bench` -> `Reasoning` -> `Score` +- `reward-bench` -> `Prior Sets (0.5 weight)` -> `Score` +- `reward-bench` -> `Chat` -> `Score` +- `reward-bench` -> `Chat Hard` -> `Score` +- `reward-bench` -> `Safety` -> `Score` +- `reward-bench` -> `Reasoning` -> `Score` +- `reward-bench` -> `Prior Sets (0.5 weight)` -> `Score` + +## Hierarchy + +- [ ] Ace + - [ ] DIY + - Score + - [ ] Food + - Score + - [ ] Gaming + - Score + - [ ] Overall + - Score + - [ ] Shopping + - Score +- [ ] Apex + - [ ] Apex Agents + - [ ] Corporate Law + - Pass@1 + - [ ] Corporate Lawyer + - Mean Score + - [ ] Investment Banking + - Pass@1 + - [ ] Management Consulting + - Pass@1 + - [ ] Overall + - Mean Score + - Pass@1 + - Pass@8 + - [ ] Apex V1 + - [ ] Big Law + - Score + - [ ] Consulting + - Score + - [ ] Investment Banking + - Score + - [ ] Medicine (MD) + - Score + - [ ] Overall + - Score +- [ ] Appworld + - Score +- [ ] Arc Agi + - [ ] v1_Public_Eval + - Cost per Task + - Score + - [ ] v1_Semi_Private + - Cost per Task + - Score + - [ ] v2_Private_Eval + - Cost per Task + - Score + - [ ] v2_Public_Eval + - Cost per Task + - Score + - [ ] v2_Semi_Private + - Cost per Task + - Score + - [ ] v3_Semi_Private + - Cost + - Score +- [ ] Bfcl + - [ ] Format sensitivity + - Format Sensitivity Max Delta + - Format Sensitivity Standard Deviation + - [ ] Live + - Live accuracy + - Live multiple AST accuracy + - Live parallel AST accuracy + - Live 
parallel multiple AST accuracy + - Live simple AST accuracy + - [ ] Memory + - Accuracy + - Memory KV accuracy + - Memory recursive summarization accuracy + - Memory vector accuracy + - [ ] Multi turn + - Accuracy + - Multi-turn base accuracy + - Multi-turn long-context accuracy + - Multi-turn missing function accuracy + - Multi-turn missing parameter accuracy + - [ ] Non live + - Non-live AST accuracy + - Non-live multiple AST accuracy + - Non-live parallel AST accuracy + - Non-live parallel multiple AST accuracy + - Non-live simple AST accuracy + - [ ] Overall + - Latency 95th Percentile + - Latency Mean + - Latency Standard Deviation + - Overall Accuracy + - Rank + - Total Cost + - [ ] Relevance + - Irrelevance detection accuracy + - Relevance detection accuracy + - [ ] Web search + - Accuracy + - Multi-turn base accuracy + - Web-search no-snippet accuracy +- [ ] browsecompplus + - Score +- [ ] Fibble + - [ ] Fibble arena + - Average Attempts + - Average Latency (ms) + - Win Rate + - [ ] Fibble1 arena + - Average Attempts + - Win Rate + - [ ] Fibble2 arena + - Average Attempts + - Average Latency (ms) + - Win Rate + - [ ] Fibble3 arena + - Average Attempts + - Average Latency (ms) + - Win Rate + - [ ] Fibble4 arena + - Average Attempts + - Average Latency (ms) + - Win Rate + - [ ] Fibble5 arena + - Average Attempts + - Average Latency (ms) + - Win Rate +- [ ] Global MMLU Lite + - Arabic + - Accuracy + - Bengali + - Accuracy + - Burmese + - Accuracy + - Chinese + - Accuracy + - Culturally Agnostic + - Accuracy + - Culturally Sensitive + - Accuracy + - English + - Accuracy + - French + - Accuracy + - German + - Accuracy + - Hindi + - Accuracy + - Indonesian + - Accuracy + - Italian + - Accuracy + - Japanese + - Accuracy + - Korean + - Accuracy + - Portuguese + - Accuracy + - Spanish + - Accuracy + - Swahili + - Accuracy + - Yoruba + - Accuracy + - Accuracy +- [x] HELM + - [x] Helm capabilities + - [ ] Capabilities + - Mean + - Score + - [x] GPQA + - COT correct + 
- [x] IFEval + - IFEval Strict Acc + - [x] MMLU-Pro + - COT correct + - [x] Omni-MATH + - Accuracy + - [x] WildBench + - WB Score + - [x] Helm classic + - [x] BoolQ + - Exact Match + - [x] CivilComments + - Exact Match + - [ ] Classic + - Mean + - Win Rate + - [x] CNN/DailyMail + - ROUGE-2 + - [x] HellaSwag + - Exact Match + - [ ] IMDB + - Exact Match + - [x] MMLU + - Exact Match + - [ ] MS MARCO (TREC) + - NDCG@10 + - [ ] NarrativeQA + - F1 + - [ ] NaturalQuestions (open-book) + - F1 + - [ ] OpenbookQA + - Exact Match + - [x] QuAC + - F1 + - [ ] RAFT + - Exact Match + - [ ] TruthfulQA + - Exact Match + - [ ] XSUM + - ROUGE-2 + - [ ] Helm instruct + - [ ] Anthropic RLHF dataset + - Harmlessness + - [ ] Best ChatGPT Prompts + - Harmlessness + - [ ] Instruct + - Mean + - Win Rate + - [ ] Koala test dataset + - Harmlessness + - [ ] Open Assistant + - Harmlessness + - [ ] Self Instruct + - Harmlessness + - [ ] Vicuna + - Harmlessness + - [x] Helm lite + - [x] GSM8K + - Exact Match + - [x] LegalBench + - Exact Match + - [ ] Lite + - Mean + - Win Rate + - [ ] MATH + - Accuracy + - [x] MedQA + - Exact Match + - [x] MMLU + - Exact Match + - [ ] NarrativeQA + - F1 + - [ ] NaturalQuestions (closed-book) + - F1 + - [ ] OpenbookQA + - Exact Match + - [ ] WMT 2014 + - BLEU-4 + - [x] Helm mmlu + - Abstract Algebra + - Exact Match + - Anatomy + - Exact Match + - Astronomy + - Exact Match + - Business Ethics + - Exact Match + - Clinical Knowledge + - Exact Match + - College Physics + - Exact Match + - Computer Security + - Exact Match + - Conceptual Physics + - Exact Match + - Econometrics + - Exact Match + - Electrical Engineering + - Exact Match + - Elementary Mathematics + - Exact Match + - Formal Logic + - Exact Match + - Global Facts + - Exact Match + - High School World History + - Exact Match + - Human Sexuality + - Exact Match + - International Law + - Exact Match + - Jurisprudence + - Exact Match + - Logical Fallacies + - Exact Match + - Machine Learning + - Exact Match + 
- Management + - Exact Match + - Marketing + - Exact Match + - Mean + - Win Rate + - Medical Genetics + - Exact Match + - Miscellaneous + - Exact Match + - MMLU All Subjects + - Exact Match + - Moral Scenarios + - Exact Match + - Nutrition + - Exact Match + - Philosophy + - Exact Match + - Prehistory + - Exact Match + - Professional Psychology + - Exact Match + - Public Relations + - Exact Match + - Security Studies + - Exact Match + - Sociology + - Exact Match + - Us Foreign Policy + - Exact Match + - Virology + - Exact Match + - World Religions + - Exact Match +- [x] Hfopenllm v2 + - [ ] BBH + - Accuracy + - [x] GPQA + - Accuracy + - [x] IFEval + - Accuracy + - [x] MATH Level 5 + - Exact Match + - [x] MMLU-PRO + - Accuracy + - [x] MUSR + - Accuracy +- [ ] La leaderboard + - Score +- [ ] Livecodebenchpro + - [ ] Easy Problems + - Pass@1 + - [ ] Hard Problems + - Pass@1 + - [ ] Medium Problems + - Pass@1 +- [ ] Reward Bench + - [ ] Chat + - Score + - [ ] Chat Hard + - Score + - [ ] Prior Sets (0.5 weight) + - Score + - [ ] Reasoning + - Score + - [ ] Reward bench + - Score + - [ ] Safety + - Score +- [ ] Reward Bench 2 + - [ ] Factuality + - Score + - [ ] Focus + - Score + - [ ] Math + - Score + - [ ] Precise IF + - Score + - [ ] Reward bench 2 + - Score + - [ ] Safety + - Score + - [ ] Ties + - Score +- [ ] Sciarena + - Cost per 100 Calls + - Elo Rating + - Rank +- [ ] Swe Bench + - Score +- [ ] Tau Bench 2 + - [ ] tau-bench-2/airline + - Score + - [ ] tau-bench-2/retail + - Score + - [ ] tau-bench-2/telecom + - Score +- [ ] Terminal Bench 2 0 + - Score +- [ ] Theory of mind + - Score +- [ ] Wordle arena + - Average Attempts + - Average Latency (ms) + - Win Rate diff --git a/plan/backend-canonical-identity-plan.md b/plan/backend-canonical-identity-plan.md new file mode 100644 index 000000000..3cd00e9c5 --- /dev/null +++ b/plan/backend-canonical-identity-plan.md @@ -0,0 +1,115 @@ +# Backend Canonical Identity Plan (Data Audit + Actions) + +## Snapshot audited + +- 
**Code repo (`~/every_eval_ever`)** updated to `aa966f7cf` (origin/main). +- **Datastore (`evaleval/EEE_datastore`)** updated to `5edc7b9`. +- Audit scope: all aggregate JSON files under `data/**` (`6448` files, `49659` evaluation results). + +## What is happening (evidence from latest data) + +1. **Metric identity is mostly missing in production data** + - `metric_config.metric_name` missing in **37071 / 49659** results. + - `metric_config.metric_id` missing in **37071 / 49659** results. + - This is concentrated in major configs: `hfopenllm_v2`, `helm_*`, `reward-bench`, `global-mmlu-lite`, `fibble_arena`, `wordle_arena`, `terminal-bench-2.0`, etc. + - Concrete live examples: + - `global-mmlu-lite/xai_grok-3-mini/1773936496.366405` has **19** results and **0 / 19** populated `metric_name` or `metric_id` fields; the only explicit labels are `evaluation_name` values such as `Global MMLU Lite`, `Culturally Sensitive`, `Arabic`, `English`, etc. + - `wordle_arena/qwen/qwen3-8b/1776347262.820056` has **3** results and **0 / 3** populated `metric_name` or `metric_id` fields. + - Backend implication: cannot reliably group/compare metrics without string parsing heuristics. + +2. **`evaluation_name` is frequently carrying metric semantics** + - **615** results have metric-like `evaluation_name`. + - Confirmed examples: + - `apex-agents`: `evaluation_name: "Overall Pass@1"` (metric semantics in eval field). + - `bfcl`: `evaluation_name: "bfcl.memory.accuracy"` while metric fields are also populated (eval and metric axes collapsed). + - `theory_of_mind`: `evaluation_name: "accuracy on theory_of_mind for scorer ..."` (legacy converter style). + - `wordle_arena/qwen/qwen3-8b/1776347262.820056`: `evaluation_name` values are `wordle_arena_win_rate`, `wordle_arena_avg_attempts`, and `wordle_arena_avg_latency_ms`, so the eval axis is fully metric-shaped. 
+ - `global-mmlu-lite/xai_grok-3-mini/1773936496.366405`: `evaluation_name` is used for suite/slice labels (`Global MMLU Lite`, `Arabic`, `French`, etc.) while the implicit metric remains unstated, so eval and metric identity are still entangled even though the names are not metric-like. + - Backend implication: card grouping by evaluation name produces metric-shaped “benchmarks”. + +3. **`score_details.details` is overloaded as a nested telemetry dump** + - Found **52208** JSON-encoded values stored as strings inside `score_details.details`. + - HELM MMLU example (`Abstract Algebra` / `College Physics`) contains many cross-subject entries (e.g., College Chemistry/Biology stats inside College Physics row), mixing eval slice + telemetry dimensions. + - Backend implication: requires expensive post-parsing and risks accidental interpretation as benchmark/metric labels. + +4. **Benchmark/evaluation_id naming is not consistently aligned** + - **257** files where `evaluation_id` prefix does not match top-level folder benchmark codename. + - Main cases: + - `reward-bench` folder vs `evaluation_id` prefix `reward-bench-2`. + - `tau-bench-2_{domain}` and `appworld_test_normal` folders vs `evaluation_id` prefixes with hierarchical paths (`tau-bench-2/...`, `appworld/...`). + - Backend implication: any logic keyed on only one naming source (folder or `evaluation_id`) drifts. + +5. **Eval library naming is not standardized** + - **16 distinct `eval_library.name` values** including mixed casing and source-specific names (`lm-evaluation-harness`, `BFCL`, `Artificial Analysis`, `ARC Prize leaderboard`, `harbor`, `unknown`, etc.). + - Backend implication: harness-level analytics and joins need alias normalization today. + +6. **Fibble family note** + - Current snapshot no longer has `fibble1_arena`, `fibble2_arena` top-level folders; it is consolidated as `fibble_arena`. 
+ - But fibble still encodes both slice and metric in `evaluation_name` (`fibble_arena_1lie_win_rate`, `...avg_attempts`), with missing metric IDs. + +7. **`detailed_evaluation_results` coverage can be metric-selective inside one aggregate run** + - Current live example: `wordle_arena/qwen/qwen3-8b/1776347262.820056` exposes **3** aggregate metrics (`win_rate`, `avg_attempts`, `avg_latency_ms`) and links one sample file with **35** rows. + - All **35 / 35** current sample rows in `9a357c44-1c36-43dc-a764-de1f3e204fe1_samples.jsonl` carry `evaluation_name = "wordle_arena_win_rate"`. + - The same aggregate currently declares `detailed_evaluation_results.total_rows = 27`, so file-link metadata and actual sample-row counts can already disagree in production. + - Backend implication: a linked sample file does not imply run-wide instance coverage. Aggregate-to-instance linkage must remain metric-scoped, and instance-availability badges should be computed per metric or per eval-summary node, not per run. + +## Backend-centric recommendations (proposed) + +1. **Enforce canonical identity at ingestion (hard)** + - Persist canonical tuple (backend-owned): + `(run_id, model_id, benchmark_family_id, eval_slice_id, metric_id, harness_id, result_index)`. + - Keep raw fields in parallel (`raw_evaluation_name`, `raw_metric_description`, etc.) for audit/debug. + +2. **Add registry-backed resolution with confidence** + - Resolve benchmark/eval-slice/metric/harness via registry aliases (`exact`, `normalized`, `fuzzy`, `manual`). + - Store `strategy`, `confidence`, `review_status`; quarantine low-confidence rows from card generation. + +3. **Add semantic validation gates in ingestion CI** + - Reject or flag: + - metric-like `evaluation_name` without explicit metric identity, + - `evaluation_name == metric_name` collisions, + - benchmark-family naming drift (`folder` vs `evaluation_id` inconsistencies). 
+ - linked sample files whose rows cover only a strict subset of aggregate metrics without explicit metric-scoped coverage metadata. + - linked sample files whose observed row count disagrees with declared `detailed_evaluation_results.total_rows`. + - Keep structural schema validation, but add these semantic checks as a second gate. + +4. **Phase-in stricter schema usage for metrics** + - Immediate: warn-only for missing `metric_name`/`metric_id`. + - Next: soft fail in bot with override. + - Final: hard fail (for new submissions) unless `metric_name` + `metric_id` present. + +5. **Serve frontend from canonical IDs only** + - Frontend card grouping/filtering must use canonical IDs, never raw labels. + - Raw labels are display metadata only. + - Instance availability must be attached to canonical metric/eval-summary IDs, not inferred from the existence of any `detailed_evaluation_results` file on the parent run. + - This prevents recurring “benchmark cards that are actually metrics”. + +## Should we fix adapters and regenerate data? + +**Short answer: yes, but only for adapter-owned benchmark families.** + +### Good candidates for adapter-fix + regenerate + +Adapters exist in `utils/` for: +- `hfopenllm_v2` +- `helm` (`helm_lite`, `helm_mmlu`, `helm_capabilities`, `helm_classic`, `helm_instruct`) +- `rewardbench` +- `global-mmlu-lite` +- `terminal_bench_2` +- `exgentic` (used by tau/appworld/swe/browsecompplus in this dataset) + +These are high-leverage because they account for a large share of missing metric identity. + +### Not fully solved by adapter regeneration alone + +Several benchmark families in data are not obviously sourced from current `utils/` adapters (or are manually/externally produced), including examples like: +- `apex-agents`, `apex-v1`, `bfcl`, `artificial-analysis-llms`, `arc-agi`, `sciarena`, `fibble_arena`, `wordle_arena`, `ace`, `la_leaderboard`. 
+ +For these, you need a **backfill canonicalization migration** + submission template updates, not only adapter patches. + +### Practical plan + +1. Patch adapters to emit explicit `metric_name` + `metric_id` and metric-free `evaluation_name`. +2. Regenerate adapter-owned families in a controlled replay branch. +3. Run one-time migration for non-adapter/manual families. +4. Turn on semantic gating and canonical-ID-only serving. diff --git a/scripts/plot_dataset_statistics.py b/scripts/plot_dataset_statistics.py index b2479699b..e9b7a6c65 100644 --- a/scripts/plot_dataset_statistics.py +++ b/scripts/plot_dataset_statistics.py @@ -76,9 +76,14 @@ def import_plotting() -> tuple[Any, Any | None]: def label(row: dict[str, Any]) -> str: benchmark = str(row['benchmark']) evaluation = str(row['evaluation_name']) + metric = row.get('metric_id') or row.get('metric_name') if benchmark == evaluation: - return benchmark - return f'{benchmark}: {evaluation}' + base = benchmark + else: + base = f'{benchmark}: {evaluation}' + if metric: + return f'{base} [{metric}]' + return base def short_label(value: str, width: int = 46) -> str: diff --git a/tests/test_dataset_statistics.py b/tests/test_dataset_statistics.py index b9f2cbd6d..708572cf6 100644 --- a/tests/test_dataset_statistics.py +++ b/tests/test_dataset_statistics.py @@ -14,6 +14,10 @@ def row( lower_is_better: bool = False, score_type: str | None = 'continuous', inference_engine: str | None = None, + metric_id: str | None = 'score', + metric_name: str | None = 'Score', + metric_kind: str | None = 'accuracy', + metric_unit: str | None = 'proportion', ) -> dict: return { 'schema_version': '0.2.2', @@ -29,6 +33,10 @@ def row( 'lower_is_better': lower_is_better, 'score_type': score_type, 'has_uncertainty': False, + 'metric_id': metric_id, + 'metric_name': metric_name, + 'metric_kind': metric_kind, + 'metric_unit': metric_unit, } @@ -59,11 +67,24 @@ def test_invalid_rows_are_excluded_and_counted(): } -def 
test_shared_evaluation_key_includes_score_scale_and_direction(): +def test_shared_evaluation_key_includes_metric_scale_and_direction(): base = row('a', 'bench', 'eval', 0.8) + different_metric = row( + 'a', + 'bench', + 'eval', + 0.7, + metric_id='cost_per_task', + metric_name='Cost per task', + metric_kind='cost', + metric_unit='usd', + ) different_scale = row('a', 'bench', 'eval', 80.0, max_score=100.0) different_direction = row('a', 'bench', 'eval', 0.2, lower_is_better=True) + assert stats.shared_evaluation_key(base) != stats.shared_evaluation_key( + different_metric + ) assert stats.shared_evaluation_key(base) != stats.shared_evaluation_key( different_scale ) @@ -143,10 +164,51 @@ def test_json_report_shape(): assert report['descriptive']['counts']['result_rows'] == 2 assert 'inference_engines' in report['descriptive'] assert 'models_per_benchmark' in report['descriptive'] + assert 'metric_id' in report['descriptive']['score_summaries'][0] assert 'coverage_aware_model_summaries' in report['observational'] assert 'pairwise_model_comparisons' in report['observational'] +def test_score_summaries_group_by_metric_identity(): + rows = [ + row('model/a', 'arc', 'v1_Semi_Private', 0.98), + row( + 'model/a', + 'arc', + 'v1_Semi_Private', + 17.0, + max_score=77.2, + lower_is_better=True, + metric_id='cost_per_task', + metric_name='Cost per task', + metric_kind='cost', + metric_unit='usd', + ), + ] + + report = stats.build_statistics_report( + rows, + summary_limit=10, + comparison_limit=5, + top_model_limit=5, + min_shared_evals=1, + descriptive_only=True, + ) + + raw_summaries = report['descriptive']['score_summaries'] + normalized_summaries = report['descriptive']['normalized_score_summaries'] + + assert {item['metric_id'] for item in raw_summaries} == { + 'score', + 'cost_per_task', + } + assert {item['count'] for item in raw_summaries} == {1} + assert {item['metric_id'] for item in normalized_summaries} == { + 'score', + 'cost_per_task', + } + + def 
test_models_per_benchmark_dedupes_model_counts(): rows = [ row('model/a', 'bench-one', 'eval-a', 0.9), From 6f62905550380347544d0735111919ade8e665f6 Mon Sep 17 00:00:00 2001 From: Yanan Long Date: Thu, 30 Apr 2026 09:52:42 -0300 Subject: [PATCH 09/15] Revert "Fix top coverage plot ordering" This reverts commit f0e5dcd8ec09d85beb9831265127dc7e37bc0d26. --- scripts/plot_dataset_statistics.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/plot_dataset_statistics.py b/scripts/plot_dataset_statistics.py index e9b7a6c65..aa36f55fe 100644 --- a/scripts/plot_dataset_statistics.py +++ b/scripts/plot_dataset_statistics.py @@ -202,7 +202,7 @@ def plot_top_evaluation_coverage( sns: Any | None, top_n: int, ) -> Path: - selected = top_rows(rows, 'count', top_n) + selected = list(reversed(top_rows(rows, 'count', top_n))) labels = [short_label(label(row), 58) for row in selected] counts = [row['count'] for row in selected] @@ -211,8 +211,6 @@ def plot_top_evaluation_coverage( sns.barplot(x=counts, y=labels, hue=labels, ax=ax, legend=False) else: ax.barh(labels, counts) - if ax.get_ylim()[0] < ax.get_ylim()[1]: - ax.invert_yaxis() ax.set_xlabel('Normalized result rows') ax.set_ylabel('') ax.set_title(f'Top {len(selected)} Evaluations By Coverage') From d0b09e105c5448009be20f81158b9dd3da2bcf78 Mon Sep 17 00:00:00 2001 From: Yanan Long Date: Thu, 30 Apr 2026 10:01:37 -0300 Subject: [PATCH 10/15] Ignore audit and plan directories --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index fa0573f49..8e12943bc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # Local data (generated by running adapters) # data/ audit/ +plan/ # Byte-compiled / optimized / DLL files __pycache__/ From ef7ce1f449e07eba51ca9f6272dc3493a71d65a2 Mon Sep 17 00:00:00 2001 From: Yanan Long Date: Thu, 30 Apr 2026 10:05:35 -0300 Subject: [PATCH 11/15] Remove tracked audit artifacts --- audit/audit_after.json | 262 -- audit/audit_before.json | 
426 --- audit/dataset_statistics.json | 2407 ----------------- .../coverage_counts.pdf | Bin 16555 -> 0 bytes .../inference_engine_spread.pdf | Bin 16123 -> 0 bytes .../models_per_dataset_histogram.pdf | Bin 13934 -> 0 bytes .../normalization_quality.pdf | Bin 15186 -> 0 bytes .../normalized_score_mean_by_eval.pdf | Bin 22251 -> 0 bytes .../normalized_score_variability.pdf | Bin 34031 -> 0 bytes .../score_range_by_eval.pdf | Bin 21724 -> 0 bytes .../top_evaluation_coverage.pdf | Bin 21142 -> 0 bytes audit/dataset_statistics_summary.md | 37 - 12 files changed, 3132 deletions(-) delete mode 100644 audit/audit_after.json delete mode 100644 audit/audit_before.json delete mode 100644 audit/dataset_statistics.json delete mode 100644 audit/dataset_statistics_plots/coverage_counts.pdf delete mode 100644 audit/dataset_statistics_plots/inference_engine_spread.pdf delete mode 100644 audit/dataset_statistics_plots/models_per_dataset_histogram.pdf delete mode 100644 audit/dataset_statistics_plots/normalization_quality.pdf delete mode 100644 audit/dataset_statistics_plots/normalized_score_mean_by_eval.pdf delete mode 100644 audit/dataset_statistics_plots/normalized_score_variability.pdf delete mode 100644 audit/dataset_statistics_plots/score_range_by_eval.pdf delete mode 100644 audit/dataset_statistics_plots/top_evaluation_coverage.pdf delete mode 100644 audit/dataset_statistics_summary.md diff --git a/audit/audit_after.json b/audit/audit_after.json deleted file mode 100644 index 0dda99be0..000000000 --- a/audit/audit_after.json +++ /dev/null @@ -1,262 +0,0 @@ -{ - "files_scanned": 6448, - "results_scanned": 49659, - "missing": { - "metric_id": 1021, - "metric_name": 1021, - "metric_kind": 1021, - "metric_unit": 1021 - }, - "malformed": {}, - "top_missing_by_benchmark": { - "evaluation_result_id": [], - "metric_id": [ - [ - "fibble_arena", - 336 - ], - [ - "helm_classic", - 201 - ], - [ - "helm_lite", - 182 - ], - [ - "livecodebenchpro", - 87 - ], - [ - "helm_capabilities", - 68 
- ], - [ - "ace", - 32 - ], - [ - "apex-v1", - 19 - ], - [ - "appworld_test_normal", - 15 - ], - [ - "browsecompplus", - 15 - ], - [ - "swe-bench", - 15 - ], - [ - "tau-bench-2_airline", - 15 - ], - [ - "tau-bench-2_retail", - 15 - ], - [ - "tau-bench-2_telecom", - 15 - ], - [ - "la_leaderboard", - 5 - ], - [ - "theory_of_mind", - 1 - ] - ], - "metric_name": [ - [ - "fibble_arena", - 336 - ], - [ - "helm_classic", - 201 - ], - [ - "helm_lite", - 182 - ], - [ - "livecodebenchpro", - 87 - ], - [ - "helm_capabilities", - 68 - ], - [ - "ace", - 32 - ], - [ - "apex-v1", - 19 - ], - [ - "appworld_test_normal", - 15 - ], - [ - "browsecompplus", - 15 - ], - [ - "swe-bench", - 15 - ], - [ - "tau-bench-2_airline", - 15 - ], - [ - "tau-bench-2_retail", - 15 - ], - [ - "tau-bench-2_telecom", - 15 - ], - [ - "la_leaderboard", - 5 - ], - [ - "theory_of_mind", - 1 - ] - ], - "metric_kind": [ - [ - "fibble_arena", - 336 - ], - [ - "helm_classic", - 201 - ], - [ - "helm_lite", - 182 - ], - [ - "livecodebenchpro", - 87 - ], - [ - "helm_capabilities", - 68 - ], - [ - "ace", - 32 - ], - [ - "apex-v1", - 19 - ], - [ - "appworld_test_normal", - 15 - ], - [ - "browsecompplus", - 15 - ], - [ - "swe-bench", - 15 - ], - [ - "tau-bench-2_airline", - 15 - ], - [ - "tau-bench-2_retail", - 15 - ], - [ - "tau-bench-2_telecom", - 15 - ], - [ - "la_leaderboard", - 5 - ], - [ - "theory_of_mind", - 1 - ] - ], - "metric_unit": [ - [ - "fibble_arena", - 336 - ], - [ - "helm_classic", - 201 - ], - [ - "helm_lite", - 182 - ], - [ - "livecodebenchpro", - 87 - ], - [ - "helm_capabilities", - 68 - ], - [ - "ace", - 32 - ], - [ - "apex-v1", - 19 - ], - [ - "appworld_test_normal", - 15 - ], - [ - "browsecompplus", - 15 - ], - [ - "swe-bench", - 15 - ], - [ - "tau-bench-2_airline", - 15 - ], - [ - "tau-bench-2_retail", - 15 - ], - [ - "tau-bench-2_telecom", - 15 - ], - [ - "la_leaderboard", - 5 - ], - [ - "theory_of_mind", - 1 - ] - ] - } -} \ No newline at end of file diff --git a/audit/audit_before.json 
b/audit/audit_before.json deleted file mode 100644 index 7b432cb88..000000000 --- a/audit/audit_before.json +++ /dev/null @@ -1,426 +0,0 @@ -{ - "files_scanned": 6448, - "results_scanned": 49659, - "missing": { - "evaluation_result_id": 37071, - "metric_id": 37071, - "metric_name": 37071, - "metric_kind": 37071, - "metric_unit": 37071 - }, - "malformed": { - "evaluation_result_id_pattern": 12588 - }, - "top_missing_by_benchmark": { - "evaluation_result_id": [ - [ - "hfopenllm_v2", - 27444 - ], - [ - "helm_mmlu", - 2844 - ], - [ - "reward-bench", - 2404 - ], - [ - "helm_classic", - 1005 - ], - [ - "global-mmlu-lite", - 912 - ], - [ - "helm_lite", - 910 - ], - [ - "fibble_arena", - 559 - ], - [ - "helm_capabilities", - 408 - ], - [ - "wordle_arena", - 134 - ], - [ - "terminal-bench-2.0", - 115 - ], - [ - "livecodebenchpro", - 87 - ], - [ - "apex-agents", - 74 - ], - [ - "ace", - 32 - ], - [ - "helm_instruct", - 28 - ], - [ - "apex-v1", - 19 - ], - [ - "appworld_test_normal", - 15 - ], - [ - "browsecompplus", - 15 - ], - [ - "swe-bench", - 15 - ], - [ - "tau-bench-2_airline", - 15 - ], - [ - "tau-bench-2_retail", - 15 - ] - ], - "metric_id": [ - [ - "hfopenllm_v2", - 27444 - ], - [ - "helm_mmlu", - 2844 - ], - [ - "reward-bench", - 2404 - ], - [ - "helm_classic", - 1005 - ], - [ - "global-mmlu-lite", - 912 - ], - [ - "helm_lite", - 910 - ], - [ - "fibble_arena", - 559 - ], - [ - "helm_capabilities", - 408 - ], - [ - "wordle_arena", - 134 - ], - [ - "terminal-bench-2.0", - 115 - ], - [ - "livecodebenchpro", - 87 - ], - [ - "apex-agents", - 74 - ], - [ - "ace", - 32 - ], - [ - "helm_instruct", - 28 - ], - [ - "apex-v1", - 19 - ], - [ - "appworld_test_normal", - 15 - ], - [ - "browsecompplus", - 15 - ], - [ - "swe-bench", - 15 - ], - [ - "tau-bench-2_airline", - 15 - ], - [ - "tau-bench-2_retail", - 15 - ] - ], - "metric_name": [ - [ - "hfopenllm_v2", - 27444 - ], - [ - "helm_mmlu", - 2844 - ], - [ - "reward-bench", - 2404 - ], - [ - "helm_classic", - 1005 - ], - [ - 
"global-mmlu-lite", - 912 - ], - [ - "helm_lite", - 910 - ], - [ - "fibble_arena", - 559 - ], - [ - "helm_capabilities", - 408 - ], - [ - "wordle_arena", - 134 - ], - [ - "terminal-bench-2.0", - 115 - ], - [ - "livecodebenchpro", - 87 - ], - [ - "apex-agents", - 74 - ], - [ - "ace", - 32 - ], - [ - "helm_instruct", - 28 - ], - [ - "apex-v1", - 19 - ], - [ - "appworld_test_normal", - 15 - ], - [ - "browsecompplus", - 15 - ], - [ - "swe-bench", - 15 - ], - [ - "tau-bench-2_airline", - 15 - ], - [ - "tau-bench-2_retail", - 15 - ] - ], - "metric_kind": [ - [ - "hfopenllm_v2", - 27444 - ], - [ - "helm_mmlu", - 2844 - ], - [ - "reward-bench", - 2404 - ], - [ - "helm_classic", - 1005 - ], - [ - "global-mmlu-lite", - 912 - ], - [ - "helm_lite", - 910 - ], - [ - "fibble_arena", - 559 - ], - [ - "helm_capabilities", - 408 - ], - [ - "wordle_arena", - 134 - ], - [ - "terminal-bench-2.0", - 115 - ], - [ - "livecodebenchpro", - 87 - ], - [ - "apex-agents", - 74 - ], - [ - "ace", - 32 - ], - [ - "helm_instruct", - 28 - ], - [ - "apex-v1", - 19 - ], - [ - "appworld_test_normal", - 15 - ], - [ - "browsecompplus", - 15 - ], - [ - "swe-bench", - 15 - ], - [ - "tau-bench-2_airline", - 15 - ], - [ - "tau-bench-2_retail", - 15 - ] - ], - "metric_unit": [ - [ - "hfopenllm_v2", - 27444 - ], - [ - "helm_mmlu", - 2844 - ], - [ - "reward-bench", - 2404 - ], - [ - "helm_classic", - 1005 - ], - [ - "global-mmlu-lite", - 912 - ], - [ - "helm_lite", - 910 - ], - [ - "fibble_arena", - 559 - ], - [ - "helm_capabilities", - 408 - ], - [ - "wordle_arena", - 134 - ], - [ - "terminal-bench-2.0", - 115 - ], - [ - "livecodebenchpro", - 87 - ], - [ - "apex-agents", - 74 - ], - [ - "ace", - 32 - ], - [ - "helm_instruct", - 28 - ], - [ - "apex-v1", - 19 - ], - [ - "appworld_test_normal", - 15 - ], - [ - "browsecompplus", - 15 - ], - [ - "swe-bench", - 15 - ], - [ - "tau-bench-2_airline", - 15 - ], - [ - "tau-bench-2_retail", - 15 - ] - ] - } -} \ No newline at end of file diff --git 
a/audit/dataset_statistics.json b/audit/dataset_statistics.json deleted file mode 100644 index dffd83d39..000000000 --- a/audit/dataset_statistics.json +++ /dev/null @@ -1,2407 +0,0 @@ -{ - "descriptive": { - "counts": { - "result_rows": 40495, - "unique_benchmarks": 59, - "unique_developers": 794, - "unique_evaluations": 178, - "unique_models": 5299 - }, - "inference_engines": [ - { - "count": 39618, - "value": "unknown" - }, - { - "count": 450, - "value": "ollama" - }, - { - "count": 150, - "value": "openai" - }, - { - "count": 54, - "value": "google" - }, - { - "count": 47, - "value": "anthropic" - }, - { - "count": 33, - "value": "gemini" - }, - { - "count": 30, - "value": "openrouter" - }, - { - "count": 26, - "value": "deepseek" - }, - { - "count": 18, - "value": "minimax" - }, - { - "count": 15, - "value": "moonshot" - }, - { - "count": 15, - "value": "ark" - }, - { - "count": 12, - "value": "zhipu" - }, - { - "count": 12, - "value": "qwen" - }, - { - "count": 12, - "value": "aliyun" - }, - { - "count": 3, - "value": "kuaishou" - } - ], - "models_per_benchmark": [ - { - "benchmark": "GPQA", - "result_rows": 4635, - "unique_models": 4557 - }, - { - "benchmark": "IFEval", - "result_rows": 4635, - "unique_models": 4557 - }, - { - "benchmark": "BBH", - "result_rows": 4574, - "unique_models": 4496 - }, - { - "benchmark": "MATH Level 5", - "result_rows": 4574, - "unique_models": 4496 - }, - { - "benchmark": "MMLU-PRO", - "result_rows": 4574, - "unique_models": 4496 - }, - { - "benchmark": "MUSR", - "result_rows": 4574, - "unique_models": 4496 - }, - { - "benchmark": "RewardBench 2", - "result_rows": 1379, - "unique_models": 197 - }, - { - "benchmark": "RewardBench", - "result_rows": 1025, - "unique_models": 179 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "result_rows": 1020, - "unique_models": 139 - }, - { - "benchmark": "BFCL leaderboard CSV", - "result_rows": 3350, - "unique_models": 109 - }, - { - "benchmark": "GSM8K", - "result_rows": 
91, - "unique_models": 91 - }, - { - "benchmark": "LegalBench", - "result_rows": 91, - "unique_models": 91 - }, - { - "benchmark": "MATH", - "result_rows": 91, - "unique_models": 91 - }, - { - "benchmark": "MMLU", - "result_rows": 91, - "unique_models": 91 - }, - { - "benchmark": "MedQA", - "result_rows": 91, - "unique_models": 91 - }, - { - "benchmark": "NarrativeQA", - "result_rows": 91, - "unique_models": 91 - }, - { - "benchmark": "NaturalQuestions (closed-book)", - "result_rows": 91, - "unique_models": 91 - }, - { - "benchmark": "OpenbookQA", - "result_rows": 91, - "unique_models": 91 - }, - { - "benchmark": "WMT 2014", - "result_rows": 91, - "unique_models": 91 - }, - { - "benchmark": "helm_lite", - "result_rows": 91, - "unique_models": 91 - }, - { - "benchmark": "helm_mmlu", - "result_rows": 2844, - "unique_models": 79 - }, - { - "benchmark": "MMLU-Pro", - "result_rows": 61, - "unique_models": 61 - }, - { - "benchmark": "Omni-MATH", - "result_rows": 61, - "unique_models": 61 - }, - { - "benchmark": "WildBench", - "result_rows": 61, - "unique_models": 61 - }, - { - "benchmark": "helm_capabilities", - "result_rows": 61, - "unique_models": 61 - }, - { - "benchmark": "Wordle Arena Word Set", - "result_rows": 75, - "unique_models": 43 - }, - { - "benchmark": "Fibble Arena (1 lie) Word Set", - "result_rows": 64, - "unique_models": 40 - }, - { - "benchmark": "SciArena leaderboard API", - "result_rows": 114, - "unique_models": 38 - }, - { - "benchmark": "Fibble2 Arena (2 lies) Word Set", - "result_rows": 46, - "unique_models": 38 - }, - { - "benchmark": "Fibble5 Arena (5 lies) Word Set", - "result_rows": 50, - "unique_models": 37 - }, - { - "benchmark": "Fibble3 Arena (3 lies) Word Set", - "result_rows": 40, - "unique_models": 37 - }, - { - "benchmark": "Fibble4 Arena (4 lies) Word Set", - "result_rows": 38, - "unique_models": 36 - }, - { - "benchmark": "wordle_arena_daily", - "result_rows": 92, - "unique_models": 32 - }, - { - "benchmark": "fibble4_arena_daily", - 
"result_rows": 84, - "unique_models": 28 - }, - { - "benchmark": "fibble5_arena_daily", - "result_rows": 84, - "unique_models": 28 - }, - { - "benchmark": "fibble_arena_daily", - "result_rows": 82, - "unique_models": 28 - }, - { - "benchmark": "global-mmlu-lite", - "result_rows": 912, - "unique_models": 27 - }, - { - "benchmark": "Easy Problems", - "result_rows": 29, - "unique_models": 27 - }, - { - "benchmark": "Hard Problems", - "result_rows": 29, - "unique_models": 27 - }, - { - "benchmark": "Medium Problems", - "result_rows": 29, - "unique_models": 27 - }, - { - "benchmark": "fibble3_arena_daily", - "result_rows": 75, - "unique_models": 25 - }, - { - "benchmark": "fibble2_arena_daily", - "result_rows": 66, - "unique_models": 22 - }, - { - "benchmark": "apex-agents", - "result_rows": 74, - "unique_models": 20 - }, - { - "benchmark": "ace", - "result_rows": 32, - "unique_models": 12 - }, - { - "benchmark": "apex-v1", - "result_rows": 19, - "unique_models": 10 - }, - { - "benchmark": "La Leaderboard composite dataset", - "result_rows": 5, - "unique_models": 5 - }, - { - "benchmark": "Anthropic RLHF dataset", - "result_rows": 4, - "unique_models": 4 - }, - { - "benchmark": "Best ChatGPT Prompts", - "result_rows": 4, - "unique_models": 4 - }, - { - "benchmark": "Koala test dataset", - "result_rows": 4, - "unique_models": 4 - }, - { - "benchmark": "Open Assistant", - "result_rows": 4, - "unique_models": 4 - }, - { - "benchmark": "Self Instruct", - "result_rows": 4, - "unique_models": 4 - }, - { - "benchmark": "Vicuna", - "result_rows": 4, - "unique_models": 4 - }, - { - "benchmark": "helm_instruct", - "result_rows": 4, - "unique_models": 4 - }, - { - "benchmark": "appworld/test_normal", - "result_rows": 15, - "unique_models": 3 - }, - { - "benchmark": "browsecompplus", - "result_rows": 15, - "unique_models": 3 - }, - { - "benchmark": "swe-bench", - "result_rows": 15, - "unique_models": 3 - }, - { - "benchmark": "tau-bench-2/airline", - "result_rows": 15, - 
"unique_models": 3 - }, - { - "benchmark": "tau-bench-2/retail", - "result_rows": 15, - "unique_models": 3 - }, - { - "benchmark": "tau-bench-2/telecom", - "result_rows": 15, - "unique_models": 3 - } - ], - "normalization_exclusions": { - "incompatible_score_type": 0, - "missing_bounds": 0, - "missing_score": 0, - "out_of_range": 100, - "zero_width_bounds": 0 - }, - "normalized_score_summaries": [ - { - "benchmark": "GPQA", - "count": 4635, - "evaluation_name": "GPQA", - "max": 0.791, - "mean": 0.30281846817691477, - "median": 0.2953, - "min": 0.168, - "stddev": 0.04912650528590854 - }, - { - "benchmark": "IFEval", - "count": 4635, - "evaluation_name": "IFEval", - "max": 0.951, - "mean": 0.46067240560949296, - "median": 0.4545, - "min": 0.0, - "stddev": 0.20767533842318336 - }, - { - "benchmark": "BBH", - "count": 4574, - "evaluation_name": "BBH", - "max": 0.8269, - "mean": 0.4867208351552252, - "median": 0.5038, - "min": 0.2178, - "stddev": 0.11398463853942328 - }, - { - "benchmark": "MATH Level 5", - "count": 4574, - "evaluation_name": "MATH Level 5", - "max": 0.7145, - "mean": 0.1555723874070835, - "median": 0.108, - "min": 0.0, - "stddev": 0.14625658002062183 - }, - { - "benchmark": "MMLU-PRO", - "count": 4574, - "evaluation_name": "MMLU-PRO", - "max": 0.7303, - "mean": 0.32874433756012245, - "median": 0.34475, - "min": 0.1026, - "stddev": 0.12833971558059434 - }, - { - "benchmark": "MUSR", - "count": 4574, - "evaluation_name": "MUSR", - "max": 0.6024, - "mean": 0.40635732400524704, - "median": 0.4091, - "min": 0.2929, - "stddev": 0.04536121071938266 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 266, - "evaluation_name": "v2_Semi_Private", - "max": 1.0, - "mean": 0.5578856391307715, - "median": 0.75515, - "min": 0.0, - "stddev": 0.44976366617156693 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 262, - "evaluation_name": "v1_Semi_Private", - "max": 0.9999805606556713, - "mean": 0.7136057730617251, - 
"median": 0.92835, - "min": 0.0, - "stddev": 0.3413295062389333 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 250, - "evaluation_name": "v2_Public_Eval", - "max": 1.0, - "mean": 0.5578486693149027, - "median": 0.8591871038330539, - "min": 0.0, - "stddev": 0.46020565690537485 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 228, - "evaluation_name": "v1_Public_Eval", - "max": 0.999984448524537, - "mean": 0.750460640659595, - "median": 0.9602438445183996, - "min": 0.0175, - "stddev": 0.3138616973551216 - }, - { - "benchmark": "RewardBench 2", - "count": 197, - "evaluation_name": "Factuality", - "max": 0.8716, - "mean": 0.6400781725888325, - "median": 0.6779, - "min": 0.0274, - "stddev": 0.14060436598989037 - }, - { - "benchmark": "RewardBench 2", - "count": 197, - "evaluation_name": "Focus", - "max": 0.9838, - "mean": 0.6965137055837564, - "median": 0.7293, - "min": 0.0646, - "stddev": 0.1999740938960993 - }, - { - "benchmark": "RewardBench 2", - "count": 197, - "evaluation_name": "Math", - "max": 0.898, - "mean": 0.6002578680203046, - "median": 0.6175, - "min": 0.0546, - "stddev": 0.11530869084864068 - }, - { - "benchmark": "RewardBench 2", - "count": 197, - "evaluation_name": "Precise IF", - "max": 0.6625, - "mean": 0.3724553299492386, - "median": 0.375, - "min": 0.1313, - "stddev": 0.06683254610514013 - }, - { - "benchmark": "RewardBench 2", - "count": 197, - "evaluation_name": "Safety", - "max": 0.9756, - "mean": 0.770956345177665, - "median": 0.8044, - "min": 0.0378, - "stddev": 0.16859961817216138 - }, - { - "benchmark": "RewardBench 2", - "count": 197, - "evaluation_name": "Score", - "max": 0.8413, - "mean": 0.602605076142132, - "median": 0.6194, - "min": 0.0576, - "stddev": 0.13540270878209892 - }, - { - "benchmark": "RewardBench 2", - "count": 191, - "evaluation_name": "Ties", - "max": 0.9063, - "mean": 0.5524884816753927, - "median": 0.5604, - "min": 0.008, - "stddev": 0.19526001389051642 - }, - { - 
"benchmark": "RewardBench", - "count": 187, - "evaluation_name": "Chat Hard", - "max": 0.9145, - "mean": 0.6117941176470588, - "median": 0.6053, - "min": 0.2654, - "stddev": 0.1713479724227396 - }, - { - "benchmark": "RewardBench", - "count": 187, - "evaluation_name": "Chat", - "max": 0.9944, - "mean": 0.8923390374331551, - "median": 0.9413, - "min": 0.3547, - "stddev": 0.12437365150350695 - }, - { - "benchmark": "RewardBench", - "count": 187, - "evaluation_name": "Safety", - "max": 0.9514, - "mean": 0.75624064171123, - "median": 0.7946, - "min": 0.3743, - "stddev": 0.14897429003710377 - }, - { - "benchmark": "RewardBench", - "count": 187, - "evaluation_name": "Score", - "max": 0.9511, - "mean": 0.7524326203208556, - "median": 0.7455, - "min": 0.4727, - "stddev": 0.12766260032441618 - }, - { - "benchmark": "RewardBench", - "count": 172, - "evaluation_name": "Reasoning", - "max": 0.9912, - "mean": 0.779306976744186, - "median": 0.80125, - "min": 0.2821, - "stddev": 0.16510278548710738 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.live.live_accuracy", - "max": 0.9312, - "mean": 0.6721155963302752, - "median": 0.7076, - "min": 0.0, - "stddev": 0.16692855101327364 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.live.live_multiple_ast_accuracy", - "max": 0.9401999999999999, - "mean": 0.6615788990825688, - "median": 0.7104, - "min": 0.0, - "stddev": 0.17084967242914786 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.live.live_parallel_ast_accuracy", - "max": 0.9375, - "mean": 0.6427752293577982, - "median": 0.75, - "min": 0.0, - "stddev": 0.24460198666555008 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.live.live_parallel_multiple_ast_accuracy", - "max": 0.9582999999999999, - "mean": 0.5703339449541285, - "median": 0.625, - "min": 0.0, - "stddev": 0.2059801726435246 - }, - { - "benchmark": "BFCL 
leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.live.live_simple_ast_accuracy", - "max": 0.9031, - "mean": 0.726408256880734, - "median": 0.7636, - "min": 0.0, - "stddev": 0.1625125032958663 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.memory.accuracy", - "max": 0.7376, - "mean": 0.20235045871559632, - "median": 0.157, - "min": 0.0, - "stddev": 0.1699218603771948 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.memory.kv_accuracy", - "max": 0.7097, - "mean": 0.13904036697247707, - "median": 0.0839, - "min": 0.0, - "stddev": 0.1515138492137527 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.memory.recursive_summarization_accuracy", - "max": 0.8323, - "mean": 0.2820403669724771, - "median": 0.271, - "min": 0.0, - "stddev": 0.208463795648454 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.memory.vector_accuracy", - "max": 0.7290000000000001, - "mean": 0.18597155963302753, - "median": 0.1161, - "min": 0.0, - "stddev": 0.18379301567138523 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.multi_turn.accuracy", - "max": 0.7737999999999999, - "mean": 0.23962385321100918, - "median": 0.165, - "min": 0.0, - "stddev": 0.21479676048452157 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.multi_turn.base_accuracy", - "max": 0.825, - "mean": 0.29009174311926605, - "median": 0.2, - "min": 0.0, - "stddev": 0.24897845144318115 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.multi_turn.long_context_accuracy", - "max": 0.76, - "mean": 0.24009174311926607, - "median": 0.175, - "min": 0.0, - "stddev": 0.2138372755020874 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.multi_turn.miss_function_accuracy", - "max": 0.77, - "mean": 
0.21591743119266055, - "median": 0.14, - "min": 0.0, - "stddev": 0.2171396175036615 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.multi_turn.miss_parameter_accuracy", - "max": 0.74, - "mean": 0.21238532110091743, - "median": 0.15, - "min": 0.0, - "stddev": 0.194452693868985 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.non_live.ast_accuracy", - "max": 0.9065000000000001, - "mean": 0.7661733944954129, - "median": 0.83, - "min": 0.0, - "stddev": 0.18657086363085557 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.non_live.multiple_ast_accuracy", - "max": 0.97, - "mean": 0.8535779816513761, - "median": 0.92, - "min": 0.0, - "stddev": 0.182740318362281 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.non_live.parallel_ast_accuracy", - "max": 0.96, - "mean": 0.7979816513761467, - "median": 0.88, - "min": 0.0, - "stddev": 0.2273336991546167 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.non_live.parallel_multiple_ast_accuracy", - "max": 0.925, - "mean": 0.7347706422018349, - "median": 0.825, - "min": 0.0, - "stddev": 0.24427840192832814 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.non_live.simple_ast_accuracy", - "max": 0.8067, - "mean": 0.6783633027522936, - "median": 0.7258, - "min": 0.0, - "stddev": 0.14843039998882532 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.overall.latency_mean_s", - "max": 0.9959969388355802, - "mean": 0.910949171600733, - "median": 0.9723906516748102, - "min": 0.0, - "stddev": 0.16788751393048792 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.overall.latency_p95_s", - "max": 0.9983116129372659, - "mean": 0.9052860681766953, - "median": 0.9794227826729278, - "min": 0.0, - "stddev": 
0.17750828285090742 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.overall.latency_std_s", - "max": 0.9978872247523358, - "mean": 0.8712378709255851, - "median": 0.9528616366965585, - "min": 0.0, - "stddev": 0.18715211182331667 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.overall.overall_accuracy", - "max": 0.7746999999999999, - "mean": 0.3809394495412844, - "median": 0.3552, - "min": 0.0717, - "stddev": 0.1568359888890471 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.overall.rank", - "max": 1.0, - "mean": 0.5, - "median": 0.5, - "min": 0.0, - "stddev": 0.2926814601721238 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.overall.total_cost_usd", - "max": 0.9987048455669116, - "mean": 0.8673404362764129, - "median": 0.9486161556437762, - "min": 0.0, - "stddev": 0.2029161256124978 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.relevance.irrelevance_detection_accuracy", - "max": 1.0, - "mean": 0.7561073394495413, - "median": 0.8079000000000001, - "min": 0.06280000000000001, - "stddev": 0.16896574532662487 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.relevance.relevance_detection_accuracy", - "max": 1.0, - "mean": 0.7637614678899083, - "median": 0.8125, - "min": 0.0, - "stddev": 0.19862042242738473 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.web_search.accuracy", - "max": 0.845, - "mean": 0.24573394495412845, - "median": 0.105, - "min": 0.0, - "stddev": 0.28751797503234583 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.web_search.base_accuracy", - "max": 0.87, - "mean": 0.2646788990825688, - "median": 0.13, - "min": 0.0, - "stddev": 0.29552705211555524 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - 
"evaluation_name": "bfcl.web_search.no_snippet_accuracy", - "max": 0.85, - "mean": 0.22678899082568807, - "median": 0.09, - "min": 0.0, - "stddev": 0.28410639873751836 - }, - { - "benchmark": "RewardBench", - "count": 105, - "evaluation_name": "Prior Sets (0.5 weight)", - "max": 0.782, - "mean": 0.5625428571428571, - "median": 0.5757, - "min": 0.0, - "stddev": 0.17788750218625798 - }, - { - "benchmark": "LegalBench", - "count": 91, - "evaluation_name": "LegalBench", - "max": 0.757, - "mean": 0.5902087912087912, - "median": 0.629, - "min": 0.331, - "stddev": 0.11619442676283923 - }, - { - "benchmark": "MATH", - "count": 91, - "evaluation_name": "MATH", - "max": 0.92, - "mean": 0.5574065934065934, - "median": 0.656, - "min": 0.026, - "stddev": 0.2685588691111619 - }, - { - "benchmark": "MMLU", - "count": 91, - "evaluation_name": "MMLU", - "max": 0.809, - "mean": 0.6220989010989011, - "median": 0.643, - "min": 0.243, - "stddev": 0.12023218786489331 - }, - { - "benchmark": "MedQA", - "count": 91, - "evaluation_name": "MedQA", - "max": 0.863, - "mean": 0.6103296703296703, - "median": 0.64, - "min": 0.229, - "stddev": 0.15792234765120447 - }, - { - "benchmark": "NarrativeQA", - "count": 91, - "evaluation_name": "NarrativeQA", - "max": 0.804, - "mean": 0.6938461538461539, - "median": 0.742, - "min": 0.111, - "stddev": 0.1228501275789075 - }, - { - "benchmark": "NaturalQuestions (closed-book)", - "count": 91, - "evaluation_name": "NaturalQuestions (closed-book)", - "max": 0.502, - "mean": 0.3627912087912088, - "median": 0.378, - "min": 0.028, - "stddev": 0.08850543190907255 - }, - { - "benchmark": "OpenbookQA", - "count": 91, - "evaluation_name": "OpenbookQA", - "max": 0.972, - "mean": 0.8312527472527472, - "median": 0.882, - "min": 0.222, - "stddev": 0.16911788087383792 - }, - { - "benchmark": "WMT 2014", - "count": 91, - "evaluation_name": "WMT 2014", - "max": 0.262, - "mean": 0.18178021978021977, - "median": 0.191, - "min": 0.023, - "stddev": 0.04641450975187302 - }, - 
{ - "benchmark": "helm_lite", - "count": 91, - "evaluation_name": "Mean win rate", - "max": 0.938, - "mean": 0.499967032967033, - "median": 0.488, - "min": 0.041, - "stddev": 0.24004497034928224 - }, - { - "benchmark": "GSM8K", - "count": 90, - "evaluation_name": "GSM8K", - "max": 0.956, - "mean": 0.6740333333333334, - "median": 0.765, - "min": 0.028, - "stddev": 0.24790177694247365 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Abstract Algebra", - "max": 0.84, - "mean": 0.4692405063291139, - "median": 0.44, - "min": 0.21, - "stddev": 0.1566784405169303 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Anatomy", - "max": 0.911, - "mean": 0.7049620253164557, - "median": 0.719, - "min": 0.222, - "stddev": 0.12203524533321435 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Astronomy", - "max": 0.974, - "mean": 0.8196835443037974, - "median": 0.855, - "min": 0.342, - "stddev": 0.12503810130124515 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Business Ethics", - "max": 0.89, - "mean": 0.7354430379746836, - "median": 0.77, - "min": 0.24, - "stddev": 0.1177001565076888 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Clinical Knowledge", - "max": 0.928, - "mean": 0.7806329113924051, - "median": 0.8, - "min": 0.26, - "stddev": 0.10518545005348215 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "College Physics", - "max": 0.863, - "mean": 0.5205189873417722, - "median": 0.51, - "min": 0.196, - "stddev": 0.13341576241396605 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Computer Security", - "max": 0.89, - "mean": 0.7888607594936708, - "median": 0.8, - "min": 0.3, - "stddev": 0.07740978772295665 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Conceptual Physics", - "max": 0.949, - "mean": 0.7394050632911392, - "median": 0.774, - "min": 0.319, - "stddev": 
0.1436847973853721 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Econometrics", - "max": 0.807, - "mean": 0.5924556962025317, - "median": 0.614, - "min": 0.307, - "stddev": 0.12405156056525753 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Electrical Engineering", - "max": 0.869, - "mean": 0.7012531645569621, - "median": 0.724, - "min": 0.29, - "stddev": 0.10967007262512768 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Elementary Mathematics", - "max": 0.942, - "mean": 0.6168481012658228, - "median": 0.622, - "min": 0.254, - "stddev": 0.17076712953141734 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Formal Logic", - "max": 0.786, - "mean": 0.5559240506329114, - "median": 0.571, - "min": 0.27, - "stddev": 0.11667484646986527 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Global Facts", - "max": 0.8, - "mean": 0.49860759493670886, - "median": 0.5, - "min": 0.25, - "stddev": 0.11856767165669667 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "High School World History", - "max": 0.958, - "mean": 0.8590253164556962, - "median": 0.89, - "min": 0.253, - "stddev": 0.1104488482004626 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Human Sexuality", - "max": 0.939, - "mean": 0.7969367088607595, - "median": 0.84, - "min": 0.267, - "stddev": 0.14067149783040647 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "International Law", - "max": 0.959, - "mean": 0.8525189873417721, - "median": 0.884, - "min": 0.306, - "stddev": 0.09770414010589916 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Jurisprudence", - "max": 0.907, - "mean": 0.8231518987341773, - "median": 0.852, - "min": 0.25, - "stddev": 0.09722219971870344 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Logical Fallacies", - "max": 0.926, - "mean": 
0.8139873417721519, - "median": 0.834, - "min": 0.264, - "stddev": 0.0972786763034739 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "MMLU All Subjects", - "max": 0.873, - "mean": 0.7308227848101266, - "median": 0.757, - "min": 0.295, - "stddev": 0.10005918242229046 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Machine Learning", - "max": 0.839, - "mean": 0.592126582278481, - "median": 0.616, - "min": 0.286, - "stddev": 0.12807703682255595 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Management", - "max": 0.942, - "mean": 0.8453037974683544, - "median": 0.864, - "min": 0.272, - "stddev": 0.09395052631917909 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Marketing", - "max": 0.962, - "mean": 0.9024556962025316, - "median": 0.923, - "min": 0.269, - "stddev": 0.08556236254220637 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Mean win rate", - "max": 1.0, - "mean": 0.5000506329113924, - "median": 0.517, - "min": 0.014, - "stddev": 0.2741845671999428 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Medical Genetics", - "max": 0.98, - "mean": 0.8162025316455697, - "median": 0.84, - "min": 0.28, - "stddev": 0.11717074761250226 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Miscellaneous", - "max": 0.964, - "mean": 0.8688607594936709, - "median": 0.893, - "min": 0.292, - "stddev": 0.09859535722376811 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Moral Scenarios", - "max": 0.902, - "mean": 0.5793924050632911, - "median": 0.575, - "min": 0.231, - "stddev": 0.19478445797799818 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Nutrition", - "max": 0.928, - "mean": 0.7968987341772152, - "median": 0.82, - "min": 0.34, - "stddev": 0.1008295839442827 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Philosophy", - "max": 
0.9, - "mean": 0.7844303797468355, - "median": 0.807, - "min": 0.325, - "stddev": 0.09312807331625374 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Prehistory", - "max": 0.951, - "mean": 0.824746835443038, - "median": 0.858, - "min": 0.318, - "stddev": 0.10757030716441658 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Professional Psychology", - "max": 0.922, - "mean": 0.7793291139240506, - "median": 0.812, - "min": 0.232, - "stddev": 0.1177310844427953 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Public Relations", - "max": 0.855, - "mean": 0.724873417721519, - "median": 0.736, - "min": 0.345, - "stddev": 0.0757594653625247 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Security Studies", - "max": 0.886, - "mean": 0.778126582278481, - "median": 0.804, - "min": 0.408, - "stddev": 0.09570378540441088 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Sociology", - "max": 0.96, - "mean": 0.8729493670886076, - "median": 0.9, - "min": 0.383, - "stddev": 0.08587676004752948 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Us Foreign Policy", - "max": 0.97, - "mean": 0.8918987341772152, - "median": 0.92, - "min": 0.26, - "stddev": 0.09360413026947771 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Virology", - "max": 0.602, - "mean": 0.5457215189873418, - "median": 0.56, - "min": 0.392, - "stddev": 0.047070851318166546 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "World Religions", - "max": 0.924, - "mean": 0.8426455696202532, - "median": 0.865, - "min": 0.234, - "stddev": 0.08472202480187987 - }, - { - "benchmark": "MMLU-Pro", - "count": 61, - "evaluation_name": "MMLU-Pro", - "max": 0.875, - "mean": 0.6609344262295082, - "median": 0.723, - "min": 0.169, - "stddev": 0.1866150109050233 - } - ], - "quality": { - "has_uncertainty": 1603, - 
"incompatible_score_type": 0, - "missing_bounds": 0, - "missing_metadata": 0, - "missing_score": 0, - "out_of_range": 100, - "total_result_rows": 40495, - "zero_width_bounds": 0 - }, - "schema_versions": [ - { - "count": 40495, - "value": "0.2.2" - } - ], - "score_summaries": [ - { - "benchmark": "GPQA", - "count": 4635, - "evaluation_name": "GPQA", - "max": 0.791, - "mean": 0.30281846817691477, - "median": 0.2953, - "min": 0.168, - "stddev": 0.04912650528590854 - }, - { - "benchmark": "IFEval", - "count": 4635, - "evaluation_name": "IFEval", - "max": 0.951, - "mean": 0.46067240560949296, - "median": 0.4545, - "min": 0.0, - "stddev": 0.20767533842318336 - }, - { - "benchmark": "BBH", - "count": 4574, - "evaluation_name": "BBH", - "max": 0.8269, - "mean": 0.4867208351552252, - "median": 0.5038, - "min": 0.2178, - "stddev": 0.11398463853942328 - }, - { - "benchmark": "MATH Level 5", - "count": 4574, - "evaluation_name": "MATH Level 5", - "max": 0.7145, - "mean": 0.1555723874070835, - "median": 0.108, - "min": 0.0, - "stddev": 0.14625658002062183 - }, - { - "benchmark": "MMLU-PRO", - "count": 4574, - "evaluation_name": "MMLU-PRO", - "max": 0.7303, - "mean": 0.32874433756012245, - "median": 0.34475, - "min": 0.1026, - "stddev": 0.12833971558059434 - }, - { - "benchmark": "MUSR", - "count": 4574, - "evaluation_name": "MUSR", - "max": 0.6024, - "mean": 0.40635732400524704, - "median": 0.4091, - "min": 0.2929, - "stddev": 0.04536121071938266 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 266, - "evaluation_name": "v2_Semi_Private", - "max": 77.16309638, - "mean": 1.3257351367669172, - "median": 0.09789999999999999, - "min": 0.0, - "stddev": 6.199066844791538 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 262, - "evaluation_name": "v1_Semi_Private", - "max": 44.25900135, - "mean": 0.8916221425572519, - "median": 0.30084999999999995, - "min": 0.0, - "stddev": 3.2441508923523688 - }, - { - "benchmark": "ARC Prize 
evaluations leaderboard JSON", - "count": 250, - "evaluation_name": "v2_Public_Eval", - "max": 17.6, - "mean": 0.6595584, - "median": 0.08, - "min": 0.0, - "stddev": 2.152394923590425 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 228, - "evaluation_name": "v1_Public_Eval", - "max": 7.7201, - "mean": 0.5021848684210526, - "median": 0.3319, - "min": 0.0012, - "stddev": 0.8240755952564907 - }, - { - "benchmark": "RewardBench 2", - "count": 197, - "evaluation_name": "Factuality", - "max": 0.8716, - "mean": 0.6400781725888325, - "median": 0.6779, - "min": 0.0274, - "stddev": 0.14060436598989037 - }, - { - "benchmark": "RewardBench 2", - "count": 197, - "evaluation_name": "Focus", - "max": 0.9838, - "mean": 0.6965137055837564, - "median": 0.7293, - "min": 0.0646, - "stddev": 0.1999740938960993 - }, - { - "benchmark": "RewardBench 2", - "count": 197, - "evaluation_name": "Math", - "max": 0.898, - "mean": 0.6002578680203046, - "median": 0.6175, - "min": 0.0546, - "stddev": 0.11530869084864068 - }, - { - "benchmark": "RewardBench 2", - "count": 197, - "evaluation_name": "Precise IF", - "max": 0.6625, - "mean": 0.3724553299492386, - "median": 0.375, - "min": 0.1313, - "stddev": 0.06683254610514013 - }, - { - "benchmark": "RewardBench 2", - "count": 197, - "evaluation_name": "Safety", - "max": 0.9756, - "mean": 0.770956345177665, - "median": 0.8044, - "min": 0.0378, - "stddev": 0.16859961817216138 - }, - { - "benchmark": "RewardBench 2", - "count": 197, - "evaluation_name": "Score", - "max": 0.8413, - "mean": 0.602605076142132, - "median": 0.6194, - "min": 0.0576, - "stddev": 0.13540270878209892 - }, - { - "benchmark": "RewardBench 2", - "count": 197, - "evaluation_name": "Ties", - "max": 0.9063, - "mean": 0.5353568527918782, - "median": 0.5529, - "min": -0.01, - "stddev": 0.21529016446306679 - }, - { - "benchmark": "RewardBench", - "count": 187, - "evaluation_name": "Chat Hard", - "max": 0.9145, - "mean": 0.6117941176470588, - "median": 0.6053, 
- "min": 0.2654, - "stddev": 0.1713479724227396 - }, - { - "benchmark": "RewardBench", - "count": 187, - "evaluation_name": "Chat", - "max": 0.9944, - "mean": 0.8923390374331551, - "median": 0.9413, - "min": 0.3547, - "stddev": 0.12437365150350695 - }, - { - "benchmark": "RewardBench", - "count": 187, - "evaluation_name": "Safety", - "max": 0.9514, - "mean": 0.75624064171123, - "median": 0.7946, - "min": 0.3743, - "stddev": 0.14897429003710377 - }, - { - "benchmark": "RewardBench", - "count": 187, - "evaluation_name": "Score", - "max": 0.9511, - "mean": 0.7524326203208556, - "median": 0.7455, - "min": 0.4727, - "stddev": 0.12766260032441618 - }, - { - "benchmark": "RewardBench", - "count": 172, - "evaluation_name": "Reasoning", - "max": 0.9912, - "mean": 0.779306976744186, - "median": 0.80125, - "min": 0.2821, - "stddev": 0.16510278548710738 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.live.live_accuracy", - "max": 93.12, - "mean": 67.21155963302752, - "median": 70.76, - "min": 0.0, - "stddev": 16.692855101327364 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.live.live_multiple_ast_accuracy", - "max": 94.02, - "mean": 66.15788990825688, - "median": 71.04, - "min": 0.0, - "stddev": 17.084967242914786 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.live.live_parallel_ast_accuracy", - "max": 93.75, - "mean": 64.27752293577981, - "median": 75.0, - "min": 0.0, - "stddev": 24.46019866655501 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.live.live_parallel_multiple_ast_accuracy", - "max": 95.83, - "mean": 57.03339449541284, - "median": 62.5, - "min": 0.0, - "stddev": 20.59801726435246 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.live.live_simple_ast_accuracy", - "max": 90.31, - "mean": 72.64082568807339, - "median": 76.36, - "min": 0.0, - "stddev": 
16.25125032958663 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.memory.accuracy", - "max": 73.76, - "mean": 20.235045871559635, - "median": 15.7, - "min": 0.0, - "stddev": 16.99218603771948 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.memory.kv_accuracy", - "max": 70.97, - "mean": 13.904036697247706, - "median": 8.39, - "min": 0.0, - "stddev": 15.15138492137527 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.memory.recursive_summarization_accuracy", - "max": 83.23, - "mean": 28.204036697247705, - "median": 27.1, - "min": 0.0, - "stddev": 20.8463795648454 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.memory.vector_accuracy", - "max": 72.9, - "mean": 18.597155963302754, - "median": 11.61, - "min": 0.0, - "stddev": 18.379301567138523 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.multi_turn.accuracy", - "max": 77.38, - "mean": 23.962385321100918, - "median": 16.5, - "min": 0.0, - "stddev": 21.479676048452156 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.multi_turn.base_accuracy", - "max": 82.5, - "mean": 29.009174311926607, - "median": 20.0, - "min": 0.0, - "stddev": 24.897845144318115 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.multi_turn.long_context_accuracy", - "max": 76.0, - "mean": 24.009174311926607, - "median": 17.5, - "min": 0.0, - "stddev": 21.38372755020874 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.multi_turn.miss_function_accuracy", - "max": 77.0, - "mean": 21.591743119266056, - "median": 14.0, - "min": 0.0, - "stddev": 21.713961750366153 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.multi_turn.miss_parameter_accuracy", - "max": 74.0, - "mean": 
21.238532110091743, - "median": 15.0, - "min": 0.0, - "stddev": 19.445269386898502 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.non_live.ast_accuracy", - "max": 90.65, - "mean": 76.61733944954129, - "median": 83.0, - "min": 0.0, - "stddev": 18.657086363085554 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.non_live.multiple_ast_accuracy", - "max": 97.0, - "mean": 85.35779816513761, - "median": 92.0, - "min": 0.0, - "stddev": 18.274031836228097 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.non_live.parallel_ast_accuracy", - "max": 96.0, - "mean": 79.79816513761467, - "median": 88.0, - "min": 0.0, - "stddev": 22.733369915461672 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.non_live.parallel_multiple_ast_accuracy", - "max": 92.5, - "mean": 73.4770642201835, - "median": 82.5, - "min": 0.0, - "stddev": 24.427840192832814 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.non_live.simple_ast_accuracy", - "max": 80.67, - "mean": 67.83633027522936, - "median": 72.58, - "min": 0.0, - "stddev": 14.843039998882533 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.overall.latency_mean_s", - "max": 169.87, - "mean": 15.127064220183486, - "median": 4.69, - "min": 0.68, - "stddev": 28.519051991371985 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.overall.latency_p95_s", - "max": 568.59, - "mean": 53.85339449541284, - "median": 11.7, - "min": 0.96, - "stddev": 100.92943454619746 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.overall.latency_std_s", - "max": 212.99, - "mean": 27.425045871559632, - "median": 10.04, - "min": 0.45, - "stddev": 39.86152829724822 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - 
"evaluation_name": "bfcl.overall.overall_accuracy", - "max": 77.47, - "mean": 38.09394495412844, - "median": 35.52, - "min": 7.17, - "stddev": 15.683598888904708 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.overall.rank", - "max": 109.0, - "mean": 55.0, - "median": 55.0, - "min": 1.0, - "stddev": 31.609597698589376 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.overall.total_cost_usd", - "max": 355.17, - "mean": 47.11669724770642, - "median": 18.25, - "min": 0.46, - "stddev": 72.06972033379084 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.relevance.irrelevance_detection_accuracy", - "max": 100.0, - "mean": 75.61073394495413, - "median": 80.79, - "min": 6.28, - "stddev": 16.896574532662488 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.relevance.relevance_detection_accuracy", - "max": 100.0, - "mean": 76.37614678899082, - "median": 81.25, - "min": 0.0, - "stddev": 19.86204224273847 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.web_search.accuracy", - "max": 84.5, - "mean": 24.573394495412845, - "median": 10.5, - "min": 0.0, - "stddev": 28.751797503234584 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.web_search.base_accuracy", - "max": 87.0, - "mean": 26.46788990825688, - "median": 13.0, - "min": 0.0, - "stddev": 29.552705211555523 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.web_search.no_snippet_accuracy", - "max": 85.0, - "mean": 22.678899082568808, - "median": 9.0, - "min": 0.0, - "stddev": 28.410639873751833 - }, - { - "benchmark": "RewardBench", - "count": 105, - "evaluation_name": "Prior Sets (0.5 weight)", - "max": 0.782, - "mean": 0.5625428571428571, - "median": 0.5757, - "min": 0.0, - "stddev": 0.17788750218625798 - }, - { - "benchmark": "GSM8K", - 
"count": 91, - "evaluation_name": "GSM8K", - "max": 0.956, - "mean": 0.6556373626373626, - "median": 0.762, - "min": -1.0, - "stddev": 0.30260192099278316 - }, - { - "benchmark": "LegalBench", - "count": 91, - "evaluation_name": "LegalBench", - "max": 0.757, - "mean": 0.5902087912087912, - "median": 0.629, - "min": 0.331, - "stddev": 0.11619442676283923 - }, - { - "benchmark": "MATH", - "count": 91, - "evaluation_name": "MATH", - "max": 0.92, - "mean": 0.5574065934065934, - "median": 0.656, - "min": 0.026, - "stddev": 0.2685588691111619 - }, - { - "benchmark": "MMLU", - "count": 91, - "evaluation_name": "MMLU", - "max": 0.809, - "mean": 0.6220989010989011, - "median": 0.643, - "min": 0.243, - "stddev": 0.12023218786489331 - }, - { - "benchmark": "MedQA", - "count": 91, - "evaluation_name": "MedQA", - "max": 0.863, - "mean": 0.6103296703296703, - "median": 0.64, - "min": 0.229, - "stddev": 0.15792234765120447 - }, - { - "benchmark": "NarrativeQA", - "count": 91, - "evaluation_name": "NarrativeQA", - "max": 0.804, - "mean": 0.6938461538461539, - "median": 0.742, - "min": 0.111, - "stddev": 0.1228501275789075 - }, - { - "benchmark": "NaturalQuestions (closed-book)", - "count": 91, - "evaluation_name": "NaturalQuestions (closed-book)", - "max": 0.502, - "mean": 0.3627912087912088, - "median": 0.378, - "min": 0.028, - "stddev": 0.08850543190907255 - }, - { - "benchmark": "OpenbookQA", - "count": 91, - "evaluation_name": "OpenbookQA", - "max": 0.972, - "mean": 0.8312527472527472, - "median": 0.882, - "min": 0.222, - "stddev": 0.16911788087383792 - }, - { - "benchmark": "WMT 2014", - "count": 91, - "evaluation_name": "WMT 2014", - "max": 0.262, - "mean": 0.18178021978021977, - "median": 0.191, - "min": 0.023, - "stddev": 0.04641450975187302 - }, - { - "benchmark": "helm_lite", - "count": 91, - "evaluation_name": "Mean win rate", - "max": 0.938, - "mean": 0.499967032967033, - "median": 0.488, - "min": 0.041, - "stddev": 0.24004497034928224 - }, - { - "benchmark": 
"helm_mmlu", - "count": 79, - "evaluation_name": "Abstract Algebra", - "max": 0.84, - "mean": 0.4692405063291139, - "median": 0.44, - "min": 0.21, - "stddev": 0.1566784405169303 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Anatomy", - "max": 0.911, - "mean": 0.7049620253164557, - "median": 0.719, - "min": 0.222, - "stddev": 0.12203524533321435 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Astronomy", - "max": 0.974, - "mean": 0.8196835443037974, - "median": 0.855, - "min": 0.342, - "stddev": 0.12503810130124515 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Business Ethics", - "max": 0.89, - "mean": 0.7354430379746836, - "median": 0.77, - "min": 0.24, - "stddev": 0.1177001565076888 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Clinical Knowledge", - "max": 0.928, - "mean": 0.7806329113924051, - "median": 0.8, - "min": 0.26, - "stddev": 0.10518545005348215 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "College Physics", - "max": 0.863, - "mean": 0.5205189873417722, - "median": 0.51, - "min": 0.196, - "stddev": 0.13341576241396605 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Computer Security", - "max": 0.89, - "mean": 0.7888607594936708, - "median": 0.8, - "min": 0.3, - "stddev": 0.07740978772295665 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Conceptual Physics", - "max": 0.949, - "mean": 0.7394050632911392, - "median": 0.774, - "min": 0.319, - "stddev": 0.1436847973853721 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Econometrics", - "max": 0.807, - "mean": 0.5924556962025317, - "median": 0.614, - "min": 0.307, - "stddev": 0.12405156056525753 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Electrical Engineering", - "max": 0.869, - "mean": 0.7012531645569621, - "median": 0.724, - "min": 0.29, - "stddev": 
0.10967007262512768 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Elementary Mathematics", - "max": 0.942, - "mean": 0.6168481012658228, - "median": 0.622, - "min": 0.254, - "stddev": 0.17076712953141734 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Formal Logic", - "max": 0.786, - "mean": 0.5559240506329114, - "median": 0.571, - "min": 0.27, - "stddev": 0.11667484646986527 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Global Facts", - "max": 0.8, - "mean": 0.49860759493670886, - "median": 0.5, - "min": 0.25, - "stddev": 0.11856767165669667 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "High School World History", - "max": 0.958, - "mean": 0.8590253164556962, - "median": 0.89, - "min": 0.253, - "stddev": 0.1104488482004626 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Human Sexuality", - "max": 0.939, - "mean": 0.7969367088607595, - "median": 0.84, - "min": 0.267, - "stddev": 0.14067149783040647 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "International Law", - "max": 0.959, - "mean": 0.8525189873417721, - "median": 0.884, - "min": 0.306, - "stddev": 0.09770414010589916 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Jurisprudence", - "max": 0.907, - "mean": 0.8231518987341773, - "median": 0.852, - "min": 0.25, - "stddev": 0.09722219971870344 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Logical Fallacies", - "max": 0.926, - "mean": 0.8139873417721519, - "median": 0.834, - "min": 0.264, - "stddev": 0.0972786763034739 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "MMLU All Subjects", - "max": 0.873, - "mean": 0.7308227848101266, - "median": 0.757, - "min": 0.295, - "stddev": 0.10005918242229046 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Machine Learning", - "max": 0.839, - "mean": 
0.592126582278481, - "median": 0.616, - "min": 0.286, - "stddev": 0.12807703682255595 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Management", - "max": 0.942, - "mean": 0.8453037974683544, - "median": 0.864, - "min": 0.272, - "stddev": 0.09395052631917909 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Marketing", - "max": 0.962, - "mean": 0.9024556962025316, - "median": 0.923, - "min": 0.269, - "stddev": 0.08556236254220637 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Mean win rate", - "max": 1.0, - "mean": 0.5000506329113924, - "median": 0.517, - "min": 0.014, - "stddev": 0.2741845671999428 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Medical Genetics", - "max": 0.98, - "mean": 0.8162025316455697, - "median": 0.84, - "min": 0.28, - "stddev": 0.11717074761250226 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Miscellaneous", - "max": 0.964, - "mean": 0.8688607594936709, - "median": 0.893, - "min": 0.292, - "stddev": 0.09859535722376811 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Moral Scenarios", - "max": 0.902, - "mean": 0.5793924050632911, - "median": 0.575, - "min": 0.231, - "stddev": 0.19478445797799818 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Nutrition", - "max": 0.928, - "mean": 0.7968987341772152, - "median": 0.82, - "min": 0.34, - "stddev": 0.1008295839442827 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Philosophy", - "max": 0.9, - "mean": 0.7844303797468355, - "median": 0.807, - "min": 0.325, - "stddev": 0.09312807331625374 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Prehistory", - "max": 0.951, - "mean": 0.824746835443038, - "median": 0.858, - "min": 0.318, - "stddev": 0.10757030716441658 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Professional Psychology", - "max": 
0.922, - "mean": 0.7793291139240506, - "median": 0.812, - "min": 0.232, - "stddev": 0.1177310844427953 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Public Relations", - "max": 0.855, - "mean": 0.724873417721519, - "median": 0.736, - "min": 0.345, - "stddev": 0.0757594653625247 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Security Studies", - "max": 0.886, - "mean": 0.778126582278481, - "median": 0.804, - "min": 0.408, - "stddev": 0.09570378540441088 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Sociology", - "max": 0.96, - "mean": 0.8729493670886076, - "median": 0.9, - "min": 0.383, - "stddev": 0.08587676004752948 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Us Foreign Policy", - "max": 0.97, - "mean": 0.8918987341772152, - "median": 0.92, - "min": 0.26, - "stddev": 0.09360413026947771 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Virology", - "max": 0.602, - "mean": 0.5457215189873418, - "median": 0.56, - "min": 0.392, - "stddev": 0.047070851318166546 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "World Religions", - "max": 0.924, - "mean": 0.8426455696202532, - "median": 0.865, - "min": 0.234, - "stddev": 0.08472202480187987 - }, - { - "benchmark": "MMLU-Pro", - "count": 61, - "evaluation_name": "MMLU-Pro", - "max": 0.875, - "mean": 0.6609344262295082, - "median": 0.723, - "min": 0.169, - "stddev": 0.1866150109050233 - } - ] - }, - "observational": { - "exclusions": { - "incompatible_score_type": 0, - "missing_bounds": 0, - "missing_score": 0, - "out_of_range": 100, - "zero_width_bounds": 0 - }, - "valid_normalized_rows": 40395 - } -} diff --git a/audit/dataset_statistics_plots/coverage_counts.pdf b/audit/dataset_statistics_plots/coverage_counts.pdf deleted file mode 100644 index 6ac4008ecd57c03cc296ec0e43acf30bc6503d4a..0000000000000000000000000000000000000000 GIT binary patch literal 0 
HcmV?d00001 literal 16555 zcmb_^1yohf6R-#pzb^Qu!|P1j_xjIzI)Fn%;$F&;?u}xv0oJBwLg9D6^T7+0lr|I<&fGM_I}5DZ`ZHr#%A@P z*L>2wA(QJ2#u`iw%j6svxAUisRAXkANls3@-7xV-3D-ao$+%_fw$sz6TwNtc>J^gk0@j5t_o=jc-epWN*Dde7B*x%ewrj zt|=qR;RS}B1?{+G=|C3K@w!%{e?N|?w#4%*!mTe_JjiJB`PJ3jI}Pr>WmVPQXZ<7O z9qZ;FtgPSt_DbmSB_92U$~4q>3NJstq@Ut8F%!|#A+WmH^X0YgXHS6(4vM+Yi!8^_ zv*=>__>y;29H1-H5g!>=-!@e;Eo3Xa)vBU8QQu5C&+VEwHqkNWBa%LGJkg?W$4h-CV7)7^HfV7&k%cK~<_K@T zW&N}HXP>84V{bpbbYy{aALV^Eraf>Yos$y%1qRp)kK)MiS_BOh}u~KSZPY zwS7$^SLK`j*_#~Iq~3_>Pq*A6h~*q2DD#RM8jZuXFETa{H_)0%f7XoCaC&%8v8Av1 z{34SZM%!0=VED48y_Na!JUA4hiPW^OkquT?(%~8|eM7ArCva~%bN4K@4sG*YZhnH$ ztRpH-Pi)npRk%z0*CRX{8>_F}?mi)zb#A@eO++&X`m_i$+MZZTvDTPWbwMpGpd@Z5 zd-h3WV_V9noZiB|eVd-MoX1Y;-yN21l7lJF-Bdr-J=^U%ph~GNw-hnpkwf(8m54+C z#Ix6D!5M?1wI#WBMWy}u3mkfj-Q%|W6ZSu_M!L^K7u~D!(`^*xV4Wjs7y3uO@r(Jd z*bk!j+G(#AUo*zIgbX{YzI0j^VP6iiBzvs)Sr6t}FdQ(~kvoaA~xo>i3`pb1$ zge)C(xYyFnEO{y^Rm$1Nu?Z!|i6#Q>s3#AoKi{Gs5s~?tp^H^r*9>{3a{C3*H{+m(j3fqv_=53C;{>_V9enU!p-ne`u2qP5yxa1C1I2!H@Xi z2M#PB2-Jrf{)Bag*;j<1cXT9?Pde7uQqqys3H=wIsP7rmGN2E%6#xE_NSJQ8+{UZXs{=SUBn={;PS z@JY-r^VnL5K@;C%@}?Q5E{ll8i5dE_?K$ozk+$TC^?5IRXhiM~N_=4vnwlp}=zU+TPxAvWQ3;EMooDl^^;lq!9XlON-^AVl*CtpmK(C@0EfH+j6#s zElJP*%Th#8CSj~uPSQ?wFHZ}=1WuUW_n@Zl|^Wqa%PTlhj5&Z+s4Qhu1zy&sI* z0Sk2>-Q*scvHV0Bk`|2r_B0b?<_?}=Z0cvNC(TY>;Wkeq9Yw6HW_iGT?vo-833m7! zR&F89EOK*rYdW=}y~ET8cb?k{C+xQQ$?_>I-c)S%Zl@|^m@ zB)ZuB)6C?^&A)cba)c}ae&J(+Mv8h_yPv8{hcjO_oz3w}VgZSBeB}A{r1uP&%&rh| z&&jk!M4cJi|DfwXzlO7P$!Zk~zp=yyNHpL7<~Lfs*%?ogU|Wp>5^!h9v+v^51J?q%PG2Do1C9~xesnd#zB`#?DC5>8l?9vk6GdZm1lbyh7! 
zLQu0>fl+LL{cUk-RF=G0y*b$s;fStK5hl_%qJ&WAX2&8sqU>w42CY=NfN(u>`6&I}G} zoTE}jF*Ca7w#%g2HEH&6!yTs;W|@Nv?>!%HF4(vg7ed&y!bDC|XdCD9HeWNYc<-9O zJF$7s@$g|ka)Z#ilEu@LraDEcGzL;S;j5>qWag^M7V7rQBU{(5Z&+l%4TcXx-3 zn-p&Z;f0zUlicDIiz5;p@Yugn5LC&r3a7=nrQ0f73r#y_^Gf?XLNs-r-J&iX9k(KW zkM3ho^NZCJ+enVcP>J;y)WSOM*K$?ZSqgHTwrCxLOU3v{=rn_zi?G7 zuF@G7K_s)9dz9t+X8hWlNe!nZi8Wr;2j}a~p1vzz&?fWnZ5s@@Czsx; z`DrCaUl`GhEzJ5*leZ8%$oo)G^3s`1d%>G&GO8?0^m64+_cEgsLxOtWZ6Q2zc`|rj z=nZQ!MVC9P8b`yX7*1ctRScuIAg<79XoX*GPA^ieW6TZ|k87g7r#t&*+ivPQ)5TVW zF4L~G8#K1~F%gSoP5gEhUM#FPn8tZ`s)7lkYim6WB&+#9`A*F1?LP6LII|_`-66@o zu{iSJx`i+&nF2LsHA~WJe7VbquJTu}jIRj0oP|QX1QoxEh<(;wWp`^)=f*FI$Yph+ z6DRxBy+Y98)_DB(r@b$6Ya0xdVi*xD90t!xe{dM_FVzcMZqI$n{xap2K7BQTQ`TKbF7qU7#`8<``wwAvX65-85*09-j@;C^B_^nwT31?<20WTYiqmZg zGDx27Fns%Z8gXv)0*#e?T5d@0fUlxbmsSbu)OiNZks{2X0TQXX$T_7#s#kt_!C}4q zt!%{X9Svc6pY^q?4NaS8jb4sBOW(%A3-C<%2QT>e{$UTJ;}zrS5d<LA6 z8Ou(dT{ZiCuW7*47txI=tI1Wpfs&klinj}4*JRJ+u{mGl7|bn&Q&UvPQ+3_s{z!V8=!_prT=Ce+$44K8p9+%aQJkLC zxqsJckdDkB#lRRDkox({yW-4V&FJ!GCBAnNG7fEDR|#I5XW`FO_>%kE#>0lnGz>t9bmAA)~SE>(6cO z&ZtE&bXqO;RT|RhLoKW&q$2J$GAib;lPKbxoh+aVt;C-@x z?mCqXv2`D>+sa7t7hqR$gGlN+nyAan_)w_ujc!<>`>TdIgT-t{$zPnm9vdn|+lx15 zzjc3$cyVn1NaOrt0W2B-UIYB00Z2Z<|82$58cGiM2!f#*ikkBeuPZ9Fqk1(@FYcD1 zt2_`$!deE)wlKmgzPB0kOq?frN4mIC7t(Jkn!?t)D2~Na*5UMR!3$bv+jVi5bt{y z19sLLSM}v&ObtefIdfrtdzGQ-VX3}Yt!zV4=$5d4~!)NTZwB#vBIwcZPkKV49?2@*am!jR(DWqkM%`81IwJug5$VQ*Jl*eSC1Ei^3+H z&gylse{vI)vBh#zH)na{`hd`M-%r zt0Tb3bAA;f?sW0h`S^_mjC;ev{t!w=e?Yp?ScJ&}VsK5Xq3msUq7l7#)T)|UuqdvV zj3E#a>pg9J50A2}prZUlirXYy=K1E*xTuPjz+>Oue)kSbzWVXoc)|@Tuc{TLA2(`M zxUL3mx@fFyoAVfP*4~uUFm^dXGZ%a+Tv8=a?!1b1HT291cWvH1cP30heuDObmG6mi zJ1UQp9hwDFEXaXi`Zo5`k^UUL)4`yz+0 zL{U9tKW0Y1gBy8;<|U0x@=+SsOBk=Xu0`a-#-XGLBYnQ`hSN01tNk8SQP1z5bg(fp z$HGo*fjnC1U)b4fkB1=Iu7vQDo_`$U&8jn1IJZxGVboZZ)I=%kO)Fsl%r~0}9ml)b zcj{5+ixrWJ#4ULnq0jG`hO*oacf##J?Feql@`Wwx-!&o!8pw+TN$)*Y?w_njh)43hsp4f6FqvwnU5~_%uK0HhL|7{H(Auvg`9p z)9jafM;ZqWGqCUyOOVco_y=B!UDS5~RCD{Trc=O$@$@vS+#Cs?{jtGTyN7Lcct=Ya 
zpPMT_@y))Fqc|VuSY52|>*!@f6AGi$PA*_!V`2K}&dB_U;F@z|*73Z#h4_(+9x^-+ z!{_vPqlANn?M&0njFNp~nyyuMwm}?qxK@lxSxO&R5BEi#@^mJhqo#=4eFc%YfeJmn zyJHd<`uW7SGZWdK5pSRhZfNMEBUCe`5xujRg44HY_26M8Nhe6_UwgldqlS0-K8j^z-W5hU!O7A;c26+z7K~E7qcWd7V&6w( z!;8a)3}zgcZen!VR_`pa^rpSz^)_?uO|j+VERHbBZjz%Tb2Ed21a&dZJb^l5I*}~H zlp*E*GiJyK(?Ncj^TfHU_Z2E6{8M`#M*b~p6qb_&lJ6g6EvupEP!A`n+@{<)k7HZ1 zo568ROnKj*Ck1bo@syd;xdtsXYxif1;G)LVC`Ve+!s^c=++-e*yCy;=46x$1i zy?2V{z9IjBEiLvVx=srE#Tac$-#;v>R?~3n?9@uGccMd(ha~-YSU+7p>1Cmpn>vi` znQy8%8|JU!`*k&TpS}Dk^Tr8&mp@(=_rcuvEgCVUw9m^!4aWrs0bH%pyV>!#ID&Z zV$1m8s8uA2X(FdpprL4Gr}F*gnPrL2QeA5-9K>?u5&DM{txA+6$tx&u-%}OHR<2&W z_ZTDEzi-4v{owTN1>UE+)AgMQ?h%97Z#dcID-CiuSL<+0k>^X~s(SG0_ze9^F&Pz= z9N%JQ=iUX1N{Bq`SGP6PF1kQlEm2X+M2v+>EQcEu`X7P~v3QZ_vk0Oa+d?;a;n%a1 z7gu*AW0_v~S7ue;xe#O@^+rvzm!Et13d4|#eJ=I-2{mG$p2e{n>^`{*u-?+EwXsjB zUw2j=9Tq>?m-W>m`cWw6idVzT=UtxH+cn_^66pvbOj*Ok2+;|#_IGX5%pO^C+h$r8w^6pFr?n%>u!Qyw&>()0q z(Ttk`O5PFQ#Oy!0XLq&>GMUctQ{(kpIonNzW+}bUUaQgiN=_g+^hQLlRaMPki*c(# z*N`0=@x66pj(2a)_kmv2CT*s8Wi&zr&TCIgt3SV+et~_A8rv<)PLMdi3Xr! 
zH{5c2LsIvRn$#b5(UH;W<0OQtC|APrGZR%Dbv5_-ik)}}_nSmjpHEU$Df($rCwxBR z%d=W_iwvUu#Y1qsoaI{pnQTVjjl#2w6k6GYCSDHjP^yfrC28MI4teR&3`m|PadLaN zWO={my}B3PTwLw@?p^lLgb1zJH*~7YlP%}MD4w3%Ry}$gL%jd4^3K-1zABt~2BxH!pr3YExO;#Y72 z-2IfmQ`hlA7Bsc}vbOzCU^v%qoGh?!t;egk#(jtHrl7l>9v`AxKN!R5qkc2KnZSnr zPO*r#icykt#c;As7G)`I;>*09@scYaeUF6n&|bnqA(ki@{SOrWU>FX-Fu5NLGrqVY;dQs5@h`%f6DSZRWHJ241CrEGY z+e&iIeKo#8r9fI>m{ZJgv@B9O?@oFbeVY@v~A~rjW(`YV@pGFI4v+k_G zmx@iDB^R~W?uvcr$ua>YX$RR0Q}Rt8rY+S=*kqfIYp-frykc0lZ=%y`*p6D?Mi$sT zSGzDuQoowPPOU`NsA?aqrZ)N3LRE_7T>E7T=htZrx>BaO)Gm$pd{#<7Y%Liq+&+=xcfO%b(9EsCtNn(l(Y-wgccKObe)k>SC@0W5JSO7I`fcRcZm zMRbQ(Bp(P67lhSB==@E_{O6DN=E^|dn7`rHC1ZW5#nNj&e`{>ZJp5AgOZnY!2jC}A zxv+{!qs)j+U6)edv-(P(!gUE5-}YSJmZ7cjOfIJ%I9}Ei7g717pj1$`gwTMJh5D^A zHzCEDaB($i3o~X^L-a{uVUa+KMZ%SuSsqTRwE1ub=Hnu@pXdahX01!Q3YMO{cD5Rk zbWA5e{IT3+t_QDLIr`4_OPj_!KTMOu;I_0-4coutDO@9YXLC|2Pdo|rVTA1d(+7rc zqwEHQbFBqQGeknqX?L$bWxH+tV3}$-DQ-l92tWP7fRLf=fKow0mGorWgp8j@_X&Yb znlbv)-T}V6<2TFIY~$-X*k1~K_PUnQ8KnForZm30Ft(7=q_qPaYR8V0#IWhlea@~=t9j=k|D6g?2$wjUOA-j6*(OV%` z&0X~DFQpn>m@~i(fo8c{uXg)_IIaG@A7iLzTQK zeX!zyo%D%#e>wM~c za|Df*rz^ag@4dO7*>3At>9a$0vPz}F*Od|05~e)OiyE`A)iu|Vu;j@%CySpVcS*fm zJ@1r;OOVtrNKKy_YcYF@+G5-qeoM`fh)P);@3s!BVDm@LN2G)5PvsxndokPc;mpas z_ggzSwIP=uVc{B<(DI)yIMAv>B)F19p8yl^VQ6y0j=~=C&7XFe9Ya)iJyZc62n>!RM6^)qRUA9T9Mcvvid5^;(4SPLflOvax*)@fsNk z-vB4gwBJ$hJ=wLY8a1Tcqx_z^3DYme_sP2p^+NsF%!nvlxGFwG6j-MXm3tVANgkWG zAq|%-nAFN$?=)QQBBy_pFDhoB_>_B5x&D;yW2TJBC~LOKCQ~|nyv&sjmyd zzY>4^8l4F_1Csl_rjJ0>E3pM@zY>esAYIc*8rJB`=R3x)pHJqyB8|Kgc07n#vmkM; zpEN1^+&0DRm+t#T+~uwTeQI~L#S>VrrQ0paB5E)ZU+!4WNy`ouH=5PZrCXMU_OMgD zQ}77yZ50###4w+=;Xmu+OWZXJU$%^ICvD!XUpx0CG@$yvcLAq2F0m7F*uXI!+R0|_ zw@F{{jEy|=rHHo?mMw8vi}rp;{ViKD8Mm5WT*-`jufAp}rR}I6$>DB-7`+w3HTfuP zjQzBk-!)u+4}*_bxQHd*MDYEKn~kYFg$pN;_ED=PfWErx76103)4C(`0tJU%bDx-V zdF7L@Qy&rM3_oHPt0Y)3N*>QcB~IV}C_FN2N*imOX$i6e8=!VzeM0kP;_CIB#;GvVYnA+{$@_pMbI1kg<+W)rloGq z;(w$KVW(43J^Uqkj}1;d2wVggsW$VB%NUW;u#Ergt~{xZQ``dsT^5% 
zx26w!+b<|Q21cpA>fBT)OodufKTBYR$XmqIShzj8ooUEYr2j}+NV}?f;oQIU+H0xt_o8yLBbu?J=RQSxw%tc84eRNiG+2xp5XY93W3ei_9Ys*p=gAcS)!h?iSt%#52pLQCkNBsbkz(Ew16EdgIU|sH*0y;yi`_+G<7JX+ zY=lZ&BYCO*{WXPj9^a9(!nl{QV8#|z3;e^T0E{LH zE+bLpD#YLToM}?*hO+ir=ia_vNcN-Cb_0z3ou@hN#&6%Ft-2)o)QQ{kgXTq4)rsv_ zUFS>1bbab{TT|*QCOeIDb+Tz(A7U<}WTe&HiCd#(Wu`DUILQ38@U8b)UrWjKsXRJr z-qw!~^GF%^+Hs9*YA1Fy!Vcp_; zZMiv6;70fM&cy_+ThA_En|~dV!8I1MMW)Z`&k)6u+EONq4$C!2?l9xxAYV;n^2+DG z+cXzEsZk`Irgr&pQ)XFiQ1|0?9sM%q#fDqZj%dy24Qvyw?pq>fRD&1aBYovoCQjcm zg}UAiWvM#*^<@61g5#chS{n!)vv zUeDBm$#ZvzJJOhv&pdl~%m3<**Pf`61fg35mdg;}9RJrZTuA zXye^QuPYi~W0(AS9^N}o$JrH=#Y{%){Vm*{{l#W{@3`}hYQbmjA>5~$0cQ-h6l<++ zju&Y#ekwm+7k0Oo!37e3O#AG3v#}hEZ8qRVZMcqDu21C1Suun08KPiL6uEQWNQ7X$ z3-8#-sU3PF#Tg|oybs;wtS{TfX3eUI-S?}$yYF+YuVZTucTvUU z!r$8gjV-Je`iHrVQwzYUw)j`bX*CK>$Cwx0@-J` zxBP*`;B_4%VS<;Qdvopb{gcd!!D34U-I@3UvAS?seBXE#>2vE+RDr}3=X_8;)>2g@ z?;N)HU5TA{le3!>6id?(-tl@#JaiuG5bbjL3$@Q3?)tdwuxDH0VyIH6{fcZXua366}1KI+SjML35bO1IJ9hIFYi1Jyq zAS*zqT1t`z!?d%wiyeIDU(>2JEuR~2BYP>sW6po2!<3iyf)>8n%CKhVsI^DxxACHO za-*2}>d)44H?Q9JC?X@1act3H?>;UPF2tq%rmZEj0#eVPb%u+BqIAldq&-zI{Be=7 zERi#73-5y(zkaeiUu)uIG#aRElcluZunNHiJ(0|&@hxSF|O zoZK8;pa@WK7ojjscT=~63=wbwz+$MH*aLYMzvKNrSn7LX1rI+z8p?x0qM=AQ91i44 z!1=*c3J|@DiJJ@NAo+q94(vJ5)Wd()3?VR(Fm)g&1jw*B$Pj_RWF3Hn2{2>izf6@M zAP7tbV`*stY#9{Nxe5Yo6hnV12&C%|t}W{&puCQ$I(5^oC^M<~qE0ifFLDo}H? 
zcCi5JmOy3*@Gr*00*VII(bdAk!T|!az*t$kL17LUpgTWs?*^oy02wGUP?#(fyyjJb z3IelSgo2ln+E5_?VgeNcoRb+8W)6i}0E-1U1SHW|Ltq%-1!%X0!t9_hdjKUs0~F>2 zg*iiEz{IWqAyAk*6y^biT?M|S22w&`moesU*1+1qP5*0l$$=<-eD(34oPQD40e7)U z2H?A$iIpqBoPUv#q^lX2JtG823@R^86%)^c6#S?*)LN1^K@B1z_AF z;0!n!SOk#BgL3c=WH<=`gn=+%Fa!GfI3Ly0ZtGv zP*5m)&j z6tn=KP2iaSU=S2w7(v{E03&`Ba1}p=1PbbxazH)E#b1Fy{Xl{s(Lc(8HURw&GzNhG z`aIw?2m{-HDTg>AAOSxX{O1$g3I`Gdi2k+h$GktDhiyN#;n%((W3bNgL&(7Mzf_W6 z&-PMK1b8UE2%2EvD11?)wi-)jfH6i8kG!T%X_u5;*gmCfZ#%ZQ25=!eoupiDPhdP z*aFxi;F1qALfrwUctGS~o)q9Z{u?X}WOahk6FU&qa5j!#IOFMJVF>~36F&qntA~G}{Ae_wnU>I>FfiT%|3DoM zVF)w=)Y#u)d_tfS{s|KRe1G#9Ed*%KA7lAJpZq5b^_M*0{DPoW|9xHoI1r-z2?Jx%KVhi9 z^ban;C-fK10)l_Z7cPkSTb6=o{y%jCE-3hyKEr_^`7c_8`2N;sAuK!<68cLw07^h( z{u>Vvz*hK6|A58)4HE+Ha{d^L-~+wT@Awb`f5{R7B)?1efw`Z*;{(F~ zU-<0iVuG==a5?x-1vD|@4Pb^^ zmgn$R9#HWgpGw{yD!LvP9(XAF2SD4x!vjxnhGM`^2u#<`!rI>17K+`jCAe7W;ys`y z09i$4fEB#A2Nb5}1h62#{Z`n1t3l0vV+ZlK03hEW_XIq^-4FI*I(T;%Pl6R756C}2 zUjuJ#Zz1pE4N!yw|5zMG42eS^F;FxXE{2AqF(~lE3{VFkBY?;qiu=K-oU^kF@Pz<- z{jL|V`llk*@y@m$c2LANabjdb@*^k?F#*gY6F#zbuAd{h7+G1`Su&RzO?0-XEVKH?Z{=^*CZ_R?R7jo zT*jN-urfw^>c9$@GnsviwZFJX_xtEgeb`caX#vfZ#kOz$o!+}To(NcEDxe?TNf1wYnUB9>$b4u_gu8x#UZBdY#Ah>%d8vY9 zy+d2C!B@2%XZs);cX@Y1(>euTT8oJb%Trd(RUQeW7v72E{kv@O7syg+f;n8)AKO*H zi|dg8P4 zHFwi2V_nj1?h_MLBZmirkpx|pi(t>%Q?m=k1+$6AtK5)=)5zoMU3?jq(QX67_ng>n zM48duVwBjImHG*B{u7s{*EAAA#lmI(!@OV3>@ zH5~9aEG!VyTp3bOv$lL<&pdmT?qPYA;cSW`er! 
z^_J^!z^=BG(a8#X>6u6=7k@U_m?Gqnu@dq1CQ}$&;OxN+M^2rWx7{4=T^7v>PQRX_ zzkidHi(KcmjTtrjBl^?JFF#pUJJd1FH3&LVMRBaW{Pgj#Sb_6O#VzX>xQBkc^9yE= z9Of)l?V0_PHRjW;U8Y5Qqnc26SlVEcgV0UZZG47c@R(USzNs;WgGxEs z)ji9wukRw~0U0ATwI(Usk^9ci^+(=uQ^m$soo`o~WsnGaL_Wr69BcLpe^l)1fyIMn z4xOr~%jV=Ie90&Bhl}WMMHgU8uO~C>Fg2)KM4G?x5+VypW4?Ln$bJeT#)qX>f>}GL zahaj2x)jyQT?a95)frstmdIbzt_Y>OSu*w@YXvH>ZQ2 zNgt^7gqO?mZ}Sa%$~qNNR!X`9GP6%$1hAD>>u z>kjbsd*0xZy1-u<(6HKH>%O;*Q}X{*894d@V|i=JnG zJ@A{p(uT+9QN$dp`&!r5g*)bC4vdwagIrsMT@&X5-ltAvB+lLRd?e|#NdI~v3h%yn zva-zQsqxv#QiCZQH>`K%gU#J;P8#`Orm-Cxw{s3+AO(XfLIRHanTQZ0FbFh(n*D-- z*~gA3EP((b?|(#Kjh14@AUGA}oAe{>yUFAqUL7;yTPu}s)jhaL9_ccwI?fwfocU<5 zv1sM`wJ_?2c^+oEVux#l*M+**N;WS9deT~V?2YUXV0JF!dM(V+O+td-)28k800aMT z#!$#oZMb$3dJj`5D=TC))CO`S)YSZfSU4H=h-+;QRD>K#d-~e*>$TVS?8=1E)RUl0 zA~8Y2(f>_aN*fpV98NQ}1d)~-+I?b9P2kbxuUlSnz5D=PZqRjNFt-OVUw%3wFzOwj~?L&4`JIWo3NF%#sE_}F%{7j_c$rXQ# zvqwku>OG2|9|Utc(+p1*H<_beU?W_{z!09h5R`0(n)D~1CH0#gfmFTVX)*D<0t;)JLz8@{j z2o!sxZuz%TkYM(QxN$@qi5#Vx%WKIJ+qvm~Z@OSX=SEkmWK zMKPCtYf-f@F2l2B}#z6y@Z$(A9Q1jb3WA+Ovm<$A4e!em7p=KR- zUa<7F2KG#&@mFh(BSAbm&8qE|?WvazI#6DXnqg|dI#&4b3tZwE6kD$frHZMo^)i*K z#!mVVO&M?8^~l~ zsv%AGNq3K~oS4r#7#BTimSLQo7dP7BGlU**E}My*?R61Z)9B~lV}7YOE1B74IU2(| z{-$qY_}q})Wig7IIikWRChBKq#Sf;M9zJvNrx!4@rFCvy&Cf&yhTadvD`nUn5gSV0qciPtOi;1{! zHu|(<^2I(A$4rew8OI!0mC~~LlVAGg+iSMo!}6B1A$%0p0?p-)hGJ`k(g{2&Y~$-4K{w)looc^4Qy-4P2GQ5QaZkd)yl0} zh@;PrLEo#ezSCjFsaUMM!SbVK^eu`pmK$T2noV*B<5l(sjdYjfh<^9^p3VDpe7$#} z_rMl!*iH9~$^)ABB_f+Ga!bdxb%nA;&&fV?2^EpwUGuP5v?|CV>qArxh0-1se$PEwcP32SGNdU#*wG79CEGT)EGdW(IBn)jZB?s3zenL#_O$VX+qL^HTijly~Eo)&Pt z-j-d=Yt-Y06YJtP6%wd%*Pgp!Ut@ogH+y<&ZQ*c3?h_NKTBZ6N@4T+&aWygSe4l#W zJ(?QAkz-DJddda@wu}R{bxlzdFZ3EPV=jvyIQ1V7`C2@_lJDa6urjvt(u}bki|QGT z{WQ%TegnT2MnqRXP zJu6&$S^1PXVc)@tP{v3(jbLS24ZCXSfq9YIf=o{y`=Y{l{b^hOePxcUUi59>IL@^@ z%d{-QnXn5?r~`%0<6j~jUNTIF*W>jm_CR-g3sN>Rjm|z7p;7Ti_MeGn%@x{X#ko#? 
z<;=mS2NiFS9dti!?{lqv2ATV^FEQ$@2|BW#?cm<(fLm4UQycWo=gwM_P$#(gv{RiJ z+&|c(cB2yof@ZA}f@P4sbJbVCaHM!*i{r$Axg>+d$(&cs)E8m?c|4eFV&C2~-hNX) ze_V&QsbD4iQKn@$e^R6?c^hh7;+qmWV#XxH;vt8(ZWx8TX2gPN0r|*Vl8*40j5q$lb%qnEHABxqh0+`xr?ZgPWKRp-bYshb_&HXmFZD z4=6O|>7CNNw{W&Gz{Uh3QoxiZYn1l6)06cs)#yc3so95`Hd=MzU9-#7O9x~+&3y^% zuH8yBhT@XdYXR==t!x6ipE>o4ZbmwN&#}5NON*kKoa*%$xDYqUb<7Rf{^hA<-qX!p zFMG|>N$3*Xx!a*j1Qzw*y429pco+=K?G+Xrnf8SyACE(9zFYYd24mk@(?a$z6&W+2 zKXyhuyj&Th|KKBsBt>C+AbYPOL`LU=YLomv!m(r5vdU=pt2~tJNt4Pd5X#HXL1<(- z8Qjw-pT~*}tv{&NO0AaA%AFXBcYBf`Vy+elODf#gW956{MR;EZraIX$^*!Zh>2^=} zg3h|%RsD6VvGM%Pq*N<$$%bKay$6yfNXSZJS%Le96j~!%j_w(hiej!RSg=gHWb=-_ zWcSwDqwKfXlBUHT7>&MogAn;(8vA{BUfF!T^6v9>yDgEjkCm%BC=a2{0-xBYS5zMP zesz4JGgwmQcz(CGgPH!r6CBkt6}3FHB)BB8#z0~I!P`p39gpEc&|F%>T^54}<=vQB zSeJ|CDG#j7slIt4#3}lfmhL;ONdK9`eFUfb?92PKX#F~7K3)>?yFU$k_vCzS>;v}a zZ>q@prRm@2EaGEshufd=sUQ2YA^Lo+Cel+F=ckAJNk>XPRyZnTGpW==Zu|N`z38oex>0Ru~!U@fB9{O z`lt{G9hZTD-MM)^VY>&)?ZXd6mQ?p+i#hLZ?e91v7d+B3-4c>^LuidIS@oo{fBSH7 z?XlsK{cjwv_=u18H#2&a51+6e*GEJwckr|uYuP;y5iPW>%ouAUr?>3sE$M^?*{9JeCt!c8YFWH^-6ioRj7HOEr3n%v2J6uZ|lxYigOUaeu#;~WOsbHhI%EeFgroR#YlGxUE(TAz11M8`DmD>N-aQ_ zJ^sr9f6;}i>r4>+uU-;^W&GbSGAX48Un=IBVbRN@w(xQ8L}_w2KTiEl-{)g^ut$!K z&efxH)+Vc?OWTKH;#zIj+YO~;E+Tn{cpPmKNPkbP?+*hh5cW4 z50biRZjhi%DueqcBh^tO0J{g3Q~Ujc%XH_|?!+ssZaK&aPb`{WVpU}*GP_rDgsk+6 zLcz@%@0JCvOAA?DmJ5N`jXd_NC#svudnH+0`3p)V3XLkN@~CK+(nu%v)(9p|(MgT> z?|x~mKzW1pz-V)jZ%Q8do?b6~%Pteqp?gY?rW1n8&+VOXov`IGLdh--bi4~)6WUa$ z@2`}<{fZ}l7Y|$NN_x7;%40%Yr3B&zjIWC{dp)KcGT%)Dr@oL07%GaA&;Y574D}CA5Th+ecmb!WSlhoUYb!$C_{HyS&qVIl zQ`Xc)iinV129Gsfm%z>+{dgKT)3ZN%@SB7cQ-Hc!BOZ7EXB%Jf@8@gz{t1f34F`+j*&ultc=l8<@ zo`kvjXoXK>=BupiRtJ|#zjYe=Dvf%7cWziyg!-r$>LcF4$l6fGMzqkHDQ<+BkYZ3h z<(f)PmDnx8&Y2R6A7^C858A=6Yq`*{s!LNO845@=eipvX(5wAG$_{i3~rOq8mZh1f&Fj0!D!;>$mM7z0lWFsH@xD=;)M7f$T25mPg55Zt_lzFE4ZGO z^M%P*Xjq}b()I8aNfFvXn#ZSJJ`kCAHsv##KNrp~xsl-eZQ%gx`}jn|j$4BJH%dfu zdc8{y9oTK9ToA05j$qrco%wiAL%jMa;VsV_63mlGrjQc<;%~2=bOv0C;)*mSj6rt& 
z#4)YNOmJsNT3Dm~vWBl>3r#~Zzrwwd+kR^2)K+6*)zWY7S9XfFog5Bi%ln|O+I*x! zcA2H4wmEhU&lNXni4W^ozPWb6=Ss)-{$??1&skPMMrLfyZMBeKjH;WzIz zAAgmeD9~4l$lkwC{Qa?0CyQ8LQHrUA8@(o1h`M-F_{qTj=SxxM>xr)S)Xkl0XqTAi z&^^KjM+3-wH+v(b83tps#{M_6SO^;G?H_Kc#>S} zYTYjyKqbCp*JjhN97qVggqAZlr3j~rbZ1d_K~OH4o7A&1xP{2tTk@mJ?S_KN-=7V= ze~bLs)S;)Henpi1wllqjH1@SR#?=tDG;Nq!iJ4w$AwOSKantAWF?Utolq>JK1S&ri zcx?847?vu4y-I?U*wRLk%AEd1Kq)OXI$-y z!-Fg&=z6Ksnd2|t6t}%lVz5@;Hd^hvxXY<7nxjX{)TuaA4WWpu+JiKUX;XYp-Brm& z&Q&)=kmz)*$e|>>d(>3;p*jh26hR2mvl|h6&3QXj&s&2o&Fotm&uB+2(vEU}8~<^) zW<0gjzOBP&X4+?65+5SM{vJUzi3ACW`G>WYlC~NFObr3A_)IUk1v@vPCmfc(b(VKr zM6l63MDayk2<4f$y%!EyFds&Wbm{tuM?aIAwhMTSj}0-h96l%zV<6i$7$kcGeMSL! zGh%NDuWnJoQa3|l9^V?v_}8~t4@Jt{FTU5x(3g(qzmVoQqlBojkNSGkW}bwn(foCHPNm?dRZ9hZmf;ikI@fjC zg?F0eh-2e>DYvr9yNe^A5qxH!?PscF`0Y=6Jqnj$%ldrm!?-1L+&!{%^s8I7h9L~z zGM8W7S0D9s)>~IThgf31dq!mW7{&9vD0EER@|3yS5(=SRqJg|^X`NcN#4Vj2m$NUN zpE*qhG0pe*drjr+vU4F!U=X>Rgk&W0K{W6q*?<1dB^s|r;FY6N0(Vw3!>wOUND?G` z!JXAPUYMS?HGkl)7KEc=MRk9EVqdex*-h;)xS26GnbE24tp?s^)OXQ^S{JV44Fgv* zEc^_;*=a4kyk*>us}teZHe zUPled?XL#oDtNr;BaU0Gq_0M3r4ouz6QeV&iBHkd%QO%+i233qwPZnc$l>Z0%caSg zH;V~sylEwFdOd=M@(N;so9uIA9~UGiH*CU=gGrej zRi`B%xQcjx($zs#?OS`+F8f5v$gj?*Ir&A!@Eh~{hIt3wbL|aKiV9kuw9PR}iX--y zjxYu2QQB<^JeOB|uW_5qx}}>E=9S#D*mmLQ$Y%0~^U;&zzRsL=G2QqNjBl@Jhd$A3 z)rq0TJqX?{Ea%EfVD_q8qVEz<`>daSwt!t~jfBu75;hd}AGT&NaVLST8LE&UzlKRV zWsF@_HnN_5nnr)js3f!H{ypd;Yqg|xop`hXr z;=bvPHmjpYm=_Xwd}YWUh3b+3NvvC%ujpk`&`*fZk??|Pp7GVwO&TFzT;zuX7*lnGy%xKpkd zF6G%@WgfrM-5Dg3&Aq3j?#kt*x6@|b$*Ni(Z3$A9bViQnM}iY@0glSnB4?RL%X~M0 zQ+WO6%l>0ytvYL}Yc;*gAKrPV6b-8 z?7!?7pL*?753GOvATj0}Gd^cdEym6Uisoj@9ujW9n#0S);rl((NvQl=+`B=ybZyD_i`w!c_iGlZ~Cif9$yzo zRmMr3kK(RtE-1dGLCCd}v3ko{%-TA0@riB&A10;E_(MthV}e+&r?Q=aLu}5thE!HS zb;g$8h4*c%o8&VZ79{plfuDf>-aE$qLw=*7{oC(CyDmQSF_!W~RBtUOH%sR2v$mGT zmhBa=96U*Jl+6?(gM9C}IroXq^m!dOb-a(UXJhSL7Co<8sMIXg+ic37SR=R+rGK1; z0^hPcYcTB)pC8K4PQHq+(|Hx^PV2UDBd;-D?MW)aH_kXw^pMvwM88bsMC~Kz3_rrU zP@XNhJ!>+1z{7Jk=S#o9QlI48CiA!7+gGovU?Hi-9=&m;;Dyz~Lc{T!(PL>{-#81! 
z1EUq!NjL^lsVwpz_QEhRCxKL!N?|26nD@ncT#eY*wBpy`fRB$d32M%CGm zW1q@}GX>^|nwHG$vMEA0CS55`VI@!T6@FsW8JBi8Cp`X9m2^o{eyWvq0lL(!CUn|; z#U;MLAX$Bn@095S*=vF`&cnR}y6f4n0B5%cNBsVl0N986dn}RvFqW8@<4m9gei`yA zpB$za_~Lsk-M6rHY7kjv7i%4gV9R>19A`<-#RVx3wg(g|aH&?Q&aEcBL5jN~gP-9r z?>!H=ZpR`pTfkxjs_MK3UdYWwCoI@3n$eWrn&pJFga8Z*kM}3dgvMrFxFpc&>}GXhTJMCEzBjzSLTg{# z)y#=9LfB~0M-!Q*vpxa491>2%kx&hZR1*Oly8chqXv84sfTakz->_9m>1vOeV=|Qu7=)PE;r}f zH0-O<1+zUf*YlpFyQW=qJ?;7;q+PQ-rACHQ|HTvQYOM?R?l}8yIcdBVY@uQ*LTK8^F@(bOb3{s<{imLDXf@374Yn{nRTQx074vx5{O%t~NYwY+m@5-LL95KNSlpyN5Gir8{oXjEiVU znKX3oYe?ayN)L4%;uJU1{_bIIs$Nh_qh>Vds{Dcb%Awf{E_NpmWJGHvj!`Ehj`hSp zdGhH|(}Sf=@)QLd5}S62-&43Bmj;M80bueL?)V=i6)j~WZKI=d1bYid4L#6W2<{#V zb`}J{S-{jSw#!Hi1_A>+DB|6%2==ZXE(9n7%sh3VFkMee58{0UkO1ILG%cKfQ~iJ9 z{n1(e$1#3UEEWS5MFFSwk#IO1IG_*5g6H=^^cof(1bgC9e=#^5oa?~R(5%72j%m%og0Ng@= z!jz!mfV65*a7RoVDght|l>x;87%ZSTpn(+>W(|b_-EE*S;Fbn}(jE{NsCIzD9HB5L z2+SG450Hn#+@LT5Ko}qbg?T~|2%z_QD9jrQ^8xq(PE7)r5MT!O)*g0%;NX1u{XPS6 z9De@3`@cc{ZKQ~*B{eL7(j6^q-2u(~%K*u_TY(ofaKLcEKlb?ziTT*`9N_@pyEKt z`v34i{$Ft*#U-HNgTi2-7z`GCAQBQdC>ARYMc@!n6bkj@0bm#j^pCax>}S92K0o{J zd_#WL|7s850PzqXfF4mZATjU}MOXLa1J(rg`N0wRhM)ki2;!Ci6!G%`r63Lvz{k!vk$X^!J1U4H2r~Wi zAzA~_j%YCe{P#Ce(;y72|Lxhv2>}`SDe%rWI2DK^2r&J7-A}qd-`jP+tl{^zpkheU z_&LbH`+tYVe_xy{K@s4R5AeBaPz1P40ZQ7ywER)h0b zk^WKgfnv5P`a%KgCe{)|C9vfMe*DjqgWo^nk}?8HOh$gM2U+@AFNgR&fd9IZ2vd*) z8o{hApa>8#Ob+>fZ3bkz+s%Gs;kW$fe|-IBMgIf-wtetFASm8X4u3?}A6G45C+)4l zi~^V(pvc6_hn_$*Br>^u6$10ScnZJ;LNI3%0umfoM;8xAdrPPnN(=!P!-+ucJUm?8 zkHKJ0zv{$X2)0KcFd);h_Ot>nB>vvb)!GJXX<_96kok!MXi6kt@8YZoUgQ)~Jcfh= z?gI-~?1TZQA9upASU_Pr+F^mrX(vn^4%p5g?SK!!J??~|Kx6*19TEu|>Yp$; z5)RCzKVd*T08IN27y<dN7a$uVegJ;f-NS<5L3Bk3G!Bh|@bfEaDMS7bgh8$o diff --git a/audit/dataset_statistics_plots/models_per_dataset_histogram.pdf b/audit/dataset_statistics_plots/models_per_dataset_histogram.pdf deleted file mode 100644 index bb2102a3c78924d2a0507f8d0dcbffa6d8984a04..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 13934 
zcmb_D2{@Ep)Cr}rB%~5G_TB8p$ZqU}5VB-27`w3~loGO5C}kHSN=k|>l|4(*LS$dE zBuj)s{O>!_*C+q;_59C2p6lNCyyu>C?z#7#bKd)&mtPOBDvppqL-_A@L+_MA5KuVO z&Ds$nBLjsQo^~ffVafzAg0q`F6sAY8CwfAW0Kze-oE(JcY6~Jt{nkO%&D9Hvq9DLb zbgdnUHeOKd&r20wFEs-%f)^2r{t3_{czF@Yu22kk34s|n5Nt`V_E5?7yJR;T1ELqy z3}A~_1w$UQFtpT<8O&r9Z0|4a)r6&^s>3)h2J4W<$^CsH>;{oLd z`0EgDNdzS~Uw|VV_>;t8B+v*X28xDDNnnB60#F1nI)H>H6!%j`MORlh;0gia|E>}6 z_J#_ziLUlu4p788TUC-XFlZ=D)fvzPo@nD{O9W;2^db`pE)d`3+C(h_PApr<;?mv| z0`&A2vIXp>3hKAXMj{WQMd`2aquH^&wtvQbcG1(%FVxjrt6VX%@@;)$*QmjS2mgHg z)L`G+>e~7WuJ3b#4>0Cjmh`dq-;&kETV}rN#&%ba;vLMzRn#qU$V2-^t3%%22G$E;!_ z-XxxTW=#J*r~HapSN5FJtgM`U?A4uV#~IvCbHUFJCkWXd--+>r%7yQqC9|4~&fcyt zS~2o6pUOA#2v2OAO{ojy-man?!@FHWOY_LV3K{c=lrsb5TqL-7Y{)~j(@P(!A z{qeEZGH#h=3cIPX1x!;xMw~3TS1*7b=|@JdVSBU39)<5#E-tt;KE{0gaOjl4Sf;zcb5d(Az2b@57Zzutf;5mnQJ}e2#w9 z+l{h)x`+I=u~1jezlL}Idn*I;F7Fny>2|?)k%^du5D@$OEDmQhaPxcBGqT z#`32LsT~|vpbJ@0!F@4t)jqDY?SMo_)_#|x$77o2TW{PQ-fR27>~)>1bQ7}(`9KN6 zv_Z7phawtJs2bW^%Wh}a8+dzp$=OJ*k?p*48d z$iW47wcob)2~F3HykR!;1|E_Xlf=&3FNocBw)_QJgqWMmNb+_0kO*@sSFPh;KTIz1 z-6F-tB^!T4y4w`S8eieQsE*dU z`q3t+Xvj7R{6dy#C-}wdyc=F8RhyoiN@T-q$eP<_mpIHue8A9k!*R+>815sQY5yIs zB2o4DmWVX%^*PNPQ%U`}rjqCeO-n`g3{BE+r1b6v4}D}tJL zTummj{$eC2-15-bnosc9c_p;O&O=R?`OT)~(ln^&PV%+iXSu`keJ`_`g6)J3*+h%R zs^fVjezc6S_p(2-FD-P}JLIvsD3ac^R57||r1+C$*n6|S+10qRL8pCmW>T-sj`wye zjMpdcVxRI=vt`WQIbH6AJ~qsMhO@WkiB0j!*QJq9Ukj$OIeikt)Ai&t8B1n?8B6tp znp3ZI4T~RJtleEMc2E^$yn`jd-QyxAsLxF?CQcpP7Go0p>LoN|QssDZB)7mdKjHLL z0hUYw~PPA%*LbXPCX1lcxMVte!|o%a(j=sQc`owz6Bd^KCEaxz;W2x(}n6XV7FL zP*SK3$40U1AkYkI@e2levrV=gi9rJf{XYylMz6txMv-n=mkn}0Y09_#qgzON2G4snifG5jc2(v<2T6hiTw>n+s=fL#(qAx z^xLSDhg3p%dZcp8!)!#kjz?M^Nzi;zbKT?4F{=-#FQGoMIyV)&lnf7Zk<+r^?=L_! 
zrbu^!96dM%&|dvZF*-|4w#JsFYgdmcu8{u%zvv<_UMB>)Cjr>*$`EQbCs!r zPrCP|{phl)*SHW&6GqsfxFjMQ!ywb?vu1fexJ-Bsevs;#X|qZtGPb0Y%uSODj0(&PN3wAeZ3J7i?`xfpU ziIw_q*)jTME_ejPL?4?u^xXU-*%sQ1=+0-1xylgbV6oSH6^oThhXN7^YF5j&#j>CA zmMaijx8#sdv`zTs%84)Dt;PMPULG}1TJkGgNH45=+~2Emcu1!#v*82Rm{B*@(9yn2 z#quz%#@ROLZnv-%z67T38ZW}6wP=QlY&nU=o97+O460S@8!OTFo^^Fx>ST_q&N$Vb zZ07xK9BoV$d@*k-*rL;f5t@OAngn{ARbSw8Xv^M_YJ-)=?>DJ_V>#6=92M#kWW6k~ zcabKqIGLu1g{>VkZ*x*eqbs)RbgjUJA`K&zxY{SuG7$SHf$G!>D>xg+`y7!V&S7gF zA5~+sK2p-*=5MybcV^s|+vw zXW34d&?7r`IV*C8r)V2q#7;iS}fVurf9$}in*I{N`R6+DdnJ{&p5 zI?g1@lW*mBk~fFob{Bc~`5;yCn+L4;+gYQ&vzOlYhbN7xNq)YnPBL&4+fV(7fC@3R zY>e;pX*g1p;zXCql)rNL``3L4zFtiZd$r^{7w>ciXlT7Qd?YZea9E_Lko3VEi8PoL z8P;JwQDXerb>UT$O5|9QzBHHr!u+Y)`frC$J{#~*zOjWaz)*VA0zsn@f3g2!HDb9C zbofPYh(zli>sGJSX?Ne(PuJRExeFN(J{ntr<|oc363gV`aU$yCkJ%j6PLbv1VV{tP zp}4w>I?FO1p=Wd=KcwtWb&AUbXk=9jSt4A5+w~ zJ7;@l#GYU^agC87lbdPwA2?Y8QHQys&L@7EeOHvxW^k#bK@0sZ67Tx*>l|IRZ5Hil zX#i`WQ>;o4ODLnLNIUnZD$9#l>)^)xAZH&;xkAhIf{Y+S;JAg~E}KfDnH|a;(FAnF z^H|#57kjS=Ej)Bg8`X_G+-g7BUS`2z1|>SkD@CT)acdMTFlkU78hYH(9ff{Rmp*k% zvf|qF1@Cj3G@o1^dJgA8oV*9BmQv#bu|hgt5Jgirf*yX|f(#<8t$-cF1brU(*Emi>dgt=)2{FM{diP3VcQ2reEP9)En9Nd zW8o+bQ31=QuMH;9yqJ_4=JP{?!`?3#`v~OE1Gj@74(#_!;~7;jig5{HWgudlIF|7* z&%V`^l)e^|-dG`I&%i-963UbwuU#JR7#(||CD)_Ux1wZvo}M-X-EGB|DWMl8OH`g1 zcl_+b(BC=q^&6_q(9ERY@nEaTC)%Fsb2%M(skb{XI=0n+f+S(L?WFUgz0YYnNzKK8 zV?Dp^MMM87uaKhZhwOeIT=SAoxm~3wg&1#fH{>4VpLtLT(obVh( z&i&YPh1kW z?max(u9SW6CuZt;>0;j(#|LC|46!)@Z5`e%cIcXc8#L0jI?pRKlym9~OFid;zmfH4 zmTkpPid2TE>RXZNImScxL@4S6sVe9=R6q~Rh*jQ8_vRtpFNibxY#%^h;>_-|yXD(K z$Jeg%FTcWBB$F=%OH}>3OSek1Wbk>jISNk?6s0%CTL~7Xx?=m7Ldz>iIMo$z9+l& z$;^>sjE(me!ycwvhw3sA7wrr!9=yBHN}GcFiAG^7(fWb#D!~01QF5k0k&Q39+=srz+@ajDqJ8($a1Nr*PGwa-E ztJ~^6Sfp;DOYjtElPWxbDWNGP`nxTUp88?86C@Cy^(B# zm8l=1HoPmNxy4wwh|iJezgWvE`Wmh^aE7wwy(>c2MR)#g`*uEIyDl0*-Jihw!<`MJoKg`z5gpYMFw`S3Y8@l%TU# z7;CPMZW-ODwbu>}&)o65>Kg2$$Tbk&!CAm;jC=OYg!@&-i*k|L@vqW1Go`#v1)V5G z%%gjpB;8)R<8Ft9ow?mU3dJ*+*Nm+#oLMP* 
z|LwrEeCuOVhb=U?#Zv=jhr~;r8Or-k@2rq7ueOA?_1;*32!fsQofh;)c56_?uEIV@2%$oubF!7)sELT zSMs@GYZD+S8!t41SLadFD`t?3|L|1s#srh>ST|LjtupO(_5&l$=ll|Lsds$vF|vMd zCO(*_^6>N3peK$y$KA*6c}!6X^Sy7|LYIYBm211pl(Jv&}9H%OBLF$#Zq_FB~({;sEK$~iHYykLY-yi%iaitkVoi~)R7k7 z{L!UVTG=yBd$!Q-76BVV>Te!j^boP&EfNX>9y%#^8MMLAXBOcKp3p>Ji z&5j;Uy-xGE+gs;HAM=S>^G{!@lMkCJS>M@Du1oiydECD=W&ZgFecl;`+Lux`UiVMG zy5O2~0@t8f3-igpJzKYM97&Vld05}=T>^KPc2FKnX{dTEbWn^d29wpK$K?)R`dW%f$6Xm~~2Tpw1E?|Ne-$Dah z#agJpSYH@DMKWMeOPBX9DcFncYWU*+rgQw(+B5d#uQcJo+l(LSG!?;4iS`}GO?K^E z!@Df+Z4lvAD|(VcxYbkb{f*xWlD-$ z?n?GE$CDlXx(3hNZmY@~LW~l z08egMW4QK+1gekdWNK?7Zzo=0%Mv@xN=`Jcm~c;~ri<^8+Rv4Eg*dinKXJeTeofbn zfn8gU=7x!YRKrJ+Y~~Mochxe}pNuv3AK1P6erbiO@}hC}7FydPq(UJ6A`ZYD#W7JU zGJFEoO@G(WqBENd;(Or1X(19r)wwWrc(DJyYd5pLu=oiLDVJKiAC8g{V;o?3q+fSe zY{u1`&veEyj9=!*Rljd@2iV)=;!WOW3hw<;B$oBTw}|rqm5u7XAdOVSz906JeR(?A zif80EJgr*@zeNm#l==&620o<#CI!DJM+;+CSkaW$jYy9iUhR?-(Rm%Bb%bOlx+*LtVyop_7ZP~g7Y9wYT;;ZlVKwzrkdSC)x~Vn?ir7vC-9WfS(wnBA{--Uw)?HB%$GmaMXF{Oyd593 zp0&Ek`sTrjurq=-3~Xf4(*DT%4#{05K31}dJ0={NBNXos8QxiFwU~a*%9ULpBWtd4 zS8P(dW{+tOPwG&#gWyoTHK*B$A6g1CAMIvMKfXF~`S9+mcI{gzc#D7xiTR6wOhr$F z3_QO8LCVr|!-SWT)8mFB7gN_x@{WlKHdqDYYpQ~2En|0{R@W)0!m;LN; z<`MBqu&MPBhrlIcg_ePH3fIw=%E+Yfox!{Y_pi=(FvsWeEwhcyzPVW_R^oZST{q20 zE{^|fiu0rj;wdR|Hpy;WS*5F}&gLm+irwR|w?b_1)O{k_nq{Rw9iGTq3>@ILxi7f!er6MPZfKYFC@H)Bb@(HK&*CF#vRs~@q~-H4 zOn%?ZkJ3G3)~vC4+f&gmGAm7jnSJF$Ufj_h@pd&_QFTPjv);B8Tac!y&W%K0id~qn z(wIjf^on$l+19qn<@4Nf8L?UPVf?HqYKYH#kG?-Gn_6%!fC&sBskRu|79kk$O838g zx_F!hnOBic1w1H953_wSE<={`0}qO(d0~cnw)}zHULu@LN-MhaqQl{?+9tY(Aa6>|aen)E^{5?e^!# z2|(0{u^dFN+cz>S_zTUlwU0D?3iC1TVG+Bh-8)>1FD_6lciCaF{j*1xR*XT-nqnu|peF+Uz&Ym3MB`#pvk+%&D@< z;)F@n#=RcixZCy)S%V?=r!z)F?0iH`s7aTDaHTvxyTgyzET%4n>n4-$qsB)jU&TK| zM=vly_Cc(^e%d8-uOj$x#iI56r^(i@S2cK3iaZQE1x=KcB?4FXPmlJ^{h%+-N}&I9 zlF?g4u!z4p4ZPgM2v}L=>C_R3~7++l+;c|yVgUhb)O z-&`K#JCzS)W@C+-@e4|0N++I+?%5+1TK3%j`SGd~gj=mf+G1RvXz8hSc-0*ydin=@ 
ztaxA2^6Q&f6^0800PneCSD33kr>)bj(hB0nmp+xPd|4gj82z?NRk0`Ak1>ckk(;Upjrvvu9Lmp_GJex%qOAqYs*>D;njqmQ_-Ks z{9L)JE_D!l_Pn{45&GEh)v-&vaCd{KL=@fG$*ewA^Sj?;Q$8A{p1ikTc6p0IZxMf@ zB>zI6m{_e#hY)nZlmo*C8JD!t%c`cfQ_oU%OPdy@zr2$NeQ2w3W93+!;kA6@vlG>k zsiJ+Cmsre10uM*?CpH$VV8ZX1UvIGy6=t2g%5%CvGOd0*bV$EYIa${@r#_?jPVk$Y z1rxJk-pSf)(3VRE4{HSno4uEg95@;}`5qadIy1O0$r|dJ7RFyeKP>r{%=~oW62JX> z-+SWQg$DNTk=?_gU(V?`<@C19PR1RZiQlFh_a;~Ij^kChn1fl$gXS;3C$Hq8zOEF! zF85N58M`OM@c5lFP2)Jz%9{o$mGM2ac9ZNTO}41K+KES|Z6@cMRZm>89OJm{ee%or zo2;0uH{AZW?viM)&a!YPUMrZ-C0%A6i|TlHPAr3aM^RN&$o!kn799!dx_$O!*)k?m z=TpN$S8->YRc*yivW}Ga{Qxpwx~&!hMn_*ATUK9w`eC7`%{TG>5ML5wOES;(1NrZ+ z1)f?ty()83ewSAy{}vOsRoE=~7v~Td{U(=M;IYgfa>?q)(C>SMi_nvAv26n$D@X5t5$mG9YjFO6`I1JZeaJweKKG}RomJs!ZHLK_*d0cP z1{$nXVS;1lpHxPe$lmdf>NzB9J}}AlxZdVv1ApQ<_kjdf>a} z%)(F1#ZSHM4zk|8%Z$6+visphDq~r3kxvGkZ(E2+q~g=!2W9T{9}W9K1^cE ziJqd=M`Vdx-l`79PFJ$VbYyRysYqM%Kil52v`RgxL)hYU2k`EHoUS2oe-Sw8=xMpq z0w-CcY)=(#Z9D(`^a)w;Wzs@IvrI0%t*xA7$xZ?50WBKQedZ8(WV`d~bU%}Mbrtun zxYH-R8!D%7?&j49m7St{lR?{g#T2eW8xX6b%(tM#9>h4v=a2GtP%3A7=ei{6$>{Op zdTv9U#^Yp!U+jr^aZaBFh*61}X5~ZIG=K7nP_8{Si9KQenL#6e9% zTAP_}gVtULpTzG2g|ApoUY@A<;-DIG>ZVU23k%+@(M0IY&La^xQKJ_x8#78FHIi8e zM1|QN4?8fuN|cJoDYQ~y@DON}$b5RHgC*(f(-^$|ttL7r+V_v-1fu%G3+jjM5?4~n zIE=oeiaAPWC%l?jdm?{%ss2$z^W3+LZgu~ei7SwjJe)B*(-A^5F1$YRlZj_neIhqq zYN-1l7uHnoyO*uG_Pt654buU4)gJCB&Z)C*4q6A&qIKg(cU_Gi?TmZ;xc_0}-T77O zL}j}z9(jS+`Twl4pP7V|^gx&r!ISv2rKYQDs%I*yNG1`SbqvA4jO^*9>_8v`Jz?5} z^)?cNfxy5Hc%r8bndI)}MusBRlL!sGt-UBoiXa1E1s^520I7ceru(zA($CyJaY;!G zR2&5)^daGJIFP#smjqM$K=L{SFEWXe|0e;5gAN$@`_;05z(B$DfV4y)#R5i97f5P^g2yOE zP#gdxKw;M4fS@p2pa(z)xBzaFfC^yg428KsVXgpID9jxS^MJqrR-OQBD9jtkZvm1J zVa6m|F9%?BKNC3r$?&8od*d|zpPBq^Iw<%%l#ovar5e=9@vB2r#|3iWNUul4OBVa)R(~ZGMf(0Tag@Z~;Vxb5e0*XSReii_R zkwX8(1z;Qf*86Pq-CRR9?*GCEXn=Gm1>i@~49E;DSR|5CDAzz5Efx?AgaMr)7+_g} zaFC!BAOHmyxKHV~j`yn{&P(}((kSMxa8UQ@XiUHui*Az{IF!26wWt}Dj6kub(n`>|@P-YMi`uBSqd^hU# zd%sr0@3^31w&bxf$w2*2df)Hy03M0}PneE@<`#JPqO^2@ZTYjM2RM+QEh8ur^cDaf 
z0aye-Th>q{#fm}zTZ~-yRYWM@btrg0%{h|dML;`^T<1fv{K$1y6lZ{>aDxCwobsv$ z;Nc8Kt#fk$oeG7XD=^JJTW(NbZYeO(xuDkhxkFLw9?Jt*L_hD7p{R96fOTFU(@%#( zd87t7t-GsrZeCC{g&SnuS)tcCg8mEG=oFsbP)cYC^rbjB^v?nQV|h0WFX%2+5l~8m z@O$DsNM^&&C?bCMs=q!SfGH~iNH7}$6akuam?H9jMFafiKWM1`K!ZyFVIPGc`hUfQ zO8|ZpAPLrTa=&?$-y)rV2K(Dm`UeOqe|?1i071QO4Ev{h{rRN@Op9a-#ty&^0o6K+_iIkDAXRe}uqX<(5z9ImlBOxG^ad&p}awb_reNYkzxCBlN>fq(&?kNp}x%|2( z;YPL>hQNTB#@5>g_$uM|Ztk{rP-}vX6ToJJ1b|9mKyq`%gWquo;iZvqz(%2wI0PCl ziN=V-QG#%|pzv>+@g)=OAaEe&fdEEz{RfrAV1Sir2mJ*D<0E5Sz;VEf;LR|s)Sq}LaQB11UwjcrIJm?91p{OOck;hsNW`DxK%#)d@y&QxAhmKc z3?~IFxPQ|_fp7me!>~a1!zLISjsvXXzx$%mpxxU9!(uj%7mmgO;p!$l40x8e8HWCo zFGlhYUBNL};9P5SU#UNI50^xO_6z*|(lf9M;D7X$l=@>`!LgJlg@5b8`dD5ykjxxY_}p05nGwlkVw7AbU}^KLQCT9m3DAqN@t| EANzzB?f?J) diff --git a/audit/dataset_statistics_plots/normalization_quality.pdf b/audit/dataset_statistics_plots/normalization_quality.pdf deleted file mode 100644 index c6865465f2d27054d4c835addc9be4356a167b70..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 15186 zcmb_D1yod9v>>1`5=yHe14@X%Ff+^y-6<^~tqk2Ugrp!!DoCmzAtee*Nte$(2=xQ_SCzUS;bckg}9-IrBOT9zBm!v|q~It0xwhrpq{PzPgc zh_Emers?5m28BssU9h$e7EqWP*22sgiU0;&go=tn%CKG05zcTKQ!9H5D0=@c&cQ^(%mu0s zkd>ANSTS>Vfx;B*02U zg?^_mS+~dSyf{J(FTm^Z$`;AR=$*~Nef{Hzx>dr|U86WMEnC=|>d!e9osK$BO?h>) z#IB6cRiY>J&_<(@+%_*G=B+?z@L~1wvMXe&OzBh6OF8Z|$rn|Q9X8Ln&EB8Z{&=T^ zBmAtw?UMy3FJ`6+Xb=a?H`FI2#7&GgOP*fn*CH6~&)9fd{nh)LP<7$kesBMKxoL_* z$-}VA125+LZIZ%~QfZ=0(18^VIW-j6}o!jg1vgUzlbe2`)Z!SU2bkPK$ug@a$+Ioy2wjNVIv=}Y)$&F--(x~FFpCne8G%qiV*JJ#!b=#rXVULdOjesQE2zD{_jI<+%3$xe!2 zmor{tYEB*v_CEc(RoI^ODK~0%86guc<*3nJAGUgXRW@MT7`QzhF?Q_yrhya%}_hPkD zu3mLl`h;#Pfx70-zkg+l&}(m^TXVYPmAg)y$kn%VKL-uU2ZO@1ne5AOFWN89lugKsbB+&6YslP{~b z?kyLOT7R7NwJ)e%wu5x`1eH_H%c=H38Ob=t@XM8kXZ602&sp}*Hd6#irw5h4gVsEI zGyCq!X2f@XsC&qckdEX~k&H{PJYVqKlR!|uj0AbCTK?-tP=bVwGEpiQAM1H*5m(8c>KRrXD>EP@2FgPdp zh)ay~(CB4N*|)dgGUQ`A3~Cn?$puQ<{7Yl;h3Gc~c^B*(dNi<&a7bDCGMIN15 
zvaT_GMlxWxxE_=}y!^o(UF7iTbM^L{l$|LDD~_2FFtF@9!Tk`k=P^Mq3P)i0e#D(U zR|^95p@u(UV6gek-D1#yi~SE*i@IGw_L7&F$m@|2p~0q>nQOxsM2; zU4-f;@lL4t4_%cGCYm%zM?`3b=4>&+iYlGh;?hpi1++{N!GxK@!s&btVE)$txgP?M zd;)*LRFnj+o|l+#@mhoTOD#XVlyzIq*t)6RcckvlhZK29xfczOeXV7@z7T46gXx$l&<3s$}Jr^-=C}JP8XwC~P+*R<24^^C{`Gz3R_J(-xAX0YzhZ_x#M_-V(!Hw(ZO9R&*dQArc8Jx!M^m-kvagJO3@J~t}a_4PUv z0mY|Rsy>=A;TMKABJ$J6s&nQe26!^~B`%#vwc`&+kXB)3VU#U(e3%*@6LhnudlT-K z&7H(urZ=R)5?<=0ViXRWWTMk1PzYf(BP-WwXyd)wl31Wp%lyP&G_r~Qq3%rcmhGe; z%f&YNF5|9*8z*gu?%Z0SY(m?Xd$6+IU>WDxt_&m&uc>h}kf=g`^nO35xAWMG`oyM$ zXNLsG`oeIUpP3*hr93@N73pBw-@$gEfsQ>(3GvM%vOhI3s*90h78*Ik}2EIxwSRv7Vtb!pfN~! zOV}yUNBP!3B7Ksb*qx4&7%!bZlC+`exNcESXAR$u*BJ>kmxM*5t7%P~syI#!S{~DK z%WdekS+~pQDQhwwRS8QYoTk1veWOi3eLPz3h~H#iVLJDQ$42UzFEiT%%L6BN&jdYi zzAihWlFc95hRrCMQPbc^<+c_ta0uj*#IG*M=dSd_rVJ;(7M}KDjVv6c$UKx5di}CE~dpQVWe6#hLzpWdpQoLhvO_&|dcFE9_#H2>+^I@rzVOYMR%jjUUs`YMMw14XPysLI6sB7y}@ z^$?EQZKiCJEsXBsHO^&X4p;%drsn&e5P{=U9U-+4FPLHGTsjTb~gv!6N}tIp0#c@&>(piW3HK6hK; zJlo}0D=j+EtjNUrW7j9%Pr5dfjbd^09VQUqK1?frRsX+3Fca8(<6c#|xO$W*m2b$1I*msySj0?fZq|C< zjdZMkV&yBcM^j&C%=%QP&PSr*I={@mtfa^NLDoG@A0ZDgct?mmiSih97YG)V4sz6)gk7nj@T~lkio|T)Cle$etVD*eEk2dMwGguJrSIEaGJVW zJi|{xz8%@4LAS6|f~s_bBS>nP%wLC)T=u@toP%|G|5momZ#X1b{oY-=(KF$k6~VFc za(z!~EYyaH-o)2CqeSc2D+>B}gJw&^KdoyrqHOvHcqF?FI)jhRxy?&^xHswsvpl}U zW*gg)ntw*O-${Vy9jgHcTeY*=;ytTsE5$RZ^K)CvOidZZ`XV(l4e9Pr-nGpr@Gw8~ zXgG6NMVT{n+D=nbR-4V@_()A{>#f;(%_h{e!^$wDR?&M;Y|-s!4sHb%5fwKU^en06 zFDuie_ri{~td@(;FQ^R^ecZjWemi$*lgKYxSL^@MQsyquQi<1vCJ!4mi8OXI@InwU)Qo<7$O zbH4iV`-FlG$}g)Fq%s>d%bk~Re#NPO-ZJIB!dVj_t8RoldU7`Kc&LQ3zpS{jWfke)CPSip<^{I}FRL&bQbUBPRgX6FX<2J$V%O_u)l)iW9r1K>!kI1eCL`LJ_-CI}m z`9d4$P9CZ9Nvouv+o7FW%|9}~~KcWVT@G-1ndUG4eb(r@P7E~n9*WTlgI?Fx_T_kJGSr2~x&^Va&eyAfs2XdSLs|;Vr zf_^f#;FOhP_}TZ(ueXHj)l=!f?P=`q~_cI<( z_^@54mB`f_E^My-0QHvR1_pn^*zlY(qkh&BK&=ddK)iyF&^BD#Ep%j-C=G%WViN=74BAkm*Fl9I#m-#W7q) z3cj%Wxkvj=Jg?OU4<@6k;&c*vi9U#Sx$-Vw-1fSowQVvz^X%LG1QU#K(*%L$Q{f8- z$a=s|1MxRmy^OkoeLXK}#TL!BIKEBMPSUx&o*eH*xoFe1r{{M 
zhdG=Q&aYY$;-YkeBx41z8JXDmLOcGDcaGs)H{{aT6CysKY9*mxj8MjmeM7>k)eW(y zCqHL<#@OF0`(}rY-Q|s}6H}YI7xL!|aM+N*ce*Re!y5_$uG94cg(gqrl^U z;AgShm%k#TCU+kXyDAPG>5*+b`^8G$E-uV~C^I38t2e+@)JpJH;v;C(Ihl1D2VuCE zH8->Vd}fsTxm&u13cNyTiny634viim8|Jh_7PpZs@3Y!44F#XKE8cxQu_)GAtZR9I z1`pU53H(hm5GFzX5(;dKD*f3@)e665S_${geJ7l2*dSs9|4HfR z23h>8wfM#e@gmvEw?qtlhQ7sCN#zyiHtx*KcKZv92|epmvoX{vxNxdUth|PW>;Ntg z*cTvCe-Ug&qJ+Xv!%1&!2?X%)`aQX~u)Hl1!BXa1kzV!S!cDudW>t+IG}q8&ra_!t zHvQUBRWh%)3!^tUyt3zEJ;hgRBA(LMbygl45~b)(Uoi`R6l`_bqhWe!hr4d8I@CZc z5iVd=((rzm^r%RC_v@)MZt1e)F_#4$?pF}`_wEozTsIAHNa{O{MwMD#dOMV-dSMZH z@%1*8%UaK_eMs0`@eAo0E-UH4=*F)LBqyMe=k90flk@Kw&r3&+7fXzruoqBGa zPMJDL@K19g57=bDF_^z_L23$MbgY=B5x$O!Sa-?p35wsn(xjHr#XxyVA3r)+S*Ze+ zn;N6+psTUVSLn!1vfCuA@_d52Qo%=qK6>edH}`U7EG0ziiyQxVDeJ~{N|_}88~LXf zs5PIEU_I=+kt)n>MF|@egC063`z7ee9bLLVnWwycr{+O88(H)2%?`&%^exSZW(JkT ziPp0r)KAZDsT?|DMYh{r@nG{|ZzcYm((;ven>igYPk%Ai10+8nxyJm(?h1)OT;x^b zMG{a3K4ODR1)4+71saG~$E7@x3n=0tCM((Y_z-wuya6?isJomJN#e=A7DrJ2RlaBc!Cd+Ogh%2L%9f`WS1iAVrya# z|fR1F}2BUoC-)3OFFUtC*wl_Ak{~RLG2$+T68~;GA7Cx{O=qjXovK+^5Lsfjnc$X$*pX?YeowGvX%y93qi_vgq( zW`^(^O{IwL(Vm!U^Ye^fb#WbgR=S3JaS}C-M5K+(JpfAC5^M` zag7hXJ{ON|elnQ9e>BTSyx}#!iA$bG`wja{J%N@B4KTN7kH0joT|yAXIWws{bjL9d zD*0!@Bq!=-0^f5nMxqk>2kz+#KF}AEOgN<`;c$0iY~0xFMv0klrioxnk&aErObWsI z3`nlTtC2!oi+HgO1K+u&Lu@+I+y`jjfQ%8z{};Ol?kI%xGaX=t-)GnXzU+Y3$B9@Dwl(M{9POD!+tc0%p%y|^PAQa*7d_10HiT(S3QeZ`NV zx+Ki6-ui7xpQ?7hSIXFbq@*eGR>kAIVt$n(5(64m`d3C=B-AHDMOEp|OwJ%1!YKp= zh5XGHNIqB3aC6co%!M+YIU-c^kpc5GeNDodznJ3M=_>f$!#dYRGi9~Ur@d@D*L%88 z$~ej?BSF@RptY5D$gZ0^f0ex3nnE&1^e%F2m@?&Qn&GQ3+kwDrOa5a?LcwRX-mE=k zzi*kgNIP^la#)O%C^4;Hz)+@NF)y!DYU1^KX&<*YM=@VdjxrYa^z-E$2`E*yiK^}3 zc!62+xR%s;Q>pAuanzgqh5`cOE$K(=|iGO6LpbUYgMDXu2gbT!fS z6`USpF$R< zLjE9nYcRpnKpJQ5Ir#p%DPC>*1bkNW(`PD!samgk?<{^JrKxg{mWRzLsqHon6<*t< z6qU*i-p;%QPcbnVzH9^{s4$#^GsqbHHpHV6T z)7C4Gs~lVd`v+}M%nwlEjjIBAi3 z<5RBB_6BElKU)X03h%^teqBC6+Z%mX=WQB0%}ya#`ha^O!wGy7*&Kg`BskrU#lmQo zGN$SU?hQ-l0n8tebt3rxA~1nxP(XB$UKb^T9TVRMp5D-hw`$T7tO3@e5*&HpokW`wZm|j73#XQ<( 
zA|AP5-B$!sx2`|nEFPh$TfJqpeb+Hd$;hsnY?YFnub=bel+Pj0Z!)Wu)v5^DN4alj z-y451N}+m_uNUmYZbC|pJ6}F_E6*}vu++^+MB?zA^|4Tiya~+L9tfBs^ zgW&LtF;!I7p(MWMv>Kh8$K1sNnzNOrTRF5xntqBs=J-Njf90joeAbdg-{+lNr0Yi#Z7?Up;sS+-E@a3xpI_7CL5$nawRJ54jUetcN$cT)TqC+(-O64 z@$yJl`T23lsY!cBBT22i&eZ7p)P|6@+J@~ue!5;(X9d!Z^FcD#3ddu2$lPe64pD_# z$(YHN-kxbI;fzP^JP&GlF4;O0c(|c*gxz#&jr=UtYHe$e6o>4YU`AIikgzR z1=&^_Cs%>T7MAJn11vmJrvuF0&g&3Z-SHPFXK|wl5i(g%+6+-mz~v!lrxx1pzTgX6 zBZbgGj8;C92Uj4Yxxe}vrCb7^-v!7i@Qi{j-8~x(+=#)GHMFs1k9ur%Mf3qv&MHL)|IqY zJHPGf1wDC0XWP$=?xf?i9l!tZROKb%r;c3iV;UEcl}EQ;c8M2@=z7)aw#C(#Pjnh( z>pVH>oMEMnl$KI;C2I?pk)E`=agNeQlgRQLTb-nIukxcqrmy>mU~X~!D;?L)PkxIV zz8dy%#?zj$HoVVl`1qUH)WBlR_KV>p0#E(%IVBuvaa3-#s}%1ri669*uH?{*Y#ku< z0m(Q}L-XIy3@SP1|hIr-T2 z%F^tc^vLu#%w8E!t%zd2P%_8I=B_@mxGq3T@>|sdYr{S6@p%(xACPqbA!>;VWV!F1HTYPVya)=OfU6b1tg>%B}&}MR`HOjdm71E~~FZeDdYacqq=m z*>xxV4CN`$jZiy|vaeA+<4)Tuc}rY_1Wz@tpD@@|sIdqbFHmRxSbC&3B)NwP2Z=hY zb$YzTNEXIEbG@u4R7WJ+>-O+z5rgq*(m+lmP&_ewi@zSnGdes$XGI<@=_xrU^C{nb z*Orpi?1}WJw?qEM6eRl0m6wN}2jOdraC3f4)#osjhdG-@N;^Fay{$bUQ_pshO5g~q zRXJ4?bUG`YsEUQ(N$r8RoQlvkTY{{B$kkiSwQV{1Y09_^OEHr-jQOg<{%tv}iOvtCFqOUX*|Zu?ENGnv!Qy(cZv3dIR< z&nUgS+zf7O5UoJ8`pi}MOp7@Qn9s3GvWJkIV%RWK=1 zqGuPm(qszNd!AFZ9VQP@Ql$ z0-xNF)KK#+MUmBW^66x@_Vn6b62rBDWdcER2CW0QJt*Zx{0+C^isZnqS86@M|4jXM zR6?d!Ps7T$L3}ntXp-%1VHx+lDJ6O#^b3h#O5b#DgTe(o^E|$m`?vGsX$j&zIg53> zqY|&A2S-1z6fJChmSAF@)L$o;5rY@T^=7nPovP!DfXO%JLl!|IW# z&XJsw_eyQc3su^JNrn1WzB;7!a;^!T@ z)$DCaO4OqO3R(O_$LlVVDtZs$k9F9`EAYN%`OS0p#A7RDn1(tOSl%5+vq#(Aa=j;a zI9Ew9k&$OgJL$gZ<2rp?)YiCW5zYODz=)J0c~1xA#9sKNK^br9p}NOcxK|H zK*#rt7+tju7gGbJoElOE-El|RVdmQmpRPGrDxOFVQ@uM)5_5ODKf1Vh?0M_c)o%px zQsxI7^1y#})ZfdX_iCGAl2~W6?>#wHSzR^V^Ab2KtgW&pxLR<|E>f0Q9AFig5_Z3j zK%pQoaDud%vkA`1(ZvA=h3{8DYq%P_>{VEU1b`r*g0%zcl>d$Q`((-Qg~{A#Gz!X% zM4+GuUS3|H*q9d$)*^%Gm9Z{3tG(i79$vtWf=h$7Q7S4bU{>Bs%&L&`y zxd31o7>FJG!HobniBeccc{3{uOQ0Ygj)uT=_J5$@Ku)Cq6xdstNZ4E0nt_W)!^O<* z64+73y6;H@C;U;908J3*6tEw?FRZJYWvk7XX&~9t^|};M->c0$%xmWEjK+3iyLb 
zkj9TbIOb;`-@o@$z&$+R{U7syKFH%9P7p89kVrs*=Eczux;n_s4br-p|$W>sU}R2h#XqWZ?Qw1@y0ZwF(ptCaxDjrwbfn?DZ}J zkLC9s7KkF>d&W?}kM7+y0Tuz+gY0{G#CJ~$xM;+F&@zJp;b(8GIj|P@d!VaF?33E_ z{)jzl5Wu4&_NiGy_ukC{yjVez`_yc~$g+pa4lwigo;?(>?H+6|ZU7(O?6KnjMeY*= z!ws-3_J%k@k^5rdpvZkf&cJH=es@1)?Gtl>^6iWEJxuZK69gmI-fkCQ%@w-0Lj!tl zzytmr%T*}fJ~a<8=MT8@Xugqm4!q1vWZ{s|GK%Bfd3kS ze|{welac_C!c4GGIEWY~f%spC0WolY*e@*nlED1K*Ds6XA0Q~*er)^)2&(Ge0rUIU zUob^0Q!pg});uWk-j`6WK#<#Ga=%<31>XBHSqk`?2~2u8fDFaa*1^Tr${6Z~hi)82JWk2_eX54XR<`BT&ARvI7-2a23Q7B-knL~fV zz%&W`gWB)I;7EROA^ZkIqX1j~4g=nY{tm+c9`AP;KkuL8_`zrDU-*C@fY#&g|Az6xF+gSa?=a9A z{su!J!G#3={lo{D?@xJ=aLgZbfe`hFu8{no8~QgsKHfiU$;)?u{`k=Rf5^cn0D8-R z<3l0-lnuql`v+YVKj@tPJr~W(|0fJk!XM+1f2=EBG=}#N{?G#GKjvciK)?QPx)==T zeSd=;ARC~*-|frG&nNK5Tz&yyqxl;?AnE-Rh6F#;`8Qo)>hC-QaT+)U`}a6FFA}K3 z{2hh@87`^0>9eL1&6h=HN)+F`>J8(32a{wz-Gn4 m0d#MBeqX`f+yT1h-uAq?vkMmIvgb}fXM={YvdXB+LjDI_ct4x~ diff --git a/audit/dataset_statistics_plots/normalized_score_mean_by_eval.pdf b/audit/dataset_statistics_plots/normalized_score_mean_by_eval.pdf deleted file mode 100644 index 14dea7df2f137c9e8488875c442b033217bc6d3c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 22251 zcmb_^1yoes7cU4X-Jt?9bTbUYP}1GqCEZ<;A}ApxB}z9UC?#DA2nZsf0@5i+S%4z> z?nV6mpEz%QthXMn>)hG*oU`NXd-m_dY({lySxy)iFE-=jNl0lUHVncIakX&378Zs; zwf)?zAy6rEFLNhXTL@I$+}7F?!UGi0gouh_Tf10+1qJ?9LDtp93j#kb0M%EsaIm)Y zg7E+PD&y-Vr{!htWewr|MNl{Q^0M}Dfgr%I*ibDyb1QonTL|CDQx8{5Eo(1`5x`bj z77)eS*9!twa0Wz>Jo%J5`BZ?I{7W3z-wJ?ogVK9g1JeBxAF65X>FVua3FrgL5Aav9 zwz4;uboB)|as&VP`S`iuFm6FOKLWzX&Buk{=Ho#CjG$70iacO1o(GuE(iD|5@^49_1p!vk$uC+M!8`RiG_f?F!}8W@C+wX6y75Q@0W>$w$PDv4r8oU1bjS_$kHhl>ddagc38m7UyxU zW<_8d#Wa6u5GfD+c+~TUWJjsjHm5`~MKty=G>dvNQfVs^%^3tR>NYfd+Z(&!)Ax)6 zS1Ffjr2IWsio4&=%_^wAQO`A{X6qng#jz@RSuA_J^|c-}%KYO~XJ~1z^ zQyeN{H*^FqFpE{Vs=asr*;~#zc8Oivfz?!8jbgo;T;gFj=Ewk-ztdA}MNtgC#R$lh_P85W8SK;6gJnMsbEznY2wtw=E56vhoHAKjq{m*m z!rj(}o;y!(@pwF}REPLW7VG(BQd;RkqaurmK@CLX(rD7kaB=Itt9QTcLQl1!rJ(|+ 
z72ky??ecFcCw+J*XT)}vHt_jP8S>{3Xo5S}9K1|b2AJ|!1p9lBwnV>#X9rP@&X+W% zDIZOmw?%$@-9Zz~c;lG}b!<2P+o2t`ogUpC`(~LM1wX8eJMDsN3*pQj$R$LRd|sN#wtThkKz?3<4MI;=LY;J1l0 zx(h$88tupKb@{(Y^c(3ooVf>~oP0;iQCm@S-pJka{rN`L+xirGxeYoBA8ifkYxVP+ zR@c3BV~ZAW+(iU4O|Bh&l|#&2lpik0l+fTGNcM?M(&OLW|K@cA-ZD+vM5>6vkLtY( zHEzT+739?J_qR+B{c^wFn=ef#-C|{-oiV>sEh}J1o~7<<|7xJu(uOUA|5wi1`MIWr z7Vm8V>D*}1*ee>Y8gX~yrVqDvr}l!&Va2%`)1T2|FFH2S)>iB+V2;aee^o)GUsEl! z-5=Fk@aHM*zm~1S^z5kd>8Et=trZV5`&cwboY5J*>zJK4yhU1PckzPxpLE@S8WkK| zQ8v#o7ae5p4iWVvS9W3Nklg&-#UajEU8tsES2)iXvbWN)hmhplrAbl|tgn@k3=00T zu~BSFL1BA4jPsVK1x_7T$?odbYDueK$MyQBd4eAh!8ODay<5)$## zZzgz~9E%4h0mG)%}O zBoJ(J`ZwC;*JjOo<6I~D$#V6FF^g~8`$I?#jFs6$Q@3*7-wmdi?OXPwN034}%A`RF z{~^`L+~8HEmmT$kr5gD8$)cLgE^}Sx176#jZWYtUUODx#_IaB_i}&B(oXltZBwL4pKe84d-MdyG)gZn=UGZ|vn=*fd zZOg+CajWi z;_})#w_bB<{GsTILZ({s4RQ4mkd3h*Q9xzktXYe0_(x+8roB%!TjX-BtYzth@%gRH z*AZdYS)@4?V+w{kf0i~E`l`4mRmcBQX45o@XU8me(9%g-E%FA&=Le9WFB%G?_nH<# zKK+`mt!sJrhh+RbG|ia_0c*P%c;`=?i&)#X3o7iNvccDp&I#;SdMEMDI61>K)39fj z`utGTIm@*7Li%rD3~FB+!|3~pD2%X0CD@xc`=@LRVK0XwZgj}5X-iss?<~s$4J~uW zUByZxBSyy!^o4;AOK25oV5p&&RDW-BfNZ&iq+2|^5am)b)h)v}_nT%<3_!e){xQ!I z-=HuL&?mF`3#iYW|1CUIAsX$WXrTq-^D|Y9o<_xpC@($Nj=AIxAu3s89Xh_RVPdFg zk#|VTjeG~=r-zl==KSgTvL%?4>)4|n<6#;JN{Sc;S>3NX*x@_z9N4>uOs%w52`uwK zrTR|p!DuMPg*8OOlJAokF8>7c@$qD}d$Uix&bLUr?)~xusja-hVy4!gazva-{9r8~ z$?x+ZD@$JN9zU8ZUVA&wSSjJ*HxF5I=s&GE3xLiewY8c-8f(fM$Li_A0AEkx&WB-N6ZvBtdMJ2u27 zq2c8IrWu=w6$*9!V{MryJX`7*n4A)UW_QQs}H(KYXO~#pD)f|%jve` zbvIl--AY*_dS4n(6PA4G3RbjMk=P_ZX=~gY-Do-z<@^zx1(97RKsYMP^3RVS!$tFxWvEA3<$~!JD$(xDNCmPAz4^^wbeoIQmoQAvw z6@%iT0;>cneN@8x$d)9#p5t!H(<}MV!dA|`V(u3Mbne&U-MSK%@8~tC_quWtZEA|) zsGG{YL1gDql-YI5)~ZzFn}ZVTUmsi`zBdiW zARxeL)<5hv>{5<1#oxRr=gLsc_|))ubrMzQT5!P&idWE-YaGw?55=nX#fMVqEbOje zEp}wvOdo!2kmD-gEewR1r?D0c z0RXK309uHcS9oX=m{Om$!jxtQ)CXiW@D+|mx$X4icS1Pj80?rV=mo#bd-u;IzVi#b zUo_(-`EIo>KLu;fX?~JlCsyXEn9@sHYZJTL2-VznMSQ_FlbjMv2QrC^3x>Ve&Cxs}14UhX)52#@{paAyM+*@c9HM!V&!cExSmfMB*YhCi>?a7;1#)Xjep@R>C5t 
zkFN1YXRSoHLFRhfbtb|$W;L>ETh{N29UQ7jgpHC220t~*wDxeMQtE%Fvgbd-QX1sk zS#3Bm;!)Z#%vy3ks9nwib6= zMT%?m%9r=tr3;~f(U&sKDe8OY*|6{Q$6V=5?+QtfS0qnZD?w9tDY=kdY2>bwdp%_p zm-KByrQF3kbL|6LU8>r8Gj{^~@ld5gNIy|OGc2zfmHP?Bpbxou7d?FH&UtE$`pmZ| zZ@KA4xrMNb?;g<4Y)2S(^KRVVv1Bt~ylRcJOg*!=!ciRKNRs@btRdj?)$DPxZ;T=f zgcD5MQfA}Np6}$(qmk28*zDXmJiI_09dj8;Zy`bTfOGzLZ!u~r#G%75o0iEApUayw ze}1Ili2wHJS*-$%A|a&9r$oDwh*FI1C0{b%4Qm;GUi8lhnc?=MUfxI=2DKnUve^}b z+;Fr3Xr%Y_4#&i@%_>$zMmWYR3Od@neRPxfw6EI5J#IrJBr$uDt#i8awhHc~L?SGW z36k#_KFhOguD9uh-oL1PPy$m--Ql?F`IJyLG1|%Mbt0+D1I}bB(s&uI`jEBK;wn}# z7Vei!9D-T}Bv#ygnHLqDT^47(IUmU5IL#=hQ|Bl2d8)~U$#W9=wBHhXX-klHyW>`w z+sGZUA(r3_OIZ@Dtv52P5uYXmwLTVFHtHu`+s(i2%>&M=PQBRzu>Ur<%Iiv^o7|WfgITN>QO`+M z-CZ_d*sGK5*SdU&7UjAm|B*htGUw@BPsNXW2@zP`U+75iD;*O&UX^PlR3F|5_Qtb% zdm(Bvh{UD-$}2X?_ecc$_bl_|wdLpiZ^7VjUTtpm3hr~nMK(p)OW`)ytl`FHH@G6t zVJ*6~6+k%PmQs*cgOgNutS#(N5)r(z7MCbfC9R1#;;G(mQI+AA@uI9?UOXNS{~RrO zz{%w)Q^9$8{#zuOM5QPO7_RgWKWvknw%v*4N4cpouMg@D7DKPRl&%sbq8*zuN-21C zzofdT+WBX`TJas{d%>fdihg2Aho4sreRbq|ceI5WjAWEd)jSS)u=>%c{L6wwuI%~O zN*VVVPGYS+uP-L6XM7lv>We_=&G4EDTh5xaOTO$9)yi}~ANJiaVwzJXH&toFzQUy_ zUt}cHE&TGH|GBKMnmrJBQs3oI!rV@T7w(nxhIQR#<9YWE)`Pv8beP&Cj+w_{6Czk2 zVVqad_Q_ni;#<6glD+BDuHBfg^esl6Egiv>nqlVovby&E$3Z+@#8EvQgC48>8JS5E z=2fgyZa;~`MD!+|JKD-0-M4(~$#ly$hvm8hti{RwNwjRTU?2Zz^5lgBir25Lo{qEG zf0D7p?O5~<<@^?x*qJ$$jz{C~XzpBe^FzPDv~a_Z&kF%!!P%c_yqJPokT?vuDsrlO z_|GuR!})nML5D#hwqcSz?J$?p(&;LdZ~A-GE35P+=o1<|ARVW3vJWQAx)pBSMvpWUdpxW$ zkz(H)NE}>QQ)&}Fyy#y_oeG}ba409}Rb*mDB0Cbq!_W8M=0n6Z;XMb&#egC+bjusx zM0Jzfr=P{P70gjb_)cHt^Sauxh;m6abn?1%B*vm~0Z+VkRM{R4w5HjeDK-BxMOe=g z22_|PHjW|)iA+e43=hA+8HCbka+ZeSuFa4dLvC)@hzwvv@QyvhdnAP|b(>>|q2))L z{ZVIIZ8 zc$DODMc~>)7nK#%(jvl(!9u_#O=w)=H5-Ktj5kxA-o5E90=jbZA@ zYBA#`y*&DmEtG1}g3d=)&uGRzou+Tcina8Mt7iC2Fj=E%4C;%5b>r(xfYd?fst!tfqYprEm8Rhg>aH;(^t`QU)FxSe`Q%qNSM--VNe zK)#>8e9kr@_4YSXzj_>=_gGF6RChAZ-7;gNsX#N0l2-Z=Xnu`-NxRdlvIAIF@-@+l z ziT6ncV_Ns?56h9CQf*~TFxAfM)-nW^+WXyZbC&h*4}P+jX9sf)DqH+*6gU!m0Ovi! 
zID$x&@OW{Io;w4d>)b@i{o%xRuVd+G6xY}DoDw%K=Vuebop$PwjVR{}*Zd2yzIT1T zxANr$dBjic;#qI!Aj4@6^-1;w;b;7G7_an+67}7nlO_5eNUn{(fu~l+r&U#kc3+ND zX7RqKNTz;6)25?uJXgm@UqjetUU&9MHpaAcbv>KTqPRWhEfLL^hRv{Xr?;gP3vtmR z)U0HD1|PMECIw1|HTmBV+$*NjUKhM!6O`SSekG%Yn#MA7Yz}L*8dc8XVZeBVhpue- zqaDZ%6S4b_6lCrJ9$$IVh{)fPGI}``U35inK$pD}H}JCG*b_Z-P}6D`>+ZG$Zd;{B zU`7|be71=!RGdO@@>ag(Gc{#Cxcqc6=>#QL+wCEO;B(8K-hotJHq9H@j47WqXx6po z+E&{Lwy2h>d#wiFqEnRZ3Wt5W>qbwo_N+*B;)ujQY2*iu6h;UEl3qdrmUzwr<3tjg z+|dANvRQhL@#aqA_R9qgw@+f*T&nrv?N=zW^ z#;wba7`?vl=Vqot9#SWdh0jO z`?0TnBkI$*E61M7B;$FrTjRN@B(C3M5%eDiCsr05=Iz3`(9;Vz*%VhgH$L%S&M>Ai z4NORb$NwqTT48twdu+sk61D+2K+hRE!tQ(GvnX7 zIMN%sD{kWG{hiijz%f~4;hk+8PTZXv*J5>@QbQ(;oN`pCvV|PUWis;^Q(pxHaX$%i zd}i;XzrOmZ@&lS>O26w|1C!)Vb1)X2vI%(TqL}@(-etElLp&G3Iu*aVn1Qz|cmuy-E7Ptu0wf zpHRV4i-aKsRbKFY!LQkfXgg^v5^0biOW0W^$|~^^F48besUL!2P<7Ue=X(5Z=y=0g zj%t?%l!wWn=rOtS%J14DF7ZTGy?7W)=DjfB%S^-nb20iO$qGI@-4iqaYxEDzT_5v2 zem;*{_ok9Wy0~E3_gmcP?49(Fa(wH_^7dM;92e1Q%;90$roAa+K0Q~fGaWIr@Sp6{ z{M@2|F-$97ww22$jVK)pR8Shyu3=gfr(v6_vL83*;nCV)TT~%5tk+w2`QHCZCi-K# zh7fhY_wDPQ-8)yVJ)3ivdVs_hNU$!9`wU%!h^@ry5XVIeP-!E|8n{pTdI91#w>~3# zLm&)g9{mbEFonfMaP;VeYqa}RxNRqKa%;>Jjl?%&< zTG0W-4At~ca;9Lkz4@=z6MbI|HiXUOwBECPxt{)rqj=Ot){??idd5L&>4l>mc>%Fu z#_DaZkTj2Z&FiWUCOKZ4&!x*qy-lb&h&5KT_&y((voLOOc~D+8MdXg!k!3RD8vJft z>~tK2*)qv4grZKKgzQmEy1T1adp==9q=)}b+9&vj-DJza?qvd(BB3`(^hJWCd3n#U z#DGXtNTdcwPH$|kK_ZJmudL5*zC*1?BhW|R^ROAvt(f{6obi4ggp%x?21s?6Lt>|Qi&eVyqM)Ic6XpIIY9qCR{rF2>N`<3Pp#mA*M%8my27GK zt>l*OEi9B1c7!@U<(=;we>hiiHFR6dsK(}_FO%M1zSU<`Fqb5E;pXD|>H^N6en0c* zzkS>v-yFYmL?4mv86rEaS}G9LXI@zMQC*8AkJCZC$~Bxr61BCelC$}ydG1u^YvJV} z#)Rq_f`{kwqlj+HaVjh==;vnJj#Ci_!)a(^Lejo|dsm$|tQA+^qs03zTH59H)+T0$ zRRP9wV<1VeW1`FyaX213+X(GyyJ-Z!#qHiFp-w)CX7Pc~--VfBOu8oiSe9)%U(QQi zjy30vdY*_u5HTIk^8Kkp_Oe^i@^0nN1{@wegBKoTHQ_sam*pA+F?Ymhi!Tc;LUf2=_uIB2}%K@Y+ywg8y5$~HP z*}_E}rb!yaZ^lOoQVyffIPd1};%&?vp>(Z1FD+@y)M&IjF5mmf?ia5b2x{P_$cV&&6dR|~_slwU1tCuv4O zEo+PdJCo_FtsZ?yNkZle!jOgeMemn*Gv*%abH$-g=PvqZ(=ChZBshnW;94UbFYige 
zzVTL(Pv~AkPHzi~E$(H^rEvV56y@d=huFkh1CQL>d|T>2Z{uL(@lKkN=5wh>idajn ztvIav;LeUMZ0*2@wT<*=9ViF&S23nKZa#cpl2tqw;V|62ik;4nasksHqk_s{1C1J2 zkR$}f=8B(56N>j*W9k4jB(d60Z5IOZ)kD8AQ?*>oPgr z8_27qHyCplB4Z+Iyo{UTAkCmR+}|sC8=Q{mpA)p1Ar{HN`jG+Ay>ul{Zx( zD;R+i_G~wbu5YN1*Q_30`w>(Abr<7il7ZOY2INIDmpRM8`+G44g*aUN&AUNO+Rk{K zmprg)^_L$)u|K$7%Bv7h6f|~|Jx}o-rNToB4Xv0C{S`{ydz8n#cqkh~K?d0Q z&ajqYd1a2nKd29fn-QUT<*|^KBBtFO|D|{2JxhMYtZ>X|02P9)jx2jL}29^r?db zS1yoe(lnO5kmbB3^*&mR>E&marSG4+?5S(-moT)!Q`j|;5`E2xH>#83@`aY5{N{^m z2Lvt-*Q}736A3!!L7Zils@~&_4#VAR!sa6se|XoQNq@0&<%m*o+Dw?xT&dt?A65u7 z@DUv%foo@ksA#a^%T-Oh-m)K&Pjf6H86QNsp$)+I1$Jb3?`#-ln^#fVyTvigzkIzX z{6ZtI(Qtw%+9sjn`KbCz+()0-*kr7v;fXOJ6G08AH0>BQQO1Y;4xLPnkvL%+y>!G| zmRtO&3l=7)s2@j6Bx;E}LeIs%1 zijX_c(AQ@ckDeW(bd8&2A+aSAn9B=0gDpigjlh}CUbdE7h~ivkhHYtySisTDc%M_@ z>vr^WwX{#I6p8~MDV8X#CAhX!8wI-hUAr6!CDXZI!N|-=_raT%eiieEdso4QvX%A3 zDNP?~&cdh_L#|k%a3Lp)Ov`Kc1MYU;Xc>Hs?W)iINUcY+ zL461zX$U#ypr6al+m~MG_KHfU?_Ay*{x%1n+#mTMl9iV&1b5>m&iwFZo{p1$+@2tN3KuPCahsZ@mmA>{|D z;5qdhy?$O)6#hp~R`iyCl4ssNPh4S0$UF1yPE}Y_oX+DJN@4W!q2P<-(%52}H{^RI zaXf^C5_0SD$mFUd#xg~6%UB*gDS)ZuI_s6FG<@OXnBRZgqL$GjpjNmtpX6Sf%weVw z41G|JGiK?3<3;2|Hlig}KVt;rgXoYq_on84z+Ihv%jJ(x4j*J#@(Xt_plLrAMzR#o zi)50(bC#7X84U%O7u>i_du03KsE#!U!1*;1<)dKERP>LuM3zbno!SVdH(#y8tGd!+ zT`7etTfSc9Aojt|HWxH6d}zL2?JRI-)GdzVmRvq_M*IgvyCmeB8N!14{iLv3Yv;Wy zi(g9plU;86NKnt+c~4bNs3-VrN1wJo?`1Pv=gO8)ZoYum^-#k`*f#H!BcJPQH^Jht z$S3#qO?Tjli$}#%-b&*ahGn}LzS+w=r^Xs%Jj^KJ7zwiywHJ!cEP^Dm%KRX66@~>k zaMBvBKTOnMjW#e*;J%u#vV7RyT1jD$vZ$7%evAHi1PuCFKRM3*Hc-uMlrJB|-AIGFA&4h<@ z>Ba9jYIp%}H)d|J1eC5rhik95#Xr8-G1z==Qj}n%V9PqLDAL~4uXFk90cXcvYm~8A zCQQ)2u5*407e}Q3-RmWKp90yrWK$v62Td5EBM0d5Ay#3oS?{m#A?odP-%eJjDSn1) zzTSs$I2bzjPYT;8eZ!^au$K-`>e|`BVrQYmzoMsS=kP_F&F-=6&_Wf*wmcbMC3W!; z*;`YI(8bs5uWx7GXW7F~l~UWTvcDDu*v^yb6{?R{0oL9Xc4fYGrS*-9~M1ODV?z9+L zpwytV-KxDsf+;ZZ@~UB^#x-xH{{<7IL+W-=4gAYJ=G zp+XuVgrvKWOb%dY*|$?s|GnWa9D<1d;Z-^uk#=;gTfK0IikQ*}H7Qa>xd~dHm#pGy zpmoGs?Z%09)Ge&~bb+*4AxP_D($`CYoSV(}h_Q9P`3TI_GyV)AmdOgeRe5EDRQnN@ 
zxu45BxGHU5O~y}x2|xYIV-ghjZeH&`+2p<*Rrf<*NoX5=bHFm46s;Zql1laSLNCJ| z(#H&Ys^>1);~l+gO5e>HX+~XB-n=%tTQ&go4;5oXqB#;gk2nJag!A#~0y}hkn%q3R zI^2BR=PWG4NTs-n<+&nJiF@Zl6(sc`JyZ`$eytuID&S-M%^TQ*{A+8D{|s|-I1jj4 z$_+;&4liQ*y*nu0nwtAa@^9{36{uz(io#RWWiV4vr7KbO?JuKzW+LZCA`TKbEO>_X z8LQU43Bo+vM(!_zp#lc` zYj^q7M(%_6xc8B8MuNw9&h&z4DtG|fQcY`QfuZ$UpA{Y^N$nmvO0cbLncX6nC#*0j zsb)P_7b8`c-s<~$Q|;Df?x@9P@I3=BGUXIyV@aO}R+fRxA}K6O((-h2>UFrHDdVlo z57zKSK2D-`SxI5sC%?4Rchf)Z5!(52A03NPBhL8}nWyW?p$!ffR@_!>=?viF+tY7{ z!}nMYr8*~@B#U0sJwc(P$oP?UQ&B^_%|DyAAr`Ie4z+Ll75;CHJNlnI)Z*VX9zh7| z_?4twWE2hzjvADzn+eQvABmfWGxN&K3U#9#1IK`Btl|cWC%#$4Cjqh)M1I> z9`ylsmKmY%oJfyV*~^1xv5+L?s2X&hs`rufMyfPf_t9%7y;Tlo2o1j(uYT>wpfQlN zD81;>l@su#c6RrZ@%jUtk|6QU*8-Ma6@LA-w(CgP!V1qv854 zm3Bfov;-=-(D5;Ro`X68kuf%Y-#{qc=&ED}rMiS`%EIiNh4rmEYvG5MLOnJ5jsqWa z(bx;I%Oze-R~y)-iTyMVUi*5ENne^1i3O11BKVosBf%X$+{Qh!U2$6ute&p{Z^l*% zkDifdY@y$|jiOhh@~Rqooqa}EaAV@ak+kz3Sq~e%Ty?`k^TMePNB0>{;W#E;nr1l* z>TrcIC#i;9MB5St^d)m8fh<%NKRs!0BE#QU+QY$rVm`zQrM+MQwxa#%3yJYkd%%mp z5&<*irAEJ=oR_(I{f@3p0sFWF%_^ONp0v>3JIYI3@EL1I11o(o8_seo;>1M~k2JlO zHMa~j%#`;67pc?Ytv?c7w4Sr$zNhAjORg-6{y?8epyvZy5#hM{W4ZjChL63omk16= zclS}-BJ_%oSPcmlg2B#mKtNRxA58*x74VzciG?3dq|UIP!IYm_>~Y&~M#^*F4k){q zTkw_GpJhR+(ZY=;MwkO{4!1_5>oLa{7h?v4FAkB6!UxIzJDZotN0L(X-{v!u9aM7^ zjQdtoT|%{#Eelo1f>9jUZp@UZ@V7kkctbacM0g|+iAUfp<1Mft2Mj6FKSVL0gyQ>( zLTXVt;AWO6o61m_iof)0-0oCHsglK_00jqy-FRq==wNBnJI(>6g0S&sEiB`l*GS$u=!^2q2KT> z4MDQa2oeJ$12t!f?J{bD_<(^Y4zO=PZhF7^tU%-?fAmb(L*0v5CAENs?txo-n`ej~ zMBr;)TOPrWOUdE4cqI&t(IYXVJV}*ZVHn$HMxEq@?zhG5Ef{$l?B;Jaj9d#Z%|{bj zqk8r(paNskc4NF8*S<~DumxKoQypqjZK7RQ&d3m5+54ek*;AfAEoS5jQ`1zL*Wtv} zf=C(k?{h3s43tB8_tc(WPxT3lvnR8i)iVHR3AM9j=y1{x5Y=^ zG#9`h$I2o}3QOm=y-}^V*D$cu;}1j{y=~*M!yp@Ayz1U( zjU|F}L}R)!^vN})3KsEq5Aq=aNW5p5>-=6TLs@(HW8t5V@d_6wP8Y@XZt0-Ds=nqO21CZ>eMqL;$GiW zytViN#$fWnexq58(O${tX`~oM?gyc%j}|0}CFinuU*@;z-zM}G3wv3ryyWeoy)Wwk z+a@VC<@hdy-tj1!H!kt}nwi2j9Hw5a!c$~nmC?LSE1H*BfD_3`k|_sUXQ=tv+VttW zOF5Kjjt3QqW=PNzFL3(&KQ-e_Qt+Ucz?1>^fN~640K%eQM+Uiz}!6>g` 
zPWp{4lTT76`pmB#s(%&SSaug%T6=%gP2UIq$+_9)dgW^O^xbUp0Da$!cosgsV(wQJ z%APnHnA>WqI*1m#TI_c!X0jOBH=vA1u)p2M&&2$$dC4AzfAKAy*F1$ZQHhMVSNH7U zUIVGm*fjZ;ewch^8pP(}lO!f=IqSsB)?URA@=O@3jEa=6zFLj+F_&4xJp2W|#&o2x$vd>8_ExNdwN$OM#<|REKHNoy{@7#YcKrzDT zsen7c9wBq1dhXr+-cC`Sc-G+0&4{fIdjTF^iHlwrq* z_e-Aqd)6**1hoogf$eBCvV=q7Hcwj}DNy-Bw{_@&qO?+xr=3K_} z`<~|m>N0xy2A@Cm8GMUt#3tZu^y3OCS>Q^^4^OW#?#di2QGtzzFDr_(nqHw3-a(%` zQjXVzMYiY{S-3{Ptumx1h|{F&CAx*MU-f^JZ4a-q9>WpFT}s30scfsg?pnmmdKDj? zsOKqL+@R7GeW{GkNuq0%MA?Wm8c>BD@Ib5!I zrF==sV5|1M&$An-(I2D8kaQPPC=!ntPB#*{m!4*p0(?R!bA!jI;NA zNW50WAfVl#FZD&^!k}5H{-evDh4y-IX(=^tyuLUY=|%fnti(au7s#E4wJNheT;4fRKmp49n&SjaSZ{z%(rYJI?H2!_QP>EB;e3XJL*w72D9= z0(xRf|DRFLEDbw}!*lNYsuf>3CeR*hg+ekXgF#sfR?yv~)m z9y3`e^T&{ewkUm((twz$D}sCl_2e&6BVO3F7tv(lSJ?WqkSVu zH)Oe_0HcLYz+F8(P)_ygK2wIQpvd)T+V;M(%6t`%LOU_bH`JBn{Tm^*THOqYv;o7Z z>Z}?Mu0n5FJ3Ytvf{!X9xj`-2M*%lR26hk8HdM@!EZPFwbARIu!Luw-tN$KTfIUx? zkmr7eBH$sH@2P#lk8rH4MEUA3Fj>qgp|evMV~g>OI30eT#W(I~r^QP0GxY9h`KHsr*>g5#Mp1NcQcu*%lU(2rT3Be7ZON$mOXU%@pr1(MV}L%sm2DyNCEWVL+ZG2jd!aWLJGL+{Se?>d^{t95W>z z72gEcMEfm4jr~*n7^>4_g^!9ES|u;;j%D3t+k&A4RUdR z%o>1110+z82l!>)e*z{%{Hs_1UrT*1lf=77_@R7NfpDF6;s{ZYnehVFtyU1x)L?Fi zr?!SgDIbv6R}rPW2bGEnZhm!6=lS=M_4BTJ!q{}Jw$2LiV_HAr1igpun>P$kW-zQX zI$kf`mn23Mg%QZ2X1axVNvay=z>4*m=bBJ)MJUI=Q8ZeE}@ zy7DBq&B4zJcWS3NV^5DbDp_>rL;SnhRH({+1sdq}mRV4U!qxPjWkWTV#LjX*j9ByuBxM?668i~p1xq48@d8(<9#QQTMP z-z<6P;(z3<@`m{}CUFH!)kcCa9I8{mQSB1A)FL|t$(xi9__7v{U0))oM~;8hyif2>ciO7H=R|iJEoN4@ z+dMVDf%;D`D`VxdHe3aRIXBrU+8C-&H(c$ME@j86r7UA5r!0>p)z;2F?R~s`h?XX0 zgJca0_)*FK!o;tfD#wXZppxdE*1zuL)MO3R4cH|-?9H82v_Y$TczQ|MnR@^=p~~hb z_dEy$HWaKNZS864VejVU>H&d)KuHq<)$+FRI!@ODG5|u5s<|_e^yJTWe^r+JmF0w! 
zj}HOigacVkc({Snh`@F6*EAD^pOt7J#VCq1E79i~j zh%12XDKaiV785Xi%m32G{3c;TrR{BOtbyb!V73(_2poV~o^IxrKq44GH#aaVi<7mD z*U7Diy{(-W1e{jd`MKFyy8vWs4|`WD2zZpp-`c|!0(Et<#)f)bcZGm6dK(~%3-G_a z51_0kATt73PyyP7tUw1PUkti0uY}0euApr1gYA0d>70P#*~NI*=6x z7#;io<$xbF16g38diGXcc7SEU7xLc;WsW=g_wSYd=SBZ=h2y?M*6)D6PUg0rfa%ZT z0TP~;V4@sBz#pJs>);no9&kP^W$q^b%eee7K5VG|$rl88{z3sr5@c^F;bQA#4G0d^ z^0Ic;1#eW$eUCK)Yy!Lo|4SQmtRJY35riMus`&qKVgJ9j0scw}yZ|r)K>!W}T-X8v zf)G9c-@yc75I7wE>jG#90p4F_0ow0+Cv|?;J^hLO`}sfRfi}Q)jxT`Uv1LGJ;KdL8 zkQ2O)zXO4T9}o=u4phcQ0PiVC2OAUs1UN1WJU^~?QtqF6Cv6A<>q5sguzjGt6Ctp{ zPhQ|{0NVl;{4FHNCQ2*Ft z0Q$c_k1Y+-z~}$EP8eZ>0{m9^^e6Z#9QPm~^uM3|=KK5e&o@b9vqVUY6p-O0e` z|MEHfd$KA4IZp2YjK`7?7&yxS?xcX%^4FaV5WIff$wGi}^!TY9-~_+!TNguqXD{0daZM?eAnfM|EZ2MCfU ze146)yeInDKmhMO=5-wUj!zE(6_2AK?+Hg+2=9qjc3|{8E`1VIf!;V4#UAL`Uk@B1 zh!d7hU|c;e>Yk_ugkdn+9=GEL0a8{R*ZE&O2jcJlD8It`@wz-9 z=1E9DUd0B!dH}BRs~t}WaLV+!78uKcqh7!6kI@3(i3DC?JU^y`7=Z6YS#JpNW&l*| ze+Oy46A^sCuzy_TI^=lv1u*r404EBLY5u?)adHRZ3;thR{}-vi@4w@&EDUmt=KdX+ zLAicEmw^2n0sWKm4k{%96oOisLttPCgi7%IUyA`?;H22Uf+eW%35);91#12m5j4VI zL{Poo68=ICzf$=@mF%rRNCv$8pe2s8+j#@g{#eM9EG$4)ga1{Q0y5Nrppykah;B}< zUQYHF5Fa=fjGIf417hdp<>n~_g*yN9jLX%-mK7TcKt(HWOCaOjzpJ@f*+4AJEgbP<%Gt=%?o<@DH;#xv!`jmNsrSsI4~lfrtyNAX-?JU1&512 zXuucHdr#5e;LH3cjfWo`LjI(2^KhS*kDG@d#D#y9g9(7)=rj!ujuC&B;n zeB7sL01xn;|Dzl?k03Y{{Yitv!FT5rjUOD!PSFH_b9tv|yxid6_h&g?-oN?s0`Blv zIeu^;KUG%%9FhK{A%IT*iv|PM>rR(L{M{c20WiG$*&ZLy-!|Yw05gMA<@jM>Dx*K^ z^7DWr>M0rm3|yyZe89xxG)?et-w@yh$Miqz3IH8^nkLBow|s&;V37K=t{?&&9#7E( z|MCqOH~cU8VBCCv`8|+H6O1K)v#oVULG*^ohk$=c&MDXo^hzcn!415#fqL; diff --git a/audit/dataset_statistics_plots/normalized_score_variability.pdf b/audit/dataset_statistics_plots/normalized_score_variability.pdf deleted file mode 100644 index 9af42030016afaea7577f2332ebc577db172ddbb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 34031 zcmZVlbyOSS)(4ChcXy|_1a~b`pv5T`T!RLe;>9UmTHK|$B{-BKK??y26qn*o2@Zvp zo1SyNd*Aill|N=pGRf@O>)HF+KMA|;OI1E0eoe@dE?gmIL1qtI7hG9cW&wjh 
zPY|3|*u#-oK-b0*jJX# zWtRBwsY-yan!c}%FNj(6zi;T;`1*pp-I>Myo#G1UJK5MfyE`(A|L;?84?BI3FS8l) zwl7tYp@0H>nFU_EAwy95-%qqpuUQ(EskN>I_CcH?x2$7@3ThAUh9x(7*6~e7!+7 zZny#Y6QLSYZX;f6j}H{VTttu1pQ&)RB@SjuMSGHOeTeX$ZX+&40&)51tFX}dn)S-y zxy>{;lXPMVudI}mk_IZ2~_=)=Wc-AcMS}xVd4E8Rp?8!tm;H(-I|KjZF`sPN!qv#JcdRXF% zyT{Y|-CIHBfhXW`aB9XA>eBih7UN&l%{Sx0OLrVw)(_C1=8a*cLyzm;UoYs-{`~&6 zef^@*%Btv(WLjguAN;4MTHYSxQMz@kyDRB?G|k?x;0;6Vh#FznOHaw&{!^;$d4dm z4$A!_m-IOD-tzBbDHY^nPza^P29v@SDd({6o!v{1cVW}66K6gtj?)-_pH~FEJ`=lr zp?Y<`PxWV)^zo30zyC>U+N}1hI^(J1U|PUO`fVR@(s)pNR!uVnj(GX^ctqe`=%3#! zXY!%<6AMqb+r|qXFG3#HMtZ|lC5EyQh8`mF&IX95E7F;t3o|r!p#mcC^;!DMpENq> z0^FgnTq!(`hpQR9znmIHN<$cB z3DM!sAZ;YN3;w9poL;Mza-w_gEb?`7oj)M(F7JCH+H2pJ5FAu2#z(4Gh<5S0v~|Z{ z#JbesthC;Y&a`NG3n)!e^qMi!GTtTHN_m^EP+(-qH)M8P=gmQ(%Yu@T*F&CBmA-jx&QUq-g*iT*yPLv9CY1w+Ax zlfso*x_WbQTWY~dCZc>Hl3%bFY*o^90e`(iPNrwKf*YH~r6qSbMkf1b1z#xFhT`T6 zX)1Ci6AJdGnmSb_39Ui0>@@B0eK?YBylqjhh#`|`OlK$$a=z~aW2I7x$$PV)OnRli zrXN!_Cnh#0`WI8xd9v-)se}50NU9ZV&@Sr@!Skxh*~8Jt*c*H0TN#lwco=#g%Mzo! 
zuae&e)r)mC3PtYsRcWec_hT6$go-*gW+>fubqjeZLrw_4O5T%GfVe_LIipp^VX@8e z2mmGd{otYi=CIw5myas-bLqHkm;o)#n2%Zn+;Lo<(G4Q`i~%Xoid$Xf_|-GItxuw~ zrKl$uBzEmyT8VCbBJS805$$i`oRaZ(w8L&+Z%}+$$hpyY>q_r-aSE1UN1ltkM0AQu zGcItltxDjvv` zNmpt~t?@&xjl+wcm;$Wn@%QAs#AD z_l`n?ao$|hJ4lp+zOKUa2vJzftgR1Em8GM1ET&F*pHHo8`-P7+TG7zPA^asinl0s{ z%{cJ+n|(y;qKI3jgJcj;kzwpR`HO<&=#9l1HK>~gw(<>iwb%%zn3(I>9uRx>hihCO zx?W01#(Z|O5~<#>;-SSf3G69%ZNWk21AmHO`$8-NAPT|ZPDbOj^N8VfvpkCxY1>2> zsZcr#gNt`+vAXwp=R*MB6KR3NZWcpFnb2|T=YzN?ICL*|D}Hib;kl;7`vpi6za*iY zvw!!_x^A=VjfGN+j}`2fMMgx#SDX19c(o*)WhT-CYt52nz7M@p;#pe?E+ICmC-oS; z8~z`0QGA__Mg3|Rib7RC8TAMXs<)<7lGCnuUqnoEq%*%aPL(wA-NCW0T!I|FBptoO zkfj=+^(0XbLCu2EqubwwCX8VekMox%Y6wk~^q2%OEOi37omeY?@V zh-O6zXau}b{nnHqhBr~ek&#o}hv`El&~lKn+l+yzEPb0>-9m*uye_YIERka9ESfMX}k8RLBe+Siz zN@jeZ$28aP-g-+?XuKH11@i}di4`t=MP2e8D>LeQ$D?7&EzWEt=2aZGFqV9*?;@=f zQ}Gl2>~<015VOwWTwWTntdFX7pLvSl-*_iq)e%f%-Kvq0P!MgcXLth3y(B8?F7Zpr ziX%xba(NP;aH2XyWK_XzKaNbguwEU?oywG2&L6qklo-9vSmRhjUJ+SR90s1>o_F(w zWskZoO(jNp6{!S<3az)b)0;;5-(fqSU!NhMwnw)K`n0j$dig%>*6(i}xG1}GR&o!4 zic%O{@|4}d(X2;UmDDxcVF91t(74NAPrMqjgoY6u{@MTUQ%Np zs;_ZM7?{Wy(Tp~O`KtonSR4#DqTWRRwev@u79<4Ml8LH1>;(9DKJNwJZg! 
z+u1aL`RE7UMe_P2OzB7^yd$Nk@f1b;AKIU5W9U&}K17BsK6w2#q_ed#uoJHY{V)G9)Z6bp(6ePB*?HOzM~rD zIEo*DIAW(#98a=nXt^xq6$`}Csn&2n)6Iy@I`0l6GDe)a)yCmrMkz9-0oSA@ZG1ay z*;la99O_)5&*J29f1!JhGK3A((Tm}oT2U_HOfVfNgd-8NGtUA4P~J?0{z4OeCL9Tu|Yc=7b2!#5xl7r6(~%Pz2qdS8&WQ>~N779be6EY?daL9lIrk0Q{%hLzO%OGhU#zUAz$O>JEa_^MTRL=KnxGcJ zrrxe4=D%~~`$8e#wY*GI z%VwEiN^N9jwXD-C<+#7TZdHCI(*sdjx?CvGwaDd+zGvE}=WR8xJ*YV)t3WY^cpe<7 zSa`p&$yd55NWbu{LBlLjTS>)`DS_#fd$-JaSNXKmJdr{L=5}_LniM(~7q~zi^J8>H z%0cQSzc^wjU_i-nvk!5cqc!r3Xj{bpq(QT7i)XQ4ykBTDXr*L+=OYeNlO=U4!8$hJ zCB!;T^={-)ex8klcm=RJ1awCwlDzzk7W}>3AW&A%?H2=Kwge60=gn=NN%ZsQ_2s+o zF^I>S1K%2WDv&<=iIZ0KKq)M)`7Vj51ec;eI!Qbor8s*p`(lhYGlC>|!-Jt3-GBf) zq=nre)shHp5;M-Eb%9cY@4=QFOhjEqFNC2yH~KkT2sQPRheIVFC9klL;*wMR4k(*6 z;a>e7PA|)09rjFix{%nC6@k5Ib3P^6v6Y-6^h@^_1wN62 zk`0%D-!F9afIf|Zx`e{yer(j)@hyLw+!62283BWPOn5GdHmsTgM4v7SVQ!aPnOy*m zZv+XVC~-RmHTi|561jt7l*hcZQl%@%zeYzUNR!KVk-0?@ROUL zpKBAF*_umGvtH8Pe&X9fJGO96Xq+g%#82B_L(!{H30frUeATFrK=}#o4CS?imoH9W ztlmg=sie8hgTqAUyK#mq)B?XtA@wkJU!Z5vjUOIc!uQKr4cqh9r246YmI$Gg2WK_4}E>k1+8& z+GX5pwF4j?UpyadQ|B05@+Qf2T-|-!WKNne2ly9v&B=1d_U9oZd*^%>R79X_;#6cV*$|5|@@H^A%cMq;WERiK) zLUB+))8YWnR3HOWrg+RXi}7wQSLvletoniOblgDu4gGGt^T_`huJBWu53*m#9V*+P zByrnLx2j+rTei#+^#=|c65o0?5z+l@MeOi;?^fEylv8jVjT>SmWz*1gZZ?0g2uY%# zYA`mVBM$3WD#0Cax&>7~yxHUQ0++DQ7qg3*8eg z3V23H#dg7S-_M^(fAT^sxRczFk1Nc#!oEEy1FJR06Oc)^IXL>0iD+M!y63rr(4{A> zs>M`y^KP}E?CT`O!eR#cXt#b_Bh>FpBF0LktjX3`PLKgE+4|29F)yO^giW1v#PN~Ha^_`BTD8(F4;+Of6 z^N?N?d6IAMUP1L{!r#45P7^WjZAem84Jl0$BFy5JZ;oKV;CJRKbUt`Cd3A-#Q*(su z+B)1!DtL4*6uTy1u8a7uR>>(M{vCd{fv(~So8d;|sJjBzWqd-slU4_;FUOtAJf@Q8 z!%E7N8s$DWLWp|@DA5o z{uMvUsgb_nusqFx!qs_Y;9FL+o^L92#|yhH&}opcbY{}GD^~y(vqT{mC!@=v55aC^ z`z!xW5n6}lA6AexCzPm zFtb`XnPESasHZX#!PFwHgLkeX=9^h~xc`)Vr;{;6bR@U z5Qc|1$0_~PtcFUJN>VQ18r>(x2`kThNJ7}yeIFU4w!qJ3Wai8$98IP?2)1wS5Kdars=EEU%Cr9y zu_b{Cy)Tfl+cTR|8oU&c9yt*$#aS8`^)Y-`+9vSU>Qang?*!p1)E=m2UX;_fI_yvK zaXb540q-Y6d!J&it&N&CWn;sT4g{u@Z0&tYh|8gInQdy0YBMur>iATce 
z0MYdE8)Jk)>^sYkuhrWVR-)o{KxVMIU^S3zD>-*R&lXn5r>U9$CICBSp%Z!xR4`$U

W`_7!{S8ZJws18FRH7p2$JN$EEXJtS98DYNDghWwjnl2bizcI7@W8Z z0$C1kyF91490z5cycRR~RYTgh(Gk>vKXds!mKtIL+k8ptn=v0~YlBs}5YdzYk5WIv zkkxm))h~gUBJ3OUBqe*^uiM$ki#j6eUniRFdWybdf-ms8K%9}8yKe}7LvGx=>xE7E ze(Rg*2FYorMb1X6IIK&OemsI-JS1UfP~RW}neBo%lwx>HlU%TCXz?iLaG02=wl)`L zP^ZO*mx`a{^sB6~3ImS`8`+*o!GfN_42Zx9V#3_H#>wa<^R^P7qoL1zKAVXpQ*v$_ z;4TEEjz?(&v7mS|Y9CGY>!+<2K>D0hD^ z>axq;gh@%?Y;%E^t!*yS-!oFhdV-DOL5M(AXTrFw8%Bq&v+odv+}e>Lsw3v&cKrJCSUiS&wNXaa{bkP*x{ ztHLSU0tu_bSK>}ox)zh;%0yV~7(nKXU6k>K)n`H3I_ZgD18J{w5@8UR`&V%w`2mf< zAFR1bH5fywlj%p(jTgo9#3^(BNThX9D+E=;pQ5sb3=;IHIz~IPV&(}NiNyLuUQQ-0 zmtaH)3rW}A8jCD?N$FIhrefaVb-#_Wdvh6tA4kkq=|?AV`6`KUbLRO+2ikDu&J`mW zS5Jsr`UT~iOS*a3QzOXM2E86ngU*5j z^xSD)v`Jmi*O>%{C$)lEH2oy?z>S(NTbOi*)+)i!b)GWJ$g-q)fL8Pf`rWsWZdt?b zB=@@bRTd)xaTp>Pc%RieyOu@vkUJjr6T|vxH~p^7NKvS?IHY5*kneou96S{v*ul^F zb0^uOApKLs&Yl2YcjG0quH%j}Rq@RT3DX@@{^Arj9vo`u*;or#3! z3Fc4Pw+p1;y^jWsY40;k6;OqRf(pTh0bi2{8Z5La=-6?3n&1T^R=>qOFiki0EuVL% zj2wIj-C)b$_IQ~nWjA+bLkd91joQz5H;*{@ah9IN)WdWN*i{XUiKZZ~D z(Fr+WXdx5M4{HVLu^A-x(lP~K30^Y4ls1-C4L8ZnFj0>Qynsg;=X1@6_MovRtYt05 z0dgY4b7ntakyp25&<=kudYx(>8qr*e$!fJu!lv=GFEoOk(Wy_Ed_@iU`cqVVvT{vB zwge8YBy8`Mk>`;gJ#I0|aY&K3`kJAEVefCaJ!s?x{y+}ENhqh@F+0h=qnbTS2fWq9 zS9dWB+(Tzj{k825h==6Xo4h*JAl;JBlcj`NvF$XTe> z)3lC@v}VXEIrVhkIqUpl7>_g6$L1g2lE>Zlg~yDRd#^`|o7p+HeCRL`hDLbgX$;MHNTmX~)y|oTxFBJJd9dL_ zU27cmtg>cc<>kdqr%bPuIA1%sW>PUT` zmr8b&M>QbEvLe7$ZEtGX{dMT&+mxwmF)7>`x|i%5vGc{ z*l(9Or`>~t)E!6`3CG~($DE!6;)l#QDLp^6)5H&B7T8|)%z)dern&SCNo$bNHK_%5 zP%n=)!1B)me3n7h%_sd{vHg?O6jsqDtSCEWKU&AO;_UMUcg?b{xi50*P9YifvAtyz zPmP1RKQ7>tG?Bac)(ZkYRcHQ8!}k$(Q~oRxFZ+3a&PDeVOi%jRZDF_$y$oOXHs6%~ z*}38e$cE7;7(K?116aO^^Os`l)#McXb+rwPQWha&TWEz0swnD@yQu3B6&=*O7+ABx^cPP{b#5U625`hgf>tD--z4DI=s*SF>hX;6CU0%=pIgpM|lxeQch&)XDao<8O40}8uKlQrEn9(S->2Q_XN2381Ixb zD{kNrDzbTivuhKb!C@vUo>CP)6+FDlrPFixGUV{C3O-a+QYW)8AmYgzs`NS~MAr-) zPgvoyUCgAZ0VD#>!$T*=L7L;gxb0*PAXx zypAg)PphGykugzfF7mG1=c5z~O>nYQyO_HM_fH=*iT-~9M4|9sky zRBJ^M()O1XsVDoTa-=PWF-SA6RP&=)38 
z@#qMY$n+}7X&Rq1ydj;YSE}3hnER7s6qB+EEw^m(F`=eKywxaMh?WE{Ou=vUKbpp9 z?Ggybm(9*V43k6YI$s3JWq|=HyR!65SD31;&JCJ&+gKKiI5pv!VGFQCx|~ zXzycy!$?ItG|6({H!tkz0|VreqaGJf>0^jsUUvKW!8v z-uB%RcbG~;WIwo-&&#As04|jBNG5sP)_{`xVN7Z!Xl@T+AY4O{rQDjLL zS)5B(pSp~*;QpUVed`*iZJA3e9Fzw?5Vfgm_m~`p)E|AH zG_R&Wk<5R)5tUY7CTpUdz1C89q70 zv<-}fD6amjTc-?G^rL*LN#IOyjA`Nf!sa^;%GdO=G0KZT%q%StW1>6AN4Gd(k)`Dgp}LFy?sn0Il6v6@fE3{Gi5)Yl z>9rj%kHGTcMoqP_iL&Syk9wDK0&Dj1s3DcU^FC(&rO)~)CYvIQE%Gx`2ELn!da3e| z76ABPmxKOH)7JXqVx3?5+h1<@qo?y?i7AxRFsBiX_$VzV%1rLT_N(aorXk(7gu9r- zh1w4q-%?F`D~d?7sfB^6tkVyiVkDQib20^ZXAalCk7FC(%`(tLM-uot6=f zU`Y_IGnr_GAgNjeD+Jb!J9?=%kKSoOM`v#nMp>c>pPBWjzReBtvY+spT*IB?Oz?XzHAd2qiR>RV)xDe^Em=R_d@hK%>$j|6gS}Zg{D-CPVS~Sc#D=kAH^KtYFh+ z;n1_e0=mq$8%mQ;#34ltYM2VuViGJpwa$G|+}s3%C-FO^K@OJNU~jj0m@sF=2)h^Rld8;o1~flwTQ|YPKh$Bt|!lC*IoY>-W-9 z|MsL;hOSGk=Qt9dhDkEQq!VQ&_0ZAS0apz+Q~I85pst_syRt!n`H;$%NtXKt{9|Q{ z{c~d8^~MQ>h7?ZUIm2X$c!Eo2Tr?+!at|2Xzj+IPY#3tRZb+e%hq*{f zh~&8yQo>w68-}v&9P~QaQ}ONUb5qH@r`gU)1+%mNqSeiFT`!S1Zhub7!*is2cc6{K zB!@>g$pdRyMqvME`b?xwGlhEd2UFJj zi^|$qrBK(-+MHP;@W);4npo1!jVE9vnQ@g|GHbGribLE77mP82$I=37`W^3+^cdZ% z5peyc&U|26YNP6J?*J7#EDL4ozn?+dqYLqL-ykdcCGg0D_ituGwgw9O7F_DH2p2+7 ziM}{+7KPW~N2KnhpG-FiZfddsYEe=HTuzLYw?FSf=nd=I4%c+_`R*ktc?CySW1@Tt zev!Dc>Fsr0g}q=YMH`okY&BK*>94%!6%CHNJUsVpGxNtH+dp-cX~p-oA+KL8GNVSy z!y*K4hh<0t^?zv^^V}xZOki1sTrC;9qyInRZW;pt04|eLcVJ!YhHrmgpCJJYXga*c3%N&!9!h*i|(1 zW*4_le?rzMQ#9)v=_kY21DLBFXq;xU+&T4BzE437OJ!H&vjsiXk{U4lnvF~_}G588#T>_F-$Ww+v?R`^N!_a4B7hyW&zVcSNj`;K(1 zS7o^H_Hycbp?VLJofRL)V~eC|%Tt7~Qi+{huZ5xKxU>x8bD_1|1MCn}C#+(GNH*#r zQUk8;DEx*unPXra>Qn+Z85w&&ao%r@7V){Y*`Oif)_tW7_kAFv#?CC&+_0i?C00e= z-ml)h0-jpk;IVG|7GuIfqRbEcOFWf03-rf}fRpsoS6o%I|8J8Ig8Ff8W05s3#eZA6 z=ZH<>!4RLJYnip_DQaWL{8Y5whK7mS*8!d66ky-cMaQx1G3VOW?8WHPWbR`N@S<{X zHxh*>$hOHcj6U>IK6K;AZrrS^Z>mJSX+4%^G?t`1`7?8*xk2r~nLblrl7by~C=I6a zsX@w2?Rqx+o4-WPm>r!JOt$op1j`Gx!%^X$xxk5KBdm%i<#oBa{iAP9)WO9e( zj2ni+SxTrM=DvTQ6ts?dkkyD!9$Wb^M~$gLM%98wReN9wOpf6?<{zBnt1wOGESBZT 
zz3Miy${utu8|oU15w5S|5vka>UQ#X;a}()g%5fP}CON_w zjoBzObu&%K!`&FZ21H7Ou=#6b(hVo#h&<8iA&W%jd=P#3X0W&oM1nEX8|UsE{xG|K!LOA!Y>n!I+?o5YN*okYC(C#G6$xncAN z11|Nr`e;K+RnMMX1w@~_gHp~!3iPNMwFjZM*es?d<89bb?{Cs%fSv|{bQ0nfTl_1& z;VxSN9V?MIcBo8#8QdXce*8Y3sONFgEvRBiKY?iHYdF^R5wUrCJ5FsO5!E_RoMY=| zn$GDT7vFrI)`Hd*lf=GXVx;NLu7tel7tynufM2EuuT}+G>o;2F8VKf$!nEm~JSi|R z;h(i~@n`W1!+Lk|PdHcSE%487KB__QE|i$pyOt$aN?5?}v&!&u4&P9M%Bb)&gTE4I zn(W(r4s3@6ymt$jAyHvVZ6Iq{DA^JPPwcHR0>dUjb8eYkj z7{TVS1`g^NNBw!np|O#*Yp}^k(6x(Qry~5w5VcjrM&9IY4xIm>wv8tfc>JowsUi7y0G>s?v`SWJEDa~CE|?fJ|-ClX2ANB!6PtcLpb5=MhW z-#_h1OKk8=nLe+x7^p4Z(!1f^#EW(Ht6uD2|0)Qw;J#dr`_?jXPU5B%=f}sVd8x!* z=|XaymWuP)M{b+ypI(jI7+)uv5H>L<2W^cl%Rf|^yn?IRmhHie(Z@u;v_WQasdRNK z3Hwz7~wOB!xp^7#a}bs>8*k!X(~ph(sO(ti{U@a|e=QypqO1jX9-oTJVnNNCWsY zCs!h!HSZ2u(WJ{x|5(oLe*~FB{ zY&(dood!itxEm_S$87zX+5G{Xmf5d}sI6>_yKdU~`#e48E+*lFBCT zzVz!;&WG5WKKbCxGe*VtI4~$tOqAsSeOzO?5ymFyrzd0Uq=cQ8z~|#y`W}&fu9119cC*=$K9sTQ?rO? zhPQxwwOmQjAM*nekan*BK#{$He$cr(cm&oi=xrE=vD)@c>nCgm7mZg)&?A?YYZ=ou zoN_xGolHCn?OSa{2|<790Rgn;HnObeFpRIBgf# zassTLQRCd_2EO^$6@g9_FC6cvrC-Vj`_~8PoS-nEg}r+SQkndLX+ER zkd5ZP)5a`TxPD%L1?3p}M_s5D8y<@dt!z}fPx=~LG5A8b3tm7tyi^PUbF)U z%c42n#_W%C)vuOl2Ot{J1!}n3=_b6ZqB&^xIGghoF|S1w>~5|J`|1oHTT78hibeDvM|M# zH|!9DO8eB@3>-&t8xu{gt@uu)@%fZmgy_8#mNkQ zCWA4xJ*Du~BELCHU0Gcybz(>?%Nq2%lCJ#)XS=^N?LWVUNzkFeX(fq zvoxx=6lJ0!Da77BamN1in^)(BT`TSMj~eAb4C7(xKa}|dHhtk@pv7o2`*(Vll)O~Q zCK`(nqoGtJQG0XUaR25$H$%p5+V3A$!APf36+-5?wx%9{6uGhTLi{e^SjR`u>C}Dc zOI2Y%fuEQK!szn3WKE?Y8(1C^TVcv>{nMV3KP``>H`zPK!p)9P$X%;L3u?K>5!jP0 zT`>GPzYGf>`+KTmk7S52T&?X@B5zT$%2;W+vv77&_nRilg>2G$!mo?(c=1y&fPjWY z!%a9^w*skpw&EycZ)gnDqtdZ+#e=CLLgHq($3XsO%O>}2mjNBtA8VWs@WQd7cNQHQ zgOUuJFh5Vj-M8il5K66MKp_clRJ_U8kjMmKa`{|+nA-SQY%|hp%lAK8dH*ps@A1@5 zJUG!!p`}9r>J4wxmcv{LrSkc;(d5Xx*~dznsrqV<+14tW*JGcf zTQX<-+P?SdL?MMDIMqv^n4b5tZ+O*#tUlIHJOjv%;V=ss`>m=I0-2hwb3@i=8x})HjHVLYGUANw@_Zaw6+;Wi0k=JQez(gKrl z>rbuR{5uBWpBRLXwQEGxoz9|3GYUQ$5R_NQd5Oc?)@(~yk>XnKAWv0*Re2= 
zU`PnAxlO|uddB!ad+o39ay&+iKbXfL*=N|CnScKlOVSLQkAN4Z>NfPh$Q@Z6mT{XJ zO%3y%W(v9Xa#E5FJTTp*Zn;$4sU|C3vc!KKq^TQ@E`RjKNu|?>rDsJEyE7mM9+BGOVOo3**Tl$>b z5X2Q`P??^Zi9{jq%q+((?BA@%98H3Fjn{c@6Mq_Pxwv>wF>k=g&X&*X zpp7JJL0tr_%?ZgD5!K`-IxdD7(&lC{lf$UYJkNc+OSa?%kUG0bL&n#{MV^UJx$KuT z$EQcKTq~^Q#EwB*QB&y%PN-Cai3B=tYNxH*KP&V9l~acjSV~ME=kC5QHv=2$#M?9M zhHjS5mYeHr`+B7VEaFjb^{CHgtCc92VCfvbawKJ>)5cm^br2f<7BsPueuqZ%?$YaqacpkV9HcrsAy?73GiS*thCZD=QM2PZuGG z#Y&SXjosW4dEo2jEJU&8zeFRax&KON^+vM)7217hRhH768E@Wp4FwSDxUE235-lZXBez2>@X+-s4K zNNApT)5`lrIg=m;^1MslM|M+ZMV?kW55 zE$dGIkH z?{z%8{HaUf${F9|j#Q=^jjB@PVdiCo*(JSZt>58Ou_oP>!>(g=>dT`3U&p@&Bl3>z zzwzc9u`pBNS>o+$w0h}&+$hjHl!*V+iXrV5)CJdp8}0w}>My`670xpbXz0dE)^sqw zNB3LSeLT?SLQ*nJ>pXeRVn|A60;yr=Ct~&tL@+_y2SLk>M%RCWP-SnjuApB zZ?}Jy*yHAq`FWd#qE_BwWjo;S9*rb$Z-6}`HThWZJ&ece?|0x*1ZUN>Vr%sV3igQwuGTAkYMv zG`?BcFkA?o{y?(bRwp@iLgs2#f5OA{q1->7kP>J~<4)ji%X6!uOwi9i5ZKSOP=G#u z8hstfhmpB}TpgIe{Cy6H7Y7Z3rk)ritfZx?AZdMsnkQAq8#~dkD<$}1CVqHHI(xG| zXmO*0#ryLa(W&Zj*6OWn`g4@1 zDF@;`ZFi_xw*rO$_JWxj{l3PUs6JqzDWOlq3s2AACUxw?OxCsI;Gy4#pa?Z15Q}sd zgDc;eD=^c)#l3lm3&^l$)3_DTWzDC>u)NNfebO$_<#p%xi5pMd9}-^MdZEV{z$k32 z^Lj++c+gH>pvXsKL`^M?A}OCA=^8Pr>RuPUS&)6Sy3bO|$+#RIoAV9vb!cgb8$iy? 
zJ0`4}BszlsH9`5+0!=*kh+~HWSn{cV&p{-wr%jsQVFnk*J&#M1`7);#r?#FaM<(Bq z0q0MSKbuY%hPI?+(8jqs#7~_}8mhOfI&Xbq_ssG7Zc>zUUX|fLVmo%7hUB!Go8-#K zSxrOUSjWjJg9Qc|B%jPxMlS1r@el=|o`GGwRIf`Uo za{n?r(Z|;GK+OX*@6(*fiu89lG<8SW<-9eAuUu^5)QlnSl>L-UegHxO%IF*BqSX|{ zZfpBD$0+ulljZJDx|u1L<`PzT2G7Ime|dnrr=Bk=Z%So+HK_?Tys!qFEM(NmdFF~h zF!@~r!{c(!)ZTpZ$TqpP69mvO?sA*r_%YDpO+IjO7+R$WvnL4QPhq((t_G^4xLN{0 zys3Z-y|v8Iopk6l(Xq8$r4!qZt2~2t?cgFuh(wylVSM7Sue}CbuW-q{Fo%5TTV<6C zQVf|^#Q{MxT6Wt7IwQz~zQbSB zW~JIOET6UPrMDx#dB@)>`{zNg+4Je&<7*bkiw8M=+P~;XUA8i_DfOIPOwR#+0$7Y* zk^ptQ%3bglA}HIEo@XFwI--7+|My>bT|)9@upELw!3W{H@!!IEi5)|oUec-9WM(@o zWp??4z-(pfe1sbz^<+Ic(3Qp&)Iys^C05^6V-ok$g-Awe~;@iueDha(Z*aw_LTk$R5Xq>~R}uI>~=r>`$x{v=xLo z`*MB#kBy?760kBnTlDq2*^Z5=>&qQKqfIR=04Rq2t-Y>?H}3yGtvv-)R87<%pmZZr zvVertvb!uL-5@31(%s!5ol+toC4v%4mz0EnAP87=OP7jt{clnH{FMKEp8x!N&O2}R z&di-VcV_Or6L0QdU8pH~*B#Q~Wel#(O-+{Q;rF)p3{AY1iZB@R;XKkyx4LX*ef*n~ znPSa)_OPaV2n5nIWSPB+Ssfj4>5v!gWNx?PAAsAKRz&FxQ-csqjpd|CU8FKyBpoOXJ z2s!nARgiz@Cb!*=0QT(zKEl##b*?v1B|1oQnr-D-u1xC@9-BNBt54HZbRCzHU@p5ev-Zh8{V1pVAd{cP%>g0HI&;pw__lNzJT-GmSf z`-HY~^3UNp##cv^+H&_@yQBs?kBhcF;emC}npvMp5(f&a;7i|mTJ6T7H2ywK z8kMfLo9@CNUW!Oqi>h(5^7A;Ow;AyhC}1C1X;Sv3xN;EWKl67sWmgn) ziNsz|UN+B*=#j5P@oDN{3EI1uWus|y=P`u1hTBRry{*srm+2lc!RjwwV(elUDUEzD_S`#;``Ze><$4|0&ytsL&}NS0`ZnYpGV?KBDnyHl}3d zbuS^tD^7io;bGxX5bWdC<%hSX>`Un)pc2&aPJZ@FRh|pXzx5WwO;(N6tX-r`bv?;B za-$fnRpQ&%zAX+Q_5)@2EVTEOfRFyI+UZVB=AlPn`V%J50kj>@^2}IU6~oURnb*Hw zn(w_>GZ@ca-S;Nr?sJc_mQ_`opbDMFokI(5>*!IIj1KliPXX!n^+=v@JTEhgj~#A7 zJ_m_w0)0nJ+sGCC)H(NjFth}@W->;{F?Qwy7c~Rl)4+46l-4=PoR$`5WN?|x8+jikb(aY`7A`)di#X8#2`Z?vnd0dTF{p7L| zW2T~`V5>Rkc&bF4t#FygthaQkgcwnC*~9%GrDOTo21Bj8EhBiZrW8~ z&XO3H(60oL@y-s4aNX`4-J*81dHeC&JrT=U4B{rUs#41J$oQvu36G`rw2PlK@i|^i zzxNsIfxDh8S&>pWJw;60rG9m-$Dg#v`Kfi=@^1P-D8M9fiYvoVzbgLvdydU`Zf@<2 zYyh(%D;n<|6NiF_EAljFRZN4-Y1MOhnBT4BUUPj-I*d5$=di1usVo?&F8H3cdkc~y zTMKLuTBLhlFZOG}Y#u0>he%g=UoV;wxT(=FaDQHYgLyP!x~r2^e2@$a87f*B*XA98>ZsN2l6QSX@eDtOgT|ipc@U(1ooX=v`X@D>1cGZN3&xB9QZv~ 
zbk<~La*YR!s!19EBdI%HYVg|K2OYE4+d+}ow<%4l{e#2>_}GVrkdiY#IQIBHyktG@ z7_6cMpZ*dyk@Q);>}&S6QtaI=RlYJ{J+!POw|1ebwMS*bTs+x&<{t3@UMnjMBmIe5 zgQ=d{MTiak=<2hJV`%gKpC6;+jxbYPWtIIj%!?)Wfn1Y{S2Jk{xrh(%y`pMSh9ByCjIL1+TpDbrN()Uzsy(Da^sIo-{-1xjso3awWQ191E}8d0Az zM>QWO-eDm4Rm-ucIm{7d@`1L{*qkkcF7I_bGD_Gd9DPh&QyYB z$80^1CIr(ZN3NJe1}^fkIa94BcTkiFPt% zVV3IcL7Ucg`vk!KA)zx;u3@FXk~y$fp9$>MXORO2(@@Q&-nYqY?J?XW4N(>DDKWl` ziLL^Z4olH(%rN00$~4-hSW! zjX~nZ2H$$v+sI8EMKf-Fsgx(yF_5QDEfc)%_JW4PUe0Yx?Hzu(bm4Okte@qJD#aOy z&1OE-Jb~LAT~M*5n!Z6bpNhlqwA!RsAjEvT9=!oa=?ki=9=Z?l6O)?Qdzav^rZu0r z=c?QUMD|Yh-rm-Ol?v)7EB9YCzTK)|DR*kyDj_{QS=n5vc(J#8P$J}W7}n?S$QVhC~Ws&9k6ntF6?%KKqFYzb_woxRsA4s3~{7{lDy!>JNDC>oXlac!3Y1 zd#u#x=h?oZv3ugLce1w^({wb?ra!&E;viuyrKaf)s&KArbd(?4_T;}9w)<{WvB~3L zbz=rz3_eB+Zrf_tus`uQTI@Xe{BGri|IXIQk%#Z$(f!H84KVPp=Ed>a!NI|8y%QJT zFBLT}j%L;tCJ#sib&YO3$I1jV?AiqKn#^4~AhhMOvB*xUruk^Oz z2S#zC29~=^-5PrAG2GC6H< z$)Mm8*9A>^jtW=agUu7cNnstjx}KqkwF~Mu8(LY@UIZV?O~mLvn(`Ac+uj@B{nmD} zc6z@NPUuICL9ig=cB7RIq2uFO?!gWJ1HWnRRBu#&!Qm4kyPrt; zA1&CQ;K81cWg~77`*H2nkGsT9@5n;jn+4&8@Nk^nFm`&o76PgR()$TRTr2kb4OevN}7f-bh95^{5@PpMDx7Ou?6Acuv9^ll)V)HMqRBRbyCGp=tY6W)9J z+)^koRf4ws8$L(M+IQM_kE|4IS@QDK*b;A!$5&r_sl0Zcn zBy8g3Hg{C!QdA3Qy1P|vG<0q1R%UhM>Rq9uV?~jWK_XuN3Y`p7Cu_=^Z{Euvczt9l z^0jTR)E@olRP_AA^}_p+byX7(Q7ThoWikQFXC_VsM5Aw)_A96CoS?Bqu!8r~tL0Nq z&4O1HAeNaQK(LJ)Iq|ru8YsfY? 
zy4xg>U!_s9aNsDO2lj~~%P=IZ?Ve%AjCd1$wf#XyV7#>K<@l9C6eYXDOApF)9OWOn zCof|Yzl$%Ex)L$n+V`zPK~-Zi!rKc6SuBuvo$7^lN%^4E9vF=-FyJm~=y=3Ms#~=g z?~vYc&<=9&VdUREqMO_b)9vJ3yT5D9tVMsr6l;NM@?eQIKiZlo>1FYAZ}J;iBSK&4 z1!nO_892lYMqV`UKAJ(fOkHZW>vw#7i7+af{G8T0=PoxW?_cyZdS#hdR0w+g0?Bc+ zv?1e0!7XdtcPB5ZWvFHGK^<;|s$~QeLbR{AlDPa#CAc|JH{cQO++`r?yJ zEonUrMd<^FyH4z~jxLxjV}zxLqP-@irO7!&)r(8}qMG03FhE2UeGuL>tuAdY<48;( zz*HA6I&^JAnrU;jMdQx>D{@DL5ZTmS*1OIX_>u`xHYU9Z#C9ocNtDEK5-PQUD@FO` z*M*okUNNxps^k)xaP(wck+HS=H08>cB8_D;DffWtQGAcHqEv`98@^j>5Z^0vytLr9 zpkjxQ5kdn@{^`b&L?++fN?Z?HALUhb^jP;gsG7QQUaV)(uj;ARn(M#HI8!V8)Y$e zF(=Xz`W5&&6#aUo;E(YN=lu(c;BXZVB@XV3gpVadf2W8nO$$`UDi?qqXT{ToBptN% z$-&ei4;d-*7CzlMBp@yi{EaD)zY%u=3S|T+5cCy+5J};HmQ2RyW29L}qXwNaLBXiu zh8IQlnFZpp`G?(3^h<;5uWxctAm7vKk!S&>U$kmg6Nd1otuUFkX29b=3uaT!1$RB) z#5#apj}8~weoka%e?7T6e(vR)`P6a$i8ZSdylz|TPChjHb!Sv39_&bkEJg!-2qm5-b*y5W-7w7|a z6SiK>-f~zM+G1CDB;0zHG)t!Jnu} zHBe+Cs7!I{G~0VP+`BT$68eu1w zO`Rt_LlEsW`(Z;UdTdLJo32PB*Rs#diuO^yH!<11h*zHo)6UxXqX1JrWPFF#go;)(&ZU}tQ+GKL+R$@h~iYj7opcEure)}1{&!yv1%JT8o z_^lmk5&_E_=WrNtyXkKn=H!I_H;360Z`uL6M102=lO4ay$2QA6hbijQ@hE@F(`;K^ zRT7sE$@KAGA2R1%AC2`{Zj0DUn3NKD6?+&*Rc4ywD9&AW=dgQaz*pi~t?Piqz58QR zt;?^pX0UWg?=FA#I@Dx*$bKLl+-;eX&$1)?`5}2i%)DNfc1}UU{5#JX&c&YEwaE1m zd)5Q_G5U)JK_d@SiR`yy;B<@qqf4_^GZrE2s1I`4m}QsR*Veho({-u!18=(Htqlg% zHH%8OXT(i(W;jH@xo~qy-7W@YlG$NO)BMo|EcH$JdQGe2U{5m}!_3EtM2Rd57TRLT z*Cg531hC%Ir%H9ZO*XT%yos{b+R*KU=+BSzRA88k9x6FU(g!3KJ2Fj2iS4S}+~kM%kNCR?5|i_iM9L`DvM>T7Rbj%U8sFD%_OSC`kz z=1`22-`Gz-XAs?O@FzBYMQov1Y9`=SpXyvs`=o+h@$#^@foj^}i85gG?Cqm;Yvk6S z(*@2mTsb)Yy9>Z$%5d6*u~EF`TL?1y?i2UUf*ht-Cnf!OLXc)quhJ2s6oDJ!up2Bl z;S@f$whm-cf|kk6b%|*Mq@0(e!^BOdIoP;?7n0+tXB5n z?G92LokQ$7mMSObUq}Raf=mJxqQAOgviB1h^}A$lICu;+oqPlrZ0BHJLp5ROd2XZ0 ze&AHIWC{yA`uaq2nEBw~lUqndC~rrY`~kmXsIPp~NXC^+TcNvsHA&u@Ll}?77E<2} zGCQmI_x0wblc@6xCN^C*c5Gy#7`4Dxb1P|oZ@pt%#{Rt1Xi_2O5$Xc*{e_?&o!sd} zsY?N$hAMN}_B{7;=)Nu@m!%SyWPC%3-q-F1HV56-90; zFJo&6FnlswruxghZpVJ7{D| zwsB>Uug+EvjYV?~qGzw?aW&p+-gXU;MP0V7aQ;+)Y3(}Qx}TZs&&?$7f+?b9Z>S`` 
z;7u&gIk967A8Ht54i&JPAbKtw5Esr%@d0(xcK_i%&f4S&Qpd^@?sIy;dDbudFBGMG z{7t);!apQdgg^I!BeCi9nE!(debN=BOt&~cJBSmiT=b1llI~B=wbz=V*n5E%wzS^g z(RgiD3^=OVPMn@weLY%+g+c%Iw=PXkVSL6*{J`0nPp+?UCJmidr}OVrOkeTJqFoSH zi?}fE%>3ans}G&am=Cy- zE}?s&l~QW0p-^G-5(Of?>Y8$Q38`$Wxa>7zSD5Z@pM<3;bmDYDcll2?!p<>jp`1K_ zG3syKv_pfSk1h~53FigKNWX!7P$6AAs)0ASL7*5d)MmXA82UabG{uIFGw&pO0>&e< zZrx8NourFpt`EN_Ej3ioVy-lf_BQRMGhDEZvA%qWBW$TQc6~>c3T{0-!Y(?X+aHd< z;CvGTPMfdJU~{i8r;JW(*pZigkY1OGp~+ckdw4s9f1~! z_FRvG!Jb7KcA96N?Q|CvvKLMoMB&E3JAb0Etp6;9yQb&$MBI)qf}B!+^LPmB5* zDQ5`jDJKrn+Q4P}b4WSI7Kd{Eg%XD=K@bzdl}$`R2jynrgq>AO*Y?$uQJA>Suvmw| z4O&x7T|d?Kn%8fWZc_=yY$_Uu@}qc)>j4v0?WY}s95H5fmr0iKQ9Ai*JYK0!^UJGp zKk@l6Qx$K75w@YC)U?Au(z*kG zHN4$a4fP@j*@F?Si*SCujTKwU2Rf!7b2*RcqA}GWO0+)t3v%)M7s#E}Ej{B0)}VPE zqsdXXbvPs2Ny#rY`aWv7vT}rHXLZ};9Cn^FJ%Ikr^a`%jWs3^IKB&j!!WVvW*NZ{( zQ`yo9h3td@KfdA3+*dsqfnc8kT6jEgV2t2#|MSl`lySO?cfu>Ojl$_uA{|irV23=r z5}XlhI$4J06qXLL*JfVz9`L`sl~boZ3XL+0Z)+Y@T8dqCi-}3XNc=E5%%{hD3oK4E zOhu4Bci5(u!TK?l-%R5H{2fyeH}b5J-gS8@otO!+t^(Ct3Wb}uyL`=bFj@^+GjGg^5hQPl0Tq%u2j{W2KIdz2|N6a4?D5{Ze{H zdfGWx8oFh4KgW*TOT|m83FFFc;%s@5OWN!)e4%_cMj6Jp?|a|v^lR+z#kALC`Fy*Y zzWR~H*vA+G4@dl^E5z|f&6q+#uyE3&L&H1aU$FMbW(qu_UV)@t;GoAymlvv|J}g?6 zlBQ5-b3{}nW8q81IDD+})$@^1G^Q)wP5V~v*?W*(_JQ>RV+(pOz61_;d1zYZ^<~^z z`hCTzowvC?y=DC<=(b8BN!|Nh3y&?Puic>tn#+#P;SCsYo=lvs%H(dcE!;(>jC2ze zy&DGE?eL=%@2>%^2~;~ZXw?Sd69-;K@AtF|2qt@}-YqEpV3+*chp*GHvBQsZ0+~5k zd@u(N!i>4MGqmib6U6T?o6I8Fe#Ew7M`nbE(oBnW(l{L`^;emBQ9ST`owN0-(rk7S znGa?&d%Z7bHY&0`k8Xm|^Ew-|)XMzR$=(2FQ^-Bj?w}B@_^jdVJfxmU|bG zQiibapTmfAOlc_hUuZCSWf>=6#i4$M#OF?}%7)C7M6vx7YZ2z9Zw5h^rSVJk3M;Q) zsEHOUe$eF6yQvto`Ebx^)Bm293yEB^oUW)_iixogqd+p#ytp*2lu`}0VDd;4W6BDy zz~UHkhlv>4{mW$YJpo>61t=Fs+|-N)b=YPKB`Q{v?mV};wB)d4POAkI-kNy#A@qRh zSgd`lUi9%R+Gj|#r0F}E0kXG*TfDMpp2wiHL{NFOUgiE;x2w7Cq!{7O%>B->TTOjO4as*nC$~-sZE8V>;&%1%}k~%HwilwY(o_l zXJ4Bth~i#*v)!TpsCOa~!sHnGJafL!0kJ-I zgqdl8?=daVWm5d==tT@DQ7*Cy6n$~@b4^de1 zFiS*UPgH7|rwQ%p`mcPsz@RD4_9q88&$BRr?*88!B1$4oFFCO54oLQe%~>(JzIeYK 
zUdlUpaXI}PYD6%SMwR^QO0YZ2q&n}~=%o{J+XIp=W;&_L=T8jt#@nnNC)xO88Pus8 zq>QLSWrl6UoCU%$ zzw{~OF_4?D^X$rg^)Tm+wSB$!AvRuve7lb`4Y)f(Zk`=BX=<%yqA6s?R$@Y!@QKJN zO`~zeAsq!hd5GrlWNQh^s2MFe{p=r>cNvpY7#wvau4iVY@9?ACyAX^vp; z;(HHszYuyc&5G3-IZ#LQv*Ju+SKaD(%KF((_nOvct8jY$qa?50O|r`$6O%RHJz^v| zs$|U_@u;LELpGKyz9W+fAw4o*n=F*)ZhYbNmbU*K;?FTeIsZZfoY};v2%EUSSr}mx z_w)hw@TXIIm@}z+N)`p_z5{mfIc1(>twA{dW}w}zCqUsq7xPwZK?gNwc_zHhbJ(`0 zTP413)Adonv9><{+ow6mHN82@wFV)2%bRW$;X-M|QZ$t0jIX9xudN#|9l( zRmp_jAE_5&qgmHD_AGg{$ME*C(ih&f1xKOzm7$YPx*pgb+h&XX)f=4?ce4z=H9fB2 z7`b@}Io^;derBy@Xs)baC75q-bl5JN!K7pP9BCws<=r7}2Ku%#nI#1G$~#(@8B%eA zLJ3!w&Z*;r=frNq6PLefhDevCfy}NvOJu;5GEE>ib;(c3(W5Wdc`SG1^>VnI);J++ zvE0O`cJZ1Lkp|m~dKXq5M{mZfyu5qVm9Synd}*MLWm3c^wZX^j8h$1Z|Or>h07kg@$)8I!HVOJn7OfNqFCO& z!&^t|h8JCMl-Dw`C0e;>KX=#vP-r<+c8ANiuX9 zm5!>zjGCfi?Ec4BHWnr~kFaWTQ?brzKt!IUmfm9GyK_1z-JQuJTRi8osit4rE;UAcm0lLiCh8sivtS!ZaB7+zTA^hCy zwjw+}C2=9W`M&c-@0#>(uKwlGf5X8B}@~q73t)|eEsIDWNT=-=`a>Q_Iw&vS6NH7 zyZvLv>o;&w3A!qnWBYGj)f7wLNEBS5Ajm4)uOq=7xpyl1F)sydqdf8F6#txZ|6TH2b6I!kMMmZLP_vsyE zW)X+WPDF04TX=)q8FOlxw~Ma`9Gp|>b8Ju;*I)1xo^UhvDg-_F`}PCBZ5D7df|kkp ziwrzIt;+1)qCyZbKTJ7PPE@`3Ov7)bEh>{`^6oyN4zoXX41HR6jRZWRNcVo9G0SzL z%_LgS60WSyrO?@16H!D^wCQP%Ut<91vj?Au|SrWB%Jh7%oxt=QP|o<}H-#FXr{|Td!@~ zfysk6W~{-{ny3D)Z*lAXYh>IzgY{cR_VM3PwUejugt7+V$XAxw9v4Q%UzTgc<9@hb>i*!v2Xtv zWli2t_D`+?6Td%p;~>1S|2CQBmClp_uqcZZ*zBn-;0kFx*-q`@FTgS}5#*}9#9%ai z6P1Nj7gGrO(dKw#3RkzSl?EfxQ`@zxW#b_polK~}I{MojwBa}{4hb}$1bMM*+oG56 z;LKd}hIw0vHsHRu+vjq|aXh+T(3L1toeuFz&`xHfblb*MtCf;%sj$oPb~bzs!}Gqv+z7#+#d+Hy*?+WDM?7m2msVh##KA ziSx{0j=z|u!($-0z`7BNxBfI^08#lTff{Yw_yd3M*d!LD*UdcR)8UUl7R5mv!ywfM z@V!y3%b8LT_I_&?g_j>%*aIR9mZ+$WljD$a^|19+-{DnRR^TpVF}%rb{fMErJ+g_y zhfAq;4z=gly-?tg*ndxT;$Fu9`_y6^#iBS@@aIaFW$;&NMo-|m8R;(}or<|(_0J8p zaTHfDD5Sk}dP*bZ4!~lO{td4$s5NhYT)k+o!H-GXWNs@HH>|P~%j-ID-|+c|vGi-J z^w#c0hoXdV;F4QOWJ{j6U0xrfx+bGxj zQWs0)_%f*2=QT1@O!C`wY~Tmes#PQ@YoT!u{el?Y+c_G`uByrksCjU-*C}Ep+|6F9 zbqbp=ozxNPzU}FYWSu0Na1LwEGtU8+nE%Nd`PhFN5>-_6YolQ$;;hc1%K7@)4RBKub?{9 
zAtTU1-Qi{MfWq_K5&W;hT8dASbejCvd}w*jq5B-;8_N9`(+c>H{gJ-wK7q1_GH*5u zpV)bw*vh|U>_sOmg(#Sb;D>_Ma#<_wd{){h6gxMrP4^PM5N9*t((f~3r;t@eGyXiL z(m!G0mbN!t{)XuG-Ic~K7Lp;GeaF_ zsJOM1_FPPK<%(OdWwfRpB*$r@DcAk%EpC!!#V95(U?e3k3@27sPgQh3-8x1|6Eiz! zWdJxP^pBYP?$Y(tPb*l|(Ao5RNJ>#sOG%4G#L3doMqU-sTb!I-#4HS*fSh1C!yjQN z9F7S_WDqxXHg>XfaItp+K@bh63<9gT8o8W$i$xRwh)@MXTfpb)zwv(0EGl~HWR;DJ z3l3s~0d7^H9KiNBgoA?%;b0YkUf$5f$@0_@D?0}ugb-4i<7Y?@6O15C3GfdK_+35q z7K;g%umjwvBD}}`;*dkWdr1Zz^)*$8{kG37|oslxxiUiz-=s8!_vgX z0^kH<;Q#7@_OxNnj+*?_>c2GIDX-6KC!h{ChUU%ytNy0NM4XKgesy_)_5&lZBmS^K zfs?#|<6sBrQ*ME{AzYYX%^!c@hz$uDz^AaKv51|yjVVAlSjENERvi(LH}p8I5x@$d z-vSjpI~jZ?*mOYLfZqOpcrgF3IH24-AjAWM!$EMMM*t5d4-YSh3)qhY&MkvrFxdA8 zfWdh#Th{YdvSACLzT<^>jzPGJcA0KOkYU?Sd}fW?Wxg{a^ek%%(R z!iY3K!w4FH2N92WBg+2~_p^ehco6Yt`G7Ej##5XKyubsT>I6Z#zP}L+Kr|Wx58@5r zJ;jB<1IUkm2>A&>)a2(ML>5Gq&e|P87UDr903J-(57eiqe&+g41niVCr-cLMAkZR+ zMkG3`@Yx$6_9s!?!1^l;kO`-80P%?Q2pAj=1#u$i<>Wmp{W~r|a=;OJzEgyFW5NLW zg2>MUFyib%)Z%9&AsXtJ=ahN`7k>o;;nNmG5PkNX_68u`X^#QmzrIg7jesHIe|di3 z#6%EqR^i{@h@o)Wf&kIK#+{XW_Wlv~vp4*j7QvWv%Q$Of;Qi010KYC0h=3r7nLQwE zMM1zy!D&bg7?$5dl7O=L9+CqAa`ZG-9_R$$Lkb}1sT9QoBsTPiUQ-4E8tOEi3eXR~ zhtvTceh(Rfpg+nm0zrS2VGKGw>;bqG|E?ckKguydsGd_yra;5~2qAO{2x!?;_^GOa z{V2{71pA@fY=A!WJ%KF<_Me2T(HR5` z@Y5s+eFiMteh;51#M6H+AkH5Je%FD3b_C#-HZ}x75QxDd z(El|VpfLYQ2K!%=0Z!flB@CoI4G0SULhmmY)IXK@rQ7`jL@@3L<^KQ?Tsf=GcU6pV zNe?LXo0cXBn*zV@DY!&x|jeH$-&0n#m3SIm0Jc>Nd7csQAwVFHqj3lk8)KYoBo1P8jY z8R#brVW%N}AiEzh7?ca~tj0>;?e@_PiMC@-cVEy5D z7(feRup@pyX@T(aAY{$6vq8W8v#WCmicdG04HF6 z?^jSx_@8`%b0Bobzv+WRc>c%>2YT-x={ONm^0&O)f6~mw0jR~_(s9AK{)E911OMOn z0PXunyKy1Z?QiJ-6XOp)bMyQ$jsQgre8%)!ULIhi{{e#o9|8QHju)XT5x<{(5|9#q z(!vYl`ZFCDF!lO9FAqX%{tF+#Wne-6C(Omk(9*`#2@^5JQ?c|iJ)Mjx+uI|=<*Cw< iu`>fy4N#-+s>a#H(8=XgJi(wmFbF0+y@aA9=Kla7vv-*Q diff --git a/audit/dataset_statistics_plots/score_range_by_eval.pdf b/audit/dataset_statistics_plots/score_range_by_eval.pdf deleted file mode 100644 index d668868c26ed88bb78d8a1a364055374c837f3fb..0000000000000000000000000000000000000000 
GIT binary patch literal 0 HcmV?d00001 literal 21724 zcmb_^1z1!~)V~M_(hU;2bO`J&%hDm;A&r1iOLs|kmz0!%fGDL1NJ=Om-6E3GN`s05 zO8xJmzOS+0_xgO#AD`>aJ$Gi#nKLuz{ATXl!>leX%LV1;#bJIp4Jv(#0|h}qj;1y^ zA|fELmX{L(1eP*!GqH2D1cB8}ED^3C9w37TNK6a|;b4x;DDYDOSw{yq5bQJqSWnH= z24Us~;{SFl1N`F0P%h!sGGRCAzU0laO5oxSkubH+}gns#P|KFi=&w)!VP2q zu$7htL_v7Efxrs(fC!S`f2F?vDu9fB5(oNc1CVkfrFTI9(tQ&jtbuTKbaycW+JlrI z;ID!(w>FV<^aMCUfPega{M=BeAcT(x4&sCGaSQSYK!KzYpbnsoKt-+~!EcgEI5;>0 zcTi-hpY?)#{6mq-2nS0yD-iTMb6IOUpy?p6tR0{aX@r@hIRe=NS2q`gi9L>I)*AyA z*C_##rXAh6bU}=cy$DBVi-pMz3k%Z*4+2e14jBs)1Uq|$T_Mji|4VF97|iK4Y%vet zD6%6&gW|TZa)(z~RQ(6G3zH8Q4r3de`&8E)lMx-=l)hrbvdwcl*-kF;gN3p4Q=Csa zGzLUrBsXIAOHIPP=j(=^>+h^Da~`B;HwLFOUlbnXK~QNchu^4vkRx0g;`FxYWmymf z{wt`>YbQ>I;NG(4L@lgV!5-c-c}2|pf?q=Vy1OqES1l}YiQT|1q~?RIPs(J{m z5{)djyKCc|Wb4n}yML%5)SUBpc!Or#Vu`s;4JXx`S@A-WW}foCEl=-2S!FVRmJ|AB znOLfnvwQZD!gWZGKSMxbY3&49cjBOOaJkmdRiuA3^)1x9{4Mnp8Qc1_xA|DTj~>Xb z*!MDia_WuCpDt?R)a|UDmRAVlHAwATX-TDNDX#Vg_o@)4`sDUF7+lPYFmMpPJ+OhO z7jAybz9;O{b>#76T5$tsX3Ip-(Yr2u(CL+RQTv6@_x%%T$5KA7FIez#KL}VJGZ)kk zOuzHtP}AR6kui$O)z*XU(NI~ei`o~P&hn%+d+JQLb1VZhjmKkZOnB&xn`@W6B%*zN zvuAzvVjatfi4T2BC3Y85eD?*|t)NYVy(hB0s~fh7w>o;KhCO-e8!o8%2lhYY=W8mI z?Y8H;Ty=058gQ9ep^I0J*|j^|kI50%;#9w`nNCe7m-eH*(pF2wxgmdh|gF_E$`<)#0Hk(_9w zn>uYq7m~${*TtCLZhcsF_5U((^yJ3+e9f1#gW~ytnXHK(@5@EvmAEBGUm(X=$C=EB zYMA0}l{>~C&U3E>m`|7#7CMYw9d%lcVr5s|xX&iRwQf7iWA!vGG)WeRmGu6r2X zupNX`_~Le@h6qLG=2mdp;;k>6xBM;BJc^4-!W&%X=et3&w4x76EMISw)nyY<=i{tv z3~S@xR3(KrPcUKdMHNcxop|o_9@VYfm!vgEBN%pHv2RE z4hlQbmTkX*mDzLwYux+8j8*n5X|-d1r0n(_+@z+xH3dto!c*W+1MG4Ui1)4nCg;W}yb(w+L2JGnEnxmm; zx9ynT_r8}ra620y&%9V5;BlcduClwp^F+}mw$yZhrLzBJk$Fvh05iG&*uZx3DxFIm z&8n^v^N4bn)>ro{%y;Tlh6)=~i(kD<5aN3op6cEEsJggnK0(oe`EB}$@)s=)2YKe3 zAB`jncZ{SszATB11S*P&cESph)CL~m6)!cf;u0q@@*Y;I##MFux4P%~-DsX&J~yhf zqc~&+-rK$dF%5DIOU`qZw22M<-~gwv`occI45v<#m5SA9$Z~fn;zY|#uyc}KFJ~T` zNbHWyYBXC~mKm*lXcy)h+is3sKjZc!E%>TM73s9V4%8V5yUioG&hKBU z?pJCQW$QOoL^z@!L7N8BY6ZfQSFR5syud3A)3`Q4oa(92Ew-gbI 
zBL2dRO*wYAd#!ywur+#I6MOX9GN{Hz%Tv4Ps&3CQZmZ(>Bog&~$0^_O-zlg;0v#S6 z2>b^qJVirDq5;V0AKK4I@dqM;!T}icA8-gB+l!Abfi1PI8KyKhs6Hrrl|bQS3}U4# zzaPXU$7ID~$|(4L!F^yh>6Mpn+Jjj)$yaMF_fpT#+bvAM;KYvM5yMq zDiR2`80D5=+mK68Ene%v0Y_-CJ_Jq8nQw~36&@X7O^!45p=cxuGzWw8|F`U-NfJp* z5NwQXe@r!^v-GQ?Pb#1hGbcCrV{%qwT0rwXt=d!Jn{!ulYMM4~i60%SNra7&3I;qe z$VRx>(kczSQaSK`%~tAX-(GcX>a|O0{V01$TJ+QE8K@*3!c60WsP!XrmtxYXm#bf^ z<{Vsj;!9!0ujV%@=N?*wuB$?=b4Ee1ZQ=Yxf{g7d$$M>;TfBQZO}Cs8b*3|<(%mc9@6jk%C*LRr_dz=0Dr5PPxxPDfCpydp*7 zdI`F^LkZTM3Ik`Ayql?Ocw{4q6>?Nj^R0uQI#jiEXQO<)@zJD$$hPR7U3*+PCiex5 z$ru!P3nTnZ)H%AVb=f0lMj#Ah5TWx$w~iQRKSvmL@@}T>n{ntf3nOrs>1GdBIg4U# zNmE{w)%#Ej-<=fy$Sk@@G{pjuGM;?)eE;48I>n`Oi+%s&V=R)GSn5A}iwB9tf5YB9 zf`2j6m^BpQF`(ED%jCz;rF>KZRm|Auyyg0YNH!_P&b&W~ z%psjCg_bNqMzbzxy|k#3U7QWl#lk75SwLzI>C2{4uy=9d2f|_rWuV$#%4*03M3_9~R z(b{0?hz6ys@{&G&L6mF6V0Imkb5*)OCho%Q$((+``e(E)9m^dQ`u=&%^?hawK>vGk zsPtD52STthhjQ4d(9TL$-dfgYI;fQ#(4;;_k9K?~zrq+^k^5x6yL|6vL`FFCaofH+}hQlQ?UY1$}+snwYzIIpbf1)2;=1TkjOqzLmwReOQTDe3#F#FLM=SFd?$%k}JQi7*++C|y@` zIp#S(fKKaO<|lgbz`I7uuJ_vb zXD#_Squ~m6KBq;HU|ob^etFB5iE{bJ1PLYU>+il=O?XO&Fl&F(7EG-kWnCz%Z5?>% z$J0R)-OV}VvNn*Foh)He$^ORa3rU!$?zDYd%j4oSvxlxMA(pvpH*KIzcFvDtWK#tD z_{UPFv5qMF`^}$Ba#(N4nBlc8xd(H7j8AIM9=?Nr$=lY%{z2f|0f8Bj`n~N%AMt>@ z+n3x}{F+b<806-}Z`}ij{CDrbcxu&w0mFNaB%Fc*)qp_<;p-%}-EO&k?o2KiuN?(v zo~ds;HGOy<(o0!g*G++(bhj^i?2t@!EJ>Hs;gP zd5SrP=@+j;EYu1e%*OVw`Ern+X!S{2QCEwk6vzAPMBR3paE;s9ov!-&c=zqiwcV*A z+S0A)tgc;JI!4-tuQ5gRCnqlPcRzeVCpi{6g#*9M3FMy7Z%r5Y3*V{|SAqW=@U1=% zj4?v`r_g1t;`p5K1xBWXi+g$ocE?%nouyJkyCFGoH-XBITzl=?X)l1I>GVUWY_(tJ`Ek6jR~Z=j-qFfJ6OMo2N(SBj)B49nhzvJV?JH1`%yl+I zYc@Rby;v^o+tBX%9oz#R_Si`A-Fi|RNA}d3#J4X7-etTAnAx;>OxUBy!iqw66wryE z@4wXtk8MEqE+(>f&HO)#>7=yIJd0~7n5T>IoDt@86K-2Nb3rwD`lfUw=8|CnPl8r- z*})}nb)z#&#yx7vui25XamxD{Kw3SNfo3{e$~Z?cB|NRl_@9 zy(@QfD?4iDW@Iicsx;i|o}^pRp61uGv7C~*ehE{-e-uRM7;!z-`>NU7a3iF#FtHEad+9FiUAjUS!ViV zy95g{8(vwrhf1!N_sZ^*+?-~Q4YvRf&7Kv6;&5ZYJ=ddf5s7$SU^+UR$zG&_VL6+!kZ-wKb`UnI=g1&b@epEggxq` 
zBd(%Wx_9CpIrRJvT%?5Z1B{R7EfX_BKazRX;qr{1x09fa%03%n%yFq4{d%;t(w?u$ zjmz(}+TAMJ*hBR^jWi><>=F0lWq6xA6zx6ksF*I-Ek4GTh0U5I_HVtfz9Z;q4kOw0 zijl*F-A;1xG^?7oh$WiF-;%+c(fP1v_WFxdOIZU{wf!bShR{swu+wSYtZw}1%jL&+ z1GtBjO;IQX1r`A|mH&Gph$c2TqXV9Qr}Qjy;C|BQuEncPTjHO&RqtJCy-0aiuDoCR zKHX`<1G}X_28}-XVbkHP5Ncb@TQQp?oqTpry_i`-80NVT8^f{VTUtB}C7Srw zd>7WQ9ToYIUHB^DJt)DpxB2E?AVP?PM4pPgi8*yAsm^6?xbFFLLwl^#i!Sde zzR#mhihE~t_ga2&(z_9_1>Tjuy3OdVNk`5Dm1*X4#vzk=8KjQ8ad5_!p{dnHn+2;d zZj3txTpWt4?VDTt)LDj?t_LaIB-|VedHP&ZzCAl(rZd|qcHpeis*Xb(`YeajoSx;q zv$#4t@U7Zb=dr%$Tr3N(XOd;^T3u!+lkFwXWfH}G#hfA6<1zdEa?3!Bt^T%QC-nNe z$-XD&EhP`tU89);lgpgh=3}Jxb?kyinix6vGW#|(h+!Wz?$VvvY_83}ulDI3Pe#oh z<)C7=EYm@6&R0toruM7^Aylt>g1=rdvUT63cNnxyxw`nuvIRFj%Kt{3j$KC3l!0BY z3hiAXTMC)%eCCW^A3w+=Kig;49(o&VTNQ87%~A&(=j&M{_ZtJq%(}>|)XFVHy&E!I z8yE_oaH~;F_!w(t9-e3b%hNAk(R*|Y8c+-Y6!;Aa`3nFC2e3rj6+Cnwl@{Wh!8Ee| zMUd0{#;mNrK-ifDjEjsU=%wJs1iVJ;9XO4zy}i=~IWgUD#*w87r+OuUQdipPt;p1$y_dO za7*+e*7C7tj1LJ@72~7aH&FVXd;RK3p1ni$VY9i-x6QgX?i6ztjakT=Q94S`+DN^7 zVQWQEKyoc>EtESb(`7;9rfT{$XTQn(9T};S#Ok9sLp9Ugh4|dXNqy=edDT?WD76!_ z6y|lv_-uszbu(w}R%zQhS2^}5?y+qMo z6gZEU_b)6UJV_ym4mm&Fakz(wO^4iaww*kOn@?VYi+A&Jm@v#)`s(fUxDQ-vR}rv? 
zqmPBM6C4KzOCDiQ!Ua1bR1QR(!~IlZCbOw>?8R>l)~5LAji0|ay`1q%jKftkV6gvw z7P*dySaLIknR62x&6E|9w#Vc4SGIfh72Nfmrn9PX_b`^p(w0N|3<~Cx<*)*m#;Xdr zzIc7fXZ*NwIJq-<;e;{bj%$$YjB2Ssbf3xn+7)$8wtOy|E0vDnoRVnGl@(l#fhKux zvin7r{g@N0W(f<=-isyu=KBWeQ>!b)5+CbxrL;qfkk0@}sJ zn(^sa-ZAXlt^0gUx1aC22P$H$**|ezD#o#OpKtw|lN!L!rs9Spq3>v-F1_qSroub3 zX9*u~oaP7@wV5HUzY>@bDM&MlF>C)d?<@Z1?8%vq^+J9WBLD@I=7IkOFi=TUa(Hp& zyM>Ml;9qtoL(ESH+&Mca|D>GlD(`zIX%fwIeo^Z3L#YLh`U^B)o?%9;ZVh!bF>OeB z)v&jdWfj!0$0}TAx!(J!TMtx{nEiq%XmMf5y$gTV#ARc?DEP@dmG@nS+E^PZ#~~4`b@5w!T6A#RiAiqBVZ)(FqIHiAX)jOU z)c`}04vXFG!TbtF{RwA5?lERVHkM{r_3bq4W@{zJ{EhX4olBkfYYaqNWZDZni^uv_ z6u9Xhd9^d1Q&r)JUbffLlGSCgB%Wz$?TJ}^q16drcKq~)PP=-++oU@7k)uatLqbEy z<~1ua`Rgj=1+T$G-JhR|ZEUJfRN{ zM1P9Hi)tyt_ZJ&NaCInhuX23{N6bm%`IV%-4Qu!Ijgu*uw84Z_hp{jN0>{u_tG%}O zWy%dYvA7*Ivv3h~FKHuS5B1u^jxk7_#ZwCMH6rv*p<2%_op})zE&f8^(Q)@d3*Hy= z=@>%oD$kn~qzXH!le_+magQF!(_NuW9F7;|SakPX=u+C2z2nY9RY=#}g`#(WQ_;yHO7alvTe2BK~Alis*M`&Z51EG1c zVs?|f+iK&)lk?@7z8*($pY5C(-64AHb?Rr-(rIU@U3ILz5{EZ=?srV3#@sOAjc%u; z#%l7r*GRQ~MCf31!yJV=Q7t|GX0;Ai@3zN);vF>L@DW`pyyeZJw^XruLZdiiEJ9?W zRM6FTJ_zhv%m7d1-hWN}V5t7Rum*ll*MA^k^saLD-?dz6@{Fromm4N*ZgY zc&3G}{sWO0SM#4~@az1%+NBib~fkh zco7TTJMa;<5Pq~pQzLd2I)k_wsqSK}tEweCH@f{S3~)HhNV2c!XTN>rPEmyYE(lg@ z^rm?bUzy{~*6#Vw7sOu~d%I9MjmzNa@ryJa__?|cP_k&grPC_E6K(&cz|4OOABMfQ zKIt~&pEQ3_$eCyO!!y(3XUAtcCXI4X=n}<>g%|o4J0GGN21q=8a97hQNO3+p%d)gY z+~;I=vd`{*e=Ek>8u};Z3PrxfiX{r`iH=QG2ELA7H>e}Qy zNG{Ilb*-O~uZ)@*W7&S@r%JE>gWQTy&aS_zuCTIAB{HJLrn5k1a!Z{pH_OP z7PKi^K7equBM1eizQAE|=V8-_uiuL1%Yph(wS_m$9k1T}&s|j(iz{eCFuO(e!TkLk zG6I-JmU%K&%)1nl9Mek~d5;)z=1eE~q_5zY?&Qfo6%WW9xgU$7uTiX>pnS0ZUY)9_ zsZ<67E@h8Q@T|IjkCz)QrT59BRo&$;6xpHYNXoAf@y@=Astjw0*M2xlBZBdGIDl$W z8b@5iU%p2Y*F{JuF|Q7vT&_}LB3m@CjIH=l0aPW=UbjT0{yiV(!r{XvwX7xqwfn0J z$<8$?oW=?P;Pl716K3B2FCwSz!kaSmvR-4p6&rSk>}VYN+|oWYTV8o|oStRIFVeY) zuJuqv5k*^3EE0Hl|6*LrTvc#*0l{lHAV0i4PvQi3iSI6G4;vg(e$S_5x93PVt-}a`#xX65hgFB|#sJ;ih!s z(;{lk?YA#3y)X4naR~L0pqq~xr+rMMEBI_*kA5J(tC6F9^^;KEJps3y!PlNbKl8q^ 
z<#X(J5-bXfe02Np`aUda>7?k5yV4}qsB8z*M{9Zej5tHg!mJX`*J0*j)0&1*to;I+5xwW$TDp*8Z_|dq!t&w)6B2AO{ z(-sDN6jGvCZ@}Px0h2_NgySzl@j?y+!?+=V#c7*6hY|@4^#Kh9O?MPS?c=)CG)MV3 zr>|d{a+4?a%O8%o z+76nd4aKveg4VU|3vcjnMF(E>zhm?$keyGtF65ZrfEoPy2qPiLJj^j?{30K`&Pr!w zx?D|h8>Z2J2;#K4W@+`xh@gB)W6Xmnw`dWK#(D?BK$O5H$?a$3xpGdIkHwOmmc+m1SNr!+p8^$ypb_=FO1d1ma&&dY%|F*S_xgW zs@sb_Oi^SHnGRIXVp-##ylS}fHTQKR+Pd=2jj^v~gJADqab^@PM}g=0|6*MXNx~nn{CLA~r8y(7Sdu#ujihHjSV2+`)J>aS0=$ysE=Mzd`yf13T?RW9O}YY2 z&zkzoHVXv=g*YhCvEX0qp{T^RAb0H1Pcnju12G~tG`0N-4gzqkIX;IFmfQEez<6PW z{tDAgUjHCFDMJ6nC(sX(ok?lg)jeVkZLHn09)G)>*T({ zb(-4>g~?K1Pi!SPRzDesP{3RoPI| zBi-E0msK>C?VYqdgPeLTo>=N+Gi&-ff#}LKT8Ft5W*WtXcYT50nZ@YmCOx!G#|*d@ zN@SjFqy*R7V68f>S~BRvu6&*u84W*RJCFeXr zTcYSZTQBl|e7di<<)W7G^63ePu$Es*%0WhPNv?D5orQWet4t@B_Kqgv`K4X^PFk(@ zgSg!To^rbvP@>dOlkI zdWl}ksi&-9{3+SK9jp%-qVo20|PP zuL*W5wu3z$6@Bd3)#1U&aJ_WZ@l^)>lyYzhSaPv#C47OCE)kwJF`1?>bjLtgGK)rC z!ZCGmZr&6TQi~8NG!yEs*0UX4$wR+5xlm{T1rCD= z{Ke}st|Wy@+V6X0bJ$QtAvIyz0F&8(b*#}+X;7DW7pFc6%QG$JQSpz@e$2kUOO5YE1EsZS{qyv_BY_9fG|irAb#xP0wVjDE4MpWO0;<*L z4au3QdW|{HlU;}wQ=>wdF~Zv835A4&gAtqO-#4#taZqHfM_*#Z5^hF~=Y8z2FOEx@COO~FlC?(f>7kgmf5lbtnc$TTp(JqX8#ecb zB=6xpquw~X$?#Gu0iqn?NG9!35{YNg zxzs3UN*As$VJB6e2XA>t3GQO8Ae05G65L5Xn9A}tly~YpKhs*U2wm0MdQWQj zuGQy-Z;61h^1G*A-ML+P`2)6&4L*l>gpDfgzOMA(o+#ya+^|`Mt-iUQxCPf^bCRSb zQkP8Krgf(*bnMh|0V=x81jGt46=L2Ba$C(2k3v}tBVCV0p!+Sy1ENXwhjRCF>sNZ_ zE)X7%eLX~LiO_w3LTf0XB^3G>69BF%NPsSZw+5Jixv9lHJ2HFdmjIe=Hfy}rz({#W zs88ALyn+uT-fW9fPfeXJ#fos^&*N2J?Rd!f-ocPb|GiBlv&d13_x{cWir2}hdL#E( z$&acy3no3QXfL3d$(98xp8i>&$*it@fE!d#vcfFE&?>p@}GH#Hu+jYMdb*^hD`n2>G%4GD` zdb3fS*;>itNu)Ss-dmwJE2gALC1-PZyY98*@Tl)#ojzL3d{H1Ap!aS`xFzL42w1Z%0AGY6dMhuY~qZJK_R zI@M=#<5>NJ;O4Tk_`CJ-lTOAy*cYyy7RQUioin%Yn)v8>QsJ9=c#1m&ax^_`SIlNJu&zHd8F6{!kRTg-SL1>;lz?i4!EJ$3nz%&9-K}%(_@JK5V`7Hl zlU|s7MJC9C>QOQajvOM18sS!yo^QllY4AWQ36We2z8z^zyvqyCnxFV@d7uq5R}@G9#S6*7n130m|3Rt_paAmm_5S ztLASX;d_uLoh6O7mO;qX#jf<#a%94foxwU6hicOYr zXW3r6xdMK(p`|usQ?`fP*PzKlzZLy8#k>{lq2PjnxhG8f9=z}K<;U6I 
zujw21>nvb@#PDAC3<%2122rrK5yVkHYD+h#$tL50lDzNtSikaSPUdXZ^EB6UKDAjr zd_&Klcnpo;8E^>L8|+;qBlle`*>iQ9fK=q37Zcbl>?$wHY3Riuio%#bQBKf+MmFg^ zFm;T8nP*8)kz`8ONpuR~^bUNKZ4IwPOyG*(z01Vyu4t*b>G*(^U6=raxcdo5{E*T` zJ*lkiWU+M`;=2`JpOOrG+G-zI+iaL;!IK~zqtLn|caMJD?D3#m{!LuZIaTjRUVu*?;pnmdt^*} z8uY)sbe!?#X589}w*y^k{5awb@ypx!;Wb(V8u8}^9|ogwNH|frkb1O!CLH6>eyg2x zql`-Q07auw;AtMdzu4=8U+uMT2i9voIPu0un_kzieoEL1;JTlCg_dLZRskakjrW&m zd$#)hq|tfjL)G#RoKxrzHG?h~epP6(44bdKO21Zz)f#no^pXos(mCym^WDa>VAhqO z`j%)t(Ndq-Hy1?>=a=!qIbfvDWp82xUbt}2zFDNSCP^GX<(DGhPNd0yKcZ_g1JwzblqPxiXz-rcgDN64rUF1m&N zG9PmyK_4Q6>6@e?#k4C)5sbgU1tAF!C``h8rAviCZsS+-xCd?JoHZF}sCEV$_h_hj? zb#olX%DKU`p_AWElb5m+siWYI0&hYgf3b4}kAo5bZ)nhcuFtaul2$H>)?TwuJP7cK zPq}=i_qo8E`N#*aOB0~Z5zv|g_?IbtiX1s8_mJ&n)fb~J+=0=>t8|ylQWMY!jPQ)K zMhL5|pAanHWf>@FeSO|&cX9_^h=5ibg|sNZ8Wc109N->ds)GZFAkHk3bjYu zKOIk@u^54J>|#Y^Jj>rHQwj4ulZYelfs-8z#X69wQwa(4q8?Jkf$z{43%ICb-4If@fWZiUFoSe}-%m@29I6q|fLD2#f zP>=`s*6BY16Fi|e4miUnwOuBOPet_h@tOkB2L03tydXR41+-JGAfln(kRf3<7`3&5r^e@$mlzx`%&1OEJjq8>D!sFtAfn=-_=~ zukw<$ADg5cs%jxY6b{xd;H+}+U2mgN>)d9V?Wpjb-qhS$eqdHT#W|-1*d;D1G&QFSae_ShW~S>|E62&LxY?!|VoX?GHJeHbM_F z2Hv04i{JX%S>4^Yv!6dL@AG~=0jI7+P?v&0*rZP|sxxy<&vmLZlO8)K+-ZT1Utj%; zo4KKKSqq+m{=Ady8~Rw<7NCH;ee&BEi4c~)Cpvm2?G!efLg9jCT2hc3}8FM3E^VxXbwW2+VMuX zID){A4uIHhHyuH6AfE*goC5gY+5@1u0usT25(sNcD>odlgEcavnWLSf0|;mcSONqF zB4S8^kf25eXa>-B5I>*?1rS&f1XcpH2m-4BY5{@OfFKwk0ibpbKx-fYJ{+(P2&@Z2 zei-NOV2*HcHFI=9fCPaZDiaXc6yN~@n*(eCIY3|_{D&0?Yz=4&cx(#-+kwFLAg}`< zHqaCtC=^(O0|L7OEdVM5fjvOrn?Se=VETIj?E}661;Sf^b*;_atN_&^!NbpyGERH) z#|IYw4wIky`m}RUbu7?cI}=M+K(~L>#S*S&$PgevR1XjsSv&HEiwC*jl`?UXKh-aY z9}4J2@B0lLdD=k%2!CR2CgEUdhX4czYq}xqb&v@a6VKBY0XhL%@NYnr)Ak{oV*ug@ zUR3;lxN!cjb?^Y+G$1bkZGhmw7y>RF0Rce}A8^Mc6aWM` z%?ms~E%!a&KjnZj$h07^@pDQ;)(_P8T?icHFE20~kaZy&_(Mn}j~{7do`2Fv8GsA< z9Qljn|NYrN4Lq#}`TR#YAdQsqv`%Ebzy*9X2;u=&+sI#}0+5}CtOxlE)O%VNvL3+T z{42;m1Y}G8xj_~|Ht9#dBgH~q$PB=R1N&a}X;uG}`X&VIR57Q_0Y1oTkwPOg{b=xy 
zUqIM@gyIJ_reT0DJbeZTkIavx!2xT;iZj{edZZSACIacx9z+WL<2oG;K)%xv1JHl|J=HXlhJ60h^}S9UqyRq} z{QEC*DxCHpAoS1Ae(?SA`~BHJqv7YgNX4M!@uQQ0-_vj2kQU+8Yy7Zu5+Gn*f9k>{ zfywwSB?b7TZz&lNU_DQt$^u>gEhPs!{d^QK-wGg}Q!9!C*m%H^o#s;lM&GxTGN8F{ zDHRaUcOI%Bp6@)=K&PKS0kx=uV5dBA0FMB8r_)+AfiC@)(g6YecuLa+49K^X2?+L` zhbb^zzR}D;uu~p5-<=ZdJ0HNgeQyik@xHg^n~ep&$vCao0>t~B*Quui<}*OX`R4q9 z4c5~YHl@KJN!^aR9#toEbTN;0gjx5`IeoKKXQG4bb8@pU(Hag>E3= z@aAbAq%Q|fP@UG{4mw>j1G#_lw7_&Z&FBG4fN!arAYgi&(!4-?-kksN*uDOo7!6m$Lj9NE+#&n2KgX$G{Ri|6?}5qyL)? z_CIEWa08C>-;BKfOGW?y0U06O0s!luyeLvP-^KpdwIbE@2N9{U9}QFi@Z7gv{DxRa-ExOWBsXyMLnQ?s#$V_*nZPyAt~2_h^GVvaPUe!r)%|PRM)Jg>a_p zF3jGP2hg|=NgAwIy$LM)aIFa*4Rf<8FxMN*EFEws+-jk0luSJvQ#3ExAzeiz;#Org zycIsX1&I%tM~fDPWQBCgx_2n@4x{;Ds!II1AVQAbU$ylu?i*Oi+8haE0egxZp{J41 z+yMY}I%K}b0s#)K{99NGh-QWa>TCe2cCvGHv$Hk@dBC^<^$Buwif4LO65e;|kNG^n5;a0cOb z8VuO)`kls$w8hB3pUUz`G6I{uVsNlSAWz8<%c8f`Y$}7e1Zahv`v5?SU~)m58#V5aKDs=@CYD{@UJxF zocn{uhqU3pU&q3aIq=vQP*w($D^u zPXIWA_8SdYcK%7@5%{A%DDNM01`35EUG1+tfF;o%G@yHb*C~_-=%nAv@*tNZztsnW zBC*u3JYc-QX`bI{e30M!7YY+VBEetF^1}WY_fTN${!t&V01^rQS{4rZQ}=LQq;LE! 
zA0Pi8x`+Qc-=J{8KlAZHfsKdX>H{qCAF}ZA{n2m0R?N?RMlb~wi?r!1g2tk5Bv7s8)CgwJdRuG<(r>;(B8W!#l z1AwihG$4wFmpcS1=KzQxesY&MxywO}eiH}wX9GaFLFrvB0O@{;4^_8tbMkOC1KI=1 z5AavAFt;%gck%)_asmH%d3ZUwc=-{$NC*!X4<|1i32=c*0PO=Raf9&xlv2#m(Fynk z155m_6Y%jbB`R7tTDe<8U?+^FZR~+&L!i?3fG#90%$&?EK4$}c&1x}>j@)k@$!kIPpDpBS;f6p zo4%j>Ek~10L+fynX1ukhV$`>><>vNq_Oz{cC-G6#qy*=#)xqtB2$A-6+I^*&jfZi} zw5$}aHLGtGpbe0%jvB@*3kWYfb=UXm3AsEwfj%{}Q z@=-o~VZQMz4UcJ#q}7XFZTuS^2R9Bb`hSzmOK1H-_%h=KA?~AGtVJ6kPNMPj<@2Lf zPP^67-mHl&59diM{0xO(N)GczRnL)g2RmNNtSawFRTH1*eQWmI?bQ__4MT7D^(lG< z4VN#0hbbblc)R{PO$uAz0vg_3nrLv&yI(eOjficUQSe4hf;3ApE5=ohamBj$4V^c5 z1y&(BC7;N?P`}w;q-b3v7VvK=7=&XGnvgBz$BR2y5Ms-_+#QOlJJokK1Qpt^!2`n-WAnv zC!bEq)VJWILEpzHH>Peh8r)jcURNP4!aMl%Bg0>Co@{UY_RU-0a5EZoQ#}T&qNIc3 zZw^+}%DrX7W^|hiQ^v4wzsYj&Zg>BO_3md1vMc(`k)JHq)ep7?swBLyR}NY(6GTsZ z+`GO_t+eWXBPpZic2@>RlGOqhla;>;Qk1;c!;nW`@}*zXJp(?ybQMfy>Q&1xiG*>_ zOON8d4n{sx(Rj}BMj%w3IycP4SCss+|Fh*JnLO&F!{NCh$ebb1bat4!Q_8;D#r?>8 z<4Agy=F&XvQI?#ME97nS515Mzq!QX*m(-P%rz{zCuSk?&Wf!jP1rP1UQsUuhnjFsH z#ng6$tJHnW+EK=X@ut--tf5pMs$6}gPuccjI!n9}|! 
za+5rt2>ADEAF0}0%QjHjO}Zc>?4FDT!6Su_*|yPaHaNQYY7m#5MzU|_S&jOHF|cgErT^MLSl3Iv^<~AhYH`0LV|BXQd=aZsz4dW9 zvtEdyT09PnBGkV|lZ&<4XRfH_E%EhF2R@I#uwolf4=@uD-)-u=ZnZsdfh@O2bH*>& zmpi|~;XR%F0g>4?kpc=EN_DT*M&W{x6un@PyGOc_aI~f3D6?Vo&<9jJGf~RMvgFkI zg*)%>z4 zTZ7upE7U+u7%bj*XnTZrt11oMh|Rm2;5R0)K~Or^@F>dtBc(%I;dxvrzy0e1t*fC^ zxs?6rhK0}b89r=Fdjxl@u7z`H#k*^-eP!~Y?`5!cQoZD@;`!8P3gknjy*-FMyQb@b zrc)Tl#9vXoOqh|aZxYGNf+wM_^f94AEi`9jI6yxk$5!)(i&uvUy>c-{Za(c~{*@{D zs+LCl5ijj&gyt-fzW0SZxx(`M3i&;(Om>u}5R3;*QFv6>U5COniahlnI$!Q)_YQm< zy7n|+pOA;Otg7c}^Fn?Sp_rOp234N|>m|5Qa(1$Yt`q6-?d-jX76n{N7u)`G-sy>y z{kVELF3JzQZcKMJ=N#QjIkyQ=W8?X01}#S(1;^Hw9e8 zRdE<6bogu@I@0S0clSg$yb`c7?CPa9ippVmBFD&(q54UGT<2%YR<_Xkqb3)nm8J~2 z8ei{Y-<0lVW&6@aP*)T7!4?48K60inqU2CJAJjiMrcq2&~}mtcEu?B*={9$CWm?C zXC^p)G7Hmv;XTdvsl4F%{Wqf18i_89L1UNfM5_zW*Q-Y}7`@u2>?aYHL1LN@$bS-V zE}9}?%|N)Wv)e{%e!Hi2Lv4rH;;BH|*sw8WKj!tKErwh6WL6NkS{Mi zT)1?bp*)rjX>nY?bQN!N8&4LASMi9Ct{tc$Rfn8#Yf@iJZ3mp?$a(|;p|ag{U34fO zv9Oelra?4&UYHdppS=&D(;DRlC$KBjl-lI_!p)!AiWg4%$)ZujMD&e3`cSj;YML=L zny<<+?ftn3FH6POw^f00rtoc`@+)8~=6Ov+t8*|uUQ+{{g|sd`t#XR4_}Tq zRj}gbg;cm|jyq}Uc;Drx-2qDc^9l5;uLugnG`w+7dfQWsZ{fHjq)Fr(qisoq7Zf{u ziPZ$=dr2bI&)S4(u@YQBb3xJ_Sr2UgYK%m_Ug?ZJoA^_yQtbLt=QOAH)wK$@VjK@3 z!(Hbu4G?T_+?^F1$EskP@w@6PG0hkp%~G}Oo<+r@_HLF=D0y0;9{QCWvy~_R{+Fk2 z8s>wEj?#1LLH6#1yV4uxId9Q=DN?G0e%e*8z!UVd<6*$aL|}dy3D9nS#(n<2<3@gE zN$-o;*A~~yY`#u$&}kKUw;U__zGX$T!l*Tn{TY2=bfyrTV=7~y-rJFx+M+mPtf$f2 zePKf{)pvCJ`MSg8Xk)dX)BeO(!`cz==mX+3mZdMon-F-${`WgHw=-wFHZGDFk9|(L zOy6w0GQz{)KCv^3&yrXr$8*ru_D!SfJf50sj=`OCnEbZ|{hc@N-`S}2Vsl+^YV>vO zDJBi=H&b#NK03o8zbgdd+!J(mf_RUi90>2=+(_6j7<`NYL81Y~=pP!0eNW>+7!m+~ z{{aG#QExAyi(yHuYXmDyb*pwutKrHW4RTrQ$nFGk$S_zlnbPxrneli(8Q%o&r zck#aErtG8()AloCyjsyx&qNg7&{`N-KMz&TY?jC6Z!&sZh-FJ6Mn0?Gi46@^XMPG9 zoibk&jLtheJU=qT(1oIrzvKE742wYW{hpRU?1GWHmi&dc?WtxWGV3guVn zlrHSMNajLsMP5!bA*<<}VZ#o8A9bZQr7bW{R-QC&r4U`!vG9CKxq*vP=FOyK9OB-% zavAdQ>E`ZlZOWQDli|KTm(U~vi9b`n)Gw_Vl-Y%1(gz0I!w4AXT2BXj{{wNcd zgz?DBS3B7==%h4dmOHn9{5Ve-8AXAjw@}PD;QW7KX^iS}F&Hqcx&@LSuVhV_*9+9_ 
zaC?tlK9{4B$A`3e7HXCgP>RsK;YsAVZ6U?Wjj@iD>Tf>ku4Z(a%Mn{`K zu=uB%NfViW`zXa4`gxRJ<5PlKvTF5vKV{c$7w!fs+w{pAw*P!@;9^o`EyGS2W{`lE+N7?rMxX=sjU+9Q%%kAP^-iVkUYK)k z%7L&W%p@RhKa5d@TbNp(iAQovn_i|&7PlZ^kEI&=Mpc4S!hL25iyVnUB5>k+N>vDM z_`7{_3p6xa4zV|8_5b`0@~qmgQT3d4+<-vR66Wt ztK^3}-1i^cl=l`%__4mE@1-TvxuYq_U?8P%UB&eW{K9*5DxVTRp)317&m|1I^v5o& zysF73sbS0uO!h(|btbuu1%Rv5`P{XXUrq3pd zWgD?#3O3j0_pFD#B=0b4ebeGks_JK+DXDIL|I`oOMi|k-{=s$meQH{Qm`MfexbrSy zu#nD}LrYU>!2`3WZcKNq9<$uEg*Di_6h}%Y@^|qLCXSsyBzyPH{MiVb&1WeyoR&F{ zAdZcg_|~+ZluI-|b|wx_0zSRxn-F}ly*}$J;-9fjEJ& z2My*7p(T^_X@j}^D21+EF~G;QMp3CNP8VPAApw38nkKF^C1r|)l4zNztMRC74ZP;# znZQs=c@#o{Yec7nLUR4L$spj>s(?`3=LkbcNlyy}S}{80l6G9n{igfp&OC4CCedZ`Mdvv6$CfCyo?`FFx7ZV~sa{bYrmcnmPl~Y(N^4naO@uimt0H*~@JR zEhT1B>|E5v2Zb1JLFyXvZD;eQsT5_x^P71dmQPcWx=d=6nW)P`_V^yp>JPP~Vb&H+ zlTOi%z3#hXsgmbtHn^jIi;d_=vrF8XqDnBaAm+Aq_7zz`>=rQ%y(m|98s~NTddrDi`lL z!Y9%prSB)|Rj3jO?B7bIumdxoBt7)$r+2X#f8eFbOG_6QIMeR)m73 zMFX1BnO}^NWgLzgx69qRixFmWPHc}&C=N$(*z@Rm+1)zUO)g5b`+8kcO_0=cwy(5A zVSg_ zqmUg1VB+QZZ}lOg>Mp$kLW1uTV+^y~8^YR&%@Z%9n{uY9L%k-h@wi`WnLBe?IcV&r zWEke0VGcZ2Gooal23l3`!jzm%K^ELGj|ml|iH;%jLm?9?SnyZYtXAhB3By^LBsPQu ze6JGf#th{ietD@t0$buPdk;gyc9Zy(TgfIe#`~?+LaUP353tRhQiGQ<-%9Q`t$g+P zP(5|?ow8xdo=?TrT4TJLwpOE3*J&{2Zudj*oWc$ml5xjg zyPK?;Ud@scs<$EZpzDJgKonZN;yL>ViGD2c-N%(_0C*1O_MWpAZ-2yc$DMuj*8neH=@Y-1Ewr(#qR5)UB1o84fM z8;x)FZe_YtDW@fs(E36^5Zfw>sU^L^n2VTVFpn*WYR;6-Q(DJh(k6wbYsZ|m=!=tj z$fQ3>hUL>g)r&xZ0^nSK5p9K3`Qux-a8|x!3yaO5$*;>XJqzAA@)Yx73cmD^>UtSzgmE}n|-^(09xaJ)xgNe8w@9Je%Ic*t*KX&P}6y}8X*KMRiwPRJK`kmN8%_{`;F*HRsPF-#L}DMVqgX zWyqAhlg!0C`X*Z%pW*Rtn+6^4^IH`^&BPe_agEsWoTb7=q-RwxK zP;RCz2ojEKCx5KB_-5aJE`UzGOSZ?fC-n}69p=5rHNtit`&w^CraN@goCozGSTRja zo`zx#Jj=IcR`d_^eTgsciTQMkv23r6X9rjauo23VlQb|UZN}HQPW9BhdS!fF!1W3Q z+nZ1B+clxDdYdfnovQ4ZRgpzZ&Q!vL%OAgBb-TBnf4KZ(Bkub)4T+G=H53{~0j#+H z2KRC#C^!OBGRbW}Y)<@+TMijkSuBxrKSl)~K7H(9s7d19pTroqk;#_JIvR6pxg~rz zeo{v0P0T?ob-6{BizIJ(&_U-)kDpX=jYp5v{Rd-H&C74}W-c0%-CO?RbD+zd$+<6k zx6|ff{?%>yFPRka(ep+b`dJ0>^S$0P+>2c`YZ0GEoY?o3#u(2T-xJttpXZF(PR 
zr?+m{4!b@-()H|umH2_GTLfc3LWv8@bfmoGK4C*s$jn!GX zD&OYe$0oycy;CQr4uuJzfU&%P17nXTV_a|mQ6>F@ zsHC4Hi3S>SIau1~$F^WiTdS;fAl(97_!ir0D3Bp$Qf zUd%MSyK|9?L`%3=>57egZqVnMI6Z|weKmtt{jr=#GCya*Ya?cGDm-OThG#WV)<(mL zogBT&1QD!xy)$Xpv*TK2njKa;Zt(%l?l&?R!-PDAl}u_;XwmR3IfWk0DyBJ68n*EY zn-N1eTw{%GP6=PXMrYM=>-}4)$i);j0czi^?>Af9cdp!cIqf3x5QQ#KU|ks3Uvv#J zy8Kd$C=R->QWHUX_XFa0vk>R$)k*2we8FdCFs{%Oq8CByV{z(jHj$LK92}f4%Lv;f zy{b)k)I-L7UN%(He42}cM>ap>TR@iQCoVjpM@QOrwFt+r>Gi9X-FQ2Ll!qvUMS+UB zx&LCd0U0kBPYrq!4{Xj41g0O{)7PE7dK!-gpaol5*bErvOkFSRbvgH4tCuVgp@$oJ z(!*@~`*WVb&qDayLY4LfT|)enB1h85(;YJDAV9$QH66J~SM@b7+?n@XZB zD4ftpYUa|wLOE)UujN_V+Gn@zP|o?H-E>kpIvZnw_`$-RE`yxu1ex;zb3>Im9J}7T zS@au=2P2y!myhT}Q``cjCzOl$BDzd+s~1%@Sh6^5MJt>_*u~KrE6O?Q157f<)7}X# z_%X&+PU7XA&5j_rE5ji-KdYOWVKqWU=#QYGjS779b)&B`t6w9grbB_dFH+L+-M3Aw z7V{j;h1y$0{&w+FE^xkB!6BOC3kEwJtq@Y$pv3xCGLrBE96l9 z7+Z*t?F7*a(SX=6e#(A~Nr%16y-RD8M`zkr@_12L0E)Q<9QhYOQz=ft@wMnlED87L zU2q|`m>%;_IomD!tc*pC`-`(Ap~gACa8=n4NjXk>GnBh8F+-O=e`sr9Se5X8&e}?x zn)94BO71Gt^|#+TbRmUtX|M4EXJ_U--dvhAab2Cx4|+CD?vp{cAgUGT5KM$)fwZI8 zmwb1-SDr`Ue%#~E1{NzE3at4M+{a0Z^+~qT@prllT$;QZYSzDB#LVIzGbYaFR1Fid zkXTu=UG>D78lL^OgXq^Z(4DfS{GhvxIo=YGH&mFOKOAb?-@c5U!h7aCmJenbmEIaU zH4Z;f;F&iaQ#aj%D_fqC`iwXhrrvZ)yr?U3(a#c0&rcFxOQKafL3`#C+b?q(<5ek8q0j7=8%DU-4X(^!94g3!}@| zX0pHEomEb+H|)aCIml?p!qn)dy8ghX(MEwjYjtIRlcqiQxq(oVRBMh`!C=>-94BqD zcPsrlWhJ(V1qV${X&ojjf{CW)&d8ZRw=ww~y3}J}q$G#8h zw5_oh-@sv&s&768VSjSIoK+?o&u{1~eU5DCOqr_$I(j}G#@jPlLua0HUplkaLyC_= z6cp18IL}`!Diq|RPxw^ z&Zew1L2K3O*2NloQct|@9ZK6CN0E2mxCnVR2@#h|9T&ew*tOxWhP7H~VVr}Yc`;*l z5YB&YVaJiV1s}T}O`6Mc&W!p1J?c8eOA5&cXDQsYZM@@p*5J8qqe+oB47ej&$tcb@ z_+{6VuN>kz+TJinp-xoufxlV(L8^8*V8C$p>#%w7Mf2|YFzL>fFC9_JPZ$g0n<(VG z>ADaIy;VSmjN{xHAb9fO#g}X9mpV(f!=61h4P$&5;f&smIN;ln;tpRk$S|p(v~iAM zn0fPVU+}eBR;~UhJkm0*<<+3-Qp}=fbadi{g#OWC0V94js3h$$H9_j9gBGnc_JJ5d zOPv&CFUuWXv{_RlRwZhK=n09A0!=mL!p$2UewGH<>?MS0qIzkc`aDSUvE~C2)kfou z-Io;E&V1gw@cpt#pRtcCx$}?|jxMiY!@i%J+j}x5jZf5?WhoI3yE$gJKVL#%Ew7BY 
zPuz~5z9QfP@A>-DwBY5BGi@VA=_quG0{O#u5dXatq@Y*_~mB!ZlSLVJH`6N2t^%SF? z4j-Z_#n<6~xuZ+_KI=_ATkFy{fy`_^_nSfbwXpBp<90kw@0|JbgTspNA6(x-#Lpe& zk9#PLobQ)zW7x2fbx4jj#LP=AWFH7N7q$_IOnU-}XO-F}aT0|2+H%kutmeh5u}10{ z$#GrFR&ZTxXVK`tW=ETS)+!1?H&fWf+gkDEpl)#I^18@}=X%yCtOW%u=l`3~9xaCZ z3IZ&b)CV!ws8;^Svk@FRx^a~}o9y8#=Tp7;*B@Z)I)JSxmG-+YG05byl7ok%q{#wix z7|xx2{$Nh7fPx2Wn+IaCbT9ntavDa=kQPJqkF8aTv=4KxpSN*}tygRHUE?j0*rkzYrszdL}i+v+>K9}~1x z*ubG@w~-7vbLD%LG`IZCbSG8Veg z@l1zDD*GS-)^|Q2X?Fq@d?I&+9F{!_KD_6nGhN~##~8A5v7ZaeQFx*Cy;1WU5faYD^!9Zl}wtG>3=%b?>?W(ywRD_fUXbE9TignP^tVAUzJ&hyoN@tD>N9}^j z&t{41<@_|r6TV))#j#m`pAcJX!;^2ihH*ELP%1s>PWhEJV$FgJCf<&H2xZ!?s?=S) zQEy#}VKFjXXZOC(mYKbSs@@n&aZQ6C4_PJ>A~j>*P${p^b~1z$KV{fgK6~Eg(otVs z%HHFFdbAbA%^QPzCEZY;AQ46sEk^;Bk$(XT5Ij8Ez%B-lIv1Q@zn zEN2)RVdr#^oVYHegX&@7&l&a)Ib2K>yiwpw-oKcqA>iO%8W#ecFysl-uN@T8#^lTb z@jtn9mZ2GcdJ>YPDutDVCRvWAYeR8nor#p|PjP@9vp+X<{$h^%OB^&noIkeqPVJ&S z9Kv~kf-@>`_E+YoF6Rm?jMuG@+zP7ESeMI7kk~u26JuNYW_*WK7Qf7>u#)v`b(BO& zN~71iO_e*FnS-XA{`d9VNfeV54aGelnw#BX7D{57mz1TGQLV-iP8w-sez<}wv^a*= zW-fvGfb{ZwSAfr>0`zkuo?51Z1{^bmQqNWsgI?I4UvgfuqSHf&exK;=57}q=A<;Tk zC;sFOUGW(@vefPL0C_dhCZ7!27t!cV;nZHuS9mvScXU6ys>FV*J%Zp>^D0O4Aw)%G_N|4o zI4;Bct+wmg?`-_)7lvH8y;4axAcDgaX z*R)WFdC8l@m(h>}#fT~lc*Rf{y@4`K`U8yTV;)L}llb}@hRa`DQfc(WO^e7~+aCLV zc|NuG*>LsY#X>*P)^~hn?q%NZ?>Oe^^LNO%LOqM~H`=ze;TXwoG-^(L$+V-2L4{E9 z*_OqS8FuP8Wa{w913iHhgKOfcl&WG*NwZVarWSXqEd=w-1Ujm8?Yb8;(XZxWmx{fe zsMNE1B(iJhzw-4glddEO3Jst@Sa8H&OhkZPT40X>r*@xYPt=P2LdRF%kHbs3M=wcJ zzhQ*mJ)={l^tKXu^XjBF|JvyJBT0vSk`6X{naUS=Cb{D+b}o|~f-y|mH1#s3)FE=i z_7X2Lkxlbt&^OHGc+ya1+?0g<(NrHpNmpB+(V0N=GtD`(uqDmUUx*Cnn|)v3D&#X( zoUirnc>E?a>%E;*o$mn-UcFN5EjL5?G1bQsy8)!JmC5^uvx*N=1`Pza9#Cc*js!kd9hAb?Ae+!n@!;)@>03#ddq z2A9)?*_3*Mm3$=M;j||+N)*mL@s+cc+lz%Z2!AN5>*MHFnDr+s7}t{RVyzY3BJOSK ziru%k5;tl4zcUi+NsDg>(tR2Ph%mmb1Sy^5n^St znnhb5jMjUIsGpPy3K_~hWnWW#O`w-Ymp&V9%{<#~N@bvbs37{~ljVlqr}z5zXz&s( z2T&*&1;PW+{eQlz`0Jeu3~+A$;gu*jx9{o(=Jj*O^Lp;Y&MA5HGY_0`?xNs{3Z`)X 
zjVHTTM}W?SCE=^mgavt(;T``r*Lll{ewCQjzGFbhrKT?b+uSD@gV86OwR#ao8wJm2 zVIpLip9IDiO^M0Qgwf;M;a8`WI#V|&qG#S_`L)C6$-RCXGQYh0j91TUwP{0sw;KkAZ zR3}G*oGZN;mK3;V`#8+}&61!ipAWcZyG{?)R5fSxKl2V|uUp$NR-822Wpd+(>R0}? z1s9R|m7$|{`YyyS$7Yk$m22%2_cBa;b-l%kBo*#M)P98UMuP^sIM)AtWAd6LAR zC5o_-vXH5XTI{N3dxSiE7TWPlymK++TxztWjEAOm@U`x-dy686dg7|CcJn_P>nQf=BWZNwM3C zTHinXIMNR-ctU1BOw03ujLm-f;bY2rZNaC`>|RqE>WKP_`>%RLp9|^vHtTgIzpkDA zU|ghIK;f2aql1u?Q1Q6b6(c1%XLE;@&`%T7`Ug{sxa5G+le6aUhA^R?$;02eZ(p7J zkvx7gdU?^uk-9l%$YPw}B<&?hop0hnWA-Tk>^D(H~0a8Fv#{PW8+JORDR$M1>?AGIgSc3C-OZ4iD-`pXPiiYde`BaE@(;v7$}D}Liv z45QVcR|&^imT7VVAp#1udMew`cD;3$g3i3zl4n#BJ{E`-*(P$aTN|X6cYXD`5nG7( zc2L?=?=BX%Si*wyyiWq7a|!q0V+Wtq5&@>=8maDEb3|e5N|j!}Za|l`e(d!{ihPe7 zUza}S#N~XX_04^1gPFKNNc-r1>GuMgdqj&-Lwy14S+wUWo1=oie_S;hN|sfbv~m@y z!_~9DITw`3?`JP<&VGYvzQ*Se_&jmUcZAq4w6H6@dG`*fGY3Yu?w@zQum_=g565 zO2yW5FNdCx(q}iqf#tC zpVPWB-C-;ZWnK(?(G;O8ROA~qenrS|dI2Yd4MF5mG9JnI+Ld#1e3r}xH$mJ-d`0SW zxz~|BA)`fsxh11~?SLs&nqTNzR1;av?&Wu|8&D4#(a`>R2BSG$Y zD$FNdKZ%3Na|>ImMn=BrnPY~p>j6uF6}k;+GPz%ct>w+?uh7)z#a79TJ%txrH6%;~ z<;bHCqe@D4O z62vE*4=-Ar4R))UVl!5J97J_@VmGK=-{q%^W~f zF(*{5?+~}|?;DeN^~~E>eB;w$PX>x&VJ@Mt=ljUrQ9aUh85rjWyQ|8t`0lM$wpzAnEo|f>K^(^NQrxcwUNy5&GwF7*?*Gu`J{6coV+#ii&M?aOSr3D8bemyww)46bdnX%`K%Z|6^&zFbDSNS&^X9~ z`Z>BhW%WhT0u-PS4t!qopU?>z`!*VQHA!NIH_n{II{`GIqYQ5STSUu;YBZy7cXyqUmJFKxC!|&1az~n{$ zSSrIRqutG-193v+lVCh)v^3{HcX4I?$FO`|=IOdKJB0zO9Q}_>>ZEAKy5;Bw33!+j z93nj)$eb%x6iB1yoYzTzXrA9z6sQsgeDVIDs!@vhpDAL9i*KY0r9GeeF`3@M zV-Rqj*5FEUW|OVAJ~s_H~BocZ4q`Im~&kcbjrj z%4=epeHqjEFk$WaXm`5eJ<*MOD-B<*rGsx~dR7n;N;-AwvV1&$ErS25)|+>oS+&@& zd2%jaWhH(-XN~**5nn`Jg|QTl3sWa&cB9`AVam70I7zkKw^(+VgH^&zQB&ci?Q@oo z4$|rG=!vDh_*{5qt#*G#(j!`mM%>$;7$ zKcdA-4{@HM=G9Z(bvHLuENQ}#)0=jd9;c0>`h45TTH$g=v`W&#g~X(V;e_YUr=E2_ z{r&^}k%T3RwGiNYvj2sNpC>jPAHx6@H*vH0nUYbF)>GBHD&}fqVy~nLYTeb%UBcSL z6(|W+G&xDbkw|PPSU}Rk&CJ!t+1<$%0s}p-Is~fWVd{Q-7zD@w2tmpw4#1HL|E~9E zVey}5C~)xbAR!zG;MfE>7w}#U@NNwccz^;}y^@K$tIhE_3!Gel8wHUr*FPyEY$zz0 
zDscP*aIOOA%Yd^Pq#S``62QYD{&y(FFA_FX(#F!#0yx?MJj=lVf&ehGo3n`-aLfc? zUe?~u))tNssI!HujgvV9-0JtSaCL$}og4x2-ERUZcWYM*AZ-bp1_AtU<7ojw@?k^G zoa~((AwUzLVh|{B;DiJOgfmh=1AtaTpt8Wx6A-98a7qNAKL}I_PzVI70-P}cfvQ2E zpuT_;DTfhQkg)YrVywZ1Zoa}g1jM6;B*aZ2-F79 z6Ck2J1nK~R0%8N=IYVGTA3y=w+_0hUKur*+Cj@#EI8Fi>_1-{pfUgaK<0GItHshLv>GnAYp(;<$$9_Y|O+Qt?VrT!J!)N77p5ALdnGIxJ7_M z0Q>&mVI#-w1Dj(2;RTkr|36&V|F3nxf%oXa3ji4qB$5Zbu=)7-Aw0kf5iouj1c5;O zyZ{=KkNam{fcC50Nts_|Pv5bBJ^v>^PzPAg@dfZZ)(prDym;a8_KjnZjV45FT`8lS6^#k>t2!RdWxq+bo)&(~3myjTj zUuiJUKWR_~-~yk6caZ`CV3U6JJ17=-ff;}c8*x(gaaI46`Y8nBSTV=U0X|^0 zpwM8ZUk(0s2Za4cC|+P?8UeibdHf6z9?TEYkifnaHz+T#Uk=iKT}Z&KAi*;KqhTWe zUj&xt0~GM<0$cGD}`L7{(L$JPMkJGK~r{`>t{(;y9e{@Zm@ zCpIX+uLhsKgG1rC2LYjffA)*-ulvcff2`s6yr5!G^7z%s!2S3`CeR}s2a8{xP7DGA zQ6%7r#Ua3o%yCKr7?wX%Qb17pnUaP8e)RaM3}6I5Q?d~Fu@}V#d_4Rl{>ejtNOqi0 z5wOFbDJ4LMKU2yO_z4dc2>gVHD&+X<5FmP~K@cZA)WPs{T#E+Kxj$2y5THkoXS*9sWXKU3Ba?voas1Wq7e9%r$E06lRmhCR@!KWPpSh6(@aEAm-5of6=_GX=r+JQ9 z0+A31-N&mAKw~^0$1`Ox^}jI7bHdvb4C2RC-2{fh&y+U=7$^V@8;G93^3w6sV^{z< zHt@&)!VTD%UnV6DgB(Mx-=Bj_emxh1{SHt5IqeE6AqHfEnwdagV8u`|`2RH<5YbPv z{U!n^{RxZz$ptp#4m7c#>r6bVq!TJB>i{Rovty3-p67-7yl7$Pw2fW5~ipI?adYFIb z;|3SCPSbe)nR>^-al*z2ImI@1Niqpzkxu2Sm6|n8;sAVXn+qoP2-0B z(I0?c`@>FQ+<=k)QI`J?dx0V0f7lBQfRm@k8w|++$)K8a6%_fPV&N#7<6NXgQ9wa*mcxkmJyD97)~W XOEjs1TBrW5VE diff --git a/audit/dataset_statistics_summary.md b/audit/dataset_statistics_summary.md deleted file mode 100644 index 11ff1811b..000000000 --- a/audit/dataset_statistics_summary.md +++ /dev/null @@ -1,37 +0,0 @@ -# Dataset Statistics Summary - -This report summarizes the latest Every Eval Ever datastore snapshot represented by `dataset_statistics.json`. In the statistics file, “dataset” is represented by the `benchmark` field, which comes from `evaluation_results[].source_data.dataset_name`. 
That naming is worth keeping in mind when reading the figures: a benchmark is the dataset or leaderboard family that supplied the result rows, while an evaluation name is the finer slice or metric label inside that benchmark. The corpus contains 40,495 result rows across 59 datasets, 178 evaluation names, 794 developers, and 5,299 models. The coverage plot (`dataset_statistics_plots/coverage_counts.pdf`) gives the first scale check: the datastore is broad in model count, but its row-level mass is still concentrated in a smaller number of repeated evaluation families. - -Normalization quality is strong for this snapshot. Of 40,495 result rows, 40,395 rows can be converted onto the shared zero-to-one scale, or 99.8% of the dataset. The only observed normalization exclusion is 100 out-of-range rows; missing scores, missing bounds, zero-width bounds, and incompatible score types are all zero. This means the normalized score summaries are a reasonable map of cross-benchmark score distributions. It does not make all metrics semantically identical, but it does put the numeric ranges on a common axis so that difficulty, saturation, and spread are easier to compare. The normalization quality plot (`dataset_statistics_plots/normalization_quality.pdf`) is therefore a guardrail figure: it says whether the rest of the normalized-score visuals are based on most of the corpus or on a narrow filtered subset. - -Coverage is uneven by design. The most-covered normalized summaries are GPQA, IFEval, BBH, MATH Level 5, MMLU-PRO, MUSR. These heavily represented evaluations dominate aggregate descriptive patterns, so the top-coverage chart (`dataset_statistics_plots/top_evaluation_coverage.pdf`) should be read alongside any mean-score chart. A benchmark with thousands of rows provides a much steadier estimate than a niche evaluation with dozens or hundreds of rows, even if both appear as one row in the summary table. 
High row coverage can mean a benchmark has broad model participation, multiple reported submetrics, repeated submissions, or some combination of the three. The plot is intentionally row-count oriented, because the descriptive JSON is primarily row-oriented; it should not be read as a direct measure of benchmark popularity without checking model coverage separately. - -The new model-per-dataset histogram (`dataset_statistics_plots/models_per_dataset_histogram.pdf`) adds that missing model-coverage view. Across datasets, the median number of unique models is 37, and the largest dataset-level model count is 4,557. The highest-coverage datasets by unique model count are GPQA (4,557), IFEval (4,557), BBH (4,496), MATH Level 5 (4,496), MMLU-PRO (4,496), MUSR (4,496). This distribution is important because a dataset with many models tells us more about the breadth of the ecosystem than a dataset with many rows from a smaller model set. A heavy right tail in this histogram means a few datasets act as common comparison hubs, while many others remain specialized or sparsely covered. That is not necessarily bad; specialized datasets are often where the datastore gets its texture. But it does mean corpus-wide summaries should avoid treating every benchmark as equally well sampled. - -The inference-engine spread plot (`dataset_statistics_plots/inference_engine_spread.pdf`) describes how result rows are distributed across recorded running engines or inference platforms, depending on which runtime metadata is present in the datastore export. The leading runtime labels are unknown (39,618), ollama (450), openai (150), google (54), anthropic (47), gemini (33). In this snapshot, 877 rows have a named runtime field and 39,618 rows fall under `unknown`. The `unknown` bucket is expected whenever source records report model identity but not the serving/runtime layer. Runtime spread should therefore be read as an observability diagnostic, not just as a usage ranking. 
A large `unknown` bucket says that many results are still useful for model and benchmark analysis, but they cannot support claims about vLLM, Ollama, hosted APIs, or other runtime-specific execution paths. Where runtime names are present, the chart gives a quick view of which execution backends are represented strongly enough for follow-up slicing. - -Mean normalized scores vary sharply across tasks. The lowest means include BFCL leaderboard CSV: bfcl.memory.kv_accuracy, MATH Level 5, WMT 2014, BFCL leaderboard CSV: bfcl.memory.vector_accuracy, while the highest means include BFCL leaderboard CSV: bfcl.overall.latency_mean_s, BFCL leaderboard CSV: bfcl.overall.latency_p95_s, helm_mmlu: Marketing, RewardBench: Chat. These values should not be interpreted as a leaderboard: they summarize all available submitted model results within each benchmark/evaluation pair, not matched model cohorts. They are best used to spot which evaluations are generally difficult, saturated, or mixed across the collected model population. A low mean can indicate a hard benchmark, a benchmark with many older or weaker systems, or a metric whose upper range is rarely reached. A high mean can indicate an easier task, a saturated benchmark, a curated set of strong submissions, or a metric where the lower-performing tail is missing. The summary plots do not decide among those explanations, but they point to where a closer paired analysis would be valuable. - -The variability plots add the most diagnostic texture. High-standard-deviation evaluations such as ARC Prize evaluations leaderboard JSON: v2_Public_Eval, ARC Prize evaluations leaderboard JSON: v2_Semi_Private, ARC Prize evaluations leaderboard JSON: v1_Semi_Private, ARC Prize evaluations leaderboard JSON: v1_Public_Eval indicate tasks where model results span a wide range, often because the benchmark separates weak and strong systems clearly or because the source data combines distinct regimes. 
The range plot (`dataset_statistics_plots/score_range_by_eval.pdf`) highlights the same issue from min-to-max spread, while the mean-versus-standard-deviation scatter (`dataset_statistics_plots/normalized_score_variability.pdf`) separates broad, high-confidence coverage from sparse or volatile summaries. Evaluations with both substantial coverage and high spread are especially useful for model comparison because they appear to discriminate among systems rather than clustering everyone near the same score. Evaluations with low spread can still matter, but they may be better suited for pass/fail checks, regression testing, or detecting severe failures than for fine-grained ranking. - -The PDF figures are meant to be inspected together rather than as standalone claims. The count and quality charts answer whether the data is large and clean enough to trust. The top-coverage and model-per-dataset charts separate result-row volume from unique-model breadth. The engine chart shows whether runtime metadata is available and how concentrated it is. The mean, variability, and range charts then answer where the benchmark landscape is concentrated, sparse, easy, hard, or discriminative. Keeping those questions separate avoids a common mistake: treating a high row count as evidence of broad participation, or treating a normalized mean as a direct model-quality claim. - -## Plot Notes - -`coverage_counts.pdf` is the high-level inventory plot. It compares the major corpus counts on a log-scaled axis: result rows, normalized rows, unique models, unique developers, unique datasets, and unique evaluation names. Its purpose is to make the shape of the datastore visible at a glance. The figure shows that the snapshot is much larger in row count than in benchmark or evaluation count, and that model coverage is broad relative to the number of datasets. Because the axis is logarithmic, the smaller categories remain readable instead of disappearing beside the result-row total. 
- -`normalization_quality.pdf` explains how many rows can safely enter normalized-score analyses. It separates normalized rows from the different exclusion categories, including out-of-range scores, missing scores, missing bounds, zero-width bounds, and incompatible score types. This plot is a data-quality checkpoint rather than a performance result. When the normalized bar dominates, as it does here, the downstream normalized mean, range, and variability figures are based on almost the entire datastore. Any nonzero exclusion bar points to a specific normalization failure mode that may deserve follow-up. - -`top_evaluation_coverage.pdf` ranks the most-covered benchmark/evaluation pairs by normalized result-row count. It answers a different question from model breadth: which evaluation slices contribute the most rows to the descriptive statistics. The chart is useful for spotting which benchmarks can dominate aggregate impressions and which ones have enough observations to support more stable summaries. A long bar does not necessarily mean the benchmark covers many unique models, because repeated rows, submetrics, or source-specific reporting patterns can also increase row count. - -`models_per_dataset_histogram.pdf` shows the distribution of unique model counts across datasets, where dataset corresponds to the `benchmark` field in the JSON. Instead of listing individual benchmarks, it bins datasets by how many distinct model identifiers appear in them. This makes the coverage imbalance visible: a small number of datasets cover thousands of models, while many datasets cover far fewer. The median reference line helps distinguish the ordinary dataset from the high-coverage comparison hubs that dominate broad ecosystem-level coverage. - -`inference_engine_spread.pdf` is a horizontal bar chart of result rows by recorded inference engine or platform. The x-axis uses a log scale so smaller nonzero runtime categories remain visible next to the very large `unknown` bucket. 
The y-axis is ordered with the largest categories at the top, making the plot readable as a ranked metadata-coverage view. This chart should be interpreted primarily as runtime observability: it shows where execution-platform metadata exists and where it is absent, not a definitive ranking of which engines were actually used most often across the ecosystem. - -`normalized_score_mean_by_eval.pdf` ranks the evaluation slices with the lowest mean normalized score. It is a quick way to find benchmarks or metrics where the collected model population tends to score poorly on the zero-to-one scale. This should not be read as a model leaderboard, because the model set is not matched across evaluations. Instead, it is a difficulty and saturation diagnostic: low means may indicate hard tasks, older model coverage, sparse high-performing submissions, or metrics where good performance is rare in the collected data. - -`normalized_score_variability.pdf` plots each evaluation’s mean normalized score against its standard deviation, with point size reflecting normalized row coverage. This figure is designed to identify discriminative evaluations. Points with high coverage and high variability are especially interesting because they have enough rows to be credible and enough spread to separate stronger and weaker systems. Low-variability points may still be useful, but they are less likely to support fine-grained comparisons unless the goal is detecting failures, regressions, or saturation. - -`score_range_by_eval.pdf` ranks evaluation slices by their min-to-max normalized score range. It complements the standard-deviation plot by showing the full observed spread rather than the typical spread around the mean. A wide range can indicate that an evaluation distinguishes sharply between weak and strong models, but it can also reflect outliers, mixed subpopulations, or uneven source coverage. 
This plot is most useful as a triage tool for finding evaluations where a closer row-level or paired-model analysis may reveal meaningful structure. - -Overall, the datastore is large, mostly normalization-ready, and informative for benchmark-level descriptive analysis. The main caveat is comparability: normalized scores put different metrics on a common scale, but they do not control for which models appear in each benchmark. Use these figures as a map of datastore coverage, runtime observability, and score distribution, then rely on paired or coverage-aware analyses for direct model comparisons. The descriptive plots are best thought of as a scouting layer: they reveal where the datastore is rich, where metadata is thin, and where more careful model-by-model analysis is likely to pay off. From 0426fd2d292d9915d2434aaa89a5c1c87e96202f Mon Sep 17 00:00:00 2001 From: Yanan Long Date: Thu, 30 Apr 2026 10:16:09 -0300 Subject: [PATCH 12/15] new 2-panel plots --- every_eval_ever/helpers/dataset_statistics.py | 269 +++++++++++++++++- scripts/plot_dataset_statistics.py | 146 +++++++++- tests/test_dataset_statistics.py | 102 ++++++- 3 files changed, 505 insertions(+), 12 deletions(-) diff --git a/every_eval_ever/helpers/dataset_statistics.py b/every_eval_ever/helpers/dataset_statistics.py index 69ac29aaf..5667d1df1 100644 --- a/every_eval_ever/helpers/dataset_statistics.py +++ b/every_eval_ever/helpers/dataset_statistics.py @@ -32,6 +32,24 @@ 'metric_kind', 'metric_unit', ) +METADATA_FIELD_CANDIDATES = ( + {'key': 'generation_config_present', 'label': 'generation config'}, + {'key': 'generation_temperature', 'label': 'temperature'}, + {'key': 'generation_max_tokens', 'label': 'max tokens'}, + {'key': 'generation_agentic_config_present', 'label': 'agentic config'}, + {'key': 'inference_engine', 'label': 'runtime/platform'}, + {'key': 'source_locator', 'label': 'source URL / HF repo'}, + {'key': 'source_organization_url', 'label': 'source org URL'}, + {'key': 
'evaluator_relationship', 'label': 'evaluator relationship'}, + {'key': 'detailed_results_file', 'label': 'detailed results'}, + {'key': 'has_uncertainty', 'label': 'uncertainty'}, + {'key': 'uncertainty_num_samples', 'label': 'sample count'}, + {'key': 'metric_id', 'label': 'metric ID'}, + {'key': 'metric_kind', 'label': 'metric kind'}, + {'key': 'metric_unit', 'label': 'metric unit'}, + {'key': 'model_parameters', 'label': 'model parameters'}, + {'key': 'model_license', 'label': 'model license'}, +) def read_data(datastore: str) -> list[str]: @@ -76,7 +94,28 @@ def extract_result_rows(con: Any, schema_table: str) -> list[dict[str, Any]]: er.metric_config.metric_name AS metric_name, er.metric_config.metric_kind AS metric_kind, er.metric_config.metric_unit AS metric_unit, - source_metadata.source_organization_name AS source_organization + source_metadata.source_organization_name AS source_organization, + er.generation_config IS NOT NULL AS generation_config_present, + TRY_CAST( + er.generation_config.generation_args.temperature AS DOUBLE + ) AS generation_temperature, + TRY_CAST( + er.generation_config.generation_args.max_tokens AS BIGINT + ) AS generation_max_tokens, + er.generation_config.generation_args.agentic_eval_config IS NOT NULL + AS generation_agentic_config_present, + er.source_data.source_type AS source_data_type, + er.source_data.hf_repo AS source_hf_repo, + er.source_data.url AS source_urls, + source_metadata.source_organization_url + AS source_organization_url, + source_metadata.evaluator_relationship AS evaluator_relationship, + detailed_evaluation_results.file_path AS detailed_results_file, + TRY_CAST( + er.score_details.uncertainty.num_samples AS BIGINT + ) AS uncertainty_num_samples, + to_json(model_info.additional_details) + AS model_additional_details_json FROM {schema_table}, LATERAL UNNEST(evaluation_results) AS t(er) """ @@ -100,8 +139,50 @@ def extract_result_rows(con: Any, schema_table: str) -> list[dict[str, Any]]: 'metric_kind', 
'metric_unit', 'source_organization', + 'generation_config_present', + 'generation_temperature', + 'generation_max_tokens', + 'generation_agentic_config_present', + 'source_data_type', + 'source_hf_repo', + 'source_urls', + 'source_organization_url', + 'evaluator_relationship', + 'detailed_results_file', + 'uncertainty_num_samples', + 'model_additional_details_json', ] - return [dict(zip(columns, row)) for row in rows] + extracted = [] + for row in rows: + item = dict(zip(columns, row)) + source_urls = item.get('source_urls') + item['source_locator'] = item.get('source_hf_repo') or source_urls + model_details = parse_json_mapping( + item.pop('model_additional_details_json', None) + ) + item['model_parameters'] = ( + model_details.get('params_billions') + or model_details.get('parameters') + or model_details.get('parameter_count') + ) + item['model_license'] = model_details.get('license') + extracted.append(item) + return extracted + + +def parse_json_mapping(value: Any) -> dict[str, Any]: + if not value: + return {} + if isinstance(value, dict): + return value + if isinstance(value, str): + try: + parsed = json.loads(value) + except json.JSONDecodeError: + return {} + if isinstance(parsed, dict): + return parsed + return {} def normalize_score( @@ -292,6 +373,189 @@ def models_per_benchmark(rows: list[dict[str, Any]]) -> list[dict[str, Any]]: return summaries +def has_value(value: Any) -> bool: + if value is None: + return False + if isinstance(value, bool): + return value + if isinstance(value, str): + return bool(value.strip()) + if isinstance(value, list | tuple | set | dict): + return bool(value) + return True + + +def benchmark_name(row: dict[str, Any]) -> str: + value = row.get('benchmark') + if value is None: + return 'unknown' + text = str(value).strip() + return text or 'unknown' + + +def format_benchmark_label(benchmark: str, result_rows: int) -> str: + return f'{benchmark} (n={result_rows:,})' + + +def field_present_rate(rows: list[dict[str, Any]], 
field: str) -> float: + if not rows: + return 0.0 + return sum(has_value(row.get(field)) for row in rows) / len(rows) + + +def metadata_completeness( + rows: list[dict[str, Any]], + top_benchmarks: int = 12, + top_fields: int = 12, +) -> dict[str, Any]: + candidate_fields = [ + field + for field in METADATA_FIELD_CANDIDATES + if any(field['key'] in row for row in rows) + ] + if not rows or not candidate_fields: + return { + 'fields': [], + 'benchmarks': [], + 'matrix': [], + 'top_benchmark_count': top_benchmarks, + 'other_result_rows': 0, + } + + rows_by_benchmark: dict[str, list[dict[str, Any]]] = defaultdict(list) + for row in rows: + rows_by_benchmark[benchmark_name(row)].append(row) + + field_summaries = [] + for field in candidate_fields: + key = str(field['key']) + benchmark_rates = [ + field_present_rate(items, key) + for items in rows_by_benchmark.values() + ] + present_rate = field_present_rate(rows, key) + missing_rate = 1.0 - present_rate + benchmark_stddev = ( + statistics.pstdev(benchmark_rates) + if len(benchmark_rates) > 1 + else 0.0 + ) + selection_score = missing_rate * max(benchmark_stddev, 0.05) + field_summaries.append( + { + 'key': key, + 'label': str(field['label']), + 'missing_rate': missing_rate, + 'benchmark_stddev': benchmark_stddev, + 'selection_score': selection_score, + } + ) + field_summaries.sort( + key=lambda item: ( + -float(item['selection_score']), + -float(item['missing_rate']), + str(item['label']), + ) + ) + selected_fields = field_summaries[:top_fields] + + top_benchmark_names = [ + benchmark + for benchmark, _ in sorted( + ( + (benchmark, len(items)) + for benchmark, items in rows_by_benchmark.items() + ), + key=lambda item: (-item[1], item[0]), + )[:top_benchmarks] + ] + selected_field_keys = [field['key'] for field in selected_fields] + + benchmark_summaries = [] + benchmark_groups: dict[str, list[dict[str, Any]]] = {} + for benchmark in top_benchmark_names: + items = rows_by_benchmark[benchmark] + 
benchmark_groups[benchmark] = items + benchmark_summaries.append( + { + 'benchmark': benchmark, + 'label': format_benchmark_label(benchmark, len(items)), + 'result_rows': len(items), + 'overall_completeness': average_completeness( + items, selected_field_keys + ), + } + ) + + other_rows = [ + row + for benchmark, items in rows_by_benchmark.items() + if benchmark not in top_benchmark_names + for row in items + ] + if other_rows: + benchmark_groups['Other'] = other_rows + benchmark_summaries.append( + { + 'benchmark': 'Other', + 'label': format_benchmark_label('Other', len(other_rows)), + 'result_rows': len(other_rows), + 'overall_completeness': average_completeness( + other_rows, selected_field_keys + ), + } + ) + + benchmark_summaries.sort( + key=lambda item: ( + item['benchmark'] == 'Other', + float(item['overall_completeness']), + str(item['benchmark']), + ) + ) + + matrix = [] + selected_fields_by_key = { + str(field['key']): field for field in selected_fields + } + for benchmark_summary in benchmark_summaries: + benchmark = str(benchmark_summary['benchmark']) + items = benchmark_groups[benchmark] + for field_key in selected_field_keys: + present_rate = field_present_rate(items, str(field_key)) + field = selected_fields_by_key[str(field_key)] + matrix.append( + { + 'benchmark': benchmark, + 'benchmark_label': benchmark_summary['label'], + 'field': str(field_key), + 'field_label': field['label'], + 'present_rate': present_rate, + 'missing_rate': 1.0 - present_rate, + 'result_rows': len(items), + } + ) + + return { + 'fields': selected_fields, + 'benchmarks': benchmark_summaries, + 'matrix': matrix, + 'top_benchmark_count': top_benchmarks, + 'other_result_rows': len(other_rows), + } + + +def average_completeness( + rows: list[dict[str, Any]], fields: list[str] +) -> float: + if not rows or not fields: + return 0.0 + present = sum( + has_value(row.get(field)) for row in rows for field in fields + ) + return present / (len(rows) * len(fields)) + + def 
grouped_summaries( rows: list[dict[str, Any]], value_key: str, @@ -499,6 +763,7 @@ def descriptive_statistics( rows, 'inference_engine' ), 'models_per_benchmark': models_per_benchmark(rows), + 'metadata_completeness': metadata_completeness(rows), 'quality': quality_counts(rows), 'normalization_exclusions': exclusions, 'score_summaries': grouped_summaries( diff --git a/scripts/plot_dataset_statistics.py b/scripts/plot_dataset_statistics.py index aa36f55fe..1740a47a0 100644 --- a/scripts/plot_dataset_statistics.py +++ b/scripts/plot_dataset_statistics.py @@ -19,6 +19,7 @@ 'range': 'score_range_by_eval.pdf', 'models_per_dataset': 'models_per_dataset_histogram.pdf', 'engine_spread': 'inference_engine_spread.pdf', + 'writeup_overview': 'writeup_dataset_statistics_overview.pdf', } @@ -90,6 +91,10 @@ def short_label(value: str, width: int = 46) -> str: return textwrap.shorten(value, width=width, placeholder='...') +def wrapped_label(value: str, width: int = 16) -> str: + return '\n'.join(textwrap.wrap(value, width=width, break_long_words=False)) + + def columns( rows: list[dict[str, Any]], keys: tuple[str, ...] ) -> dict[str, list[Any]]: @@ -253,6 +258,22 @@ def plot_normalized_score_means( def plot_score_variability( rows: list[dict[str, Any]], output_dir: Path, plt: Any, sns: Any | None ) -> Path: + fig, ax = plt.subplots(figsize=(10, 7)) + draw_score_landscape(ax, rows, sns) + ax.set_title('Normalized Score Level vs. 
Variability') + + path = output_dir / PLOT_FILES['variability'] + save(fig, path) + plt.close(fig) + return path + + +def draw_score_landscape( + ax: Any, + rows: list[dict[str, Any]], + sns: Any | None, + annotation_limit: int = 8, +) -> None: plot_rows = [ { 'mean': row['mean'], @@ -261,13 +282,25 @@ def plot_score_variability( 'label': label(row), } for row in rows + if row.get('mean') is not None ] + if not plot_rows: + ax.text( + 0.5, + 0.5, + 'No normalized score summaries available', + ha='center', + va='center', + transform=ax.transAxes, + ) + ax.set_axis_off() + return + max_count = max(row['count'] for row in plot_rows) sizes = [ 45 + 455 * math.sqrt(row['count'] / max_count) for row in plot_rows ] - fig, ax = plt.subplots(figsize=(10, 7)) if sns is not None: sns.scatterplot( data=columns(plot_rows, ('mean', 'stddev', 'count')), @@ -289,13 +322,16 @@ def plot_score_variability( ax.set_xlim(0, 1) ax.set_xlabel('Mean normalized score') ax.set_ylabel('Standard deviation') - ax.set_title('Normalized Score Level vs. 
Variability') notable = sorted( plot_rows, - key=lambda row: (row['stddev'], abs(row['mean'] - 0.5)), + key=lambda row: ( + row['stddev'] * math.log1p(row['count']), + row['count'], + abs(row['mean'] - 0.5), + ), reverse=True, - )[:8] + )[:annotation_limit] for row in notable: ax.annotate( short_label(row['label'], 24), @@ -305,11 +341,6 @@ def plot_score_variability( fontsize=8, ) - path = output_dir / PLOT_FILES['variability'] - save(fig, path) - plt.close(fig) - return path - def plot_score_ranges( rows: list[dict[str, Any]], @@ -435,6 +466,100 @@ def plot_inference_engine_spread( return path +def plot_writeup_overview( + stats: dict[str, Any], output_dir: Path, plt: Any, sns: Any | None +) -> Path: + fig, (ax_missing, ax_score) = plt.subplots( + 1, + 2, + figsize=(14, 6.6), + gridspec_kw={'width_ratios': [1.35, 1.0], 'wspace': 0.34}, + ) + draw_metadata_completeness(ax_missing, stats, plt, sns) + draw_score_landscape( + ax_score, + stats['descriptive'].get('normalized_score_summaries', []), + sns, + annotation_limit=7, + ) + ax_score.set_title('B. 
Score landscape by metric') + ax_score.title.set_fontsize(15) + + path = output_dir / PLOT_FILES['writeup_overview'] + save(fig, path) + plt.close(fig) + return path + + +def draw_metadata_completeness( + ax: Any, stats: dict[str, Any], plt: Any, sns: Any | None +) -> None: + completeness = stats['descriptive'].get('metadata_completeness', {}) + fields = completeness.get('fields', []) + benchmarks = completeness.get('benchmarks', []) + matrix_rows = completeness.get('matrix', []) + if not fields or not benchmarks or not matrix_rows: + ax.text( + 0.5, + 0.5, + 'No metadata completeness summary available', + ha='center', + va='center', + transform=ax.transAxes, + ) + ax.set_axis_off() + return + + field_order = [field['key'] for field in fields] + field_labels = [wrapped_label(str(field['label']), 13) for field in fields] + benchmark_order = [benchmark['benchmark'] for benchmark in benchmarks] + benchmark_labels = [ + short_label(str(benchmark['label']), 38) for benchmark in benchmarks + ] + value_by_cell = { + (row['benchmark'], row['field']): 100.0 * row['present_rate'] + for row in matrix_rows + } + values = [ + [ + value_by_cell.get((benchmark, field), 0.0) + for field in field_order + ] + for benchmark in benchmark_order + ] + + if sns is not None: + sns.heatmap( + values, + ax=ax, + vmin=0, + vmax=100, + cmap='RdYlGn', + xticklabels=field_labels, + yticklabels=benchmark_labels, + linewidths=0.35, + linecolor='white', + cbar_kws={'label': '% present', 'fraction': 0.05, 'pad': 0.05}, + ) + else: + image = ax.imshow(values, vmin=0, vmax=100, cmap='RdYlGn') + colorbar = plt.colorbar(image, ax=ax, fraction=0.046, pad=0.04) + colorbar.set_label('% present') + ax.set_xticks(range(len(field_labels))) + ax.set_xticklabels(field_labels) + ax.set_yticks(range(len(benchmark_labels))) + ax.set_yticklabels(benchmark_labels) + + ax.set_title('A. 
Reporting completeness is uneven') + ax.title.set_fontsize(15) + ax.set_xlabel('') + ax.set_ylabel('') + ax.tick_params(axis='x', labelrotation=0, labelsize=9, pad=2) + ax.tick_params(axis='y', labelsize=10) + for tick in ax.get_xticklabels(): + tick.set_ha('center') + + def main() -> None: args = parse_args() if args.top_n < 1: @@ -468,6 +593,9 @@ def main() -> None: 'engine_spread': plot_inference_engine_spread( stats, output_dir, plt, sns, args.top_n ), + 'writeup_overview': plot_writeup_overview( + stats, output_dir, plt, sns + ), } print(f'Wrote {len(plot_paths)} PDF plots to {output_dir}') diff --git a/tests/test_dataset_statistics.py b/tests/test_dataset_statistics.py index 708572cf6..de38e63d3 100644 --- a/tests/test_dataset_statistics.py +++ b/tests/test_dataset_statistics.py @@ -18,8 +18,9 @@ def row( metric_name: str | None = 'Score', metric_kind: str | None = 'accuracy', metric_unit: str | None = 'proportion', + **metadata, ) -> dict: - return { + result = { 'schema_version': '0.2.2', 'evaluation_id': f'{model_id}/{benchmark}/{evaluation_name}', 'model_id': model_id, @@ -38,6 +39,8 @@ def row( 'metric_kind': metric_kind, 'metric_unit': metric_unit, } + result.update(metadata) + return result def test_normalization_respects_lower_is_better(): @@ -164,6 +167,7 @@ def test_json_report_shape(): assert report['descriptive']['counts']['result_rows'] == 2 assert 'inference_engines' in report['descriptive'] assert 'models_per_benchmark' in report['descriptive'] + assert 'metadata_completeness' in report['descriptive'] assert 'metric_id' in report['descriptive']['score_summaries'][0] assert 'coverage_aware_model_summaries' in report['observational'] assert 'pairwise_model_comparisons' in report['observational'] @@ -249,6 +253,102 @@ def test_inference_engine_counts_group_missing_as_unknown(): ] +def test_metadata_completeness_counts_present_and_missing_values(): + rows = [ + row( + 'model/a', + 'bench-a', + 'eval', + 0.9, + inference_engine='vllm', + 
generation_temperature=0.2, + source_locator='https://example.test/data', + has_uncertainty=True, + ), + row( + 'model/b', + 'bench-a', + 'eval', + 0.8, + inference_engine=' ', + generation_temperature=None, + source_locator=[], + has_uncertainty=False, + ), + ] + + summary = stats.metadata_completeness( + rows, top_benchmarks=1, top_fields=16 + ) + matrix = { + (item['benchmark'], item['field']): item + for item in summary['matrix'] + } + + assert matrix[('bench-a', 'inference_engine')]['present_rate'] == 0.5 + assert matrix[('bench-a', 'generation_temperature')][ + 'present_rate' + ] == 0.5 + assert matrix[('bench-a', 'source_locator')]['present_rate'] == 0.5 + assert matrix[('bench-a', 'has_uncertainty')]['present_rate'] == 0.5 + + +def test_metadata_completeness_aggregates_other_benchmarks(): + rows = [ + row('model/a', 'bench-a', 'eval', 0.9, generation_temperature=0.1), + row('model/b', 'bench-a', 'eval', 0.8, generation_temperature=0.2), + row('model/c', 'bench-b', 'eval', 0.7, generation_temperature=None), + row('model/d', 'bench-b', 'eval', 0.6, generation_temperature=None), + row('model/e', 'bench-c', 'eval', 0.5, generation_temperature=0.3), + ] + + summary = stats.metadata_completeness( + rows, top_benchmarks=2, top_fields=16 + ) + + assert summary['other_result_rows'] == 1 + assert summary['benchmarks'][-1]['benchmark'] == 'Other' + assert summary['benchmarks'][-1]['label'] == 'Other (n=1)' + assert summary['benchmarks'][-1]['result_rows'] == 1 + assert any( + item['benchmark'] == 'Other' + and item['field'] == 'generation_temperature' + and item['present_rate'] == 1.0 + for item in summary['matrix'] + ) + + +def test_metadata_field_selection_favors_missing_and_uneven_fields(): + rows = [ + row( + f'model/a-{index}', + 'bench-complete', + 'eval', + 0.9, + generation_temperature=0.2, + source_locator=None, + ) + for index in range(5) + ] + [ + row( + f'model/b-{index}', + 'bench-missing', + 'eval', + 0.8, + generation_temperature=None, + 
source_locator=None, + ) + for index in range(5) + ] + + summary = stats.metadata_completeness( + rows, top_benchmarks=2, top_fields=1 + ) + + assert summary['fields'][0]['key'] == 'generation_temperature' + assert summary['fields'][0]['selection_score'] > 0.05 + + def test_cli_help_uses_summary_limit_not_top_n(capsys): try: stats.parse_args(['--help']) From 06d918873d7306a87dce030f06d31ced895f756c Mon Sep 17 00:00:00 2001 From: Yanan Long Date: Thu, 30 Apr 2026 13:41:18 -0300 Subject: [PATCH 13/15] include computed stats --- audit/dataset_statistics.json | 7665 +++++++++++++++++++++++++++++++++ 1 file changed, 7665 insertions(+) create mode 100644 audit/dataset_statistics.json diff --git a/audit/dataset_statistics.json b/audit/dataset_statistics.json new file mode 100644 index 000000000..57bb9d88f --- /dev/null +++ b/audit/dataset_statistics.json @@ -0,0 +1,7665 @@ +{ + "descriptive": { + "counts": { + "result_rows": 40495, + "unique_benchmarks": 59, + "unique_developers": 794, + "unique_evaluations": 178, + "unique_models": 5299 + }, + "inference_engines": [ + { + "count": 39618, + "value": "unknown" + }, + { + "count": 450, + "value": "ollama" + }, + { + "count": 150, + "value": "openai" + }, + { + "count": 54, + "value": "google" + }, + { + "count": 47, + "value": "anthropic" + }, + { + "count": 33, + "value": "gemini" + }, + { + "count": 30, + "value": "openrouter" + }, + { + "count": 26, + "value": "deepseek" + }, + { + "count": 18, + "value": "minimax" + }, + { + "count": 15, + "value": "moonshot" + }, + { + "count": 15, + "value": "ark" + }, + { + "count": 12, + "value": "zhipu" + }, + { + "count": 12, + "value": "qwen" + }, + { + "count": 12, + "value": "aliyun" + }, + { + "count": 3, + "value": "kuaishou" + } + ], + "metadata_completeness": { + "benchmarks": [ + { + "benchmark": "BBH", + "label": "BBH (n=4,574)", + "overall_completeness": 0.07692307692307693, + "result_rows": 4574 + }, + { + "benchmark": "MATH Level 5", + "label": "MATH Level 5 
(n=4,574)", + "overall_completeness": 0.07692307692307693, + "result_rows": 4574 + }, + { + "benchmark": "MMLU-PRO", + "label": "MMLU-PRO (n=4,574)", + "overall_completeness": 0.07692307692307693, + "result_rows": 4574 + }, + { + "benchmark": "MUSR", + "label": "MUSR (n=4,574)", + "overall_completeness": 0.07692307692307693, + "result_rows": 4574 + }, + { + "benchmark": "RewardBench", + "label": "RewardBench (n=1,025)", + "overall_completeness": 0.07692307692307693, + "result_rows": 1025 + }, + { + "benchmark": "RewardBench 2", + "label": "RewardBench 2 (n=1,379)", + "overall_completeness": 0.07692307692307693, + "result_rows": 1379 + }, + { + "benchmark": "GPQA", + "label": "GPQA (n=4,635)", + "overall_completeness": 0.07793544104223715, + "result_rows": 4635 + }, + { + "benchmark": "IFEval", + "label": "IFEval (n=4,635)", + "overall_completeness": 0.07793544104223715, + "result_rows": 4635 + }, + { + "benchmark": "helm_mmlu", + "label": "helm_mmlu (n=2,844)", + "overall_completeness": 0.15384615384615385, + "result_rows": 2844 + }, + { + "benchmark": "global-mmlu-lite", + "label": "global-mmlu-lite (n=912)", + "overall_completeness": 0.21862348178137653, + "result_rows": 912 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "overall_completeness": 0.3076923076923077, + "result_rows": 1020 + }, + { + "benchmark": "BFCL leaderboard CSV", + "label": "BFCL leaderboard CSV (n=3,350)", + "overall_completeness": 0.38461538461538464, + "result_rows": 3350 + }, + { + "benchmark": "Other", + "label": "Other (n=2,399)", + "overall_completeness": 0.3087825055311508, + "result_rows": 2399 + } + ], + "fields": [ + { + "benchmark_stddev": 0.43200860863664675, + "key": "uncertainty_num_samples", + "label": "sample count", + "missing_rate": 0.9801456969996296, + "selection_score": 0.4234313788220063 + }, + { + "benchmark_stddev": 0.4794204729297534, + "key": "generation_config_present", + "label": 
"generation config", + "missing_rate": 0.8809235708112113, + "selection_score": 0.4223327949332781 + }, + { + "benchmark_stddev": 0.43144687321918196, + "key": "has_uncertainty", + "label": "uncertainty", + "missing_rate": 0.9604148660328435, + "selection_score": 0.41436799094308985 + }, + { + "benchmark_stddev": 0.40252007074704804, + "key": "detailed_results_file", + "label": "detailed results", + "missing_rate": 0.9803432522533646, + "selection_score": 0.3946078352534155 + }, + { + "benchmark_stddev": 0.49820130361691756, + "key": "source_organization_url", + "label": "source org URL", + "missing_rate": 0.7824175824175824, + "selection_score": 0.3898014595332366 + }, + { + "benchmark_stddev": 0.3595458209423123, + "key": "metric_id", + "label": "metric ID", + "missing_rate": 0.8815409309791332, + "selection_score": 0.31695435772314273 + }, + { + "benchmark_stddev": 0.3595458209423123, + "key": "metric_unit", + "label": "metric unit", + "missing_rate": 0.8815409309791332, + "selection_score": 0.31695435772314273 + }, + { + "benchmark_stddev": 0.30224668644283065, + "key": "generation_agentic_config_present", + "label": "agentic config", + "missing_rate": 0.9977775033954809, + "selection_score": 0.3015749442084843 + }, + { + "benchmark_stddev": 0.30224668644283065, + "key": "generation_max_tokens", + "label": "max tokens", + "missing_rate": 0.9880726015557476, + "selection_score": 0.298641669785172 + }, + { + "benchmark_stddev": 0.30224668644283065, + "key": "generation_temperature", + "label": "temperature", + "missing_rate": 0.9880726015557476, + "selection_score": 0.298641669785172 + }, + { + "benchmark_stddev": 0.21968612536975798, + "key": "metric_kind", + "label": "metric kind", + "missing_rate": 0.8892702802815162, + "selection_score": 0.195360342281525 + }, + { + "benchmark_stddev": 0.12908090009938827, + "key": "model_license", + "label": "model license", + "missing_rate": 0.9172737374984566, + "selection_score": 0.11840251967383078 + }, + { + 
"benchmark_stddev": 0.40252007074704804, + "key": "inference_engine", + "label": "inference engine/platform", + "missing_rate": 0.1724410421039635, + "selection_score": 0.06941098046738207 + } + ], + "matrix": [ + { + "benchmark": "BBH", + "benchmark_label": "BBH (n=4,574)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "BBH", + "benchmark_label": "BBH (n=4,574)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "BBH", + "benchmark_label": "BBH (n=4,574)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "BBH", + "benchmark_label": "BBH (n=4,574)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "BBH", + "benchmark_label": "BBH (n=4,574)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "BBH", + "benchmark_label": "BBH (n=4,574)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "BBH", + "benchmark_label": "BBH (n=4,574)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "BBH", + "benchmark_label": "BBH (n=4,574)", + "field": "generation_agentic_config_present", + "field_label": "agentic config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "BBH", + "benchmark_label": "BBH (n=4,574)", + "field": "generation_max_tokens", + "field_label": "max tokens", + 
"missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "BBH", + "benchmark_label": "BBH (n=4,574)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "BBH", + "benchmark_label": "BBH (n=4,574)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "BBH", + "benchmark_label": "BBH (n=4,574)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "BBH", + "benchmark_label": "BBH (n=4,574)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 4574 + }, + { + "benchmark": "MATH Level 5", + "benchmark_label": "MATH Level 5 (n=4,574)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MATH Level 5", + "benchmark_label": "MATH Level 5 (n=4,574)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MATH Level 5", + "benchmark_label": "MATH Level 5 (n=4,574)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MATH Level 5", + "benchmark_label": "MATH Level 5 (n=4,574)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MATH Level 5", + "benchmark_label": "MATH Level 5 (n=4,574)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 1.0, + "present_rate": 0.0, + 
"result_rows": 4574 + }, + { + "benchmark": "MATH Level 5", + "benchmark_label": "MATH Level 5 (n=4,574)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MATH Level 5", + "benchmark_label": "MATH Level 5 (n=4,574)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MATH Level 5", + "benchmark_label": "MATH Level 5 (n=4,574)", + "field": "generation_agentic_config_present", + "field_label": "agentic config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MATH Level 5", + "benchmark_label": "MATH Level 5 (n=4,574)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MATH Level 5", + "benchmark_label": "MATH Level 5 (n=4,574)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MATH Level 5", + "benchmark_label": "MATH Level 5 (n=4,574)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MATH Level 5", + "benchmark_label": "MATH Level 5 (n=4,574)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MATH Level 5", + "benchmark_label": "MATH Level 5 (n=4,574)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 4574 + }, + { + "benchmark": "MMLU-PRO", + "benchmark_label": "MMLU-PRO (n=4,574)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "present_rate": 0.0, + 
"result_rows": 4574 + }, + { + "benchmark": "MMLU-PRO", + "benchmark_label": "MMLU-PRO (n=4,574)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MMLU-PRO", + "benchmark_label": "MMLU-PRO (n=4,574)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MMLU-PRO", + "benchmark_label": "MMLU-PRO (n=4,574)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MMLU-PRO", + "benchmark_label": "MMLU-PRO (n=4,574)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MMLU-PRO", + "benchmark_label": "MMLU-PRO (n=4,574)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MMLU-PRO", + "benchmark_label": "MMLU-PRO (n=4,574)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MMLU-PRO", + "benchmark_label": "MMLU-PRO (n=4,574)", + "field": "generation_agentic_config_present", + "field_label": "agentic config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MMLU-PRO", + "benchmark_label": "MMLU-PRO (n=4,574)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MMLU-PRO", + "benchmark_label": "MMLU-PRO (n=4,574)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MMLU-PRO", + 
"benchmark_label": "MMLU-PRO (n=4,574)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MMLU-PRO", + "benchmark_label": "MMLU-PRO (n=4,574)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MMLU-PRO", + "benchmark_label": "MMLU-PRO (n=4,574)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 4574 + }, + { + "benchmark": "MUSR", + "benchmark_label": "MUSR (n=4,574)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MUSR", + "benchmark_label": "MUSR (n=4,574)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MUSR", + "benchmark_label": "MUSR (n=4,574)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MUSR", + "benchmark_label": "MUSR (n=4,574)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MUSR", + "benchmark_label": "MUSR (n=4,574)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MUSR", + "benchmark_label": "MUSR (n=4,574)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MUSR", + "benchmark_label": "MUSR (n=4,574)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 1.0, + 
"present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MUSR", + "benchmark_label": "MUSR (n=4,574)", + "field": "generation_agentic_config_present", + "field_label": "agentic config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MUSR", + "benchmark_label": "MUSR (n=4,574)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MUSR", + "benchmark_label": "MUSR (n=4,574)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MUSR", + "benchmark_label": "MUSR (n=4,574)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MUSR", + "benchmark_label": "MUSR (n=4,574)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4574 + }, + { + "benchmark": "MUSR", + "benchmark_label": "MUSR (n=4,574)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 4574 + }, + { + "benchmark": "RewardBench", + "benchmark_label": "RewardBench (n=1,025)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1025 + }, + { + "benchmark": "RewardBench", + "benchmark_label": "RewardBench (n=1,025)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1025 + }, + { + "benchmark": "RewardBench", + "benchmark_label": "RewardBench (n=1,025)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1025 + }, + { + "benchmark": 
"RewardBench", + "benchmark_label": "RewardBench (n=1,025)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1025 + }, + { + "benchmark": "RewardBench", + "benchmark_label": "RewardBench (n=1,025)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 1025 + }, + { + "benchmark": "RewardBench", + "benchmark_label": "RewardBench (n=1,025)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1025 + }, + { + "benchmark": "RewardBench", + "benchmark_label": "RewardBench (n=1,025)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1025 + }, + { + "benchmark": "RewardBench", + "benchmark_label": "RewardBench (n=1,025)", + "field": "generation_agentic_config_present", + "field_label": "agentic config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1025 + }, + { + "benchmark": "RewardBench", + "benchmark_label": "RewardBench (n=1,025)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1025 + }, + { + "benchmark": "RewardBench", + "benchmark_label": "RewardBench (n=1,025)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1025 + }, + { + "benchmark": "RewardBench", + "benchmark_label": "RewardBench (n=1,025)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1025 + }, + { + "benchmark": "RewardBench", + "benchmark_label": "RewardBench (n=1,025)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1025 + }, + { + "benchmark": "RewardBench", + 
"benchmark_label": "RewardBench (n=1,025)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1025 + }, + { + "benchmark": "RewardBench 2", + "benchmark_label": "RewardBench 2 (n=1,379)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1379 + }, + { + "benchmark": "RewardBench 2", + "benchmark_label": "RewardBench 2 (n=1,379)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1379 + }, + { + "benchmark": "RewardBench 2", + "benchmark_label": "RewardBench 2 (n=1,379)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1379 + }, + { + "benchmark": "RewardBench 2", + "benchmark_label": "RewardBench 2 (n=1,379)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1379 + }, + { + "benchmark": "RewardBench 2", + "benchmark_label": "RewardBench 2 (n=1,379)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 1379 + }, + { + "benchmark": "RewardBench 2", + "benchmark_label": "RewardBench 2 (n=1,379)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1379 + }, + { + "benchmark": "RewardBench 2", + "benchmark_label": "RewardBench 2 (n=1,379)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1379 + }, + { + "benchmark": "RewardBench 2", + "benchmark_label": "RewardBench 2 (n=1,379)", + "field": "generation_agentic_config_present", + "field_label": "agentic config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1379 + 
}, + { + "benchmark": "RewardBench 2", + "benchmark_label": "RewardBench 2 (n=1,379)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1379 + }, + { + "benchmark": "RewardBench 2", + "benchmark_label": "RewardBench 2 (n=1,379)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1379 + }, + { + "benchmark": "RewardBench 2", + "benchmark_label": "RewardBench 2 (n=1,379)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1379 + }, + { + "benchmark": "RewardBench 2", + "benchmark_label": "RewardBench 2 (n=1,379)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1379 + }, + { + "benchmark": "RewardBench 2", + "benchmark_label": "RewardBench 2 (n=1,379)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1379 + }, + { + "benchmark": "GPQA", + "benchmark_label": "GPQA (n=4,635)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4635 + }, + { + "benchmark": "GPQA", + "benchmark_label": "GPQA (n=4,635)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 0.9868392664509169, + "present_rate": 0.013160733549083063, + "result_rows": 4635 + }, + { + "benchmark": "GPQA", + "benchmark_label": "GPQA (n=4,635)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4635 + }, + { + "benchmark": "GPQA", + "benchmark_label": "GPQA (n=4,635)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4635 + }, + { + 
"benchmark": "GPQA", + "benchmark_label": "GPQA (n=4,635)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4635 + }, + { + "benchmark": "GPQA", + "benchmark_label": "GPQA (n=4,635)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4635 + }, + { + "benchmark": "GPQA", + "benchmark_label": "GPQA (n=4,635)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4635 + }, + { + "benchmark": "GPQA", + "benchmark_label": "GPQA (n=4,635)", + "field": "generation_agentic_config_present", + "field_label": "agentic config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4635 + }, + { + "benchmark": "GPQA", + "benchmark_label": "GPQA (n=4,635)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4635 + }, + { + "benchmark": "GPQA", + "benchmark_label": "GPQA (n=4,635)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4635 + }, + { + "benchmark": "GPQA", + "benchmark_label": "GPQA (n=4,635)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4635 + }, + { + "benchmark": "GPQA", + "benchmark_label": "GPQA (n=4,635)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4635 + }, + { + "benchmark": "GPQA", + "benchmark_label": "GPQA (n=4,635)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 4635 + }, + { + "benchmark": "IFEval", + "benchmark_label": "IFEval (n=4,635)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + 
"missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4635 + }, + { + "benchmark": "IFEval", + "benchmark_label": "IFEval (n=4,635)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 0.9868392664509169, + "present_rate": 0.013160733549083063, + "result_rows": 4635 + }, + { + "benchmark": "IFEval", + "benchmark_label": "IFEval (n=4,635)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4635 + }, + { + "benchmark": "IFEval", + "benchmark_label": "IFEval (n=4,635)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4635 + }, + { + "benchmark": "IFEval", + "benchmark_label": "IFEval (n=4,635)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4635 + }, + { + "benchmark": "IFEval", + "benchmark_label": "IFEval (n=4,635)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4635 + }, + { + "benchmark": "IFEval", + "benchmark_label": "IFEval (n=4,635)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4635 + }, + { + "benchmark": "IFEval", + "benchmark_label": "IFEval (n=4,635)", + "field": "generation_agentic_config_present", + "field_label": "agentic config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4635 + }, + { + "benchmark": "IFEval", + "benchmark_label": "IFEval (n=4,635)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4635 + }, + { + "benchmark": "IFEval", + "benchmark_label": "IFEval (n=4,635)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 
4635 + }, + { + "benchmark": "IFEval", + "benchmark_label": "IFEval (n=4,635)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4635 + }, + { + "benchmark": "IFEval", + "benchmark_label": "IFEval (n=4,635)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4635 + }, + { + "benchmark": "IFEval", + "benchmark_label": "IFEval (n=4,635)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 4635 + }, + { + "benchmark": "helm_mmlu", + "benchmark_label": "helm_mmlu (n=2,844)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 2844 + }, + { + "benchmark": "helm_mmlu", + "benchmark_label": "helm_mmlu (n=2,844)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 2844 + }, + { + "benchmark": "helm_mmlu", + "benchmark_label": "helm_mmlu (n=2,844)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 2844 + }, + { + "benchmark": "helm_mmlu", + "benchmark_label": "helm_mmlu (n=2,844)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 2844 + }, + { + "benchmark": "helm_mmlu", + "benchmark_label": "helm_mmlu (n=2,844)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 2844 + }, + { + "benchmark": "helm_mmlu", + "benchmark_label": "helm_mmlu (n=2,844)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 2844 + }, + { + "benchmark": "helm_mmlu", + "benchmark_label": 
"helm_mmlu (n=2,844)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 2844 + }, + { + "benchmark": "helm_mmlu", + "benchmark_label": "helm_mmlu (n=2,844)", + "field": "generation_agentic_config_present", + "field_label": "agentic config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 2844 + }, + { + "benchmark": "helm_mmlu", + "benchmark_label": "helm_mmlu (n=2,844)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 2844 + }, + { + "benchmark": "helm_mmlu", + "benchmark_label": "helm_mmlu (n=2,844)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 2844 + }, + { + "benchmark": "helm_mmlu", + "benchmark_label": "helm_mmlu (n=2,844)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 2844 + }, + { + "benchmark": "helm_mmlu", + "benchmark_label": "helm_mmlu (n=2,844)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 2844 + }, + { + "benchmark": "helm_mmlu", + "benchmark_label": "helm_mmlu (n=2,844)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 2844 + }, + { + "benchmark": "global-mmlu-lite", + "benchmark_label": "global-mmlu-lite (n=912)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 912 + }, + { + "benchmark": "global-mmlu-lite", + "benchmark_label": "global-mmlu-lite (n=912)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 912 + }, + { + "benchmark": "global-mmlu-lite", + "benchmark_label": 
"global-mmlu-lite (n=912)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 0.1578947368421053, + "present_rate": 0.8421052631578947, + "result_rows": 912 + }, + { + "benchmark": "global-mmlu-lite", + "benchmark_label": "global-mmlu-lite (n=912)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 912 + }, + { + "benchmark": "global-mmlu-lite", + "benchmark_label": "global-mmlu-lite (n=912)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 912 + }, + { + "benchmark": "global-mmlu-lite", + "benchmark_label": "global-mmlu-lite (n=912)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 912 + }, + { + "benchmark": "global-mmlu-lite", + "benchmark_label": "global-mmlu-lite (n=912)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 912 + }, + { + "benchmark": "global-mmlu-lite", + "benchmark_label": "global-mmlu-lite (n=912)", + "field": "generation_agentic_config_present", + "field_label": "agentic config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 912 + }, + { + "benchmark": "global-mmlu-lite", + "benchmark_label": "global-mmlu-lite (n=912)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 912 + }, + { + "benchmark": "global-mmlu-lite", + "benchmark_label": "global-mmlu-lite (n=912)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 912 + }, + { + "benchmark": "global-mmlu-lite", + "benchmark_label": "global-mmlu-lite (n=912)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 
912 + }, + { + "benchmark": "global-mmlu-lite", + "benchmark_label": "global-mmlu-lite (n=912)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 912 + }, + { + "benchmark": "global-mmlu-lite", + "benchmark_label": "global-mmlu-lite (n=912)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 912 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1020 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1020 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1020 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1020 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 1020 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + 
"field": "metric_id", + "field_label": "metric ID", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 1020 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 1020 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "field": "generation_agentic_config_present", + "field_label": "agentic config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1020 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1020 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1020 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 1020 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1020 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 1.0, + 
"present_rate": 0.0, + "result_rows": 1020 + }, + { + "benchmark": "BFCL leaderboard CSV", + "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 3350 + }, + { + "benchmark": "BFCL leaderboard CSV", + "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 3350 + }, + { + "benchmark": "BFCL leaderboard CSV", + "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 3350 + }, + { + "benchmark": "BFCL leaderboard CSV", + "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 3350 + }, + { + "benchmark": "BFCL leaderboard CSV", + "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 3350 + }, + { + "benchmark": "BFCL leaderboard CSV", + "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 3350 + }, + { + "benchmark": "BFCL leaderboard CSV", + "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 3350 + }, + { + "benchmark": "BFCL leaderboard CSV", + "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "field": "generation_agentic_config_present", + "field_label": "agentic config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 3350 + }, + { + "benchmark": "BFCL 
leaderboard CSV", + "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 3350 + }, + { + "benchmark": "BFCL leaderboard CSV", + "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 3350 + }, + { + "benchmark": "BFCL leaderboard CSV", + "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 3350 + }, + { + "benchmark": "BFCL leaderboard CSV", + "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 3350 + }, + { + "benchmark": "BFCL leaderboard CSV", + "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 3350 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=2,399)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 0.6648603584827011, + "present_rate": 0.33513964151729886, + "result_rows": 2399 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=2,399)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 0.22634431012922052, + "present_rate": 0.7736556898707795, + "result_rows": 2399 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=2,399)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 0.6519383076281784, + "present_rate": 0.3480616923718216, + "result_rows": 2399 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=2,399)", + "field": "detailed_results_file", + 
"field_label": "detailed results", + "missing_rate": 0.6681950812838683, + "present_rate": 0.3318049187161317, + "result_rows": 2399 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=2,399)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 0.5310546060858692, + "present_rate": 0.46894539391413087, + "result_rows": 2399 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=2,399)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 0.8220091704877033, + "present_rate": 0.17799082951229678, + "result_rows": 2399 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=2,399)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 0.8220091704877033, + "present_rate": 0.17799082951229678, + "result_rows": 2399 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=2,399)", + "field": "generation_agentic_config_present", + "field_label": "agentic config", + "missing_rate": 0.9624843684868696, + "present_rate": 0.03751563151313047, + "result_rows": 2399 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=2,399)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 0.7986661108795331, + "present_rate": 0.20133388912046687, + "result_rows": 2399 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=2,399)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 0.7986661108795331, + "present_rate": 0.20133388912046687, + "result_rows": 2399 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=2,399)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 0.9524802000833681, + "present_rate": 0.04751979991663193, + "result_rows": 2399 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=2,399)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 
2399 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=2,399)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.08711963318049187, + "present_rate": 0.9128803668195081, + "result_rows": 2399 + } + ], + "other_result_rows": 2399, + "top_benchmark_count": 12 + }, + "models_per_benchmark": [ + { + "benchmark": "GPQA", + "result_rows": 4635, + "unique_models": 4557 + }, + { + "benchmark": "IFEval", + "result_rows": 4635, + "unique_models": 4557 + }, + { + "benchmark": "BBH", + "result_rows": 4574, + "unique_models": 4496 + }, + { + "benchmark": "MATH Level 5", + "result_rows": 4574, + "unique_models": 4496 + }, + { + "benchmark": "MMLU-PRO", + "result_rows": 4574, + "unique_models": 4496 + }, + { + "benchmark": "MUSR", + "result_rows": 4574, + "unique_models": 4496 + }, + { + "benchmark": "RewardBench 2", + "result_rows": 1379, + "unique_models": 197 + }, + { + "benchmark": "RewardBench", + "result_rows": 1025, + "unique_models": 179 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "result_rows": 1020, + "unique_models": 139 + }, + { + "benchmark": "BFCL leaderboard CSV", + "result_rows": 3350, + "unique_models": 109 + }, + { + "benchmark": "GSM8K", + "result_rows": 91, + "unique_models": 91 + }, + { + "benchmark": "LegalBench", + "result_rows": 91, + "unique_models": 91 + }, + { + "benchmark": "MATH", + "result_rows": 91, + "unique_models": 91 + }, + { + "benchmark": "MMLU", + "result_rows": 91, + "unique_models": 91 + }, + { + "benchmark": "MedQA", + "result_rows": 91, + "unique_models": 91 + }, + { + "benchmark": "NarrativeQA", + "result_rows": 91, + "unique_models": 91 + }, + { + "benchmark": "NaturalQuestions (closed-book)", + "result_rows": 91, + "unique_models": 91 + }, + { + "benchmark": "OpenbookQA", + "result_rows": 91, + "unique_models": 91 + }, + { + "benchmark": "WMT 2014", + "result_rows": 91, + "unique_models": 91 + }, + { + "benchmark": "helm_lite", + "result_rows": 91, + 
"unique_models": 91 + }, + { + "benchmark": "helm_mmlu", + "result_rows": 2844, + "unique_models": 79 + }, + { + "benchmark": "MMLU-Pro", + "result_rows": 61, + "unique_models": 61 + }, + { + "benchmark": "Omni-MATH", + "result_rows": 61, + "unique_models": 61 + }, + { + "benchmark": "WildBench", + "result_rows": 61, + "unique_models": 61 + }, + { + "benchmark": "helm_capabilities", + "result_rows": 61, + "unique_models": 61 + }, + { + "benchmark": "Wordle Arena Word Set", + "result_rows": 75, + "unique_models": 43 + }, + { + "benchmark": "Fibble Arena (1 lie) Word Set", + "result_rows": 64, + "unique_models": 40 + }, + { + "benchmark": "SciArena leaderboard API", + "result_rows": 114, + "unique_models": 38 + }, + { + "benchmark": "Fibble2 Arena (2 lies) Word Set", + "result_rows": 46, + "unique_models": 38 + }, + { + "benchmark": "Fibble5 Arena (5 lies) Word Set", + "result_rows": 50, + "unique_models": 37 + }, + { + "benchmark": "Fibble3 Arena (3 lies) Word Set", + "result_rows": 40, + "unique_models": 37 + }, + { + "benchmark": "Fibble4 Arena (4 lies) Word Set", + "result_rows": 38, + "unique_models": 36 + }, + { + "benchmark": "wordle_arena_daily", + "result_rows": 92, + "unique_models": 32 + }, + { + "benchmark": "fibble4_arena_daily", + "result_rows": 84, + "unique_models": 28 + }, + { + "benchmark": "fibble5_arena_daily", + "result_rows": 84, + "unique_models": 28 + }, + { + "benchmark": "fibble_arena_daily", + "result_rows": 82, + "unique_models": 28 + }, + { + "benchmark": "global-mmlu-lite", + "result_rows": 912, + "unique_models": 27 + }, + { + "benchmark": "Easy Problems", + "result_rows": 29, + "unique_models": 27 + }, + { + "benchmark": "Hard Problems", + "result_rows": 29, + "unique_models": 27 + }, + { + "benchmark": "Medium Problems", + "result_rows": 29, + "unique_models": 27 + }, + { + "benchmark": "fibble3_arena_daily", + "result_rows": 75, + "unique_models": 25 + }, + { + "benchmark": "fibble2_arena_daily", + "result_rows": 66, + 
"unique_models": 22 + }, + { + "benchmark": "apex-agents", + "result_rows": 74, + "unique_models": 20 + }, + { + "benchmark": "ace", + "result_rows": 32, + "unique_models": 12 + }, + { + "benchmark": "apex-v1", + "result_rows": 19, + "unique_models": 10 + }, + { + "benchmark": "La Leaderboard composite dataset", + "result_rows": 5, + "unique_models": 5 + }, + { + "benchmark": "Anthropic RLHF dataset", + "result_rows": 4, + "unique_models": 4 + }, + { + "benchmark": "Best ChatGPT Prompts", + "result_rows": 4, + "unique_models": 4 + }, + { + "benchmark": "Koala test dataset", + "result_rows": 4, + "unique_models": 4 + }, + { + "benchmark": "Open Assistant", + "result_rows": 4, + "unique_models": 4 + }, + { + "benchmark": "Self Instruct", + "result_rows": 4, + "unique_models": 4 + }, + { + "benchmark": "Vicuna", + "result_rows": 4, + "unique_models": 4 + }, + { + "benchmark": "helm_instruct", + "result_rows": 4, + "unique_models": 4 + }, + { + "benchmark": "appworld/test_normal", + "result_rows": 15, + "unique_models": 3 + }, + { + "benchmark": "browsecompplus", + "result_rows": 15, + "unique_models": 3 + }, + { + "benchmark": "swe-bench", + "result_rows": 15, + "unique_models": 3 + }, + { + "benchmark": "tau-bench-2/airline", + "result_rows": 15, + "unique_models": 3 + }, + { + "benchmark": "tau-bench-2/retail", + "result_rows": 15, + "unique_models": 3 + }, + { + "benchmark": "tau-bench-2/telecom", + "result_rows": 15, + "unique_models": 3 + } + ], + "normalization_exclusions": { + "incompatible_score_type": 0, + "missing_bounds": 0, + "missing_score": 0, + "out_of_range": 100, + "zero_width_bounds": 0 + }, + "normalized_score_summaries": [ + { + "benchmark": "GPQA", + "count": 4635, + "evaluation_name": "GPQA", + "max": 0.791, + "mean": 0.30281846817691477, + "median": 0.2953, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.168, + "stddev": 0.04912650528590854 + }, + { + "benchmark": "IFEval", + "count": 4635, + 
"evaluation_name": "IFEval", + "max": 0.951, + "mean": 0.46067240560949296, + "median": 0.4545, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0, + "stddev": 0.20767533842318336 + }, + { + "benchmark": "BBH", + "count": 4574, + "evaluation_name": "BBH", + "max": 0.8269, + "mean": 0.4867208351552252, + "median": 0.5038, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.2178, + "stddev": 0.11398463853942328 + }, + { + "benchmark": "MATH Level 5", + "count": 4574, + "evaluation_name": "MATH Level 5", + "max": 0.7145, + "mean": 0.1555723874070835, + "median": 0.108, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0, + "stddev": 0.14625658002062183 + }, + { + "benchmark": "MMLU-PRO", + "count": 4574, + "evaluation_name": "MMLU-PRO", + "max": 0.7303, + "mean": 0.32874433756012245, + "median": 0.34475, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.1026, + "stddev": 0.12833971558059434 + }, + { + "benchmark": "MUSR", + "count": 4574, + "evaluation_name": "MUSR", + "max": 0.6024, + "mean": 0.40635732400524704, + "median": 0.4091, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.2929, + "stddev": 0.04536121071938266 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Factuality", + "max": 0.8716, + "mean": 0.6400781725888325, + "median": 0.6779, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0274, + "stddev": 0.14060436598989037 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Focus", + "max": 0.9838, + "mean": 0.6965137055837564, + "median": 0.7293, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0646, + "stddev": 0.1999740938960993 + }, + { + 
"benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Math", + "max": 0.898, + "mean": 0.6002578680203046, + "median": 0.6175, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0546, + "stddev": 0.11530869084864068 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Precise IF", + "max": 0.6625, + "mean": 0.3724553299492386, + "median": 0.375, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.1313, + "stddev": 0.06683254610514013 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Safety", + "max": 0.9756, + "mean": 0.770956345177665, + "median": 0.8044, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0378, + "stddev": 0.16859961817216138 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Score", + "max": 0.8413, + "mean": 0.602605076142132, + "median": 0.6194, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0576, + "stddev": 0.13540270878209892 + }, + { + "benchmark": "RewardBench 2", + "count": 191, + "evaluation_name": "Ties", + "max": 0.9063, + "mean": 0.5524884816753927, + "median": 0.5604, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.008, + "stddev": 0.19526001389051642 + }, + { + "benchmark": "RewardBench", + "count": 187, + "evaluation_name": "Chat Hard", + "max": 0.9145, + "mean": 0.6117941176470588, + "median": 0.6053, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.2654, + "stddev": 0.1713479724227396 + }, + { + "benchmark": "RewardBench", + "count": 187, + "evaluation_name": "Chat", + "max": 0.9944, + "mean": 0.8923390374331551, + "median": 0.9413, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 
0.3547, + "stddev": 0.12437365150350695 + }, + { + "benchmark": "RewardBench", + "count": 187, + "evaluation_name": "Safety", + "max": 0.9514, + "mean": 0.75624064171123, + "median": 0.7946, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.3743, + "stddev": 0.14897429003710377 + }, + { + "benchmark": "RewardBench", + "count": 187, + "evaluation_name": "Score", + "max": 0.9511, + "mean": 0.7524326203208556, + "median": 0.7455, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.4727, + "stddev": 0.12766260032441618 + }, + { + "benchmark": "RewardBench", + "count": 172, + "evaluation_name": "Reasoning", + "max": 0.9912, + "mean": 0.779306976744186, + "median": 0.80125, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.2821, + "stddev": 0.16510278548710738 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 133, + "evaluation_name": "v2_Semi_Private", + "max": 0.9999676010927855, + "mean": 0.9675588722465054, + "median": 0.9969557986781246, + "metric_id": "cost_per_task", + "metric_kind": "cost", + "metric_name": "Cost per task", + "metric_unit": "usd", + "min": 0.0, + "stddev": 0.11170694755172818 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 133, + "evaluation_name": "v2_Semi_Private", + "max": 1.0, + "mean": 0.1482124060150376, + "median": 0.0333, + "metric_id": "score", + "metric_kind": "accuracy", + "metric_name": "ARC score", + "metric_unit": "proportion", + "min": 0.0, + "stddev": 0.23541775910763008 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 131, + "evaluation_name": "v1_Semi_Private", + "max": 0.9999805606556713, + "mean": 0.9826512407799388, + "median": 0.9976983816314812, + "metric_id": "cost_per_task", + "metric_kind": "cost", + "metric_name": "Cost per task", + "metric_unit": "usd", + "min": 0.4264226887417708, + 
"stddev": 0.058880684082207674 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 131, + "evaluation_name": "v1_Semi_Private", + "max": 0.98, + "mean": 0.44456030534351143, + "median": 0.4, + "metric_id": "score", + "metric_kind": "accuracy", + "metric_name": "ARC score", + "metric_unit": "proportion", + "min": 0.0, + "stddev": 0.2907857931349756 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 125, + "evaluation_name": "v2_Public_Eval", + "max": 0.9999663051364969, + "mean": 0.9846037386298053, + "median": 0.9968910008636955, + "metric_id": "cost_per_task", + "metric_kind": "cost", + "metric_name": "Cost per task", + "metric_unit": "usd", + "min": 0.7719116932098415, + "stddev": 0.03818843389200095 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 125, + "evaluation_name": "v2_Public_Eval", + "max": 1.0, + "mean": 0.1310936, + "median": 0.029, + "metric_id": "score", + "metric_kind": "accuracy", + "metric_name": "ARC score", + "metric_unit": "proportion", + "min": 0.0, + "stddev": 0.23801453457380936 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 114, + "evaluation_name": "v1_Public_Eval", + "max": 0.999984448524537, + "mean": 0.9935590006174354, + "median": 0.998216116168769, + "metric_id": "cost_per_task", + "metric_kind": "cost", + "metric_name": "Cost per task", + "metric_unit": "usd", + "min": 0.899950878565301, + "stddev": 0.014694808632437306 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 114, + "evaluation_name": "v1_Public_Eval", + "max": 0.9825, + "mean": 0.5073622807017544, + "median": 0.5056499999999999, + "metric_id": "score", + "metric_kind": "accuracy", + "metric_name": "ARC score", + "metric_unit": "proportion", + "min": 0.0175, + "stddev": 0.2800617230927051 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_accuracy", + "max": 0.9312, + "mean": 
0.6721155963302752, + "median": 0.7076, + "metric_id": "bfcl.live.live_accuracy", + "metric_kind": "accuracy", + "metric_name": "Live accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 0.16692855101327364 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_multiple_ast_accuracy", + "max": 0.9401999999999999, + "mean": 0.6615788990825688, + "median": 0.7104, + "metric_id": "bfcl.live.live_multiple_ast_accuracy", + "metric_kind": "accuracy", + "metric_name": "Live multiple AST accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 0.17084967242914786 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_parallel_ast_accuracy", + "max": 0.9375, + "mean": 0.6427752293577982, + "median": 0.75, + "metric_id": "bfcl.live.live_parallel_ast_accuracy", + "metric_kind": "accuracy", + "metric_name": "Live parallel AST accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 0.24460198666555008 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_parallel_multiple_ast_accuracy", + "max": 0.9582999999999999, + "mean": 0.5703339449541285, + "median": 0.625, + "metric_id": "bfcl.live.live_parallel_multiple_ast_accuracy", + "metric_kind": "accuracy", + "metric_name": "Live parallel multiple AST accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 0.2059801726435246 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_simple_ast_accuracy", + "max": 0.9031, + "mean": 0.726408256880734, + "median": 0.7636, + "metric_id": "bfcl.live.live_simple_ast_accuracy", + "metric_kind": "accuracy", + "metric_name": "Live simple AST accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 0.1625125032958663 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.memory.accuracy", + "max": 0.7376, + "mean": 
0.20235045871559632, + "median": 0.157, + "metric_id": "bfcl.memory.accuracy", + "metric_kind": "accuracy", + "metric_name": "Memory accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 0.1699218603771948 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.memory.kv_accuracy", + "max": 0.7097, + "mean": 0.13904036697247707, + "median": 0.0839, + "metric_id": "bfcl.memory.kv_accuracy", + "metric_kind": "accuracy", + "metric_name": "Memory KV accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 0.1515138492137527 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.memory.recursive_summarization_accuracy", + "max": 0.8323, + "mean": 0.2820403669724771, + "median": 0.271, + "metric_id": "bfcl.memory.recursive_summarization_accuracy", + "metric_kind": "accuracy", + "metric_name": "Memory recursive summarization accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 0.208463795648454 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.memory.vector_accuracy", + "max": 0.7290000000000001, + "mean": 0.18597155963302753, + "median": 0.1161, + "metric_id": "bfcl.memory.vector_accuracy", + "metric_kind": "accuracy", + "metric_name": "Memory vector accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 0.18379301567138523 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.accuracy", + "max": 0.7737999999999999, + "mean": 0.23962385321100918, + "median": 0.165, + "metric_id": "bfcl.multi_turn.accuracy", + "metric_kind": "accuracy", + "metric_name": "Multi-turn accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 0.21479676048452157 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.base_accuracy", + "max": 0.825, + "mean": 0.29009174311926605, + "median": 0.2, + "metric_id": 
"bfcl.multi_turn.base_accuracy", + "metric_kind": "accuracy", + "metric_name": "Multi-turn base accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 0.24897845144318115 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.long_context_accuracy", + "max": 0.76, + "mean": 0.24009174311926607, + "median": 0.175, + "metric_id": "bfcl.multi_turn.long_context_accuracy", + "metric_kind": "accuracy", + "metric_name": "Multi-turn long-context accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 0.2138372755020874 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.miss_function_accuracy", + "max": 0.77, + "mean": 0.21591743119266055, + "median": 0.14, + "metric_id": "bfcl.multi_turn.miss_function_accuracy", + "metric_kind": "accuracy", + "metric_name": "Multi-turn missing function accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 0.2171396175036615 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.miss_parameter_accuracy", + "max": 0.74, + "mean": 0.21238532110091743, + "median": 0.15, + "metric_id": "bfcl.multi_turn.miss_parameter_accuracy", + "metric_kind": "accuracy", + "metric_name": "Multi-turn missing parameter accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 0.194452693868985 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.non_live.ast_accuracy", + "max": 0.9065000000000001, + "mean": 0.7661733944954129, + "median": 0.83, + "metric_id": "bfcl.non_live.ast_accuracy", + "metric_kind": "accuracy", + "metric_name": "Non-live AST accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 0.18657086363085557 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.non_live.multiple_ast_accuracy", + "max": 0.97, + "mean": 0.8535779816513761, + "median": 0.92, + "metric_id": 
"bfcl.non_live.multiple_ast_accuracy", + "metric_kind": "accuracy", + "metric_name": "Non-live multiple AST accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 0.182740318362281 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.non_live.parallel_ast_accuracy", + "max": 0.96, + "mean": 0.7979816513761467, + "median": 0.88, + "metric_id": "bfcl.non_live.parallel_ast_accuracy", + "metric_kind": "accuracy", + "metric_name": "Non-live parallel AST accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 0.2273336991546167 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.non_live.parallel_multiple_ast_accuracy", + "max": 0.925, + "mean": 0.7347706422018349, + "median": 0.825, + "metric_id": "bfcl.non_live.parallel_multiple_ast_accuracy", + "metric_kind": "accuracy", + "metric_name": "Non-live parallel multiple AST accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 0.24427840192832814 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.non_live.simple_ast_accuracy", + "max": 0.8067, + "mean": 0.6783633027522936, + "median": 0.7258, + "metric_id": "bfcl.non_live.simple_ast_accuracy", + "metric_kind": "accuracy", + "metric_name": "Non-live simple AST accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 0.14843039998882532 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.latency_mean_s", + "max": 0.9959969388355802, + "mean": 0.910949171600733, + "median": 0.9723906516748102, + "metric_id": "bfcl.overall.latency_mean_s", + "metric_kind": "latency", + "metric_name": "Latency mean", + "metric_unit": "seconds", + "min": 0.0, + "stddev": 0.16788751393048792 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.latency_p95_s", + "max": 0.9983116129372659, + "mean": 0.9052860681766953, + "median": 
0.9794227826729278, + "metric_id": "bfcl.overall.latency_p95_s", + "metric_kind": "latency", + "metric_name": "Latency 95th percentile", + "metric_unit": "seconds", + "min": 0.0, + "stddev": 0.17750828285090742 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.latency_std_s", + "max": 0.9978872247523358, + "mean": 0.8712378709255851, + "median": 0.9528616366965585, + "metric_id": "bfcl.overall.latency_std_s", + "metric_kind": "latency", + "metric_name": "Latency standard deviation", + "metric_unit": "seconds", + "min": 0.0, + "stddev": 0.18715211182331667 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.overall_accuracy", + "max": 0.7746999999999999, + "mean": 0.3809394495412844, + "median": 0.3552, + "metric_id": "bfcl.overall.overall_accuracy", + "metric_kind": "accuracy", + "metric_name": "Overall accuracy", + "metric_unit": "percentage", + "min": 0.0717, + "stddev": 0.1568359888890471 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.rank", + "max": 1.0, + "mean": 0.5, + "median": 0.5, + "metric_id": "bfcl.overall.rank", + "metric_kind": "rank", + "metric_name": "Overall rank", + "metric_unit": "position", + "min": 0.0, + "stddev": 0.2926814601721238 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.total_cost_usd", + "max": 0.9987048455669116, + "mean": 0.8673404362764129, + "median": 0.9486161556437762, + "metric_id": "bfcl.overall.total_cost_usd", + "metric_kind": "cost", + "metric_name": "Total cost", + "metric_unit": "usd", + "min": 0.0, + "stddev": 0.2029161256124978 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.relevance.irrelevance_detection_accuracy", + "max": 1.0, + "mean": 0.7561073394495413, + "median": 0.8079000000000001, + "metric_id": "bfcl.relevance.irrelevance_detection_accuracy", + "metric_kind": 
"accuracy", + "metric_name": "Irrelevance detection accuracy", + "metric_unit": "percentage", + "min": 0.06280000000000001, + "stddev": 0.16896574532662487 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.relevance.relevance_detection_accuracy", + "max": 1.0, + "mean": 0.7637614678899083, + "median": 0.8125, + "metric_id": "bfcl.relevance.relevance_detection_accuracy", + "metric_kind": "accuracy", + "metric_name": "Relevance detection accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 0.19862042242738473 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.web_search.accuracy", + "max": 0.845, + "mean": 0.24573394495412845, + "median": 0.105, + "metric_id": "bfcl.web_search.accuracy", + "metric_kind": "accuracy", + "metric_name": "Web-search accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 0.28751797503234583 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.web_search.base_accuracy", + "max": 0.87, + "mean": 0.2646788990825688, + "median": 0.13, + "metric_id": "bfcl.web_search.base_accuracy", + "metric_kind": "accuracy", + "metric_name": "Web-search base accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 0.29552705211555524 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.web_search.no_snippet_accuracy", + "max": 0.85, + "mean": 0.22678899082568807, + "median": 0.09, + "metric_id": "bfcl.web_search.no_snippet_accuracy", + "metric_kind": "accuracy", + "metric_name": "Web-search no-snippet accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 0.28410639873751836 + }, + { + "benchmark": "RewardBench", + "count": 105, + "evaluation_name": "Prior Sets (0.5 weight)", + "max": 0.782, + "mean": 0.5625428571428571, + "median": 0.5757, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0, + 
"stddev": 0.17788750218625798 + }, + { + "benchmark": "LegalBench", + "count": 91, + "evaluation_name": "LegalBench", + "max": 0.757, + "mean": 0.5902087912087912, + "median": 0.629, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.331, + "stddev": 0.11619442676283923 + }, + { + "benchmark": "MATH", + "count": 91, + "evaluation_name": "MATH", + "max": 0.92, + "mean": 0.5574065934065934, + "median": 0.656, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.026, + "stddev": 0.2685588691111619 + }, + { + "benchmark": "MMLU", + "count": 91, + "evaluation_name": "MMLU", + "max": 0.809, + "mean": 0.6220989010989011, + "median": 0.643, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.243, + "stddev": 0.12023218786489331 + }, + { + "benchmark": "MedQA", + "count": 91, + "evaluation_name": "MedQA", + "max": 0.863, + "mean": 0.6103296703296703, + "median": 0.64, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.229, + "stddev": 0.15792234765120447 + }, + { + "benchmark": "NarrativeQA", + "count": 91, + "evaluation_name": "NarrativeQA", + "max": 0.804, + "mean": 0.6938461538461539, + "median": 0.742, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.111, + "stddev": 0.1228501275789075 + }, + { + "benchmark": "NaturalQuestions (closed-book)", + "count": 91, + "evaluation_name": "NaturalQuestions (closed-book)", + "max": 0.502, + "mean": 0.3627912087912088, + "median": 0.378, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.028, + "stddev": 0.08850543190907255 + }, + { + "benchmark": "OpenbookQA", + "count": 91, + "evaluation_name": "OpenbookQA", + "max": 0.972, + "mean": 0.8312527472527472, + "median": 0.882, + "metric_id": null, + "metric_kind": null, + "metric_name": null, 
+ "metric_unit": null, + "min": 0.222, + "stddev": 0.16911788087383792 + }, + { + "benchmark": "WMT 2014", + "count": 91, + "evaluation_name": "WMT 2014", + "max": 0.262, + "mean": 0.18178021978021977, + "median": 0.191, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.023, + "stddev": 0.04641450975187302 + }, + { + "benchmark": "helm_lite", + "count": 91, + "evaluation_name": "Mean win rate", + "max": 0.938, + "mean": 0.499967032967033, + "median": 0.488, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.041, + "stddev": 0.24004497034928224 + }, + { + "benchmark": "GSM8K", + "count": 90, + "evaluation_name": "GSM8K", + "max": 0.956, + "mean": 0.6740333333333334, + "median": 0.765, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.028, + "stddev": 0.24790177694247365 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Abstract Algebra", + "max": 0.84, + "mean": 0.4692405063291139, + "median": 0.44, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.21, + "stddev": 0.1566784405169303 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Anatomy", + "max": 0.911, + "mean": 0.7049620253164557, + "median": 0.719, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.222, + "stddev": 0.12203524533321435 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Astronomy", + "max": 0.974, + "mean": 0.8196835443037974, + "median": 0.855, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.342, + "stddev": 0.12503810130124515 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Business Ethics", + "max": 0.89, + "mean": 0.7354430379746836, + "median": 0.77, + "metric_id": null, + "metric_kind": null, 
+ "metric_name": null, + "metric_unit": null, + "min": 0.24, + "stddev": 0.1177001565076888 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Clinical Knowledge", + "max": 0.928, + "mean": 0.7806329113924051, + "median": 0.8, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.26, + "stddev": 0.10518545005348215 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "College Physics", + "max": 0.863, + "mean": 0.5205189873417722, + "median": 0.51, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.196, + "stddev": 0.13341576241396605 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Computer Security", + "max": 0.89, + "mean": 0.7888607594936708, + "median": 0.8, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.3, + "stddev": 0.07740978772295665 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Conceptual Physics", + "max": 0.949, + "mean": 0.7394050632911392, + "median": 0.774, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.319, + "stddev": 0.1436847973853721 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Econometrics", + "max": 0.807, + "mean": 0.5924556962025317, + "median": 0.614, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.307, + "stddev": 0.12405156056525753 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Electrical Engineering", + "max": 0.869, + "mean": 0.7012531645569621, + "median": 0.724, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.29, + "stddev": 0.10967007262512768 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Elementary Mathematics", + "max": 0.942, + "mean": 
0.6168481012658228, + "median": 0.622, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.254, + "stddev": 0.17076712953141734 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Formal Logic", + "max": 0.786, + "mean": 0.5559240506329114, + "median": 0.571, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.27, + "stddev": 0.11667484646986527 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Global Facts", + "max": 0.8, + "mean": 0.49860759493670886, + "median": 0.5, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.25, + "stddev": 0.11856767165669667 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "High School World History", + "max": 0.958, + "mean": 0.8590253164556962, + "median": 0.89, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.253, + "stddev": 0.1104488482004626 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Human Sexuality", + "max": 0.939, + "mean": 0.7969367088607595, + "median": 0.84, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.267, + "stddev": 0.14067149783040647 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "International Law", + "max": 0.959, + "mean": 0.8525189873417721, + "median": 0.884, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.306, + "stddev": 0.09770414010589916 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Jurisprudence", + "max": 0.907, + "mean": 0.8231518987341773, + "median": 0.852, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.25, + "stddev": 0.09722219971870344 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + 
"evaluation_name": "Logical Fallacies", + "max": 0.926, + "mean": 0.8139873417721519, + "median": 0.834, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.264, + "stddev": 0.0972786763034739 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "MMLU All Subjects", + "max": 0.873, + "mean": 0.7308227848101266, + "median": 0.757, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.295, + "stddev": 0.10005918242229046 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Machine Learning", + "max": 0.839, + "mean": 0.592126582278481, + "median": 0.616, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.286, + "stddev": 0.12807703682255595 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Management", + "max": 0.942, + "mean": 0.8453037974683544, + "median": 0.864, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.272, + "stddev": 0.09395052631917909 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Marketing", + "max": 0.962, + "mean": 0.9024556962025316, + "median": 0.923, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.269, + "stddev": 0.08556236254220637 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Mean win rate", + "max": 1.0, + "mean": 0.5000506329113924, + "median": 0.517, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.014, + "stddev": 0.2741845671999428 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Medical Genetics", + "max": 0.98, + "mean": 0.8162025316455697, + "median": 0.84, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.28, + "stddev": 0.11717074761250226 + 
}, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Miscellaneous", + "max": 0.964, + "mean": 0.8688607594936709, + "median": 0.893, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.292, + "stddev": 0.09859535722376811 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Moral Scenarios", + "max": 0.902, + "mean": 0.5793924050632911, + "median": 0.575, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.231, + "stddev": 0.19478445797799818 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Nutrition", + "max": 0.928, + "mean": 0.7968987341772152, + "median": 0.82, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.34, + "stddev": 0.1008295839442827 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Philosophy", + "max": 0.9, + "mean": 0.7844303797468355, + "median": 0.807, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.325, + "stddev": 0.09312807331625374 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Prehistory", + "max": 0.951, + "mean": 0.824746835443038, + "median": 0.858, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.318, + "stddev": 0.10757030716441658 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Professional Psychology", + "max": 0.922, + "mean": 0.7793291139240506, + "median": 0.812, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.232, + "stddev": 0.1177310844427953 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Public Relations", + "max": 0.855, + "mean": 0.724873417721519, + "median": 0.736, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + 
"min": 0.345, + "stddev": 0.0757594653625247 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Security Studies", + "max": 0.886, + "mean": 0.778126582278481, + "median": 0.804, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.408, + "stddev": 0.09570378540441088 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Sociology", + "max": 0.96, + "mean": 0.8729493670886076, + "median": 0.9, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.383, + "stddev": 0.08587676004752948 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Us Foreign Policy", + "max": 0.97, + "mean": 0.8918987341772152, + "median": 0.92, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.26, + "stddev": 0.09360413026947771 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Virology", + "max": 0.602, + "mean": 0.5457215189873418, + "median": 0.56, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.392, + "stddev": 0.047070851318166546 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "World Religions", + "max": 0.924, + "mean": 0.8426455696202532, + "median": 0.865, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.234, + "stddev": 0.08472202480187987 + }, + { + "benchmark": "MMLU-Pro", + "count": 61, + "evaluation_name": "MMLU-Pro", + "max": 0.875, + "mean": 0.6609344262295082, + "median": 0.723, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.169, + "stddev": 0.1866150109050233 + }, + { + "benchmark": "Omni-MATH", + "count": 61, + "evaluation_name": "Omni-MATH", + "max": 0.722, + "mean": 0.3746065573770492, + "median": 0.364, + "metric_id": null, + "metric_kind": null, + 
"metric_name": null, + "metric_unit": null, + "min": 0.072, + "stddev": 0.17904862269679006 + }, + { + "benchmark": "WildBench", + "count": 61, + "evaluation_name": "WildBench", + "max": 0.866, + "mean": 0.7791803278688525, + "median": 0.797, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.477, + "stddev": 0.07613989497338025 + }, + { + "benchmark": "helm_capabilities", + "count": 61, + "evaluation_name": "Mean score", + "max": 0.819, + "mean": 0.6281803278688525, + "median": 0.642, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.325, + "stddev": 0.12667261058817744 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Arabic", + "max": 0.9475, + "mean": 0.8123458333333333, + "median": 0.82375, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.455, + "stddev": 0.11404825771861875 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Bengali", + "max": 0.9425, + "mean": 0.8118458333333334, + "median": 0.82375, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.5175, + "stddev": 0.10786060736231451 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Burmese", + "max": 0.945, + "mean": 0.8254416666666666, + "median": 0.8375, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.63, + "stddev": 0.08983182356393916 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Chinese", + "max": 0.9475, + "mean": 0.80325, + "median": 0.835, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.5075, + "stddev": 0.12931314787277418 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Culturally Agnostic", + "max": 0.9528, + "mean": 0.8264125, + 
"median": 0.857, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.5631, + "stddev": 0.10811543599320127 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Culturally Sensitive", + "max": 0.9397, + "mean": 0.788525, + "median": 0.78935, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.5391, + "stddev": 0.1149148963548909 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "English", + "max": 0.9475, + "mean": 0.7939833333333334, + "median": 0.84, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.38, + "stddev": 0.15081692344416497 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "French", + "max": 0.9575, + "mean": 0.7944791666666666, + "median": 0.8275, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.41, + "stddev": 0.14230966528431346 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "German", + "max": 0.94, + "mean": 0.8004833333333333, + "median": 0.8275, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.4775, + "stddev": 0.12445258061886479 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Global MMLU Lite", + "max": 0.9453, + "mean": 0.8074583333333334, + "median": 0.82315, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.5511, + "stddev": 0.11081356363967734 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Hindi", + "max": 0.9475, + "mean": 0.7983333333333333, + "median": 0.82355, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.555, + "stddev": 0.11719085240122123 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + 
"evaluation_name": "Indonesian", + "max": 0.955, + "mean": 0.801275, + "median": 0.80625, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.515, + "stddev": 0.11649187077838011 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Italian", + "max": 0.955, + "mean": 0.8056875, + "median": 0.8300000000000001, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.48, + "stddev": 0.1239779332201175 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Japanese", + "max": 0.94, + "mean": 0.8170291666666667, + "median": 0.84375, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.58, + "stddev": 0.10297801657229139 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Korean", + "max": 0.95, + "mean": 0.820125, + "median": 0.84, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.595, + "stddev": 0.10111529652574511 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Portuguese", + "max": 0.945, + "mean": 0.8010041666666666, + "median": 0.8323, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.5175, + "stddev": 0.12492813757011505 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Spanish", + "max": 0.9475, + "mean": 0.8042458333333333, + "median": 0.8325, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.485, + "stddev": 0.12684843352857172 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Swahili", + "max": 0.94, + "mean": 0.8143708333333334, + "median": 0.8200000000000001, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.6075, + "stddev": 
0.09313423156204427 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Yoruba", + "max": 0.9425, + "mean": 0.8155583333333334, + "median": 0.8223, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.5825, + "stddev": 0.09530013023440752 + }, + { + "benchmark": "Wordle Arena Word Set", + "count": 43, + "evaluation_name": "wordle_arena_win_rate", + "max": 1.0, + "mean": 0.38320930232558137, + "median": 0.3, + "metric_id": "wordle_arena.win_rate", + "metric_kind": null, + "metric_name": "Win Rate", + "metric_unit": "proportion", + "min": 0.0, + "stddev": 0.3652171551113076 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 40, + "evaluation_name": "bfcl.format_sensitivity.max_delta", + "max": 1.0, + "mean": 0.654875, + "median": 0.775, + "metric_id": "bfcl.format_sensitivity.max_delta", + "metric_kind": "difference", + "metric_name": "Format sensitivity max delta", + "metric_unit": "percentage_points", + "min": 0.18500000000000005, + "stddev": 0.2671153776928614 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 40, + "evaluation_name": "bfcl.format_sensitivity.stddev", + "max": 1.0, + "mean": 0.8911525, + "median": 0.9430000000000001, + "metric_id": "bfcl.format_sensitivity.stddev", + "metric_kind": "difference", + "metric_name": "Format sensitivity standard deviation", + "metric_unit": "percentage_points", + "min": 0.6582, + "stddev": 0.0973636534813211 + }, + { + "benchmark": "Fibble Arena (1 lie) Word Set", + "count": 40, + "evaluation_name": "fibble1_arena_win_rate", + "max": 0.881, + "mean": 0.1804375, + "median": 0.08990000000000001, + "metric_id": "fibble1_arena.win_rate", + "metric_kind": null, + "metric_name": "Win Rate", + "metric_unit": "proportion", + "min": 0.0, + "stddev": 0.24891262362266392 + }, + { + "benchmark": "Fibble2 Arena (2 lies) Word Set", + "count": 38, + "evaluation_name": "fibble2_arena_win_rate", + "max": 0.3, + "mean": 0.03426315789473684, + 
"median": 0.0, + "metric_id": "fibble2_arena.win_rate", + "metric_kind": null, + "metric_name": "Win Rate", + "metric_unit": "proportion", + "min": 0.0, + "stddev": 0.07868617592468241 + }, + { + "benchmark": "SciArena leaderboard API", + "count": 38, + "evaluation_name": "overall_cost_per_100_calls_usd", + "max": 0.9980941077911198, + "mean": 0.8975751958435603, + "median": 0.9749930187098577, + "metric_id": "cost_per_100_calls_usd", + "metric_kind": "cost", + "metric_name": "Cost per 100 calls", + "metric_unit": "usd", + "min": 0.0, + "stddev": 0.22179374764532372 + }, + { + "benchmark": "SciArena leaderboard API", + "count": 38, + "evaluation_name": "overall_elo", + "max": 1.0, + "mean": 0.5286286504070941, + "median": 0.5551610136078723, + "metric_id": "elo", + "metric_kind": "elo", + "metric_name": "Elo rating", + "metric_unit": "points", + "min": 0.0, + "stddev": 0.23322432731710405 + }, + { + "benchmark": "SciArena leaderboard API", + "count": 38, + "evaluation_name": "overall_rank", + "max": 1.0, + "mean": 0.5, + "median": 0.5, + "metric_id": "rank", + "metric_kind": "rank", + "metric_name": "Rank", + "metric_unit": "position", + "min": 0.0, + "stddev": 0.30035284825530906 + }, + { + "benchmark": "Fibble3 Arena (3 lies) Word Set", + "count": 37, + "evaluation_name": "fibble3_arena_win_rate", + "max": 0.333, + "mean": 0.010551351351351351, + "median": 0.0, + "metric_id": "fibble3_arena.win_rate", + "metric_kind": null, + "metric_name": "Win Rate", + "metric_unit": "proportion", + "min": 0.0, + "stddev": 0.0548872866357432 + }, + { + "benchmark": "Fibble5 Arena (5 lies) Word Set", + "count": 37, + "evaluation_name": "fibble5_arena_win_rate", + "max": 0.6364, + "mean": 0.09143783783783783, + "median": 0.0, + "metric_id": "fibble5_arena.win_rate", + "metric_kind": null, + "metric_name": "Win Rate", + "metric_unit": "proportion", + "min": 0.0, + "stddev": 0.17735344709138076 + }, + { + "benchmark": "Fibble4 Arena (4 lies) Word Set", + "count": 36, + 
"evaluation_name": "fibble4_arena_win_rate", + "max": 0.0732, + "mean": 0.0028055555555555555, + "median": 0.0, + "metric_id": "fibble4_arena.win_rate", + "metric_kind": null, + "metric_name": "Win Rate", + "metric_unit": "proportion", + "min": 0.0, + "stddev": 0.012925853261571653 + }, + { + "benchmark": "Wordle Arena Word Set", + "count": 32, + "evaluation_name": "wordle_arena_avg_attempts", + "max": 0.534, + "mean": 0.29775, + "median": 0.308, + "metric_id": "wordle_arena.avg_attempts", + "metric_kind": null, + "metric_name": "Average Attempts", + "metric_unit": "guesses", + "min": 0.0, + "stddev": 0.15686834422328438 + }, + { + "benchmark": "wordle_arena_daily", + "count": 32, + "evaluation_name": "wordle_arena_avg_attempts", + "max": 0.46599999999999997, + "mean": 0.161125, + "median": 0.12700000000000006, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0, + "stddev": 0.1632183872469149 + }, + { + "benchmark": "wordle_arena_daily", + "count": 32, + "evaluation_name": "wordle_arena_win_rate", + "max": 1.0, + "mean": 0.4733125, + "median": 0.4165, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0, + "stddev": 0.43730335211863347 + }, + { + "benchmark": "Easy Problems", + "count": 29, + "evaluation_name": "Easy Problems", + "max": 0.9014, + "mean": 0.4996824672170957, + "median": 0.5352, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.056338028169014086, + "stddev": 0.2844141675332875 + }, + { + "benchmark": "Hard Problems", + "count": 29, + "evaluation_name": "Hard Problems", + "max": 0.1594, + "mean": 0.009876639145216123, + "median": 0.0, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0, + "stddev": 0.03091194562296296 + }, + { + "benchmark": "Medium Problems", + "count": 29, + "evaluation_name": "Medium Problems", + "max": 0.5211, + "mean": 
0.11304244779018942, + "median": 0.056338028169014086, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0, + "stddev": 0.14517143188248943 + }, + { + "benchmark": "fibble4_arena_daily", + "count": 28, + "evaluation_name": "fibble4_arena_avg_latency_ms", + "max": 0.9995416666666667, + "mean": 0.8651820238095238, + "median": 0.9580791666666666, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.25770499999999996, + "stddev": 0.21579466004033868 + }, + { + "benchmark": "fibble4_arena_daily", + "count": 28, + "evaluation_name": "fibble4_arena_win_rate", + "max": 0.667, + "mean": 0.06310714285714286, + "median": 0.0, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0, + "stddev": 0.1740985968598582 + }, + { + "benchmark": "fibble5_arena_daily", + "count": 28, + "evaluation_name": "fibble5_arena_avg_latency_ms", + "max": 0.9994566666666667, + "mean": 0.94403, + "median": 0.9936783333333333, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.6904066666666666, + "stddev": 0.07859265956074572 + }, + { + "benchmark": "fibble5_arena_daily", + "count": 28, + "evaluation_name": "fibble5_arena_win_rate", + "max": 1.0, + "mean": 0.2728214285714286, + "median": 0.0, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0, + "stddev": 0.40591789477871915 + }, + { + "benchmark": "fibble_arena_daily", + "count": 28, + "evaluation_name": "fibble_arena_avg_attempts", + "max": 0.48571428571428577, + "mean": 0.0935204081632653, + "median": 0.027142857142857135, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0, + "stddev": 0.1390761452271949 + }, + { + "benchmark": "fibble_arena_daily", + "count": 28, + "evaluation_name": "fibble_arena_win_rate", + "max": 1.0, + "mean": 0.2829642857142857, 
+ "median": 0.14300000000000002, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0, + "stddev": 0.34601974704330873 + }, + { + "benchmark": "wordle_arena_daily", + "count": 28, + "evaluation_name": "wordle_arena_avg_latency_ms", + "max": 0.99951, + "mean": 0.9754844642857143, + "median": 0.9956175, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.91617, + "stddev": 0.02958724541010069 + }, + { + "benchmark": "fibble_arena_daily", + "count": 26, + "evaluation_name": "fibble_arena_avg_latency_ms", + "max": 0.9997083333333333, + "mean": 0.9607440384615384, + "median": 0.9966616666666667, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.80352, + "stddev": 0.05500541034838315 + }, + { + "benchmark": "fibble3_arena_daily", + "count": 25, + "evaluation_name": "fibble3_arena_avg_latency_ms", + "max": 0.999565, + "mean": 0.9165300666666667, + "median": 0.9949433333333333, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.4411333333333334, + "stddev": 0.13092548313502184 + }, + { + "benchmark": "fibble3_arena_daily", + "count": 25, + "evaluation_name": "fibble3_arena_win_rate", + "max": 1.0, + "mean": 0.07, + "median": 0.0, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0, + "stddev": 0.22267315359812312 + }, + { + "benchmark": "Fibble Arena (1 lie) Word Set", + "count": 24, + "evaluation_name": "fibble1_arena_avg_attempts", + "max": 0.6071428571428572, + "mean": 0.30160714285714285, + "median": 0.2957142857142857, + "metric_id": "fibble1_arena.avg_attempts", + "metric_kind": null, + "metric_name": "Average Attempts", + "metric_unit": "guesses", + "min": 0.0, + "stddev": 0.13236522334576908 + }, + { + "benchmark": "fibble2_arena_daily", + "count": 22, + "evaluation_name": "fibble2_arena_avg_latency_ms", + "max": 
0.99951, + "mean": 0.9446994696969697, + "median": 0.9964583333333333, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.774135, + "stddev": 0.0790297132511389 + }, + { + "benchmark": "fibble2_arena_daily", + "count": 22, + "evaluation_name": "fibble2_arena_win_rate", + "max": 0.75, + "mean": 0.049227272727272724, + "median": 0.0, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0, + "stddev": 0.17183545058550076 + }, + { + "benchmark": "apex-agents", + "count": 19, + "evaluation_name": "Corporate Lawyer Mean Score", + "max": 0.548, + "mean": 0.38605263157894737, + "median": 0.394, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.147, + "stddev": 0.1127334484940327 + }, + { + "benchmark": "appworld/test_normal", + "count": 15, + "evaluation_name": "appworld/test_normal", + "max": 0.7, + "mean": 0.38053333333333333, + "median": 0.505, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0, + "stddev": 0.2795156184408681 + }, + { + "benchmark": "browsecompplus", + "count": 15, + "evaluation_name": "browsecompplus", + "max": 0.61, + "mean": 0.47951333333333335, + "median": 0.48, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.26, + "stddev": 0.09206274930461185 + }, + { + "benchmark": "swe-bench", + "count": 15, + "evaluation_name": "swe-bench", + "max": 0.8072, + "mean": 0.6515666666666666, + "median": 0.65, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.5253, + "stddev": 0.08692541685397948 + }, + { + "benchmark": "tau-bench-2/airline", + "count": 15, + "evaluation_name": "tau-bench-2/airline", + "max": 0.74, + "mean": 0.6333333333333333, + "median": 0.66, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.48, + 
"stddev": 0.0830375703837612 + }, + { + "benchmark": "tau-bench-2/retail", + "count": 15, + "evaluation_name": "tau-bench-2/retail", + "max": 0.85, + "mean": 0.7409, + "median": 0.78, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.51, + "stddev": 0.09942657736095659 + }, + { + "benchmark": "tau-bench-2/telecom", + "count": 15, + "evaluation_name": "tau-bench-2/telecom", + "max": 0.8876, + "mean": 0.69824, + "median": 0.73, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.53, + "stddev": 0.12537163497834292 + }, + { + "benchmark": "Fibble5 Arena (5 lies) Word Set", + "count": 13, + "evaluation_name": "fibble5_arena_avg_attempts", + "max": 0.8571428571428572, + "mean": 0.31, + "median": 0.2857142857142857, + "metric_id": "fibble5_arena.avg_attempts", + "metric_kind": null, + "metric_name": "Average Attempts", + "metric_unit": "guesses", + "min": 0.1428571428571429, + "stddev": 0.18710286741040952 + }, + { + "benchmark": "apex-agents", + "count": 13, + "evaluation_name": "Overall Pass@1", + "max": 0.335, + "mean": 0.20892307692307693, + "median": 0.23, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.04, + "stddev": 0.09209276259878907 + }, + { + "benchmark": "ace", + "count": 12, + "evaluation_name": "Gaming Score", + "max": 0.613, + "mean": 0.4613333333333333, + "median": 0.46199999999999997, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.284, + "stddev": 0.130447713895668 + }, + { + "benchmark": "ace", + "count": 11, + "evaluation_name": "Overall Score", + "max": 0.561, + "mean": 0.47963636363636364, + "median": 0.478, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.38, + "stddev": 0.06701831500011432 + }, + { + "benchmark": "apex-agents", + "count": 9, + "evaluation_name": "Corporate Law Pass@1", + 
"max": 0.266, + "mean": 0.18122222222222223, + "median": 0.189, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.078, + "stddev": 0.06984586204238906 + }, + { + "benchmark": "apex-agents", + "count": 9, + "evaluation_name": "Overall Mean Score", + "max": 0.401, + "mean": 0.3071111111111111, + "median": 0.341, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.115, + "stddev": 0.10572658658592508 + }, + { + "benchmark": "fibble5_arena_daily", + "count": 9, + "evaluation_name": "fibble5_arena_avg_attempts", + "max": 0.7142857142857143, + "mean": 0.29873015873015873, + "median": 0.18428571428571427, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0714285714285714, + "stddev": 0.239007775089615 + }, + { + "benchmark": "Fibble2 Arena (2 lies) Word Set", + "count": 8, + "evaluation_name": "fibble2_arena_avg_attempts", + "max": 0.5714285714285714, + "mean": 0.28875, + "median": 0.3692857142857143, + "metric_id": "fibble2_arena.avg_attempts", + "metric_kind": null, + "metric_name": "Average Attempts", + "metric_unit": "guesses", + "min": 0.0, + "stddev": 0.21391319661285788 + }, + { + "benchmark": "apex-agents", + "count": 8, + "evaluation_name": "Investment Banking Pass@1", + "max": 0.273, + "mean": 0.17825000000000002, + "median": 0.202, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.012, + "stddev": 0.10551607866644239 + }, + { + "benchmark": "apex-agents", + "count": 8, + "evaluation_name": "Management Consulting Pass@1", + "max": 0.227, + "mean": 0.122875, + "median": 0.1235, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.029, + "stddev": 0.06801982379109364 + }, + { + "benchmark": "apex-agents", + "count": 8, + "evaluation_name": "Overall Pass@8", + "max": 0.4, + "mean": 0.29725, + "median": 0.3345, + 
"metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.115, + "stddev": 0.10747724011555723 + }, + { + "benchmark": "apex-v1", + "count": 7, + "evaluation_name": "Overall Score", + "max": 0.67, + "mean": 0.6027142857142858, + "median": 0.64, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.359, + "stddev": 0.10972953763035471 + }, + { + "benchmark": "La Leaderboard composite dataset", + "count": 5, + "evaluation_name": "la_leaderboard", + "max": 0.3362, + "mean": 0.28874, + "median": 0.2761, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.2587, + "stddev": 0.03096309093097781 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 4, + "evaluation_name": "v3_Semi_Private", + "max": 0.7503406194310979, + "mean": 0.4348723241072839, + "median": 0.49457433849901883, + "metric_id": "cost", + "metric_kind": "cost", + "metric_name": "Cost", + "metric_unit": "usd", + "min": 0.0, + "stddev": 0.32065117443368096 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 4, + "evaluation_name": "v3_Semi_Private", + "max": 0.0026, + "mean": 0.001775, + "median": 0.0022500000000000003, + "metric_id": "score", + "metric_kind": "accuracy", + "metric_name": "ARC score", + "metric_unit": "proportion", + "min": 0.0, + "stddev": 0.001195477589361953 + }, + { + "benchmark": "Anthropic RLHF dataset", + "count": 4, + "evaluation_name": "Anthropic RLHF dataset", + "max": 0.993, + "mean": 0.9538500000000001, + "median": 0.9898, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.8428000000000001, + "stddev": 0.07408920299206893 + }, + { + "benchmark": "Best ChatGPT Prompts", + "count": 4, + "evaluation_name": "Best ChatGPT Prompts", + "max": 0.999, + "mean": 0.9971, + "median": 0.9974000000000001, + "metric_id": null, + "metric_kind": null, + 
"metric_name": null, + "metric_unit": null, + "min": 0.9945999999999999, + "stddev": 0.0018366636418608235 + }, + { + "benchmark": "Koala test dataset", + "count": 4, + "evaluation_name": "Koala test dataset", + "max": 0.9974000000000001, + "mean": 0.99515, + "median": 0.995, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.9932000000000001, + "stddev": 0.0019824227601598896 + }, + { + "benchmark": "Open Assistant", + "count": 4, + "evaluation_name": "Open Assistant", + "max": 0.9974000000000001, + "mean": 0.9957499999999999, + "median": 0.9961, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.9934, + "stddev": 0.0019070046320517552 + }, + { + "benchmark": "Self Instruct", + "count": 4, + "evaluation_name": "Self Instruct", + "max": 0.9984, + "mean": 0.99645, + "median": 0.9965999999999999, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.9942, + "stddev": 0.0020680103158994805 + }, + { + "benchmark": "Vicuna", + "count": 4, + "evaluation_name": "Vicuna", + "max": 0.999, + "mean": 0.99855, + "median": 0.9986999999999999, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.9978, + "stddev": 0.0005744562646537976 + }, + { + "benchmark": "helm_instruct", + "count": 4, + "evaluation_name": "Mean win rate", + "max": 0.689, + "mean": 0.5, + "median": 0.611, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.089, + "stddev": 0.27645614480419856 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 3, + "evaluation_name": "v2_Private_Eval", + "max": 0.9983152568248455, + "mean": 0.9251369250630029, + "median": 0.9974080874228392, + "metric_id": "cost_per_task", + "metric_kind": "cost", + "metric_name": "Cost per task", + "metric_unit": "usd", + "min": 0.7796874309413243, + "stddev": 
0.1259637735393401 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 3, + "evaluation_name": "v2_Private_Eval", + "max": 1.0, + "mean": 0.345, + "median": 0.031, + "metric_id": "score", + "metric_kind": "accuracy", + "metric_name": "ARC score", + "metric_unit": "proportion", + "min": 0.004, + "stddev": 0.5674072611449381 + }, + { + "benchmark": "ace", + "count": 3, + "evaluation_name": "DIY Score", + "max": 0.56, + "mean": 0.55, + "median": 0.55, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.54, + "stddev": 0.010000000000000009 + }, + { + "benchmark": "ace", + "count": 3, + "evaluation_name": "Food Score", + "max": 0.7, + "mean": 0.65, + "median": 0.65, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.6, + "stddev": 0.04999999999999999 + }, + { + "benchmark": "ace", + "count": 3, + "evaluation_name": "Shopping Score", + "max": 0.45, + "mean": 0.45, + "median": 0.45, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.45, + "stddev": 0.0 + }, + { + "benchmark": "apex-v1", + "count": 3, + "evaluation_name": "Big Law Score", + "max": 0.78, + "mean": 0.77, + "median": 0.77, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.76, + "stddev": 0.010000000000000009 + }, + { + "benchmark": "apex-v1", + "count": 3, + "evaluation_name": "Consulting Score", + "max": 0.64, + "mean": 0.64, + "median": 0.64, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.64, + "stddev": 0.0 + }, + { + "benchmark": "apex-v1", + "count": 3, + "evaluation_name": "Investment Banking Score", + "max": 0.64, + "mean": 0.6266666666666667, + "median": 0.63, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.61, + "stddev": 0.01527525231651948 + }, + { + "benchmark": 
"apex-v1", + "count": 3, + "evaluation_name": "Medicine (MD) Score", + "max": 0.66, + "mean": 0.6533333333333333, + "median": 0.65, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.65, + "stddev": 0.005773502691896263 + }, + { + "benchmark": "Fibble3 Arena (3 lies) Word Set", + "count": 2, + "evaluation_name": "fibble3_arena_avg_attempts", + "max": 0.7142857142857143, + "mean": 0.4285714285714286, + "median": 0.4285714285714286, + "metric_id": "fibble3_arena.avg_attempts", + "metric_kind": null, + "metric_name": "Average Attempts", + "metric_unit": "guesses", + "min": 0.1428571428571429, + "stddev": 0.40406101782088427 + }, + { + "benchmark": "Fibble4 Arena (4 lies) Word Set", + "count": 2, + "evaluation_name": "fibble4_arena_avg_attempts", + "max": 0.0, + "mean": 0.0, + "median": 0.0, + "metric_id": "fibble4_arena.avg_attempts", + "metric_kind": null, + "metric_name": "Average Attempts", + "metric_unit": "guesses", + "min": 0.0, + "stddev": 0.0 + }, + { + "benchmark": "fibble2_arena_daily", + "count": 1, + "evaluation_name": "fibble2_arena_avg_attempts", + "max": 0.2142857142857143, + "mean": 0.2142857142857143, + "median": 0.2142857142857143, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.2142857142857143, + "stddev": 0.0 + }, + { + "benchmark": "fibble3_arena_daily", + "count": 1, + "evaluation_name": "fibble3_arena_avg_attempts", + "max": 0.5, + "mean": 0.5, + "median": 0.5, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.5, + "stddev": 0.0 + } + ], + "quality": { + "has_uncertainty": 1603, + "incompatible_score_type": 0, + "missing_bounds": 0, + "missing_metadata": 0, + "missing_score": 0, + "out_of_range": 100, + "total_result_rows": 40495, + "zero_width_bounds": 0 + }, + "schema_versions": [ + { + "count": 40495, + "value": "0.2.2" + } + ], + "score_summaries": [ + { + "benchmark": "GPQA", + "count": 
4635, + "evaluation_name": "GPQA", + "max": 0.791, + "mean": 0.30281846817691477, + "median": 0.2953, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.168, + "stddev": 0.04912650528590854 + }, + { + "benchmark": "IFEval", + "count": 4635, + "evaluation_name": "IFEval", + "max": 0.951, + "mean": 0.46067240560949296, + "median": 0.4545, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0, + "stddev": 0.20767533842318336 + }, + { + "benchmark": "BBH", + "count": 4574, + "evaluation_name": "BBH", + "max": 0.8269, + "mean": 0.4867208351552252, + "median": 0.5038, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.2178, + "stddev": 0.11398463853942328 + }, + { + "benchmark": "MATH Level 5", + "count": 4574, + "evaluation_name": "MATH Level 5", + "max": 0.7145, + "mean": 0.1555723874070835, + "median": 0.108, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0, + "stddev": 0.14625658002062183 + }, + { + "benchmark": "MMLU-PRO", + "count": 4574, + "evaluation_name": "MMLU-PRO", + "max": 0.7303, + "mean": 0.32874433756012245, + "median": 0.34475, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.1026, + "stddev": 0.12833971558059434 + }, + { + "benchmark": "MUSR", + "count": 4574, + "evaluation_name": "MUSR", + "max": 0.6024, + "mean": 0.40635732400524704, + "median": 0.4091, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.2929, + "stddev": 0.04536121071938266 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Factuality", + "max": 0.8716, + "mean": 0.6400781725888325, + "median": 0.6779, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0274, + "stddev": 0.14060436598989037 + }, + { + 
"benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Focus", + "max": 0.9838, + "mean": 0.6965137055837564, + "median": 0.7293, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0646, + "stddev": 0.1999740938960993 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Math", + "max": 0.898, + "mean": 0.6002578680203046, + "median": 0.6175, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0546, + "stddev": 0.11530869084864068 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Precise IF", + "max": 0.6625, + "mean": 0.3724553299492386, + "median": 0.375, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.1313, + "stddev": 0.06683254610514013 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Safety", + "max": 0.9756, + "mean": 0.770956345177665, + "median": 0.8044, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0378, + "stddev": 0.16859961817216138 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Score", + "max": 0.8413, + "mean": 0.602605076142132, + "median": 0.6194, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0576, + "stddev": 0.13540270878209892 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Ties", + "max": 0.9063, + "mean": 0.5353568527918782, + "median": 0.5529, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": -0.01, + "stddev": 0.21529016446306679 + }, + { + "benchmark": "RewardBench", + "count": 187, + "evaluation_name": "Chat Hard", + "max": 0.9145, + "mean": 0.6117941176470588, + "median": 0.6053, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 
0.2654, + "stddev": 0.1713479724227396 + }, + { + "benchmark": "RewardBench", + "count": 187, + "evaluation_name": "Chat", + "max": 0.9944, + "mean": 0.8923390374331551, + "median": 0.9413, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.3547, + "stddev": 0.12437365150350695 + }, + { + "benchmark": "RewardBench", + "count": 187, + "evaluation_name": "Safety", + "max": 0.9514, + "mean": 0.75624064171123, + "median": 0.7946, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.3743, + "stddev": 0.14897429003710377 + }, + { + "benchmark": "RewardBench", + "count": 187, + "evaluation_name": "Score", + "max": 0.9511, + "mean": 0.7524326203208556, + "median": 0.7455, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.4727, + "stddev": 0.12766260032441618 + }, + { + "benchmark": "RewardBench", + "count": 172, + "evaluation_name": "Reasoning", + "max": 0.9912, + "mean": 0.779306976744186, + "median": 0.80125, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.2821, + "stddev": 0.16510278548710738 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 133, + "evaluation_name": "v2_Semi_Private", + "max": 77.16309638, + "mean": 2.503257867518797, + "median": 0.2349, + "metric_id": "cost_per_task", + "metric_kind": "cost", + "metric_name": "Cost per task", + "metric_unit": "usd", + "min": 0.0025, + "stddev": 8.619653960249606 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 133, + "evaluation_name": "v2_Semi_Private", + "max": 1.0, + "mean": 0.1482124060150376, + "median": 0.0333, + "metric_id": "score", + "metric_kind": "accuracy", + "metric_name": "ARC score", + "metric_unit": "proportion", + "min": 0.0, + "stddev": 0.23541775910763008 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 131, + 
"evaluation_name": "v1_Semi_Private", + "max": 44.25900135, + "mean": 1.3386839797709924, + "median": 0.1776, + "metric_id": "cost_per_task", + "metric_kind": "cost", + "metric_name": "Cost per task", + "metric_unit": "usd", + "min": 0.0015, + "stddev": 4.543415900755722 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 131, + "evaluation_name": "v1_Semi_Private", + "max": 0.98, + "mean": 0.44456030534351143, + "median": 0.4, + "metric_id": "score", + "metric_kind": "accuracy", + "metric_name": "ARC score", + "metric_unit": "proportion", + "min": 0.0, + "stddev": 0.2907857931349756 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 125, + "evaluation_name": "v2_Public_Eval", + "max": 17.6, + "mean": 1.1880232, + "median": 0.2399, + "metric_id": "cost_per_task", + "metric_kind": "cost", + "metric_name": "Cost per task", + "metric_unit": "usd", + "min": 0.0026, + "stddev": 2.946737805009728 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 125, + "evaluation_name": "v2_Public_Eval", + "max": 1.0, + "mean": 0.1310936, + "median": 0.029, + "metric_id": "score", + "metric_kind": "accuracy", + "metric_name": "ARC score", + "metric_unit": "proportion", + "min": 0.0, + "stddev": 0.23801453457380936 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 114, + "evaluation_name": "v1_Public_Eval", + "max": 7.7201, + "mean": 0.49700745614035086, + "median": 0.13765, + "metric_id": "cost_per_task", + "metric_kind": "cost", + "metric_name": "Cost per task", + "metric_unit": "usd", + "min": 0.0012, + "stddev": 1.1338969347904155 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 114, + "evaluation_name": "v1_Public_Eval", + "max": 0.9825, + "mean": 0.5073622807017544, + "median": 0.5056499999999999, + "metric_id": "score", + "metric_kind": "accuracy", + "metric_name": "ARC score", + "metric_unit": "proportion", + "min": 0.0175, + "stddev": 
0.2800617230927051 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_accuracy", + "max": 93.12, + "mean": 67.21155963302752, + "median": 70.76, + "metric_id": "bfcl.live.live_accuracy", + "metric_kind": "accuracy", + "metric_name": "Live accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 16.692855101327364 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_multiple_ast_accuracy", + "max": 94.02, + "mean": 66.15788990825688, + "median": 71.04, + "metric_id": "bfcl.live.live_multiple_ast_accuracy", + "metric_kind": "accuracy", + "metric_name": "Live multiple AST accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 17.084967242914786 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_parallel_ast_accuracy", + "max": 93.75, + "mean": 64.27752293577981, + "median": 75.0, + "metric_id": "bfcl.live.live_parallel_ast_accuracy", + "metric_kind": "accuracy", + "metric_name": "Live parallel AST accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 24.46019866655501 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_parallel_multiple_ast_accuracy", + "max": 95.83, + "mean": 57.03339449541284, + "median": 62.5, + "metric_id": "bfcl.live.live_parallel_multiple_ast_accuracy", + "metric_kind": "accuracy", + "metric_name": "Live parallel multiple AST accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 20.59801726435246 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_simple_ast_accuracy", + "max": 90.31, + "mean": 72.64082568807339, + "median": 76.36, + "metric_id": "bfcl.live.live_simple_ast_accuracy", + "metric_kind": "accuracy", + "metric_name": "Live simple AST accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 16.25125032958663 + }, + { + "benchmark": 
"BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.memory.accuracy", + "max": 73.76, + "mean": 20.235045871559635, + "median": 15.7, + "metric_id": "bfcl.memory.accuracy", + "metric_kind": "accuracy", + "metric_name": "Memory accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 16.99218603771948 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.memory.kv_accuracy", + "max": 70.97, + "mean": 13.904036697247706, + "median": 8.39, + "metric_id": "bfcl.memory.kv_accuracy", + "metric_kind": "accuracy", + "metric_name": "Memory KV accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 15.15138492137527 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.memory.recursive_summarization_accuracy", + "max": 83.23, + "mean": 28.204036697247705, + "median": 27.1, + "metric_id": "bfcl.memory.recursive_summarization_accuracy", + "metric_kind": "accuracy", + "metric_name": "Memory recursive summarization accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 20.8463795648454 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.memory.vector_accuracy", + "max": 72.9, + "mean": 18.597155963302754, + "median": 11.61, + "metric_id": "bfcl.memory.vector_accuracy", + "metric_kind": "accuracy", + "metric_name": "Memory vector accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 18.379301567138523 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.accuracy", + "max": 77.38, + "mean": 23.962385321100918, + "median": 16.5, + "metric_id": "bfcl.multi_turn.accuracy", + "metric_kind": "accuracy", + "metric_name": "Multi-turn accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 21.479676048452156 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.base_accuracy", + "max": 82.5, + "mean": 
29.009174311926607, + "median": 20.0, + "metric_id": "bfcl.multi_turn.base_accuracy", + "metric_kind": "accuracy", + "metric_name": "Multi-turn base accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 24.897845144318115 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.long_context_accuracy", + "max": 76.0, + "mean": 24.009174311926607, + "median": 17.5, + "metric_id": "bfcl.multi_turn.long_context_accuracy", + "metric_kind": "accuracy", + "metric_name": "Multi-turn long-context accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 21.38372755020874 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.miss_function_accuracy", + "max": 77.0, + "mean": 21.591743119266056, + "median": 14.0, + "metric_id": "bfcl.multi_turn.miss_function_accuracy", + "metric_kind": "accuracy", + "metric_name": "Multi-turn missing function accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 21.713961750366153 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.miss_parameter_accuracy", + "max": 74.0, + "mean": 21.238532110091743, + "median": 15.0, + "metric_id": "bfcl.multi_turn.miss_parameter_accuracy", + "metric_kind": "accuracy", + "metric_name": "Multi-turn missing parameter accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 19.445269386898502 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.non_live.ast_accuracy", + "max": 90.65, + "mean": 76.61733944954129, + "median": 83.0, + "metric_id": "bfcl.non_live.ast_accuracy", + "metric_kind": "accuracy", + "metric_name": "Non-live AST accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 18.657086363085554 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.non_live.multiple_ast_accuracy", + "max": 97.0, + "mean": 85.35779816513761, + 
"median": 92.0, + "metric_id": "bfcl.non_live.multiple_ast_accuracy", + "metric_kind": "accuracy", + "metric_name": "Non-live multiple AST accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 18.274031836228097 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.non_live.parallel_ast_accuracy", + "max": 96.0, + "mean": 79.79816513761467, + "median": 88.0, + "metric_id": "bfcl.non_live.parallel_ast_accuracy", + "metric_kind": "accuracy", + "metric_name": "Non-live parallel AST accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 22.733369915461672 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.non_live.parallel_multiple_ast_accuracy", + "max": 92.5, + "mean": 73.4770642201835, + "median": 82.5, + "metric_id": "bfcl.non_live.parallel_multiple_ast_accuracy", + "metric_kind": "accuracy", + "metric_name": "Non-live parallel multiple AST accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 24.427840192832814 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.non_live.simple_ast_accuracy", + "max": 80.67, + "mean": 67.83633027522936, + "median": 72.58, + "metric_id": "bfcl.non_live.simple_ast_accuracy", + "metric_kind": "accuracy", + "metric_name": "Non-live simple AST accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 14.843039998882533 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.latency_mean_s", + "max": 169.87, + "mean": 15.127064220183486, + "median": 4.69, + "metric_id": "bfcl.overall.latency_mean_s", + "metric_kind": "latency", + "metric_name": "Latency mean", + "metric_unit": "seconds", + "min": 0.68, + "stddev": 28.519051991371985 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.latency_p95_s", + "max": 568.59, + "mean": 53.85339449541284, + "median": 11.7, + "metric_id": 
"bfcl.overall.latency_p95_s", + "metric_kind": "latency", + "metric_name": "Latency 95th percentile", + "metric_unit": "seconds", + "min": 0.96, + "stddev": 100.92943454619746 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.latency_std_s", + "max": 212.99, + "mean": 27.425045871559632, + "median": 10.04, + "metric_id": "bfcl.overall.latency_std_s", + "metric_kind": "latency", + "metric_name": "Latency standard deviation", + "metric_unit": "seconds", + "min": 0.45, + "stddev": 39.86152829724822 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.overall_accuracy", + "max": 77.47, + "mean": 38.09394495412844, + "median": 35.52, + "metric_id": "bfcl.overall.overall_accuracy", + "metric_kind": "accuracy", + "metric_name": "Overall accuracy", + "metric_unit": "percentage", + "min": 7.17, + "stddev": 15.683598888904708 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.rank", + "max": 109.0, + "mean": 55.0, + "median": 55.0, + "metric_id": "bfcl.overall.rank", + "metric_kind": "rank", + "metric_name": "Overall rank", + "metric_unit": "position", + "min": 1.0, + "stddev": 31.609597698589376 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.total_cost_usd", + "max": 355.17, + "mean": 47.11669724770642, + "median": 18.25, + "metric_id": "bfcl.overall.total_cost_usd", + "metric_kind": "cost", + "metric_name": "Total cost", + "metric_unit": "usd", + "min": 0.46, + "stddev": 72.06972033379084 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.relevance.irrelevance_detection_accuracy", + "max": 100.0, + "mean": 75.61073394495413, + "median": 80.79, + "metric_id": "bfcl.relevance.irrelevance_detection_accuracy", + "metric_kind": "accuracy", + "metric_name": "Irrelevance detection accuracy", + "metric_unit": "percentage", + "min": 6.28, + "stddev": 
16.896574532662488 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.relevance.relevance_detection_accuracy", + "max": 100.0, + "mean": 76.37614678899082, + "median": 81.25, + "metric_id": "bfcl.relevance.relevance_detection_accuracy", + "metric_kind": "accuracy", + "metric_name": "Relevance detection accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 19.86204224273847 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.web_search.accuracy", + "max": 84.5, + "mean": 24.573394495412845, + "median": 10.5, + "metric_id": "bfcl.web_search.accuracy", + "metric_kind": "accuracy", + "metric_name": "Web-search accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 28.751797503234584 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.web_search.base_accuracy", + "max": 87.0, + "mean": 26.46788990825688, + "median": 13.0, + "metric_id": "bfcl.web_search.base_accuracy", + "metric_kind": "accuracy", + "metric_name": "Web-search base accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 29.552705211555523 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.web_search.no_snippet_accuracy", + "max": 85.0, + "mean": 22.678899082568808, + "median": 9.0, + "metric_id": "bfcl.web_search.no_snippet_accuracy", + "metric_kind": "accuracy", + "metric_name": "Web-search no-snippet accuracy", + "metric_unit": "percentage", + "min": 0.0, + "stddev": 28.410639873751833 + }, + { + "benchmark": "RewardBench", + "count": 105, + "evaluation_name": "Prior Sets (0.5 weight)", + "max": 0.782, + "mean": 0.5625428571428571, + "median": 0.5757, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0, + "stddev": 0.17788750218625798 + }, + { + "benchmark": "GSM8K", + "count": 91, + "evaluation_name": "GSM8K", + "max": 0.956, + "mean": 0.6556373626373626, + 
"median": 0.762, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": -1.0, + "stddev": 0.30260192099278316 + }, + { + "benchmark": "LegalBench", + "count": 91, + "evaluation_name": "LegalBench", + "max": 0.757, + "mean": 0.5902087912087912, + "median": 0.629, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.331, + "stddev": 0.11619442676283923 + }, + { + "benchmark": "MATH", + "count": 91, + "evaluation_name": "MATH", + "max": 0.92, + "mean": 0.5574065934065934, + "median": 0.656, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.026, + "stddev": 0.2685588691111619 + }, + { + "benchmark": "MMLU", + "count": 91, + "evaluation_name": "MMLU", + "max": 0.809, + "mean": 0.6220989010989011, + "median": 0.643, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.243, + "stddev": 0.12023218786489331 + }, + { + "benchmark": "MedQA", + "count": 91, + "evaluation_name": "MedQA", + "max": 0.863, + "mean": 0.6103296703296703, + "median": 0.64, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.229, + "stddev": 0.15792234765120447 + }, + { + "benchmark": "NarrativeQA", + "count": 91, + "evaluation_name": "NarrativeQA", + "max": 0.804, + "mean": 0.6938461538461539, + "median": 0.742, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.111, + "stddev": 0.1228501275789075 + }, + { + "benchmark": "NaturalQuestions (closed-book)", + "count": 91, + "evaluation_name": "NaturalQuestions (closed-book)", + "max": 0.502, + "mean": 0.3627912087912088, + "median": 0.378, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.028, + "stddev": 0.08850543190907255 + }, + { + "benchmark": "OpenbookQA", + "count": 91, + "evaluation_name": "OpenbookQA", + 
"max": 0.972, + "mean": 0.8312527472527472, + "median": 0.882, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.222, + "stddev": 0.16911788087383792 + }, + { + "benchmark": "WMT 2014", + "count": 91, + "evaluation_name": "WMT 2014", + "max": 0.262, + "mean": 0.18178021978021977, + "median": 0.191, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.023, + "stddev": 0.04641450975187302 + }, + { + "benchmark": "helm_lite", + "count": 91, + "evaluation_name": "Mean win rate", + "max": 0.938, + "mean": 0.499967032967033, + "median": 0.488, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.041, + "stddev": 0.24004497034928224 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Abstract Algebra", + "max": 0.84, + "mean": 0.4692405063291139, + "median": 0.44, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.21, + "stddev": 0.1566784405169303 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Anatomy", + "max": 0.911, + "mean": 0.7049620253164557, + "median": 0.719, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.222, + "stddev": 0.12203524533321435 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Astronomy", + "max": 0.974, + "mean": 0.8196835443037974, + "median": 0.855, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.342, + "stddev": 0.12503810130124515 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Business Ethics", + "max": 0.89, + "mean": 0.7354430379746836, + "median": 0.77, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.24, + "stddev": 0.1177001565076888 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + 
"evaluation_name": "Clinical Knowledge", + "max": 0.928, + "mean": 0.7806329113924051, + "median": 0.8, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.26, + "stddev": 0.10518545005348215 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "College Physics", + "max": 0.863, + "mean": 0.5205189873417722, + "median": 0.51, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.196, + "stddev": 0.13341576241396605 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Computer Security", + "max": 0.89, + "mean": 0.7888607594936708, + "median": 0.8, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.3, + "stddev": 0.07740978772295665 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Conceptual Physics", + "max": 0.949, + "mean": 0.7394050632911392, + "median": 0.774, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.319, + "stddev": 0.1436847973853721 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Econometrics", + "max": 0.807, + "mean": 0.5924556962025317, + "median": 0.614, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.307, + "stddev": 0.12405156056525753 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Electrical Engineering", + "max": 0.869, + "mean": 0.7012531645569621, + "median": 0.724, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.29, + "stddev": 0.10967007262512768 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Elementary Mathematics", + "max": 0.942, + "mean": 0.6168481012658228, + "median": 0.622, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.254, + "stddev": 
0.17076712953141734 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Formal Logic", + "max": 0.786, + "mean": 0.5559240506329114, + "median": 0.571, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.27, + "stddev": 0.11667484646986527 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Global Facts", + "max": 0.8, + "mean": 0.49860759493670886, + "median": 0.5, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.25, + "stddev": 0.11856767165669667 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "High School World History", + "max": 0.958, + "mean": 0.8590253164556962, + "median": 0.89, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.253, + "stddev": 0.1104488482004626 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Human Sexuality", + "max": 0.939, + "mean": 0.7969367088607595, + "median": 0.84, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.267, + "stddev": 0.14067149783040647 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "International Law", + "max": 0.959, + "mean": 0.8525189873417721, + "median": 0.884, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.306, + "stddev": 0.09770414010589916 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Jurisprudence", + "max": 0.907, + "mean": 0.8231518987341773, + "median": 0.852, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.25, + "stddev": 0.09722219971870344 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Logical Fallacies", + "max": 0.926, + "mean": 0.8139873417721519, + "median": 0.834, + "metric_id": null, + "metric_kind": null, + 
"metric_name": null, + "metric_unit": null, + "min": 0.264, + "stddev": 0.0972786763034739 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "MMLU All Subjects", + "max": 0.873, + "mean": 0.7308227848101266, + "median": 0.757, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.295, + "stddev": 0.10005918242229046 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Machine Learning", + "max": 0.839, + "mean": 0.592126582278481, + "median": 0.616, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.286, + "stddev": 0.12807703682255595 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Management", + "max": 0.942, + "mean": 0.8453037974683544, + "median": 0.864, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.272, + "stddev": 0.09395052631917909 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Marketing", + "max": 0.962, + "mean": 0.9024556962025316, + "median": 0.923, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.269, + "stddev": 0.08556236254220637 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Mean win rate", + "max": 1.0, + "mean": 0.5000506329113924, + "median": 0.517, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.014, + "stddev": 0.2741845671999428 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Medical Genetics", + "max": 0.98, + "mean": 0.8162025316455697, + "median": 0.84, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.28, + "stddev": 0.11717074761250226 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Miscellaneous", + "max": 0.964, + "mean": 0.8688607594936709, + "median": 0.893, 
+ "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.292, + "stddev": 0.09859535722376811 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Moral Scenarios", + "max": 0.902, + "mean": 0.5793924050632911, + "median": 0.575, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.231, + "stddev": 0.19478445797799818 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Nutrition", + "max": 0.928, + "mean": 0.7968987341772152, + "median": 0.82, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.34, + "stddev": 0.1008295839442827 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Philosophy", + "max": 0.9, + "mean": 0.7844303797468355, + "median": 0.807, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.325, + "stddev": 0.09312807331625374 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Prehistory", + "max": 0.951, + "mean": 0.824746835443038, + "median": 0.858, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.318, + "stddev": 0.10757030716441658 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Professional Psychology", + "max": 0.922, + "mean": 0.7793291139240506, + "median": 0.812, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.232, + "stddev": 0.1177310844427953 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Public Relations", + "max": 0.855, + "mean": 0.724873417721519, + "median": 0.736, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.345, + "stddev": 0.0757594653625247 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Security Studies", + "max": 
0.886, + "mean": 0.778126582278481, + "median": 0.804, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.408, + "stddev": 0.09570378540441088 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Sociology", + "max": 0.96, + "mean": 0.8729493670886076, + "median": 0.9, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.383, + "stddev": 0.08587676004752948 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Us Foreign Policy", + "max": 0.97, + "mean": 0.8918987341772152, + "median": 0.92, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.26, + "stddev": 0.09360413026947771 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Virology", + "max": 0.602, + "mean": 0.5457215189873418, + "median": 0.56, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.392, + "stddev": 0.047070851318166546 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "World Religions", + "max": 0.924, + "mean": 0.8426455696202532, + "median": 0.865, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.234, + "stddev": 0.08472202480187987 + }, + { + "benchmark": "MMLU-Pro", + "count": 61, + "evaluation_name": "MMLU-Pro", + "max": 0.875, + "mean": 0.6609344262295082, + "median": 0.723, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.169, + "stddev": 0.1866150109050233 + }, + { + "benchmark": "Omni-MATH", + "count": 61, + "evaluation_name": "Omni-MATH", + "max": 0.722, + "mean": 0.3746065573770492, + "median": 0.364, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.072, + "stddev": 0.17904862269679006 + }, + { + "benchmark": "WildBench", + "count": 61, + 
"evaluation_name": "WildBench", + "max": 0.866, + "mean": 0.7791803278688525, + "median": 0.797, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.477, + "stddev": 0.07613989497338025 + }, + { + "benchmark": "helm_capabilities", + "count": 61, + "evaluation_name": "Mean score", + "max": 0.819, + "mean": 0.6281803278688525, + "median": 0.642, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.325, + "stddev": 0.12667261058817744 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Arabic", + "max": 0.9475, + "mean": 0.8123458333333333, + "median": 0.82375, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.455, + "stddev": 0.11404825771861875 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Bengali", + "max": 0.9425, + "mean": 0.8118458333333334, + "median": 0.82375, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.5175, + "stddev": 0.10786060736231451 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Burmese", + "max": 0.945, + "mean": 0.8254416666666666, + "median": 0.8375, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.63, + "stddev": 0.08983182356393916 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Chinese", + "max": 0.9475, + "mean": 0.80325, + "median": 0.835, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.5075, + "stddev": 0.12931314787277418 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Culturally Agnostic", + "max": 0.9528, + "mean": 0.8264125, + "median": 0.857, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.5631, + "stddev": 
0.10811543599320127 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Culturally Sensitive", + "max": 0.9397, + "mean": 0.788525, + "median": 0.78935, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.5391, + "stddev": 0.1149148963548909 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "English", + "max": 0.9475, + "mean": 0.7939833333333334, + "median": 0.84, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.38, + "stddev": 0.15081692344416497 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "French", + "max": 0.9575, + "mean": 0.7944791666666666, + "median": 0.8275, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.41, + "stddev": 0.14230966528431346 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "German", + "max": 0.94, + "mean": 0.8004833333333333, + "median": 0.8275, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.4775, + "stddev": 0.12445258061886479 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Global MMLU Lite", + "max": 0.9453, + "mean": 0.8074583333333334, + "median": 0.82315, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.5511, + "stddev": 0.11081356363967734 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Hindi", + "max": 0.9475, + "mean": 0.7983333333333333, + "median": 0.82355, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.555, + "stddev": 0.11719085240122123 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Indonesian", + "max": 0.955, + "mean": 0.801275, + "median": 0.80625, + "metric_id": null, + "metric_kind": null, + 
"metric_name": null, + "metric_unit": null, + "min": 0.515, + "stddev": 0.11649187077838011 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Italian", + "max": 0.955, + "mean": 0.8056875, + "median": 0.8300000000000001, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.48, + "stddev": 0.1239779332201175 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Japanese", + "max": 0.94, + "mean": 0.8170291666666667, + "median": 0.84375, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.58, + "stddev": 0.10297801657229139 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Korean", + "max": 0.95, + "mean": 0.820125, + "median": 0.84, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.595, + "stddev": 0.10111529652574511 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Portuguese", + "max": 0.945, + "mean": 0.8010041666666666, + "median": 0.8323, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.5175, + "stddev": 0.12492813757011505 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Spanish", + "max": 0.9475, + "mean": 0.8042458333333333, + "median": 0.8325, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.485, + "stddev": 0.12684843352857172 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Swahili", + "max": 0.94, + "mean": 0.8143708333333334, + "median": 0.8200000000000001, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.6075, + "stddev": 0.09313423156204427 + }, + { + "benchmark": "global-mmlu-lite", + "count": 48, + "evaluation_name": "Yoruba", + "max": 0.9425, + "mean": 
0.8155583333333334, + "median": 0.8223, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.5825, + "stddev": 0.09530013023440752 + }, + { + "benchmark": "Wordle Arena Word Set", + "count": 43, + "evaluation_name": "wordle_arena_win_rate", + "max": 1.0, + "mean": 0.38320930232558137, + "median": 0.3, + "metric_id": "wordle_arena.win_rate", + "metric_kind": null, + "metric_name": "Win Rate", + "metric_unit": "proportion", + "min": 0.0, + "stddev": 0.3652171551113076 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 40, + "evaluation_name": "bfcl.format_sensitivity.max_delta", + "max": 81.5, + "mean": 34.5125, + "median": 22.5, + "metric_id": "bfcl.format_sensitivity.max_delta", + "metric_kind": "difference", + "metric_name": "Format sensitivity max delta", + "metric_unit": "percentage_points", + "min": 0.0, + "stddev": 26.711537769286142 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 40, + "evaluation_name": "bfcl.format_sensitivity.stddev", + "max": 34.18, + "mean": 10.88475, + "median": 5.699999999999999, + "metric_id": "bfcl.format_sensitivity.stddev", + "metric_kind": "difference", + "metric_name": "Format sensitivity standard deviation", + "metric_unit": "percentage_points", + "min": 0.0, + "stddev": 9.736365348132109 + }, + { + "benchmark": "Fibble Arena (1 lie) Word Set", + "count": 40, + "evaluation_name": "fibble1_arena_win_rate", + "max": 0.881, + "mean": 0.1804375, + "median": 0.08990000000000001, + "metric_id": "fibble1_arena.win_rate", + "metric_kind": null, + "metric_name": "Win Rate", + "metric_unit": "proportion", + "min": 0.0, + "stddev": 0.24891262362266392 + }, + { + "benchmark": "Fibble2 Arena (2 lies) Word Set", + "count": 38, + "evaluation_name": "fibble2_arena_win_rate", + "max": 0.3, + "mean": 0.03426315789473684, + "median": 0.0, + "metric_id": "fibble2_arena.win_rate", + "metric_kind": null, + "metric_name": "Win Rate", + "metric_unit": "proportion", + "min": 0.0, + 
"stddev": 0.07868617592468241 + }, + { + "benchmark": "SciArena leaderboard API", + "count": 38, + "evaluation_name": "overall_cost_per_100_calls_usd", + "max": 28.648, + "mean": 2.934265789473684, + "median": 0.7163999999999999, + "metric_id": "cost_per_100_calls_usd", + "metric_kind": "cost", + "metric_name": "Cost per 100 calls", + "metric_unit": "usd", + "min": 0.0546, + "stddev": 6.353947282543234 + }, + { + "benchmark": "SciArena leaderboard API", + "count": 38, + "evaluation_name": "overall_elo", + "max": 1151.3779287263492, + "mean": 999.7829236774063, + "median": 1008.3158430770602, + "metric_id": "elo", + "metric_kind": "elo", + "metric_name": "Elo rating", + "metric_unit": "points", + "min": 829.7737302958208, + "stddev": 75.00592284131643 + }, + { + "benchmark": "SciArena leaderboard API", + "count": 38, + "evaluation_name": "overall_rank", + "max": 38.0, + "mean": 19.5, + "median": 19.5, + "metric_id": "rank", + "metric_kind": "rank", + "metric_name": "Rank", + "metric_unit": "position", + "min": 1.0, + "stddev": 11.113055385446435 + }, + { + "benchmark": "Fibble3 Arena (3 lies) Word Set", + "count": 37, + "evaluation_name": "fibble3_arena_win_rate", + "max": 0.333, + "mean": 0.010551351351351351, + "median": 0.0, + "metric_id": "fibble3_arena.win_rate", + "metric_kind": null, + "metric_name": "Win Rate", + "metric_unit": "proportion", + "min": 0.0, + "stddev": 0.0548872866357432 + }, + { + "benchmark": "Fibble5 Arena (5 lies) Word Set", + "count": 37, + "evaluation_name": "fibble5_arena_win_rate", + "max": 0.6364, + "mean": 0.09143783783783783, + "median": 0.0, + "metric_id": "fibble5_arena.win_rate", + "metric_kind": null, + "metric_name": "Win Rate", + "metric_unit": "proportion", + "min": 0.0, + "stddev": 0.17735344709138076 + }, + { + "benchmark": "Fibble4 Arena (4 lies) Word Set", + "count": 36, + "evaluation_name": "fibble4_arena_win_rate", + "max": 0.0732, + "mean": 0.0028055555555555555, + "median": 0.0, + "metric_id": 
"fibble4_arena.win_rate", + "metric_kind": null, + "metric_name": "Win Rate", + "metric_unit": "proportion", + "min": 0.0, + "stddev": 0.012925853261571653 + }, + { + "benchmark": "Wordle Arena Word Set", + "count": 32, + "evaluation_name": "wordle_arena_avg_attempts", + "max": 6.0, + "mean": 4.51125, + "median": 4.46, + "metric_id": "wordle_arena.avg_attempts", + "metric_kind": null, + "metric_name": "Average Attempts", + "metric_unit": "guesses", + "min": 3.33, + "stddev": 0.7843417211164219 + }, + { + "benchmark": "wordle_arena_daily", + "count": 32, + "evaluation_name": "wordle_arena_avg_attempts", + "max": 6.0, + "mean": 5.194375, + "median": 5.365, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 3.67, + "stddev": 0.8160919362345744 + }, + { + "benchmark": "wordle_arena_daily", + "count": 32, + "evaluation_name": "wordle_arena_win_rate", + "max": 100.0, + "mean": 47.33125, + "median": 41.65, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0, + "stddev": 43.73033521186335 + }, + { + "benchmark": "Easy Problems", + "count": 29, + "evaluation_name": "Easy Problems", + "max": 0.9014, + "mean": 0.4996824672170957, + "median": 0.5352, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.056338028169014086, + "stddev": 0.2844141675332875 + }, + { + "benchmark": "Hard Problems", + "count": 29, + "evaluation_name": "Hard Problems", + "max": 0.1594, + "mean": 0.009876639145216123, + "median": 0.0, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0, + "stddev": 0.03091194562296296 + }, + { + "benchmark": "Medium Problems", + "count": 29, + "evaluation_name": "Medium Problems", + "max": 0.5211, + "mean": 0.11304244779018942, + "median": 0.056338028169014086, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0, + "stddev": 
0.14517143188248943 + }, + { + "benchmark": "fibble4_arena_daily", + "count": 28, + "evaluation_name": "fibble4_arena_avg_attempts", + "max": 12.0, + "mean": 11.7, + "median": 12.0, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 9.0, + "stddev": 0.7990735376162312 + }, + { + "benchmark": "fibble4_arena_daily", + "count": 28, + "evaluation_name": "fibble4_arena_avg_latency_ms", + "max": 445377.0, + "mean": 80890.78571428571, + "median": 25152.5, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 275.0, + "stddev": 129476.7960242032 + }, + { + "benchmark": "fibble4_arena_daily", + "count": 28, + "evaluation_name": "fibble4_arena_win_rate", + "max": 66.7, + "mean": 6.310714285714286, + "median": 0.0, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0, + "stddev": 17.409859685985822 + }, + { + "benchmark": "fibble5_arena_daily", + "count": 28, + "evaluation_name": "fibble5_arena_avg_attempts", + "max": 9.0, + "mean": 7.9835714285714285, + "median": 9.0, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 3.0, + "stddev": 1.7199229403188157 + }, + { + "benchmark": "fibble5_arena_daily", + "count": 28, + "evaluation_name": "fibble5_arena_avg_latency_ms", + "max": 185756.0, + "mean": 33582.0, + "median": 3793.0, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 326.0, + "stddev": 47155.59573644743 + }, + { + "benchmark": "fibble5_arena_daily", + "count": 28, + "evaluation_name": "fibble5_arena_win_rate", + "max": 100.0, + "mean": 27.28214285714286, + "median": 0.0, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0, + "stddev": 40.59178947787191 + }, + { + "benchmark": "fibble_arena_daily", + "count": 28, + "evaluation_name": "fibble_arena_avg_attempts", + "max": 8.0, + "mean": 
7.345357142857143, + "median": 7.8100000000000005, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 4.6, + "stddev": 0.9735330165903643 + }, + { + "benchmark": "fibble_arena_daily", + "count": 28, + "evaluation_name": "fibble_arena_win_rate", + "max": 100.0, + "mean": 28.29642857142857, + "median": 14.3, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0, + "stddev": 34.60197470433087 + }, + { + "benchmark": "wordle_arena_daily", + "count": 28, + "evaluation_name": "wordle_arena_avg_latency_ms", + "max": 50298.0, + "mean": 14709.32142857143, + "median": 2629.5, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 294.0, + "stddev": 17752.34724606042 + }, + { + "benchmark": "fibble_arena_daily", + "count": 26, + "evaluation_name": "fibble_arena_avg_latency_ms", + "max": 117888.0, + "mean": 23553.576923076922, + "median": 2003.0, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 175.0, + "stddev": 33003.246209029894 + }, + { + "benchmark": "fibble3_arena_daily", + "count": 25, + "evaluation_name": "fibble3_arena_avg_attempts", + "max": 12.0, + "mean": 11.6, + "median": 12.0, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 4.5, + "stddev": 1.5612494995995996 + }, + { + "benchmark": "fibble3_arena_daily", + "count": 25, + "evaluation_name": "fibble3_arena_avg_latency_ms", + "max": 335320.0, + "mean": 50081.96, + "median": 3034.0, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 261.0, + "stddev": 78555.2898810131 + }, + { + "benchmark": "fibble3_arena_daily", + "count": 25, + "evaluation_name": "fibble3_arena_win_rate", + "max": 100.0, + "mean": 7.0, + "median": 0.0, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0, + 
"stddev": 22.26731535981231 + }, + { + "benchmark": "Fibble Arena (1 lie) Word Set", + "count": 24, + "evaluation_name": "fibble1_arena_avg_attempts", + "max": 8.0, + "mean": 5.88875, + "median": 5.93, + "metric_id": "fibble1_arena.avg_attempts", + "metric_kind": null, + "metric_name": "Average Attempts", + "metric_unit": "guesses", + "min": 3.75, + "stddev": 0.9265565634203836 + }, + { + "benchmark": "fibble2_arena_daily", + "count": 22, + "evaluation_name": "fibble2_arena_avg_attempts", + "max": 10.0, + "mean": 9.765, + "median": 10.0, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 6.5, + "stddev": 0.8113466172563569 + }, + { + "benchmark": "fibble2_arena_daily", + "count": 22, + "evaluation_name": "fibble2_arena_avg_latency_ms", + "max": 135519.0, + "mean": 33180.318181818184, + "median": 2125.0, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 294.0, + "stddev": 47417.827950683335 + }, + { + "benchmark": "fibble2_arena_daily", + "count": 22, + "evaluation_name": "fibble2_arena_win_rate", + "max": 75.0, + "mean": 4.922727272727273, + "median": 0.0, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0, + "stddev": 17.183545058550077 + }, + { + "benchmark": "apex-agents", + "count": 19, + "evaluation_name": "Corporate Lawyer Mean Score", + "max": 0.548, + "mean": 0.38605263157894737, + "median": 0.394, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.147, + "stddev": 0.1127334484940327 + }, + { + "benchmark": "appworld/test_normal", + "count": 15, + "evaluation_name": "appworld/test_normal", + "max": 0.7, + "mean": 0.38053333333333333, + "median": 0.505, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.0, + "stddev": 0.2795156184408681 + }, + { + "benchmark": "browsecompplus", + "count": 15, + "evaluation_name": 
"browsecompplus", + "max": 0.61, + "mean": 0.47951333333333335, + "median": 0.48, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.26, + "stddev": 0.09206274930461185 + }, + { + "benchmark": "swe-bench", + "count": 15, + "evaluation_name": "swe-bench", + "max": 0.8072, + "mean": 0.6515666666666666, + "median": 0.65, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.5253, + "stddev": 0.08692541685397948 + }, + { + "benchmark": "tau-bench-2/airline", + "count": 15, + "evaluation_name": "tau-bench-2/airline", + "max": 0.74, + "mean": 0.6333333333333333, + "median": 0.66, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.48, + "stddev": 0.0830375703837612 + }, + { + "benchmark": "tau-bench-2/retail", + "count": 15, + "evaluation_name": "tau-bench-2/retail", + "max": 0.85, + "mean": 0.7409, + "median": 0.78, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.51, + "stddev": 0.09942657736095659 + }, + { + "benchmark": "tau-bench-2/telecom", + "count": 15, + "evaluation_name": "tau-bench-2/telecom", + "max": 0.8876, + "mean": 0.69824, + "median": 0.73, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.53, + "stddev": 0.12537163497834292 + }, + { + "benchmark": "Fibble5 Arena (5 lies) Word Set", + "count": 13, + "evaluation_name": "fibble5_arena_avg_attempts", + "max": 7.0, + "mean": 5.83, + "median": 6.0, + "metric_id": "fibble5_arena.avg_attempts", + "metric_kind": null, + "metric_name": "Average Attempts", + "metric_unit": "guesses", + "min": 2.0, + "stddev": 1.3097200718728665 + }, + { + "benchmark": "apex-agents", + "count": 13, + "evaluation_name": "Overall Pass@1", + "max": 0.335, + "mean": 0.20892307692307693, + "median": 0.23, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": 
null, + "min": 0.04, + "stddev": 0.09209276259878907 + }, + { + "benchmark": "ace", + "count": 12, + "evaluation_name": "Gaming Score", + "max": 0.613, + "mean": 0.4613333333333333, + "median": 0.46199999999999997, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.284, + "stddev": 0.130447713895668 + }, + { + "benchmark": "ace", + "count": 11, + "evaluation_name": "Overall Score", + "max": 0.561, + "mean": 0.47963636363636364, + "median": 0.478, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.38, + "stddev": 0.06701831500011432 + }, + { + "benchmark": "apex-agents", + "count": 9, + "evaluation_name": "Corporate Law Pass@1", + "max": 0.266, + "mean": 0.18122222222222223, + "median": 0.189, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.078, + "stddev": 0.06984586204238906 + }, + { + "benchmark": "apex-agents", + "count": 9, + "evaluation_name": "Overall Mean Score", + "max": 0.401, + "mean": 0.3071111111111111, + "median": 0.341, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.115, + "stddev": 0.10572658658592508 + }, + { + "benchmark": "Fibble2 Arena (2 lies) Word Set", + "count": 8, + "evaluation_name": "fibble2_arena_avg_attempts", + "max": 8.0, + "mean": 5.97875, + "median": 5.415, + "metric_id": "fibble2_arena.avg_attempts", + "metric_kind": null, + "metric_name": "Average Attempts", + "metric_unit": "guesses", + "min": 4.0, + "stddev": 1.4973923762900052 + }, + { + "benchmark": "apex-agents", + "count": 8, + "evaluation_name": "Investment Banking Pass@1", + "max": 0.273, + "mean": 0.17825000000000002, + "median": 0.202, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.012, + "stddev": 0.10551607866644239 + }, + { + "benchmark": "apex-agents", + "count": 8, + "evaluation_name": "Management Consulting 
Pass@1", + "max": 0.227, + "mean": 0.122875, + "median": 0.1235, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.029, + "stddev": 0.06801982379109364 + }, + { + "benchmark": "apex-agents", + "count": 8, + "evaluation_name": "Overall Pass@8", + "max": 0.4, + "mean": 0.29725, + "median": 0.3345, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.115, + "stddev": 0.10747724011555723 + }, + { + "benchmark": "apex-v1", + "count": 7, + "evaluation_name": "Overall Score", + "max": 0.67, + "mean": 0.6027142857142858, + "median": 0.64, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.359, + "stddev": 0.10972953763035471 + }, + { + "benchmark": "La Leaderboard composite dataset", + "count": 5, + "evaluation_name": "la_leaderboard", + "max": 33.62, + "mean": 28.874, + "median": 27.61, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 25.87, + "stddev": 3.0963090930977795 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 4, + "evaluation_name": "v3_Semi_Private", + "max": 8866.2, + "mean": 5010.535, + "median": 4481.205, + "metric_id": "cost", + "metric_kind": "cost", + "metric_name": "Cost", + "metric_unit": "usd", + "min": 2213.53, + "stddev": 2842.9574427639027 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 4, + "evaluation_name": "v3_Semi_Private", + "max": 0.0026, + "mean": 0.001775, + "median": 0.0022500000000000003, + "metric_id": "score", + "metric_kind": "accuracy", + "metric_name": "ARC score", + "metric_unit": "proportion", + "min": 0.0, + "stddev": 0.001195477589361953 + }, + { + "benchmark": "Anthropic RLHF dataset", + "count": 4, + "evaluation_name": "Anthropic RLHF dataset", + "max": 4.965, + "mean": 4.76925, + "median": 4.949, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + 
"metric_unit": null, + "min": 4.214, + "stddev": 0.3704460149603447 + }, + { + "benchmark": "Best ChatGPT Prompts", + "count": 4, + "evaluation_name": "Best ChatGPT Prompts", + "max": 4.995, + "mean": 4.9855, + "median": 4.987, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 4.973, + "stddev": 0.009183318209304073 + }, + { + "benchmark": "Koala test dataset", + "count": 4, + "evaluation_name": "Koala test dataset", + "max": 4.987, + "mean": 4.97575, + "median": 4.975, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 4.966, + "stddev": 0.009912113800799387 + }, + { + "benchmark": "Open Assistant", + "count": 4, + "evaluation_name": "Open Assistant", + "max": 4.987, + "mean": 4.97875, + "median": 4.980499999999999, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 4.967, + "stddev": 0.0095350231602587 + }, + { + "benchmark": "Self Instruct", + "count": 4, + "evaluation_name": "Self Instruct", + "max": 4.992, + "mean": 4.9822500000000005, + "median": 4.9830000000000005, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 4.971, + "stddev": 0.010340051579497423 + }, + { + "benchmark": "Vicuna", + "count": 4, + "evaluation_name": "Vicuna", + "max": 4.995, + "mean": 4.99275, + "median": 4.9935, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 4.989, + "stddev": 0.0028722813232691232 + }, + { + "benchmark": "helm_instruct", + "count": 4, + "evaluation_name": "Mean win rate", + "max": 0.689, + "mean": 0.5, + "median": 0.611, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.089, + "stddev": 0.27645614480419856 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 3, + "evaluation_name": "v2_Private_Eval", + "max": 17.0, + "mean": 5.776666666666666, + 
"median": 0.2, + "metric_id": "cost_per_task", + "metric_kind": "cost", + "metric_name": "Cost per task", + "metric_unit": "usd", + "min": 0.13, + "stddev": 9.719754798004594 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 3, + "evaluation_name": "v2_Private_Eval", + "max": 1.0, + "mean": 0.345, + "median": 0.031, + "metric_id": "score", + "metric_kind": "accuracy", + "metric_name": "ARC score", + "metric_unit": "proportion", + "min": 0.004, + "stddev": 0.5674072611449381 + }, + { + "benchmark": "Fibble3 Arena (3 lies) Word Set", + "count": 3, + "evaluation_name": "fibble3_arena_avg_attempts", + "max": 12.0, + "mean": 7.333333333333333, + "median": 7.0, + "metric_id": "fibble3_arena.avg_attempts", + "metric_kind": null, + "metric_name": "Average Attempts", + "metric_unit": "guesses", + "min": 3.0, + "stddev": 4.509249752822894 + }, + { + "benchmark": "ace", + "count": 3, + "evaluation_name": "DIY Score", + "max": 0.56, + "mean": 0.55, + "median": 0.55, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.54, + "stddev": 0.010000000000000009 + }, + { + "benchmark": "ace", + "count": 3, + "evaluation_name": "Food Score", + "max": 0.7, + "mean": 0.65, + "median": 0.65, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.6, + "stddev": 0.04999999999999999 + }, + { + "benchmark": "ace", + "count": 3, + "evaluation_name": "Shopping Score", + "max": 0.45, + "mean": 0.45, + "median": 0.45, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.45, + "stddev": 0.0 + }, + { + "benchmark": "apex-v1", + "count": 3, + "evaluation_name": "Big Law Score", + "max": 0.78, + "mean": 0.77, + "median": 0.77, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.76, + "stddev": 0.010000000000000009 + }, + { + "benchmark": "apex-v1", + "count": 3, + "evaluation_name": 
"Consulting Score", + "max": 0.64, + "mean": 0.64, + "median": 0.64, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.64, + "stddev": 0.0 + }, + { + "benchmark": "apex-v1", + "count": 3, + "evaluation_name": "Investment Banking Score", + "max": 0.64, + "mean": 0.6266666666666667, + "median": 0.63, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.61, + "stddev": 0.01527525231651948 + }, + { + "benchmark": "apex-v1", + "count": 3, + "evaluation_name": "Medicine (MD) Score", + "max": 0.66, + "mean": 0.6533333333333333, + "median": 0.65, + "metric_id": null, + "metric_kind": null, + "metric_name": null, + "metric_unit": null, + "min": 0.65, + "stddev": 0.005773502691896263 + }, + { + "benchmark": "Fibble4 Arena (4 lies) Word Set", + "count": 2, + "evaluation_name": "fibble4_arena_avg_attempts", + "max": 8.0, + "mean": 8.0, + "median": 8.0, + "metric_id": "fibble4_arena.avg_attempts", + "metric_kind": null, + "metric_name": "Average Attempts", + "metric_unit": "guesses", + "min": 8.0, + "stddev": 0.0 + } + ] + }, + "observational": { + "exclusions": { + "incompatible_score_type": 0, + "missing_bounds": 0, + "missing_score": 0, + "out_of_range": 100, + "zero_width_bounds": 0 + }, + "valid_normalized_rows": 40395 + } +} From cdc7d8ccd9ec56a694265cd68ffc4f74d75a949c Mon Sep 17 00:00:00 2001 From: Yanan Long Date: Thu, 30 Apr 2026 14:06:19 -0300 Subject: [PATCH 14/15] add 3rd figure --- audit/dataset_statistics.json | 5440 +++++++++++++++-- every_eval_ever/helpers/dataset_statistics.py | 335 +- scripts/plot_dataset_statistics.py | 164 +- tests/test_dataset_statistics.py | 64 + 4 files changed, 5328 insertions(+), 675 deletions(-) diff --git a/audit/dataset_statistics.json b/audit/dataset_statistics.json index 57bb9d88f..b6d21586b 100644 --- a/audit/dataset_statistics.json +++ b/audit/dataset_statistics.json @@ -70,6 +70,7 @@ } ], "metadata_completeness": { + 
"benchmark_selection": "Top benchmarks by result-row count, with remaining benchmarks aggregated as Other; rows are sorted by overall metadata completeness.", "benchmarks": [ { "benchmark": "BBH", @@ -119,6 +120,42 @@ "overall_completeness": 0.07793544104223715, "result_rows": 4635 }, + { + "benchmark": "GSM8K", + "label": "GSM8K (n=91)", + "overall_completeness": 0.15384615384615385, + "result_rows": 91 + }, + { + "benchmark": "LegalBench", + "label": "LegalBench (n=91)", + "overall_completeness": 0.15384615384615385, + "result_rows": 91 + }, + { + "benchmark": "MATH", + "label": "MATH (n=91)", + "overall_completeness": 0.15384615384615385, + "result_rows": 91 + }, + { + "benchmark": "MMLU", + "label": "MMLU (n=91)", + "overall_completeness": 0.15384615384615385, + "result_rows": 91 + }, + { + "benchmark": "MedQA", + "label": "MedQA (n=91)", + "overall_completeness": 0.15384615384615385, + "result_rows": 91 + }, + { + "benchmark": "NarrativeQA", + "label": "NarrativeQA (n=91)", + "overall_completeness": 0.15384615384615385, + "result_rows": 91 + }, { "benchmark": "helm_mmlu", "label": "helm_mmlu (n=2,844)", @@ -137,36 +174,95 @@ "overall_completeness": 0.3076923076923077, "result_rows": 1020 }, + { + "benchmark": "SciArena leaderboard API", + "label": "SciArena leaderboard API (n=114)", + "overall_completeness": 0.3076923076923077, + "result_rows": 114 + }, { "benchmark": "BFCL leaderboard CSV", "label": "BFCL leaderboard CSV (n=3,350)", "overall_completeness": 0.38461538461538464, "result_rows": 3350 }, + { + "benchmark": "wordle_arena_daily", + "label": "wordle_arena_daily (n=92)", + "overall_completeness": 0.6153846153846154, + "result_rows": 92 + }, { "benchmark": "Other", - "label": "Other (n=2,399)", - "overall_completeness": 0.3087825055311508, - "result_rows": 2399 + "label": "Other (n=1,647)", + "overall_completeness": 0.3430946709635234, + "result_rows": 1647 } ], + "field_group_order": [ + "eval metadata", + "benchmark metadata", + "model metadata" + ], 
"fields": [ - { - "benchmark_stddev": 0.43200860863664675, - "key": "uncertainty_num_samples", - "label": "sample count", - "missing_rate": 0.9801456969996296, - "selection_score": 0.4234313788220063 - }, { "benchmark_stddev": 0.4794204729297534, + "group": "eval metadata", + "group_stddev": 0.4794204729297534, "key": "generation_config_present", "label": "generation config", "missing_rate": 0.8809235708112113, "selection_score": 0.4223327949332781 }, + { + "benchmark_stddev": 0.30224668644283065, + "group": "eval metadata", + "group_stddev": 0.30224668644283065, + "key": "generation_agentic_config_present", + "label": "agentic config", + "missing_rate": 0.9977775033954809, + "selection_score": 0.3015749442084843 + }, + { + "benchmark_stddev": 0.30224668644283065, + "group": "eval metadata", + "group_stddev": 0.30224668644283065, + "key": "generation_max_tokens", + "label": "max tokens", + "missing_rate": 0.9880726015557476, + "selection_score": 0.298641669785172 + }, + { + "benchmark_stddev": 0.30224668644283065, + "group": "eval metadata", + "group_stddev": 0.30224668644283065, + "key": "generation_temperature", + "label": "temperature", + "missing_rate": 0.9880726015557476, + "selection_score": 0.298641669785172 + }, + { + "benchmark_stddev": 0.40252007074704804, + "group": "eval metadata", + "group_stddev": 0.40252007074704804, + "key": "inference_engine", + "label": "inference engine/platform", + "missing_rate": 0.1724410421039635, + "selection_score": 0.06941098046738207 + }, + { + "benchmark_stddev": 0.43200860863664675, + "group": "benchmark metadata", + "group_stddev": 0.43200860863664675, + "key": "uncertainty_num_samples", + "label": "sample count", + "missing_rate": 0.9801456969996296, + "selection_score": 0.4234313788220063 + }, { "benchmark_stddev": 0.43144687321918196, + "group": "benchmark metadata", + "group_stddev": 0.43144687321918196, "key": "has_uncertainty", "label": "uncertainty", "missing_rate": 0.9604148660328435, @@ -174,6 +270,8 @@ }, { 
"benchmark_stddev": 0.40252007074704804, + "group": "benchmark metadata", + "group_stddev": 0.40252007074704804, "key": "detailed_results_file", "label": "detailed results", "missing_rate": 0.9803432522533646, @@ -181,6 +279,8 @@ }, { "benchmark_stddev": 0.49820130361691756, + "group": "benchmark metadata", + "group_stddev": 0.49820130361691756, "key": "source_organization_url", "label": "source org URL", "missing_rate": 0.7824175824175824, @@ -188,6 +288,8 @@ }, { "benchmark_stddev": 0.3595458209423123, + "group": "benchmark metadata", + "group_stddev": 0.3595458209423123, "key": "metric_id", "label": "metric ID", "missing_rate": 0.8815409309791332, @@ -195,34 +297,17 @@ }, { "benchmark_stddev": 0.3595458209423123, + "group": "benchmark metadata", + "group_stddev": 0.3595458209423123, "key": "metric_unit", "label": "metric unit", "missing_rate": 0.8815409309791332, "selection_score": 0.31695435772314273 }, - { - "benchmark_stddev": 0.30224668644283065, - "key": "generation_agentic_config_present", - "label": "agentic config", - "missing_rate": 0.9977775033954809, - "selection_score": 0.3015749442084843 - }, - { - "benchmark_stddev": 0.30224668644283065, - "key": "generation_max_tokens", - "label": "max tokens", - "missing_rate": 0.9880726015557476, - "selection_score": 0.298641669785172 - }, - { - "benchmark_stddev": 0.30224668644283065, - "key": "generation_temperature", - "label": "temperature", - "missing_rate": 0.9880726015557476, - "selection_score": 0.298641669785172 - }, { "benchmark_stddev": 0.21968612536975798, + "group": "benchmark metadata", + "group_stddev": 0.21968612536975798, "key": "metric_kind", "label": "metric kind", "missing_rate": 0.8892702802815162, @@ -230,25 +315,20 @@ }, { "benchmark_stddev": 0.12908090009938827, + "group": "model metadata", + "group_stddev": 0.12908090009938827, "key": "model_license", "label": "model license", "missing_rate": 0.9172737374984566, "selection_score": 0.11840251967383078 - }, - { - "benchmark_stddev": 
0.40252007074704804, - "key": "inference_engine", - "label": "inference engine/platform", - "missing_rate": 0.1724410421039635, - "selection_score": 0.06941098046738207 } ], "matrix": [ { "benchmark": "BBH", "benchmark_label": "BBH (n=4,574)", - "field": "uncertainty_num_samples", - "field_label": "sample count", + "field": "generation_config_present", + "field_label": "generation config", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -256,8 +336,8 @@ { "benchmark": "BBH", "benchmark_label": "BBH (n=4,574)", - "field": "generation_config_present", - "field_label": "generation config", + "field": "generation_agentic_config_present", + "field_label": "agentic config", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -265,8 +345,8 @@ { "benchmark": "BBH", "benchmark_label": "BBH (n=4,574)", - "field": "has_uncertainty", - "field_label": "uncertainty", + "field": "generation_max_tokens", + "field_label": "max tokens", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -274,8 +354,8 @@ { "benchmark": "BBH", "benchmark_label": "BBH (n=4,574)", - "field": "detailed_results_file", - "field_label": "detailed results", + "field": "generation_temperature", + "field_label": "temperature", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -283,8 +363,17 @@ { "benchmark": "BBH", "benchmark_label": "BBH (n=4,574)", - "field": "source_organization_url", - "field_label": "source org URL", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 4574 + }, + { + "benchmark": "BBH", + "benchmark_label": "BBH (n=4,574)", + "field": "uncertainty_num_samples", + "field_label": "sample count", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -292,8 +381,8 @@ { "benchmark": "BBH", "benchmark_label": "BBH (n=4,574)", - "field": "metric_id", - "field_label": "metric ID", + "field": "has_uncertainty", + "field_label": "uncertainty", 
"missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -301,8 +390,8 @@ { "benchmark": "BBH", "benchmark_label": "BBH (n=4,574)", - "field": "metric_unit", - "field_label": "metric unit", + "field": "detailed_results_file", + "field_label": "detailed results", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -310,8 +399,8 @@ { "benchmark": "BBH", "benchmark_label": "BBH (n=4,574)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", + "field": "source_organization_url", + "field_label": "source org URL", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -319,8 +408,8 @@ { "benchmark": "BBH", "benchmark_label": "BBH (n=4,574)", - "field": "generation_max_tokens", - "field_label": "max tokens", + "field": "metric_id", + "field_label": "metric ID", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -328,8 +417,8 @@ { "benchmark": "BBH", "benchmark_label": "BBH (n=4,574)", - "field": "generation_temperature", - "field_label": "temperature", + "field": "metric_unit", + "field_label": "metric unit", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -353,19 +442,19 @@ "result_rows": 4574 }, { - "benchmark": "BBH", - "benchmark_label": "BBH (n=4,574)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "present_rate": 1.0, + "benchmark": "MATH Level 5", + "benchmark_label": "MATH Level 5 (n=4,574)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 1.0, + "present_rate": 0.0, "result_rows": 4574 }, { "benchmark": "MATH Level 5", "benchmark_label": "MATH Level 5 (n=4,574)", - "field": "uncertainty_num_samples", - "field_label": "sample count", + "field": "generation_agentic_config_present", + "field_label": "agentic config", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -373,8 +462,8 @@ { "benchmark": "MATH Level 5", "benchmark_label": "MATH Level 5 (n=4,574)", 
- "field": "generation_config_present", - "field_label": "generation config", + "field": "generation_max_tokens", + "field_label": "max tokens", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -382,8 +471,8 @@ { "benchmark": "MATH Level 5", "benchmark_label": "MATH Level 5 (n=4,574)", - "field": "has_uncertainty", - "field_label": "uncertainty", + "field": "generation_temperature", + "field_label": "temperature", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -391,17 +480,17 @@ { "benchmark": "MATH Level 5", "benchmark_label": "MATH Level 5 (n=4,574)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "present_rate": 0.0, + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "present_rate": 1.0, "result_rows": 4574 }, { "benchmark": "MATH Level 5", "benchmark_label": "MATH Level 5 (n=4,574)", - "field": "source_organization_url", - "field_label": "source org URL", + "field": "uncertainty_num_samples", + "field_label": "sample count", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -409,8 +498,8 @@ { "benchmark": "MATH Level 5", "benchmark_label": "MATH Level 5 (n=4,574)", - "field": "metric_id", - "field_label": "metric ID", + "field": "has_uncertainty", + "field_label": "uncertainty", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -418,8 +507,8 @@ { "benchmark": "MATH Level 5", "benchmark_label": "MATH Level 5 (n=4,574)", - "field": "metric_unit", - "field_label": "metric unit", + "field": "detailed_results_file", + "field_label": "detailed results", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -427,8 +516,8 @@ { "benchmark": "MATH Level 5", "benchmark_label": "MATH Level 5 (n=4,574)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", + "field": "source_organization_url", + "field_label": "source org URL", "missing_rate": 1.0, "present_rate": 0.0, 
"result_rows": 4574 @@ -436,8 +525,8 @@ { "benchmark": "MATH Level 5", "benchmark_label": "MATH Level 5 (n=4,574)", - "field": "generation_max_tokens", - "field_label": "max tokens", + "field": "metric_id", + "field_label": "metric ID", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -445,8 +534,8 @@ { "benchmark": "MATH Level 5", "benchmark_label": "MATH Level 5 (n=4,574)", - "field": "generation_temperature", - "field_label": "temperature", + "field": "metric_unit", + "field_label": "metric unit", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -470,19 +559,19 @@ "result_rows": 4574 }, { - "benchmark": "MATH Level 5", - "benchmark_label": "MATH Level 5 (n=4,574)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "present_rate": 1.0, + "benchmark": "MMLU-PRO", + "benchmark_label": "MMLU-PRO (n=4,574)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 1.0, + "present_rate": 0.0, "result_rows": 4574 }, { "benchmark": "MMLU-PRO", "benchmark_label": "MMLU-PRO (n=4,574)", - "field": "uncertainty_num_samples", - "field_label": "sample count", + "field": "generation_agentic_config_present", + "field_label": "agentic config", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -490,8 +579,8 @@ { "benchmark": "MMLU-PRO", "benchmark_label": "MMLU-PRO (n=4,574)", - "field": "generation_config_present", - "field_label": "generation config", + "field": "generation_max_tokens", + "field_label": "max tokens", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -499,8 +588,8 @@ { "benchmark": "MMLU-PRO", "benchmark_label": "MMLU-PRO (n=4,574)", - "field": "has_uncertainty", - "field_label": "uncertainty", + "field": "generation_temperature", + "field_label": "temperature", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -508,17 +597,17 @@ { "benchmark": "MMLU-PRO", "benchmark_label": "MMLU-PRO (n=4,574)", 
- "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "present_rate": 0.0, + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "present_rate": 1.0, "result_rows": 4574 }, { "benchmark": "MMLU-PRO", "benchmark_label": "MMLU-PRO (n=4,574)", - "field": "source_organization_url", - "field_label": "source org URL", + "field": "uncertainty_num_samples", + "field_label": "sample count", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -526,8 +615,8 @@ { "benchmark": "MMLU-PRO", "benchmark_label": "MMLU-PRO (n=4,574)", - "field": "metric_id", - "field_label": "metric ID", + "field": "has_uncertainty", + "field_label": "uncertainty", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -535,8 +624,8 @@ { "benchmark": "MMLU-PRO", "benchmark_label": "MMLU-PRO (n=4,574)", - "field": "metric_unit", - "field_label": "metric unit", + "field": "detailed_results_file", + "field_label": "detailed results", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -544,8 +633,8 @@ { "benchmark": "MMLU-PRO", "benchmark_label": "MMLU-PRO (n=4,574)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", + "field": "source_organization_url", + "field_label": "source org URL", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -553,8 +642,8 @@ { "benchmark": "MMLU-PRO", "benchmark_label": "MMLU-PRO (n=4,574)", - "field": "generation_max_tokens", - "field_label": "max tokens", + "field": "metric_id", + "field_label": "metric ID", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -562,8 +651,8 @@ { "benchmark": "MMLU-PRO", "benchmark_label": "MMLU-PRO (n=4,574)", - "field": "generation_temperature", - "field_label": "temperature", + "field": "metric_unit", + "field_label": "metric unit", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -587,19 +676,19 @@ "result_rows": 4574 }, { - 
"benchmark": "MMLU-PRO", - "benchmark_label": "MMLU-PRO (n=4,574)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "present_rate": 1.0, + "benchmark": "MUSR", + "benchmark_label": "MUSR (n=4,574)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 1.0, + "present_rate": 0.0, "result_rows": 4574 }, { "benchmark": "MUSR", "benchmark_label": "MUSR (n=4,574)", - "field": "uncertainty_num_samples", - "field_label": "sample count", + "field": "generation_agentic_config_present", + "field_label": "agentic config", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -607,8 +696,8 @@ { "benchmark": "MUSR", "benchmark_label": "MUSR (n=4,574)", - "field": "generation_config_present", - "field_label": "generation config", + "field": "generation_max_tokens", + "field_label": "max tokens", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -616,8 +705,8 @@ { "benchmark": "MUSR", "benchmark_label": "MUSR (n=4,574)", - "field": "has_uncertainty", - "field_label": "uncertainty", + "field": "generation_temperature", + "field_label": "temperature", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -625,17 +714,17 @@ { "benchmark": "MUSR", "benchmark_label": "MUSR (n=4,574)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "present_rate": 0.0, + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "present_rate": 1.0, "result_rows": 4574 }, { "benchmark": "MUSR", "benchmark_label": "MUSR (n=4,574)", - "field": "source_organization_url", - "field_label": "source org URL", + "field": "uncertainty_num_samples", + "field_label": "sample count", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -643,8 +732,8 @@ { "benchmark": "MUSR", "benchmark_label": "MUSR (n=4,574)", - "field": "metric_id", - "field_label": "metric ID", + "field": 
"has_uncertainty", + "field_label": "uncertainty", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -652,8 +741,8 @@ { "benchmark": "MUSR", "benchmark_label": "MUSR (n=4,574)", - "field": "metric_unit", - "field_label": "metric unit", + "field": "detailed_results_file", + "field_label": "detailed results", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -661,8 +750,8 @@ { "benchmark": "MUSR", "benchmark_label": "MUSR (n=4,574)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", + "field": "source_organization_url", + "field_label": "source org URL", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -670,8 +759,8 @@ { "benchmark": "MUSR", "benchmark_label": "MUSR (n=4,574)", - "field": "generation_max_tokens", - "field_label": "max tokens", + "field": "metric_id", + "field_label": "metric ID", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -679,8 +768,8 @@ { "benchmark": "MUSR", "benchmark_label": "MUSR (n=4,574)", - "field": "generation_temperature", - "field_label": "temperature", + "field": "metric_unit", + "field_label": "metric unit", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4574 @@ -703,20 +792,11 @@ "present_rate": 0.0, "result_rows": 4574 }, - { - "benchmark": "MUSR", - "benchmark_label": "MUSR (n=4,574)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 4574 - }, { "benchmark": "RewardBench", "benchmark_label": "RewardBench (n=1,025)", - "field": "uncertainty_num_samples", - "field_label": "sample count", + "field": "generation_config_present", + "field_label": "generation config", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 1025 @@ -724,8 +804,8 @@ { "benchmark": "RewardBench", "benchmark_label": "RewardBench (n=1,025)", - "field": "generation_config_present", - "field_label": "generation config", + "field": 
"generation_agentic_config_present", + "field_label": "agentic config", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 1025 @@ -733,8 +813,8 @@ { "benchmark": "RewardBench", "benchmark_label": "RewardBench (n=1,025)", - "field": "has_uncertainty", - "field_label": "uncertainty", + "field": "generation_max_tokens", + "field_label": "max tokens", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 1025 @@ -742,8 +822,8 @@ { "benchmark": "RewardBench", "benchmark_label": "RewardBench (n=1,025)", - "field": "detailed_results_file", - "field_label": "detailed results", + "field": "generation_temperature", + "field_label": "temperature", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 1025 @@ -751,17 +831,17 @@ { "benchmark": "RewardBench", "benchmark_label": "RewardBench (n=1,025)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 0.0, - "present_rate": 1.0, + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 1.0, + "present_rate": 0.0, "result_rows": 1025 }, { "benchmark": "RewardBench", "benchmark_label": "RewardBench (n=1,025)", - "field": "metric_id", - "field_label": "metric ID", + "field": "uncertainty_num_samples", + "field_label": "sample count", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 1025 @@ -769,8 +849,8 @@ { "benchmark": "RewardBench", "benchmark_label": "RewardBench (n=1,025)", - "field": "metric_unit", - "field_label": "metric unit", + "field": "has_uncertainty", + "field_label": "uncertainty", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 1025 @@ -778,8 +858,8 @@ { "benchmark": "RewardBench", "benchmark_label": "RewardBench (n=1,025)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", + "field": "detailed_results_file", + "field_label": "detailed results", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 1025 @@ -787,17 +867,17 @@ { "benchmark": "RewardBench", "benchmark_label": 
"RewardBench (n=1,025)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "present_rate": 0.0, + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 0.0, + "present_rate": 1.0, "result_rows": 1025 }, { "benchmark": "RewardBench", "benchmark_label": "RewardBench (n=1,025)", - "field": "generation_temperature", - "field_label": "temperature", + "field": "metric_id", + "field_label": "metric ID", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 1025 @@ -805,8 +885,8 @@ { "benchmark": "RewardBench", "benchmark_label": "RewardBench (n=1,025)", - "field": "metric_kind", - "field_label": "metric kind", + "field": "metric_unit", + "field_label": "metric unit", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 1025 @@ -814,8 +894,8 @@ { "benchmark": "RewardBench", "benchmark_label": "RewardBench (n=1,025)", - "field": "model_license", - "field_label": "model license", + "field": "metric_kind", + "field_label": "metric kind", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 1025 @@ -823,8 +903,8 @@ { "benchmark": "RewardBench", "benchmark_label": "RewardBench (n=1,025)", - "field": "inference_engine", - "field_label": "inference engine/platform", + "field": "model_license", + "field_label": "model license", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 1025 @@ -832,8 +912,8 @@ { "benchmark": "RewardBench 2", "benchmark_label": "RewardBench 2 (n=1,379)", - "field": "uncertainty_num_samples", - "field_label": "sample count", + "field": "generation_config_present", + "field_label": "generation config", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 1379 @@ -841,8 +921,8 @@ { "benchmark": "RewardBench 2", "benchmark_label": "RewardBench 2 (n=1,379)", - "field": "generation_config_present", - "field_label": "generation config", + "field": "generation_agentic_config_present", + "field_label": "agentic config", "missing_rate": 1.0, "present_rate": 0.0, 
"result_rows": 1379 @@ -850,8 +930,8 @@ { "benchmark": "RewardBench 2", "benchmark_label": "RewardBench 2 (n=1,379)", - "field": "has_uncertainty", - "field_label": "uncertainty", + "field": "generation_max_tokens", + "field_label": "max tokens", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 1379 @@ -859,8 +939,8 @@ { "benchmark": "RewardBench 2", "benchmark_label": "RewardBench 2 (n=1,379)", - "field": "detailed_results_file", - "field_label": "detailed results", + "field": "generation_temperature", + "field_label": "temperature", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 1379 @@ -868,17 +948,17 @@ { "benchmark": "RewardBench 2", "benchmark_label": "RewardBench 2 (n=1,379)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 0.0, - "present_rate": 1.0, + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 1.0, + "present_rate": 0.0, "result_rows": 1379 }, { "benchmark": "RewardBench 2", "benchmark_label": "RewardBench 2 (n=1,379)", - "field": "metric_id", - "field_label": "metric ID", + "field": "uncertainty_num_samples", + "field_label": "sample count", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 1379 @@ -886,8 +966,8 @@ { "benchmark": "RewardBench 2", "benchmark_label": "RewardBench 2 (n=1,379)", - "field": "metric_unit", - "field_label": "metric unit", + "field": "has_uncertainty", + "field_label": "uncertainty", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 1379 @@ -895,8 +975,8 @@ { "benchmark": "RewardBench 2", "benchmark_label": "RewardBench 2 (n=1,379)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", + "field": "detailed_results_file", + "field_label": "detailed results", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 1379 @@ -904,17 +984,17 @@ { "benchmark": "RewardBench 2", "benchmark_label": "RewardBench 2 (n=1,379)", - "field": "generation_max_tokens", - "field_label": "max 
tokens", - "missing_rate": 1.0, - "present_rate": 0.0, + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 0.0, + "present_rate": 1.0, "result_rows": 1379 }, { "benchmark": "RewardBench 2", "benchmark_label": "RewardBench 2 (n=1,379)", - "field": "generation_temperature", - "field_label": "temperature", + "field": "metric_id", + "field_label": "metric ID", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 1379 @@ -922,8 +1002,8 @@ { "benchmark": "RewardBench 2", "benchmark_label": "RewardBench 2 (n=1,379)", - "field": "metric_kind", - "field_label": "metric kind", + "field": "metric_unit", + "field_label": "metric unit", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 1379 @@ -931,8 +1011,8 @@ { "benchmark": "RewardBench 2", "benchmark_label": "RewardBench 2 (n=1,379)", - "field": "model_license", - "field_label": "model license", + "field": "metric_kind", + "field_label": "metric kind", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 1379 @@ -940,21 +1020,12 @@ { "benchmark": "RewardBench 2", "benchmark_label": "RewardBench 2 (n=1,379)", - "field": "inference_engine", - "field_label": "inference engine/platform", + "field": "model_license", + "field_label": "model license", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 1379 }, - { - "benchmark": "GPQA", - "benchmark_label": "GPQA (n=4,635)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4635 - }, { "benchmark": "GPQA", "benchmark_label": "GPQA (n=4,635)", @@ -967,8 +1038,8 @@ { "benchmark": "GPQA", "benchmark_label": "GPQA (n=4,635)", - "field": "has_uncertainty", - "field_label": "uncertainty", + "field": "generation_agentic_config_present", + "field_label": "agentic config", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4635 @@ -976,8 +1047,8 @@ { "benchmark": "GPQA", "benchmark_label": "GPQA (n=4,635)", - "field": "detailed_results_file", - 
"field_label": "detailed results", + "field": "generation_max_tokens", + "field_label": "max tokens", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4635 @@ -985,8 +1056,8 @@ { "benchmark": "GPQA", "benchmark_label": "GPQA (n=4,635)", - "field": "source_organization_url", - "field_label": "source org URL", + "field": "generation_temperature", + "field_label": "temperature", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4635 @@ -994,17 +1065,17 @@ { "benchmark": "GPQA", "benchmark_label": "GPQA (n=4,635)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "present_rate": 0.0, + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "present_rate": 1.0, "result_rows": 4635 }, { "benchmark": "GPQA", "benchmark_label": "GPQA (n=4,635)", - "field": "metric_unit", - "field_label": "metric unit", + "field": "uncertainty_num_samples", + "field_label": "sample count", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4635 @@ -1012,8 +1083,8 @@ { "benchmark": "GPQA", "benchmark_label": "GPQA (n=4,635)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", + "field": "has_uncertainty", + "field_label": "uncertainty", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4635 @@ -1021,8 +1092,8 @@ { "benchmark": "GPQA", "benchmark_label": "GPQA (n=4,635)", - "field": "generation_max_tokens", - "field_label": "max tokens", + "field": "detailed_results_file", + "field_label": "detailed results", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4635 @@ -1030,8 +1101,8 @@ { "benchmark": "GPQA", "benchmark_label": "GPQA (n=4,635)", - "field": "generation_temperature", - "field_label": "temperature", + "field": "source_organization_url", + "field_label": "source org URL", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4635 @@ -1039,8 +1110,8 @@ { "benchmark": "GPQA", "benchmark_label": "GPQA (n=4,635)", - "field": "metric_kind", - 
"field_label": "metric kind", + "field": "metric_id", + "field_label": "metric ID", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4635 @@ -1048,8 +1119,8 @@ { "benchmark": "GPQA", "benchmark_label": "GPQA (n=4,635)", - "field": "model_license", - "field_label": "model license", + "field": "metric_unit", + "field_label": "metric unit", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4635 @@ -1057,17 +1128,17 @@ { "benchmark": "GPQA", "benchmark_label": "GPQA (n=4,635)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "present_rate": 1.0, + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "present_rate": 0.0, "result_rows": 4635 }, { - "benchmark": "IFEval", - "benchmark_label": "IFEval (n=4,635)", - "field": "uncertainty_num_samples", - "field_label": "sample count", + "benchmark": "GPQA", + "benchmark_label": "GPQA (n=4,635)", + "field": "model_license", + "field_label": "model license", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4635 @@ -1084,17 +1155,17 @@ { "benchmark": "IFEval", "benchmark_label": "IFEval (n=4,635)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, + "field": "generation_agentic_config_present", + "field_label": "agentic config", + "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4635 }, { "benchmark": "IFEval", "benchmark_label": "IFEval (n=4,635)", - "field": "detailed_results_file", - "field_label": "detailed results", + "field": "generation_max_tokens", + "field_label": "max tokens", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4635 @@ -1102,8 +1173,8 @@ { "benchmark": "IFEval", "benchmark_label": "IFEval (n=4,635)", - "field": "source_organization_url", - "field_label": "source org URL", + "field": "generation_temperature", + "field_label": "temperature", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4635 @@ -1111,8 +1182,17 @@ { "benchmark": "IFEval", 
"benchmark_label": "IFEval (n=4,635)", - "field": "metric_id", - "field_label": "metric ID", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 4635 + }, + { + "benchmark": "IFEval", + "benchmark_label": "IFEval (n=4,635)", + "field": "uncertainty_num_samples", + "field_label": "sample count", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4635 @@ -1120,8 +1200,8 @@ { "benchmark": "IFEval", "benchmark_label": "IFEval (n=4,635)", - "field": "metric_unit", - "field_label": "metric unit", + "field": "has_uncertainty", + "field_label": "uncertainty", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4635 @@ -1129,8 +1209,8 @@ { "benchmark": "IFEval", "benchmark_label": "IFEval (n=4,635)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", + "field": "detailed_results_file", + "field_label": "detailed results", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4635 @@ -1138,8 +1218,8 @@ { "benchmark": "IFEval", "benchmark_label": "IFEval (n=4,635)", - "field": "generation_max_tokens", - "field_label": "max tokens", + "field": "source_organization_url", + "field_label": "source org URL", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4635 @@ -1147,8 +1227,17 @@ { "benchmark": "IFEval", "benchmark_label": "IFEval (n=4,635)", - "field": "generation_temperature", - "field_label": "temperature", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 4635 + }, + { + "benchmark": "IFEval", + "benchmark_label": "IFEval (n=4,635)", + "field": "metric_unit", + "field_label": "metric unit", "missing_rate": 1.0, "present_rate": 0.0, "result_rows": 4635 @@ -1172,602 +1261,4791 @@ "result_rows": 4635 }, { - "benchmark": "IFEval", - "benchmark_label": "IFEval (n=4,635)", - "field": "inference_engine", - "field_label": "inference engine/platform", + "benchmark": "GSM8K", + 
"benchmark_label": "GSM8K (n=91)", + "field": "generation_config_present", + "field_label": "generation config", "missing_rate": 0.0, "present_rate": 1.0, - "result_rows": 4635 + "result_rows": 91 }, { - "benchmark": "helm_mmlu", - "benchmark_label": "helm_mmlu (n=2,844)", - "field": "uncertainty_num_samples", - "field_label": "sample count", + "benchmark": "GSM8K", + "benchmark_label": "GSM8K (n=91)", + "field": "generation_agentic_config_present", + "field_label": "agentic config", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 2844 + "result_rows": 91 }, { - "benchmark": "helm_mmlu", - "benchmark_label": "helm_mmlu (n=2,844)", - "field": "generation_config_present", - "field_label": "generation config", + "benchmark": "GSM8K", + "benchmark_label": "GSM8K (n=91)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 + }, + { + "benchmark": "GSM8K", + "benchmark_label": "GSM8K (n=91)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 + }, + { + "benchmark": "GSM8K", + "benchmark_label": "GSM8K (n=91)", + "field": "inference_engine", + "field_label": "inference engine/platform", "missing_rate": 0.0, "present_rate": 1.0, - "result_rows": 2844 + "result_rows": 91 }, { - "benchmark": "helm_mmlu", - "benchmark_label": "helm_mmlu (n=2,844)", + "benchmark": "GSM8K", + "benchmark_label": "GSM8K (n=91)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 + }, + { + "benchmark": "GSM8K", + "benchmark_label": "GSM8K (n=91)", "field": "has_uncertainty", "field_label": "uncertainty", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 2844 + "result_rows": 91 }, { - "benchmark": "helm_mmlu", - "benchmark_label": "helm_mmlu (n=2,844)", + "benchmark": "GSM8K", + "benchmark_label": "GSM8K (n=91)", "field": 
"detailed_results_file", "field_label": "detailed results", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 2844 + "result_rows": 91 }, { - "benchmark": "helm_mmlu", - "benchmark_label": "helm_mmlu (n=2,844)", + "benchmark": "GSM8K", + "benchmark_label": "GSM8K (n=91)", "field": "source_organization_url", "field_label": "source org URL", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 2844 + "result_rows": 91 }, { - "benchmark": "helm_mmlu", - "benchmark_label": "helm_mmlu (n=2,844)", + "benchmark": "GSM8K", + "benchmark_label": "GSM8K (n=91)", "field": "metric_id", "field_label": "metric ID", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 2844 + "result_rows": 91 }, { - "benchmark": "helm_mmlu", - "benchmark_label": "helm_mmlu (n=2,844)", + "benchmark": "GSM8K", + "benchmark_label": "GSM8K (n=91)", "field": "metric_unit", "field_label": "metric unit", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 2844 + "result_rows": 91 }, { - "benchmark": "helm_mmlu", - "benchmark_label": "helm_mmlu (n=2,844)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", + "benchmark": "GSM8K", + "benchmark_label": "GSM8K (n=91)", + "field": "metric_kind", + "field_label": "metric kind", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 2844 + "result_rows": 91 }, { - "benchmark": "helm_mmlu", - "benchmark_label": "helm_mmlu (n=2,844)", - "field": "generation_max_tokens", - "field_label": "max tokens", + "benchmark": "GSM8K", + "benchmark_label": "GSM8K (n=91)", + "field": "model_license", + "field_label": "model license", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 2844 + "result_rows": 91 }, { - "benchmark": "helm_mmlu", - "benchmark_label": "helm_mmlu (n=2,844)", - "field": "generation_temperature", - "field_label": "temperature", + "benchmark": "LegalBench", + "benchmark_label": "LegalBench (n=91)", + "field": "generation_config_present", + "field_label": "generation config", + 
"missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 91 + }, + { + "benchmark": "LegalBench", + "benchmark_label": "LegalBench (n=91)", + "field": "generation_agentic_config_present", + "field_label": "agentic config", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 2844 + "result_rows": 91 }, { - "benchmark": "helm_mmlu", - "benchmark_label": "helm_mmlu (n=2,844)", - "field": "metric_kind", - "field_label": "metric kind", + "benchmark": "LegalBench", + "benchmark_label": "LegalBench (n=91)", + "field": "generation_max_tokens", + "field_label": "max tokens", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 2844 + "result_rows": 91 }, { - "benchmark": "helm_mmlu", - "benchmark_label": "helm_mmlu (n=2,844)", - "field": "model_license", - "field_label": "model license", + "benchmark": "LegalBench", + "benchmark_label": "LegalBench (n=91)", + "field": "generation_temperature", + "field_label": "temperature", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 2844 + "result_rows": 91 }, { - "benchmark": "helm_mmlu", - "benchmark_label": "helm_mmlu (n=2,844)", + "benchmark": "LegalBench", + "benchmark_label": "LegalBench (n=91)", "field": "inference_engine", "field_label": "inference engine/platform", "missing_rate": 0.0, "present_rate": 1.0, - "result_rows": 2844 + "result_rows": 91 }, { - "benchmark": "global-mmlu-lite", - "benchmark_label": "global-mmlu-lite (n=912)", + "benchmark": "LegalBench", + "benchmark_label": "LegalBench (n=91)", "field": "uncertainty_num_samples", "field_label": "sample count", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 912 - }, - { - "benchmark": "global-mmlu-lite", - "benchmark_label": "global-mmlu-lite (n=912)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 912 + "result_rows": 91 }, { - "benchmark": "global-mmlu-lite", - "benchmark_label": "global-mmlu-lite (n=912)", + "benchmark": 
"LegalBench", + "benchmark_label": "LegalBench (n=91)", "field": "has_uncertainty", "field_label": "uncertainty", - "missing_rate": 0.1578947368421053, - "present_rate": 0.8421052631578947, - "result_rows": 912 + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 }, { - "benchmark": "global-mmlu-lite", - "benchmark_label": "global-mmlu-lite (n=912)", + "benchmark": "LegalBench", + "benchmark_label": "LegalBench (n=91)", "field": "detailed_results_file", "field_label": "detailed results", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 912 + "result_rows": 91 }, { - "benchmark": "global-mmlu-lite", - "benchmark_label": "global-mmlu-lite (n=912)", + "benchmark": "LegalBench", + "benchmark_label": "LegalBench (n=91)", "field": "source_organization_url", "field_label": "source org URL", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 912 + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 }, { - "benchmark": "global-mmlu-lite", - "benchmark_label": "global-mmlu-lite (n=912)", + "benchmark": "LegalBench", + "benchmark_label": "LegalBench (n=91)", "field": "metric_id", "field_label": "metric ID", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 912 + "result_rows": 91 }, { - "benchmark": "global-mmlu-lite", - "benchmark_label": "global-mmlu-lite (n=912)", + "benchmark": "LegalBench", + "benchmark_label": "LegalBench (n=91)", "field": "metric_unit", "field_label": "metric unit", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 912 + "result_rows": 91 }, { - "benchmark": "global-mmlu-lite", - "benchmark_label": "global-mmlu-lite (n=912)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", + "benchmark": "LegalBench", + "benchmark_label": "LegalBench (n=91)", + "field": "metric_kind", + "field_label": "metric kind", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 912 + "result_rows": 91 }, { - "benchmark": "global-mmlu-lite", - "benchmark_label": 
"global-mmlu-lite (n=912)", - "field": "generation_max_tokens", - "field_label": "max tokens", + "benchmark": "LegalBench", + "benchmark_label": "LegalBench (n=91)", + "field": "model_license", + "field_label": "model license", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 912 + "result_rows": 91 }, { - "benchmark": "global-mmlu-lite", - "benchmark_label": "global-mmlu-lite (n=912)", - "field": "generation_temperature", - "field_label": "temperature", + "benchmark": "MATH", + "benchmark_label": "MATH (n=91)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 91 + }, + { + "benchmark": "MATH", + "benchmark_label": "MATH (n=91)", + "field": "generation_agentic_config_present", + "field_label": "agentic config", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 912 + "result_rows": 91 }, { - "benchmark": "global-mmlu-lite", - "benchmark_label": "global-mmlu-lite (n=912)", - "field": "metric_kind", - "field_label": "metric kind", + "benchmark": "MATH", + "benchmark_label": "MATH (n=91)", + "field": "generation_max_tokens", + "field_label": "max tokens", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 912 + "result_rows": 91 }, { - "benchmark": "global-mmlu-lite", - "benchmark_label": "global-mmlu-lite (n=912)", - "field": "model_license", - "field_label": "model license", + "benchmark": "MATH", + "benchmark_label": "MATH (n=91)", + "field": "generation_temperature", + "field_label": "temperature", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 912 + "result_rows": 91 }, { - "benchmark": "global-mmlu-lite", - "benchmark_label": "global-mmlu-lite (n=912)", + "benchmark": "MATH", + "benchmark_label": "MATH (n=91)", "field": "inference_engine", "field_label": "inference engine/platform", "missing_rate": 0.0, "present_rate": 1.0, - "result_rows": 912 + "result_rows": 91 }, { - "benchmark": "ARC Prize evaluations leaderboard JSON", - 
"benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "benchmark": "MATH", + "benchmark_label": "MATH (n=91)", "field": "uncertainty_num_samples", "field_label": "sample count", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 1020 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1020 + "result_rows": 91 }, { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "benchmark": "MATH", + "benchmark_label": "MATH (n=91)", "field": "has_uncertainty", "field_label": "uncertainty", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 1020 + "result_rows": 91 }, { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "benchmark": "MATH", + "benchmark_label": "MATH (n=91)", "field": "detailed_results_file", "field_label": "detailed results", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 1020 + "result_rows": 91 }, { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "benchmark": "MATH", + "benchmark_label": "MATH (n=91)", "field": "source_organization_url", "field_label": "source org URL", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 1020 + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 }, { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "benchmark": "MATH", + "benchmark_label": "MATH (n=91)", "field": "metric_id", "field_label": "metric ID", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 1020 + "missing_rate": 1.0, + "present_rate": 
0.0, + "result_rows": 91 }, { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "benchmark": "MATH", + "benchmark_label": "MATH (n=91)", "field": "metric_unit", "field_label": "metric unit", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 1020 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 1020 + "result_rows": 91 }, { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", - "field": "generation_max_tokens", - "field_label": "max tokens", + "benchmark": "MATH", + "benchmark_label": "MATH (n=91)", + "field": "metric_kind", + "field_label": "metric kind", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 1020 + "result_rows": 91 }, { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", - "field": "generation_temperature", - "field_label": "temperature", + "benchmark": "MATH", + "benchmark_label": "MATH (n=91)", + "field": "model_license", + "field_label": "model license", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 1020 + "result_rows": 91 }, { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", - "field": "metric_kind", - "field_label": "metric kind", + "benchmark": "MMLU", + "benchmark_label": "MMLU (n=91)", + "field": "generation_config_present", + "field_label": "generation config", "missing_rate": 0.0, "present_rate": 1.0, - "result_rows": 1020 + "result_rows": 91 }, { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "benchmark_label": "ARC Prize evaluations leaderboard JSON 
(n=1,020)", - "field": "model_license", - "field_label": "model license", + "benchmark": "MMLU", + "benchmark_label": "MMLU (n=91)", + "field": "generation_agentic_config_present", + "field_label": "agentic config", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 1020 + "result_rows": 91 }, { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", - "field": "inference_engine", - "field_label": "inference engine/platform", + "benchmark": "MMLU", + "benchmark_label": "MMLU (n=91)", + "field": "generation_max_tokens", + "field_label": "max tokens", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 1020 + "result_rows": 91 }, { - "benchmark": "BFCL leaderboard CSV", - "benchmark_label": "BFCL leaderboard CSV (n=3,350)", - "field": "uncertainty_num_samples", - "field_label": "sample count", + "benchmark": "MMLU", + "benchmark_label": "MMLU (n=91)", + "field": "generation_temperature", + "field_label": "temperature", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 3350 + "result_rows": 91 }, { - "benchmark": "BFCL leaderboard CSV", - "benchmark_label": "BFCL leaderboard CSV (n=3,350)", - "field": "generation_config_present", - "field_label": "generation config", + "benchmark": "MMLU", + "benchmark_label": "MMLU (n=91)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 91 + }, + { + "benchmark": "MMLU", + "benchmark_label": "MMLU (n=91)", + "field": "uncertainty_num_samples", + "field_label": "sample count", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 3350 + "result_rows": 91 }, { - "benchmark": "BFCL leaderboard CSV", - "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "benchmark": "MMLU", + "benchmark_label": "MMLU (n=91)", "field": "has_uncertainty", "field_label": "uncertainty", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 3350 + 
"result_rows": 91 }, { - "benchmark": "BFCL leaderboard CSV", - "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "benchmark": "MMLU", + "benchmark_label": "MMLU (n=91)", "field": "detailed_results_file", "field_label": "detailed results", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 3350 + "result_rows": 91 }, { - "benchmark": "BFCL leaderboard CSV", - "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "benchmark": "MMLU", + "benchmark_label": "MMLU (n=91)", "field": "source_organization_url", "field_label": "source org URL", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 3350 + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 }, { - "benchmark": "BFCL leaderboard CSV", - "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "benchmark": "MMLU", + "benchmark_label": "MMLU (n=91)", "field": "metric_id", "field_label": "metric ID", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 3350 + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 }, { - "benchmark": "BFCL leaderboard CSV", - "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "benchmark": "MMLU", + "benchmark_label": "MMLU (n=91)", "field": "metric_unit", "field_label": "metric unit", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 + }, + { + "benchmark": "MMLU", + "benchmark_label": "MMLU (n=91)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 + }, + { + "benchmark": "MMLU", + "benchmark_label": "MMLU (n=91)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 + }, + { + "benchmark": "MedQA", + "benchmark_label": "MedQA (n=91)", + "field": "generation_config_present", + "field_label": "generation config", "missing_rate": 0.0, "present_rate": 1.0, - "result_rows": 3350 + "result_rows": 91 }, { - "benchmark": "BFCL leaderboard CSV", - "benchmark_label": 
"BFCL leaderboard CSV (n=3,350)", + "benchmark": "MedQA", + "benchmark_label": "MedQA (n=91)", "field": "generation_agentic_config_present", "field_label": "agentic config", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 3350 + "result_rows": 91 }, { - "benchmark": "BFCL leaderboard CSV", - "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "benchmark": "MedQA", + "benchmark_label": "MedQA (n=91)", "field": "generation_max_tokens", "field_label": "max tokens", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 3350 + "result_rows": 91 }, { - "benchmark": "BFCL leaderboard CSV", - "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "benchmark": "MedQA", + "benchmark_label": "MedQA (n=91)", "field": "generation_temperature", "field_label": "temperature", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 3350 - }, - { - "benchmark": "BFCL leaderboard CSV", - "benchmark_label": "BFCL leaderboard CSV (n=3,350)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 3350 + "result_rows": 91 }, { - "benchmark": "BFCL leaderboard CSV", - "benchmark_label": "BFCL leaderboard CSV (n=3,350)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 3350 - }, - { - "benchmark": "BFCL leaderboard CSV", - "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "benchmark": "MedQA", + "benchmark_label": "MedQA (n=91)", "field": "inference_engine", "field_label": "inference engine/platform", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 3350 + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 91 }, { - "benchmark": "Other", - "benchmark_label": "Other (n=2,399)", + "benchmark": "MedQA", + "benchmark_label": "MedQA (n=91)", "field": "uncertainty_num_samples", "field_label": "sample count", - "missing_rate": 0.6648603584827011, - "present_rate": 0.33513964151729886, - "result_rows": 2399 
- }, - { - "benchmark": "Other", - "benchmark_label": "Other (n=2,399)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 0.22634431012922052, - "present_rate": 0.7736556898707795, - "result_rows": 2399 + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 }, { - "benchmark": "Other", - "benchmark_label": "Other (n=2,399)", + "benchmark": "MedQA", + "benchmark_label": "MedQA (n=91)", "field": "has_uncertainty", "field_label": "uncertainty", - "missing_rate": 0.6519383076281784, - "present_rate": 0.3480616923718216, - "result_rows": 2399 + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 }, { - "benchmark": "Other", - "benchmark_label": "Other (n=2,399)", + "benchmark": "MedQA", + "benchmark_label": "MedQA (n=91)", "field": "detailed_results_file", "field_label": "detailed results", - "missing_rate": 0.6681950812838683, - "present_rate": 0.3318049187161317, - "result_rows": 2399 + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 }, { - "benchmark": "Other", - "benchmark_label": "Other (n=2,399)", + "benchmark": "MedQA", + "benchmark_label": "MedQA (n=91)", "field": "source_organization_url", "field_label": "source org URL", - "missing_rate": 0.5310546060858692, - "present_rate": 0.46894539391413087, - "result_rows": 2399 + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 }, { - "benchmark": "Other", - "benchmark_label": "Other (n=2,399)", + "benchmark": "MedQA", + "benchmark_label": "MedQA (n=91)", "field": "metric_id", "field_label": "metric ID", - "missing_rate": 0.8220091704877033, - "present_rate": 0.17799082951229678, - "result_rows": 2399 + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 }, { - "benchmark": "Other", - "benchmark_label": "Other (n=2,399)", + "benchmark": "MedQA", + "benchmark_label": "MedQA (n=91)", "field": "metric_unit", "field_label": "metric unit", - "missing_rate": 0.8220091704877033, - "present_rate": 
0.17799082951229678, - "result_rows": 2399 + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 }, { - "benchmark": "Other", - "benchmark_label": "Other (n=2,399)", + "benchmark": "MedQA", + "benchmark_label": "MedQA (n=91)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 + }, + { + "benchmark": "MedQA", + "benchmark_label": "MedQA (n=91)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 + }, + { + "benchmark": "NarrativeQA", + "benchmark_label": "NarrativeQA (n=91)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 91 + }, + { + "benchmark": "NarrativeQA", + "benchmark_label": "NarrativeQA (n=91)", "field": "generation_agentic_config_present", "field_label": "agentic config", - "missing_rate": 0.9624843684868696, - "present_rate": 0.03751563151313047, - "result_rows": 2399 + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 }, { - "benchmark": "Other", - "benchmark_label": "Other (n=2,399)", + "benchmark": "NarrativeQA", + "benchmark_label": "NarrativeQA (n=91)", "field": "generation_max_tokens", "field_label": "max tokens", - "missing_rate": 0.7986661108795331, - "present_rate": 0.20133388912046687, - "result_rows": 2399 + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 }, { - "benchmark": "Other", - "benchmark_label": "Other (n=2,399)", + "benchmark": "NarrativeQA", + "benchmark_label": "NarrativeQA (n=91)", "field": "generation_temperature", "field_label": "temperature", - "missing_rate": 0.7986661108795331, - "present_rate": 0.20133388912046687, - "result_rows": 2399 + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 }, { - "benchmark": "Other", - "benchmark_label": "Other (n=2,399)", + "benchmark": "NarrativeQA", + "benchmark_label": "NarrativeQA (n=91)", + 
"field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 91 + }, + { + "benchmark": "NarrativeQA", + "benchmark_label": "NarrativeQA (n=91)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 + }, + { + "benchmark": "NarrativeQA", + "benchmark_label": "NarrativeQA (n=91)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 + }, + { + "benchmark": "NarrativeQA", + "benchmark_label": "NarrativeQA (n=91)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 + }, + { + "benchmark": "NarrativeQA", + "benchmark_label": "NarrativeQA (n=91)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 + }, + { + "benchmark": "NarrativeQA", + "benchmark_label": "NarrativeQA (n=91)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 + }, + { + "benchmark": "NarrativeQA", + "benchmark_label": "NarrativeQA (n=91)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 + }, + { + "benchmark": "NarrativeQA", + "benchmark_label": "NarrativeQA (n=91)", "field": "metric_kind", "field_label": "metric kind", - "missing_rate": 0.9524802000833681, - "present_rate": 0.04751979991663193, - "result_rows": 2399 + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 91 }, { - "benchmark": "Other", - "benchmark_label": "Other (n=2,399)", + "benchmark": "NarrativeQA", + "benchmark_label": "NarrativeQA (n=91)", "field": "model_license", "field_label": "model license", "missing_rate": 1.0, "present_rate": 0.0, - "result_rows": 2399 + 
"result_rows": 91 }, { - "benchmark": "Other", - "benchmark_label": "Other (n=2,399)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.08711963318049187, - "present_rate": 0.9128803668195081, - "result_rows": 2399 + "benchmark": "helm_mmlu", + "benchmark_label": "helm_mmlu (n=2,844)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 2844 + }, + { + "benchmark": "helm_mmlu", + "benchmark_label": "helm_mmlu (n=2,844)", + "field": "generation_agentic_config_present", + "field_label": "agentic config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 2844 + }, + { + "benchmark": "helm_mmlu", + "benchmark_label": "helm_mmlu (n=2,844)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 2844 + }, + { + "benchmark": "helm_mmlu", + "benchmark_label": "helm_mmlu (n=2,844)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 2844 + }, + { + "benchmark": "helm_mmlu", + "benchmark_label": "helm_mmlu (n=2,844)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 2844 + }, + { + "benchmark": "helm_mmlu", + "benchmark_label": "helm_mmlu (n=2,844)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 2844 + }, + { + "benchmark": "helm_mmlu", + "benchmark_label": "helm_mmlu (n=2,844)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 2844 + }, + { + "benchmark": "helm_mmlu", + "benchmark_label": "helm_mmlu (n=2,844)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "present_rate": 
0.0, + "result_rows": 2844 + }, + { + "benchmark": "helm_mmlu", + "benchmark_label": "helm_mmlu (n=2,844)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 2844 + }, + { + "benchmark": "helm_mmlu", + "benchmark_label": "helm_mmlu (n=2,844)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 2844 + }, + { + "benchmark": "helm_mmlu", + "benchmark_label": "helm_mmlu (n=2,844)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 2844 + }, + { + "benchmark": "helm_mmlu", + "benchmark_label": "helm_mmlu (n=2,844)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 2844 + }, + { + "benchmark": "helm_mmlu", + "benchmark_label": "helm_mmlu (n=2,844)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 2844 + }, + { + "benchmark": "global-mmlu-lite", + "benchmark_label": "global-mmlu-lite (n=912)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 912 + }, + { + "benchmark": "global-mmlu-lite", + "benchmark_label": "global-mmlu-lite (n=912)", + "field": "generation_agentic_config_present", + "field_label": "agentic config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 912 + }, + { + "benchmark": "global-mmlu-lite", + "benchmark_label": "global-mmlu-lite (n=912)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 912 + }, + { + "benchmark": "global-mmlu-lite", + "benchmark_label": "global-mmlu-lite (n=912)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "present_rate": 0.0, + 
"result_rows": 912 + }, + { + "benchmark": "global-mmlu-lite", + "benchmark_label": "global-mmlu-lite (n=912)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 912 + }, + { + "benchmark": "global-mmlu-lite", + "benchmark_label": "global-mmlu-lite (n=912)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 912 + }, + { + "benchmark": "global-mmlu-lite", + "benchmark_label": "global-mmlu-lite (n=912)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 0.1578947368421053, + "present_rate": 0.8421052631578947, + "result_rows": 912 + }, + { + "benchmark": "global-mmlu-lite", + "benchmark_label": "global-mmlu-lite (n=912)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 912 + }, + { + "benchmark": "global-mmlu-lite", + "benchmark_label": "global-mmlu-lite (n=912)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 912 + }, + { + "benchmark": "global-mmlu-lite", + "benchmark_label": "global-mmlu-lite (n=912)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 912 + }, + { + "benchmark": "global-mmlu-lite", + "benchmark_label": "global-mmlu-lite (n=912)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 912 + }, + { + "benchmark": "global-mmlu-lite", + "benchmark_label": "global-mmlu-lite (n=912)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 912 + }, + { + "benchmark": "global-mmlu-lite", + "benchmark_label": "global-mmlu-lite (n=912)", + "field": "model_license", + "field_label": "model 
license", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 912 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1020 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "field": "generation_agentic_config_present", + "field_label": "agentic config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1020 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1020 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1020 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1020 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1020 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + 
"present_rate": 0.0, + "result_rows": 1020 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1020 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 1020 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 1020 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 1020 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 1020 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1020 + }, + { + "benchmark": "SciArena leaderboard API", + "benchmark_label": "SciArena leaderboard API (n=114)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 114 + }, + { + "benchmark": "SciArena leaderboard API", + "benchmark_label": "SciArena 
leaderboard API (n=114)", + "field": "generation_agentic_config_present", + "field_label": "agentic config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 114 + }, + { + "benchmark": "SciArena leaderboard API", + "benchmark_label": "SciArena leaderboard API (n=114)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 114 + }, + { + "benchmark": "SciArena leaderboard API", + "benchmark_label": "SciArena leaderboard API (n=114)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 114 + }, + { + "benchmark": "SciArena leaderboard API", + "benchmark_label": "SciArena leaderboard API (n=114)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 114 + }, + { + "benchmark": "SciArena leaderboard API", + "benchmark_label": "SciArena leaderboard API (n=114)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 114 + }, + { + "benchmark": "SciArena leaderboard API", + "benchmark_label": "SciArena leaderboard API (n=114)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 114 + }, + { + "benchmark": "SciArena leaderboard API", + "benchmark_label": "SciArena leaderboard API (n=114)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 114 + }, + { + "benchmark": "SciArena leaderboard API", + "benchmark_label": "SciArena leaderboard API (n=114)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 114 + }, + { + "benchmark": "SciArena leaderboard API", + "benchmark_label": "SciArena leaderboard API 
(n=114)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 114 + }, + { + "benchmark": "SciArena leaderboard API", + "benchmark_label": "SciArena leaderboard API (n=114)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 114 + }, + { + "benchmark": "SciArena leaderboard API", + "benchmark_label": "SciArena leaderboard API (n=114)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 114 + }, + { + "benchmark": "SciArena leaderboard API", + "benchmark_label": "SciArena leaderboard API (n=114)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 114 + }, + { + "benchmark": "BFCL leaderboard CSV", + "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 3350 + }, + { + "benchmark": "BFCL leaderboard CSV", + "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "field": "generation_agentic_config_present", + "field_label": "agentic config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 3350 + }, + { + "benchmark": "BFCL leaderboard CSV", + "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 3350 + }, + { + "benchmark": "BFCL leaderboard CSV", + "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 3350 + }, + { + "benchmark": "BFCL leaderboard CSV", + "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "field": "inference_engine", + "field_label": "inference engine/platform", + 
"missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 3350 + }, + { + "benchmark": "BFCL leaderboard CSV", + "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 3350 + }, + { + "benchmark": "BFCL leaderboard CSV", + "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 3350 + }, + { + "benchmark": "BFCL leaderboard CSV", + "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 3350 + }, + { + "benchmark": "BFCL leaderboard CSV", + "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 3350 + }, + { + "benchmark": "BFCL leaderboard CSV", + "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 3350 + }, + { + "benchmark": "BFCL leaderboard CSV", + "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 3350 + }, + { + "benchmark": "BFCL leaderboard CSV", + "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 3350 + }, + { + "benchmark": "BFCL leaderboard CSV", + "benchmark_label": "BFCL leaderboard CSV (n=3,350)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 3350 + }, + { + "benchmark": "wordle_arena_daily", + 
"benchmark_label": "wordle_arena_daily (n=92)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 92 + }, + { + "benchmark": "wordle_arena_daily", + "benchmark_label": "wordle_arena_daily (n=92)", + "field": "generation_agentic_config_present", + "field_label": "agentic config", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 92 + }, + { + "benchmark": "wordle_arena_daily", + "benchmark_label": "wordle_arena_daily (n=92)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 92 + }, + { + "benchmark": "wordle_arena_daily", + "benchmark_label": "wordle_arena_daily (n=92)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 92 + }, + { + "benchmark": "wordle_arena_daily", + "benchmark_label": "wordle_arena_daily (n=92)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 92 + }, + { + "benchmark": "wordle_arena_daily", + "benchmark_label": "wordle_arena_daily (n=92)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 92 + }, + { + "benchmark": "wordle_arena_daily", + "benchmark_label": "wordle_arena_daily (n=92)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 92 + }, + { + "benchmark": "wordle_arena_daily", + "benchmark_label": "wordle_arena_daily (n=92)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 92 + }, + { + "benchmark": "wordle_arena_daily", + "benchmark_label": "wordle_arena_daily (n=92)", + "field": "source_organization_url", + "field_label": "source org URL", + 
"missing_rate": 0.0, + "present_rate": 1.0, + "result_rows": 92 + }, + { + "benchmark": "wordle_arena_daily", + "benchmark_label": "wordle_arena_daily (n=92)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 92 + }, + { + "benchmark": "wordle_arena_daily", + "benchmark_label": "wordle_arena_daily (n=92)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 92 + }, + { + "benchmark": "wordle_arena_daily", + "benchmark_label": "wordle_arena_daily (n=92)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 92 + }, + { + "benchmark": "wordle_arena_daily", + "benchmark_label": "wordle_arena_daily (n=92)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 92 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=1,647)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 0.26047358834244083, + "present_rate": 0.7395264116575592, + "result_rows": 1647 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=1,647)", + "field": "generation_agentic_config_present", + "field_label": "agentic config", + "missing_rate": 0.9453551912568307, + "present_rate": 0.0546448087431694, + "result_rows": 1647 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=1,647)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 0.7625986642380085, + "present_rate": 0.2374013357619915, + "result_rows": 1647 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=1,647)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 0.7625986642380085, + "present_rate": 0.2374013357619915, + "result_rows": 1647 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=1,647)", 
+ "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.05768063145112323, + "present_rate": 0.9423193685488768, + "result_rows": 1647 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=1,647)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 0.567698846387371, + "present_rate": 0.432301153612629, + "result_rows": 1647 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=1,647)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 0.548876745598057, + "present_rate": 0.45112325440194295, + "result_rows": 1647 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=1,647)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 0.5725561627200971, + "present_rate": 0.42744383727990287, + "result_rows": 1647 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=1,647)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 0.44201578627808136, + "present_rate": 0.5579842137219186, + "result_rows": 1647 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=1,647)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 0.8099574984820886, + "present_rate": 0.19004250151791136, + "result_rows": 1647 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=1,647)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 0.8099574984820886, + "present_rate": 0.19004250151791136, + "result_rows": 1647 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=1,647)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1647 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=1,647)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "present_rate": 0.0, + "result_rows": 1647 + 
} + ], + "other_result_rows": 1647, + "top_benchmark_count": 20 + }, + "model_family_metadata_completeness": { + "field_group_order": [ + "eval metadata", + "benchmark metadata", + "model metadata" + ], + "fields": [ + { + "benchmark_stddev": 0.13379488104112633, + "group": "eval metadata", + "group_stddev": 0.13379488104112633, + "key": "generation_config_present", + "label": "generation config", + "missing_rate": 0.8809235708112113, + "selection_score": 0.11786306436301024 + }, + { + "benchmark_stddev": 0.07810422518841251, + "group": "eval metadata", + "group_stddev": 0.07810422518841251, + "key": "generation_max_tokens", + "label": "max tokens", + "missing_rate": 0.9880726015557476, + "selection_score": 0.0771726449744107 + }, + { + "benchmark_stddev": 0.07810422518841251, + "group": "eval metadata", + "group_stddev": 0.07810422518841251, + "key": "generation_temperature", + "label": "temperature", + "missing_rate": 0.9880726015557476, + "selection_score": 0.0771726449744107 + }, + { + "benchmark_stddev": 0.2624427571702142, + "group": "eval metadata", + "group_stddev": 0.2624427571702142, + "key": "inference_engine", + "label": "inference engine/platform", + "missing_rate": 0.1724410421039635, + "selection_score": 0.04525590253906917 + }, + { + "benchmark_stddev": 0.2850357391633949, + "group": "benchmark metadata", + "group_stddev": 0.2850357391633949, + "key": "source_organization_url", + "label": "source org URL", + "missing_rate": 0.7824175824175824, + "selection_score": 0.22301697393883207 + }, + { + "benchmark_stddev": 0.15209815212736208, + "group": "benchmark metadata", + "group_stddev": 0.15209815212736208, + "key": "metric_id", + "label": "metric ID", + "missing_rate": 0.8815409309791332, + "selection_score": 0.1340807466265606 + }, + { + "benchmark_stddev": 0.15209815212736208, + "group": "benchmark metadata", + "group_stddev": 0.15209815212736208, + "key": "metric_unit", + "label": "metric unit", + "missing_rate": 0.8815409309791332, + 
"selection_score": 0.1340807466265606 + }, + { + "benchmark_stddev": 0.14580449624851055, + "group": "benchmark metadata", + "group_stddev": 0.14580449624851055, + "key": "metric_kind", + "label": "metric kind", + "missing_rate": 0.8892702802815162, + "selection_score": 0.12965960524521825 + }, + { + "benchmark_stddev": 0.11349462538693254, + "group": "benchmark metadata", + "group_stddev": 0.11349462538693254, + "key": "has_uncertainty", + "label": "uncertainty", + "missing_rate": 0.9604148660328435, + "selection_score": 0.10900192543643858 + }, + { + "benchmark_stddev": 0.10935499889469129, + "group": "benchmark metadata", + "group_stddev": 0.10935499889469129, + "key": "detailed_results_file", + "label": "detailed results", + "missing_rate": 0.9803432522533646, + "selection_score": 0.10720543526658476 + }, + { + "benchmark_stddev": 0.10524043602312402, + "group": "benchmark metadata", + "group_stddev": 0.10524043602312402, + "key": "uncertainty_num_samples", + "label": "sample count", + "missing_rate": 0.9801456969996296, + "selection_score": 0.10315096051842983 + }, + { + "benchmark_stddev": 0.12620100124075465, + "group": "model metadata", + "group_stddev": 0.12620100124075465, + "key": "model_license", + "label": "model license", + "missing_rate": 0.9172737374984566, + "selection_score": 0.11576086408415438 + }, + { + "benchmark_stddev": 0.32502359224633115, + "group": "model metadata", + "group_stddev": 0.32502359224633115, + "key": "model_parameters", + "label": "model parameters", + "missing_rate": 0.32228670206198295, + "selection_score": 0.10475078163740877 + } + ], + "matrix": [ + { + "benchmark": "allenai", + "benchmark_label": "allenai (n=1,249)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 0.9151321056845476, + "model_family": "allenai", + "model_family_label": "allenai (n=1,249)", + "present_rate": 0.08486789431545236, + "result_rows": 1249 + }, + { + "benchmark": "allenai", + "benchmark_label": 
"allenai (n=1,249)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "model_family": "allenai", + "model_family_label": "allenai (n=1,249)", + "present_rate": 0.0, + "result_rows": 1249 + }, + { + "benchmark": "allenai", + "benchmark_label": "allenai (n=1,249)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "model_family": "allenai", + "model_family_label": "allenai (n=1,249)", + "present_rate": 0.0, + "result_rows": 1249 + }, + { + "benchmark": "allenai", + "benchmark_label": "allenai (n=1,249)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.833466773418735, + "model_family": "allenai", + "model_family_label": "allenai (n=1,249)", + "present_rate": 0.16653322658126501, + "result_rows": 1249 + }, + { + "benchmark": "allenai", + "benchmark_label": "allenai (n=1,249)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 0.16653322658126501, + "model_family": "allenai", + "model_family_label": "allenai (n=1,249)", + "present_rate": 0.833466773418735, + "result_rows": 1249 + }, + { + "benchmark": "allenai", + "benchmark_label": "allenai (n=1,249)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 0.9975980784627703, + "model_family": "allenai", + "model_family_label": "allenai (n=1,249)", + "present_rate": 0.0024019215372297837, + "result_rows": 1249 + }, + { + "benchmark": "allenai", + "benchmark_label": "allenai (n=1,249)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 0.9975980784627703, + "model_family": "allenai", + "model_family_label": "allenai (n=1,249)", + "present_rate": 0.0024019215372297837, + "result_rows": 1249 + }, + { + "benchmark": "allenai", + "benchmark_label": "allenai (n=1,249)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 0.9975980784627703, + "model_family": "allenai", + 
"model_family_label": "allenai (n=1,249)", + "present_rate": 0.0024019215372297837, + "result_rows": 1249 + }, + { + "benchmark": "allenai", + "benchmark_label": "allenai (n=1,249)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "model_family": "allenai", + "model_family_label": "allenai (n=1,249)", + "present_rate": 0.0, + "result_rows": 1249 + }, + { + "benchmark": "allenai", + "benchmark_label": "allenai (n=1,249)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "model_family": "allenai", + "model_family_label": "allenai (n=1,249)", + "present_rate": 0.0, + "result_rows": 1249 + }, + { + "benchmark": "allenai", + "benchmark_label": "allenai (n=1,249)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "model_family": "allenai", + "model_family_label": "allenai (n=1,249)", + "present_rate": 0.0, + "result_rows": 1249 + }, + { + "benchmark": "allenai", + "benchmark_label": "allenai (n=1,249)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "model_family": "allenai", + "model_family_label": "allenai (n=1,249)", + "present_rate": 0.0, + "result_rows": 1249 + }, + { + "benchmark": "allenai", + "benchmark_label": "allenai (n=1,249)", + "field": "model_parameters", + "field_label": "model parameters", + "missing_rate": 0.9183346677341874, + "model_family": "allenai", + "model_family_label": "allenai (n=1,249)", + "present_rate": 0.08166533226581266, + "result_rows": 1249 + }, + { + "benchmark": "DreadPoor", + "benchmark_label": "DreadPoor (n=714)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 1.0, + "model_family": "DreadPoor", + "model_family_label": "DreadPoor (n=714)", + "present_rate": 0.0, + "result_rows": 714 + }, + { + "benchmark": "DreadPoor", + "benchmark_label": "DreadPoor (n=714)", + "field": "generation_max_tokens", + "field_label": 
"max tokens", + "missing_rate": 1.0, + "model_family": "DreadPoor", + "model_family_label": "DreadPoor (n=714)", + "present_rate": 0.0, + "result_rows": 714 + }, + { + "benchmark": "DreadPoor", + "benchmark_label": "DreadPoor (n=714)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "model_family": "DreadPoor", + "model_family_label": "DreadPoor (n=714)", + "present_rate": 0.0, + "result_rows": 714 + }, + { + "benchmark": "DreadPoor", + "benchmark_label": "DreadPoor (n=714)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "model_family": "DreadPoor", + "model_family_label": "DreadPoor (n=714)", + "present_rate": 1.0, + "result_rows": 714 + }, + { + "benchmark": "DreadPoor", + "benchmark_label": "DreadPoor (n=714)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 1.0, + "model_family": "DreadPoor", + "model_family_label": "DreadPoor (n=714)", + "present_rate": 0.0, + "result_rows": 714 + }, + { + "benchmark": "DreadPoor", + "benchmark_label": "DreadPoor (n=714)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 1.0, + "model_family": "DreadPoor", + "model_family_label": "DreadPoor (n=714)", + "present_rate": 0.0, + "result_rows": 714 + }, + { + "benchmark": "DreadPoor", + "benchmark_label": "DreadPoor (n=714)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 1.0, + "model_family": "DreadPoor", + "model_family_label": "DreadPoor (n=714)", + "present_rate": 0.0, + "result_rows": 714 + }, + { + "benchmark": "DreadPoor", + "benchmark_label": "DreadPoor (n=714)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "model_family": "DreadPoor", + "model_family_label": "DreadPoor (n=714)", + "present_rate": 0.0, + "result_rows": 714 + }, + { + "benchmark": "DreadPoor", + "benchmark_label": "DreadPoor (n=714)", + "field": "has_uncertainty", + 
"field_label": "uncertainty", + "missing_rate": 1.0, + "model_family": "DreadPoor", + "model_family_label": "DreadPoor (n=714)", + "present_rate": 0.0, + "result_rows": 714 + }, + { + "benchmark": "DreadPoor", + "benchmark_label": "DreadPoor (n=714)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "model_family": "DreadPoor", + "model_family_label": "DreadPoor (n=714)", + "present_rate": 0.0, + "result_rows": 714 + }, + { + "benchmark": "DreadPoor", + "benchmark_label": "DreadPoor (n=714)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "model_family": "DreadPoor", + "model_family_label": "DreadPoor (n=714)", + "present_rate": 0.0, + "result_rows": 714 + }, + { + "benchmark": "DreadPoor", + "benchmark_label": "DreadPoor (n=714)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "model_family": "DreadPoor", + "model_family_label": "DreadPoor (n=714)", + "present_rate": 0.0, + "result_rows": 714 + }, + { + "benchmark": "DreadPoor", + "benchmark_label": "DreadPoor (n=714)", + "field": "model_parameters", + "field_label": "model parameters", + "missing_rate": 0.0, + "model_family": "DreadPoor", + "model_family_label": "DreadPoor (n=714)", + "present_rate": 1.0, + "result_rows": 714 + }, + { + "benchmark": "JayHyeon", + "benchmark_label": "JayHyeon (n=1,044)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 1.0, + "model_family": "JayHyeon", + "model_family_label": "JayHyeon (n=1,044)", + "present_rate": 0.0, + "result_rows": 1044 + }, + { + "benchmark": "JayHyeon", + "benchmark_label": "JayHyeon (n=1,044)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "model_family": "JayHyeon", + "model_family_label": "JayHyeon (n=1,044)", + "present_rate": 0.0, + "result_rows": 1044 + }, + { + "benchmark": "JayHyeon", + "benchmark_label": "JayHyeon 
(n=1,044)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "model_family": "JayHyeon", + "model_family_label": "JayHyeon (n=1,044)", + "present_rate": 0.0, + "result_rows": 1044 + }, + { + "benchmark": "JayHyeon", + "benchmark_label": "JayHyeon (n=1,044)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "model_family": "JayHyeon", + "model_family_label": "JayHyeon (n=1,044)", + "present_rate": 1.0, + "result_rows": 1044 + }, + { + "benchmark": "JayHyeon", + "benchmark_label": "JayHyeon (n=1,044)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 1.0, + "model_family": "JayHyeon", + "model_family_label": "JayHyeon (n=1,044)", + "present_rate": 0.0, + "result_rows": 1044 + }, + { + "benchmark": "JayHyeon", + "benchmark_label": "JayHyeon (n=1,044)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 1.0, + "model_family": "JayHyeon", + "model_family_label": "JayHyeon (n=1,044)", + "present_rate": 0.0, + "result_rows": 1044 + }, + { + "benchmark": "JayHyeon", + "benchmark_label": "JayHyeon (n=1,044)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 1.0, + "model_family": "JayHyeon", + "model_family_label": "JayHyeon (n=1,044)", + "present_rate": 0.0, + "result_rows": 1044 + }, + { + "benchmark": "JayHyeon", + "benchmark_label": "JayHyeon (n=1,044)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "model_family": "JayHyeon", + "model_family_label": "JayHyeon (n=1,044)", + "present_rate": 0.0, + "result_rows": 1044 + }, + { + "benchmark": "JayHyeon", + "benchmark_label": "JayHyeon (n=1,044)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "model_family": "JayHyeon", + "model_family_label": "JayHyeon (n=1,044)", + "present_rate": 0.0, + "result_rows": 1044 + }, + { + "benchmark": "JayHyeon", + 
"benchmark_label": "JayHyeon (n=1,044)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "model_family": "JayHyeon", + "model_family_label": "JayHyeon (n=1,044)", + "present_rate": 0.0, + "result_rows": 1044 + }, + { + "benchmark": "JayHyeon", + "benchmark_label": "JayHyeon (n=1,044)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "model_family": "JayHyeon", + "model_family_label": "JayHyeon (n=1,044)", + "present_rate": 0.0, + "result_rows": 1044 + }, + { + "benchmark": "JayHyeon", + "benchmark_label": "JayHyeon (n=1,044)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "model_family": "JayHyeon", + "model_family_label": "JayHyeon (n=1,044)", + "present_rate": 0.0, + "result_rows": 1044 + }, + { + "benchmark": "JayHyeon", + "benchmark_label": "JayHyeon (n=1,044)", + "field": "model_parameters", + "field_label": "model parameters", + "missing_rate": 0.0, + "model_family": "JayHyeon", + "model_family_label": "JayHyeon (n=1,044)", + "present_rate": 1.0, + "result_rows": 1044 + }, + { + "benchmark": "LeroyDyer", + "benchmark_label": "LeroyDyer (n=360)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 1.0, + "model_family": "LeroyDyer", + "model_family_label": "LeroyDyer (n=360)", + "present_rate": 0.0, + "result_rows": 360 + }, + { + "benchmark": "LeroyDyer", + "benchmark_label": "LeroyDyer (n=360)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "model_family": "LeroyDyer", + "model_family_label": "LeroyDyer (n=360)", + "present_rate": 0.0, + "result_rows": 360 + }, + { + "benchmark": "LeroyDyer", + "benchmark_label": "LeroyDyer (n=360)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "model_family": "LeroyDyer", + "model_family_label": "LeroyDyer (n=360)", + "present_rate": 0.0, + 
"result_rows": 360 + }, + { + "benchmark": "LeroyDyer", + "benchmark_label": "LeroyDyer (n=360)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "model_family": "LeroyDyer", + "model_family_label": "LeroyDyer (n=360)", + "present_rate": 1.0, + "result_rows": 360 + }, + { + "benchmark": "LeroyDyer", + "benchmark_label": "LeroyDyer (n=360)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 1.0, + "model_family": "LeroyDyer", + "model_family_label": "LeroyDyer (n=360)", + "present_rate": 0.0, + "result_rows": 360 + }, + { + "benchmark": "LeroyDyer", + "benchmark_label": "LeroyDyer (n=360)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 1.0, + "model_family": "LeroyDyer", + "model_family_label": "LeroyDyer (n=360)", + "present_rate": 0.0, + "result_rows": 360 + }, + { + "benchmark": "LeroyDyer", + "benchmark_label": "LeroyDyer (n=360)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 1.0, + "model_family": "LeroyDyer", + "model_family_label": "LeroyDyer (n=360)", + "present_rate": 0.0, + "result_rows": 360 + }, + { + "benchmark": "LeroyDyer", + "benchmark_label": "LeroyDyer (n=360)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "model_family": "LeroyDyer", + "model_family_label": "LeroyDyer (n=360)", + "present_rate": 0.0, + "result_rows": 360 + }, + { + "benchmark": "LeroyDyer", + "benchmark_label": "LeroyDyer (n=360)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "model_family": "LeroyDyer", + "model_family_label": "LeroyDyer (n=360)", + "present_rate": 0.0, + "result_rows": 360 + }, + { + "benchmark": "LeroyDyer", + "benchmark_label": "LeroyDyer (n=360)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "model_family": "LeroyDyer", + "model_family_label": "LeroyDyer (n=360)", + 
"present_rate": 0.0, + "result_rows": 360 + }, + { + "benchmark": "LeroyDyer", + "benchmark_label": "LeroyDyer (n=360)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "model_family": "LeroyDyer", + "model_family_label": "LeroyDyer (n=360)", + "present_rate": 0.0, + "result_rows": 360 + }, + { + "benchmark": "LeroyDyer", + "benchmark_label": "LeroyDyer (n=360)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "model_family": "LeroyDyer", + "model_family_label": "LeroyDyer (n=360)", + "present_rate": 0.0, + "result_rows": 360 + }, + { + "benchmark": "LeroyDyer", + "benchmark_label": "LeroyDyer (n=360)", + "field": "model_parameters", + "field_label": "model parameters", + "missing_rate": 0.0, + "model_family": "LeroyDyer", + "model_family_label": "LeroyDyer (n=360)", + "present_rate": 1.0, + "result_rows": 360 + }, + { + "benchmark": "Quazim0t0", + "benchmark_label": "Quazim0t0 (n=426)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 1.0, + "model_family": "Quazim0t0", + "model_family_label": "Quazim0t0 (n=426)", + "present_rate": 0.0, + "result_rows": 426 + }, + { + "benchmark": "Quazim0t0", + "benchmark_label": "Quazim0t0 (n=426)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "model_family": "Quazim0t0", + "model_family_label": "Quazim0t0 (n=426)", + "present_rate": 0.0, + "result_rows": 426 + }, + { + "benchmark": "Quazim0t0", + "benchmark_label": "Quazim0t0 (n=426)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "model_family": "Quazim0t0", + "model_family_label": "Quazim0t0 (n=426)", + "present_rate": 0.0, + "result_rows": 426 + }, + { + "benchmark": "Quazim0t0", + "benchmark_label": "Quazim0t0 (n=426)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "model_family": "Quazim0t0", 
+ "model_family_label": "Quazim0t0 (n=426)", + "present_rate": 1.0, + "result_rows": 426 + }, + { + "benchmark": "Quazim0t0", + "benchmark_label": "Quazim0t0 (n=426)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 1.0, + "model_family": "Quazim0t0", + "model_family_label": "Quazim0t0 (n=426)", + "present_rate": 0.0, + "result_rows": 426 + }, + { + "benchmark": "Quazim0t0", + "benchmark_label": "Quazim0t0 (n=426)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 1.0, + "model_family": "Quazim0t0", + "model_family_label": "Quazim0t0 (n=426)", + "present_rate": 0.0, + "result_rows": 426 + }, + { + "benchmark": "Quazim0t0", + "benchmark_label": "Quazim0t0 (n=426)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 1.0, + "model_family": "Quazim0t0", + "model_family_label": "Quazim0t0 (n=426)", + "present_rate": 0.0, + "result_rows": 426 + }, + { + "benchmark": "Quazim0t0", + "benchmark_label": "Quazim0t0 (n=426)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "model_family": "Quazim0t0", + "model_family_label": "Quazim0t0 (n=426)", + "present_rate": 0.0, + "result_rows": 426 + }, + { + "benchmark": "Quazim0t0", + "benchmark_label": "Quazim0t0 (n=426)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "model_family": "Quazim0t0", + "model_family_label": "Quazim0t0 (n=426)", + "present_rate": 0.0, + "result_rows": 426 + }, + { + "benchmark": "Quazim0t0", + "benchmark_label": "Quazim0t0 (n=426)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "model_family": "Quazim0t0", + "model_family_label": "Quazim0t0 (n=426)", + "present_rate": 0.0, + "result_rows": 426 + }, + { + "benchmark": "Quazim0t0", + "benchmark_label": "Quazim0t0 (n=426)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "model_family": 
"Quazim0t0", + "model_family_label": "Quazim0t0 (n=426)", + "present_rate": 0.0, + "result_rows": 426 + }, + { + "benchmark": "Quazim0t0", + "benchmark_label": "Quazim0t0 (n=426)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "model_family": "Quazim0t0", + "model_family_label": "Quazim0t0 (n=426)", + "present_rate": 0.0, + "result_rows": 426 + }, + { + "benchmark": "Quazim0t0", + "benchmark_label": "Quazim0t0 (n=426)", + "field": "model_parameters", + "field_label": "model parameters", + "missing_rate": 0.0, + "model_family": "Quazim0t0", + "model_family_label": "Quazim0t0 (n=426)", + "present_rate": 1.0, + "result_rows": 426 + }, + { + "benchmark": "Sakalti", + "benchmark_label": "Sakalti (n=396)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 1.0, + "model_family": "Sakalti", + "model_family_label": "Sakalti (n=396)", + "present_rate": 0.0, + "result_rows": 396 + }, + { + "benchmark": "Sakalti", + "benchmark_label": "Sakalti (n=396)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "model_family": "Sakalti", + "model_family_label": "Sakalti (n=396)", + "present_rate": 0.0, + "result_rows": 396 + }, + { + "benchmark": "Sakalti", + "benchmark_label": "Sakalti (n=396)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "model_family": "Sakalti", + "model_family_label": "Sakalti (n=396)", + "present_rate": 0.0, + "result_rows": 396 + }, + { + "benchmark": "Sakalti", + "benchmark_label": "Sakalti (n=396)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "model_family": "Sakalti", + "model_family_label": "Sakalti (n=396)", + "present_rate": 1.0, + "result_rows": 396 + }, + { + "benchmark": "Sakalti", + "benchmark_label": "Sakalti (n=396)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 1.0, + 
"model_family": "Sakalti", + "model_family_label": "Sakalti (n=396)", + "present_rate": 0.0, + "result_rows": 396 + }, + { + "benchmark": "Sakalti", + "benchmark_label": "Sakalti (n=396)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 1.0, + "model_family": "Sakalti", + "model_family_label": "Sakalti (n=396)", + "present_rate": 0.0, + "result_rows": 396 + }, + { + "benchmark": "Sakalti", + "benchmark_label": "Sakalti (n=396)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 1.0, + "model_family": "Sakalti", + "model_family_label": "Sakalti (n=396)", + "present_rate": 0.0, + "result_rows": 396 + }, + { + "benchmark": "Sakalti", + "benchmark_label": "Sakalti (n=396)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "model_family": "Sakalti", + "model_family_label": "Sakalti (n=396)", + "present_rate": 0.0, + "result_rows": 396 + }, + { + "benchmark": "Sakalti", + "benchmark_label": "Sakalti (n=396)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "model_family": "Sakalti", + "model_family_label": "Sakalti (n=396)", + "present_rate": 0.0, + "result_rows": 396 + }, + { + "benchmark": "Sakalti", + "benchmark_label": "Sakalti (n=396)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "model_family": "Sakalti", + "model_family_label": "Sakalti (n=396)", + "present_rate": 0.0, + "result_rows": 396 + }, + { + "benchmark": "Sakalti", + "benchmark_label": "Sakalti (n=396)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "model_family": "Sakalti", + "model_family_label": "Sakalti (n=396)", + "present_rate": 0.0, + "result_rows": 396 + }, + { + "benchmark": "Sakalti", + "benchmark_label": "Sakalti (n=396)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "model_family": "Sakalti", + "model_family_label": "Sakalti 
(n=396)", + "present_rate": 0.0, + "result_rows": 396 + }, + { + "benchmark": "Sakalti", + "benchmark_label": "Sakalti (n=396)", + "field": "model_parameters", + "field_label": "model parameters", + "missing_rate": 0.0, + "model_family": "Sakalti", + "model_family_label": "Sakalti (n=396)", + "present_rate": 1.0, + "result_rows": 396 + }, + { + "benchmark": "Triangle104", + "benchmark_label": "Triangle104 (n=366)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 1.0, + "model_family": "Triangle104", + "model_family_label": "Triangle104 (n=366)", + "present_rate": 0.0, + "result_rows": 366 + }, + { + "benchmark": "Triangle104", + "benchmark_label": "Triangle104 (n=366)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "model_family": "Triangle104", + "model_family_label": "Triangle104 (n=366)", + "present_rate": 0.0, + "result_rows": 366 + }, + { + "benchmark": "Triangle104", + "benchmark_label": "Triangle104 (n=366)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "model_family": "Triangle104", + "model_family_label": "Triangle104 (n=366)", + "present_rate": 0.0, + "result_rows": 366 + }, + { + "benchmark": "Triangle104", + "benchmark_label": "Triangle104 (n=366)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "model_family": "Triangle104", + "model_family_label": "Triangle104 (n=366)", + "present_rate": 1.0, + "result_rows": 366 + }, + { + "benchmark": "Triangle104", + "benchmark_label": "Triangle104 (n=366)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 1.0, + "model_family": "Triangle104", + "model_family_label": "Triangle104 (n=366)", + "present_rate": 0.0, + "result_rows": 366 + }, + { + "benchmark": "Triangle104", + "benchmark_label": "Triangle104 (n=366)", + "field": "metric_id", + "field_label": "metric ID", + 
"missing_rate": 1.0, + "model_family": "Triangle104", + "model_family_label": "Triangle104 (n=366)", + "present_rate": 0.0, + "result_rows": 366 + }, + { + "benchmark": "Triangle104", + "benchmark_label": "Triangle104 (n=366)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 1.0, + "model_family": "Triangle104", + "model_family_label": "Triangle104 (n=366)", + "present_rate": 0.0, + "result_rows": 366 + }, + { + "benchmark": "Triangle104", + "benchmark_label": "Triangle104 (n=366)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "model_family": "Triangle104", + "model_family_label": "Triangle104 (n=366)", + "present_rate": 0.0, + "result_rows": 366 + }, + { + "benchmark": "Triangle104", + "benchmark_label": "Triangle104 (n=366)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "model_family": "Triangle104", + "model_family_label": "Triangle104 (n=366)", + "present_rate": 0.0, + "result_rows": 366 + }, + { + "benchmark": "Triangle104", + "benchmark_label": "Triangle104 (n=366)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "model_family": "Triangle104", + "model_family_label": "Triangle104 (n=366)", + "present_rate": 0.0, + "result_rows": 366 + }, + { + "benchmark": "Triangle104", + "benchmark_label": "Triangle104 (n=366)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "model_family": "Triangle104", + "model_family_label": "Triangle104 (n=366)", + "present_rate": 0.0, + "result_rows": 366 + }, + { + "benchmark": "Triangle104", + "benchmark_label": "Triangle104 (n=366)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "model_family": "Triangle104", + "model_family_label": "Triangle104 (n=366)", + "present_rate": 0.0, + "result_rows": 366 + }, + { + "benchmark": "Triangle104", + "benchmark_label": "Triangle104 (n=366)", + 
"field": "model_parameters", + "field_label": "model parameters", + "missing_rate": 0.0, + "model_family": "Triangle104", + "model_family_label": "Triangle104 (n=366)", + "present_rate": 1.0, + "result_rows": 366 + }, + { + "benchmark": "allknowingroger", + "benchmark_label": "allknowingroger (n=528)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 1.0, + "model_family": "allknowingroger", + "model_family_label": "allknowingroger (n=528)", + "present_rate": 0.0, + "result_rows": 528 + }, + { + "benchmark": "allknowingroger", + "benchmark_label": "allknowingroger (n=528)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "model_family": "allknowingroger", + "model_family_label": "allknowingroger (n=528)", + "present_rate": 0.0, + "result_rows": 528 + }, + { + "benchmark": "allknowingroger", + "benchmark_label": "allknowingroger (n=528)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "model_family": "allknowingroger", + "model_family_label": "allknowingroger (n=528)", + "present_rate": 0.0, + "result_rows": 528 + }, + { + "benchmark": "allknowingroger", + "benchmark_label": "allknowingroger (n=528)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "model_family": "allknowingroger", + "model_family_label": "allknowingroger (n=528)", + "present_rate": 1.0, + "result_rows": 528 + }, + { + "benchmark": "allknowingroger", + "benchmark_label": "allknowingroger (n=528)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 1.0, + "model_family": "allknowingroger", + "model_family_label": "allknowingroger (n=528)", + "present_rate": 0.0, + "result_rows": 528 + }, + { + "benchmark": "allknowingroger", + "benchmark_label": "allknowingroger (n=528)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 1.0, + "model_family": 
"allknowingroger", + "model_family_label": "allknowingroger (n=528)", + "present_rate": 0.0, + "result_rows": 528 + }, + { + "benchmark": "allknowingroger", + "benchmark_label": "allknowingroger (n=528)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 1.0, + "model_family": "allknowingroger", + "model_family_label": "allknowingroger (n=528)", + "present_rate": 0.0, + "result_rows": 528 + }, + { + "benchmark": "allknowingroger", + "benchmark_label": "allknowingroger (n=528)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "model_family": "allknowingroger", + "model_family_label": "allknowingroger (n=528)", + "present_rate": 0.0, + "result_rows": 528 + }, + { + "benchmark": "allknowingroger", + "benchmark_label": "allknowingroger (n=528)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "model_family": "allknowingroger", + "model_family_label": "allknowingroger (n=528)", + "present_rate": 0.0, + "result_rows": 528 + }, + { + "benchmark": "allknowingroger", + "benchmark_label": "allknowingroger (n=528)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "model_family": "allknowingroger", + "model_family_label": "allknowingroger (n=528)", + "present_rate": 0.0, + "result_rows": 528 + }, + { + "benchmark": "allknowingroger", + "benchmark_label": "allknowingroger (n=528)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "model_family": "allknowingroger", + "model_family_label": "allknowingroger (n=528)", + "present_rate": 0.0, + "result_rows": 528 + }, + { + "benchmark": "allknowingroger", + "benchmark_label": "allknowingroger (n=528)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "model_family": "allknowingroger", + "model_family_label": "allknowingroger (n=528)", + "present_rate": 0.0, + "result_rows": 528 + }, + { + "benchmark": 
"allknowingroger", + "benchmark_label": "allknowingroger (n=528)", + "field": "model_parameters", + "field_label": "model parameters", + "missing_rate": 0.0, + "model_family": "allknowingroger", + "model_family_label": "allknowingroger (n=528)", + "present_rate": 1.0, + "result_rows": 528 + }, + { + "benchmark": "bunnycore", + "benchmark_label": "bunnycore (n=516)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 1.0, + "model_family": "bunnycore", + "model_family_label": "bunnycore (n=516)", + "present_rate": 0.0, + "result_rows": 516 + }, + { + "benchmark": "bunnycore", + "benchmark_label": "bunnycore (n=516)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "model_family": "bunnycore", + "model_family_label": "bunnycore (n=516)", + "present_rate": 0.0, + "result_rows": 516 + }, + { + "benchmark": "bunnycore", + "benchmark_label": "bunnycore (n=516)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "model_family": "bunnycore", + "model_family_label": "bunnycore (n=516)", + "present_rate": 0.0, + "result_rows": 516 + }, + { + "benchmark": "bunnycore", + "benchmark_label": "bunnycore (n=516)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "model_family": "bunnycore", + "model_family_label": "bunnycore (n=516)", + "present_rate": 1.0, + "result_rows": 516 + }, + { + "benchmark": "bunnycore", + "benchmark_label": "bunnycore (n=516)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 1.0, + "model_family": "bunnycore", + "model_family_label": "bunnycore (n=516)", + "present_rate": 0.0, + "result_rows": 516 + }, + { + "benchmark": "bunnycore", + "benchmark_label": "bunnycore (n=516)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 1.0, + "model_family": "bunnycore", + "model_family_label": "bunnycore (n=516)", + 
"present_rate": 0.0, + "result_rows": 516 + }, + { + "benchmark": "bunnycore", + "benchmark_label": "bunnycore (n=516)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 1.0, + "model_family": "bunnycore", + "model_family_label": "bunnycore (n=516)", + "present_rate": 0.0, + "result_rows": 516 + }, + { + "benchmark": "bunnycore", + "benchmark_label": "bunnycore (n=516)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "model_family": "bunnycore", + "model_family_label": "bunnycore (n=516)", + "present_rate": 0.0, + "result_rows": 516 + }, + { + "benchmark": "bunnycore", + "benchmark_label": "bunnycore (n=516)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "model_family": "bunnycore", + "model_family_label": "bunnycore (n=516)", + "present_rate": 0.0, + "result_rows": 516 + }, + { + "benchmark": "bunnycore", + "benchmark_label": "bunnycore (n=516)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "model_family": "bunnycore", + "model_family_label": "bunnycore (n=516)", + "present_rate": 0.0, + "result_rows": 516 + }, + { + "benchmark": "bunnycore", + "benchmark_label": "bunnycore (n=516)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "model_family": "bunnycore", + "model_family_label": "bunnycore (n=516)", + "present_rate": 0.0, + "result_rows": 516 + }, + { + "benchmark": "bunnycore", + "benchmark_label": "bunnycore (n=516)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "model_family": "bunnycore", + "model_family_label": "bunnycore (n=516)", + "present_rate": 0.0, + "result_rows": 516 + }, + { + "benchmark": "bunnycore", + "benchmark_label": "bunnycore (n=516)", + "field": "model_parameters", + "field_label": "model parameters", + "missing_rate": 0.0, + "model_family": "bunnycore", + "model_family_label": "bunnycore 
(n=516)", + "present_rate": 1.0, + "result_rows": 516 + }, + { + "benchmark": "icefog72", + "benchmark_label": "icefog72 (n=372)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 1.0, + "model_family": "icefog72", + "model_family_label": "icefog72 (n=372)", + "present_rate": 0.0, + "result_rows": 372 + }, + { + "benchmark": "icefog72", + "benchmark_label": "icefog72 (n=372)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "model_family": "icefog72", + "model_family_label": "icefog72 (n=372)", + "present_rate": 0.0, + "result_rows": 372 + }, + { + "benchmark": "icefog72", + "benchmark_label": "icefog72 (n=372)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "model_family": "icefog72", + "model_family_label": "icefog72 (n=372)", + "present_rate": 0.0, + "result_rows": 372 + }, + { + "benchmark": "icefog72", + "benchmark_label": "icefog72 (n=372)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "model_family": "icefog72", + "model_family_label": "icefog72 (n=372)", + "present_rate": 1.0, + "result_rows": 372 + }, + { + "benchmark": "icefog72", + "benchmark_label": "icefog72 (n=372)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 1.0, + "model_family": "icefog72", + "model_family_label": "icefog72 (n=372)", + "present_rate": 0.0, + "result_rows": 372 + }, + { + "benchmark": "icefog72", + "benchmark_label": "icefog72 (n=372)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 1.0, + "model_family": "icefog72", + "model_family_label": "icefog72 (n=372)", + "present_rate": 0.0, + "result_rows": 372 + }, + { + "benchmark": "icefog72", + "benchmark_label": "icefog72 (n=372)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 1.0, + "model_family": "icefog72", + "model_family_label": 
"icefog72 (n=372)", + "present_rate": 0.0, + "result_rows": 372 + }, + { + "benchmark": "icefog72", + "benchmark_label": "icefog72 (n=372)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "model_family": "icefog72", + "model_family_label": "icefog72 (n=372)", + "present_rate": 0.0, + "result_rows": 372 + }, + { + "benchmark": "icefog72", + "benchmark_label": "icefog72 (n=372)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "model_family": "icefog72", + "model_family_label": "icefog72 (n=372)", + "present_rate": 0.0, + "result_rows": 372 + }, + { + "benchmark": "icefog72", + "benchmark_label": "icefog72 (n=372)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "model_family": "icefog72", + "model_family_label": "icefog72 (n=372)", + "present_rate": 0.0, + "result_rows": 372 + }, + { + "benchmark": "icefog72", + "benchmark_label": "icefog72 (n=372)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "model_family": "icefog72", + "model_family_label": "icefog72 (n=372)", + "present_rate": 0.0, + "result_rows": 372 + }, + { + "benchmark": "icefog72", + "benchmark_label": "icefog72 (n=372)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "model_family": "icefog72", + "model_family_label": "icefog72 (n=372)", + "present_rate": 0.0, + "result_rows": 372 + }, + { + "benchmark": "icefog72", + "benchmark_label": "icefog72 (n=372)", + "field": "model_parameters", + "field_label": "model parameters", + "missing_rate": 0.0, + "model_family": "icefog72", + "model_family_label": "icefog72 (n=372)", + "present_rate": 1.0, + "result_rows": 372 + }, + { + "benchmark": "jaspionjader", + "benchmark_label": "jaspionjader (n=1,182)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 1.0, + "model_family": "jaspionjader", + 
"model_family_label": "jaspionjader (n=1,182)", + "present_rate": 0.0, + "result_rows": 1182 + }, + { + "benchmark": "jaspionjader", + "benchmark_label": "jaspionjader (n=1,182)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "model_family": "jaspionjader", + "model_family_label": "jaspionjader (n=1,182)", + "present_rate": 0.0, + "result_rows": 1182 + }, + { + "benchmark": "jaspionjader", + "benchmark_label": "jaspionjader (n=1,182)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "model_family": "jaspionjader", + "model_family_label": "jaspionjader (n=1,182)", + "present_rate": 0.0, + "result_rows": 1182 + }, + { + "benchmark": "jaspionjader", + "benchmark_label": "jaspionjader (n=1,182)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "model_family": "jaspionjader", + "model_family_label": "jaspionjader (n=1,182)", + "present_rate": 1.0, + "result_rows": 1182 + }, + { + "benchmark": "jaspionjader", + "benchmark_label": "jaspionjader (n=1,182)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 1.0, + "model_family": "jaspionjader", + "model_family_label": "jaspionjader (n=1,182)", + "present_rate": 0.0, + "result_rows": 1182 + }, + { + "benchmark": "jaspionjader", + "benchmark_label": "jaspionjader (n=1,182)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 1.0, + "model_family": "jaspionjader", + "model_family_label": "jaspionjader (n=1,182)", + "present_rate": 0.0, + "result_rows": 1182 + }, + { + "benchmark": "jaspionjader", + "benchmark_label": "jaspionjader (n=1,182)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 1.0, + "model_family": "jaspionjader", + "model_family_label": "jaspionjader (n=1,182)", + "present_rate": 0.0, + "result_rows": 1182 + }, + { + "benchmark": "jaspionjader", + "benchmark_label": "jaspionjader 
(n=1,182)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "model_family": "jaspionjader", + "model_family_label": "jaspionjader (n=1,182)", + "present_rate": 0.0, + "result_rows": 1182 + }, + { + "benchmark": "jaspionjader", + "benchmark_label": "jaspionjader (n=1,182)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "model_family": "jaspionjader", + "model_family_label": "jaspionjader (n=1,182)", + "present_rate": 0.0, + "result_rows": 1182 + }, + { + "benchmark": "jaspionjader", + "benchmark_label": "jaspionjader (n=1,182)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "model_family": "jaspionjader", + "model_family_label": "jaspionjader (n=1,182)", + "present_rate": 0.0, + "result_rows": 1182 + }, + { + "benchmark": "jaspionjader", + "benchmark_label": "jaspionjader (n=1,182)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "model_family": "jaspionjader", + "model_family_label": "jaspionjader (n=1,182)", + "present_rate": 0.0, + "result_rows": 1182 + }, + { + "benchmark": "jaspionjader", + "benchmark_label": "jaspionjader (n=1,182)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "model_family": "jaspionjader", + "model_family_label": "jaspionjader (n=1,182)", + "present_rate": 0.0, + "result_rows": 1182 + }, + { + "benchmark": "jaspionjader", + "benchmark_label": "jaspionjader (n=1,182)", + "field": "model_parameters", + "field_label": "model parameters", + "missing_rate": 0.0, + "model_family": "jaspionjader", + "model_family_label": "jaspionjader (n=1,182)", + "present_rate": 1.0, + "result_rows": 1182 + }, + { + "benchmark": "prithivMLmods", + "benchmark_label": "prithivMLmods (n=666)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 1.0, + "model_family": "prithivMLmods", + "model_family_label": 
"prithivMLmods (n=666)", + "present_rate": 0.0, + "result_rows": 666 + }, + { + "benchmark": "prithivMLmods", + "benchmark_label": "prithivMLmods (n=666)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "model_family": "prithivMLmods", + "model_family_label": "prithivMLmods (n=666)", + "present_rate": 0.0, + "result_rows": 666 + }, + { + "benchmark": "prithivMLmods", + "benchmark_label": "prithivMLmods (n=666)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "model_family": "prithivMLmods", + "model_family_label": "prithivMLmods (n=666)", + "present_rate": 0.0, + "result_rows": 666 + }, + { + "benchmark": "prithivMLmods", + "benchmark_label": "prithivMLmods (n=666)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "model_family": "prithivMLmods", + "model_family_label": "prithivMLmods (n=666)", + "present_rate": 1.0, + "result_rows": 666 + }, + { + "benchmark": "prithivMLmods", + "benchmark_label": "prithivMLmods (n=666)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 1.0, + "model_family": "prithivMLmods", + "model_family_label": "prithivMLmods (n=666)", + "present_rate": 0.0, + "result_rows": 666 + }, + { + "benchmark": "prithivMLmods", + "benchmark_label": "prithivMLmods (n=666)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 1.0, + "model_family": "prithivMLmods", + "model_family_label": "prithivMLmods (n=666)", + "present_rate": 0.0, + "result_rows": 666 + }, + { + "benchmark": "prithivMLmods", + "benchmark_label": "prithivMLmods (n=666)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 1.0, + "model_family": "prithivMLmods", + "model_family_label": "prithivMLmods (n=666)", + "present_rate": 0.0, + "result_rows": 666 + }, + { + "benchmark": "prithivMLmods", + "benchmark_label": "prithivMLmods (n=666)", + "field": 
"metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "model_family": "prithivMLmods", + "model_family_label": "prithivMLmods (n=666)", + "present_rate": 0.0, + "result_rows": 666 + }, + { + "benchmark": "prithivMLmods", + "benchmark_label": "prithivMLmods (n=666)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "model_family": "prithivMLmods", + "model_family_label": "prithivMLmods (n=666)", + "present_rate": 0.0, + "result_rows": 666 + }, + { + "benchmark": "prithivMLmods", + "benchmark_label": "prithivMLmods (n=666)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "model_family": "prithivMLmods", + "model_family_label": "prithivMLmods (n=666)", + "present_rate": 0.0, + "result_rows": 666 + }, + { + "benchmark": "prithivMLmods", + "benchmark_label": "prithivMLmods (n=666)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "model_family": "prithivMLmods", + "model_family_label": "prithivMLmods (n=666)", + "present_rate": 0.0, + "result_rows": 666 + }, + { + "benchmark": "prithivMLmods", + "benchmark_label": "prithivMLmods (n=666)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "model_family": "prithivMLmods", + "model_family_label": "prithivMLmods (n=666)", + "present_rate": 0.0, + "result_rows": 666 + }, + { + "benchmark": "prithivMLmods", + "benchmark_label": "prithivMLmods (n=666)", + "field": "model_parameters", + "field_label": "model parameters", + "missing_rate": 0.0, + "model_family": "prithivMLmods", + "model_family_label": "prithivMLmods (n=666)", + "present_rate": 1.0, + "result_rows": 666 + }, + { + "benchmark": "zelk12", + "benchmark_label": "zelk12 (n=468)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 1.0, + "model_family": "zelk12", + "model_family_label": "zelk12 (n=468)", + "present_rate": 0.0, + 
"result_rows": 468 + }, + { + "benchmark": "zelk12", + "benchmark_label": "zelk12 (n=468)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "model_family": "zelk12", + "model_family_label": "zelk12 (n=468)", + "present_rate": 0.0, + "result_rows": 468 + }, + { + "benchmark": "zelk12", + "benchmark_label": "zelk12 (n=468)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "model_family": "zelk12", + "model_family_label": "zelk12 (n=468)", + "present_rate": 0.0, + "result_rows": 468 + }, + { + "benchmark": "zelk12", + "benchmark_label": "zelk12 (n=468)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.0, + "model_family": "zelk12", + "model_family_label": "zelk12 (n=468)", + "present_rate": 1.0, + "result_rows": 468 + }, + { + "benchmark": "zelk12", + "benchmark_label": "zelk12 (n=468)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 1.0, + "model_family": "zelk12", + "model_family_label": "zelk12 (n=468)", + "present_rate": 0.0, + "result_rows": 468 + }, + { + "benchmark": "zelk12", + "benchmark_label": "zelk12 (n=468)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 1.0, + "model_family": "zelk12", + "model_family_label": "zelk12 (n=468)", + "present_rate": 0.0, + "result_rows": 468 + }, + { + "benchmark": "zelk12", + "benchmark_label": "zelk12 (n=468)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 1.0, + "model_family": "zelk12", + "model_family_label": "zelk12 (n=468)", + "present_rate": 0.0, + "result_rows": 468 + }, + { + "benchmark": "zelk12", + "benchmark_label": "zelk12 (n=468)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "model_family": "zelk12", + "model_family_label": "zelk12 (n=468)", + "present_rate": 0.0, + "result_rows": 468 + }, + { + "benchmark": "zelk12", + "benchmark_label": 
"zelk12 (n=468)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "model_family": "zelk12", + "model_family_label": "zelk12 (n=468)", + "present_rate": 0.0, + "result_rows": 468 + }, + { + "benchmark": "zelk12", + "benchmark_label": "zelk12 (n=468)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "model_family": "zelk12", + "model_family_label": "zelk12 (n=468)", + "present_rate": 0.0, + "result_rows": 468 + }, + { + "benchmark": "zelk12", + "benchmark_label": "zelk12 (n=468)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "model_family": "zelk12", + "model_family_label": "zelk12 (n=468)", + "present_rate": 0.0, + "result_rows": 468 + }, + { + "benchmark": "zelk12", + "benchmark_label": "zelk12 (n=468)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "model_family": "zelk12", + "model_family_label": "zelk12 (n=468)", + "present_rate": 0.0, + "result_rows": 468 + }, + { + "benchmark": "zelk12", + "benchmark_label": "zelk12 (n=468)", + "field": "model_parameters", + "field_label": "model parameters", + "missing_rate": 0.0, + "model_family": "zelk12", + "model_family_label": "zelk12 (n=468)", + "present_rate": 1.0, + "result_rows": 468 + }, + { + "benchmark": "meta", + "benchmark_label": "meta (n=750)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 0.272, + "model_family": "meta", + "model_family_label": "meta (n=750)", + "present_rate": 0.728, + "result_rows": 750 + }, + { + "benchmark": "meta", + "benchmark_label": "meta (n=750)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "model_family": "meta", + "model_family_label": "meta (n=750)", + "present_rate": 0.0, + "result_rows": 750 + }, + { + "benchmark": "meta", + "benchmark_label": "meta (n=750)", + "field": "generation_temperature", + 
"field_label": "temperature", + "missing_rate": 1.0, + "model_family": "meta", + "model_family_label": "meta (n=750)", + "present_rate": 0.0, + "result_rows": 750 + }, + { + "benchmark": "meta", + "benchmark_label": "meta (n=750)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.272, + "model_family": "meta", + "model_family_label": "meta (n=750)", + "present_rate": 0.728, + "result_rows": 750 + }, + { + "benchmark": "meta", + "benchmark_label": "meta (n=750)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 0.728, + "model_family": "meta", + "model_family_label": "meta (n=750)", + "present_rate": 0.272, + "result_rows": 750 + }, + { + "benchmark": "meta", + "benchmark_label": "meta (n=750)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 0.728, + "model_family": "meta", + "model_family_label": "meta (n=750)", + "present_rate": 0.272, + "result_rows": 750 + }, + { + "benchmark": "meta", + "benchmark_label": "meta (n=750)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 0.728, + "model_family": "meta", + "model_family_label": "meta (n=750)", + "present_rate": 0.272, + "result_rows": 750 + }, + { + "benchmark": "meta", + "benchmark_label": "meta (n=750)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 0.728, + "model_family": "meta", + "model_family_label": "meta (n=750)", + "present_rate": 0.272, + "result_rows": 750 + }, + { + "benchmark": "meta", + "benchmark_label": "meta (n=750)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "model_family": "meta", + "model_family_label": "meta (n=750)", + "present_rate": 0.0, + "result_rows": 750 + }, + { + "benchmark": "meta", + "benchmark_label": "meta (n=750)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "model_family": "meta", + "model_family_label": "meta 
(n=750)", + "present_rate": 0.0, + "result_rows": 750 + }, + { + "benchmark": "meta", + "benchmark_label": "meta (n=750)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "model_family": "meta", + "model_family_label": "meta (n=750)", + "present_rate": 0.0, + "result_rows": 750 + }, + { + "benchmark": "meta", + "benchmark_label": "meta (n=750)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 0.7573333333333333, + "model_family": "meta", + "model_family_label": "meta (n=750)", + "present_rate": 0.24266666666666667, + "result_rows": 750 + }, + { + "benchmark": "meta", + "benchmark_label": "meta (n=750)", + "field": "model_parameters", + "field_label": "model parameters", + "missing_rate": 1.0, + "model_family": "meta", + "model_family_label": "meta (n=750)", + "present_rate": 0.0, + "result_rows": 750 + }, + { + "benchmark": "google", + "benchmark_label": "google (n=1,597)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 0.593613024420789, + "model_family": "google", + "model_family_label": "google (n=1,597)", + "present_rate": 0.406386975579211, + "result_rows": 1597 + }, + { + "benchmark": "google", + "benchmark_label": "google (n=1,597)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "model_family": "google", + "model_family_label": "google (n=1,597)", + "present_rate": 0.0, + "result_rows": 1597 + }, + { + "benchmark": "google", + "benchmark_label": "google (n=1,597)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "model_family": "google", + "model_family_label": "google (n=1,597)", + "present_rate": 0.0, + "result_rows": 1597 + }, + { + "benchmark": "google", + "benchmark_label": "google (n=1,597)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.3268628678772699, + "model_family": "google", + 
"model_family_label": "google (n=1,597)", + "present_rate": 0.6731371321227301, + "result_rows": 1597 + }, + { + "benchmark": "google", + "benchmark_label": "google (n=1,597)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 0.5122103944896681, + "model_family": "google", + "model_family_label": "google (n=1,597)", + "present_rate": 0.48778960551033185, + "result_rows": 1597 + }, + { + "benchmark": "google", + "benchmark_label": "google (n=1,597)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 0.7113337507827175, + "model_family": "google", + "model_family_label": "google (n=1,597)", + "present_rate": 0.2886662492172824, + "result_rows": 1597 + }, + { + "benchmark": "google", + "benchmark_label": "google (n=1,597)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 0.7113337507827175, + "model_family": "google", + "model_family_label": "google (n=1,597)", + "present_rate": 0.2886662492172824, + "result_rows": 1597 + }, + { + "benchmark": "google", + "benchmark_label": "google (n=1,597)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 0.7113337507827175, + "model_family": "google", + "model_family_label": "google (n=1,597)", + "present_rate": 0.2886662492172824, + "result_rows": 1597 + }, + { + "benchmark": "google", + "benchmark_label": "google (n=1,597)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 0.8753913587977458, + "model_family": "google", + "model_family_label": "google (n=1,597)", + "present_rate": 0.12460864120225423, + "result_rows": 1597 + }, + { + "benchmark": "google", + "benchmark_label": "google (n=1,597)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "model_family": "google", + "model_family_label": "google (n=1,597)", + "present_rate": 0.0, + "result_rows": 1597 + }, + { + "benchmark": "google", + "benchmark_label": "google (n=1,597)", + 
"field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "model_family": "google", + "model_family_label": "google (n=1,597)", + "present_rate": 0.0, + "result_rows": 1597 + }, + { + "benchmark": "google", + "benchmark_label": "google (n=1,597)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 0.8033813400125235, + "model_family": "google", + "model_family_label": "google (n=1,597)", + "present_rate": 0.19661865998747652, + "result_rows": 1597 + }, + { + "benchmark": "google", + "benchmark_label": "google (n=1,597)", + "field": "model_parameters", + "field_label": "model parameters", + "missing_rate": 0.876017532874139, + "model_family": "google", + "model_family_label": "google (n=1,597)", + "present_rate": 0.12398246712586099, + "result_rows": 1597 + }, + { + "benchmark": "Qwen", + "benchmark_label": "Qwen (n=505)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 0.80990099009901, + "model_family": "Qwen", + "model_family_label": "Qwen (n=505)", + "present_rate": 0.1900990099009901, + "result_rows": 505 + }, + { + "benchmark": "Qwen", + "benchmark_label": "Qwen (n=505)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 0.80990099009901, + "model_family": "Qwen", + "model_family_label": "Qwen (n=505)", + "present_rate": 0.1900990099009901, + "result_rows": 505 + }, + { + "benchmark": "Qwen", + "benchmark_label": "Qwen (n=505)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 0.80990099009901, + "model_family": "Qwen", + "model_family_label": "Qwen (n=505)", + "present_rate": 0.1900990099009901, + "result_rows": 505 + }, + { + "benchmark": "Qwen", + "benchmark_label": "Qwen (n=505)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.097029702970297, + "model_family": "Qwen", + "model_family_label": "Qwen (n=505)", + 
"present_rate": 0.902970297029703, + "result_rows": 505 + }, + { + "benchmark": "Qwen", + "benchmark_label": "Qwen (n=505)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 0.7128712871287128, + "model_family": "Qwen", + "model_family_label": "Qwen (n=505)", + "present_rate": 0.2871287128712871, + "result_rows": 505 + }, + { + "benchmark": "Qwen", + "benchmark_label": "Qwen (n=505)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 1.0, + "model_family": "Qwen", + "model_family_label": "Qwen (n=505)", + "present_rate": 0.0, + "result_rows": 505 + }, + { + "benchmark": "Qwen", + "benchmark_label": "Qwen (n=505)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 1.0, + "model_family": "Qwen", + "model_family_label": "Qwen (n=505)", + "present_rate": 0.0, + "result_rows": 505 + }, + { + "benchmark": "Qwen", + "benchmark_label": "Qwen (n=505)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 1.0, + "model_family": "Qwen", + "model_family_label": "Qwen (n=505)", + "present_rate": 0.0, + "result_rows": 505 + }, + { + "benchmark": "Qwen", + "benchmark_label": "Qwen (n=505)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 0.80990099009901, + "model_family": "Qwen", + "model_family_label": "Qwen (n=505)", + "present_rate": 0.1900990099009901, + "result_rows": 505 + }, + { + "benchmark": "Qwen", + "benchmark_label": "Qwen (n=505)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 0.80990099009901, + "model_family": "Qwen", + "model_family_label": "Qwen (n=505)", + "present_rate": 0.1900990099009901, + "result_rows": 505 + }, + { + "benchmark": "Qwen", + "benchmark_label": "Qwen (n=505)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 0.80990099009901, + "model_family": "Qwen", + "model_family_label": "Qwen (n=505)", + "present_rate": 
0.1900990099009901, + "result_rows": 505 + }, + { + "benchmark": "Qwen", + "benchmark_label": "Qwen (n=505)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 1.0, + "model_family": "Qwen", + "model_family_label": "Qwen (n=505)", + "present_rate": 0.0, + "result_rows": 505 + }, + { + "benchmark": "Qwen", + "benchmark_label": "Qwen (n=505)", + "field": "model_parameters", + "field_label": "model parameters", + "missing_rate": 0.28712871287128716, + "model_family": "Qwen", + "model_family_label": "Qwen (n=505)", + "present_rate": 0.7128712871287128, + "result_rows": 505 + }, + { + "benchmark": "mistralai", + "benchmark_label": "mistralai (n=908)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 0.5506607929515419, + "model_family": "mistralai", + "model_family_label": "mistralai (n=908)", + "present_rate": 0.44933920704845814, + "result_rows": 908 + }, + { + "benchmark": "mistralai", + "benchmark_label": "mistralai (n=908)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "model_family": "mistralai", + "model_family_label": "mistralai (n=908)", + "present_rate": 0.0, + "result_rows": 908 + }, + { + "benchmark": "mistralai", + "benchmark_label": "mistralai (n=908)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "model_family": "mistralai", + "model_family_label": "mistralai (n=908)", + "present_rate": 0.0, + "result_rows": 908 + }, + { + "benchmark": "mistralai", + "benchmark_label": "mistralai (n=908)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.3480176211453745, + "model_family": "mistralai", + "model_family_label": "mistralai (n=908)", + "present_rate": 0.6519823788546255, + "result_rows": 908 + }, + { + "benchmark": "mistralai", + "benchmark_label": "mistralai (n=908)", + "field": "source_organization_url", + "field_label": "source org URL", + 
"missing_rate": 0.5682819383259912, + "model_family": "mistralai", + "model_family_label": "mistralai (n=908)", + "present_rate": 0.43171806167400884, + "result_rows": 908 + }, + { + "benchmark": "mistralai", + "benchmark_label": "mistralai (n=908)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 0.6585903083700441, + "model_family": "mistralai", + "model_family_label": "mistralai (n=908)", + "present_rate": 0.34140969162995594, + "result_rows": 908 + }, + { + "benchmark": "mistralai", + "benchmark_label": "mistralai (n=908)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 0.6585903083700441, + "model_family": "mistralai", + "model_family_label": "mistralai (n=908)", + "present_rate": 0.34140969162995594, + "result_rows": 908 + }, + { + "benchmark": "mistralai", + "benchmark_label": "mistralai (n=908)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 0.6585903083700441, + "model_family": "mistralai", + "model_family_label": "mistralai (n=908)", + "present_rate": 0.34140969162995594, + "result_rows": 908 + }, + { + "benchmark": "mistralai", + "benchmark_label": "mistralai (n=908)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 0.9295154185022027, + "model_family": "mistralai", + "model_family_label": "mistralai (n=908)", + "present_rate": 0.07048458149779736, + "result_rows": 908 + }, + { + "benchmark": "mistralai", + "benchmark_label": "mistralai (n=908)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "model_family": "mistralai", + "model_family_label": "mistralai (n=908)", + "present_rate": 0.0, + "result_rows": 908 + }, + { + "benchmark": "mistralai", + "benchmark_label": "mistralai (n=908)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "model_family": "mistralai", + "model_family_label": "mistralai (n=908)", + "present_rate": 0.0, + "result_rows": 
908 + }, + { + "benchmark": "mistralai", + "benchmark_label": "mistralai (n=908)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 0.6916299559471366, + "model_family": "mistralai", + "model_family_label": "mistralai (n=908)", + "present_rate": 0.30837004405286345, + "result_rows": 908 + }, + { + "benchmark": "mistralai", + "benchmark_label": "mistralai (n=908)", + "field": "model_parameters", + "field_label": "model parameters", + "missing_rate": 0.8810572687224669, + "model_family": "mistralai", + "model_family_label": "mistralai (n=908)", + "present_rate": 0.11894273127753303, + "result_rows": 908 + }, + { + "benchmark": "anthropic", + "benchmark_label": "anthropic (n=1,100)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 0.5881818181818181, + "model_family": "anthropic", + "model_family_label": "anthropic (n=1,100)", + "present_rate": 0.4118181818181818, + "result_rows": 1100 + }, + { + "benchmark": "anthropic", + "benchmark_label": "anthropic (n=1,100)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "model_family": "anthropic", + "model_family_label": "anthropic (n=1,100)", + "present_rate": 0.0, + "result_rows": 1100 + }, + { + "benchmark": "anthropic", + "benchmark_label": "anthropic (n=1,100)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "model_family": "anthropic", + "model_family_label": "anthropic (n=1,100)", + "present_rate": 0.0, + "result_rows": 1100 + }, + { + "benchmark": "anthropic", + "benchmark_label": "anthropic (n=1,100)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.44727272727272727, + "model_family": "anthropic", + "model_family_label": "anthropic (n=1,100)", + "present_rate": 0.5527272727272727, + "result_rows": 1100 + }, + { + "benchmark": "anthropic", + "benchmark_label": "anthropic (n=1,100)", + "field": 
"source_organization_url", + "field_label": "source org URL", + "missing_rate": 0.4, + "model_family": "anthropic", + "model_family_label": "anthropic (n=1,100)", + "present_rate": 0.6, + "result_rows": 1100 + }, + { + "benchmark": "anthropic", + "benchmark_label": "anthropic (n=1,100)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 0.5909090909090908, + "model_family": "anthropic", + "model_family_label": "anthropic (n=1,100)", + "present_rate": 0.4090909090909091, + "result_rows": 1100 + }, + { + "benchmark": "anthropic", + "benchmark_label": "anthropic (n=1,100)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 0.5909090909090908, + "model_family": "anthropic", + "model_family_label": "anthropic (n=1,100)", + "present_rate": 0.4090909090909091, + "result_rows": 1100 + }, + { + "benchmark": "anthropic", + "benchmark_label": "anthropic (n=1,100)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 0.5909090909090908, + "model_family": "anthropic", + "model_family_label": "anthropic (n=1,100)", + "present_rate": 0.4090909090909091, + "result_rows": 1100 + }, + { + "benchmark": "anthropic", + "benchmark_label": "anthropic (n=1,100)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 0.8809090909090909, + "model_family": "anthropic", + "model_family_label": "anthropic (n=1,100)", + "present_rate": 0.1190909090909091, + "result_rows": 1100 + }, + { + "benchmark": "anthropic", + "benchmark_label": "anthropic (n=1,100)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "model_family": "anthropic", + "model_family_label": "anthropic (n=1,100)", + "present_rate": 0.0, + "result_rows": 1100 + }, + { + "benchmark": "anthropic", + "benchmark_label": "anthropic (n=1,100)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "model_family": "anthropic", + "model_family_label": 
"anthropic (n=1,100)", + "present_rate": 0.0, + "result_rows": 1100 + }, + { + "benchmark": "anthropic", + "benchmark_label": "anthropic (n=1,100)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 0.8309090909090909, + "model_family": "anthropic", + "model_family_label": "anthropic (n=1,100)", + "present_rate": 0.1690909090909091, + "result_rows": 1100 + }, + { + "benchmark": "anthropic", + "benchmark_label": "anthropic (n=1,100)", + "field": "model_parameters", + "field_label": "model parameters", + "missing_rate": 1.0, + "model_family": "anthropic", + "model_family_label": "anthropic (n=1,100)", + "present_rate": 0.0, + "result_rows": 1100 + }, + { + "benchmark": "openai", + "benchmark_label": "openai (n=1,712)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 0.6962616822429907, + "model_family": "openai", + "model_family_label": "openai (n=1,712)", + "present_rate": 0.3037383177570093, + "result_rows": 1712 + }, + { + "benchmark": "openai", + "benchmark_label": "openai (n=1,712)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "model_family": "openai", + "model_family_label": "openai (n=1,712)", + "present_rate": 0.0, + "result_rows": 1712 + }, + { + "benchmark": "openai", + "benchmark_label": "openai (n=1,712)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "model_family": "openai", + "model_family_label": "openai (n=1,712)", + "present_rate": 0.0, + "result_rows": 1712 + }, + { + "benchmark": "openai", + "benchmark_label": "openai (n=1,712)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.6127336448598131, + "model_family": "openai", + "model_family_label": "openai (n=1,712)", + "present_rate": 0.3872663551401869, + "result_rows": 1712 + }, + { + "benchmark": "openai", + "benchmark_label": "openai (n=1,712)", + "field": 
"source_organization_url", + "field_label": "source org URL", + "missing_rate": 0.2762850467289719, + "model_family": "openai", + "model_family_label": "openai (n=1,712)", + "present_rate": 0.7237149532710281, + "result_rows": 1712 + }, + { + "benchmark": "openai", + "benchmark_label": "openai (n=1,712)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 0.42757009345794394, + "model_family": "openai", + "model_family_label": "openai (n=1,712)", + "present_rate": 0.572429906542056, + "result_rows": 1712 + }, + { + "benchmark": "openai", + "benchmark_label": "openai (n=1,712)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 0.42757009345794394, + "model_family": "openai", + "model_family_label": "openai (n=1,712)", + "present_rate": 0.572429906542056, + "result_rows": 1712 + }, + { + "benchmark": "openai", + "benchmark_label": "openai (n=1,712)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 0.42757009345794394, + "model_family": "openai", + "model_family_label": "openai (n=1,712)", + "present_rate": 0.572429906542056, + "result_rows": 1712 + }, + { + "benchmark": "openai", + "benchmark_label": "openai (n=1,712)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 0.9258177570093458, + "model_family": "openai", + "model_family_label": "openai (n=1,712)", + "present_rate": 0.0741822429906542, + "result_rows": 1712 + }, + { + "benchmark": "openai", + "benchmark_label": "openai (n=1,712)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "model_family": "openai", + "model_family_label": "openai (n=1,712)", + "present_rate": 0.0, + "result_rows": 1712 + }, + { + "benchmark": "openai", + "benchmark_label": "openai (n=1,712)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "model_family": "openai", + "model_family_label": "openai (n=1,712)", + "present_rate": 0.0, + 
"result_rows": 1712 + }, + { + "benchmark": "openai", + "benchmark_label": "openai (n=1,712)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 0.7102803738317758, + "model_family": "openai", + "model_family_label": "openai (n=1,712)", + "present_rate": 0.2897196261682243, + "result_rows": 1712 + }, + { + "benchmark": "openai", + "benchmark_label": "openai (n=1,712)", + "field": "model_parameters", + "field_label": "model parameters", + "missing_rate": 1.0, + "model_family": "openai", + "model_family_label": "openai (n=1,712)", + "present_rate": 0.0, + "result_rows": 1712 + }, + { + "benchmark": "qwen", + "benchmark_label": "qwen (n=882)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 0.5555555555555556, + "model_family": "qwen", + "model_family_label": "qwen (n=882)", + "present_rate": 0.4444444444444444, + "result_rows": 882 + }, + { + "benchmark": "qwen", + "benchmark_label": "qwen (n=882)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 1.0, + "model_family": "qwen", + "model_family_label": "qwen (n=882)", + "present_rate": 0.0, + "result_rows": 882 + }, + { + "benchmark": "qwen", + "benchmark_label": "qwen (n=882)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 1.0, + "model_family": "qwen", + "model_family_label": "qwen (n=882)", + "present_rate": 0.0, + "result_rows": 882 + }, + { + "benchmark": "qwen", + "benchmark_label": "qwen (n=882)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.5555555555555556, + "model_family": "qwen", + "model_family_label": "qwen (n=882)", + "present_rate": 0.4444444444444444, + "result_rows": 882 + }, + { + "benchmark": "qwen", + "benchmark_label": "qwen (n=882)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 0.4444444444444444, + "model_family": "qwen", + "model_family_label": 
"qwen (n=882)", + "present_rate": 0.5555555555555556, + "result_rows": 882 + }, + { + "benchmark": "qwen", + "benchmark_label": "qwen (n=882)", + "field": "metric_id", + "field_label": "metric ID", + "missing_rate": 0.4444444444444444, + "model_family": "qwen", + "model_family_label": "qwen (n=882)", + "present_rate": 0.5555555555555556, + "result_rows": 882 + }, + { + "benchmark": "qwen", + "benchmark_label": "qwen (n=882)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 0.4444444444444444, + "model_family": "qwen", + "model_family_label": "qwen (n=882)", + "present_rate": 0.5555555555555556, + "result_rows": 882 + }, + { + "benchmark": "qwen", + "benchmark_label": "qwen (n=882)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 0.4444444444444444, + "model_family": "qwen", + "model_family_label": "qwen (n=882)", + "present_rate": 0.5555555555555556, + "result_rows": 882 + }, + { + "benchmark": "qwen", + "benchmark_label": "qwen (n=882)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 1.0, + "model_family": "qwen", + "model_family_label": "qwen (n=882)", + "present_rate": 0.0, + "result_rows": 882 + }, + { + "benchmark": "qwen", + "benchmark_label": "qwen (n=882)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 1.0, + "model_family": "qwen", + "model_family_label": "qwen (n=882)", + "present_rate": 0.0, + "result_rows": 882 + }, + { + "benchmark": "qwen", + "benchmark_label": "qwen (n=882)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 1.0, + "model_family": "qwen", + "model_family_label": "qwen (n=882)", + "present_rate": 0.0, + "result_rows": 882 + }, + { + "benchmark": "qwen", + "benchmark_label": "qwen (n=882)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 0.4739229024943311, + "model_family": "qwen", + "model_family_label": "qwen (n=882)", + 
"present_rate": 0.5260770975056689, + "result_rows": 882 + }, + { + "benchmark": "qwen", + "benchmark_label": "qwen (n=882)", + "field": "model_parameters", + "field_label": "model parameters", + "missing_rate": 1.0, + "model_family": "qwen", + "model_family_label": "qwen (n=882)", + "present_rate": 0.0, + "result_rows": 882 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=24,754)", + "field": "generation_config_present", + "field_label": "generation config", + "missing_rate": 0.9332633109800437, + "model_family": "Other", + "model_family_label": "Other (n=24,754)", + "present_rate": 0.06673668901995637, + "result_rows": 24754 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=24,754)", + "field": "generation_max_tokens", + "field_label": "max tokens", + "missing_rate": 0.9843661630443564, + "model_family": "Other", + "model_family_label": "Other (n=24,754)", + "present_rate": 0.01563383695564353, + "result_rows": 24754 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=24,754)", + "field": "generation_temperature", + "field_label": "temperature", + "missing_rate": 0.9843661630443564, + "model_family": "Other", + "model_family_label": "Other (n=24,754)", + "present_rate": 0.01563383695564353, + "result_rows": 24754 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=24,754)", + "field": "inference_engine", + "field_label": "inference engine/platform", + "missing_rate": 0.11392098246748006, + "model_family": "Other", + "model_family_label": "Other (n=24,754)", + "present_rate": 0.8860790175325199, + "result_rows": 24754 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=24,754)", + "field": "source_organization_url", + "field_label": "source org URL", + "missing_rate": 0.8440252080471843, + "model_family": "Other", + "model_family_label": "Other (n=24,754)", + "present_rate": 0.15597479195281572, + "result_rows": 24754 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=24,754)", + 
"field": "metric_id", + "field_label": "metric ID", + "missing_rate": 0.9232851256362609, + "model_family": "Other", + "model_family_label": "Other (n=24,754)", + "present_rate": 0.0767148743637392, + "result_rows": 24754 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=24,754)", + "field": "metric_unit", + "field_label": "metric unit", + "missing_rate": 0.9232851256362609, + "model_family": "Other", + "model_family_label": "Other (n=24,754)", + "present_rate": 0.0767148743637392, + "result_rows": 24754 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=24,754)", + "field": "metric_kind", + "field_label": "metric kind", + "missing_rate": 0.9359295467399208, + "model_family": "Other", + "model_family_label": "Other (n=24,754)", + "present_rate": 0.06407045326007918, + "result_rows": 24754 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=24,754)", + "field": "has_uncertainty", + "field_label": "uncertainty", + "missing_rate": 0.9601680536478953, + "model_family": "Other", + "model_family_label": "Other (n=24,754)", + "present_rate": 0.03983194635210471, + "result_rows": 24754 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=24,754)", + "field": "detailed_results_file", + "field_label": "detailed results", + "missing_rate": 0.9717217419406965, + "model_family": "Other", + "model_family_label": "Other (n=24,754)", + "present_rate": 0.028278258059303545, + "result_rows": 24754 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=24,754)", + "field": "uncertainty_num_samples", + "field_label": "sample count", + "missing_rate": 0.9713985618485901, + "model_family": "Other", + "model_family_label": "Other (n=24,754)", + "present_rate": 0.028601438151409874, + "result_rows": 24754 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=24,754)", + "field": "model_license", + "field_label": "model license", + "missing_rate": 0.9423123535590208, + "model_family": "Other", + "model_family_label": 
"Other (n=24,754)", + "present_rate": 0.057687646440979234, + "result_rows": 24754 + }, + { + "benchmark": "Other", + "benchmark_label": "Other (n=24,754)", + "field": "model_parameters", + "field_label": "model parameters", + "missing_rate": 0.20667366890199568, + "model_family": "Other", + "model_family_label": "Other (n=24,754)", + "present_rate": 0.7933263310980043, + "result_rows": 24754 + } + ], + "model_families": [ + { + "label": "allenai (n=1,249)", + "model_family": "allenai", + "overall_completeness": 0.09028761470715034, + "result_rows": 1249 + }, + { + "label": "DreadPoor (n=714)", + "model_family": "DreadPoor", + "overall_completeness": 0.15384615384615385, + "result_rows": 714 + }, + { + "label": "JayHyeon (n=1,044)", + "model_family": "JayHyeon", + "overall_completeness": 0.15384615384615385, + "result_rows": 1044 + }, + { + "label": "LeroyDyer (n=360)", + "model_family": "LeroyDyer", + "overall_completeness": 0.15384615384615385, + "result_rows": 360 + }, + { + "label": "Quazim0t0 (n=426)", + "model_family": "Quazim0t0", + "overall_completeness": 0.15384615384615385, + "result_rows": 426 + }, + { + "label": "Sakalti (n=396)", + "model_family": "Sakalti", + "overall_completeness": 0.15384615384615385, + "result_rows": 396 + }, + { + "label": "Triangle104 (n=366)", + "model_family": "Triangle104", + "overall_completeness": 0.15384615384615385, + "result_rows": 366 + }, + { + "label": "allknowingroger (n=528)", + "model_family": "allknowingroger", + "overall_completeness": 0.15384615384615385, + "result_rows": 528 + }, + { + "label": "bunnycore (n=516)", + "model_family": "bunnycore", + "overall_completeness": 0.15384615384615385, + "result_rows": 516 + }, + { + "label": "icefog72 (n=372)", + "model_family": "icefog72", + "overall_completeness": 0.15384615384615385, + "result_rows": 372 + }, + { + "label": "jaspionjader (n=1,182)", + "model_family": "jaspionjader", + "overall_completeness": 0.15384615384615385, + "result_rows": 1182 + }, + { + 
"label": "prithivMLmods (n=666)", + "model_family": "prithivMLmods", + "overall_completeness": 0.15384615384615385, + "result_rows": 666 + }, + { + "label": "zelk12 (n=468)", + "model_family": "zelk12", + "overall_completeness": 0.15384615384615385, + "result_rows": 468 + }, + { + "label": "meta (n=750)", + "model_family": "meta", + "overall_completeness": 0.21435897435897436, + "result_rows": 750 + }, + { + "label": "google (n=1,597)", + "model_family": "google", + "overall_completeness": 0.22142478685997785, + "result_rows": 1597 + }, + { + "label": "Qwen (n=505)", + "model_family": "Qwen", + "overall_completeness": 0.23412033511043412, + "result_rows": 505 + }, + { + "label": "mistralai (n=908)", + "model_family": "mistralai", + "overall_completeness": 0.23500508302270418, + "result_rows": 908 + }, + { + "label": "anthropic (n=1,100)", + "model_family": "anthropic", + "overall_completeness": 0.23692307692307693, + "result_rows": 1100 + }, + { + "label": "openai (n=1,712)", + "model_family": "openai", + "overall_completeness": 0.26891624730409774, + "result_rows": 1712 + }, + { + "label": "qwen (n=882)", + "model_family": "qwen", + "overall_completeness": 0.27978370835513694, + "result_rows": 882 + }, + { + "label": "Other (n=24,754)", + "model_family": "Other", + "overall_completeness": 0.17732953803891835, + "result_rows": 24754 } ], - "other_result_rows": 2399, - "top_benchmark_count": 12 + "model_family_selection": "Top model families/developers by result-row count, with remaining families aggregated as Other; rows are sorted by overall metadata completeness.", + "other_result_rows": 24754, + "top_model_family_count": 20 }, "models_per_benchmark": [ { diff --git a/every_eval_ever/helpers/dataset_statistics.py b/every_eval_ever/helpers/dataset_statistics.py index 5667d1df1..349d041fc 100644 --- a/every_eval_ever/helpers/dataset_statistics.py +++ b/every_eval_ever/helpers/dataset_statistics.py @@ -11,7 +11,7 @@ import sys from collections import Counter, 
defaultdict from pathlib import Path -from typing import Any, Iterable +from typing import Any, Callable, Iterable SEP = '=' * 72 SUB = '-' * 72 @@ -32,23 +32,93 @@ 'metric_kind', 'metric_unit', ) +METADATA_FIELD_GROUP_ORDER = ( + 'eval metadata', + 'benchmark metadata', + 'model metadata', +) +REQUIRED_METADATA_FIELDS = ('inference_engine',) METADATA_FIELD_CANDIDATES = ( - {'key': 'generation_config_present', 'label': 'generation config'}, - {'key': 'generation_temperature', 'label': 'temperature'}, - {'key': 'generation_max_tokens', 'label': 'max tokens'}, - {'key': 'generation_agentic_config_present', 'label': 'agentic config'}, - {'key': 'inference_engine', 'label': 'runtime/platform'}, - {'key': 'source_locator', 'label': 'source URL / HF repo'}, - {'key': 'source_organization_url', 'label': 'source org URL'}, - {'key': 'evaluator_relationship', 'label': 'evaluator relationship'}, - {'key': 'detailed_results_file', 'label': 'detailed results'}, - {'key': 'has_uncertainty', 'label': 'uncertainty'}, - {'key': 'uncertainty_num_samples', 'label': 'sample count'}, - {'key': 'metric_id', 'label': 'metric ID'}, - {'key': 'metric_kind', 'label': 'metric kind'}, - {'key': 'metric_unit', 'label': 'metric unit'}, - {'key': 'model_parameters', 'label': 'model parameters'}, - {'key': 'model_license', 'label': 'model license'}, + { + 'key': 'generation_config_present', + 'label': 'generation config', + 'group': 'eval metadata', + }, + { + 'key': 'generation_temperature', + 'label': 'temperature', + 'group': 'eval metadata', + }, + { + 'key': 'generation_max_tokens', + 'label': 'max tokens', + 'group': 'eval metadata', + }, + { + 'key': 'generation_agentic_config_present', + 'label': 'agentic config', + 'group': 'eval metadata', + }, + { + 'key': 'inference_engine', + 'label': 'inference engine/platform', + 'group': 'eval metadata', + }, + { + 'key': 'source_locator', + 'label': 'source URL / HF repo', + 'group': 'benchmark metadata', + }, + { + 'key': 
'source_organization_url', + 'label': 'source org URL', + 'group': 'benchmark metadata', + }, + { + 'key': 'evaluator_relationship', + 'label': 'evaluator relationship', + 'group': 'benchmark metadata', + }, + { + 'key': 'detailed_results_file', + 'label': 'detailed results', + 'group': 'benchmark metadata', + }, + { + 'key': 'has_uncertainty', + 'label': 'uncertainty', + 'group': 'benchmark metadata', + }, + { + 'key': 'uncertainty_num_samples', + 'label': 'sample count', + 'group': 'benchmark metadata', + }, + { + 'key': 'metric_id', + 'label': 'metric ID', + 'group': 'benchmark metadata', + }, + { + 'key': 'metric_kind', + 'label': 'metric kind', + 'group': 'benchmark metadata', + }, + { + 'key': 'metric_unit', + 'label': 'metric unit', + 'group': 'benchmark metadata', + }, + { + 'key': 'model_parameters', + 'label': 'model parameters', + 'group': 'model metadata', + }, + { + 'key': 'model_license', + 'label': 'model license', + 'group': 'model metadata', + }, ) @@ -403,10 +473,29 @@ def field_present_rate(rows: list[dict[str, Any]], field: str) -> float: return sum(has_value(row.get(field)) for row in rows) / len(rows) -def metadata_completeness( +def model_family_name(row: dict[str, Any]) -> str: + value = row.get('model_developer') or row.get('model_id') + if value is None: + return 'unknown' + text = str(value).strip() + return text or 'unknown' + + +def format_group_label(group: str, result_rows: int) -> str: + return f'{group} (n={result_rows:,})' + + +def metadata_completeness_by_group( rows: list[dict[str, Any]], - top_benchmarks: int = 12, - top_fields: int = 12, + group_name: Callable[[dict[str, Any]], str], + group_key: str, + groups_key: str, + top_count_key: str, + other_count_key: str, + selection_key: str, + selection_description: str, + top_groups: int, + top_fields: int, ) -> dict[str, Any]: candidate_fields = [ field @@ -416,37 +505,40 @@ def metadata_completeness( if not rows or not candidate_fields: return { 'fields': [], - 'benchmarks': [], 
+ groups_key: [], 'matrix': [], - 'top_benchmark_count': top_benchmarks, - 'other_result_rows': 0, + top_count_key: top_groups, + other_count_key: 0, + selection_key: selection_description, + 'field_group_order': list(METADATA_FIELD_GROUP_ORDER), } - rows_by_benchmark: dict[str, list[dict[str, Any]]] = defaultdict(list) + rows_by_group: dict[str, list[dict[str, Any]]] = defaultdict(list) for row in rows: - rows_by_benchmark[benchmark_name(row)].append(row) + rows_by_group[group_name(row)].append(row) field_summaries = [] for field in candidate_fields: key = str(field['key']) - benchmark_rates = [ - field_present_rate(items, key) - for items in rows_by_benchmark.values() + group_rates = [ + field_present_rate(items, key) for items in rows_by_group.values() ] present_rate = field_present_rate(rows, key) missing_rate = 1.0 - present_rate - benchmark_stddev = ( - statistics.pstdev(benchmark_rates) - if len(benchmark_rates) > 1 + group_stddev = ( + statistics.pstdev(group_rates) + if len(group_rates) > 1 else 0.0 ) - selection_score = missing_rate * max(benchmark_stddev, 0.05) + selection_score = missing_rate * max(group_stddev, 0.05) field_summaries.append( { 'key': key, 'label': str(field['label']), + 'group': str(field['group']), 'missing_rate': missing_rate, - 'benchmark_stddev': benchmark_stddev, + 'benchmark_stddev': group_stddev, + 'group_stddev': group_stddev, 'selection_score': selection_score, } ) @@ -458,28 +550,46 @@ def metadata_completeness( ) ) selected_fields = field_summaries[:top_fields] + selected_field_keys_set = {field['key'] for field in selected_fields} + required_fields = [ + field + for field in field_summaries + if field['key'] in REQUIRED_METADATA_FIELDS + and field['key'] not in selected_field_keys_set + ] + selected_fields.extend(required_fields) + group_rank = { + group: index for index, group in enumerate(METADATA_FIELD_GROUP_ORDER) + } + selected_fields.sort( + key=lambda item: ( + group_rank.get(str(item['group']), len(group_rank)), + 
-float(item['selection_score']), + str(item['label']), + ) + ) - top_benchmark_names = [ - benchmark - for benchmark, _ in sorted( + top_group_names = [ + group + for group, _ in sorted( ( - (benchmark, len(items)) - for benchmark, items in rows_by_benchmark.items() + (group, len(items)) + for group, items in rows_by_group.items() ), key=lambda item: (-item[1], item[0]), - )[:top_benchmarks] + )[:top_groups] ] selected_field_keys = [field['key'] for field in selected_fields] - benchmark_summaries = [] - benchmark_groups: dict[str, list[dict[str, Any]]] = {} - for benchmark in top_benchmark_names: - items = rows_by_benchmark[benchmark] - benchmark_groups[benchmark] = items - benchmark_summaries.append( + group_summaries = [] + selected_groups: dict[str, list[dict[str, Any]]] = {} + for group in top_group_names: + items = rows_by_group[group] + selected_groups[group] = items + group_summaries.append( { - 'benchmark': benchmark, - 'label': format_benchmark_label(benchmark, len(items)), + group_key: group, + 'label': format_group_label(group, len(items)), 'result_rows': len(items), 'overall_completeness': average_completeness( items, selected_field_keys @@ -489,16 +599,16 @@ def metadata_completeness( other_rows = [ row - for benchmark, items in rows_by_benchmark.items() - if benchmark not in top_benchmark_names + for group, items in rows_by_group.items() + if group not in top_group_names for row in items ] if other_rows: - benchmark_groups['Other'] = other_rows - benchmark_summaries.append( + selected_groups['Other'] = other_rows + group_summaries.append( { - 'benchmark': 'Other', - 'label': format_benchmark_label('Other', len(other_rows)), + group_key: 'Other', + 'label': format_group_label('Other', len(other_rows)), 'result_rows': len(other_rows), 'overall_completeness': average_completeness( other_rows, selected_field_keys @@ -506,11 +616,11 @@ def metadata_completeness( } ) - benchmark_summaries.sort( + group_summaries.sort( key=lambda item: ( - item['benchmark'] 
== 'Other', + item[group_key] == 'Other', float(item['overall_completeness']), - str(item['benchmark']), + str(item[group_key]), ) ) @@ -518,16 +628,18 @@ def metadata_completeness( selected_fields_by_key = { str(field['key']): field for field in selected_fields } - for benchmark_summary in benchmark_summaries: - benchmark = str(benchmark_summary['benchmark']) - items = benchmark_groups[benchmark] + for group_summary in group_summaries: + group = str(group_summary[group_key]) + items = selected_groups[group] for field_key in selected_field_keys: present_rate = field_present_rate(items, str(field_key)) field = selected_fields_by_key[str(field_key)] matrix.append( { - 'benchmark': benchmark, - 'benchmark_label': benchmark_summary['label'], + group_key: group, + f'{group_key}_label': group_summary['label'], + 'benchmark': group, + 'benchmark_label': group_summary['label'], 'field': str(field_key), 'field_label': field['label'], 'present_rate': present_rate, @@ -538,13 +650,61 @@ def metadata_completeness( return { 'fields': selected_fields, - 'benchmarks': benchmark_summaries, + groups_key: group_summaries, 'matrix': matrix, - 'top_benchmark_count': top_benchmarks, - 'other_result_rows': len(other_rows), + top_count_key: top_groups, + other_count_key: len(other_rows), + selection_key: selection_description, + 'field_group_order': list(METADATA_FIELD_GROUP_ORDER), } +def metadata_completeness( + rows: list[dict[str, Any]], + top_benchmarks: int = 20, + top_fields: int = 12, +) -> dict[str, Any]: + return metadata_completeness_by_group( + rows, + group_name=benchmark_name, + group_key='benchmark', + groups_key='benchmarks', + top_count_key='top_benchmark_count', + other_count_key='other_result_rows', + selection_key='benchmark_selection', + selection_description=( + 'Top benchmarks by result-row count, with remaining benchmarks ' + 'aggregated as Other; rows are sorted by overall metadata ' + 'completeness.' 
+ ), + top_groups=top_benchmarks, + top_fields=top_fields, + ) + + +def model_family_metadata_completeness( + rows: list[dict[str, Any]], + top_model_families: int = 20, + top_fields: int = 12, +) -> dict[str, Any]: + return metadata_completeness_by_group( + rows, + group_name=model_family_name, + group_key='model_family', + groups_key='model_families', + top_count_key='top_model_family_count', + other_count_key='other_result_rows', + selection_key='model_family_selection', + selection_description=( + 'Top model families/developers by result-row count, with ' + 'remaining families aggregated as Other; rows are sorted by ' + 'overall metadata completeness.' + ), + top_groups=top_model_families, + top_fields=top_fields, + ) + + def average_completeness( rows: list[dict[str, Any]], fields: list[str] ) -> float: @@ -747,7 +907,10 @@ def pairwise_model_comparisons( def descriptive_statistics( - rows: list[dict[str, Any]], summary_limit: int + rows: list[dict[str, Any]], + summary_limit: int, + metadata_top_benchmarks: int, + metadata_top_model_families: int, ) -> dict[str, Any]: valid_rows, exclusions = valid_normalized_rows(rows) return { @@ -763,7 +926,14 @@ def descriptive_statistics( rows, 'inference_engine' ), 'models_per_benchmark': models_per_benchmark(rows), - 'metadata_completeness': metadata_completeness(rows), + 'metadata_completeness': metadata_completeness( + rows, top_benchmarks=metadata_top_benchmarks + ), + 'model_family_metadata_completeness': ( + model_family_metadata_completeness( + rows, top_model_families=metadata_top_model_families + ) + ), 'quality': quality_counts(rows), 'normalization_exclusions': exclusions, 'score_summaries': grouped_summaries( @@ -784,6 +954,8 @@ def descriptive_statistics( def build_statistics_report( rows: list[dict[str, Any]], summary_limit: int, + metadata_top_benchmarks: int, + metadata_top_model_families: int, comparison_limit: int, top_model_limit: int, min_shared_evals: int, @@ -791,7 +963,12 @@ def 
build_statistics_report( ) -> dict[str, Any]: valid_rows, exclusions = valid_normalized_rows(rows) report = { - 'descriptive': descriptive_statistics(rows, summary_limit), + 'descriptive': descriptive_statistics( + rows, + summary_limit, + metadata_top_benchmarks, + metadata_top_model_families, + ), 'observational': { 'valid_normalized_rows': len(valid_rows), 'exclusions': exclusions, @@ -941,6 +1118,24 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: type=int, help='Number of descriptive summary rows to print', ) + parser.add_argument( + '--metadata-top-benchmarks', + default=20, + type=int, + help=( + 'Number of largest benchmarks to show in metadata completeness; ' + 'remaining benchmarks are aggregated as Other' + ), + ) + parser.add_argument( + '--metadata-top-model-families', + default=20, + type=int, + help=( + 'Number of largest model families/developers to show in metadata ' + 'completeness; remaining families are aggregated as Other' + ), + ) parser.add_argument( '--comparison-limit', default=50, @@ -969,6 +1164,10 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace: parser.error('--table must be a valid SQL identifier') if args.summary_limit < 1: parser.error('--summary-limit must be at least 1') + if args.metadata_top_benchmarks < 1: + parser.error('--metadata-top-benchmarks must be at least 1') + if args.metadata_top_model_families < 1: + parser.error('--metadata-top-model-families must be at least 1') if args.comparison_limit < 1: parser.error('--comparison-limit must be at least 1') if args.top_model_limit < 1: @@ -996,6 +1195,8 @@ def main(argv: list[str] | None = None) -> None: report = build_statistics_report( rows, summary_limit=args.summary_limit, + metadata_top_benchmarks=args.metadata_top_benchmarks, + metadata_top_model_families=args.metadata_top_model_families, comparison_limit=args.comparison_limit, top_model_limit=args.top_model_limit, min_shared_evals=args.min_shared_evals, diff --git 
a/scripts/plot_dataset_statistics.py b/scripts/plot_dataset_statistics.py index 1740a47a0..e2eb04f13 100644 --- a/scripts/plot_dataset_statistics.py +++ b/scripts/plot_dataset_statistics.py @@ -469,20 +469,41 @@ def plot_inference_engine_spread( def plot_writeup_overview( stats: dict[str, Any], output_dir: Path, plt: Any, sns: Any | None ) -> Path: - fig, (ax_missing, ax_score) = plt.subplots( + fig, (ax_benchmark, ax_model_family, ax_score) = plt.subplots( 1, - 2, - figsize=(14, 6.6), - gridspec_kw={'width_ratios': [1.35, 1.0], 'wspace': 0.34}, + 3, + figsize=(22.5, 9.6), + gridspec_kw={'width_ratios': [1.55, 1.35, 1.0], 'wspace': 0.34}, + ) + draw_metadata_completeness( + ax_benchmark, + stats, + plt, + sns, + completeness_key='metadata_completeness', + rows_key='benchmarks', + row_key='benchmark', + title='A. Reporting completeness by benchmark', + show_colorbar=False, + ) + draw_metadata_completeness( + ax_model_family, + stats, + plt, + sns, + completeness_key='model_family_metadata_completeness', + rows_key='model_families', + row_key='model_family', + title='B. Reporting completeness by model family', + show_colorbar=False, ) - draw_metadata_completeness(ax_missing, stats, plt, sns) draw_score_landscape( ax_score, stats['descriptive'].get('normalized_score_summaries', []), sns, annotation_limit=7, ) - ax_score.set_title('B. Score landscape by metric') + ax_score.set_title('C. 
Score landscape by metric') ax_score.title.set_fontsize(15) path = output_dir / PLOT_FILES['writeup_overview'] @@ -492,13 +513,21 @@ def plot_writeup_overview( def draw_metadata_completeness( - ax: Any, stats: dict[str, Any], plt: Any, sns: Any | None + ax: Any, + stats: dict[str, Any], + plt: Any, + sns: Any | None, + completeness_key: str, + rows_key: str, + row_key: str, + title: str, + show_colorbar: bool, ) -> None: - completeness = stats['descriptive'].get('metadata_completeness', {}) + completeness = stats['descriptive'].get(completeness_key, {}) fields = completeness.get('fields', []) - benchmarks = completeness.get('benchmarks', []) + row_groups = completeness.get(rows_key, []) matrix_rows = completeness.get('matrix', []) - if not fields or not benchmarks or not matrix_rows: + if not fields or not row_groups or not matrix_rows: ax.text( 0.5, 0.5, @@ -512,23 +541,31 @@ def draw_metadata_completeness( field_order = [field['key'] for field in fields] field_labels = [wrapped_label(str(field['label']), 13) for field in fields] - benchmark_order = [benchmark['benchmark'] for benchmark in benchmarks] - benchmark_labels = [ - short_label(str(benchmark['label']), 38) for benchmark in benchmarks + field_groups = [str(field.get('group', 'metadata')) for field in fields] + row_order = [row_group[row_key] for row_group in row_groups] + row_labels = [ + short_label(str(row_group['label']), 38) for row_group in row_groups ] value_by_cell = { - (row['benchmark'], row['field']): 100.0 * row['present_rate'] + (row.get(row_key, row.get('benchmark')), row['field']): ( + 100.0 * row['present_rate'] + ) for row in matrix_rows } values = [ [ - value_by_cell.get((benchmark, field), 0.0) + value_by_cell.get((row_group, field), 0.0) for field in field_order ] - for benchmark in benchmark_order + for row_group in row_order ] if sns is not None: + cbar_kws = ( + {'label': '% present', 'fraction': 0.05, 'pad': 0.05} + if show_colorbar + else {} + ) sns.heatmap( values, ax=ax, @@ 
-536,28 +573,101 @@ def draw_metadata_completeness( vmax=100, cmap='RdYlGn', xticklabels=field_labels, - yticklabels=benchmark_labels, + yticklabels=row_labels, linewidths=0.35, linecolor='white', - cbar_kws={'label': '% present', 'fraction': 0.05, 'pad': 0.05}, + alpha=0.68, + cbar=show_colorbar, + cbar_kws=cbar_kws, ) else: - image = ax.imshow(values, vmin=0, vmax=100, cmap='RdYlGn') - colorbar = plt.colorbar(image, ax=ax, fraction=0.046, pad=0.04) - colorbar.set_label('% present') - ax.set_xticks(range(len(field_labels))) + image = ax.imshow( + values, + vmin=0, + vmax=100, + cmap='RdYlGn', + alpha=0.68, + extent=(0, len(field_labels), len(row_labels), 0), + ) + if show_colorbar: + colorbar = plt.colorbar(image, ax=ax, fraction=0.046, pad=0.04) + colorbar.set_label('% present') + ax.set_xticks([index + 0.5 for index in range(len(field_labels))]) ax.set_xticklabels(field_labels) - ax.set_yticks(range(len(benchmark_labels))) - ax.set_yticklabels(benchmark_labels) + ax.set_yticks( + [index + 0.5 for index in range(len(row_labels))] + ) + ax.set_yticklabels(row_labels) + draw_heatmap_values(ax, values) + draw_metadata_field_groups(ax, field_groups) - ax.set_title('A. 
Reporting completeness is uneven') + ax.set_title(title, pad=28) ax.title.set_fontsize(15) ax.set_xlabel('') ax.set_ylabel('') - ax.tick_params(axis='x', labelrotation=0, labelsize=9, pad=2) + ax.tick_params(axis='x', labelrotation=60, labelsize=9, pad=2) ax.tick_params(axis='y', labelsize=10) for tick in ax.get_xticklabels(): - tick.set_ha('center') + tick.set_ha('right') + tick.set_rotation_mode('anchor') + + +def draw_heatmap_values(ax: Any, values: list[list[float]]) -> None: + for y_index, row in enumerate(values): + for x_index, value in enumerate(row): + ax.text( + x_index + 0.5, + y_index + 0.5, + f'{value:.0f}', + ha='center', + va='center', + color='#111111', + fontsize=6.2, + bbox={ + 'facecolor': 'white', + 'edgecolor': 'none', + 'boxstyle': 'round,pad=0.12', + 'alpha': 0.58, + }, + ) + + +def draw_metadata_field_groups(ax: Any, field_groups: list[str]) -> None: + if not field_groups: + return + + start = 0 + while start < len(field_groups): + group = field_groups[start] + end = start + 1 + while end < len(field_groups) and field_groups[end] == group: + end += 1 + + center = (start + end) / 2 + ax.text( + center, + 1.03, + group, + transform=ax.get_xaxis_transform(), + ha='center', + va='bottom', + fontsize=8.5, + fontweight='bold', + color='#303030', + clip_on=False, + ) + ax.plot( + [start + 0.1, end - 0.1], + [1.025, 1.025], + transform=ax.get_xaxis_transform(), + color='#303030', + linewidth=0.8, + clip_on=False, + ) + if end < len(field_groups): + ax.axvline(end, color='white', linewidth=2.0) + ax.axvline(end, color='#303030', linewidth=0.55, alpha=0.55) + start = end def main() -> None: diff --git a/tests/test_dataset_statistics.py b/tests/test_dataset_statistics.py index de38e63d3..cc8266315 100644 --- a/tests/test_dataset_statistics.py +++ b/tests/test_dataset_statistics.py @@ -157,6 +157,8 @@ def test_json_report_shape(): report = stats.build_statistics_report( rows, summary_limit=5, + metadata_top_benchmarks=12, + 
metadata_top_model_families=20, comparison_limit=5, top_model_limit=5, min_shared_evals=1, @@ -168,6 +170,7 @@ def test_json_report_shape(): assert 'inference_engines' in report['descriptive'] assert 'models_per_benchmark' in report['descriptive'] assert 'metadata_completeness' in report['descriptive'] + assert 'model_family_metadata_completeness' in report['descriptive'] assert 'metric_id' in report['descriptive']['score_summaries'][0] assert 'coverage_aware_model_summaries' in report['observational'] assert 'pairwise_model_comparisons' in report['observational'] @@ -193,6 +196,8 @@ def test_score_summaries_group_by_metric_identity(): report = stats.build_statistics_report( rows, summary_limit=10, + metadata_top_benchmarks=12, + metadata_top_model_families=20, comparison_limit=5, top_model_limit=5, min_shared_evals=1, @@ -318,6 +323,63 @@ def test_metadata_completeness_aggregates_other_benchmarks(): ) +def test_metadata_top_benchmarks_argument_controls_report_shape(): + rows = [ + row('model/a', 'bench-a', 'eval', 0.9, generation_temperature=0.1), + row('model/b', 'bench-a', 'eval', 0.8, generation_temperature=0.2), + row('model/c', 'bench-b', 'eval', 0.7, generation_temperature=None), + row('model/d', 'bench-c', 'eval', 0.6, generation_temperature=None), + ] + + report = stats.build_statistics_report( + rows, + summary_limit=5, + metadata_top_benchmarks=1, + metadata_top_model_families=20, + comparison_limit=5, + top_model_limit=5, + min_shared_evals=1, + descriptive_only=True, + ) + completeness = report['descriptive']['metadata_completeness'] + + assert completeness['top_benchmark_count'] == 1 + assert [item['benchmark'] for item in completeness['benchmarks']] == [ + 'bench-a', + 'Other', + ] + assert completeness['other_result_rows'] == 2 + + +def test_metadata_top_model_families_argument_controls_report_shape(): + rows = [ + row('family-a/model-1', 'bench', 'eval', 0.9, model_license='mit'), + row('family-a/model-2', 'bench', 'eval', 0.8, 
model_license='apache'), + row('family-b/model-1', 'bench', 'eval', 0.7, model_license=None), + row('family-c/model-1', 'bench', 'eval', 0.6, model_license=None), + ] + + report = stats.build_statistics_report( + rows, + summary_limit=5, + metadata_top_benchmarks=20, + metadata_top_model_families=1, + comparison_limit=5, + top_model_limit=5, + min_shared_evals=1, + descriptive_only=True, + ) + completeness = report['descriptive'][ + 'model_family_metadata_completeness' + ] + + assert completeness['top_model_family_count'] == 1 + assert [ + item['model_family'] for item in completeness['model_families'] + ] == ['family-a', 'Other'] + assert completeness['other_result_rows'] == 2 + + def test_metadata_field_selection_favors_missing_and_uneven_fields(): rows = [ row( @@ -357,4 +419,6 @@ def test_cli_help_uses_summary_limit_not_top_n(capsys): output = capsys.readouterr().out assert '--summary-limit' in output + assert '--metadata-top-benchmarks' in output + assert '--metadata-top-model-families' in output assert '--top-n' not in output From c86aa2ee5049bbf6fe09bf46f34a240d3dcce824 Mon Sep 17 00:00:00 2001 From: Tommaso Cerruti <79256764+tommasocerruti@users.noreply.github.com> Date: Tue, 5 May 2026 21:52:07 +0200 Subject: [PATCH 15/15] Delete audit/dataset_statistics.json --- audit/dataset_statistics.json | 11943 -------------------------------- 1 file changed, 11943 deletions(-) delete mode 100644 audit/dataset_statistics.json diff --git a/audit/dataset_statistics.json b/audit/dataset_statistics.json deleted file mode 100644 index b6d21586b..000000000 --- a/audit/dataset_statistics.json +++ /dev/null @@ -1,11943 +0,0 @@ -{ - "descriptive": { - "counts": { - "result_rows": 40495, - "unique_benchmarks": 59, - "unique_developers": 794, - "unique_evaluations": 178, - "unique_models": 5299 - }, - "inference_engines": [ - { - "count": 39618, - "value": "unknown" - }, - { - "count": 450, - "value": "ollama" - }, - { - "count": 150, - "value": "openai" - }, - { - "count": 54, 
- "value": "google" - }, - { - "count": 47, - "value": "anthropic" - }, - { - "count": 33, - "value": "gemini" - }, - { - "count": 30, - "value": "openrouter" - }, - { - "count": 26, - "value": "deepseek" - }, - { - "count": 18, - "value": "minimax" - }, - { - "count": 15, - "value": "moonshot" - }, - { - "count": 15, - "value": "ark" - }, - { - "count": 12, - "value": "zhipu" - }, - { - "count": 12, - "value": "qwen" - }, - { - "count": 12, - "value": "aliyun" - }, - { - "count": 3, - "value": "kuaishou" - } - ], - "metadata_completeness": { - "benchmark_selection": "Top benchmarks by result-row count, with remaining benchmarks aggregated as Other; rows are sorted by overall metadata completeness.", - "benchmarks": [ - { - "benchmark": "BBH", - "label": "BBH (n=4,574)", - "overall_completeness": 0.07692307692307693, - "result_rows": 4574 - }, - { - "benchmark": "MATH Level 5", - "label": "MATH Level 5 (n=4,574)", - "overall_completeness": 0.07692307692307693, - "result_rows": 4574 - }, - { - "benchmark": "MMLU-PRO", - "label": "MMLU-PRO (n=4,574)", - "overall_completeness": 0.07692307692307693, - "result_rows": 4574 - }, - { - "benchmark": "MUSR", - "label": "MUSR (n=4,574)", - "overall_completeness": 0.07692307692307693, - "result_rows": 4574 - }, - { - "benchmark": "RewardBench", - "label": "RewardBench (n=1,025)", - "overall_completeness": 0.07692307692307693, - "result_rows": 1025 - }, - { - "benchmark": "RewardBench 2", - "label": "RewardBench 2 (n=1,379)", - "overall_completeness": 0.07692307692307693, - "result_rows": 1379 - }, - { - "benchmark": "GPQA", - "label": "GPQA (n=4,635)", - "overall_completeness": 0.07793544104223715, - "result_rows": 4635 - }, - { - "benchmark": "IFEval", - "label": "IFEval (n=4,635)", - "overall_completeness": 0.07793544104223715, - "result_rows": 4635 - }, - { - "benchmark": "GSM8K", - "label": "GSM8K (n=91)", - "overall_completeness": 0.15384615384615385, - "result_rows": 91 - }, - { - "benchmark": "LegalBench", - "label": 
"LegalBench (n=91)", - "overall_completeness": 0.15384615384615385, - "result_rows": 91 - }, - { - "benchmark": "MATH", - "label": "MATH (n=91)", - "overall_completeness": 0.15384615384615385, - "result_rows": 91 - }, - { - "benchmark": "MMLU", - "label": "MMLU (n=91)", - "overall_completeness": 0.15384615384615385, - "result_rows": 91 - }, - { - "benchmark": "MedQA", - "label": "MedQA (n=91)", - "overall_completeness": 0.15384615384615385, - "result_rows": 91 - }, - { - "benchmark": "NarrativeQA", - "label": "NarrativeQA (n=91)", - "overall_completeness": 0.15384615384615385, - "result_rows": 91 - }, - { - "benchmark": "helm_mmlu", - "label": "helm_mmlu (n=2,844)", - "overall_completeness": 0.15384615384615385, - "result_rows": 2844 - }, - { - "benchmark": "global-mmlu-lite", - "label": "global-mmlu-lite (n=912)", - "overall_completeness": 0.21862348178137653, - "result_rows": 912 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "label": "ARC Prize evaluations leaderboard JSON (n=1,020)", - "overall_completeness": 0.3076923076923077, - "result_rows": 1020 - }, - { - "benchmark": "SciArena leaderboard API", - "label": "SciArena leaderboard API (n=114)", - "overall_completeness": 0.3076923076923077, - "result_rows": 114 - }, - { - "benchmark": "BFCL leaderboard CSV", - "label": "BFCL leaderboard CSV (n=3,350)", - "overall_completeness": 0.38461538461538464, - "result_rows": 3350 - }, - { - "benchmark": "wordle_arena_daily", - "label": "wordle_arena_daily (n=92)", - "overall_completeness": 0.6153846153846154, - "result_rows": 92 - }, - { - "benchmark": "Other", - "label": "Other (n=1,647)", - "overall_completeness": 0.3430946709635234, - "result_rows": 1647 - } - ], - "field_group_order": [ - "eval metadata", - "benchmark metadata", - "model metadata" - ], - "fields": [ - { - "benchmark_stddev": 0.4794204729297534, - "group": "eval metadata", - "group_stddev": 0.4794204729297534, - "key": "generation_config_present", - "label": "generation config", 
- "missing_rate": 0.8809235708112113, - "selection_score": 0.4223327949332781 - }, - { - "benchmark_stddev": 0.30224668644283065, - "group": "eval metadata", - "group_stddev": 0.30224668644283065, - "key": "generation_agentic_config_present", - "label": "agentic config", - "missing_rate": 0.9977775033954809, - "selection_score": 0.3015749442084843 - }, - { - "benchmark_stddev": 0.30224668644283065, - "group": "eval metadata", - "group_stddev": 0.30224668644283065, - "key": "generation_max_tokens", - "label": "max tokens", - "missing_rate": 0.9880726015557476, - "selection_score": 0.298641669785172 - }, - { - "benchmark_stddev": 0.30224668644283065, - "group": "eval metadata", - "group_stddev": 0.30224668644283065, - "key": "generation_temperature", - "label": "temperature", - "missing_rate": 0.9880726015557476, - "selection_score": 0.298641669785172 - }, - { - "benchmark_stddev": 0.40252007074704804, - "group": "eval metadata", - "group_stddev": 0.40252007074704804, - "key": "inference_engine", - "label": "inference engine/platform", - "missing_rate": 0.1724410421039635, - "selection_score": 0.06941098046738207 - }, - { - "benchmark_stddev": 0.43200860863664675, - "group": "benchmark metadata", - "group_stddev": 0.43200860863664675, - "key": "uncertainty_num_samples", - "label": "sample count", - "missing_rate": 0.9801456969996296, - "selection_score": 0.4234313788220063 - }, - { - "benchmark_stddev": 0.43144687321918196, - "group": "benchmark metadata", - "group_stddev": 0.43144687321918196, - "key": "has_uncertainty", - "label": "uncertainty", - "missing_rate": 0.9604148660328435, - "selection_score": 0.41436799094308985 - }, - { - "benchmark_stddev": 0.40252007074704804, - "group": "benchmark metadata", - "group_stddev": 0.40252007074704804, - "key": "detailed_results_file", - "label": "detailed results", - "missing_rate": 0.9803432522533646, - "selection_score": 0.3946078352534155 - }, - { - "benchmark_stddev": 0.49820130361691756, - "group": "benchmark 
metadata", - "group_stddev": 0.49820130361691756, - "key": "source_organization_url", - "label": "source org URL", - "missing_rate": 0.7824175824175824, - "selection_score": 0.3898014595332366 - }, - { - "benchmark_stddev": 0.3595458209423123, - "group": "benchmark metadata", - "group_stddev": 0.3595458209423123, - "key": "metric_id", - "label": "metric ID", - "missing_rate": 0.8815409309791332, - "selection_score": 0.31695435772314273 - }, - { - "benchmark_stddev": 0.3595458209423123, - "group": "benchmark metadata", - "group_stddev": 0.3595458209423123, - "key": "metric_unit", - "label": "metric unit", - "missing_rate": 0.8815409309791332, - "selection_score": 0.31695435772314273 - }, - { - "benchmark_stddev": 0.21968612536975798, - "group": "benchmark metadata", - "group_stddev": 0.21968612536975798, - "key": "metric_kind", - "label": "metric kind", - "missing_rate": 0.8892702802815162, - "selection_score": 0.195360342281525 - }, - { - "benchmark_stddev": 0.12908090009938827, - "group": "model metadata", - "group_stddev": 0.12908090009938827, - "key": "model_license", - "label": "model license", - "missing_rate": 0.9172737374984566, - "selection_score": 0.11840251967383078 - } - ], - "matrix": [ - { - "benchmark": "BBH", - "benchmark_label": "BBH (n=4,574)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "BBH", - "benchmark_label": "BBH (n=4,574)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "BBH", - "benchmark_label": "BBH (n=4,574)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "BBH", - "benchmark_label": "BBH (n=4,574)", - "field": "generation_temperature", - "field_label": "temperature", - 
"missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "BBH", - "benchmark_label": "BBH (n=4,574)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 4574 - }, - { - "benchmark": "BBH", - "benchmark_label": "BBH (n=4,574)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "BBH", - "benchmark_label": "BBH (n=4,574)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "BBH", - "benchmark_label": "BBH (n=4,574)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "BBH", - "benchmark_label": "BBH (n=4,574)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "BBH", - "benchmark_label": "BBH (n=4,574)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "BBH", - "benchmark_label": "BBH (n=4,574)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "BBH", - "benchmark_label": "BBH (n=4,574)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "BBH", - "benchmark_label": "BBH (n=4,574)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MATH Level 5", - "benchmark_label": "MATH Level 5 (n=4,574)", - "field": 
"generation_config_present", - "field_label": "generation config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MATH Level 5", - "benchmark_label": "MATH Level 5 (n=4,574)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MATH Level 5", - "benchmark_label": "MATH Level 5 (n=4,574)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MATH Level 5", - "benchmark_label": "MATH Level 5 (n=4,574)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MATH Level 5", - "benchmark_label": "MATH Level 5 (n=4,574)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 4574 - }, - { - "benchmark": "MATH Level 5", - "benchmark_label": "MATH Level 5 (n=4,574)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MATH Level 5", - "benchmark_label": "MATH Level 5 (n=4,574)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MATH Level 5", - "benchmark_label": "MATH Level 5 (n=4,574)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MATH Level 5", - "benchmark_label": "MATH Level 5 (n=4,574)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MATH Level 5", - 
"benchmark_label": "MATH Level 5 (n=4,574)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MATH Level 5", - "benchmark_label": "MATH Level 5 (n=4,574)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MATH Level 5", - "benchmark_label": "MATH Level 5 (n=4,574)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MATH Level 5", - "benchmark_label": "MATH Level 5 (n=4,574)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MMLU-PRO", - "benchmark_label": "MMLU-PRO (n=4,574)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MMLU-PRO", - "benchmark_label": "MMLU-PRO (n=4,574)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MMLU-PRO", - "benchmark_label": "MMLU-PRO (n=4,574)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MMLU-PRO", - "benchmark_label": "MMLU-PRO (n=4,574)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MMLU-PRO", - "benchmark_label": "MMLU-PRO (n=4,574)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 4574 - }, - { - "benchmark": "MMLU-PRO", - "benchmark_label": "MMLU-PRO (n=4,574)", - 
"field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MMLU-PRO", - "benchmark_label": "MMLU-PRO (n=4,574)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MMLU-PRO", - "benchmark_label": "MMLU-PRO (n=4,574)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MMLU-PRO", - "benchmark_label": "MMLU-PRO (n=4,574)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MMLU-PRO", - "benchmark_label": "MMLU-PRO (n=4,574)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MMLU-PRO", - "benchmark_label": "MMLU-PRO (n=4,574)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MMLU-PRO", - "benchmark_label": "MMLU-PRO (n=4,574)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MMLU-PRO", - "benchmark_label": "MMLU-PRO (n=4,574)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MUSR", - "benchmark_label": "MUSR (n=4,574)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MUSR", - "benchmark_label": "MUSR (n=4,574)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", - "missing_rate": 1.0, 
- "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MUSR", - "benchmark_label": "MUSR (n=4,574)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MUSR", - "benchmark_label": "MUSR (n=4,574)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MUSR", - "benchmark_label": "MUSR (n=4,574)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 4574 - }, - { - "benchmark": "MUSR", - "benchmark_label": "MUSR (n=4,574)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MUSR", - "benchmark_label": "MUSR (n=4,574)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MUSR", - "benchmark_label": "MUSR (n=4,574)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MUSR", - "benchmark_label": "MUSR (n=4,574)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MUSR", - "benchmark_label": "MUSR (n=4,574)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MUSR", - "benchmark_label": "MUSR (n=4,574)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MUSR", - "benchmark_label": "MUSR (n=4,574)", - "field": "metric_kind", - 
"field_label": "metric kind", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "MUSR", - "benchmark_label": "MUSR (n=4,574)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4574 - }, - { - "benchmark": "RewardBench", - "benchmark_label": "RewardBench (n=1,025)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1025 - }, - { - "benchmark": "RewardBench", - "benchmark_label": "RewardBench (n=1,025)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1025 - }, - { - "benchmark": "RewardBench", - "benchmark_label": "RewardBench (n=1,025)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1025 - }, - { - "benchmark": "RewardBench", - "benchmark_label": "RewardBench (n=1,025)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1025 - }, - { - "benchmark": "RewardBench", - "benchmark_label": "RewardBench (n=1,025)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1025 - }, - { - "benchmark": "RewardBench", - "benchmark_label": "RewardBench (n=1,025)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1025 - }, - { - "benchmark": "RewardBench", - "benchmark_label": "RewardBench (n=1,025)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1025 - }, - { - "benchmark": "RewardBench", - "benchmark_label": "RewardBench (n=1,025)", - "field": "detailed_results_file", - 
"field_label": "detailed results", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1025 - }, - { - "benchmark": "RewardBench", - "benchmark_label": "RewardBench (n=1,025)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 1025 - }, - { - "benchmark": "RewardBench", - "benchmark_label": "RewardBench (n=1,025)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1025 - }, - { - "benchmark": "RewardBench", - "benchmark_label": "RewardBench (n=1,025)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1025 - }, - { - "benchmark": "RewardBench", - "benchmark_label": "RewardBench (n=1,025)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1025 - }, - { - "benchmark": "RewardBench", - "benchmark_label": "RewardBench (n=1,025)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1025 - }, - { - "benchmark": "RewardBench 2", - "benchmark_label": "RewardBench 2 (n=1,379)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1379 - }, - { - "benchmark": "RewardBench 2", - "benchmark_label": "RewardBench 2 (n=1,379)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1379 - }, - { - "benchmark": "RewardBench 2", - "benchmark_label": "RewardBench 2 (n=1,379)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1379 - }, - { - "benchmark": "RewardBench 2", - "benchmark_label": "RewardBench 2 (n=1,379)", - "field": "generation_temperature", - 
"field_label": "temperature", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1379 - }, - { - "benchmark": "RewardBench 2", - "benchmark_label": "RewardBench 2 (n=1,379)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1379 - }, - { - "benchmark": "RewardBench 2", - "benchmark_label": "RewardBench 2 (n=1,379)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1379 - }, - { - "benchmark": "RewardBench 2", - "benchmark_label": "RewardBench 2 (n=1,379)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1379 - }, - { - "benchmark": "RewardBench 2", - "benchmark_label": "RewardBench 2 (n=1,379)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1379 - }, - { - "benchmark": "RewardBench 2", - "benchmark_label": "RewardBench 2 (n=1,379)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 1379 - }, - { - "benchmark": "RewardBench 2", - "benchmark_label": "RewardBench 2 (n=1,379)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1379 - }, - { - "benchmark": "RewardBench 2", - "benchmark_label": "RewardBench 2 (n=1,379)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1379 - }, - { - "benchmark": "RewardBench 2", - "benchmark_label": "RewardBench 2 (n=1,379)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1379 - }, - { - "benchmark": "RewardBench 2", - "benchmark_label": "RewardBench 2 (n=1,379)", - "field": "model_license", - 
"field_label": "model license", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1379 - }, - { - "benchmark": "GPQA", - "benchmark_label": "GPQA (n=4,635)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 0.9868392664509169, - "present_rate": 0.013160733549083063, - "result_rows": 4635 - }, - { - "benchmark": "GPQA", - "benchmark_label": "GPQA (n=4,635)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4635 - }, - { - "benchmark": "GPQA", - "benchmark_label": "GPQA (n=4,635)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4635 - }, - { - "benchmark": "GPQA", - "benchmark_label": "GPQA (n=4,635)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4635 - }, - { - "benchmark": "GPQA", - "benchmark_label": "GPQA (n=4,635)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 4635 - }, - { - "benchmark": "GPQA", - "benchmark_label": "GPQA (n=4,635)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4635 - }, - { - "benchmark": "GPQA", - "benchmark_label": "GPQA (n=4,635)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4635 - }, - { - "benchmark": "GPQA", - "benchmark_label": "GPQA (n=4,635)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4635 - }, - { - "benchmark": "GPQA", - "benchmark_label": "GPQA (n=4,635)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 1.0, - 
"present_rate": 0.0, - "result_rows": 4635 - }, - { - "benchmark": "GPQA", - "benchmark_label": "GPQA (n=4,635)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4635 - }, - { - "benchmark": "GPQA", - "benchmark_label": "GPQA (n=4,635)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4635 - }, - { - "benchmark": "GPQA", - "benchmark_label": "GPQA (n=4,635)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4635 - }, - { - "benchmark": "GPQA", - "benchmark_label": "GPQA (n=4,635)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4635 - }, - { - "benchmark": "IFEval", - "benchmark_label": "IFEval (n=4,635)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 0.9868392664509169, - "present_rate": 0.013160733549083063, - "result_rows": 4635 - }, - { - "benchmark": "IFEval", - "benchmark_label": "IFEval (n=4,635)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4635 - }, - { - "benchmark": "IFEval", - "benchmark_label": "IFEval (n=4,635)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4635 - }, - { - "benchmark": "IFEval", - "benchmark_label": "IFEval (n=4,635)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4635 - }, - { - "benchmark": "IFEval", - "benchmark_label": "IFEval (n=4,635)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 4635 - }, - { - "benchmark": "IFEval", - 
"benchmark_label": "IFEval (n=4,635)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4635 - }, - { - "benchmark": "IFEval", - "benchmark_label": "IFEval (n=4,635)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4635 - }, - { - "benchmark": "IFEval", - "benchmark_label": "IFEval (n=4,635)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4635 - }, - { - "benchmark": "IFEval", - "benchmark_label": "IFEval (n=4,635)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4635 - }, - { - "benchmark": "IFEval", - "benchmark_label": "IFEval (n=4,635)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4635 - }, - { - "benchmark": "IFEval", - "benchmark_label": "IFEval (n=4,635)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4635 - }, - { - "benchmark": "IFEval", - "benchmark_label": "IFEval (n=4,635)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4635 - }, - { - "benchmark": "IFEval", - "benchmark_label": "IFEval (n=4,635)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 4635 - }, - { - "benchmark": "GSM8K", - "benchmark_label": "GSM8K (n=91)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 91 - }, - { - "benchmark": "GSM8K", - "benchmark_label": "GSM8K (n=91)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", - 
"missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "GSM8K", - "benchmark_label": "GSM8K (n=91)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "GSM8K", - "benchmark_label": "GSM8K (n=91)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "GSM8K", - "benchmark_label": "GSM8K (n=91)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 91 - }, - { - "benchmark": "GSM8K", - "benchmark_label": "GSM8K (n=91)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "GSM8K", - "benchmark_label": "GSM8K (n=91)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "GSM8K", - "benchmark_label": "GSM8K (n=91)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "GSM8K", - "benchmark_label": "GSM8K (n=91)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "GSM8K", - "benchmark_label": "GSM8K (n=91)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "GSM8K", - "benchmark_label": "GSM8K (n=91)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "GSM8K", - "benchmark_label": "GSM8K (n=91)", - "field": "metric_kind", - 
"field_label": "metric kind", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "GSM8K", - "benchmark_label": "GSM8K (n=91)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "LegalBench", - "benchmark_label": "LegalBench (n=91)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 91 - }, - { - "benchmark": "LegalBench", - "benchmark_label": "LegalBench (n=91)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "LegalBench", - "benchmark_label": "LegalBench (n=91)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "LegalBench", - "benchmark_label": "LegalBench (n=91)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "LegalBench", - "benchmark_label": "LegalBench (n=91)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 91 - }, - { - "benchmark": "LegalBench", - "benchmark_label": "LegalBench (n=91)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "LegalBench", - "benchmark_label": "LegalBench (n=91)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "LegalBench", - "benchmark_label": "LegalBench (n=91)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, 
- "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "LegalBench", - "benchmark_label": "LegalBench (n=91)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "LegalBench", - "benchmark_label": "LegalBench (n=91)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "LegalBench", - "benchmark_label": "LegalBench (n=91)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "LegalBench", - "benchmark_label": "LegalBench (n=91)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "LegalBench", - "benchmark_label": "LegalBench (n=91)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MATH", - "benchmark_label": "MATH (n=91)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 91 - }, - { - "benchmark": "MATH", - "benchmark_label": "MATH (n=91)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MATH", - "benchmark_label": "MATH (n=91)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MATH", - "benchmark_label": "MATH (n=91)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MATH", - "benchmark_label": "MATH (n=91)", - "field": 
"inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 91 - }, - { - "benchmark": "MATH", - "benchmark_label": "MATH (n=91)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MATH", - "benchmark_label": "MATH (n=91)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MATH", - "benchmark_label": "MATH (n=91)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MATH", - "benchmark_label": "MATH (n=91)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MATH", - "benchmark_label": "MATH (n=91)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MATH", - "benchmark_label": "MATH (n=91)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MATH", - "benchmark_label": "MATH (n=91)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MATH", - "benchmark_label": "MATH (n=91)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MMLU", - "benchmark_label": "MMLU (n=91)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 91 - }, - { - "benchmark": "MMLU", - "benchmark_label": "MMLU (n=91)", - 
"field": "generation_agentic_config_present", - "field_label": "agentic config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MMLU", - "benchmark_label": "MMLU (n=91)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MMLU", - "benchmark_label": "MMLU (n=91)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MMLU", - "benchmark_label": "MMLU (n=91)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 91 - }, - { - "benchmark": "MMLU", - "benchmark_label": "MMLU (n=91)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MMLU", - "benchmark_label": "MMLU (n=91)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MMLU", - "benchmark_label": "MMLU (n=91)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MMLU", - "benchmark_label": "MMLU (n=91)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MMLU", - "benchmark_label": "MMLU (n=91)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MMLU", - "benchmark_label": "MMLU (n=91)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MMLU", - 
"benchmark_label": "MMLU (n=91)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MMLU", - "benchmark_label": "MMLU (n=91)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MedQA", - "benchmark_label": "MedQA (n=91)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 91 - }, - { - "benchmark": "MedQA", - "benchmark_label": "MedQA (n=91)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MedQA", - "benchmark_label": "MedQA (n=91)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MedQA", - "benchmark_label": "MedQA (n=91)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MedQA", - "benchmark_label": "MedQA (n=91)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 91 - }, - { - "benchmark": "MedQA", - "benchmark_label": "MedQA (n=91)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MedQA", - "benchmark_label": "MedQA (n=91)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MedQA", - "benchmark_label": "MedQA (n=91)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "present_rate": 
0.0, - "result_rows": 91 - }, - { - "benchmark": "MedQA", - "benchmark_label": "MedQA (n=91)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MedQA", - "benchmark_label": "MedQA (n=91)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MedQA", - "benchmark_label": "MedQA (n=91)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MedQA", - "benchmark_label": "MedQA (n=91)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "MedQA", - "benchmark_label": "MedQA (n=91)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "NarrativeQA", - "benchmark_label": "NarrativeQA (n=91)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 91 - }, - { - "benchmark": "NarrativeQA", - "benchmark_label": "NarrativeQA (n=91)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "NarrativeQA", - "benchmark_label": "NarrativeQA (n=91)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "NarrativeQA", - "benchmark_label": "NarrativeQA (n=91)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "NarrativeQA", - "benchmark_label": "NarrativeQA (n=91)", - "field": 
"inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 91 - }, - { - "benchmark": "NarrativeQA", - "benchmark_label": "NarrativeQA (n=91)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "NarrativeQA", - "benchmark_label": "NarrativeQA (n=91)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "NarrativeQA", - "benchmark_label": "NarrativeQA (n=91)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "NarrativeQA", - "benchmark_label": "NarrativeQA (n=91)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "NarrativeQA", - "benchmark_label": "NarrativeQA (n=91)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "NarrativeQA", - "benchmark_label": "NarrativeQA (n=91)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "NarrativeQA", - "benchmark_label": "NarrativeQA (n=91)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "NarrativeQA", - "benchmark_label": "NarrativeQA (n=91)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 91 - }, - { - "benchmark": "helm_mmlu", - "benchmark_label": "helm_mmlu (n=2,844)", - "field": "generation_config_present", - "field_label": "generation config", - 
"missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 2844 - }, - { - "benchmark": "helm_mmlu", - "benchmark_label": "helm_mmlu (n=2,844)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 2844 - }, - { - "benchmark": "helm_mmlu", - "benchmark_label": "helm_mmlu (n=2,844)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 2844 - }, - { - "benchmark": "helm_mmlu", - "benchmark_label": "helm_mmlu (n=2,844)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 2844 - }, - { - "benchmark": "helm_mmlu", - "benchmark_label": "helm_mmlu (n=2,844)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 2844 - }, - { - "benchmark": "helm_mmlu", - "benchmark_label": "helm_mmlu (n=2,844)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 2844 - }, - { - "benchmark": "helm_mmlu", - "benchmark_label": "helm_mmlu (n=2,844)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 2844 - }, - { - "benchmark": "helm_mmlu", - "benchmark_label": "helm_mmlu (n=2,844)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 2844 - }, - { - "benchmark": "helm_mmlu", - "benchmark_label": "helm_mmlu (n=2,844)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 2844 - }, - { - "benchmark": "helm_mmlu", - "benchmark_label": "helm_mmlu (n=2,844)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - 
"present_rate": 0.0, - "result_rows": 2844 - }, - { - "benchmark": "helm_mmlu", - "benchmark_label": "helm_mmlu (n=2,844)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 2844 - }, - { - "benchmark": "helm_mmlu", - "benchmark_label": "helm_mmlu (n=2,844)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 2844 - }, - { - "benchmark": "helm_mmlu", - "benchmark_label": "helm_mmlu (n=2,844)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 2844 - }, - { - "benchmark": "global-mmlu-lite", - "benchmark_label": "global-mmlu-lite (n=912)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 912 - }, - { - "benchmark": "global-mmlu-lite", - "benchmark_label": "global-mmlu-lite (n=912)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 912 - }, - { - "benchmark": "global-mmlu-lite", - "benchmark_label": "global-mmlu-lite (n=912)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 912 - }, - { - "benchmark": "global-mmlu-lite", - "benchmark_label": "global-mmlu-lite (n=912)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 912 - }, - { - "benchmark": "global-mmlu-lite", - "benchmark_label": "global-mmlu-lite (n=912)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 912 - }, - { - "benchmark": "global-mmlu-lite", - "benchmark_label": "global-mmlu-lite (n=912)", - "field": "uncertainty_num_samples", - "field_label": "sample 
count", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 912 - }, - { - "benchmark": "global-mmlu-lite", - "benchmark_label": "global-mmlu-lite (n=912)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 0.1578947368421053, - "present_rate": 0.8421052631578947, - "result_rows": 912 - }, - { - "benchmark": "global-mmlu-lite", - "benchmark_label": "global-mmlu-lite (n=912)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 912 - }, - { - "benchmark": "global-mmlu-lite", - "benchmark_label": "global-mmlu-lite (n=912)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 912 - }, - { - "benchmark": "global-mmlu-lite", - "benchmark_label": "global-mmlu-lite (n=912)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 912 - }, - { - "benchmark": "global-mmlu-lite", - "benchmark_label": "global-mmlu-lite (n=912)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 912 - }, - { - "benchmark": "global-mmlu-lite", - "benchmark_label": "global-mmlu-lite (n=912)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 912 - }, - { - "benchmark": "global-mmlu-lite", - "benchmark_label": "global-mmlu-lite (n=912)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 912 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1020 - }, - { - "benchmark": "ARC Prize evaluations 
leaderboard JSON", - "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1020 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1020 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1020 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1020 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1020 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1020 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1020 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "benchmark_label": "ARC 
Prize evaluations leaderboard JSON (n=1,020)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 1020 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 1020 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 1020 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 1020 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "benchmark_label": "ARC Prize evaluations leaderboard JSON (n=1,020)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1020 - }, - { - "benchmark": "SciArena leaderboard API", - "benchmark_label": "SciArena leaderboard API (n=114)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 114 - }, - { - "benchmark": "SciArena leaderboard API", - "benchmark_label": "SciArena leaderboard API (n=114)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 114 - }, - { - "benchmark": "SciArena leaderboard API", - "benchmark_label": "SciArena leaderboard API (n=114)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 
114 - }, - { - "benchmark": "SciArena leaderboard API", - "benchmark_label": "SciArena leaderboard API (n=114)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 114 - }, - { - "benchmark": "SciArena leaderboard API", - "benchmark_label": "SciArena leaderboard API (n=114)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 114 - }, - { - "benchmark": "SciArena leaderboard API", - "benchmark_label": "SciArena leaderboard API (n=114)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 114 - }, - { - "benchmark": "SciArena leaderboard API", - "benchmark_label": "SciArena leaderboard API (n=114)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 114 - }, - { - "benchmark": "SciArena leaderboard API", - "benchmark_label": "SciArena leaderboard API (n=114)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 114 - }, - { - "benchmark": "SciArena leaderboard API", - "benchmark_label": "SciArena leaderboard API (n=114)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 114 - }, - { - "benchmark": "SciArena leaderboard API", - "benchmark_label": "SciArena leaderboard API (n=114)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 114 - }, - { - "benchmark": "SciArena leaderboard API", - "benchmark_label": "SciArena leaderboard API (n=114)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 114 - }, - { - "benchmark": "SciArena leaderboard 
API", - "benchmark_label": "SciArena leaderboard API (n=114)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 114 - }, - { - "benchmark": "SciArena leaderboard API", - "benchmark_label": "SciArena leaderboard API (n=114)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 114 - }, - { - "benchmark": "BFCL leaderboard CSV", - "benchmark_label": "BFCL leaderboard CSV (n=3,350)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 3350 - }, - { - "benchmark": "BFCL leaderboard CSV", - "benchmark_label": "BFCL leaderboard CSV (n=3,350)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 3350 - }, - { - "benchmark": "BFCL leaderboard CSV", - "benchmark_label": "BFCL leaderboard CSV (n=3,350)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 3350 - }, - { - "benchmark": "BFCL leaderboard CSV", - "benchmark_label": "BFCL leaderboard CSV (n=3,350)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 3350 - }, - { - "benchmark": "BFCL leaderboard CSV", - "benchmark_label": "BFCL leaderboard CSV (n=3,350)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 3350 - }, - { - "benchmark": "BFCL leaderboard CSV", - "benchmark_label": "BFCL leaderboard CSV (n=3,350)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 3350 - }, - { - "benchmark": "BFCL leaderboard CSV", - "benchmark_label": "BFCL leaderboard CSV (n=3,350)", 
- "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 3350 - }, - { - "benchmark": "BFCL leaderboard CSV", - "benchmark_label": "BFCL leaderboard CSV (n=3,350)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 3350 - }, - { - "benchmark": "BFCL leaderboard CSV", - "benchmark_label": "BFCL leaderboard CSV (n=3,350)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 3350 - }, - { - "benchmark": "BFCL leaderboard CSV", - "benchmark_label": "BFCL leaderboard CSV (n=3,350)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 3350 - }, - { - "benchmark": "BFCL leaderboard CSV", - "benchmark_label": "BFCL leaderboard CSV (n=3,350)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 3350 - }, - { - "benchmark": "BFCL leaderboard CSV", - "benchmark_label": "BFCL leaderboard CSV (n=3,350)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 3350 - }, - { - "benchmark": "BFCL leaderboard CSV", - "benchmark_label": "BFCL leaderboard CSV (n=3,350)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 3350 - }, - { - "benchmark": "wordle_arena_daily", - "benchmark_label": "wordle_arena_daily (n=92)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 92 - }, - { - "benchmark": "wordle_arena_daily", - "benchmark_label": "wordle_arena_daily (n=92)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", - "missing_rate": 1.0, - "present_rate": 0.0, 
- "result_rows": 92 - }, - { - "benchmark": "wordle_arena_daily", - "benchmark_label": "wordle_arena_daily (n=92)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 92 - }, - { - "benchmark": "wordle_arena_daily", - "benchmark_label": "wordle_arena_daily (n=92)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 92 - }, - { - "benchmark": "wordle_arena_daily", - "benchmark_label": "wordle_arena_daily (n=92)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 92 - }, - { - "benchmark": "wordle_arena_daily", - "benchmark_label": "wordle_arena_daily (n=92)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 92 - }, - { - "benchmark": "wordle_arena_daily", - "benchmark_label": "wordle_arena_daily (n=92)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 92 - }, - { - "benchmark": "wordle_arena_daily", - "benchmark_label": "wordle_arena_daily (n=92)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 92 - }, - { - "benchmark": "wordle_arena_daily", - "benchmark_label": "wordle_arena_daily (n=92)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 0.0, - "present_rate": 1.0, - "result_rows": 92 - }, - { - "benchmark": "wordle_arena_daily", - "benchmark_label": "wordle_arena_daily (n=92)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 92 - }, - { - "benchmark": "wordle_arena_daily", - "benchmark_label": "wordle_arena_daily (n=92)", - "field": "metric_unit", - 
"field_label": "metric unit", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 92 - }, - { - "benchmark": "wordle_arena_daily", - "benchmark_label": "wordle_arena_daily (n=92)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 92 - }, - { - "benchmark": "wordle_arena_daily", - "benchmark_label": "wordle_arena_daily (n=92)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 92 - }, - { - "benchmark": "Other", - "benchmark_label": "Other (n=1,647)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 0.26047358834244083, - "present_rate": 0.7395264116575592, - "result_rows": 1647 - }, - { - "benchmark": "Other", - "benchmark_label": "Other (n=1,647)", - "field": "generation_agentic_config_present", - "field_label": "agentic config", - "missing_rate": 0.9453551912568307, - "present_rate": 0.0546448087431694, - "result_rows": 1647 - }, - { - "benchmark": "Other", - "benchmark_label": "Other (n=1,647)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 0.7625986642380085, - "present_rate": 0.2374013357619915, - "result_rows": 1647 - }, - { - "benchmark": "Other", - "benchmark_label": "Other (n=1,647)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 0.7625986642380085, - "present_rate": 0.2374013357619915, - "result_rows": 1647 - }, - { - "benchmark": "Other", - "benchmark_label": "Other (n=1,647)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.05768063145112323, - "present_rate": 0.9423193685488768, - "result_rows": 1647 - }, - { - "benchmark": "Other", - "benchmark_label": "Other (n=1,647)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 0.567698846387371, - "present_rate": 0.432301153612629, - 
"result_rows": 1647 - }, - { - "benchmark": "Other", - "benchmark_label": "Other (n=1,647)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 0.548876745598057, - "present_rate": 0.45112325440194295, - "result_rows": 1647 - }, - { - "benchmark": "Other", - "benchmark_label": "Other (n=1,647)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 0.5725561627200971, - "present_rate": 0.42744383727990287, - "result_rows": 1647 - }, - { - "benchmark": "Other", - "benchmark_label": "Other (n=1,647)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 0.44201578627808136, - "present_rate": 0.5579842137219186, - "result_rows": 1647 - }, - { - "benchmark": "Other", - "benchmark_label": "Other (n=1,647)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 0.8099574984820886, - "present_rate": 0.19004250151791136, - "result_rows": 1647 - }, - { - "benchmark": "Other", - "benchmark_label": "Other (n=1,647)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 0.8099574984820886, - "present_rate": 0.19004250151791136, - "result_rows": 1647 - }, - { - "benchmark": "Other", - "benchmark_label": "Other (n=1,647)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1647 - }, - { - "benchmark": "Other", - "benchmark_label": "Other (n=1,647)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "present_rate": 0.0, - "result_rows": 1647 - } - ], - "other_result_rows": 1647, - "top_benchmark_count": 20 - }, - "model_family_metadata_completeness": { - "field_group_order": [ - "eval metadata", - "benchmark metadata", - "model metadata" - ], - "fields": [ - { - "benchmark_stddev": 0.13379488104112633, - "group": "eval metadata", - "group_stddev": 0.13379488104112633, - "key": "generation_config_present", - "label": "generation 
config", - "missing_rate": 0.8809235708112113, - "selection_score": 0.11786306436301024 - }, - { - "benchmark_stddev": 0.07810422518841251, - "group": "eval metadata", - "group_stddev": 0.07810422518841251, - "key": "generation_max_tokens", - "label": "max tokens", - "missing_rate": 0.9880726015557476, - "selection_score": 0.0771726449744107 - }, - { - "benchmark_stddev": 0.07810422518841251, - "group": "eval metadata", - "group_stddev": 0.07810422518841251, - "key": "generation_temperature", - "label": "temperature", - "missing_rate": 0.9880726015557476, - "selection_score": 0.0771726449744107 - }, - { - "benchmark_stddev": 0.2624427571702142, - "group": "eval metadata", - "group_stddev": 0.2624427571702142, - "key": "inference_engine", - "label": "inference engine/platform", - "missing_rate": 0.1724410421039635, - "selection_score": 0.04525590253906917 - }, - { - "benchmark_stddev": 0.2850357391633949, - "group": "benchmark metadata", - "group_stddev": 0.2850357391633949, - "key": "source_organization_url", - "label": "source org URL", - "missing_rate": 0.7824175824175824, - "selection_score": 0.22301697393883207 - }, - { - "benchmark_stddev": 0.15209815212736208, - "group": "benchmark metadata", - "group_stddev": 0.15209815212736208, - "key": "metric_id", - "label": "metric ID", - "missing_rate": 0.8815409309791332, - "selection_score": 0.1340807466265606 - }, - { - "benchmark_stddev": 0.15209815212736208, - "group": "benchmark metadata", - "group_stddev": 0.15209815212736208, - "key": "metric_unit", - "label": "metric unit", - "missing_rate": 0.8815409309791332, - "selection_score": 0.1340807466265606 - }, - { - "benchmark_stddev": 0.14580449624851055, - "group": "benchmark metadata", - "group_stddev": 0.14580449624851055, - "key": "metric_kind", - "label": "metric kind", - "missing_rate": 0.8892702802815162, - "selection_score": 0.12965960524521825 - }, - { - "benchmark_stddev": 0.11349462538693254, - "group": "benchmark metadata", - "group_stddev": 
0.11349462538693254, - "key": "has_uncertainty", - "label": "uncertainty", - "missing_rate": 0.9604148660328435, - "selection_score": 0.10900192543643858 - }, - { - "benchmark_stddev": 0.10935499889469129, - "group": "benchmark metadata", - "group_stddev": 0.10935499889469129, - "key": "detailed_results_file", - "label": "detailed results", - "missing_rate": 0.9803432522533646, - "selection_score": 0.10720543526658476 - }, - { - "benchmark_stddev": 0.10524043602312402, - "group": "benchmark metadata", - "group_stddev": 0.10524043602312402, - "key": "uncertainty_num_samples", - "label": "sample count", - "missing_rate": 0.9801456969996296, - "selection_score": 0.10315096051842983 - }, - { - "benchmark_stddev": 0.12620100124075465, - "group": "model metadata", - "group_stddev": 0.12620100124075465, - "key": "model_license", - "label": "model license", - "missing_rate": 0.9172737374984566, - "selection_score": 0.11576086408415438 - }, - { - "benchmark_stddev": 0.32502359224633115, - "group": "model metadata", - "group_stddev": 0.32502359224633115, - "key": "model_parameters", - "label": "model parameters", - "missing_rate": 0.32228670206198295, - "selection_score": 0.10475078163740877 - } - ], - "matrix": [ - { - "benchmark": "allenai", - "benchmark_label": "allenai (n=1,249)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 0.9151321056845476, - "model_family": "allenai", - "model_family_label": "allenai (n=1,249)", - "present_rate": 0.08486789431545236, - "result_rows": 1249 - }, - { - "benchmark": "allenai", - "benchmark_label": "allenai (n=1,249)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "model_family": "allenai", - "model_family_label": "allenai (n=1,249)", - "present_rate": 0.0, - "result_rows": 1249 - }, - { - "benchmark": "allenai", - "benchmark_label": "allenai (n=1,249)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 
1.0, - "model_family": "allenai", - "model_family_label": "allenai (n=1,249)", - "present_rate": 0.0, - "result_rows": 1249 - }, - { - "benchmark": "allenai", - "benchmark_label": "allenai (n=1,249)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.833466773418735, - "model_family": "allenai", - "model_family_label": "allenai (n=1,249)", - "present_rate": 0.16653322658126501, - "result_rows": 1249 - }, - { - "benchmark": "allenai", - "benchmark_label": "allenai (n=1,249)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 0.16653322658126501, - "model_family": "allenai", - "model_family_label": "allenai (n=1,249)", - "present_rate": 0.833466773418735, - "result_rows": 1249 - }, - { - "benchmark": "allenai", - "benchmark_label": "allenai (n=1,249)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 0.9975980784627703, - "model_family": "allenai", - "model_family_label": "allenai (n=1,249)", - "present_rate": 0.0024019215372297837, - "result_rows": 1249 - }, - { - "benchmark": "allenai", - "benchmark_label": "allenai (n=1,249)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 0.9975980784627703, - "model_family": "allenai", - "model_family_label": "allenai (n=1,249)", - "present_rate": 0.0024019215372297837, - "result_rows": 1249 - }, - { - "benchmark": "allenai", - "benchmark_label": "allenai (n=1,249)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 0.9975980784627703, - "model_family": "allenai", - "model_family_label": "allenai (n=1,249)", - "present_rate": 0.0024019215372297837, - "result_rows": 1249 - }, - { - "benchmark": "allenai", - "benchmark_label": "allenai (n=1,249)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "model_family": "allenai", - "model_family_label": "allenai (n=1,249)", - "present_rate": 0.0, - "result_rows": 1249 - }, - { - 
"benchmark": "allenai", - "benchmark_label": "allenai (n=1,249)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "model_family": "allenai", - "model_family_label": "allenai (n=1,249)", - "present_rate": 0.0, - "result_rows": 1249 - }, - { - "benchmark": "allenai", - "benchmark_label": "allenai (n=1,249)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "model_family": "allenai", - "model_family_label": "allenai (n=1,249)", - "present_rate": 0.0, - "result_rows": 1249 - }, - { - "benchmark": "allenai", - "benchmark_label": "allenai (n=1,249)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "model_family": "allenai", - "model_family_label": "allenai (n=1,249)", - "present_rate": 0.0, - "result_rows": 1249 - }, - { - "benchmark": "allenai", - "benchmark_label": "allenai (n=1,249)", - "field": "model_parameters", - "field_label": "model parameters", - "missing_rate": 0.9183346677341874, - "model_family": "allenai", - "model_family_label": "allenai (n=1,249)", - "present_rate": 0.08166533226581266, - "result_rows": 1249 - }, - { - "benchmark": "DreadPoor", - "benchmark_label": "DreadPoor (n=714)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 1.0, - "model_family": "DreadPoor", - "model_family_label": "DreadPoor (n=714)", - "present_rate": 0.0, - "result_rows": 714 - }, - { - "benchmark": "DreadPoor", - "benchmark_label": "DreadPoor (n=714)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "model_family": "DreadPoor", - "model_family_label": "DreadPoor (n=714)", - "present_rate": 0.0, - "result_rows": 714 - }, - { - "benchmark": "DreadPoor", - "benchmark_label": "DreadPoor (n=714)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "model_family": "DreadPoor", - "model_family_label": "DreadPoor 
(n=714)", - "present_rate": 0.0, - "result_rows": 714 - }, - { - "benchmark": "DreadPoor", - "benchmark_label": "DreadPoor (n=714)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "model_family": "DreadPoor", - "model_family_label": "DreadPoor (n=714)", - "present_rate": 1.0, - "result_rows": 714 - }, - { - "benchmark": "DreadPoor", - "benchmark_label": "DreadPoor (n=714)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 1.0, - "model_family": "DreadPoor", - "model_family_label": "DreadPoor (n=714)", - "present_rate": 0.0, - "result_rows": 714 - }, - { - "benchmark": "DreadPoor", - "benchmark_label": "DreadPoor (n=714)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "model_family": "DreadPoor", - "model_family_label": "DreadPoor (n=714)", - "present_rate": 0.0, - "result_rows": 714 - }, - { - "benchmark": "DreadPoor", - "benchmark_label": "DreadPoor (n=714)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "model_family": "DreadPoor", - "model_family_label": "DreadPoor (n=714)", - "present_rate": 0.0, - "result_rows": 714 - }, - { - "benchmark": "DreadPoor", - "benchmark_label": "DreadPoor (n=714)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "model_family": "DreadPoor", - "model_family_label": "DreadPoor (n=714)", - "present_rate": 0.0, - "result_rows": 714 - }, - { - "benchmark": "DreadPoor", - "benchmark_label": "DreadPoor (n=714)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "model_family": "DreadPoor", - "model_family_label": "DreadPoor (n=714)", - "present_rate": 0.0, - "result_rows": 714 - }, - { - "benchmark": "DreadPoor", - "benchmark_label": "DreadPoor (n=714)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "model_family": "DreadPoor", - "model_family_label": 
"DreadPoor (n=714)", - "present_rate": 0.0, - "result_rows": 714 - }, - { - "benchmark": "DreadPoor", - "benchmark_label": "DreadPoor (n=714)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "model_family": "DreadPoor", - "model_family_label": "DreadPoor (n=714)", - "present_rate": 0.0, - "result_rows": 714 - }, - { - "benchmark": "DreadPoor", - "benchmark_label": "DreadPoor (n=714)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "model_family": "DreadPoor", - "model_family_label": "DreadPoor (n=714)", - "present_rate": 0.0, - "result_rows": 714 - }, - { - "benchmark": "DreadPoor", - "benchmark_label": "DreadPoor (n=714)", - "field": "model_parameters", - "field_label": "model parameters", - "missing_rate": 0.0, - "model_family": "DreadPoor", - "model_family_label": "DreadPoor (n=714)", - "present_rate": 1.0, - "result_rows": 714 - }, - { - "benchmark": "JayHyeon", - "benchmark_label": "JayHyeon (n=1,044)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 1.0, - "model_family": "JayHyeon", - "model_family_label": "JayHyeon (n=1,044)", - "present_rate": 0.0, - "result_rows": 1044 - }, - { - "benchmark": "JayHyeon", - "benchmark_label": "JayHyeon (n=1,044)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "model_family": "JayHyeon", - "model_family_label": "JayHyeon (n=1,044)", - "present_rate": 0.0, - "result_rows": 1044 - }, - { - "benchmark": "JayHyeon", - "benchmark_label": "JayHyeon (n=1,044)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "model_family": "JayHyeon", - "model_family_label": "JayHyeon (n=1,044)", - "present_rate": 0.0, - "result_rows": 1044 - }, - { - "benchmark": "JayHyeon", - "benchmark_label": "JayHyeon (n=1,044)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - 
"model_family": "JayHyeon", - "model_family_label": "JayHyeon (n=1,044)", - "present_rate": 1.0, - "result_rows": 1044 - }, - { - "benchmark": "JayHyeon", - "benchmark_label": "JayHyeon (n=1,044)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 1.0, - "model_family": "JayHyeon", - "model_family_label": "JayHyeon (n=1,044)", - "present_rate": 0.0, - "result_rows": 1044 - }, - { - "benchmark": "JayHyeon", - "benchmark_label": "JayHyeon (n=1,044)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "model_family": "JayHyeon", - "model_family_label": "JayHyeon (n=1,044)", - "present_rate": 0.0, - "result_rows": 1044 - }, - { - "benchmark": "JayHyeon", - "benchmark_label": "JayHyeon (n=1,044)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "model_family": "JayHyeon", - "model_family_label": "JayHyeon (n=1,044)", - "present_rate": 0.0, - "result_rows": 1044 - }, - { - "benchmark": "JayHyeon", - "benchmark_label": "JayHyeon (n=1,044)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "model_family": "JayHyeon", - "model_family_label": "JayHyeon (n=1,044)", - "present_rate": 0.0, - "result_rows": 1044 - }, - { - "benchmark": "JayHyeon", - "benchmark_label": "JayHyeon (n=1,044)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "model_family": "JayHyeon", - "model_family_label": "JayHyeon (n=1,044)", - "present_rate": 0.0, - "result_rows": 1044 - }, - { - "benchmark": "JayHyeon", - "benchmark_label": "JayHyeon (n=1,044)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "model_family": "JayHyeon", - "model_family_label": "JayHyeon (n=1,044)", - "present_rate": 0.0, - "result_rows": 1044 - }, - { - "benchmark": "JayHyeon", - "benchmark_label": "JayHyeon (n=1,044)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - 
"missing_rate": 1.0, - "model_family": "JayHyeon", - "model_family_label": "JayHyeon (n=1,044)", - "present_rate": 0.0, - "result_rows": 1044 - }, - { - "benchmark": "JayHyeon", - "benchmark_label": "JayHyeon (n=1,044)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "model_family": "JayHyeon", - "model_family_label": "JayHyeon (n=1,044)", - "present_rate": 0.0, - "result_rows": 1044 - }, - { - "benchmark": "JayHyeon", - "benchmark_label": "JayHyeon (n=1,044)", - "field": "model_parameters", - "field_label": "model parameters", - "missing_rate": 0.0, - "model_family": "JayHyeon", - "model_family_label": "JayHyeon (n=1,044)", - "present_rate": 1.0, - "result_rows": 1044 - }, - { - "benchmark": "LeroyDyer", - "benchmark_label": "LeroyDyer (n=360)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 1.0, - "model_family": "LeroyDyer", - "model_family_label": "LeroyDyer (n=360)", - "present_rate": 0.0, - "result_rows": 360 - }, - { - "benchmark": "LeroyDyer", - "benchmark_label": "LeroyDyer (n=360)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "model_family": "LeroyDyer", - "model_family_label": "LeroyDyer (n=360)", - "present_rate": 0.0, - "result_rows": 360 - }, - { - "benchmark": "LeroyDyer", - "benchmark_label": "LeroyDyer (n=360)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "model_family": "LeroyDyer", - "model_family_label": "LeroyDyer (n=360)", - "present_rate": 0.0, - "result_rows": 360 - }, - { - "benchmark": "LeroyDyer", - "benchmark_label": "LeroyDyer (n=360)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "model_family": "LeroyDyer", - "model_family_label": "LeroyDyer (n=360)", - "present_rate": 1.0, - "result_rows": 360 - }, - { - "benchmark": "LeroyDyer", - "benchmark_label": "LeroyDyer (n=360)", - "field": 
"source_organization_url", - "field_label": "source org URL", - "missing_rate": 1.0, - "model_family": "LeroyDyer", - "model_family_label": "LeroyDyer (n=360)", - "present_rate": 0.0, - "result_rows": 360 - }, - { - "benchmark": "LeroyDyer", - "benchmark_label": "LeroyDyer (n=360)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "model_family": "LeroyDyer", - "model_family_label": "LeroyDyer (n=360)", - "present_rate": 0.0, - "result_rows": 360 - }, - { - "benchmark": "LeroyDyer", - "benchmark_label": "LeroyDyer (n=360)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "model_family": "LeroyDyer", - "model_family_label": "LeroyDyer (n=360)", - "present_rate": 0.0, - "result_rows": 360 - }, - { - "benchmark": "LeroyDyer", - "benchmark_label": "LeroyDyer (n=360)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "model_family": "LeroyDyer", - "model_family_label": "LeroyDyer (n=360)", - "present_rate": 0.0, - "result_rows": 360 - }, - { - "benchmark": "LeroyDyer", - "benchmark_label": "LeroyDyer (n=360)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "model_family": "LeroyDyer", - "model_family_label": "LeroyDyer (n=360)", - "present_rate": 0.0, - "result_rows": 360 - }, - { - "benchmark": "LeroyDyer", - "benchmark_label": "LeroyDyer (n=360)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "model_family": "LeroyDyer", - "model_family_label": "LeroyDyer (n=360)", - "present_rate": 0.0, - "result_rows": 360 - }, - { - "benchmark": "LeroyDyer", - "benchmark_label": "LeroyDyer (n=360)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "model_family": "LeroyDyer", - "model_family_label": "LeroyDyer (n=360)", - "present_rate": 0.0, - "result_rows": 360 - }, - { - "benchmark": "LeroyDyer", - "benchmark_label": "LeroyDyer (n=360)", - 
"field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "model_family": "LeroyDyer", - "model_family_label": "LeroyDyer (n=360)", - "present_rate": 0.0, - "result_rows": 360 - }, - { - "benchmark": "LeroyDyer", - "benchmark_label": "LeroyDyer (n=360)", - "field": "model_parameters", - "field_label": "model parameters", - "missing_rate": 0.0, - "model_family": "LeroyDyer", - "model_family_label": "LeroyDyer (n=360)", - "present_rate": 1.0, - "result_rows": 360 - }, - { - "benchmark": "Quazim0t0", - "benchmark_label": "Quazim0t0 (n=426)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 1.0, - "model_family": "Quazim0t0", - "model_family_label": "Quazim0t0 (n=426)", - "present_rate": 0.0, - "result_rows": 426 - }, - { - "benchmark": "Quazim0t0", - "benchmark_label": "Quazim0t0 (n=426)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "model_family": "Quazim0t0", - "model_family_label": "Quazim0t0 (n=426)", - "present_rate": 0.0, - "result_rows": 426 - }, - { - "benchmark": "Quazim0t0", - "benchmark_label": "Quazim0t0 (n=426)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "model_family": "Quazim0t0", - "model_family_label": "Quazim0t0 (n=426)", - "present_rate": 0.0, - "result_rows": 426 - }, - { - "benchmark": "Quazim0t0", - "benchmark_label": "Quazim0t0 (n=426)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "model_family": "Quazim0t0", - "model_family_label": "Quazim0t0 (n=426)", - "present_rate": 1.0, - "result_rows": 426 - }, - { - "benchmark": "Quazim0t0", - "benchmark_label": "Quazim0t0 (n=426)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 1.0, - "model_family": "Quazim0t0", - "model_family_label": "Quazim0t0 (n=426)", - "present_rate": 0.0, - "result_rows": 426 - }, - { - "benchmark": 
"Quazim0t0", - "benchmark_label": "Quazim0t0 (n=426)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "model_family": "Quazim0t0", - "model_family_label": "Quazim0t0 (n=426)", - "present_rate": 0.0, - "result_rows": 426 - }, - { - "benchmark": "Quazim0t0", - "benchmark_label": "Quazim0t0 (n=426)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "model_family": "Quazim0t0", - "model_family_label": "Quazim0t0 (n=426)", - "present_rate": 0.0, - "result_rows": 426 - }, - { - "benchmark": "Quazim0t0", - "benchmark_label": "Quazim0t0 (n=426)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "model_family": "Quazim0t0", - "model_family_label": "Quazim0t0 (n=426)", - "present_rate": 0.0, - "result_rows": 426 - }, - { - "benchmark": "Quazim0t0", - "benchmark_label": "Quazim0t0 (n=426)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "model_family": "Quazim0t0", - "model_family_label": "Quazim0t0 (n=426)", - "present_rate": 0.0, - "result_rows": 426 - }, - { - "benchmark": "Quazim0t0", - "benchmark_label": "Quazim0t0 (n=426)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "model_family": "Quazim0t0", - "model_family_label": "Quazim0t0 (n=426)", - "present_rate": 0.0, - "result_rows": 426 - }, - { - "benchmark": "Quazim0t0", - "benchmark_label": "Quazim0t0 (n=426)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "model_family": "Quazim0t0", - "model_family_label": "Quazim0t0 (n=426)", - "present_rate": 0.0, - "result_rows": 426 - }, - { - "benchmark": "Quazim0t0", - "benchmark_label": "Quazim0t0 (n=426)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "model_family": "Quazim0t0", - "model_family_label": "Quazim0t0 (n=426)", - "present_rate": 0.0, - "result_rows": 426 - }, - { - "benchmark": 
"Quazim0t0", - "benchmark_label": "Quazim0t0 (n=426)", - "field": "model_parameters", - "field_label": "model parameters", - "missing_rate": 0.0, - "model_family": "Quazim0t0", - "model_family_label": "Quazim0t0 (n=426)", - "present_rate": 1.0, - "result_rows": 426 - }, - { - "benchmark": "Sakalti", - "benchmark_label": "Sakalti (n=396)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 1.0, - "model_family": "Sakalti", - "model_family_label": "Sakalti (n=396)", - "present_rate": 0.0, - "result_rows": 396 - }, - { - "benchmark": "Sakalti", - "benchmark_label": "Sakalti (n=396)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "model_family": "Sakalti", - "model_family_label": "Sakalti (n=396)", - "present_rate": 0.0, - "result_rows": 396 - }, - { - "benchmark": "Sakalti", - "benchmark_label": "Sakalti (n=396)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "model_family": "Sakalti", - "model_family_label": "Sakalti (n=396)", - "present_rate": 0.0, - "result_rows": 396 - }, - { - "benchmark": "Sakalti", - "benchmark_label": "Sakalti (n=396)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "model_family": "Sakalti", - "model_family_label": "Sakalti (n=396)", - "present_rate": 1.0, - "result_rows": 396 - }, - { - "benchmark": "Sakalti", - "benchmark_label": "Sakalti (n=396)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 1.0, - "model_family": "Sakalti", - "model_family_label": "Sakalti (n=396)", - "present_rate": 0.0, - "result_rows": 396 - }, - { - "benchmark": "Sakalti", - "benchmark_label": "Sakalti (n=396)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "model_family": "Sakalti", - "model_family_label": "Sakalti (n=396)", - "present_rate": 0.0, - "result_rows": 396 - }, - { - "benchmark": 
"Sakalti", - "benchmark_label": "Sakalti (n=396)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "model_family": "Sakalti", - "model_family_label": "Sakalti (n=396)", - "present_rate": 0.0, - "result_rows": 396 - }, - { - "benchmark": "Sakalti", - "benchmark_label": "Sakalti (n=396)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "model_family": "Sakalti", - "model_family_label": "Sakalti (n=396)", - "present_rate": 0.0, - "result_rows": 396 - }, - { - "benchmark": "Sakalti", - "benchmark_label": "Sakalti (n=396)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "model_family": "Sakalti", - "model_family_label": "Sakalti (n=396)", - "present_rate": 0.0, - "result_rows": 396 - }, - { - "benchmark": "Sakalti", - "benchmark_label": "Sakalti (n=396)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "model_family": "Sakalti", - "model_family_label": "Sakalti (n=396)", - "present_rate": 0.0, - "result_rows": 396 - }, - { - "benchmark": "Sakalti", - "benchmark_label": "Sakalti (n=396)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "model_family": "Sakalti", - "model_family_label": "Sakalti (n=396)", - "present_rate": 0.0, - "result_rows": 396 - }, - { - "benchmark": "Sakalti", - "benchmark_label": "Sakalti (n=396)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "model_family": "Sakalti", - "model_family_label": "Sakalti (n=396)", - "present_rate": 0.0, - "result_rows": 396 - }, - { - "benchmark": "Sakalti", - "benchmark_label": "Sakalti (n=396)", - "field": "model_parameters", - "field_label": "model parameters", - "missing_rate": 0.0, - "model_family": "Sakalti", - "model_family_label": "Sakalti (n=396)", - "present_rate": 1.0, - "result_rows": 396 - }, - { - "benchmark": "Triangle104", - "benchmark_label": "Triangle104 
(n=366)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 1.0, - "model_family": "Triangle104", - "model_family_label": "Triangle104 (n=366)", - "present_rate": 0.0, - "result_rows": 366 - }, - { - "benchmark": "Triangle104", - "benchmark_label": "Triangle104 (n=366)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "model_family": "Triangle104", - "model_family_label": "Triangle104 (n=366)", - "present_rate": 0.0, - "result_rows": 366 - }, - { - "benchmark": "Triangle104", - "benchmark_label": "Triangle104 (n=366)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "model_family": "Triangle104", - "model_family_label": "Triangle104 (n=366)", - "present_rate": 0.0, - "result_rows": 366 - }, - { - "benchmark": "Triangle104", - "benchmark_label": "Triangle104 (n=366)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "model_family": "Triangle104", - "model_family_label": "Triangle104 (n=366)", - "present_rate": 1.0, - "result_rows": 366 - }, - { - "benchmark": "Triangle104", - "benchmark_label": "Triangle104 (n=366)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 1.0, - "model_family": "Triangle104", - "model_family_label": "Triangle104 (n=366)", - "present_rate": 0.0, - "result_rows": 366 - }, - { - "benchmark": "Triangle104", - "benchmark_label": "Triangle104 (n=366)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "model_family": "Triangle104", - "model_family_label": "Triangle104 (n=366)", - "present_rate": 0.0, - "result_rows": 366 - }, - { - "benchmark": "Triangle104", - "benchmark_label": "Triangle104 (n=366)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "model_family": "Triangle104", - "model_family_label": "Triangle104 (n=366)", - "present_rate": 0.0, - 
"result_rows": 366 - }, - { - "benchmark": "Triangle104", - "benchmark_label": "Triangle104 (n=366)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "model_family": "Triangle104", - "model_family_label": "Triangle104 (n=366)", - "present_rate": 0.0, - "result_rows": 366 - }, - { - "benchmark": "Triangle104", - "benchmark_label": "Triangle104 (n=366)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "model_family": "Triangle104", - "model_family_label": "Triangle104 (n=366)", - "present_rate": 0.0, - "result_rows": 366 - }, - { - "benchmark": "Triangle104", - "benchmark_label": "Triangle104 (n=366)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "model_family": "Triangle104", - "model_family_label": "Triangle104 (n=366)", - "present_rate": 0.0, - "result_rows": 366 - }, - { - "benchmark": "Triangle104", - "benchmark_label": "Triangle104 (n=366)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "model_family": "Triangle104", - "model_family_label": "Triangle104 (n=366)", - "present_rate": 0.0, - "result_rows": 366 - }, - { - "benchmark": "Triangle104", - "benchmark_label": "Triangle104 (n=366)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "model_family": "Triangle104", - "model_family_label": "Triangle104 (n=366)", - "present_rate": 0.0, - "result_rows": 366 - }, - { - "benchmark": "Triangle104", - "benchmark_label": "Triangle104 (n=366)", - "field": "model_parameters", - "field_label": "model parameters", - "missing_rate": 0.0, - "model_family": "Triangle104", - "model_family_label": "Triangle104 (n=366)", - "present_rate": 1.0, - "result_rows": 366 - }, - { - "benchmark": "allknowingroger", - "benchmark_label": "allknowingroger (n=528)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 1.0, - 
"model_family": "allknowingroger", - "model_family_label": "allknowingroger (n=528)", - "present_rate": 0.0, - "result_rows": 528 - }, - { - "benchmark": "allknowingroger", - "benchmark_label": "allknowingroger (n=528)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "model_family": "allknowingroger", - "model_family_label": "allknowingroger (n=528)", - "present_rate": 0.0, - "result_rows": 528 - }, - { - "benchmark": "allknowingroger", - "benchmark_label": "allknowingroger (n=528)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "model_family": "allknowingroger", - "model_family_label": "allknowingroger (n=528)", - "present_rate": 0.0, - "result_rows": 528 - }, - { - "benchmark": "allknowingroger", - "benchmark_label": "allknowingroger (n=528)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "model_family": "allknowingroger", - "model_family_label": "allknowingroger (n=528)", - "present_rate": 1.0, - "result_rows": 528 - }, - { - "benchmark": "allknowingroger", - "benchmark_label": "allknowingroger (n=528)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 1.0, - "model_family": "allknowingroger", - "model_family_label": "allknowingroger (n=528)", - "present_rate": 0.0, - "result_rows": 528 - }, - { - "benchmark": "allknowingroger", - "benchmark_label": "allknowingroger (n=528)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "model_family": "allknowingroger", - "model_family_label": "allknowingroger (n=528)", - "present_rate": 0.0, - "result_rows": 528 - }, - { - "benchmark": "allknowingroger", - "benchmark_label": "allknowingroger (n=528)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "model_family": "allknowingroger", - "model_family_label": "allknowingroger (n=528)", - "present_rate": 0.0, - "result_rows": 
528 - }, - { - "benchmark": "allknowingroger", - "benchmark_label": "allknowingroger (n=528)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "model_family": "allknowingroger", - "model_family_label": "allknowingroger (n=528)", - "present_rate": 0.0, - "result_rows": 528 - }, - { - "benchmark": "allknowingroger", - "benchmark_label": "allknowingroger (n=528)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "model_family": "allknowingroger", - "model_family_label": "allknowingroger (n=528)", - "present_rate": 0.0, - "result_rows": 528 - }, - { - "benchmark": "allknowingroger", - "benchmark_label": "allknowingroger (n=528)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "model_family": "allknowingroger", - "model_family_label": "allknowingroger (n=528)", - "present_rate": 0.0, - "result_rows": 528 - }, - { - "benchmark": "allknowingroger", - "benchmark_label": "allknowingroger (n=528)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "model_family": "allknowingroger", - "model_family_label": "allknowingroger (n=528)", - "present_rate": 0.0, - "result_rows": 528 - }, - { - "benchmark": "allknowingroger", - "benchmark_label": "allknowingroger (n=528)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "model_family": "allknowingroger", - "model_family_label": "allknowingroger (n=528)", - "present_rate": 0.0, - "result_rows": 528 - }, - { - "benchmark": "allknowingroger", - "benchmark_label": "allknowingroger (n=528)", - "field": "model_parameters", - "field_label": "model parameters", - "missing_rate": 0.0, - "model_family": "allknowingroger", - "model_family_label": "allknowingroger (n=528)", - "present_rate": 1.0, - "result_rows": 528 - }, - { - "benchmark": "bunnycore", - "benchmark_label": "bunnycore (n=516)", - "field": "generation_config_present", - 
"field_label": "generation config", - "missing_rate": 1.0, - "model_family": "bunnycore", - "model_family_label": "bunnycore (n=516)", - "present_rate": 0.0, - "result_rows": 516 - }, - { - "benchmark": "bunnycore", - "benchmark_label": "bunnycore (n=516)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "model_family": "bunnycore", - "model_family_label": "bunnycore (n=516)", - "present_rate": 0.0, - "result_rows": 516 - }, - { - "benchmark": "bunnycore", - "benchmark_label": "bunnycore (n=516)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "model_family": "bunnycore", - "model_family_label": "bunnycore (n=516)", - "present_rate": 0.0, - "result_rows": 516 - }, - { - "benchmark": "bunnycore", - "benchmark_label": "bunnycore (n=516)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "model_family": "bunnycore", - "model_family_label": "bunnycore (n=516)", - "present_rate": 1.0, - "result_rows": 516 - }, - { - "benchmark": "bunnycore", - "benchmark_label": "bunnycore (n=516)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 1.0, - "model_family": "bunnycore", - "model_family_label": "bunnycore (n=516)", - "present_rate": 0.0, - "result_rows": 516 - }, - { - "benchmark": "bunnycore", - "benchmark_label": "bunnycore (n=516)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "model_family": "bunnycore", - "model_family_label": "bunnycore (n=516)", - "present_rate": 0.0, - "result_rows": 516 - }, - { - "benchmark": "bunnycore", - "benchmark_label": "bunnycore (n=516)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "model_family": "bunnycore", - "model_family_label": "bunnycore (n=516)", - "present_rate": 0.0, - "result_rows": 516 - }, - { - "benchmark": "bunnycore", - "benchmark_label": "bunnycore (n=516)", - "field": 
"metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "model_family": "bunnycore", - "model_family_label": "bunnycore (n=516)", - "present_rate": 0.0, - "result_rows": 516 - }, - { - "benchmark": "bunnycore", - "benchmark_label": "bunnycore (n=516)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "model_family": "bunnycore", - "model_family_label": "bunnycore (n=516)", - "present_rate": 0.0, - "result_rows": 516 - }, - { - "benchmark": "bunnycore", - "benchmark_label": "bunnycore (n=516)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "model_family": "bunnycore", - "model_family_label": "bunnycore (n=516)", - "present_rate": 0.0, - "result_rows": 516 - }, - { - "benchmark": "bunnycore", - "benchmark_label": "bunnycore (n=516)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "model_family": "bunnycore", - "model_family_label": "bunnycore (n=516)", - "present_rate": 0.0, - "result_rows": 516 - }, - { - "benchmark": "bunnycore", - "benchmark_label": "bunnycore (n=516)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "model_family": "bunnycore", - "model_family_label": "bunnycore (n=516)", - "present_rate": 0.0, - "result_rows": 516 - }, - { - "benchmark": "bunnycore", - "benchmark_label": "bunnycore (n=516)", - "field": "model_parameters", - "field_label": "model parameters", - "missing_rate": 0.0, - "model_family": "bunnycore", - "model_family_label": "bunnycore (n=516)", - "present_rate": 1.0, - "result_rows": 516 - }, - { - "benchmark": "icefog72", - "benchmark_label": "icefog72 (n=372)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 1.0, - "model_family": "icefog72", - "model_family_label": "icefog72 (n=372)", - "present_rate": 0.0, - "result_rows": 372 - }, - { - "benchmark": "icefog72", - "benchmark_label": "icefog72 
(n=372)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "model_family": "icefog72", - "model_family_label": "icefog72 (n=372)", - "present_rate": 0.0, - "result_rows": 372 - }, - { - "benchmark": "icefog72", - "benchmark_label": "icefog72 (n=372)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "model_family": "icefog72", - "model_family_label": "icefog72 (n=372)", - "present_rate": 0.0, - "result_rows": 372 - }, - { - "benchmark": "icefog72", - "benchmark_label": "icefog72 (n=372)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "model_family": "icefog72", - "model_family_label": "icefog72 (n=372)", - "present_rate": 1.0, - "result_rows": 372 - }, - { - "benchmark": "icefog72", - "benchmark_label": "icefog72 (n=372)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 1.0, - "model_family": "icefog72", - "model_family_label": "icefog72 (n=372)", - "present_rate": 0.0, - "result_rows": 372 - }, - { - "benchmark": "icefog72", - "benchmark_label": "icefog72 (n=372)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "model_family": "icefog72", - "model_family_label": "icefog72 (n=372)", - "present_rate": 0.0, - "result_rows": 372 - }, - { - "benchmark": "icefog72", - "benchmark_label": "icefog72 (n=372)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "model_family": "icefog72", - "model_family_label": "icefog72 (n=372)", - "present_rate": 0.0, - "result_rows": 372 - }, - { - "benchmark": "icefog72", - "benchmark_label": "icefog72 (n=372)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "model_family": "icefog72", - "model_family_label": "icefog72 (n=372)", - "present_rate": 0.0, - "result_rows": 372 - }, - { - "benchmark": "icefog72", - "benchmark_label": "icefog72 (n=372)", - 
"field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "model_family": "icefog72", - "model_family_label": "icefog72 (n=372)", - "present_rate": 0.0, - "result_rows": 372 - }, - { - "benchmark": "icefog72", - "benchmark_label": "icefog72 (n=372)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "model_family": "icefog72", - "model_family_label": "icefog72 (n=372)", - "present_rate": 0.0, - "result_rows": 372 - }, - { - "benchmark": "icefog72", - "benchmark_label": "icefog72 (n=372)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "model_family": "icefog72", - "model_family_label": "icefog72 (n=372)", - "present_rate": 0.0, - "result_rows": 372 - }, - { - "benchmark": "icefog72", - "benchmark_label": "icefog72 (n=372)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "model_family": "icefog72", - "model_family_label": "icefog72 (n=372)", - "present_rate": 0.0, - "result_rows": 372 - }, - { - "benchmark": "icefog72", - "benchmark_label": "icefog72 (n=372)", - "field": "model_parameters", - "field_label": "model parameters", - "missing_rate": 0.0, - "model_family": "icefog72", - "model_family_label": "icefog72 (n=372)", - "present_rate": 1.0, - "result_rows": 372 - }, - { - "benchmark": "jaspionjader", - "benchmark_label": "jaspionjader (n=1,182)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 1.0, - "model_family": "jaspionjader", - "model_family_label": "jaspionjader (n=1,182)", - "present_rate": 0.0, - "result_rows": 1182 - }, - { - "benchmark": "jaspionjader", - "benchmark_label": "jaspionjader (n=1,182)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "model_family": "jaspionjader", - "model_family_label": "jaspionjader (n=1,182)", - "present_rate": 0.0, - "result_rows": 1182 - }, - { - "benchmark": 
"jaspionjader", - "benchmark_label": "jaspionjader (n=1,182)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "model_family": "jaspionjader", - "model_family_label": "jaspionjader (n=1,182)", - "present_rate": 0.0, - "result_rows": 1182 - }, - { - "benchmark": "jaspionjader", - "benchmark_label": "jaspionjader (n=1,182)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "model_family": "jaspionjader", - "model_family_label": "jaspionjader (n=1,182)", - "present_rate": 1.0, - "result_rows": 1182 - }, - { - "benchmark": "jaspionjader", - "benchmark_label": "jaspionjader (n=1,182)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 1.0, - "model_family": "jaspionjader", - "model_family_label": "jaspionjader (n=1,182)", - "present_rate": 0.0, - "result_rows": 1182 - }, - { - "benchmark": "jaspionjader", - "benchmark_label": "jaspionjader (n=1,182)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "model_family": "jaspionjader", - "model_family_label": "jaspionjader (n=1,182)", - "present_rate": 0.0, - "result_rows": 1182 - }, - { - "benchmark": "jaspionjader", - "benchmark_label": "jaspionjader (n=1,182)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "model_family": "jaspionjader", - "model_family_label": "jaspionjader (n=1,182)", - "present_rate": 0.0, - "result_rows": 1182 - }, - { - "benchmark": "jaspionjader", - "benchmark_label": "jaspionjader (n=1,182)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "model_family": "jaspionjader", - "model_family_label": "jaspionjader (n=1,182)", - "present_rate": 0.0, - "result_rows": 1182 - }, - { - "benchmark": "jaspionjader", - "benchmark_label": "jaspionjader (n=1,182)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "model_family": 
"jaspionjader", - "model_family_label": "jaspionjader (n=1,182)", - "present_rate": 0.0, - "result_rows": 1182 - }, - { - "benchmark": "jaspionjader", - "benchmark_label": "jaspionjader (n=1,182)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "model_family": "jaspionjader", - "model_family_label": "jaspionjader (n=1,182)", - "present_rate": 0.0, - "result_rows": 1182 - }, - { - "benchmark": "jaspionjader", - "benchmark_label": "jaspionjader (n=1,182)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "model_family": "jaspionjader", - "model_family_label": "jaspionjader (n=1,182)", - "present_rate": 0.0, - "result_rows": 1182 - }, - { - "benchmark": "jaspionjader", - "benchmark_label": "jaspionjader (n=1,182)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "model_family": "jaspionjader", - "model_family_label": "jaspionjader (n=1,182)", - "present_rate": 0.0, - "result_rows": 1182 - }, - { - "benchmark": "jaspionjader", - "benchmark_label": "jaspionjader (n=1,182)", - "field": "model_parameters", - "field_label": "model parameters", - "missing_rate": 0.0, - "model_family": "jaspionjader", - "model_family_label": "jaspionjader (n=1,182)", - "present_rate": 1.0, - "result_rows": 1182 - }, - { - "benchmark": "prithivMLmods", - "benchmark_label": "prithivMLmods (n=666)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 1.0, - "model_family": "prithivMLmods", - "model_family_label": "prithivMLmods (n=666)", - "present_rate": 0.0, - "result_rows": 666 - }, - { - "benchmark": "prithivMLmods", - "benchmark_label": "prithivMLmods (n=666)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "model_family": "prithivMLmods", - "model_family_label": "prithivMLmods (n=666)", - "present_rate": 0.0, - "result_rows": 666 - }, - { - "benchmark": 
"prithivMLmods", - "benchmark_label": "prithivMLmods (n=666)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "model_family": "prithivMLmods", - "model_family_label": "prithivMLmods (n=666)", - "present_rate": 0.0, - "result_rows": 666 - }, - { - "benchmark": "prithivMLmods", - "benchmark_label": "prithivMLmods (n=666)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "model_family": "prithivMLmods", - "model_family_label": "prithivMLmods (n=666)", - "present_rate": 1.0, - "result_rows": 666 - }, - { - "benchmark": "prithivMLmods", - "benchmark_label": "prithivMLmods (n=666)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 1.0, - "model_family": "prithivMLmods", - "model_family_label": "prithivMLmods (n=666)", - "present_rate": 0.0, - "result_rows": 666 - }, - { - "benchmark": "prithivMLmods", - "benchmark_label": "prithivMLmods (n=666)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "model_family": "prithivMLmods", - "model_family_label": "prithivMLmods (n=666)", - "present_rate": 0.0, - "result_rows": 666 - }, - { - "benchmark": "prithivMLmods", - "benchmark_label": "prithivMLmods (n=666)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "model_family": "prithivMLmods", - "model_family_label": "prithivMLmods (n=666)", - "present_rate": 0.0, - "result_rows": 666 - }, - { - "benchmark": "prithivMLmods", - "benchmark_label": "prithivMLmods (n=666)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "model_family": "prithivMLmods", - "model_family_label": "prithivMLmods (n=666)", - "present_rate": 0.0, - "result_rows": 666 - }, - { - "benchmark": "prithivMLmods", - "benchmark_label": "prithivMLmods (n=666)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "model_family": "prithivMLmods", 
- "model_family_label": "prithivMLmods (n=666)", - "present_rate": 0.0, - "result_rows": 666 - }, - { - "benchmark": "prithivMLmods", - "benchmark_label": "prithivMLmods (n=666)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "model_family": "prithivMLmods", - "model_family_label": "prithivMLmods (n=666)", - "present_rate": 0.0, - "result_rows": 666 - }, - { - "benchmark": "prithivMLmods", - "benchmark_label": "prithivMLmods (n=666)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "model_family": "prithivMLmods", - "model_family_label": "prithivMLmods (n=666)", - "present_rate": 0.0, - "result_rows": 666 - }, - { - "benchmark": "prithivMLmods", - "benchmark_label": "prithivMLmods (n=666)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "model_family": "prithivMLmods", - "model_family_label": "prithivMLmods (n=666)", - "present_rate": 0.0, - "result_rows": 666 - }, - { - "benchmark": "prithivMLmods", - "benchmark_label": "prithivMLmods (n=666)", - "field": "model_parameters", - "field_label": "model parameters", - "missing_rate": 0.0, - "model_family": "prithivMLmods", - "model_family_label": "prithivMLmods (n=666)", - "present_rate": 1.0, - "result_rows": 666 - }, - { - "benchmark": "zelk12", - "benchmark_label": "zelk12 (n=468)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 1.0, - "model_family": "zelk12", - "model_family_label": "zelk12 (n=468)", - "present_rate": 0.0, - "result_rows": 468 - }, - { - "benchmark": "zelk12", - "benchmark_label": "zelk12 (n=468)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "model_family": "zelk12", - "model_family_label": "zelk12 (n=468)", - "present_rate": 0.0, - "result_rows": 468 - }, - { - "benchmark": "zelk12", - "benchmark_label": "zelk12 (n=468)", - "field": "generation_temperature", - 
"field_label": "temperature", - "missing_rate": 1.0, - "model_family": "zelk12", - "model_family_label": "zelk12 (n=468)", - "present_rate": 0.0, - "result_rows": 468 - }, - { - "benchmark": "zelk12", - "benchmark_label": "zelk12 (n=468)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.0, - "model_family": "zelk12", - "model_family_label": "zelk12 (n=468)", - "present_rate": 1.0, - "result_rows": 468 - }, - { - "benchmark": "zelk12", - "benchmark_label": "zelk12 (n=468)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 1.0, - "model_family": "zelk12", - "model_family_label": "zelk12 (n=468)", - "present_rate": 0.0, - "result_rows": 468 - }, - { - "benchmark": "zelk12", - "benchmark_label": "zelk12 (n=468)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "model_family": "zelk12", - "model_family_label": "zelk12 (n=468)", - "present_rate": 0.0, - "result_rows": 468 - }, - { - "benchmark": "zelk12", - "benchmark_label": "zelk12 (n=468)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "model_family": "zelk12", - "model_family_label": "zelk12 (n=468)", - "present_rate": 0.0, - "result_rows": 468 - }, - { - "benchmark": "zelk12", - "benchmark_label": "zelk12 (n=468)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "model_family": "zelk12", - "model_family_label": "zelk12 (n=468)", - "present_rate": 0.0, - "result_rows": 468 - }, - { - "benchmark": "zelk12", - "benchmark_label": "zelk12 (n=468)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "model_family": "zelk12", - "model_family_label": "zelk12 (n=468)", - "present_rate": 0.0, - "result_rows": 468 - }, - { - "benchmark": "zelk12", - "benchmark_label": "zelk12 (n=468)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "model_family": 
"zelk12", - "model_family_label": "zelk12 (n=468)", - "present_rate": 0.0, - "result_rows": 468 - }, - { - "benchmark": "zelk12", - "benchmark_label": "zelk12 (n=468)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "model_family": "zelk12", - "model_family_label": "zelk12 (n=468)", - "present_rate": 0.0, - "result_rows": 468 - }, - { - "benchmark": "zelk12", - "benchmark_label": "zelk12 (n=468)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "model_family": "zelk12", - "model_family_label": "zelk12 (n=468)", - "present_rate": 0.0, - "result_rows": 468 - }, - { - "benchmark": "zelk12", - "benchmark_label": "zelk12 (n=468)", - "field": "model_parameters", - "field_label": "model parameters", - "missing_rate": 0.0, - "model_family": "zelk12", - "model_family_label": "zelk12 (n=468)", - "present_rate": 1.0, - "result_rows": 468 - }, - { - "benchmark": "meta", - "benchmark_label": "meta (n=750)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 0.272, - "model_family": "meta", - "model_family_label": "meta (n=750)", - "present_rate": 0.728, - "result_rows": 750 - }, - { - "benchmark": "meta", - "benchmark_label": "meta (n=750)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "model_family": "meta", - "model_family_label": "meta (n=750)", - "present_rate": 0.0, - "result_rows": 750 - }, - { - "benchmark": "meta", - "benchmark_label": "meta (n=750)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "model_family": "meta", - "model_family_label": "meta (n=750)", - "present_rate": 0.0, - "result_rows": 750 - }, - { - "benchmark": "meta", - "benchmark_label": "meta (n=750)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.272, - "model_family": "meta", - "model_family_label": "meta (n=750)", - 
"present_rate": 0.728, - "result_rows": 750 - }, - { - "benchmark": "meta", - "benchmark_label": "meta (n=750)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 0.728, - "model_family": "meta", - "model_family_label": "meta (n=750)", - "present_rate": 0.272, - "result_rows": 750 - }, - { - "benchmark": "meta", - "benchmark_label": "meta (n=750)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 0.728, - "model_family": "meta", - "model_family_label": "meta (n=750)", - "present_rate": 0.272, - "result_rows": 750 - }, - { - "benchmark": "meta", - "benchmark_label": "meta (n=750)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 0.728, - "model_family": "meta", - "model_family_label": "meta (n=750)", - "present_rate": 0.272, - "result_rows": 750 - }, - { - "benchmark": "meta", - "benchmark_label": "meta (n=750)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 0.728, - "model_family": "meta", - "model_family_label": "meta (n=750)", - "present_rate": 0.272, - "result_rows": 750 - }, - { - "benchmark": "meta", - "benchmark_label": "meta (n=750)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "model_family": "meta", - "model_family_label": "meta (n=750)", - "present_rate": 0.0, - "result_rows": 750 - }, - { - "benchmark": "meta", - "benchmark_label": "meta (n=750)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "model_family": "meta", - "model_family_label": "meta (n=750)", - "present_rate": 0.0, - "result_rows": 750 - }, - { - "benchmark": "meta", - "benchmark_label": "meta (n=750)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "model_family": "meta", - "model_family_label": "meta (n=750)", - "present_rate": 0.0, - "result_rows": 750 - }, - { - "benchmark": "meta", - "benchmark_label": "meta (n=750)", - 
"field": "model_license", - "field_label": "model license", - "missing_rate": 0.7573333333333333, - "model_family": "meta", - "model_family_label": "meta (n=750)", - "present_rate": 0.24266666666666667, - "result_rows": 750 - }, - { - "benchmark": "meta", - "benchmark_label": "meta (n=750)", - "field": "model_parameters", - "field_label": "model parameters", - "missing_rate": 1.0, - "model_family": "meta", - "model_family_label": "meta (n=750)", - "present_rate": 0.0, - "result_rows": 750 - }, - { - "benchmark": "google", - "benchmark_label": "google (n=1,597)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 0.593613024420789, - "model_family": "google", - "model_family_label": "google (n=1,597)", - "present_rate": 0.406386975579211, - "result_rows": 1597 - }, - { - "benchmark": "google", - "benchmark_label": "google (n=1,597)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "model_family": "google", - "model_family_label": "google (n=1,597)", - "present_rate": 0.0, - "result_rows": 1597 - }, - { - "benchmark": "google", - "benchmark_label": "google (n=1,597)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "model_family": "google", - "model_family_label": "google (n=1,597)", - "present_rate": 0.0, - "result_rows": 1597 - }, - { - "benchmark": "google", - "benchmark_label": "google (n=1,597)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.3268628678772699, - "model_family": "google", - "model_family_label": "google (n=1,597)", - "present_rate": 0.6731371321227301, - "result_rows": 1597 - }, - { - "benchmark": "google", - "benchmark_label": "google (n=1,597)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 0.5122103944896681, - "model_family": "google", - "model_family_label": "google (n=1,597)", - "present_rate": 0.48778960551033185, 
- "result_rows": 1597 - }, - { - "benchmark": "google", - "benchmark_label": "google (n=1,597)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 0.7113337507827175, - "model_family": "google", - "model_family_label": "google (n=1,597)", - "present_rate": 0.2886662492172824, - "result_rows": 1597 - }, - { - "benchmark": "google", - "benchmark_label": "google (n=1,597)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 0.7113337507827175, - "model_family": "google", - "model_family_label": "google (n=1,597)", - "present_rate": 0.2886662492172824, - "result_rows": 1597 - }, - { - "benchmark": "google", - "benchmark_label": "google (n=1,597)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 0.7113337507827175, - "model_family": "google", - "model_family_label": "google (n=1,597)", - "present_rate": 0.2886662492172824, - "result_rows": 1597 - }, - { - "benchmark": "google", - "benchmark_label": "google (n=1,597)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 0.8753913587977458, - "model_family": "google", - "model_family_label": "google (n=1,597)", - "present_rate": 0.12460864120225423, - "result_rows": 1597 - }, - { - "benchmark": "google", - "benchmark_label": "google (n=1,597)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "model_family": "google", - "model_family_label": "google (n=1,597)", - "present_rate": 0.0, - "result_rows": 1597 - }, - { - "benchmark": "google", - "benchmark_label": "google (n=1,597)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "model_family": "google", - "model_family_label": "google (n=1,597)", - "present_rate": 0.0, - "result_rows": 1597 - }, - { - "benchmark": "google", - "benchmark_label": "google (n=1,597)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 0.8033813400125235, - 
"model_family": "google", - "model_family_label": "google (n=1,597)", - "present_rate": 0.19661865998747652, - "result_rows": 1597 - }, - { - "benchmark": "google", - "benchmark_label": "google (n=1,597)", - "field": "model_parameters", - "field_label": "model parameters", - "missing_rate": 0.876017532874139, - "model_family": "google", - "model_family_label": "google (n=1,597)", - "present_rate": 0.12398246712586099, - "result_rows": 1597 - }, - { - "benchmark": "Qwen", - "benchmark_label": "Qwen (n=505)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 0.80990099009901, - "model_family": "Qwen", - "model_family_label": "Qwen (n=505)", - "present_rate": 0.1900990099009901, - "result_rows": 505 - }, - { - "benchmark": "Qwen", - "benchmark_label": "Qwen (n=505)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 0.80990099009901, - "model_family": "Qwen", - "model_family_label": "Qwen (n=505)", - "present_rate": 0.1900990099009901, - "result_rows": 505 - }, - { - "benchmark": "Qwen", - "benchmark_label": "Qwen (n=505)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 0.80990099009901, - "model_family": "Qwen", - "model_family_label": "Qwen (n=505)", - "present_rate": 0.1900990099009901, - "result_rows": 505 - }, - { - "benchmark": "Qwen", - "benchmark_label": "Qwen (n=505)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.097029702970297, - "model_family": "Qwen", - "model_family_label": "Qwen (n=505)", - "present_rate": 0.902970297029703, - "result_rows": 505 - }, - { - "benchmark": "Qwen", - "benchmark_label": "Qwen (n=505)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 0.7128712871287128, - "model_family": "Qwen", - "model_family_label": "Qwen (n=505)", - "present_rate": 0.2871287128712871, - "result_rows": 505 - }, - { - "benchmark": "Qwen", - 
"benchmark_label": "Qwen (n=505)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 1.0, - "model_family": "Qwen", - "model_family_label": "Qwen (n=505)", - "present_rate": 0.0, - "result_rows": 505 - }, - { - "benchmark": "Qwen", - "benchmark_label": "Qwen (n=505)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 1.0, - "model_family": "Qwen", - "model_family_label": "Qwen (n=505)", - "present_rate": 0.0, - "result_rows": 505 - }, - { - "benchmark": "Qwen", - "benchmark_label": "Qwen (n=505)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 1.0, - "model_family": "Qwen", - "model_family_label": "Qwen (n=505)", - "present_rate": 0.0, - "result_rows": 505 - }, - { - "benchmark": "Qwen", - "benchmark_label": "Qwen (n=505)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 0.80990099009901, - "model_family": "Qwen", - "model_family_label": "Qwen (n=505)", - "present_rate": 0.1900990099009901, - "result_rows": 505 - }, - { - "benchmark": "Qwen", - "benchmark_label": "Qwen (n=505)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 0.80990099009901, - "model_family": "Qwen", - "model_family_label": "Qwen (n=505)", - "present_rate": 0.1900990099009901, - "result_rows": 505 - }, - { - "benchmark": "Qwen", - "benchmark_label": "Qwen (n=505)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 0.80990099009901, - "model_family": "Qwen", - "model_family_label": "Qwen (n=505)", - "present_rate": 0.1900990099009901, - "result_rows": 505 - }, - { - "benchmark": "Qwen", - "benchmark_label": "Qwen (n=505)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 1.0, - "model_family": "Qwen", - "model_family_label": "Qwen (n=505)", - "present_rate": 0.0, - "result_rows": 505 - }, - { - "benchmark": "Qwen", - "benchmark_label": "Qwen (n=505)", - "field": 
"model_parameters", - "field_label": "model parameters", - "missing_rate": 0.28712871287128716, - "model_family": "Qwen", - "model_family_label": "Qwen (n=505)", - "present_rate": 0.7128712871287128, - "result_rows": 505 - }, - { - "benchmark": "mistralai", - "benchmark_label": "mistralai (n=908)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 0.5506607929515419, - "model_family": "mistralai", - "model_family_label": "mistralai (n=908)", - "present_rate": 0.44933920704845814, - "result_rows": 908 - }, - { - "benchmark": "mistralai", - "benchmark_label": "mistralai (n=908)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "model_family": "mistralai", - "model_family_label": "mistralai (n=908)", - "present_rate": 0.0, - "result_rows": 908 - }, - { - "benchmark": "mistralai", - "benchmark_label": "mistralai (n=908)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "model_family": "mistralai", - "model_family_label": "mistralai (n=908)", - "present_rate": 0.0, - "result_rows": 908 - }, - { - "benchmark": "mistralai", - "benchmark_label": "mistralai (n=908)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.3480176211453745, - "model_family": "mistralai", - "model_family_label": "mistralai (n=908)", - "present_rate": 0.6519823788546255, - "result_rows": 908 - }, - { - "benchmark": "mistralai", - "benchmark_label": "mistralai (n=908)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 0.5682819383259912, - "model_family": "mistralai", - "model_family_label": "mistralai (n=908)", - "present_rate": 0.43171806167400884, - "result_rows": 908 - }, - { - "benchmark": "mistralai", - "benchmark_label": "mistralai (n=908)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 0.6585903083700441, - "model_family": "mistralai", - 
"model_family_label": "mistralai (n=908)", - "present_rate": 0.34140969162995594, - "result_rows": 908 - }, - { - "benchmark": "mistralai", - "benchmark_label": "mistralai (n=908)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 0.6585903083700441, - "model_family": "mistralai", - "model_family_label": "mistralai (n=908)", - "present_rate": 0.34140969162995594, - "result_rows": 908 - }, - { - "benchmark": "mistralai", - "benchmark_label": "mistralai (n=908)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 0.6585903083700441, - "model_family": "mistralai", - "model_family_label": "mistralai (n=908)", - "present_rate": 0.34140969162995594, - "result_rows": 908 - }, - { - "benchmark": "mistralai", - "benchmark_label": "mistralai (n=908)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 0.9295154185022027, - "model_family": "mistralai", - "model_family_label": "mistralai (n=908)", - "present_rate": 0.07048458149779736, - "result_rows": 908 - }, - { - "benchmark": "mistralai", - "benchmark_label": "mistralai (n=908)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "model_family": "mistralai", - "model_family_label": "mistralai (n=908)", - "present_rate": 0.0, - "result_rows": 908 - }, - { - "benchmark": "mistralai", - "benchmark_label": "mistralai (n=908)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "model_family": "mistralai", - "model_family_label": "mistralai (n=908)", - "present_rate": 0.0, - "result_rows": 908 - }, - { - "benchmark": "mistralai", - "benchmark_label": "mistralai (n=908)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 0.6916299559471366, - "model_family": "mistralai", - "model_family_label": "mistralai (n=908)", - "present_rate": 0.30837004405286345, - "result_rows": 908 - }, - { - "benchmark": "mistralai", - 
"benchmark_label": "mistralai (n=908)", - "field": "model_parameters", - "field_label": "model parameters", - "missing_rate": 0.8810572687224669, - "model_family": "mistralai", - "model_family_label": "mistralai (n=908)", - "present_rate": 0.11894273127753303, - "result_rows": 908 - }, - { - "benchmark": "anthropic", - "benchmark_label": "anthropic (n=1,100)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 0.5881818181818181, - "model_family": "anthropic", - "model_family_label": "anthropic (n=1,100)", - "present_rate": 0.4118181818181818, - "result_rows": 1100 - }, - { - "benchmark": "anthropic", - "benchmark_label": "anthropic (n=1,100)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "model_family": "anthropic", - "model_family_label": "anthropic (n=1,100)", - "present_rate": 0.0, - "result_rows": 1100 - }, - { - "benchmark": "anthropic", - "benchmark_label": "anthropic (n=1,100)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "model_family": "anthropic", - "model_family_label": "anthropic (n=1,100)", - "present_rate": 0.0, - "result_rows": 1100 - }, - { - "benchmark": "anthropic", - "benchmark_label": "anthropic (n=1,100)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.44727272727272727, - "model_family": "anthropic", - "model_family_label": "anthropic (n=1,100)", - "present_rate": 0.5527272727272727, - "result_rows": 1100 - }, - { - "benchmark": "anthropic", - "benchmark_label": "anthropic (n=1,100)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 0.4, - "model_family": "anthropic", - "model_family_label": "anthropic (n=1,100)", - "present_rate": 0.6, - "result_rows": 1100 - }, - { - "benchmark": "anthropic", - "benchmark_label": "anthropic (n=1,100)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 
0.5909090909090908, - "model_family": "anthropic", - "model_family_label": "anthropic (n=1,100)", - "present_rate": 0.4090909090909091, - "result_rows": 1100 - }, - { - "benchmark": "anthropic", - "benchmark_label": "anthropic (n=1,100)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 0.5909090909090908, - "model_family": "anthropic", - "model_family_label": "anthropic (n=1,100)", - "present_rate": 0.4090909090909091, - "result_rows": 1100 - }, - { - "benchmark": "anthropic", - "benchmark_label": "anthropic (n=1,100)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 0.5909090909090908, - "model_family": "anthropic", - "model_family_label": "anthropic (n=1,100)", - "present_rate": 0.4090909090909091, - "result_rows": 1100 - }, - { - "benchmark": "anthropic", - "benchmark_label": "anthropic (n=1,100)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 0.8809090909090909, - "model_family": "anthropic", - "model_family_label": "anthropic (n=1,100)", - "present_rate": 0.1190909090909091, - "result_rows": 1100 - }, - { - "benchmark": "anthropic", - "benchmark_label": "anthropic (n=1,100)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "model_family": "anthropic", - "model_family_label": "anthropic (n=1,100)", - "present_rate": 0.0, - "result_rows": 1100 - }, - { - "benchmark": "anthropic", - "benchmark_label": "anthropic (n=1,100)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "model_family": "anthropic", - "model_family_label": "anthropic (n=1,100)", - "present_rate": 0.0, - "result_rows": 1100 - }, - { - "benchmark": "anthropic", - "benchmark_label": "anthropic (n=1,100)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 0.8309090909090909, - "model_family": "anthropic", - "model_family_label": "anthropic (n=1,100)", - "present_rate": 
0.1690909090909091, - "result_rows": 1100 - }, - { - "benchmark": "anthropic", - "benchmark_label": "anthropic (n=1,100)", - "field": "model_parameters", - "field_label": "model parameters", - "missing_rate": 1.0, - "model_family": "anthropic", - "model_family_label": "anthropic (n=1,100)", - "present_rate": 0.0, - "result_rows": 1100 - }, - { - "benchmark": "openai", - "benchmark_label": "openai (n=1,712)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 0.6962616822429907, - "model_family": "openai", - "model_family_label": "openai (n=1,712)", - "present_rate": 0.3037383177570093, - "result_rows": 1712 - }, - { - "benchmark": "openai", - "benchmark_label": "openai (n=1,712)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "model_family": "openai", - "model_family_label": "openai (n=1,712)", - "present_rate": 0.0, - "result_rows": 1712 - }, - { - "benchmark": "openai", - "benchmark_label": "openai (n=1,712)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "model_family": "openai", - "model_family_label": "openai (n=1,712)", - "present_rate": 0.0, - "result_rows": 1712 - }, - { - "benchmark": "openai", - "benchmark_label": "openai (n=1,712)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.6127336448598131, - "model_family": "openai", - "model_family_label": "openai (n=1,712)", - "present_rate": 0.3872663551401869, - "result_rows": 1712 - }, - { - "benchmark": "openai", - "benchmark_label": "openai (n=1,712)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 0.2762850467289719, - "model_family": "openai", - "model_family_label": "openai (n=1,712)", - "present_rate": 0.7237149532710281, - "result_rows": 1712 - }, - { - "benchmark": "openai", - "benchmark_label": "openai (n=1,712)", - "field": "metric_id", - "field_label": "metric ID", - 
"missing_rate": 0.42757009345794394, - "model_family": "openai", - "model_family_label": "openai (n=1,712)", - "present_rate": 0.572429906542056, - "result_rows": 1712 - }, - { - "benchmark": "openai", - "benchmark_label": "openai (n=1,712)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 0.42757009345794394, - "model_family": "openai", - "model_family_label": "openai (n=1,712)", - "present_rate": 0.572429906542056, - "result_rows": 1712 - }, - { - "benchmark": "openai", - "benchmark_label": "openai (n=1,712)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 0.42757009345794394, - "model_family": "openai", - "model_family_label": "openai (n=1,712)", - "present_rate": 0.572429906542056, - "result_rows": 1712 - }, - { - "benchmark": "openai", - "benchmark_label": "openai (n=1,712)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 0.9258177570093458, - "model_family": "openai", - "model_family_label": "openai (n=1,712)", - "present_rate": 0.0741822429906542, - "result_rows": 1712 - }, - { - "benchmark": "openai", - "benchmark_label": "openai (n=1,712)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "model_family": "openai", - "model_family_label": "openai (n=1,712)", - "present_rate": 0.0, - "result_rows": 1712 - }, - { - "benchmark": "openai", - "benchmark_label": "openai (n=1,712)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "model_family": "openai", - "model_family_label": "openai (n=1,712)", - "present_rate": 0.0, - "result_rows": 1712 - }, - { - "benchmark": "openai", - "benchmark_label": "openai (n=1,712)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 0.7102803738317758, - "model_family": "openai", - "model_family_label": "openai (n=1,712)", - "present_rate": 0.2897196261682243, - "result_rows": 1712 - }, - { - "benchmark": "openai", - 
"benchmark_label": "openai (n=1,712)", - "field": "model_parameters", - "field_label": "model parameters", - "missing_rate": 1.0, - "model_family": "openai", - "model_family_label": "openai (n=1,712)", - "present_rate": 0.0, - "result_rows": 1712 - }, - { - "benchmark": "qwen", - "benchmark_label": "qwen (n=882)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 0.5555555555555556, - "model_family": "qwen", - "model_family_label": "qwen (n=882)", - "present_rate": 0.4444444444444444, - "result_rows": 882 - }, - { - "benchmark": "qwen", - "benchmark_label": "qwen (n=882)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 1.0, - "model_family": "qwen", - "model_family_label": "qwen (n=882)", - "present_rate": 0.0, - "result_rows": 882 - }, - { - "benchmark": "qwen", - "benchmark_label": "qwen (n=882)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 1.0, - "model_family": "qwen", - "model_family_label": "qwen (n=882)", - "present_rate": 0.0, - "result_rows": 882 - }, - { - "benchmark": "qwen", - "benchmark_label": "qwen (n=882)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.5555555555555556, - "model_family": "qwen", - "model_family_label": "qwen (n=882)", - "present_rate": 0.4444444444444444, - "result_rows": 882 - }, - { - "benchmark": "qwen", - "benchmark_label": "qwen (n=882)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 0.4444444444444444, - "model_family": "qwen", - "model_family_label": "qwen (n=882)", - "present_rate": 0.5555555555555556, - "result_rows": 882 - }, - { - "benchmark": "qwen", - "benchmark_label": "qwen (n=882)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 0.4444444444444444, - "model_family": "qwen", - "model_family_label": "qwen (n=882)", - "present_rate": 0.5555555555555556, - "result_rows": 882 - 
}, - { - "benchmark": "qwen", - "benchmark_label": "qwen (n=882)", - "field": "metric_unit", - "field_label": "metric unit", - "missing_rate": 0.4444444444444444, - "model_family": "qwen", - "model_family_label": "qwen (n=882)", - "present_rate": 0.5555555555555556, - "result_rows": 882 - }, - { - "benchmark": "qwen", - "benchmark_label": "qwen (n=882)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 0.4444444444444444, - "model_family": "qwen", - "model_family_label": "qwen (n=882)", - "present_rate": 0.5555555555555556, - "result_rows": 882 - }, - { - "benchmark": "qwen", - "benchmark_label": "qwen (n=882)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 1.0, - "model_family": "qwen", - "model_family_label": "qwen (n=882)", - "present_rate": 0.0, - "result_rows": 882 - }, - { - "benchmark": "qwen", - "benchmark_label": "qwen (n=882)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 1.0, - "model_family": "qwen", - "model_family_label": "qwen (n=882)", - "present_rate": 0.0, - "result_rows": 882 - }, - { - "benchmark": "qwen", - "benchmark_label": "qwen (n=882)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 1.0, - "model_family": "qwen", - "model_family_label": "qwen (n=882)", - "present_rate": 0.0, - "result_rows": 882 - }, - { - "benchmark": "qwen", - "benchmark_label": "qwen (n=882)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 0.4739229024943311, - "model_family": "qwen", - "model_family_label": "qwen (n=882)", - "present_rate": 0.5260770975056689, - "result_rows": 882 - }, - { - "benchmark": "qwen", - "benchmark_label": "qwen (n=882)", - "field": "model_parameters", - "field_label": "model parameters", - "missing_rate": 1.0, - "model_family": "qwen", - "model_family_label": "qwen (n=882)", - "present_rate": 0.0, - "result_rows": 882 - }, - { - "benchmark": "Other", - 
"benchmark_label": "Other (n=24,754)", - "field": "generation_config_present", - "field_label": "generation config", - "missing_rate": 0.9332633109800437, - "model_family": "Other", - "model_family_label": "Other (n=24,754)", - "present_rate": 0.06673668901995637, - "result_rows": 24754 - }, - { - "benchmark": "Other", - "benchmark_label": "Other (n=24,754)", - "field": "generation_max_tokens", - "field_label": "max tokens", - "missing_rate": 0.9843661630443564, - "model_family": "Other", - "model_family_label": "Other (n=24,754)", - "present_rate": 0.01563383695564353, - "result_rows": 24754 - }, - { - "benchmark": "Other", - "benchmark_label": "Other (n=24,754)", - "field": "generation_temperature", - "field_label": "temperature", - "missing_rate": 0.9843661630443564, - "model_family": "Other", - "model_family_label": "Other (n=24,754)", - "present_rate": 0.01563383695564353, - "result_rows": 24754 - }, - { - "benchmark": "Other", - "benchmark_label": "Other (n=24,754)", - "field": "inference_engine", - "field_label": "inference engine/platform", - "missing_rate": 0.11392098246748006, - "model_family": "Other", - "model_family_label": "Other (n=24,754)", - "present_rate": 0.8860790175325199, - "result_rows": 24754 - }, - { - "benchmark": "Other", - "benchmark_label": "Other (n=24,754)", - "field": "source_organization_url", - "field_label": "source org URL", - "missing_rate": 0.8440252080471843, - "model_family": "Other", - "model_family_label": "Other (n=24,754)", - "present_rate": 0.15597479195281572, - "result_rows": 24754 - }, - { - "benchmark": "Other", - "benchmark_label": "Other (n=24,754)", - "field": "metric_id", - "field_label": "metric ID", - "missing_rate": 0.9232851256362609, - "model_family": "Other", - "model_family_label": "Other (n=24,754)", - "present_rate": 0.0767148743637392, - "result_rows": 24754 - }, - { - "benchmark": "Other", - "benchmark_label": "Other (n=24,754)", - "field": "metric_unit", - "field_label": "metric unit", - 
"missing_rate": 0.9232851256362609, - "model_family": "Other", - "model_family_label": "Other (n=24,754)", - "present_rate": 0.0767148743637392, - "result_rows": 24754 - }, - { - "benchmark": "Other", - "benchmark_label": "Other (n=24,754)", - "field": "metric_kind", - "field_label": "metric kind", - "missing_rate": 0.9359295467399208, - "model_family": "Other", - "model_family_label": "Other (n=24,754)", - "present_rate": 0.06407045326007918, - "result_rows": 24754 - }, - { - "benchmark": "Other", - "benchmark_label": "Other (n=24,754)", - "field": "has_uncertainty", - "field_label": "uncertainty", - "missing_rate": 0.9601680536478953, - "model_family": "Other", - "model_family_label": "Other (n=24,754)", - "present_rate": 0.03983194635210471, - "result_rows": 24754 - }, - { - "benchmark": "Other", - "benchmark_label": "Other (n=24,754)", - "field": "detailed_results_file", - "field_label": "detailed results", - "missing_rate": 0.9717217419406965, - "model_family": "Other", - "model_family_label": "Other (n=24,754)", - "present_rate": 0.028278258059303545, - "result_rows": 24754 - }, - { - "benchmark": "Other", - "benchmark_label": "Other (n=24,754)", - "field": "uncertainty_num_samples", - "field_label": "sample count", - "missing_rate": 0.9713985618485901, - "model_family": "Other", - "model_family_label": "Other (n=24,754)", - "present_rate": 0.028601438151409874, - "result_rows": 24754 - }, - { - "benchmark": "Other", - "benchmark_label": "Other (n=24,754)", - "field": "model_license", - "field_label": "model license", - "missing_rate": 0.9423123535590208, - "model_family": "Other", - "model_family_label": "Other (n=24,754)", - "present_rate": 0.057687646440979234, - "result_rows": 24754 - }, - { - "benchmark": "Other", - "benchmark_label": "Other (n=24,754)", - "field": "model_parameters", - "field_label": "model parameters", - "missing_rate": 0.20667366890199568, - "model_family": "Other", - "model_family_label": "Other (n=24,754)", - "present_rate": 
0.7933263310980043, - "result_rows": 24754 - } - ], - "model_families": [ - { - "label": "allenai (n=1,249)", - "model_family": "allenai", - "overall_completeness": 0.09028761470715034, - "result_rows": 1249 - }, - { - "label": "DreadPoor (n=714)", - "model_family": "DreadPoor", - "overall_completeness": 0.15384615384615385, - "result_rows": 714 - }, - { - "label": "JayHyeon (n=1,044)", - "model_family": "JayHyeon", - "overall_completeness": 0.15384615384615385, - "result_rows": 1044 - }, - { - "label": "LeroyDyer (n=360)", - "model_family": "LeroyDyer", - "overall_completeness": 0.15384615384615385, - "result_rows": 360 - }, - { - "label": "Quazim0t0 (n=426)", - "model_family": "Quazim0t0", - "overall_completeness": 0.15384615384615385, - "result_rows": 426 - }, - { - "label": "Sakalti (n=396)", - "model_family": "Sakalti", - "overall_completeness": 0.15384615384615385, - "result_rows": 396 - }, - { - "label": "Triangle104 (n=366)", - "model_family": "Triangle104", - "overall_completeness": 0.15384615384615385, - "result_rows": 366 - }, - { - "label": "allknowingroger (n=528)", - "model_family": "allknowingroger", - "overall_completeness": 0.15384615384615385, - "result_rows": 528 - }, - { - "label": "bunnycore (n=516)", - "model_family": "bunnycore", - "overall_completeness": 0.15384615384615385, - "result_rows": 516 - }, - { - "label": "icefog72 (n=372)", - "model_family": "icefog72", - "overall_completeness": 0.15384615384615385, - "result_rows": 372 - }, - { - "label": "jaspionjader (n=1,182)", - "model_family": "jaspionjader", - "overall_completeness": 0.15384615384615385, - "result_rows": 1182 - }, - { - "label": "prithivMLmods (n=666)", - "model_family": "prithivMLmods", - "overall_completeness": 0.15384615384615385, - "result_rows": 666 - }, - { - "label": "zelk12 (n=468)", - "model_family": "zelk12", - "overall_completeness": 0.15384615384615385, - "result_rows": 468 - }, - { - "label": "meta (n=750)", - "model_family": "meta", - "overall_completeness": 
0.21435897435897436, - "result_rows": 750 - }, - { - "label": "google (n=1,597)", - "model_family": "google", - "overall_completeness": 0.22142478685997785, - "result_rows": 1597 - }, - { - "label": "Qwen (n=505)", - "model_family": "Qwen", - "overall_completeness": 0.23412033511043412, - "result_rows": 505 - }, - { - "label": "mistralai (n=908)", - "model_family": "mistralai", - "overall_completeness": 0.23500508302270418, - "result_rows": 908 - }, - { - "label": "anthropic (n=1,100)", - "model_family": "anthropic", - "overall_completeness": 0.23692307692307693, - "result_rows": 1100 - }, - { - "label": "openai (n=1,712)", - "model_family": "openai", - "overall_completeness": 0.26891624730409774, - "result_rows": 1712 - }, - { - "label": "qwen (n=882)", - "model_family": "qwen", - "overall_completeness": 0.27978370835513694, - "result_rows": 882 - }, - { - "label": "Other (n=24,754)", - "model_family": "Other", - "overall_completeness": 0.17732953803891835, - "result_rows": 24754 - } - ], - "model_family_selection": "Top model families/developers by result-row count, with remaining families aggregated as Other; rows are sorted by overall metadata completeness.", - "other_result_rows": 24754, - "top_model_family_count": 20 - }, - "models_per_benchmark": [ - { - "benchmark": "GPQA", - "result_rows": 4635, - "unique_models": 4557 - }, - { - "benchmark": "IFEval", - "result_rows": 4635, - "unique_models": 4557 - }, - { - "benchmark": "BBH", - "result_rows": 4574, - "unique_models": 4496 - }, - { - "benchmark": "MATH Level 5", - "result_rows": 4574, - "unique_models": 4496 - }, - { - "benchmark": "MMLU-PRO", - "result_rows": 4574, - "unique_models": 4496 - }, - { - "benchmark": "MUSR", - "result_rows": 4574, - "unique_models": 4496 - }, - { - "benchmark": "RewardBench 2", - "result_rows": 1379, - "unique_models": 197 - }, - { - "benchmark": "RewardBench", - "result_rows": 1025, - "unique_models": 179 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - 
"result_rows": 1020, - "unique_models": 139 - }, - { - "benchmark": "BFCL leaderboard CSV", - "result_rows": 3350, - "unique_models": 109 - }, - { - "benchmark": "GSM8K", - "result_rows": 91, - "unique_models": 91 - }, - { - "benchmark": "LegalBench", - "result_rows": 91, - "unique_models": 91 - }, - { - "benchmark": "MATH", - "result_rows": 91, - "unique_models": 91 - }, - { - "benchmark": "MMLU", - "result_rows": 91, - "unique_models": 91 - }, - { - "benchmark": "MedQA", - "result_rows": 91, - "unique_models": 91 - }, - { - "benchmark": "NarrativeQA", - "result_rows": 91, - "unique_models": 91 - }, - { - "benchmark": "NaturalQuestions (closed-book)", - "result_rows": 91, - "unique_models": 91 - }, - { - "benchmark": "OpenbookQA", - "result_rows": 91, - "unique_models": 91 - }, - { - "benchmark": "WMT 2014", - "result_rows": 91, - "unique_models": 91 - }, - { - "benchmark": "helm_lite", - "result_rows": 91, - "unique_models": 91 - }, - { - "benchmark": "helm_mmlu", - "result_rows": 2844, - "unique_models": 79 - }, - { - "benchmark": "MMLU-Pro", - "result_rows": 61, - "unique_models": 61 - }, - { - "benchmark": "Omni-MATH", - "result_rows": 61, - "unique_models": 61 - }, - { - "benchmark": "WildBench", - "result_rows": 61, - "unique_models": 61 - }, - { - "benchmark": "helm_capabilities", - "result_rows": 61, - "unique_models": 61 - }, - { - "benchmark": "Wordle Arena Word Set", - "result_rows": 75, - "unique_models": 43 - }, - { - "benchmark": "Fibble Arena (1 lie) Word Set", - "result_rows": 64, - "unique_models": 40 - }, - { - "benchmark": "SciArena leaderboard API", - "result_rows": 114, - "unique_models": 38 - }, - { - "benchmark": "Fibble2 Arena (2 lies) Word Set", - "result_rows": 46, - "unique_models": 38 - }, - { - "benchmark": "Fibble5 Arena (5 lies) Word Set", - "result_rows": 50, - "unique_models": 37 - }, - { - "benchmark": "Fibble3 Arena (3 lies) Word Set", - "result_rows": 40, - "unique_models": 37 - }, - { - "benchmark": "Fibble4 Arena (4 lies) Word 
Set", - "result_rows": 38, - "unique_models": 36 - }, - { - "benchmark": "wordle_arena_daily", - "result_rows": 92, - "unique_models": 32 - }, - { - "benchmark": "fibble4_arena_daily", - "result_rows": 84, - "unique_models": 28 - }, - { - "benchmark": "fibble5_arena_daily", - "result_rows": 84, - "unique_models": 28 - }, - { - "benchmark": "fibble_arena_daily", - "result_rows": 82, - "unique_models": 28 - }, - { - "benchmark": "global-mmlu-lite", - "result_rows": 912, - "unique_models": 27 - }, - { - "benchmark": "Easy Problems", - "result_rows": 29, - "unique_models": 27 - }, - { - "benchmark": "Hard Problems", - "result_rows": 29, - "unique_models": 27 - }, - { - "benchmark": "Medium Problems", - "result_rows": 29, - "unique_models": 27 - }, - { - "benchmark": "fibble3_arena_daily", - "result_rows": 75, - "unique_models": 25 - }, - { - "benchmark": "fibble2_arena_daily", - "result_rows": 66, - "unique_models": 22 - }, - { - "benchmark": "apex-agents", - "result_rows": 74, - "unique_models": 20 - }, - { - "benchmark": "ace", - "result_rows": 32, - "unique_models": 12 - }, - { - "benchmark": "apex-v1", - "result_rows": 19, - "unique_models": 10 - }, - { - "benchmark": "La Leaderboard composite dataset", - "result_rows": 5, - "unique_models": 5 - }, - { - "benchmark": "Anthropic RLHF dataset", - "result_rows": 4, - "unique_models": 4 - }, - { - "benchmark": "Best ChatGPT Prompts", - "result_rows": 4, - "unique_models": 4 - }, - { - "benchmark": "Koala test dataset", - "result_rows": 4, - "unique_models": 4 - }, - { - "benchmark": "Open Assistant", - "result_rows": 4, - "unique_models": 4 - }, - { - "benchmark": "Self Instruct", - "result_rows": 4, - "unique_models": 4 - }, - { - "benchmark": "Vicuna", - "result_rows": 4, - "unique_models": 4 - }, - { - "benchmark": "helm_instruct", - "result_rows": 4, - "unique_models": 4 - }, - { - "benchmark": "appworld/test_normal", - "result_rows": 15, - "unique_models": 3 - }, - { - "benchmark": "browsecompplus", - 
"result_rows": 15, - "unique_models": 3 - }, - { - "benchmark": "swe-bench", - "result_rows": 15, - "unique_models": 3 - }, - { - "benchmark": "tau-bench-2/airline", - "result_rows": 15, - "unique_models": 3 - }, - { - "benchmark": "tau-bench-2/retail", - "result_rows": 15, - "unique_models": 3 - }, - { - "benchmark": "tau-bench-2/telecom", - "result_rows": 15, - "unique_models": 3 - } - ], - "normalization_exclusions": { - "incompatible_score_type": 0, - "missing_bounds": 0, - "missing_score": 0, - "out_of_range": 100, - "zero_width_bounds": 0 - }, - "normalized_score_summaries": [ - { - "benchmark": "GPQA", - "count": 4635, - "evaluation_name": "GPQA", - "max": 0.791, - "mean": 0.30281846817691477, - "median": 0.2953, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.168, - "stddev": 0.04912650528590854 - }, - { - "benchmark": "IFEval", - "count": 4635, - "evaluation_name": "IFEval", - "max": 0.951, - "mean": 0.46067240560949296, - "median": 0.4545, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0, - "stddev": 0.20767533842318336 - }, - { - "benchmark": "BBH", - "count": 4574, - "evaluation_name": "BBH", - "max": 0.8269, - "mean": 0.4867208351552252, - "median": 0.5038, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.2178, - "stddev": 0.11398463853942328 - }, - { - "benchmark": "MATH Level 5", - "count": 4574, - "evaluation_name": "MATH Level 5", - "max": 0.7145, - "mean": 0.1555723874070835, - "median": 0.108, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0, - "stddev": 0.14625658002062183 - }, - { - "benchmark": "MMLU-PRO", - "count": 4574, - "evaluation_name": "MMLU-PRO", - "max": 0.7303, - "mean": 0.32874433756012245, - "median": 0.34475, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.1026, - 
"stddev": 0.12833971558059434 - }, - { - "benchmark": "MUSR", - "count": 4574, - "evaluation_name": "MUSR", - "max": 0.6024, - "mean": 0.40635732400524704, - "median": 0.4091, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.2929, - "stddev": 0.04536121071938266 - }, - { - "benchmark": "RewardBench 2", - "count": 197, - "evaluation_name": "Factuality", - "max": 0.8716, - "mean": 0.6400781725888325, - "median": 0.6779, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0274, - "stddev": 0.14060436598989037 - }, - { - "benchmark": "RewardBench 2", - "count": 197, - "evaluation_name": "Focus", - "max": 0.9838, - "mean": 0.6965137055837564, - "median": 0.7293, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0646, - "stddev": 0.1999740938960993 - }, - { - "benchmark": "RewardBench 2", - "count": 197, - "evaluation_name": "Math", - "max": 0.898, - "mean": 0.6002578680203046, - "median": 0.6175, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0546, - "stddev": 0.11530869084864068 - }, - { - "benchmark": "RewardBench 2", - "count": 197, - "evaluation_name": "Precise IF", - "max": 0.6625, - "mean": 0.3724553299492386, - "median": 0.375, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.1313, - "stddev": 0.06683254610514013 - }, - { - "benchmark": "RewardBench 2", - "count": 197, - "evaluation_name": "Safety", - "max": 0.9756, - "mean": 0.770956345177665, - "median": 0.8044, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0378, - "stddev": 0.16859961817216138 - }, - { - "benchmark": "RewardBench 2", - "count": 197, - "evaluation_name": "Score", - "max": 0.8413, - "mean": 0.602605076142132, - "median": 0.6194, - "metric_id": null, - "metric_kind": null, - "metric_name": 
null, - "metric_unit": null, - "min": 0.0576, - "stddev": 0.13540270878209892 - }, - { - "benchmark": "RewardBench 2", - "count": 191, - "evaluation_name": "Ties", - "max": 0.9063, - "mean": 0.5524884816753927, - "median": 0.5604, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.008, - "stddev": 0.19526001389051642 - }, - { - "benchmark": "RewardBench", - "count": 187, - "evaluation_name": "Chat Hard", - "max": 0.9145, - "mean": 0.6117941176470588, - "median": 0.6053, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.2654, - "stddev": 0.1713479724227396 - }, - { - "benchmark": "RewardBench", - "count": 187, - "evaluation_name": "Chat", - "max": 0.9944, - "mean": 0.8923390374331551, - "median": 0.9413, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.3547, - "stddev": 0.12437365150350695 - }, - { - "benchmark": "RewardBench", - "count": 187, - "evaluation_name": "Safety", - "max": 0.9514, - "mean": 0.75624064171123, - "median": 0.7946, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.3743, - "stddev": 0.14897429003710377 - }, - { - "benchmark": "RewardBench", - "count": 187, - "evaluation_name": "Score", - "max": 0.9511, - "mean": 0.7524326203208556, - "median": 0.7455, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.4727, - "stddev": 0.12766260032441618 - }, - { - "benchmark": "RewardBench", - "count": 172, - "evaluation_name": "Reasoning", - "max": 0.9912, - "mean": 0.779306976744186, - "median": 0.80125, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.2821, - "stddev": 0.16510278548710738 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 133, - "evaluation_name": "v2_Semi_Private", - "max": 0.9999676010927855, - "mean": 
0.9675588722465054, - "median": 0.9969557986781246, - "metric_id": "cost_per_task", - "metric_kind": "cost", - "metric_name": "Cost per task", - "metric_unit": "usd", - "min": 0.0, - "stddev": 0.11170694755172818 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 133, - "evaluation_name": "v2_Semi_Private", - "max": 1.0, - "mean": 0.1482124060150376, - "median": 0.0333, - "metric_id": "score", - "metric_kind": "accuracy", - "metric_name": "ARC score", - "metric_unit": "proportion", - "min": 0.0, - "stddev": 0.23541775910763008 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 131, - "evaluation_name": "v1_Semi_Private", - "max": 0.9999805606556713, - "mean": 0.9826512407799388, - "median": 0.9976983816314812, - "metric_id": "cost_per_task", - "metric_kind": "cost", - "metric_name": "Cost per task", - "metric_unit": "usd", - "min": 0.4264226887417708, - "stddev": 0.058880684082207674 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 131, - "evaluation_name": "v1_Semi_Private", - "max": 0.98, - "mean": 0.44456030534351143, - "median": 0.4, - "metric_id": "score", - "metric_kind": "accuracy", - "metric_name": "ARC score", - "metric_unit": "proportion", - "min": 0.0, - "stddev": 0.2907857931349756 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 125, - "evaluation_name": "v2_Public_Eval", - "max": 0.9999663051364969, - "mean": 0.9846037386298053, - "median": 0.9968910008636955, - "metric_id": "cost_per_task", - "metric_kind": "cost", - "metric_name": "Cost per task", - "metric_unit": "usd", - "min": 0.7719116932098415, - "stddev": 0.03818843389200095 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 125, - "evaluation_name": "v2_Public_Eval", - "max": 1.0, - "mean": 0.1310936, - "median": 0.029, - "metric_id": "score", - "metric_kind": "accuracy", - "metric_name": "ARC score", - "metric_unit": "proportion", - "min": 0.0, - "stddev": 
0.23801453457380936 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 114, - "evaluation_name": "v1_Public_Eval", - "max": 0.999984448524537, - "mean": 0.9935590006174354, - "median": 0.998216116168769, - "metric_id": "cost_per_task", - "metric_kind": "cost", - "metric_name": "Cost per task", - "metric_unit": "usd", - "min": 0.899950878565301, - "stddev": 0.014694808632437306 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 114, - "evaluation_name": "v1_Public_Eval", - "max": 0.9825, - "mean": 0.5073622807017544, - "median": 0.5056499999999999, - "metric_id": "score", - "metric_kind": "accuracy", - "metric_name": "ARC score", - "metric_unit": "proportion", - "min": 0.0175, - "stddev": 0.2800617230927051 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.live.live_accuracy", - "max": 0.9312, - "mean": 0.6721155963302752, - "median": 0.7076, - "metric_id": "bfcl.live.live_accuracy", - "metric_kind": "accuracy", - "metric_name": "Live accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 0.16692855101327364 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.live.live_multiple_ast_accuracy", - "max": 0.9401999999999999, - "mean": 0.6615788990825688, - "median": 0.7104, - "metric_id": "bfcl.live.live_multiple_ast_accuracy", - "metric_kind": "accuracy", - "metric_name": "Live multiple AST accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 0.17084967242914786 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.live.live_parallel_ast_accuracy", - "max": 0.9375, - "mean": 0.6427752293577982, - "median": 0.75, - "metric_id": "bfcl.live.live_parallel_ast_accuracy", - "metric_kind": "accuracy", - "metric_name": "Live parallel AST accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 0.24460198666555008 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 
109, - "evaluation_name": "bfcl.live.live_parallel_multiple_ast_accuracy", - "max": 0.9582999999999999, - "mean": 0.5703339449541285, - "median": 0.625, - "metric_id": "bfcl.live.live_parallel_multiple_ast_accuracy", - "metric_kind": "accuracy", - "metric_name": "Live parallel multiple AST accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 0.2059801726435246 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.live.live_simple_ast_accuracy", - "max": 0.9031, - "mean": 0.726408256880734, - "median": 0.7636, - "metric_id": "bfcl.live.live_simple_ast_accuracy", - "metric_kind": "accuracy", - "metric_name": "Live simple AST accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 0.1625125032958663 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.memory.accuracy", - "max": 0.7376, - "mean": 0.20235045871559632, - "median": 0.157, - "metric_id": "bfcl.memory.accuracy", - "metric_kind": "accuracy", - "metric_name": "Memory accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 0.1699218603771948 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.memory.kv_accuracy", - "max": 0.7097, - "mean": 0.13904036697247707, - "median": 0.0839, - "metric_id": "bfcl.memory.kv_accuracy", - "metric_kind": "accuracy", - "metric_name": "Memory KV accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 0.1515138492137527 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.memory.recursive_summarization_accuracy", - "max": 0.8323, - "mean": 0.2820403669724771, - "median": 0.271, - "metric_id": "bfcl.memory.recursive_summarization_accuracy", - "metric_kind": "accuracy", - "metric_name": "Memory recursive summarization accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 0.208463795648454 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - 
"evaluation_name": "bfcl.memory.vector_accuracy", - "max": 0.7290000000000001, - "mean": 0.18597155963302753, - "median": 0.1161, - "metric_id": "bfcl.memory.vector_accuracy", - "metric_kind": "accuracy", - "metric_name": "Memory vector accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 0.18379301567138523 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.multi_turn.accuracy", - "max": 0.7737999999999999, - "mean": 0.23962385321100918, - "median": 0.165, - "metric_id": "bfcl.multi_turn.accuracy", - "metric_kind": "accuracy", - "metric_name": "Multi-turn accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 0.21479676048452157 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.multi_turn.base_accuracy", - "max": 0.825, - "mean": 0.29009174311926605, - "median": 0.2, - "metric_id": "bfcl.multi_turn.base_accuracy", - "metric_kind": "accuracy", - "metric_name": "Multi-turn base accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 0.24897845144318115 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.multi_turn.long_context_accuracy", - "max": 0.76, - "mean": 0.24009174311926607, - "median": 0.175, - "metric_id": "bfcl.multi_turn.long_context_accuracy", - "metric_kind": "accuracy", - "metric_name": "Multi-turn long-context accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 0.2138372755020874 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.multi_turn.miss_function_accuracy", - "max": 0.77, - "mean": 0.21591743119266055, - "median": 0.14, - "metric_id": "bfcl.multi_turn.miss_function_accuracy", - "metric_kind": "accuracy", - "metric_name": "Multi-turn missing function accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 0.2171396175036615 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": 
"bfcl.multi_turn.miss_parameter_accuracy", - "max": 0.74, - "mean": 0.21238532110091743, - "median": 0.15, - "metric_id": "bfcl.multi_turn.miss_parameter_accuracy", - "metric_kind": "accuracy", - "metric_name": "Multi-turn missing parameter accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 0.194452693868985 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.non_live.ast_accuracy", - "max": 0.9065000000000001, - "mean": 0.7661733944954129, - "median": 0.83, - "metric_id": "bfcl.non_live.ast_accuracy", - "metric_kind": "accuracy", - "metric_name": "Non-live AST accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 0.18657086363085557 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.non_live.multiple_ast_accuracy", - "max": 0.97, - "mean": 0.8535779816513761, - "median": 0.92, - "metric_id": "bfcl.non_live.multiple_ast_accuracy", - "metric_kind": "accuracy", - "metric_name": "Non-live multiple AST accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 0.182740318362281 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.non_live.parallel_ast_accuracy", - "max": 0.96, - "mean": 0.7979816513761467, - "median": 0.88, - "metric_id": "bfcl.non_live.parallel_ast_accuracy", - "metric_kind": "accuracy", - "metric_name": "Non-live parallel AST accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 0.2273336991546167 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.non_live.parallel_multiple_ast_accuracy", - "max": 0.925, - "mean": 0.7347706422018349, - "median": 0.825, - "metric_id": "bfcl.non_live.parallel_multiple_ast_accuracy", - "metric_kind": "accuracy", - "metric_name": "Non-live parallel multiple AST accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 0.24427840192832814 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - 
"evaluation_name": "bfcl.non_live.simple_ast_accuracy", - "max": 0.8067, - "mean": 0.6783633027522936, - "median": 0.7258, - "metric_id": "bfcl.non_live.simple_ast_accuracy", - "metric_kind": "accuracy", - "metric_name": "Non-live simple AST accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 0.14843039998882532 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.overall.latency_mean_s", - "max": 0.9959969388355802, - "mean": 0.910949171600733, - "median": 0.9723906516748102, - "metric_id": "bfcl.overall.latency_mean_s", - "metric_kind": "latency", - "metric_name": "Latency mean", - "metric_unit": "seconds", - "min": 0.0, - "stddev": 0.16788751393048792 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.overall.latency_p95_s", - "max": 0.9983116129372659, - "mean": 0.9052860681766953, - "median": 0.9794227826729278, - "metric_id": "bfcl.overall.latency_p95_s", - "metric_kind": "latency", - "metric_name": "Latency 95th percentile", - "metric_unit": "seconds", - "min": 0.0, - "stddev": 0.17750828285090742 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.overall.latency_std_s", - "max": 0.9978872247523358, - "mean": 0.8712378709255851, - "median": 0.9528616366965585, - "metric_id": "bfcl.overall.latency_std_s", - "metric_kind": "latency", - "metric_name": "Latency standard deviation", - "metric_unit": "seconds", - "min": 0.0, - "stddev": 0.18715211182331667 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.overall.overall_accuracy", - "max": 0.7746999999999999, - "mean": 0.3809394495412844, - "median": 0.3552, - "metric_id": "bfcl.overall.overall_accuracy", - "metric_kind": "accuracy", - "metric_name": "Overall accuracy", - "metric_unit": "percentage", - "min": 0.0717, - "stddev": 0.1568359888890471 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": 
"bfcl.overall.rank", - "max": 1.0, - "mean": 0.5, - "median": 0.5, - "metric_id": "bfcl.overall.rank", - "metric_kind": "rank", - "metric_name": "Overall rank", - "metric_unit": "position", - "min": 0.0, - "stddev": 0.2926814601721238 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.overall.total_cost_usd", - "max": 0.9987048455669116, - "mean": 0.8673404362764129, - "median": 0.9486161556437762, - "metric_id": "bfcl.overall.total_cost_usd", - "metric_kind": "cost", - "metric_name": "Total cost", - "metric_unit": "usd", - "min": 0.0, - "stddev": 0.2029161256124978 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.relevance.irrelevance_detection_accuracy", - "max": 1.0, - "mean": 0.7561073394495413, - "median": 0.8079000000000001, - "metric_id": "bfcl.relevance.irrelevance_detection_accuracy", - "metric_kind": "accuracy", - "metric_name": "Irrelevance detection accuracy", - "metric_unit": "percentage", - "min": 0.06280000000000001, - "stddev": 0.16896574532662487 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.relevance.relevance_detection_accuracy", - "max": 1.0, - "mean": 0.7637614678899083, - "median": 0.8125, - "metric_id": "bfcl.relevance.relevance_detection_accuracy", - "metric_kind": "accuracy", - "metric_name": "Relevance detection accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 0.19862042242738473 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.web_search.accuracy", - "max": 0.845, - "mean": 0.24573394495412845, - "median": 0.105, - "metric_id": "bfcl.web_search.accuracy", - "metric_kind": "accuracy", - "metric_name": "Web-search accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 0.28751797503234583 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.web_search.base_accuracy", - "max": 0.87, - "mean": 
0.2646788990825688, - "median": 0.13, - "metric_id": "bfcl.web_search.base_accuracy", - "metric_kind": "accuracy", - "metric_name": "Web-search base accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 0.29552705211555524 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.web_search.no_snippet_accuracy", - "max": 0.85, - "mean": 0.22678899082568807, - "median": 0.09, - "metric_id": "bfcl.web_search.no_snippet_accuracy", - "metric_kind": "accuracy", - "metric_name": "Web-search no-snippet accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 0.28410639873751836 - }, - { - "benchmark": "RewardBench", - "count": 105, - "evaluation_name": "Prior Sets (0.5 weight)", - "max": 0.782, - "mean": 0.5625428571428571, - "median": 0.5757, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0, - "stddev": 0.17788750218625798 - }, - { - "benchmark": "LegalBench", - "count": 91, - "evaluation_name": "LegalBench", - "max": 0.757, - "mean": 0.5902087912087912, - "median": 0.629, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.331, - "stddev": 0.11619442676283923 - }, - { - "benchmark": "MATH", - "count": 91, - "evaluation_name": "MATH", - "max": 0.92, - "mean": 0.5574065934065934, - "median": 0.656, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.026, - "stddev": 0.2685588691111619 - }, - { - "benchmark": "MMLU", - "count": 91, - "evaluation_name": "MMLU", - "max": 0.809, - "mean": 0.6220989010989011, - "median": 0.643, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.243, - "stddev": 0.12023218786489331 - }, - { - "benchmark": "MedQA", - "count": 91, - "evaluation_name": "MedQA", - "max": 0.863, - "mean": 0.6103296703296703, - "median": 0.64, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - 
"metric_unit": null, - "min": 0.229, - "stddev": 0.15792234765120447 - }, - { - "benchmark": "NarrativeQA", - "count": 91, - "evaluation_name": "NarrativeQA", - "max": 0.804, - "mean": 0.6938461538461539, - "median": 0.742, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.111, - "stddev": 0.1228501275789075 - }, - { - "benchmark": "NaturalQuestions (closed-book)", - "count": 91, - "evaluation_name": "NaturalQuestions (closed-book)", - "max": 0.502, - "mean": 0.3627912087912088, - "median": 0.378, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.028, - "stddev": 0.08850543190907255 - }, - { - "benchmark": "OpenbookQA", - "count": 91, - "evaluation_name": "OpenbookQA", - "max": 0.972, - "mean": 0.8312527472527472, - "median": 0.882, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.222, - "stddev": 0.16911788087383792 - }, - { - "benchmark": "WMT 2014", - "count": 91, - "evaluation_name": "WMT 2014", - "max": 0.262, - "mean": 0.18178021978021977, - "median": 0.191, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.023, - "stddev": 0.04641450975187302 - }, - { - "benchmark": "helm_lite", - "count": 91, - "evaluation_name": "Mean win rate", - "max": 0.938, - "mean": 0.499967032967033, - "median": 0.488, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.041, - "stddev": 0.24004497034928224 - }, - { - "benchmark": "GSM8K", - "count": 90, - "evaluation_name": "GSM8K", - "max": 0.956, - "mean": 0.6740333333333334, - "median": 0.765, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.028, - "stddev": 0.24790177694247365 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Abstract Algebra", - "max": 0.84, - "mean": 0.4692405063291139, - "median": 0.44, 
- "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.21, - "stddev": 0.1566784405169303 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Anatomy", - "max": 0.911, - "mean": 0.7049620253164557, - "median": 0.719, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.222, - "stddev": 0.12203524533321435 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Astronomy", - "max": 0.974, - "mean": 0.8196835443037974, - "median": 0.855, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.342, - "stddev": 0.12503810130124515 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Business Ethics", - "max": 0.89, - "mean": 0.7354430379746836, - "median": 0.77, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.24, - "stddev": 0.1177001565076888 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Clinical Knowledge", - "max": 0.928, - "mean": 0.7806329113924051, - "median": 0.8, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.26, - "stddev": 0.10518545005348215 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "College Physics", - "max": 0.863, - "mean": 0.5205189873417722, - "median": 0.51, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.196, - "stddev": 0.13341576241396605 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Computer Security", - "max": 0.89, - "mean": 0.7888607594936708, - "median": 0.8, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.3, - "stddev": 0.07740978772295665 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Conceptual Physics", - "max": 0.949, - 
"mean": 0.7394050632911392, - "median": 0.774, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.319, - "stddev": 0.1436847973853721 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Econometrics", - "max": 0.807, - "mean": 0.5924556962025317, - "median": 0.614, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.307, - "stddev": 0.12405156056525753 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Electrical Engineering", - "max": 0.869, - "mean": 0.7012531645569621, - "median": 0.724, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.29, - "stddev": 0.10967007262512768 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Elementary Mathematics", - "max": 0.942, - "mean": 0.6168481012658228, - "median": 0.622, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.254, - "stddev": 0.17076712953141734 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Formal Logic", - "max": 0.786, - "mean": 0.5559240506329114, - "median": 0.571, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.27, - "stddev": 0.11667484646986527 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Global Facts", - "max": 0.8, - "mean": 0.49860759493670886, - "median": 0.5, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.25, - "stddev": 0.11856767165669667 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "High School World History", - "max": 0.958, - "mean": 0.8590253164556962, - "median": 0.89, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.253, - "stddev": 0.1104488482004626 - }, - { - "benchmark": "helm_mmlu", - 
"count": 79, - "evaluation_name": "Human Sexuality", - "max": 0.939, - "mean": 0.7969367088607595, - "median": 0.84, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.267, - "stddev": 0.14067149783040647 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "International Law", - "max": 0.959, - "mean": 0.8525189873417721, - "median": 0.884, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.306, - "stddev": 0.09770414010589916 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Jurisprudence", - "max": 0.907, - "mean": 0.8231518987341773, - "median": 0.852, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.25, - "stddev": 0.09722219971870344 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Logical Fallacies", - "max": 0.926, - "mean": 0.8139873417721519, - "median": 0.834, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.264, - "stddev": 0.0972786763034739 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "MMLU All Subjects", - "max": 0.873, - "mean": 0.7308227848101266, - "median": 0.757, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.295, - "stddev": 0.10005918242229046 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Machine Learning", - "max": 0.839, - "mean": 0.592126582278481, - "median": 0.616, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.286, - "stddev": 0.12807703682255595 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Management", - "max": 0.942, - "mean": 0.8453037974683544, - "median": 0.864, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.272, - 
"stddev": 0.09395052631917909 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Marketing", - "max": 0.962, - "mean": 0.9024556962025316, - "median": 0.923, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.269, - "stddev": 0.08556236254220637 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Mean win rate", - "max": 1.0, - "mean": 0.5000506329113924, - "median": 0.517, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.014, - "stddev": 0.2741845671999428 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Medical Genetics", - "max": 0.98, - "mean": 0.8162025316455697, - "median": 0.84, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.28, - "stddev": 0.11717074761250226 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Miscellaneous", - "max": 0.964, - "mean": 0.8688607594936709, - "median": 0.893, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.292, - "stddev": 0.09859535722376811 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Moral Scenarios", - "max": 0.902, - "mean": 0.5793924050632911, - "median": 0.575, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.231, - "stddev": 0.19478445797799818 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Nutrition", - "max": 0.928, - "mean": 0.7968987341772152, - "median": 0.82, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.34, - "stddev": 0.1008295839442827 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Philosophy", - "max": 0.9, - "mean": 0.7844303797468355, - "median": 0.807, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - 
"metric_unit": null, - "min": 0.325, - "stddev": 0.09312807331625374 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Prehistory", - "max": 0.951, - "mean": 0.824746835443038, - "median": 0.858, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.318, - "stddev": 0.10757030716441658 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Professional Psychology", - "max": 0.922, - "mean": 0.7793291139240506, - "median": 0.812, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.232, - "stddev": 0.1177310844427953 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Public Relations", - "max": 0.855, - "mean": 0.724873417721519, - "median": 0.736, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.345, - "stddev": 0.0757594653625247 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Security Studies", - "max": 0.886, - "mean": 0.778126582278481, - "median": 0.804, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.408, - "stddev": 0.09570378540441088 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Sociology", - "max": 0.96, - "mean": 0.8729493670886076, - "median": 0.9, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.383, - "stddev": 0.08587676004752948 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Us Foreign Policy", - "max": 0.97, - "mean": 0.8918987341772152, - "median": 0.92, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.26, - "stddev": 0.09360413026947771 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Virology", - "max": 0.602, - "mean": 0.5457215189873418, - "median": 0.56, - "metric_id": null, 
- "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.392, - "stddev": 0.047070851318166546 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "World Religions", - "max": 0.924, - "mean": 0.8426455696202532, - "median": 0.865, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.234, - "stddev": 0.08472202480187987 - }, - { - "benchmark": "MMLU-Pro", - "count": 61, - "evaluation_name": "MMLU-Pro", - "max": 0.875, - "mean": 0.6609344262295082, - "median": 0.723, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.169, - "stddev": 0.1866150109050233 - }, - { - "benchmark": "Omni-MATH", - "count": 61, - "evaluation_name": "Omni-MATH", - "max": 0.722, - "mean": 0.3746065573770492, - "median": 0.364, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.072, - "stddev": 0.17904862269679006 - }, - { - "benchmark": "WildBench", - "count": 61, - "evaluation_name": "WildBench", - "max": 0.866, - "mean": 0.7791803278688525, - "median": 0.797, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.477, - "stddev": 0.07613989497338025 - }, - { - "benchmark": "helm_capabilities", - "count": 61, - "evaluation_name": "Mean score", - "max": 0.819, - "mean": 0.6281803278688525, - "median": 0.642, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.325, - "stddev": 0.12667261058817744 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Arabic", - "max": 0.9475, - "mean": 0.8123458333333333, - "median": 0.82375, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.455, - "stddev": 0.11404825771861875 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Bengali", - "max": 0.9425, - "mean": 
0.8118458333333334, - "median": 0.82375, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.5175, - "stddev": 0.10786060736231451 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Burmese", - "max": 0.945, - "mean": 0.8254416666666666, - "median": 0.8375, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.63, - "stddev": 0.08983182356393916 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Chinese", - "max": 0.9475, - "mean": 0.80325, - "median": 0.835, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.5075, - "stddev": 0.12931314787277418 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Culturally Agnostic", - "max": 0.9528, - "mean": 0.8264125, - "median": 0.857, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.5631, - "stddev": 0.10811543599320127 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Culturally Sensitive", - "max": 0.9397, - "mean": 0.788525, - "median": 0.78935, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.5391, - "stddev": 0.1149148963548909 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "English", - "max": 0.9475, - "mean": 0.7939833333333334, - "median": 0.84, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.38, - "stddev": 0.15081692344416497 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "French", - "max": 0.9575, - "mean": 0.7944791666666666, - "median": 0.8275, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.41, - "stddev": 0.14230966528431346 - }, - { - "benchmark": "global-mmlu-lite", - "count": 
48, - "evaluation_name": "German", - "max": 0.94, - "mean": 0.8004833333333333, - "median": 0.8275, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.4775, - "stddev": 0.12445258061886479 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Global MMLU Lite", - "max": 0.9453, - "mean": 0.8074583333333334, - "median": 0.82315, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.5511, - "stddev": 0.11081356363967734 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Hindi", - "max": 0.9475, - "mean": 0.7983333333333333, - "median": 0.82355, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.555, - "stddev": 0.11719085240122123 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Indonesian", - "max": 0.955, - "mean": 0.801275, - "median": 0.80625, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.515, - "stddev": 0.11649187077838011 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Italian", - "max": 0.955, - "mean": 0.8056875, - "median": 0.8300000000000001, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.48, - "stddev": 0.1239779332201175 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Japanese", - "max": 0.94, - "mean": 0.8170291666666667, - "median": 0.84375, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.58, - "stddev": 0.10297801657229139 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Korean", - "max": 0.95, - "mean": 0.820125, - "median": 0.84, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.595, - "stddev": 
0.10111529652574511 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Portuguese", - "max": 0.945, - "mean": 0.8010041666666666, - "median": 0.8323, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.5175, - "stddev": 0.12492813757011505 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Spanish", - "max": 0.9475, - "mean": 0.8042458333333333, - "median": 0.8325, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.485, - "stddev": 0.12684843352857172 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Swahili", - "max": 0.94, - "mean": 0.8143708333333334, - "median": 0.8200000000000001, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.6075, - "stddev": 0.09313423156204427 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Yoruba", - "max": 0.9425, - "mean": 0.8155583333333334, - "median": 0.8223, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.5825, - "stddev": 0.09530013023440752 - }, - { - "benchmark": "Wordle Arena Word Set", - "count": 43, - "evaluation_name": "wordle_arena_win_rate", - "max": 1.0, - "mean": 0.38320930232558137, - "median": 0.3, - "metric_id": "wordle_arena.win_rate", - "metric_kind": null, - "metric_name": "Win Rate", - "metric_unit": "proportion", - "min": 0.0, - "stddev": 0.3652171551113076 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 40, - "evaluation_name": "bfcl.format_sensitivity.max_delta", - "max": 1.0, - "mean": 0.654875, - "median": 0.775, - "metric_id": "bfcl.format_sensitivity.max_delta", - "metric_kind": "difference", - "metric_name": "Format sensitivity max delta", - "metric_unit": "percentage_points", - "min": 0.18500000000000005, - "stddev": 0.2671153776928614 - }, - { - "benchmark": "BFCL 
leaderboard CSV", - "count": 40, - "evaluation_name": "bfcl.format_sensitivity.stddev", - "max": 1.0, - "mean": 0.8911525, - "median": 0.9430000000000001, - "metric_id": "bfcl.format_sensitivity.stddev", - "metric_kind": "difference", - "metric_name": "Format sensitivity standard deviation", - "metric_unit": "percentage_points", - "min": 0.6582, - "stddev": 0.0973636534813211 - }, - { - "benchmark": "Fibble Arena (1 lie) Word Set", - "count": 40, - "evaluation_name": "fibble1_arena_win_rate", - "max": 0.881, - "mean": 0.1804375, - "median": 0.08990000000000001, - "metric_id": "fibble1_arena.win_rate", - "metric_kind": null, - "metric_name": "Win Rate", - "metric_unit": "proportion", - "min": 0.0, - "stddev": 0.24891262362266392 - }, - { - "benchmark": "Fibble2 Arena (2 lies) Word Set", - "count": 38, - "evaluation_name": "fibble2_arena_win_rate", - "max": 0.3, - "mean": 0.03426315789473684, - "median": 0.0, - "metric_id": "fibble2_arena.win_rate", - "metric_kind": null, - "metric_name": "Win Rate", - "metric_unit": "proportion", - "min": 0.0, - "stddev": 0.07868617592468241 - }, - { - "benchmark": "SciArena leaderboard API", - "count": 38, - "evaluation_name": "overall_cost_per_100_calls_usd", - "max": 0.9980941077911198, - "mean": 0.8975751958435603, - "median": 0.9749930187098577, - "metric_id": "cost_per_100_calls_usd", - "metric_kind": "cost", - "metric_name": "Cost per 100 calls", - "metric_unit": "usd", - "min": 0.0, - "stddev": 0.22179374764532372 - }, - { - "benchmark": "SciArena leaderboard API", - "count": 38, - "evaluation_name": "overall_elo", - "max": 1.0, - "mean": 0.5286286504070941, - "median": 0.5551610136078723, - "metric_id": "elo", - "metric_kind": "elo", - "metric_name": "Elo rating", - "metric_unit": "points", - "min": 0.0, - "stddev": 0.23322432731710405 - }, - { - "benchmark": "SciArena leaderboard API", - "count": 38, - "evaluation_name": "overall_rank", - "max": 1.0, - "mean": 0.5, - "median": 0.5, - "metric_id": "rank", - "metric_kind": 
"rank", - "metric_name": "Rank", - "metric_unit": "position", - "min": 0.0, - "stddev": 0.30035284825530906 - }, - { - "benchmark": "Fibble3 Arena (3 lies) Word Set", - "count": 37, - "evaluation_name": "fibble3_arena_win_rate", - "max": 0.333, - "mean": 0.010551351351351351, - "median": 0.0, - "metric_id": "fibble3_arena.win_rate", - "metric_kind": null, - "metric_name": "Win Rate", - "metric_unit": "proportion", - "min": 0.0, - "stddev": 0.0548872866357432 - }, - { - "benchmark": "Fibble5 Arena (5 lies) Word Set", - "count": 37, - "evaluation_name": "fibble5_arena_win_rate", - "max": 0.6364, - "mean": 0.09143783783783783, - "median": 0.0, - "metric_id": "fibble5_arena.win_rate", - "metric_kind": null, - "metric_name": "Win Rate", - "metric_unit": "proportion", - "min": 0.0, - "stddev": 0.17735344709138076 - }, - { - "benchmark": "Fibble4 Arena (4 lies) Word Set", - "count": 36, - "evaluation_name": "fibble4_arena_win_rate", - "max": 0.0732, - "mean": 0.0028055555555555555, - "median": 0.0, - "metric_id": "fibble4_arena.win_rate", - "metric_kind": null, - "metric_name": "Win Rate", - "metric_unit": "proportion", - "min": 0.0, - "stddev": 0.012925853261571653 - }, - { - "benchmark": "Wordle Arena Word Set", - "count": 32, - "evaluation_name": "wordle_arena_avg_attempts", - "max": 0.534, - "mean": 0.29775, - "median": 0.308, - "metric_id": "wordle_arena.avg_attempts", - "metric_kind": null, - "metric_name": "Average Attempts", - "metric_unit": "guesses", - "min": 0.0, - "stddev": 0.15686834422328438 - }, - { - "benchmark": "wordle_arena_daily", - "count": 32, - "evaluation_name": "wordle_arena_avg_attempts", - "max": 0.46599999999999997, - "mean": 0.161125, - "median": 0.12700000000000006, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0, - "stddev": 0.1632183872469149 - }, - { - "benchmark": "wordle_arena_daily", - "count": 32, - "evaluation_name": "wordle_arena_win_rate", - "max": 1.0, - "mean": 0.4733125, - 
"median": 0.4165, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0, - "stddev": 0.43730335211863347 - }, - { - "benchmark": "Easy Problems", - "count": 29, - "evaluation_name": "Easy Problems", - "max": 0.9014, - "mean": 0.4996824672170957, - "median": 0.5352, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.056338028169014086, - "stddev": 0.2844141675332875 - }, - { - "benchmark": "Hard Problems", - "count": 29, - "evaluation_name": "Hard Problems", - "max": 0.1594, - "mean": 0.009876639145216123, - "median": 0.0, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0, - "stddev": 0.03091194562296296 - }, - { - "benchmark": "Medium Problems", - "count": 29, - "evaluation_name": "Medium Problems", - "max": 0.5211, - "mean": 0.11304244779018942, - "median": 0.056338028169014086, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0, - "stddev": 0.14517143188248943 - }, - { - "benchmark": "fibble4_arena_daily", - "count": 28, - "evaluation_name": "fibble4_arena_avg_latency_ms", - "max": 0.9995416666666667, - "mean": 0.8651820238095238, - "median": 0.9580791666666666, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.25770499999999996, - "stddev": 0.21579466004033868 - }, - { - "benchmark": "fibble4_arena_daily", - "count": 28, - "evaluation_name": "fibble4_arena_win_rate", - "max": 0.667, - "mean": 0.06310714285714286, - "median": 0.0, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0, - "stddev": 0.1740985968598582 - }, - { - "benchmark": "fibble5_arena_daily", - "count": 28, - "evaluation_name": "fibble5_arena_avg_latency_ms", - "max": 0.9994566666666667, - "mean": 0.94403, - "median": 0.9936783333333333, - "metric_id": null, - "metric_kind": null, - 
"metric_name": null, - "metric_unit": null, - "min": 0.6904066666666666, - "stddev": 0.07859265956074572 - }, - { - "benchmark": "fibble5_arena_daily", - "count": 28, - "evaluation_name": "fibble5_arena_win_rate", - "max": 1.0, - "mean": 0.2728214285714286, - "median": 0.0, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0, - "stddev": 0.40591789477871915 - }, - { - "benchmark": "fibble_arena_daily", - "count": 28, - "evaluation_name": "fibble_arena_avg_attempts", - "max": 0.48571428571428577, - "mean": 0.0935204081632653, - "median": 0.027142857142857135, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0, - "stddev": 0.1390761452271949 - }, - { - "benchmark": "fibble_arena_daily", - "count": 28, - "evaluation_name": "fibble_arena_win_rate", - "max": 1.0, - "mean": 0.2829642857142857, - "median": 0.14300000000000002, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0, - "stddev": 0.34601974704330873 - }, - { - "benchmark": "wordle_arena_daily", - "count": 28, - "evaluation_name": "wordle_arena_avg_latency_ms", - "max": 0.99951, - "mean": 0.9754844642857143, - "median": 0.9956175, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.91617, - "stddev": 0.02958724541010069 - }, - { - "benchmark": "fibble_arena_daily", - "count": 26, - "evaluation_name": "fibble_arena_avg_latency_ms", - "max": 0.9997083333333333, - "mean": 0.9607440384615384, - "median": 0.9966616666666667, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.80352, - "stddev": 0.05500541034838315 - }, - { - "benchmark": "fibble3_arena_daily", - "count": 25, - "evaluation_name": "fibble3_arena_avg_latency_ms", - "max": 0.999565, - "mean": 0.9165300666666667, - "median": 0.9949433333333333, - "metric_id": null, - "metric_kind": null, - "metric_name": 
null, - "metric_unit": null, - "min": 0.4411333333333334, - "stddev": 0.13092548313502184 - }, - { - "benchmark": "fibble3_arena_daily", - "count": 25, - "evaluation_name": "fibble3_arena_win_rate", - "max": 1.0, - "mean": 0.07, - "median": 0.0, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0, - "stddev": 0.22267315359812312 - }, - { - "benchmark": "Fibble Arena (1 lie) Word Set", - "count": 24, - "evaluation_name": "fibble1_arena_avg_attempts", - "max": 0.6071428571428572, - "mean": 0.30160714285714285, - "median": 0.2957142857142857, - "metric_id": "fibble1_arena.avg_attempts", - "metric_kind": null, - "metric_name": "Average Attempts", - "metric_unit": "guesses", - "min": 0.0, - "stddev": 0.13236522334576908 - }, - { - "benchmark": "fibble2_arena_daily", - "count": 22, - "evaluation_name": "fibble2_arena_avg_latency_ms", - "max": 0.99951, - "mean": 0.9446994696969697, - "median": 0.9964583333333333, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.774135, - "stddev": 0.0790297132511389 - }, - { - "benchmark": "fibble2_arena_daily", - "count": 22, - "evaluation_name": "fibble2_arena_win_rate", - "max": 0.75, - "mean": 0.049227272727272724, - "median": 0.0, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0, - "stddev": 0.17183545058550076 - }, - { - "benchmark": "apex-agents", - "count": 19, - "evaluation_name": "Corporate Lawyer Mean Score", - "max": 0.548, - "mean": 0.38605263157894737, - "median": 0.394, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.147, - "stddev": 0.1127334484940327 - }, - { - "benchmark": "appworld/test_normal", - "count": 15, - "evaluation_name": "appworld/test_normal", - "max": 0.7, - "mean": 0.38053333333333333, - "median": 0.505, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 
0.0, - "stddev": 0.2795156184408681 - }, - { - "benchmark": "browsecompplus", - "count": 15, - "evaluation_name": "browsecompplus", - "max": 0.61, - "mean": 0.47951333333333335, - "median": 0.48, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.26, - "stddev": 0.09206274930461185 - }, - { - "benchmark": "swe-bench", - "count": 15, - "evaluation_name": "swe-bench", - "max": 0.8072, - "mean": 0.6515666666666666, - "median": 0.65, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.5253, - "stddev": 0.08692541685397948 - }, - { - "benchmark": "tau-bench-2/airline", - "count": 15, - "evaluation_name": "tau-bench-2/airline", - "max": 0.74, - "mean": 0.6333333333333333, - "median": 0.66, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.48, - "stddev": 0.0830375703837612 - }, - { - "benchmark": "tau-bench-2/retail", - "count": 15, - "evaluation_name": "tau-bench-2/retail", - "max": 0.85, - "mean": 0.7409, - "median": 0.78, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.51, - "stddev": 0.09942657736095659 - }, - { - "benchmark": "tau-bench-2/telecom", - "count": 15, - "evaluation_name": "tau-bench-2/telecom", - "max": 0.8876, - "mean": 0.69824, - "median": 0.73, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.53, - "stddev": 0.12537163497834292 - }, - { - "benchmark": "Fibble5 Arena (5 lies) Word Set", - "count": 13, - "evaluation_name": "fibble5_arena_avg_attempts", - "max": 0.8571428571428572, - "mean": 0.31, - "median": 0.2857142857142857, - "metric_id": "fibble5_arena.avg_attempts", - "metric_kind": null, - "metric_name": "Average Attempts", - "metric_unit": "guesses", - "min": 0.1428571428571429, - "stddev": 0.18710286741040952 - }, - { - "benchmark": "apex-agents", - "count": 13, - "evaluation_name": "Overall 
Pass@1", - "max": 0.335, - "mean": 0.20892307692307693, - "median": 0.23, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.04, - "stddev": 0.09209276259878907 - }, - { - "benchmark": "ace", - "count": 12, - "evaluation_name": "Gaming Score", - "max": 0.613, - "mean": 0.4613333333333333, - "median": 0.46199999999999997, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.284, - "stddev": 0.130447713895668 - }, - { - "benchmark": "ace", - "count": 11, - "evaluation_name": "Overall Score", - "max": 0.561, - "mean": 0.47963636363636364, - "median": 0.478, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.38, - "stddev": 0.06701831500011432 - }, - { - "benchmark": "apex-agents", - "count": 9, - "evaluation_name": "Corporate Law Pass@1", - "max": 0.266, - "mean": 0.18122222222222223, - "median": 0.189, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.078, - "stddev": 0.06984586204238906 - }, - { - "benchmark": "apex-agents", - "count": 9, - "evaluation_name": "Overall Mean Score", - "max": 0.401, - "mean": 0.3071111111111111, - "median": 0.341, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.115, - "stddev": 0.10572658658592508 - }, - { - "benchmark": "fibble5_arena_daily", - "count": 9, - "evaluation_name": "fibble5_arena_avg_attempts", - "max": 0.7142857142857143, - "mean": 0.29873015873015873, - "median": 0.18428571428571427, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0714285714285714, - "stddev": 0.239007775089615 - }, - { - "benchmark": "Fibble2 Arena (2 lies) Word Set", - "count": 8, - "evaluation_name": "fibble2_arena_avg_attempts", - "max": 0.5714285714285714, - "mean": 0.28875, - "median": 0.3692857142857143, - "metric_id": "fibble2_arena.avg_attempts", 
- "metric_kind": null, - "metric_name": "Average Attempts", - "metric_unit": "guesses", - "min": 0.0, - "stddev": 0.21391319661285788 - }, - { - "benchmark": "apex-agents", - "count": 8, - "evaluation_name": "Investment Banking Pass@1", - "max": 0.273, - "mean": 0.17825000000000002, - "median": 0.202, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.012, - "stddev": 0.10551607866644239 - }, - { - "benchmark": "apex-agents", - "count": 8, - "evaluation_name": "Management Consulting Pass@1", - "max": 0.227, - "mean": 0.122875, - "median": 0.1235, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.029, - "stddev": 0.06801982379109364 - }, - { - "benchmark": "apex-agents", - "count": 8, - "evaluation_name": "Overall Pass@8", - "max": 0.4, - "mean": 0.29725, - "median": 0.3345, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.115, - "stddev": 0.10747724011555723 - }, - { - "benchmark": "apex-v1", - "count": 7, - "evaluation_name": "Overall Score", - "max": 0.67, - "mean": 0.6027142857142858, - "median": 0.64, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.359, - "stddev": 0.10972953763035471 - }, - { - "benchmark": "La Leaderboard composite dataset", - "count": 5, - "evaluation_name": "la_leaderboard", - "max": 0.3362, - "mean": 0.28874, - "median": 0.2761, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.2587, - "stddev": 0.03096309093097781 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 4, - "evaluation_name": "v3_Semi_Private", - "max": 0.7503406194310979, - "mean": 0.4348723241072839, - "median": 0.49457433849901883, - "metric_id": "cost", - "metric_kind": "cost", - "metric_name": "Cost", - "metric_unit": "usd", - "min": 0.0, - "stddev": 0.32065117443368096 - }, - { - "benchmark": 
"ARC Prize evaluations leaderboard JSON", - "count": 4, - "evaluation_name": "v3_Semi_Private", - "max": 0.0026, - "mean": 0.001775, - "median": 0.0022500000000000003, - "metric_id": "score", - "metric_kind": "accuracy", - "metric_name": "ARC score", - "metric_unit": "proportion", - "min": 0.0, - "stddev": 0.001195477589361953 - }, - { - "benchmark": "Anthropic RLHF dataset", - "count": 4, - "evaluation_name": "Anthropic RLHF dataset", - "max": 0.993, - "mean": 0.9538500000000001, - "median": 0.9898, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.8428000000000001, - "stddev": 0.07408920299206893 - }, - { - "benchmark": "Best ChatGPT Prompts", - "count": 4, - "evaluation_name": "Best ChatGPT Prompts", - "max": 0.999, - "mean": 0.9971, - "median": 0.9974000000000001, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.9945999999999999, - "stddev": 0.0018366636418608235 - }, - { - "benchmark": "Koala test dataset", - "count": 4, - "evaluation_name": "Koala test dataset", - "max": 0.9974000000000001, - "mean": 0.99515, - "median": 0.995, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.9932000000000001, - "stddev": 0.0019824227601598896 - }, - { - "benchmark": "Open Assistant", - "count": 4, - "evaluation_name": "Open Assistant", - "max": 0.9974000000000001, - "mean": 0.9957499999999999, - "median": 0.9961, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.9934, - "stddev": 0.0019070046320517552 - }, - { - "benchmark": "Self Instruct", - "count": 4, - "evaluation_name": "Self Instruct", - "max": 0.9984, - "mean": 0.99645, - "median": 0.9965999999999999, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.9942, - "stddev": 0.0020680103158994805 - }, - { - "benchmark": "Vicuna", - "count": 4, - "evaluation_name": 
"Vicuna", - "max": 0.999, - "mean": 0.99855, - "median": 0.9986999999999999, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.9978, - "stddev": 0.0005744562646537976 - }, - { - "benchmark": "helm_instruct", - "count": 4, - "evaluation_name": "Mean win rate", - "max": 0.689, - "mean": 0.5, - "median": 0.611, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.089, - "stddev": 0.27645614480419856 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 3, - "evaluation_name": "v2_Private_Eval", - "max": 0.9983152568248455, - "mean": 0.9251369250630029, - "median": 0.9974080874228392, - "metric_id": "cost_per_task", - "metric_kind": "cost", - "metric_name": "Cost per task", - "metric_unit": "usd", - "min": 0.7796874309413243, - "stddev": 0.1259637735393401 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 3, - "evaluation_name": "v2_Private_Eval", - "max": 1.0, - "mean": 0.345, - "median": 0.031, - "metric_id": "score", - "metric_kind": "accuracy", - "metric_name": "ARC score", - "metric_unit": "proportion", - "min": 0.004, - "stddev": 0.5674072611449381 - }, - { - "benchmark": "ace", - "count": 3, - "evaluation_name": "DIY Score", - "max": 0.56, - "mean": 0.55, - "median": 0.55, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.54, - "stddev": 0.010000000000000009 - }, - { - "benchmark": "ace", - "count": 3, - "evaluation_name": "Food Score", - "max": 0.7, - "mean": 0.65, - "median": 0.65, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.6, - "stddev": 0.04999999999999999 - }, - { - "benchmark": "ace", - "count": 3, - "evaluation_name": "Shopping Score", - "max": 0.45, - "mean": 0.45, - "median": 0.45, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.45, - "stddev": 0.0 
- }, - { - "benchmark": "apex-v1", - "count": 3, - "evaluation_name": "Big Law Score", - "max": 0.78, - "mean": 0.77, - "median": 0.77, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.76, - "stddev": 0.010000000000000009 - }, - { - "benchmark": "apex-v1", - "count": 3, - "evaluation_name": "Consulting Score", - "max": 0.64, - "mean": 0.64, - "median": 0.64, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.64, - "stddev": 0.0 - }, - { - "benchmark": "apex-v1", - "count": 3, - "evaluation_name": "Investment Banking Score", - "max": 0.64, - "mean": 0.6266666666666667, - "median": 0.63, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.61, - "stddev": 0.01527525231651948 - }, - { - "benchmark": "apex-v1", - "count": 3, - "evaluation_name": "Medicine (MD) Score", - "max": 0.66, - "mean": 0.6533333333333333, - "median": 0.65, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.65, - "stddev": 0.005773502691896263 - }, - { - "benchmark": "Fibble3 Arena (3 lies) Word Set", - "count": 2, - "evaluation_name": "fibble3_arena_avg_attempts", - "max": 0.7142857142857143, - "mean": 0.4285714285714286, - "median": 0.4285714285714286, - "metric_id": "fibble3_arena.avg_attempts", - "metric_kind": null, - "metric_name": "Average Attempts", - "metric_unit": "guesses", - "min": 0.1428571428571429, - "stddev": 0.40406101782088427 - }, - { - "benchmark": "Fibble4 Arena (4 lies) Word Set", - "count": 2, - "evaluation_name": "fibble4_arena_avg_attempts", - "max": 0.0, - "mean": 0.0, - "median": 0.0, - "metric_id": "fibble4_arena.avg_attempts", - "metric_kind": null, - "metric_name": "Average Attempts", - "metric_unit": "guesses", - "min": 0.0, - "stddev": 0.0 - }, - { - "benchmark": "fibble2_arena_daily", - "count": 1, - "evaluation_name": "fibble2_arena_avg_attempts", - "max": 
0.2142857142857143, - "mean": 0.2142857142857143, - "median": 0.2142857142857143, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.2142857142857143, - "stddev": 0.0 - }, - { - "benchmark": "fibble3_arena_daily", - "count": 1, - "evaluation_name": "fibble3_arena_avg_attempts", - "max": 0.5, - "mean": 0.5, - "median": 0.5, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.5, - "stddev": 0.0 - } - ], - "quality": { - "has_uncertainty": 1603, - "incompatible_score_type": 0, - "missing_bounds": 0, - "missing_metadata": 0, - "missing_score": 0, - "out_of_range": 100, - "total_result_rows": 40495, - "zero_width_bounds": 0 - }, - "schema_versions": [ - { - "count": 40495, - "value": "0.2.2" - } - ], - "score_summaries": [ - { - "benchmark": "GPQA", - "count": 4635, - "evaluation_name": "GPQA", - "max": 0.791, - "mean": 0.30281846817691477, - "median": 0.2953, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.168, - "stddev": 0.04912650528590854 - }, - { - "benchmark": "IFEval", - "count": 4635, - "evaluation_name": "IFEval", - "max": 0.951, - "mean": 0.46067240560949296, - "median": 0.4545, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0, - "stddev": 0.20767533842318336 - }, - { - "benchmark": "BBH", - "count": 4574, - "evaluation_name": "BBH", - "max": 0.8269, - "mean": 0.4867208351552252, - "median": 0.5038, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.2178, - "stddev": 0.11398463853942328 - }, - { - "benchmark": "MATH Level 5", - "count": 4574, - "evaluation_name": "MATH Level 5", - "max": 0.7145, - "mean": 0.1555723874070835, - "median": 0.108, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0, - "stddev": 0.14625658002062183 - }, - { - "benchmark": 
"MMLU-PRO", - "count": 4574, - "evaluation_name": "MMLU-PRO", - "max": 0.7303, - "mean": 0.32874433756012245, - "median": 0.34475, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.1026, - "stddev": 0.12833971558059434 - }, - { - "benchmark": "MUSR", - "count": 4574, - "evaluation_name": "MUSR", - "max": 0.6024, - "mean": 0.40635732400524704, - "median": 0.4091, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.2929, - "stddev": 0.04536121071938266 - }, - { - "benchmark": "RewardBench 2", - "count": 197, - "evaluation_name": "Factuality", - "max": 0.8716, - "mean": 0.6400781725888325, - "median": 0.6779, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0274, - "stddev": 0.14060436598989037 - }, - { - "benchmark": "RewardBench 2", - "count": 197, - "evaluation_name": "Focus", - "max": 0.9838, - "mean": 0.6965137055837564, - "median": 0.7293, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0646, - "stddev": 0.1999740938960993 - }, - { - "benchmark": "RewardBench 2", - "count": 197, - "evaluation_name": "Math", - "max": 0.898, - "mean": 0.6002578680203046, - "median": 0.6175, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0546, - "stddev": 0.11530869084864068 - }, - { - "benchmark": "RewardBench 2", - "count": 197, - "evaluation_name": "Precise IF", - "max": 0.6625, - "mean": 0.3724553299492386, - "median": 0.375, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.1313, - "stddev": 0.06683254610514013 - }, - { - "benchmark": "RewardBench 2", - "count": 197, - "evaluation_name": "Safety", - "max": 0.9756, - "mean": 0.770956345177665, - "median": 0.8044, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0378, - 
"stddev": 0.16859961817216138 - }, - { - "benchmark": "RewardBench 2", - "count": 197, - "evaluation_name": "Score", - "max": 0.8413, - "mean": 0.602605076142132, - "median": 0.6194, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0576, - "stddev": 0.13540270878209892 - }, - { - "benchmark": "RewardBench 2", - "count": 197, - "evaluation_name": "Ties", - "max": 0.9063, - "mean": 0.5353568527918782, - "median": 0.5529, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": -0.01, - "stddev": 0.21529016446306679 - }, - { - "benchmark": "RewardBench", - "count": 187, - "evaluation_name": "Chat Hard", - "max": 0.9145, - "mean": 0.6117941176470588, - "median": 0.6053, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.2654, - "stddev": 0.1713479724227396 - }, - { - "benchmark": "RewardBench", - "count": 187, - "evaluation_name": "Chat", - "max": 0.9944, - "mean": 0.8923390374331551, - "median": 0.9413, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.3547, - "stddev": 0.12437365150350695 - }, - { - "benchmark": "RewardBench", - "count": 187, - "evaluation_name": "Safety", - "max": 0.9514, - "mean": 0.75624064171123, - "median": 0.7946, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.3743, - "stddev": 0.14897429003710377 - }, - { - "benchmark": "RewardBench", - "count": 187, - "evaluation_name": "Score", - "max": 0.9511, - "mean": 0.7524326203208556, - "median": 0.7455, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.4727, - "stddev": 0.12766260032441618 - }, - { - "benchmark": "RewardBench", - "count": 172, - "evaluation_name": "Reasoning", - "max": 0.9912, - "mean": 0.779306976744186, - "median": 0.80125, - "metric_id": null, - "metric_kind": null, - "metric_name": 
null, - "metric_unit": null, - "min": 0.2821, - "stddev": 0.16510278548710738 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 133, - "evaluation_name": "v2_Semi_Private", - "max": 77.16309638, - "mean": 2.503257867518797, - "median": 0.2349, - "metric_id": "cost_per_task", - "metric_kind": "cost", - "metric_name": "Cost per task", - "metric_unit": "usd", - "min": 0.0025, - "stddev": 8.619653960249606 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 133, - "evaluation_name": "v2_Semi_Private", - "max": 1.0, - "mean": 0.1482124060150376, - "median": 0.0333, - "metric_id": "score", - "metric_kind": "accuracy", - "metric_name": "ARC score", - "metric_unit": "proportion", - "min": 0.0, - "stddev": 0.23541775910763008 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 131, - "evaluation_name": "v1_Semi_Private", - "max": 44.25900135, - "mean": 1.3386839797709924, - "median": 0.1776, - "metric_id": "cost_per_task", - "metric_kind": "cost", - "metric_name": "Cost per task", - "metric_unit": "usd", - "min": 0.0015, - "stddev": 4.543415900755722 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 131, - "evaluation_name": "v1_Semi_Private", - "max": 0.98, - "mean": 0.44456030534351143, - "median": 0.4, - "metric_id": "score", - "metric_kind": "accuracy", - "metric_name": "ARC score", - "metric_unit": "proportion", - "min": 0.0, - "stddev": 0.2907857931349756 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 125, - "evaluation_name": "v2_Public_Eval", - "max": 17.6, - "mean": 1.1880232, - "median": 0.2399, - "metric_id": "cost_per_task", - "metric_kind": "cost", - "metric_name": "Cost per task", - "metric_unit": "usd", - "min": 0.0026, - "stddev": 2.946737805009728 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 125, - "evaluation_name": "v2_Public_Eval", - "max": 1.0, - "mean": 0.1310936, - "median": 0.029, - 
"metric_id": "score", - "metric_kind": "accuracy", - "metric_name": "ARC score", - "metric_unit": "proportion", - "min": 0.0, - "stddev": 0.23801453457380936 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 114, - "evaluation_name": "v1_Public_Eval", - "max": 7.7201, - "mean": 0.49700745614035086, - "median": 0.13765, - "metric_id": "cost_per_task", - "metric_kind": "cost", - "metric_name": "Cost per task", - "metric_unit": "usd", - "min": 0.0012, - "stddev": 1.1338969347904155 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 114, - "evaluation_name": "v1_Public_Eval", - "max": 0.9825, - "mean": 0.5073622807017544, - "median": 0.5056499999999999, - "metric_id": "score", - "metric_kind": "accuracy", - "metric_name": "ARC score", - "metric_unit": "proportion", - "min": 0.0175, - "stddev": 0.2800617230927051 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.live.live_accuracy", - "max": 93.12, - "mean": 67.21155963302752, - "median": 70.76, - "metric_id": "bfcl.live.live_accuracy", - "metric_kind": "accuracy", - "metric_name": "Live accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 16.692855101327364 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.live.live_multiple_ast_accuracy", - "max": 94.02, - "mean": 66.15788990825688, - "median": 71.04, - "metric_id": "bfcl.live.live_multiple_ast_accuracy", - "metric_kind": "accuracy", - "metric_name": "Live multiple AST accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 17.084967242914786 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.live.live_parallel_ast_accuracy", - "max": 93.75, - "mean": 64.27752293577981, - "median": 75.0, - "metric_id": "bfcl.live.live_parallel_ast_accuracy", - "metric_kind": "accuracy", - "metric_name": "Live parallel AST accuracy", - "metric_unit": "percentage", - "min": 0.0, - 
"stddev": 24.46019866655501 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.live.live_parallel_multiple_ast_accuracy", - "max": 95.83, - "mean": 57.03339449541284, - "median": 62.5, - "metric_id": "bfcl.live.live_parallel_multiple_ast_accuracy", - "metric_kind": "accuracy", - "metric_name": "Live parallel multiple AST accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 20.59801726435246 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.live.live_simple_ast_accuracy", - "max": 90.31, - "mean": 72.64082568807339, - "median": 76.36, - "metric_id": "bfcl.live.live_simple_ast_accuracy", - "metric_kind": "accuracy", - "metric_name": "Live simple AST accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 16.25125032958663 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.memory.accuracy", - "max": 73.76, - "mean": 20.235045871559635, - "median": 15.7, - "metric_id": "bfcl.memory.accuracy", - "metric_kind": "accuracy", - "metric_name": "Memory accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 16.99218603771948 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.memory.kv_accuracy", - "max": 70.97, - "mean": 13.904036697247706, - "median": 8.39, - "metric_id": "bfcl.memory.kv_accuracy", - "metric_kind": "accuracy", - "metric_name": "Memory KV accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 15.15138492137527 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.memory.recursive_summarization_accuracy", - "max": 83.23, - "mean": 28.204036697247705, - "median": 27.1, - "metric_id": "bfcl.memory.recursive_summarization_accuracy", - "metric_kind": "accuracy", - "metric_name": "Memory recursive summarization accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 20.8463795648454 - }, - { - "benchmark": 
"BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.memory.vector_accuracy", - "max": 72.9, - "mean": 18.597155963302754, - "median": 11.61, - "metric_id": "bfcl.memory.vector_accuracy", - "metric_kind": "accuracy", - "metric_name": "Memory vector accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 18.379301567138523 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.multi_turn.accuracy", - "max": 77.38, - "mean": 23.962385321100918, - "median": 16.5, - "metric_id": "bfcl.multi_turn.accuracy", - "metric_kind": "accuracy", - "metric_name": "Multi-turn accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 21.479676048452156 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.multi_turn.base_accuracy", - "max": 82.5, - "mean": 29.009174311926607, - "median": 20.0, - "metric_id": "bfcl.multi_turn.base_accuracy", - "metric_kind": "accuracy", - "metric_name": "Multi-turn base accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 24.897845144318115 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.multi_turn.long_context_accuracy", - "max": 76.0, - "mean": 24.009174311926607, - "median": 17.5, - "metric_id": "bfcl.multi_turn.long_context_accuracy", - "metric_kind": "accuracy", - "metric_name": "Multi-turn long-context accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 21.38372755020874 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.multi_turn.miss_function_accuracy", - "max": 77.0, - "mean": 21.591743119266056, - "median": 14.0, - "metric_id": "bfcl.multi_turn.miss_function_accuracy", - "metric_kind": "accuracy", - "metric_name": "Multi-turn missing function accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 21.713961750366153 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": 
"bfcl.multi_turn.miss_parameter_accuracy", - "max": 74.0, - "mean": 21.238532110091743, - "median": 15.0, - "metric_id": "bfcl.multi_turn.miss_parameter_accuracy", - "metric_kind": "accuracy", - "metric_name": "Multi-turn missing parameter accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 19.445269386898502 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.non_live.ast_accuracy", - "max": 90.65, - "mean": 76.61733944954129, - "median": 83.0, - "metric_id": "bfcl.non_live.ast_accuracy", - "metric_kind": "accuracy", - "metric_name": "Non-live AST accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 18.657086363085554 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.non_live.multiple_ast_accuracy", - "max": 97.0, - "mean": 85.35779816513761, - "median": 92.0, - "metric_id": "bfcl.non_live.multiple_ast_accuracy", - "metric_kind": "accuracy", - "metric_name": "Non-live multiple AST accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 18.274031836228097 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.non_live.parallel_ast_accuracy", - "max": 96.0, - "mean": 79.79816513761467, - "median": 88.0, - "metric_id": "bfcl.non_live.parallel_ast_accuracy", - "metric_kind": "accuracy", - "metric_name": "Non-live parallel AST accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 22.733369915461672 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.non_live.parallel_multiple_ast_accuracy", - "max": 92.5, - "mean": 73.4770642201835, - "median": 82.5, - "metric_id": "bfcl.non_live.parallel_multiple_ast_accuracy", - "metric_kind": "accuracy", - "metric_name": "Non-live parallel multiple AST accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 24.427840192832814 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": 
"bfcl.non_live.simple_ast_accuracy", - "max": 80.67, - "mean": 67.83633027522936, - "median": 72.58, - "metric_id": "bfcl.non_live.simple_ast_accuracy", - "metric_kind": "accuracy", - "metric_name": "Non-live simple AST accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 14.843039998882533 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.overall.latency_mean_s", - "max": 169.87, - "mean": 15.127064220183486, - "median": 4.69, - "metric_id": "bfcl.overall.latency_mean_s", - "metric_kind": "latency", - "metric_name": "Latency mean", - "metric_unit": "seconds", - "min": 0.68, - "stddev": 28.519051991371985 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.overall.latency_p95_s", - "max": 568.59, - "mean": 53.85339449541284, - "median": 11.7, - "metric_id": "bfcl.overall.latency_p95_s", - "metric_kind": "latency", - "metric_name": "Latency 95th percentile", - "metric_unit": "seconds", - "min": 0.96, - "stddev": 100.92943454619746 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.overall.latency_std_s", - "max": 212.99, - "mean": 27.425045871559632, - "median": 10.04, - "metric_id": "bfcl.overall.latency_std_s", - "metric_kind": "latency", - "metric_name": "Latency standard deviation", - "metric_unit": "seconds", - "min": 0.45, - "stddev": 39.86152829724822 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.overall.overall_accuracy", - "max": 77.47, - "mean": 38.09394495412844, - "median": 35.52, - "metric_id": "bfcl.overall.overall_accuracy", - "metric_kind": "accuracy", - "metric_name": "Overall accuracy", - "metric_unit": "percentage", - "min": 7.17, - "stddev": 15.683598888904708 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.overall.rank", - "max": 109.0, - "mean": 55.0, - "median": 55.0, - "metric_id": "bfcl.overall.rank", - "metric_kind": 
"rank", - "metric_name": "Overall rank", - "metric_unit": "position", - "min": 1.0, - "stddev": 31.609597698589376 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.overall.total_cost_usd", - "max": 355.17, - "mean": 47.11669724770642, - "median": 18.25, - "metric_id": "bfcl.overall.total_cost_usd", - "metric_kind": "cost", - "metric_name": "Total cost", - "metric_unit": "usd", - "min": 0.46, - "stddev": 72.06972033379084 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.relevance.irrelevance_detection_accuracy", - "max": 100.0, - "mean": 75.61073394495413, - "median": 80.79, - "metric_id": "bfcl.relevance.irrelevance_detection_accuracy", - "metric_kind": "accuracy", - "metric_name": "Irrelevance detection accuracy", - "metric_unit": "percentage", - "min": 6.28, - "stddev": 16.896574532662488 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.relevance.relevance_detection_accuracy", - "max": 100.0, - "mean": 76.37614678899082, - "median": 81.25, - "metric_id": "bfcl.relevance.relevance_detection_accuracy", - "metric_kind": "accuracy", - "metric_name": "Relevance detection accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 19.86204224273847 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.web_search.accuracy", - "max": 84.5, - "mean": 24.573394495412845, - "median": 10.5, - "metric_id": "bfcl.web_search.accuracy", - "metric_kind": "accuracy", - "metric_name": "Web-search accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 28.751797503234584 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.web_search.base_accuracy", - "max": 87.0, - "mean": 26.46788990825688, - "median": 13.0, - "metric_id": "bfcl.web_search.base_accuracy", - "metric_kind": "accuracy", - "metric_name": "Web-search base accuracy", - "metric_unit": "percentage", - 
"min": 0.0, - "stddev": 29.552705211555523 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 109, - "evaluation_name": "bfcl.web_search.no_snippet_accuracy", - "max": 85.0, - "mean": 22.678899082568808, - "median": 9.0, - "metric_id": "bfcl.web_search.no_snippet_accuracy", - "metric_kind": "accuracy", - "metric_name": "Web-search no-snippet accuracy", - "metric_unit": "percentage", - "min": 0.0, - "stddev": 28.410639873751833 - }, - { - "benchmark": "RewardBench", - "count": 105, - "evaluation_name": "Prior Sets (0.5 weight)", - "max": 0.782, - "mean": 0.5625428571428571, - "median": 0.5757, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0, - "stddev": 0.17788750218625798 - }, - { - "benchmark": "GSM8K", - "count": 91, - "evaluation_name": "GSM8K", - "max": 0.956, - "mean": 0.6556373626373626, - "median": 0.762, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": -1.0, - "stddev": 0.30260192099278316 - }, - { - "benchmark": "LegalBench", - "count": 91, - "evaluation_name": "LegalBench", - "max": 0.757, - "mean": 0.5902087912087912, - "median": 0.629, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.331, - "stddev": 0.11619442676283923 - }, - { - "benchmark": "MATH", - "count": 91, - "evaluation_name": "MATH", - "max": 0.92, - "mean": 0.5574065934065934, - "median": 0.656, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.026, - "stddev": 0.2685588691111619 - }, - { - "benchmark": "MMLU", - "count": 91, - "evaluation_name": "MMLU", - "max": 0.809, - "mean": 0.6220989010989011, - "median": 0.643, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.243, - "stddev": 0.12023218786489331 - }, - { - "benchmark": "MedQA", - "count": 91, - "evaluation_name": "MedQA", - "max": 0.863, - "mean": 0.6103296703296703, - 
"median": 0.64, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.229, - "stddev": 0.15792234765120447 - }, - { - "benchmark": "NarrativeQA", - "count": 91, - "evaluation_name": "NarrativeQA", - "max": 0.804, - "mean": 0.6938461538461539, - "median": 0.742, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.111, - "stddev": 0.1228501275789075 - }, - { - "benchmark": "NaturalQuestions (closed-book)", - "count": 91, - "evaluation_name": "NaturalQuestions (closed-book)", - "max": 0.502, - "mean": 0.3627912087912088, - "median": 0.378, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.028, - "stddev": 0.08850543190907255 - }, - { - "benchmark": "OpenbookQA", - "count": 91, - "evaluation_name": "OpenbookQA", - "max": 0.972, - "mean": 0.8312527472527472, - "median": 0.882, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.222, - "stddev": 0.16911788087383792 - }, - { - "benchmark": "WMT 2014", - "count": 91, - "evaluation_name": "WMT 2014", - "max": 0.262, - "mean": 0.18178021978021977, - "median": 0.191, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.023, - "stddev": 0.04641450975187302 - }, - { - "benchmark": "helm_lite", - "count": 91, - "evaluation_name": "Mean win rate", - "max": 0.938, - "mean": 0.499967032967033, - "median": 0.488, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.041, - "stddev": 0.24004497034928224 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Abstract Algebra", - "max": 0.84, - "mean": 0.4692405063291139, - "median": 0.44, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.21, - "stddev": 0.1566784405169303 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - 
"evaluation_name": "Anatomy", - "max": 0.911, - "mean": 0.7049620253164557, - "median": 0.719, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.222, - "stddev": 0.12203524533321435 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Astronomy", - "max": 0.974, - "mean": 0.8196835443037974, - "median": 0.855, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.342, - "stddev": 0.12503810130124515 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Business Ethics", - "max": 0.89, - "mean": 0.7354430379746836, - "median": 0.77, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.24, - "stddev": 0.1177001565076888 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Clinical Knowledge", - "max": 0.928, - "mean": 0.7806329113924051, - "median": 0.8, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.26, - "stddev": 0.10518545005348215 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "College Physics", - "max": 0.863, - "mean": 0.5205189873417722, - "median": 0.51, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.196, - "stddev": 0.13341576241396605 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Computer Security", - "max": 0.89, - "mean": 0.7888607594936708, - "median": 0.8, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.3, - "stddev": 0.07740978772295665 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Conceptual Physics", - "max": 0.949, - "mean": 0.7394050632911392, - "median": 0.774, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.319, - "stddev": 0.1436847973853721 - }, - { 
- "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Econometrics", - "max": 0.807, - "mean": 0.5924556962025317, - "median": 0.614, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.307, - "stddev": 0.12405156056525753 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Electrical Engineering", - "max": 0.869, - "mean": 0.7012531645569621, - "median": 0.724, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.29, - "stddev": 0.10967007262512768 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Elementary Mathematics", - "max": 0.942, - "mean": 0.6168481012658228, - "median": 0.622, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.254, - "stddev": 0.17076712953141734 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Formal Logic", - "max": 0.786, - "mean": 0.5559240506329114, - "median": 0.571, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.27, - "stddev": 0.11667484646986527 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Global Facts", - "max": 0.8, - "mean": 0.49860759493670886, - "median": 0.5, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.25, - "stddev": 0.11856767165669667 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "High School World History", - "max": 0.958, - "mean": 0.8590253164556962, - "median": 0.89, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.253, - "stddev": 0.1104488482004626 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Human Sexuality", - "max": 0.939, - "mean": 0.7969367088607595, - "median": 0.84, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - 
"metric_unit": null, - "min": 0.267, - "stddev": 0.14067149783040647 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "International Law", - "max": 0.959, - "mean": 0.8525189873417721, - "median": 0.884, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.306, - "stddev": 0.09770414010589916 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Jurisprudence", - "max": 0.907, - "mean": 0.8231518987341773, - "median": 0.852, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.25, - "stddev": 0.09722219971870344 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Logical Fallacies", - "max": 0.926, - "mean": 0.8139873417721519, - "median": 0.834, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.264, - "stddev": 0.0972786763034739 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "MMLU All Subjects", - "max": 0.873, - "mean": 0.7308227848101266, - "median": 0.757, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.295, - "stddev": 0.10005918242229046 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Machine Learning", - "max": 0.839, - "mean": 0.592126582278481, - "median": 0.616, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.286, - "stddev": 0.12807703682255595 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Management", - "max": 0.942, - "mean": 0.8453037974683544, - "median": 0.864, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.272, - "stddev": 0.09395052631917909 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Marketing", - "max": 0.962, - "mean": 0.9024556962025316, - "median": 0.923, - 
"metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.269, - "stddev": 0.08556236254220637 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Mean win rate", - "max": 1.0, - "mean": 0.5000506329113924, - "median": 0.517, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.014, - "stddev": 0.2741845671999428 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Medical Genetics", - "max": 0.98, - "mean": 0.8162025316455697, - "median": 0.84, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.28, - "stddev": 0.11717074761250226 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Miscellaneous", - "max": 0.964, - "mean": 0.8688607594936709, - "median": 0.893, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.292, - "stddev": 0.09859535722376811 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Moral Scenarios", - "max": 0.902, - "mean": 0.5793924050632911, - "median": 0.575, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.231, - "stddev": 0.19478445797799818 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Nutrition", - "max": 0.928, - "mean": 0.7968987341772152, - "median": 0.82, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.34, - "stddev": 0.1008295839442827 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Philosophy", - "max": 0.9, - "mean": 0.7844303797468355, - "median": 0.807, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.325, - "stddev": 0.09312807331625374 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Prehistory", - "max": 0.951, - "mean": 
0.824746835443038, - "median": 0.858, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.318, - "stddev": 0.10757030716441658 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Professional Psychology", - "max": 0.922, - "mean": 0.7793291139240506, - "median": 0.812, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.232, - "stddev": 0.1177310844427953 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Public Relations", - "max": 0.855, - "mean": 0.724873417721519, - "median": 0.736, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.345, - "stddev": 0.0757594653625247 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Security Studies", - "max": 0.886, - "mean": 0.778126582278481, - "median": 0.804, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.408, - "stddev": 0.09570378540441088 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Sociology", - "max": 0.96, - "mean": 0.8729493670886076, - "median": 0.9, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.383, - "stddev": 0.08587676004752948 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Us Foreign Policy", - "max": 0.97, - "mean": 0.8918987341772152, - "median": 0.92, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.26, - "stddev": 0.09360413026947771 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - "evaluation_name": "Virology", - "max": 0.602, - "mean": 0.5457215189873418, - "median": 0.56, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.392, - "stddev": 0.047070851318166546 - }, - { - "benchmark": "helm_mmlu", - "count": 79, - 
"evaluation_name": "World Religions", - "max": 0.924, - "mean": 0.8426455696202532, - "median": 0.865, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.234, - "stddev": 0.08472202480187987 - }, - { - "benchmark": "MMLU-Pro", - "count": 61, - "evaluation_name": "MMLU-Pro", - "max": 0.875, - "mean": 0.6609344262295082, - "median": 0.723, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.169, - "stddev": 0.1866150109050233 - }, - { - "benchmark": "Omni-MATH", - "count": 61, - "evaluation_name": "Omni-MATH", - "max": 0.722, - "mean": 0.3746065573770492, - "median": 0.364, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.072, - "stddev": 0.17904862269679006 - }, - { - "benchmark": "WildBench", - "count": 61, - "evaluation_name": "WildBench", - "max": 0.866, - "mean": 0.7791803278688525, - "median": 0.797, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.477, - "stddev": 0.07613989497338025 - }, - { - "benchmark": "helm_capabilities", - "count": 61, - "evaluation_name": "Mean score", - "max": 0.819, - "mean": 0.6281803278688525, - "median": 0.642, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.325, - "stddev": 0.12667261058817744 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Arabic", - "max": 0.9475, - "mean": 0.8123458333333333, - "median": 0.82375, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.455, - "stddev": 0.11404825771861875 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Bengali", - "max": 0.9425, - "mean": 0.8118458333333334, - "median": 0.82375, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.5175, - "stddev": 0.10786060736231451 - 
}, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Burmese", - "max": 0.945, - "mean": 0.8254416666666666, - "median": 0.8375, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.63, - "stddev": 0.08983182356393916 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Chinese", - "max": 0.9475, - "mean": 0.80325, - "median": 0.835, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.5075, - "stddev": 0.12931314787277418 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Culturally Agnostic", - "max": 0.9528, - "mean": 0.8264125, - "median": 0.857, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.5631, - "stddev": 0.10811543599320127 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Culturally Sensitive", - "max": 0.9397, - "mean": 0.788525, - "median": 0.78935, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.5391, - "stddev": 0.1149148963548909 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "English", - "max": 0.9475, - "mean": 0.7939833333333334, - "median": 0.84, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.38, - "stddev": 0.15081692344416497 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "French", - "max": 0.9575, - "mean": 0.7944791666666666, - "median": 0.8275, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.41, - "stddev": 0.14230966528431346 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "German", - "max": 0.94, - "mean": 0.8004833333333333, - "median": 0.8275, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - 
"metric_unit": null, - "min": 0.4775, - "stddev": 0.12445258061886479 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Global MMLU Lite", - "max": 0.9453, - "mean": 0.8074583333333334, - "median": 0.82315, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.5511, - "stddev": 0.11081356363967734 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Hindi", - "max": 0.9475, - "mean": 0.7983333333333333, - "median": 0.82355, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.555, - "stddev": 0.11719085240122123 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Indonesian", - "max": 0.955, - "mean": 0.801275, - "median": 0.80625, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.515, - "stddev": 0.11649187077838011 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Italian", - "max": 0.955, - "mean": 0.8056875, - "median": 0.8300000000000001, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.48, - "stddev": 0.1239779332201175 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Japanese", - "max": 0.94, - "mean": 0.8170291666666667, - "median": 0.84375, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.58, - "stddev": 0.10297801657229139 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Korean", - "max": 0.95, - "mean": 0.820125, - "median": 0.84, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.595, - "stddev": 0.10111529652574511 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Portuguese", - "max": 0.945, - "mean": 0.8010041666666666, - "median": 0.8323, - 
"metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.5175, - "stddev": 0.12492813757011505 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Spanish", - "max": 0.9475, - "mean": 0.8042458333333333, - "median": 0.8325, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.485, - "stddev": 0.12684843352857172 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Swahili", - "max": 0.94, - "mean": 0.8143708333333334, - "median": 0.8200000000000001, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.6075, - "stddev": 0.09313423156204427 - }, - { - "benchmark": "global-mmlu-lite", - "count": 48, - "evaluation_name": "Yoruba", - "max": 0.9425, - "mean": 0.8155583333333334, - "median": 0.8223, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.5825, - "stddev": 0.09530013023440752 - }, - { - "benchmark": "Wordle Arena Word Set", - "count": 43, - "evaluation_name": "wordle_arena_win_rate", - "max": 1.0, - "mean": 0.38320930232558137, - "median": 0.3, - "metric_id": "wordle_arena.win_rate", - "metric_kind": null, - "metric_name": "Win Rate", - "metric_unit": "proportion", - "min": 0.0, - "stddev": 0.3652171551113076 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 40, - "evaluation_name": "bfcl.format_sensitivity.max_delta", - "max": 81.5, - "mean": 34.5125, - "median": 22.5, - "metric_id": "bfcl.format_sensitivity.max_delta", - "metric_kind": "difference", - "metric_name": "Format sensitivity max delta", - "metric_unit": "percentage_points", - "min": 0.0, - "stddev": 26.711537769286142 - }, - { - "benchmark": "BFCL leaderboard CSV", - "count": 40, - "evaluation_name": "bfcl.format_sensitivity.stddev", - "max": 34.18, - "mean": 10.88475, - "median": 5.699999999999999, - "metric_id": "bfcl.format_sensitivity.stddev", - 
"metric_kind": "difference", - "metric_name": "Format sensitivity standard deviation", - "metric_unit": "percentage_points", - "min": 0.0, - "stddev": 9.736365348132109 - }, - { - "benchmark": "Fibble Arena (1 lie) Word Set", - "count": 40, - "evaluation_name": "fibble1_arena_win_rate", - "max": 0.881, - "mean": 0.1804375, - "median": 0.08990000000000001, - "metric_id": "fibble1_arena.win_rate", - "metric_kind": null, - "metric_name": "Win Rate", - "metric_unit": "proportion", - "min": 0.0, - "stddev": 0.24891262362266392 - }, - { - "benchmark": "Fibble2 Arena (2 lies) Word Set", - "count": 38, - "evaluation_name": "fibble2_arena_win_rate", - "max": 0.3, - "mean": 0.03426315789473684, - "median": 0.0, - "metric_id": "fibble2_arena.win_rate", - "metric_kind": null, - "metric_name": "Win Rate", - "metric_unit": "proportion", - "min": 0.0, - "stddev": 0.07868617592468241 - }, - { - "benchmark": "SciArena leaderboard API", - "count": 38, - "evaluation_name": "overall_cost_per_100_calls_usd", - "max": 28.648, - "mean": 2.934265789473684, - "median": 0.7163999999999999, - "metric_id": "cost_per_100_calls_usd", - "metric_kind": "cost", - "metric_name": "Cost per 100 calls", - "metric_unit": "usd", - "min": 0.0546, - "stddev": 6.353947282543234 - }, - { - "benchmark": "SciArena leaderboard API", - "count": 38, - "evaluation_name": "overall_elo", - "max": 1151.3779287263492, - "mean": 999.7829236774063, - "median": 1008.3158430770602, - "metric_id": "elo", - "metric_kind": "elo", - "metric_name": "Elo rating", - "metric_unit": "points", - "min": 829.7737302958208, - "stddev": 75.00592284131643 - }, - { - "benchmark": "SciArena leaderboard API", - "count": 38, - "evaluation_name": "overall_rank", - "max": 38.0, - "mean": 19.5, - "median": 19.5, - "metric_id": "rank", - "metric_kind": "rank", - "metric_name": "Rank", - "metric_unit": "position", - "min": 1.0, - "stddev": 11.113055385446435 - }, - { - "benchmark": "Fibble3 Arena (3 lies) Word Set", - "count": 37, - 
"evaluation_name": "fibble3_arena_win_rate", - "max": 0.333, - "mean": 0.010551351351351351, - "median": 0.0, - "metric_id": "fibble3_arena.win_rate", - "metric_kind": null, - "metric_name": "Win Rate", - "metric_unit": "proportion", - "min": 0.0, - "stddev": 0.0548872866357432 - }, - { - "benchmark": "Fibble5 Arena (5 lies) Word Set", - "count": 37, - "evaluation_name": "fibble5_arena_win_rate", - "max": 0.6364, - "mean": 0.09143783783783783, - "median": 0.0, - "metric_id": "fibble5_arena.win_rate", - "metric_kind": null, - "metric_name": "Win Rate", - "metric_unit": "proportion", - "min": 0.0, - "stddev": 0.17735344709138076 - }, - { - "benchmark": "Fibble4 Arena (4 lies) Word Set", - "count": 36, - "evaluation_name": "fibble4_arena_win_rate", - "max": 0.0732, - "mean": 0.0028055555555555555, - "median": 0.0, - "metric_id": "fibble4_arena.win_rate", - "metric_kind": null, - "metric_name": "Win Rate", - "metric_unit": "proportion", - "min": 0.0, - "stddev": 0.012925853261571653 - }, - { - "benchmark": "Wordle Arena Word Set", - "count": 32, - "evaluation_name": "wordle_arena_avg_attempts", - "max": 6.0, - "mean": 4.51125, - "median": 4.46, - "metric_id": "wordle_arena.avg_attempts", - "metric_kind": null, - "metric_name": "Average Attempts", - "metric_unit": "guesses", - "min": 3.33, - "stddev": 0.7843417211164219 - }, - { - "benchmark": "wordle_arena_daily", - "count": 32, - "evaluation_name": "wordle_arena_avg_attempts", - "max": 6.0, - "mean": 5.194375, - "median": 5.365, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 3.67, - "stddev": 0.8160919362345744 - }, - { - "benchmark": "wordle_arena_daily", - "count": 32, - "evaluation_name": "wordle_arena_win_rate", - "max": 100.0, - "mean": 47.33125, - "median": 41.65, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0, - "stddev": 43.73033521186335 - }, - { - "benchmark": "Easy Problems", - "count": 29, - 
"evaluation_name": "Easy Problems", - "max": 0.9014, - "mean": 0.4996824672170957, - "median": 0.5352, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.056338028169014086, - "stddev": 0.2844141675332875 - }, - { - "benchmark": "Hard Problems", - "count": 29, - "evaluation_name": "Hard Problems", - "max": 0.1594, - "mean": 0.009876639145216123, - "median": 0.0, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0, - "stddev": 0.03091194562296296 - }, - { - "benchmark": "Medium Problems", - "count": 29, - "evaluation_name": "Medium Problems", - "max": 0.5211, - "mean": 0.11304244779018942, - "median": 0.056338028169014086, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0, - "stddev": 0.14517143188248943 - }, - { - "benchmark": "fibble4_arena_daily", - "count": 28, - "evaluation_name": "fibble4_arena_avg_attempts", - "max": 12.0, - "mean": 11.7, - "median": 12.0, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 9.0, - "stddev": 0.7990735376162312 - }, - { - "benchmark": "fibble4_arena_daily", - "count": 28, - "evaluation_name": "fibble4_arena_avg_latency_ms", - "max": 445377.0, - "mean": 80890.78571428571, - "median": 25152.5, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 275.0, - "stddev": 129476.7960242032 - }, - { - "benchmark": "fibble4_arena_daily", - "count": 28, - "evaluation_name": "fibble4_arena_win_rate", - "max": 66.7, - "mean": 6.310714285714286, - "median": 0.0, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0, - "stddev": 17.409859685985822 - }, - { - "benchmark": "fibble5_arena_daily", - "count": 28, - "evaluation_name": "fibble5_arena_avg_attempts", - "max": 9.0, - "mean": 7.9835714285714285, - "median": 9.0, - "metric_id": null, - 
"metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 3.0, - "stddev": 1.7199229403188157 - }, - { - "benchmark": "fibble5_arena_daily", - "count": 28, - "evaluation_name": "fibble5_arena_avg_latency_ms", - "max": 185756.0, - "mean": 33582.0, - "median": 3793.0, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 326.0, - "stddev": 47155.59573644743 - }, - { - "benchmark": "fibble5_arena_daily", - "count": 28, - "evaluation_name": "fibble5_arena_win_rate", - "max": 100.0, - "mean": 27.28214285714286, - "median": 0.0, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0, - "stddev": 40.59178947787191 - }, - { - "benchmark": "fibble_arena_daily", - "count": 28, - "evaluation_name": "fibble_arena_avg_attempts", - "max": 8.0, - "mean": 7.345357142857143, - "median": 7.8100000000000005, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 4.6, - "stddev": 0.9735330165903643 - }, - { - "benchmark": "fibble_arena_daily", - "count": 28, - "evaluation_name": "fibble_arena_win_rate", - "max": 100.0, - "mean": 28.29642857142857, - "median": 14.3, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0, - "stddev": 34.60197470433087 - }, - { - "benchmark": "wordle_arena_daily", - "count": 28, - "evaluation_name": "wordle_arena_avg_latency_ms", - "max": 50298.0, - "mean": 14709.32142857143, - "median": 2629.5, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 294.0, - "stddev": 17752.34724606042 - }, - { - "benchmark": "fibble_arena_daily", - "count": 26, - "evaluation_name": "fibble_arena_avg_latency_ms", - "max": 117888.0, - "mean": 23553.576923076922, - "median": 2003.0, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 175.0, - "stddev": 33003.246209029894 - }, - { - 
"benchmark": "fibble3_arena_daily", - "count": 25, - "evaluation_name": "fibble3_arena_avg_attempts", - "max": 12.0, - "mean": 11.6, - "median": 12.0, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 4.5, - "stddev": 1.5612494995995996 - }, - { - "benchmark": "fibble3_arena_daily", - "count": 25, - "evaluation_name": "fibble3_arena_avg_latency_ms", - "max": 335320.0, - "mean": 50081.96, - "median": 3034.0, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 261.0, - "stddev": 78555.2898810131 - }, - { - "benchmark": "fibble3_arena_daily", - "count": 25, - "evaluation_name": "fibble3_arena_win_rate", - "max": 100.0, - "mean": 7.0, - "median": 0.0, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0, - "stddev": 22.26731535981231 - }, - { - "benchmark": "Fibble Arena (1 lie) Word Set", - "count": 24, - "evaluation_name": "fibble1_arena_avg_attempts", - "max": 8.0, - "mean": 5.88875, - "median": 5.93, - "metric_id": "fibble1_arena.avg_attempts", - "metric_kind": null, - "metric_name": "Average Attempts", - "metric_unit": "guesses", - "min": 3.75, - "stddev": 0.9265565634203836 - }, - { - "benchmark": "fibble2_arena_daily", - "count": 22, - "evaluation_name": "fibble2_arena_avg_attempts", - "max": 10.0, - "mean": 9.765, - "median": 10.0, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 6.5, - "stddev": 0.8113466172563569 - }, - { - "benchmark": "fibble2_arena_daily", - "count": 22, - "evaluation_name": "fibble2_arena_avg_latency_ms", - "max": 135519.0, - "mean": 33180.318181818184, - "median": 2125.0, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 294.0, - "stddev": 47417.827950683335 - }, - { - "benchmark": "fibble2_arena_daily", - "count": 22, - "evaluation_name": "fibble2_arena_win_rate", - "max": 75.0, - "mean": 
4.922727272727273, - "median": 0.0, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0, - "stddev": 17.183545058550077 - }, - { - "benchmark": "apex-agents", - "count": 19, - "evaluation_name": "Corporate Lawyer Mean Score", - "max": 0.548, - "mean": 0.38605263157894737, - "median": 0.394, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.147, - "stddev": 0.1127334484940327 - }, - { - "benchmark": "appworld/test_normal", - "count": 15, - "evaluation_name": "appworld/test_normal", - "max": 0.7, - "mean": 0.38053333333333333, - "median": 0.505, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.0, - "stddev": 0.2795156184408681 - }, - { - "benchmark": "browsecompplus", - "count": 15, - "evaluation_name": "browsecompplus", - "max": 0.61, - "mean": 0.47951333333333335, - "median": 0.48, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.26, - "stddev": 0.09206274930461185 - }, - { - "benchmark": "swe-bench", - "count": 15, - "evaluation_name": "swe-bench", - "max": 0.8072, - "mean": 0.6515666666666666, - "median": 0.65, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.5253, - "stddev": 0.08692541685397948 - }, - { - "benchmark": "tau-bench-2/airline", - "count": 15, - "evaluation_name": "tau-bench-2/airline", - "max": 0.74, - "mean": 0.6333333333333333, - "median": 0.66, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.48, - "stddev": 0.0830375703837612 - }, - { - "benchmark": "tau-bench-2/retail", - "count": 15, - "evaluation_name": "tau-bench-2/retail", - "max": 0.85, - "mean": 0.7409, - "median": 0.78, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.51, - "stddev": 0.09942657736095659 - }, - { - "benchmark": 
"tau-bench-2/telecom", - "count": 15, - "evaluation_name": "tau-bench-2/telecom", - "max": 0.8876, - "mean": 0.69824, - "median": 0.73, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.53, - "stddev": 0.12537163497834292 - }, - { - "benchmark": "Fibble5 Arena (5 lies) Word Set", - "count": 13, - "evaluation_name": "fibble5_arena_avg_attempts", - "max": 7.0, - "mean": 5.83, - "median": 6.0, - "metric_id": "fibble5_arena.avg_attempts", - "metric_kind": null, - "metric_name": "Average Attempts", - "metric_unit": "guesses", - "min": 2.0, - "stddev": 1.3097200718728665 - }, - { - "benchmark": "apex-agents", - "count": 13, - "evaluation_name": "Overall Pass@1", - "max": 0.335, - "mean": 0.20892307692307693, - "median": 0.23, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.04, - "stddev": 0.09209276259878907 - }, - { - "benchmark": "ace", - "count": 12, - "evaluation_name": "Gaming Score", - "max": 0.613, - "mean": 0.4613333333333333, - "median": 0.46199999999999997, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.284, - "stddev": 0.130447713895668 - }, - { - "benchmark": "ace", - "count": 11, - "evaluation_name": "Overall Score", - "max": 0.561, - "mean": 0.47963636363636364, - "median": 0.478, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.38, - "stddev": 0.06701831500011432 - }, - { - "benchmark": "apex-agents", - "count": 9, - "evaluation_name": "Corporate Law Pass@1", - "max": 0.266, - "mean": 0.18122222222222223, - "median": 0.189, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.078, - "stddev": 0.06984586204238906 - }, - { - "benchmark": "apex-agents", - "count": 9, - "evaluation_name": "Overall Mean Score", - "max": 0.401, - "mean": 0.3071111111111111, - "median": 0.341, - "metric_id": null, - 
"metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.115, - "stddev": 0.10572658658592508 - }, - { - "benchmark": "Fibble2 Arena (2 lies) Word Set", - "count": 8, - "evaluation_name": "fibble2_arena_avg_attempts", - "max": 8.0, - "mean": 5.97875, - "median": 5.415, - "metric_id": "fibble2_arena.avg_attempts", - "metric_kind": null, - "metric_name": "Average Attempts", - "metric_unit": "guesses", - "min": 4.0, - "stddev": 1.4973923762900052 - }, - { - "benchmark": "apex-agents", - "count": 8, - "evaluation_name": "Investment Banking Pass@1", - "max": 0.273, - "mean": 0.17825000000000002, - "median": 0.202, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.012, - "stddev": 0.10551607866644239 - }, - { - "benchmark": "apex-agents", - "count": 8, - "evaluation_name": "Management Consulting Pass@1", - "max": 0.227, - "mean": 0.122875, - "median": 0.1235, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.029, - "stddev": 0.06801982379109364 - }, - { - "benchmark": "apex-agents", - "count": 8, - "evaluation_name": "Overall Pass@8", - "max": 0.4, - "mean": 0.29725, - "median": 0.3345, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.115, - "stddev": 0.10747724011555723 - }, - { - "benchmark": "apex-v1", - "count": 7, - "evaluation_name": "Overall Score", - "max": 0.67, - "mean": 0.6027142857142858, - "median": 0.64, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.359, - "stddev": 0.10972953763035471 - }, - { - "benchmark": "La Leaderboard composite dataset", - "count": 5, - "evaluation_name": "la_leaderboard", - "max": 33.62, - "mean": 28.874, - "median": 27.61, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 25.87, - "stddev": 3.0963090930977795 - }, - { - "benchmark": "ARC Prize evaluations 
leaderboard JSON", - "count": 4, - "evaluation_name": "v3_Semi_Private", - "max": 8866.2, - "mean": 5010.535, - "median": 4481.205, - "metric_id": "cost", - "metric_kind": "cost", - "metric_name": "Cost", - "metric_unit": "usd", - "min": 2213.53, - "stddev": 2842.9574427639027 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 4, - "evaluation_name": "v3_Semi_Private", - "max": 0.0026, - "mean": 0.001775, - "median": 0.0022500000000000003, - "metric_id": "score", - "metric_kind": "accuracy", - "metric_name": "ARC score", - "metric_unit": "proportion", - "min": 0.0, - "stddev": 0.001195477589361953 - }, - { - "benchmark": "Anthropic RLHF dataset", - "count": 4, - "evaluation_name": "Anthropic RLHF dataset", - "max": 4.965, - "mean": 4.76925, - "median": 4.949, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 4.214, - "stddev": 0.3704460149603447 - }, - { - "benchmark": "Best ChatGPT Prompts", - "count": 4, - "evaluation_name": "Best ChatGPT Prompts", - "max": 4.995, - "mean": 4.9855, - "median": 4.987, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 4.973, - "stddev": 0.009183318209304073 - }, - { - "benchmark": "Koala test dataset", - "count": 4, - "evaluation_name": "Koala test dataset", - "max": 4.987, - "mean": 4.97575, - "median": 4.975, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 4.966, - "stddev": 0.009912113800799387 - }, - { - "benchmark": "Open Assistant", - "count": 4, - "evaluation_name": "Open Assistant", - "max": 4.987, - "mean": 4.97875, - "median": 4.980499999999999, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 4.967, - "stddev": 0.0095350231602587 - }, - { - "benchmark": "Self Instruct", - "count": 4, - "evaluation_name": "Self Instruct", - "max": 4.992, - "mean": 4.9822500000000005, - "median": 4.9830000000000005, - 
"metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 4.971, - "stddev": 0.010340051579497423 - }, - { - "benchmark": "Vicuna", - "count": 4, - "evaluation_name": "Vicuna", - "max": 4.995, - "mean": 4.99275, - "median": 4.9935, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 4.989, - "stddev": 0.0028722813232691232 - }, - { - "benchmark": "helm_instruct", - "count": 4, - "evaluation_name": "Mean win rate", - "max": 0.689, - "mean": 0.5, - "median": 0.611, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.089, - "stddev": 0.27645614480419856 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 3, - "evaluation_name": "v2_Private_Eval", - "max": 17.0, - "mean": 5.776666666666666, - "median": 0.2, - "metric_id": "cost_per_task", - "metric_kind": "cost", - "metric_name": "Cost per task", - "metric_unit": "usd", - "min": 0.13, - "stddev": 9.719754798004594 - }, - { - "benchmark": "ARC Prize evaluations leaderboard JSON", - "count": 3, - "evaluation_name": "v2_Private_Eval", - "max": 1.0, - "mean": 0.345, - "median": 0.031, - "metric_id": "score", - "metric_kind": "accuracy", - "metric_name": "ARC score", - "metric_unit": "proportion", - "min": 0.004, - "stddev": 0.5674072611449381 - }, - { - "benchmark": "Fibble3 Arena (3 lies) Word Set", - "count": 3, - "evaluation_name": "fibble3_arena_avg_attempts", - "max": 12.0, - "mean": 7.333333333333333, - "median": 7.0, - "metric_id": "fibble3_arena.avg_attempts", - "metric_kind": null, - "metric_name": "Average Attempts", - "metric_unit": "guesses", - "min": 3.0, - "stddev": 4.509249752822894 - }, - { - "benchmark": "ace", - "count": 3, - "evaluation_name": "DIY Score", - "max": 0.56, - "mean": 0.55, - "median": 0.55, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.54, - "stddev": 0.010000000000000009 - }, - 
{ - "benchmark": "ace", - "count": 3, - "evaluation_name": "Food Score", - "max": 0.7, - "mean": 0.65, - "median": 0.65, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.6, - "stddev": 0.04999999999999999 - }, - { - "benchmark": "ace", - "count": 3, - "evaluation_name": "Shopping Score", - "max": 0.45, - "mean": 0.45, - "median": 0.45, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.45, - "stddev": 0.0 - }, - { - "benchmark": "apex-v1", - "count": 3, - "evaluation_name": "Big Law Score", - "max": 0.78, - "mean": 0.77, - "median": 0.77, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.76, - "stddev": 0.010000000000000009 - }, - { - "benchmark": "apex-v1", - "count": 3, - "evaluation_name": "Consulting Score", - "max": 0.64, - "mean": 0.64, - "median": 0.64, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.64, - "stddev": 0.0 - }, - { - "benchmark": "apex-v1", - "count": 3, - "evaluation_name": "Investment Banking Score", - "max": 0.64, - "mean": 0.6266666666666667, - "median": 0.63, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.61, - "stddev": 0.01527525231651948 - }, - { - "benchmark": "apex-v1", - "count": 3, - "evaluation_name": "Medicine (MD) Score", - "max": 0.66, - "mean": 0.6533333333333333, - "median": 0.65, - "metric_id": null, - "metric_kind": null, - "metric_name": null, - "metric_unit": null, - "min": 0.65, - "stddev": 0.005773502691896263 - }, - { - "benchmark": "Fibble4 Arena (4 lies) Word Set", - "count": 2, - "evaluation_name": "fibble4_arena_avg_attempts", - "max": 8.0, - "mean": 8.0, - "median": 8.0, - "metric_id": "fibble4_arena.avg_attempts", - "metric_kind": null, - "metric_name": "Average Attempts", - "metric_unit": "guesses", - "min": 8.0, - "stddev": 0.0 - } - ] - }, - 
"observational": { - "exclusions": { - "incompatible_score_type": 0, - "missing_bounds": 0, - "missing_score": 0, - "out_of_range": 100, - "zero_width_bounds": 0 - }, - "valid_normalized_rows": 40395 - } -}

W`_7!{S8ZJws18FRH7p2$JN$EEXJtS98DYNDghWwjnl2bizcI7@W8Z z0$C1kyF91490z5cycRR~RYTgh(Gk>vKXds!mKtIL+k8ptn=v0~YlBs}5YdzYk5WIv zkkxm))h~gUBJ3OUBqe*^uiM$ki#j6eUniRFdWybdf-ms8K%9}8yKe}7LvGx=>xE7E ze(Rg*2FYorMb1X6IIK&OemsI-JS1UfP~RW}neBo%lwx>HlU%TCXz?iLaG02=wl)`L zP^ZO*mx`a{^sB6~3ImS`8`+*o!GfN_42Zx9V#3_H#>wa<^R^P7qoL1zKAVXpQ*v$_ z;4TEEjz?(&v7mS|Y9CGY>!+<2K>D0hD^ z>axq;gh@%?Y;%E^t!*yS-!oFhdV-DOL5M(AXTrFw8%Bq&v+odv+}e>Lsw3v&cKrJCSUiS&wNXaa{bkP*x{ ztHLSU0tu_bSK>}ox)zh;%0yV~7(nKXU6k>K)n`H3I_ZgD18J{w5@8UR`&V%w`2mf< zAFR1bH5fywlj%p(jTgo9#3^(BNThX9D+E=;pQ5sb3=;IHIz~IPV&(}NiNyLuUQQ-0 zmtaH)3rW}A8jCD?N$FIhrefaVb-#_Wdvh6tA4kkq=|?AV`6`KUbLRO+2ikDu&J`mW zS5Jsr`UT~iOS*a3QzOXM2E86ngU*5j z^xSD)v`Jmi*O>%{C$)lEH2oy?z>S(NTbOi*)+)i!b)GWJ$g-q)fL8Pf`rWsWZdt?b zB=@@bRTd)xaTp>Pc%RieyOu@vkUJjr6T|vxH~p^7NKvS?IHY5*kneou96S{v*ul^F zb0^uOApKLs&Yl2YcjG0quH%j}Rq@RT3DX@@{^Arj9vo`u*;or#3! z3Fc4Pw+p1;y^jWsY40;k6;OqRf(pTh0bi2{8Z5La=-6?3n&1T^R=>qOFiki0EuVL% zj2wIj-C)b$_IQ~nWjA+bLkd91joQz5H;*{@ah9IN)WdWN*i{XUiKZZ~D z(Fr+WXdx5M4{HVLu^A-x(lP~K30^Y4ls1-C4L8ZnFj0>Qynsg;=X1@6_MovRtYt05 z0dgY4b7ntakyp25&<=kudYx(>8qr*e$!fJu!lv=GFEoOk(Wy_Ed_@iU`cqVVvT{vB zwge8YBy8`Mk>`;gJ#I0|aY&K3`kJAEVefCaJ!s?x{y+}ENhqh@F+0h=qnbTS2fWq9 zS9dWB+(Tzj{k825h==6Xo4h*JAl;JBlcj`NvF$XTe> z)3lC@v}VXEIrVhkIqUpl7>_g6$L1g2lE>Zlg~yDRd#^`|o7p+HeCRL`hDLbgX$;MHNTmX~)y|oTxFBJJd9dL_ zU27cmtg>cc<>kdqr%bPuIA1%sW>PUT` zmr8b&M>QbEvLe7$ZEtGX{dMT&+mxwmF)7>`x|i%5vGc{ z*l(9Or`>~t)E!6`3CG~($DE!6;)l#QDLp^6)5H&B7T8|)%z)dern&SCNo$bNHK_%5 zP%n=)!1B)me3n7h%_sd{vHg?O6jsqDtSCEWKU&AO;_UMUcg?b{xi50*P9YifvAtyz zPmP1RKQ7>tG?Bac)(ZkYRcHQ8!}k$(Q~oRxFZ+3a&PDeVOi%jRZDF_$y$oOXHs6%~ z*}38e$cE7;7(K?116aO^^Os`l)#McXb+rwPQWha&TWEz0swnD@yQu3B6&=*O7+ABx^cPP{b#5U625`hgf>tD--z4DI=s*SF>hX;6CU0%=pIgpM|lxeQch&)XDao<8O40}8uKlQrEn9(S->2Q_XN2381Ixb zD{kNrDzbTivuhKb!C@vUo>CP)6+FDlrPFixGUV{C3O-a+QYW)8AmYgzs`NS~MAr-) zPgvoyUCgAZ0VD#>!$T*=L7L;gxb0*PAXx zypAg)PphGykugzfF7mG1=c5z~O>nYQyO_HM_fH=*iT-~9M4|9sky zRBJ^M()O1XsVDoTa-=PWF-SA6RP&=)38 
z@#qMY$n+}7X&Rq1ydj;YSE}3hnER7s6qB+EEw^m(F`=eKywxaMh?WE{Ou=vUKbpp9 z?Ggybm(9*V43k6YI$s3JWq|=HyR!65SD31;&JCJ&+gKKiI5pv!VGFQCx|~ zXzycy!$?ItG|6({H!tkz0|VreqaGJf>0^jsUUvKW!8v z-uB%RcbG~;WIwo-&&#As04|jBNG5sP)_{`xVN7Z!Xl@T+AY4O{rQDjLL zS)5B(pSp~*;QpUVed`*iZJA3e9Fzw?5Vfgm_m~`p)E|AH zG_R&Wk<5R)5tUY7CTpUdz1C89q70 zv<-}fD6amjTc-?G^rL*LN#IOyjA`Nf!sa^;%GdO=G0KZT%q%StW1>6AN4Gd(k)`Dgp}LFy?sn0Il6v6@fE3{Gi5)Yl z>9rj%kHGTcMoqP_iL&Syk9wDK0&Dj1s3DcU^FC(&rO)~)CYvIQE%Gx`2ELn!da3e| z76ABPmxKOH)7JXqVx3?5+h1<@qo?y?i7AxRFsBiX_$VzV%1rLT_N(aorXk(7gu9r- zh1w4q-%?F`D~d?7sfB^6tkVyiVkDQib20^ZXAalCk7FC(%`(tLM-uot6=f zU`Y_IGnr_GAgNjeD+Jb!J9?=%kKSoOM`v#nMp>c>pPBWjzReBtvY+spT*IB?Oz?XzHAd2qiR>RV)xDe^Em=R_d@hK%>$j|6gS}Zg{D-CPVS~Sc#D=kAH^KtYFh+ z;n1_e0=mq$8%mQ;#34ltYM2VuViGJpwa$G|+}s3%C-FO^K@OJNU~jj0m@sF=2)h^Rld8;o1~flwTQ|YPKh$Bt|!lC*IoY>-W-9 z|MsL;hOSGk=Qt9dhDkEQq!VQ&_0ZAS0apz+Q~I85pst_syRt!n`H;$%NtXKt{9|Q{ z{c~d8^~MQ>h7?ZUIm2X$c!Eo2Tr?+!at|2Xzj+IPY#3tRZb+e%hq*{f zh~&8yQo>w68-}v&9P~QaQ}ONUb5qH@r`gU)1+%mNqSeiFT`!S1Zhub7!*is2cc6{K zB!@>g$pdRyMqvME`b?xwGlhEd2UFJj zi^|$qrBK(-+MHP;@W);4npo1!jVE9vnQ@g|GHbGribLE77mP82$I=37`W^3+^cdZ% z5peyc&U|26YNP6J?*J7#EDL4ozn?+dqYLqL-ykdcCGg0D_ituGwgw9O7F_DH2p2+7 ziM}{+7KPW~N2KnhpG-FiZfddsYEe=HTuzLYw?FSf=nd=I4%c+_`R*ktc?CySW1@Tt zev!Dc>Fsr0g}q=YMH`okY&BK*>94%!6%CHNJUsVpGxNtH+dp-cX~p-oA+KL8GNVSy z!y*K4hh<0t^?zv^^V}xZOki1sTrC;9qyInRZW;pt04|eLcVJ!YhHrmgpCJJYXga*c3%N&!9!h*i|(1 zW*4_le?rzMQ#9)v=_kY21DLBFXq;xU+&T4BzE437OJ!H&vjsiXk{U4lnvF~_}G588#T>_F-$Ww+v?R`^N!_a4B7hyW&zVcSNj`;K(1 zS7o^H_Hycbp?VLJofRL)V~eC|%Tt7~Qi+{huZ5xKxU>x8bD_1|1MCn}C#+(GNH*#r zQUk8;DEx*unPXra>Qn+Z85w&&ao%r@7V){Y*`Oif)_tW7_kAFv#?CC&+_0i?C00e= z-ml)h0-jpk;IVG|7GuIfqRbEcOFWf03-rf}fRpsoS6o%I|8J8Ig8Ff8W05s3#eZA6 z=ZH<>!4RLJYnip_DQaWL{8Y5whK7mS*8!d66ky-cMaQx1G3VOW?8WHPWbR`N@S<{X zHxh*>$hOHcj6U>IK6K;AZrrS^Z>mJSX+4%^G?t`1`7?8*xk2r~nLblrl7by~C=I6a zsX@w2?Rqx+o4-WPm>r!JOt$op1j`Gx!%^X$xxk5KBdm%i<#oBa{iAP9)WO9e( zj2ni+SxTrM=DvTQ6ts?dkkyD!9$Wb^M~$gLM%98wReN9wOpf6?<{zBnt1wOGESBZT 
zz3Miy${utu8|oU15w5S|5vka>UQ#X;a}()g%5fP}CON_w zjoBzObu&%K!`&FZ21H7Ou=#6b(hVo#h&<8iA&W%jd=P#3X0W&oM1nEX8|UsE{xG|K!LOA!Y>n!I+?o5YN*okYC(C#G6$xncAN z11|Nr`e;K+RnMMX1w@~_gHp~!3iPNMwFjZM*es?d<89bb?{Cs%fSv|{bQ0nfTl_1& z;VxSN9V?MIcBo8#8QdXce*8Y3sONFgEvRBiKY?iHYdF^R5wUrCJ5FsO5!E_RoMY=| zn$GDT7vFrI)`Hd*lf=GXVx;NLu7tel7tynufM2EuuT}+G>o;2F8VKf$!nEm~JSi|R z;h(i~@n`W1!+Lk|PdHcSE%487KB__QE|i$pyOt$aN?5?}v&!&u4&P9M%Bb)&gTE4I zn(W(r4s3@6ymt$jAyHvVZ6Iq{DA^JPPwcHR0>dUjb8eYkj z7{TVS1`g^NNBw!np|O#*Yp}^k(6x(Qry~5w5VcjrM&9IY4xIm>wv8tfc>JowsUi7y0G>s?v`SWJEDa~CE|?fJ|-ClX2ANB!6PtcLpb5=MhW z-#_h1OKk8=nLe+x7^p4Z(!1f^#EW(Ht6uD2|0)Qw;J#dr`_?jXPU5B%=f}sVd8x!* z=|XaymWuP)M{b+ypI(jI7+)uv5H>L<2W^cl%Rf|^yn?IRmhHie(Z@u;v_WQasdRNK z3Hwz7~wOB!xp^7#a}bs>8*k!X(~ph(sO(ti{U@a|e=QypqO1jX9-oTJVnNNCWsY zCs!h!HSZ2u(WJ{x|5(oLe*~FB{ zY&(dood!itxEm_S$87zX+5G{Xmf5d}sI6>_yKdU~`#e48E+*lFBCT zzVz!;&WG5WKKbCxGe*VtI4~$tOqAsSeOzO?5ymFyrzd0Uq=cQ8z~|#y`W}&fu9119cC*=$K9sTQ?rO? zhPQxwwOmQjAM*nekan*BK#{$He$cr(cm&oi=xrE=vD)@c>nCgm7mZg)&?A?YYZ=ou zoN_xGolHCn?OSa{2|<790Rgn;HnObeFpRIBgf# zassTLQRCd_2EO^$6@g9_FC6cvrC-Vj`_~8PoS-nEg}r+SQkndLX+ER zkd5ZP)5a`TxPD%L1?3p}M_s5D8y<@dt!z}fPx=~LG5A8b3tm7tyi^PUbF)U z%c42n#_W%C)vuOl2Ot{J1!}n3=_b6ZqB&^xIGghoF|S1w>~5|J`|1oHTT78hibeDvM|M# zH|!9DO8eB@3>-&t8xu{gt@uu)@%fZmgy_8#mNkQ zCWA4xJ*Du~BELCHU0Gcybz(>?%Nq2%lCJ#)XS=^N?LWVUNzkFeX(fq zvoxx=6lJ0!Da77BamN1in^)(BT`TSMj~eAb4C7(xKa}|dHhtk@pv7o2`*(Vll)O~Q zCK`(nqoGtJQG0XUaR25$H$%p5+V3A$!APf36+-5?wx%9{6uGhTLi{e^SjR`u>C}Dc zOI2Y%fuEQK!szn3WKE?Y8(1C^TVcv>{nMV3KP``>H`zPK!p)9P$X%;L3u?K>5!jP0 zT`>GPzYGf>`+KTmk7S52T&?X@B5zT$%2;W+vv77&_nRilg>2G$!mo?(c=1y&fPjWY z!%a9^w*skpw&EycZ)gnDqtdZ+#e=CLLgHq($3XsO%O>}2mjNBtA8VWs@WQd7cNQHQ zgOUuJFh5Vj-M8il5K66MKp_clRJ_U8kjMmKa`{|+nA-SQY%|hp%lAK8dH*ps@A1@5 zJUG!!p`}9r>J4wxmcv{LrSkc;(d5Xx*~dznsrqV<+14tW*JGcf zTQX<-+P?SdL?MMDIMqv^n4b5tZ+O*#tUlIHJOjv%;V=ss`>m=I0-2hwb3@i=8x})HjHVLYGUANw@_Zaw6+;Wi0k=JQez(gKrl z>rbuR{5uBWpBRLXwQEGxoz9|3GYUQ$5R_NQd5Oc?)@(~yk>XnKAWv0*Re2= 
zU`PnAxlO|uddB!ad+o39ay&+iKbXfL*=N|CnScKlOVSLQkAN4Z>NfPh$Q@Z6mT{XJ zO%3y%W(v9Xa#E5FJTTp*Zn;$4sU|C3vc!KKq^TQ@E`RjKNu|?>rDsJEyE7mM9+BGOVOo3**Tl$>b z5X2Q`P??^Zi9{jq%q+((?BA@%98H3Fjn{c@6Mq_Pxwv>wF>k=g&X&*X zpp7JJL0tr_%?ZgD5!K`-IxdD7(&lC{lf$UYJkNc+OSa?%kUG0bL&n#{MV^UJx$KuT z$EQcKTq~^Q#EwB*QB&y%PN-Cai3B=tYNxH*KP&V9l~acjSV~ME=kC5QHv=2$#M?9M zhHjS5mYeHr`+B7VEaFjb^{CHgtCc92VCfvbawKJ>)5cm^br2f<7BsPueuqZ%?$YaqacpkV9HcrsAy?73GiS*thCZD=QM2PZuGG z#Y&SXjosW4dEo2jEJU&8zeFRax&KON^+vM)7217hRhH768E@Wp4FwSDxUE235-lZXBez2>@X+-s4K zNNApT)5`lrIg=m;^1MslM|M+ZMV?kW55 zE$dGIkH z?{z%8{HaUf${F9|j#Q=^jjB@PVdiCo*(JSZt>58Ou_oP>!>(g=>dT`3U&p@&Bl3>z zzwzc9u`pBNS>o+$w0h}&+$hjHl!*V+iXrV5)CJdp8}0w}>My`670xpbXz0dE)^sqw zNB3LSeLT?SLQ*nJ>pXeRVn|A60;yr=Ct~&tL@+_y2SLk>M%RCWP-SnjuApB zZ?}Jy*yHAq`FWd#qE_BwWjo;S9*rb$Z-6}`HThWZJ&ece?|0x*1ZUN>Vr%sV3igQwuGTAkYMv zG`?BcFkA?o{y?(bRwp@iLgs2#f5OA{q1->7kP>J~<4)ji%X6!uOwi9i5ZKSOP=G#u z8hstfhmpB}TpgIe{Cy6H7Y7Z3rk)ritfZx?AZdMsnkQAq8#~dkD<$}1CVqHHI(xG| zXmO*0#ryLa(W&Zj*6OWn`g4@1 zDF@;`ZFi_xw*rO$_JWxj{l3PUs6JqzDWOlq3s2AACUxw?OxCsI;Gy4#pa?Z15Q}sd zgDc;eD=^c)#l3lm3&^l$)3_DTWzDC>u)NNfebO$_<#p%xi5pMd9}-^MdZEV{z$k32 z^Lj++c+gH>pvXsKL`^M?A}OCA=^8Pr>RuPUS&)6Sy3bO|$+#RIoAV9vb!cgb8$iy? 
zJ0`4}BszlsH9`5+0!=*kh+~HWSn{cV&p{-wr%jsQVFnk*J&#M1`7);#r?#FaM<(Bq z0q0MSKbuY%hPI?+(8jqs#7~_}8mhOfI&Xbq_ssG7Zc>zUUX|fLVmo%7hUB!Go8-#K zSxrOUSjWjJg9Qc|B%jPxMlS1r@el=|o`GGwRIf`Uo za{n?r(Z|;GK+OX*@6(*fiu89lG<8SW<-9eAuUu^5)QlnSl>L-UegHxO%IF*BqSX|{ zZfpBD$0+ulljZJDx|u1L<`PzT2G7Ime|dnrr=Bk=Z%So+HK_?Tys!qFEM(NmdFF~h zF!@~r!{c(!)ZTpZ$TqpP69mvO?sA*r_%YDpO+IjO7+R$WvnL4QPhq((t_G^4xLN{0 zys3Z-y|v8Iopk6l(Xq8$r4!qZt2~2t?cgFuh(wylVSM7Sue}CbuW-q{Fo%5TTV<6C zQVf|^#Q{MxT6Wt7IwQz~zQbSB zW~JIOET6UPrMDx#dB@)>`{zNg+4Je&<7*bkiw8M=+P~;XUA8i_DfOIPOwR#+0$7Y* zk^ptQ%3bglA}HIEo@XFwI--7+|My>bT|)9@upELw!3W{H@!!IEi5)|oUec-9WM(@o zWp??4z-(pfe1sbz^<+Ic(3Qp&)Iys^C05^6V-ok$g-Awe~;@iueDha(Z*aw_LTk$R5Xq>~R}uI>~=r>`$x{v=xLo z`*MB#kBy?760kBnTlDq2*^Z5=>&qQKqfIR=04Rq2t-Y>?H}3yGtvv-)R87<%pmZZr zvVertvb!uL-5@31(%s!5ol+toC4v%4mz0EnAP87=OP7jt{clnH{FMKEp8x!N&O2}R z&di-VcV_Or6L0QdU8pH~*B#Q~Wel#(O-+{Q;rF)p3{AY1iZB@R;XKkyx4LX*ef*n~ znPSa)_OPaV2n5nIWSPB+Ssfj4>5v!gWNx?PAAsAKRz&FxQ-csqjpd|CU8FKyBpoOXJ z2s!nARgiz@Cb!*=0QT(zKEl##b*?v1B|1oQnr-D-u1xC@9-BNBt54HZbRCzHU@p5ev-Zh8{V1pVAd{cP%>g0HI&;pw__lNzJT-GmSf z`-HY~^3UNp##cv^+H&_@yQBs?kBhcF;emC}npvMp5(f&a;7i|mTJ6T7H2ywK z8kMfLo9@CNUW!Oqi>h(5^7A;Ow;AyhC}1C1X;Sv3xN;EWKl67sWmgn) ziNsz|UN+B*=#j5P@oDN{3EI1uWus|y=P`u1hTBRry{*srm+2lc!RjwwV(elUDUEzD_S`#;``Ze><$4|0&ytsL&}NS0`ZnYpGV?KBDnyHl}3d zbuS^tD^7io;bGxX5bWdC<%hSX>`Un)pc2&aPJZ@FRh|pXzx5WwO;(N6tX-r`bv?;B za-$fnRpQ&%zAX+Q_5)@2EVTEOfRFyI+UZVB=AlPn`V%J50kj>@^2}IU6~oURnb*Hw zn(w_>GZ@ca-S;Nr?sJc_mQ_`opbDMFokI(5>*!IIj1KliPXX!n^+=v@JTEhgj~#A7 zJ_m_w0)0nJ+sGCC)H(NjFth}@W->;{F?Qwy7c~Rl)4+46l-4=PoR$`5WN?|x8+jikb(aY`7A`)di#X8#2`Z?vnd0dTF{p7L| zW2T~`V5>Rkc&bF4t#FygthaQkgcwnC*~9%GrDOTo21Bj8EhBiZrW8~ z&XO3H(60oL@y-s4aNX`4-J*81dHeC&JrT=U4B{rUs#41J$oQvu36G`rw2PlK@i|^i zzxNsIfxDh8S&>pWJw;60rG9m-$Dg#v`Kfi=@^1P-D8M9fiYvoVzbgLvdydU`Zf@<2 zYyh(%D;n<|6NiF_EAljFRZN4-Y1MOhnBT4BUUPj-I*d5$=di1usVo?&F8H3cdkc~y zTMKLuTBLhlFZOG}Y#u0>he%g=UoV;wxT(=FaDQHYgLyP!x~r2^e2@$a87f*B*XA98>ZsN2l6QSX@eDtOgT|ipc@U(1ooX=v`X@D>1cGZN3&xB9QZv~ 
zbk<~La*YR!s!19EBdI%HYVg|K2OYE4+d+}ow<%4l{e#2>_}GVrkdiY#IQIBHyktG@ z7_6cMpZ*dyk@Q);>}&S6QtaI=RlYJ{J+!POw|1ebwMS*bTs+x&<{t3@UMnjMBmIe5 zgQ=d{MTiak=<2hJV`%gKpC6;+jxbYPWtIIj%!?)Wfn1Y{S2Jk{xrh(%y`pMSh9ByCjIL1+TpDbrN()Uzsy(Da^sIo-{-1xjso3awWQ191E}8d0Az zM>QWO-eDm4Rm-ucIm{7d@`1L{*qkkcF7I_bGD_Gd9DPh&QyYB z$80^1CIr(ZN3NJe1}^fkIa94BcTkiFPt% zVV3IcL7Ucg`vk!KA)zx;u3@FXk~y$fp9$>MXORO2(@@Q&-nYqY?J?XW4N(>DDKWl` ziLL^Z4olH(%rN00$~4-hSW! zjX~nZ2H$$v+sI8EMKf-Fsgx(yF_5QDEfc)%_JW4PUe0Yx?Hzu(bm4Okte@qJD#aOy z&1OE-Jb~LAT~M*5n!Z6bpNhlqwA!RsAjEvT9=!oa=?ki=9=Z?l6O)?Qdzav^rZu0r z=c?QUMD|Yh-rm-Ol?v)7EB9YCzTK)|DR*kyDj_{QS=n5vc(J#8P$J}W7}n?S$QVhC~Ws&9k6ntF6?%KKqFYzb_woxRsA4s3~{7{lDy!>JNDC>oXlac!3Y1 zd#u#x=h?oZv3ugLce1w^({wb?ra!&E;viuyrKaf)s&KArbd(?4_T;}9w)<{WvB~3L zbz=rz3_eB+Zrf_tus`uQTI@Xe{BGri|IXIQk%#Z$(f!H84KVPp=Ed>a!NI|8y%QJT zFBLT}j%L;tCJ#sib&YO3$I1jV?AiqKn#^4~AhhMOvB*xUruk^Oz z2S#zC29~=^-5PrAG2GC6H< z$)Mm8*9A>^jtW=agUu7cNnstjx}KqkwF~Mu8(LY@UIZV?O~mLvn(`Ac+uj@B{nmD} zc6z@NPUuICL9ig=cB7RIq2uFO?!gWJ1HWnRRBu#&!Qm4kyPrt; zA1&CQ;K81cWg~77`*H2nkGsT9@5n;jn+4&8@Nk^nFm`&o76PgR()$TRTr2kb4OevN}7f-bh95^{5@PpMDx7Ou?6Acuv9^ll)V)HMqRBRbyCGp=tY6W)9J z+)^koRf4ws8$L(M+IQM_kE|4IS@QDK*b;A!$5&r_sl0Zcn zBy8g3Hg{C!QdA3Qy1P|vG<0q1R%UhM>Rq9uV?~jWK_XuN3Y`p7Cu_=^Z{Euvczt9l z^0jTR)E@olRP_AA^}_p+byX7(Q7ThoWikQFXC_VsM5Aw)_A96CoS?Bqu!8r~tL0Nq z&4O1HAeNaQK(LJ)Iq|ru8YsfY? 
zy4xg>U!_s9aNsDO2lj~~%P=IZ?Ve%AjCd1$wf#XyV7#>K<@l9C6eYXDOApF)9OWOn zCof|Yzl$%Ex)L$n+V`zPK~-Zi!rKc6SuBuvo$7^lN%^4E9vF=-FyJm~=y=3Ms#~=g z?~vYc&<=9&VdUREqMO_b)9vJ3yT5D9tVMsr6l;NM@?eQIKiZlo>1FYAZ}J;iBSK&4 z1!nO_892lYMqV`UKAJ(fOkHZW>vw#7i7+af{G8T0=PoxW?_cyZdS#hdR0w+g0?Bc+ zv?1e0!7XdtcPB5ZWvFHGK^<;|s$~QeLbR{AlDPa#CAc|JH{cQO++`r?yJ zEonUrMd<^FyH4z~jxLxjV}zxLqP-@irO7!&)r(8}qMG03FhE2UeGuL>tuAdY<48;( zz*HA6I&^JAnrU;jMdQx>D{@DL5ZTmS*1OIX_>u`xHYU9Z#C9ocNtDEK5-PQUD@FO` z*M*okUNNxps^k)xaP(wck+HS=H08>cB8_D;DffWtQGAcHqEv`98@^j>5Z^0vytLr9 zpkjxQ5kdn@{^`b&L?++fN?Z?HALUhb^jP;gsG7QQUaV)(uj;ARn(M#HI8!V8)Y$e zF(=Xz`W5&&6#aUo;E(YN=lu(c;BXZVB@XV3gpVadf2W8nO$$`UDi?qqXT{ToBptN% z$-&ei4;d-*7CzlMBp@yi{EaD)zY%u=3S|T+5cCy+5J};HmQ2RyW29L}qXwNaLBXiu zh8IQlnFZpp`G?(3^h<;5uWxctAm7vKk!S&>U$kmg6Nd1otuUFkX29b=3uaT!1$RB) z#5#apj}8~weoka%e?7T6e(vR)`P6a$i8ZSdylz|TPChjHb!Sv39_&bkEJg!-2qm5-b*y5W-7w7|a z6SiK>-f~zM+G1CDB;0zHG)t!Jnu} zHBe+Cs7!I{G~0VP+`BT$68eu1w zO`Rt_LlEsW`(Z;UdTdLJo32PB*Rs#diuO^yH!<11h*zHo)6UxXqX1JrWPFF#go;)(&ZU}tQ+GKL+R$@h~iYj7opcEure)}1{&!yv1%JT8o z_^lmk5&_E_=WrNtyXkKn=H!I_H;360Z`uL6M102=lO4ay$2QA6hbijQ@hE@F(`;K^ zRT7sE$@KAGA2R1%AC2`{Zj0DUn3NKD6?+&*Rc4ywD9&AW=dgQaz*pi~t?Piqz58QR zt;?^pX0UWg?=FA#I@Dx*$bKLl+-;eX&$1)?`5}2i%)DNfc1}UU{5#JX&c&YEwaE1m zd)5Q_G5U)JK_d@SiR`yy;B<@qqf4_^GZrE2s1I`4m}QsR*Veho({-u!18=(Htqlg% zHH%8OXT(i(W;jH@xo~qy-7W@YlG$NO)BMo|EcH$JdQGe2U{5m}!_3EtM2Rd57TRLT z*Cg531hC%Ir%H9ZO*XT%yos{b+R*KU=+BSzRA88k9x6FU(g!3KJ2Fj2iS4S}+~kM%kNCR?5|i_iM9L`DvM>T7Rbj%U8sFD%_OSC`kz z=1`22-`Gz-XAs?O@FzBYMQov1Y9`=SpXyvs`=o+h@$#^@foj^}i85gG?Cqm;Yvk6S z(*@2mTsb)Yy9>Z$%5d6*u~EF`TL?1y?i2UUf*ht-Cnf!OLXc)quhJ2s6oDJ!up2Bl z;S@f$whm-cf|kk6b%|*Mq@0(e!^BOdIoP;?7n0+tXB5n z?G92LokQ$7mMSObUq}Raf=mJxqQAOgviB1h^}A$lICu;+oqPlrZ0BHJLp5ROd2XZ0 ze&AHIWC{yA`uaq2nEBw~lUqndC~rrY`~kmXsIPp~NXC^+TcNvsHA&u@Ll}?77E<2} zGCQmI_x0wblc@6xCN^C*c5Gy#7`4Dxb1P|oZ@pt%#{Rt1Xi_2O5$Xc*{e_?&o!sd} zsY?N$hAMN}_B{7;=)Nu@m!%SyWPC%3-q-F1HV56-90; zFJo&6FnlswruxghZpVJ7{D| zwsB>Uug+EvjYV?~qGzw?aW&p+-gXU;MP0V7aQ;+)Y3(}Qx}TZs&&?$7f+?b9Z>S`` 
z;7u&gIk967A8Ht54i&JPAbKtw5Esr%@d0(xcK_i%&f4S&Qpd^@?sIy;dDbudFBGMG z{7t);!apQdgg^I!BeCi9nE!(debN=BOt&~cJBSmiT=b1llI~B=wbz=V*n5E%wzS^g z(RgiD3^=OVPMn@weLY%+g+c%Iw=PXkVSL6*{J`0nPp+?UCJmidr}OVrOkeTJqFoSH zi?}fE%>3ans}G&am=Cy- zE}?s&l~QW0p-^G-5(Of?>Y8$Q38`$Wxa>7zSD5Z@pM<3;bmDYDcll2?!p<>jp`1K_ zG3syKv_pfSk1h~53FigKNWX!7P$6AAs)0ASL7*5d)MmXA82UabG{uIFGw&pO0>&e< zZrx8NourFpt`EN_Ej3ioVy-lf_BQRMGhDEZvA%qWBW$TQc6~>c3T{0-!Y(?X+aHd< z;CvGTPMfdJU~{i8r;JW(*pZigkY1OGp~+ckdw4s9f1~! z_FRvG!Jb7KcA96N?Q|CvvKLMoMB&E3JAb0Etp6;9yQb&$MBI)qf}B!+^LPmB5* zDQ5`jDJKrn+Q4P}b4WSI7Kd{Eg%XD=K@bzdl}$`R2jynrgq>AO*Y?$uQJA>Suvmw| z4O&x7T|d?Kn%8fWZc_=yY$_Uu@}qc)>j4v0?WY}s95H5fmr0iKQ9Ai*JYK0!^UJGp zKk@l6Qx$K75w@YC)U?Au(z*kG zHN4$a4fP@j*@F?Si*SCujTKwU2Rf!7b2*RcqA}GWO0+)t3v%)M7s#E}Ej{B0)}VPE zqsdXXbvPs2Ny#rY`aWv7vT}rHXLZ};9Cn^FJ%Ikr^a`%jWs3^IKB&j!!WVvW*NZ{( zQ`yo9h3td@KfdA3+*dsqfnc8kT6jEgV2t2#|MSl`lySO?cfu>Ojl$_uA{|irV23=r z5}XlhI$4J06qXLL*JfVz9`L`sl~boZ3XL+0Z)+Y@T8dqCi-}3XNc=E5%%{hD3oK4E zOhu4Bci5(u!TK?l-%R5H{2fyeH}b5J-gS8@otO!+t^(Ct3Wb}uyL`=bFj@^+GjGg^5hQPl0Tq%u2j{W2KIdz2|N6a4?D5{Ze{H zdfGWx8oFh4KgW*TOT|m83FFFc;%s@5OWN!)e4%_cMj6Jp?|a|v^lR+z#kALC`Fy*Y zzWR~H*vA+G4@dl^E5z|f&6q+#uyE3&L&H1aU$FMbW(qu_UV)@t;GoAymlvv|J}g?6 zlBQ5-b3{}nW8q81IDD+})$@^1G^Q)wP5V~v*?W*(_JQ>RV+(pOz61_;d1zYZ^<~^z z`hCTzowvC?y=DC<=(b8BN!|Nh3y&?Puic>tn#+#P;SCsYo=lvs%H(dcE!;(>jC2ze zy&DGE?eL=%@2>%^2~;~ZXw?Sd69-;K@AtF|2qt@}-YqEpV3+*chp*GHvBQsZ0+~5k zd@u(N!i>4MGqmib6U6T?o6I8Fe#Ew7M`nbE(oBnW(l{L`^;emBQ9ST`owN0-(rk7S znGa?&d%Z7bHY&0`k8Xm|^Ew-|)XMzR$=(2FQ^-Bj?w}B@_^jdVJfxmU|bG zQiibapTmfAOlc_hUuZCSWf>=6#i4$M#OF?}%7)C7M6vx7YZ2z9Zw5h^rSVJk3M;Q) zsEHOUe$eF6yQvto`Ebx^)Bm293yEB^oUW)_iixogqd+p#ytp*2lu`}0VDd;4W6BDy zz~UHkhlv>4{mW$YJpo>61t=Fs+|-N)b=YPKB`Q{v?mV};wB)d4POAkI-kNy#A@qRh zSgd`lUi9%R+Gj|#r0F}E0kXG*TfDMpp2wiHL{NFOUgiE;x2w7Cq!{7O%>B->TTOjO4as*nC$~-sZE8V>;&%1%}k~%HwilwY(o_l zXJ4Bth~i#*v)!TpsCOa~!sHnGJafL!0kJ-I zgqdl8?=daVWm5d==tT@DQ7*Cy6n$~@b4^de1 zFiS*UPgH7|rwQ%p`mcPsz@RD4_9q88&$BRr?*88!B1$4oFFCO54oLQe%~>(JzIeYK 
zUdlUpaXI}PYD6%SMwR^QO0YZ2q&n}~=%o{J+XIp=W;&_L=T8jt#@nnNC)xO88Pus8 zq>QLSWrl6UoCU%$ zzw{~OF_4?D^X$rg^)Tm+wSB$!AvRuve7lb`4Y)f(Zk`=BX=<%yqA6s?R$@Y!@QKJN zO`~zeAsq!hd5GrlWNQh^s2MFe{p=r>cNvpY7#wvau4iVY@9?ACyAX^vp; z;(HHszYuyc&5G3-IZ#LQv*Ju+SKaD(%KF((_nOvct8jY$qa?50O|r`$6O%RHJz^v| zs$|U_@u;LELpGKyz9W+fAw4o*n=F*)ZhYbNmbU*K;?FTeIsZZfoY};v2%EUSSr}mx z_w)hw@TXIIm@}z+N)`p_z5{mfIc1(>twA{dW}w}zCqUsq7xPwZK?gNwc_zHhbJ(`0 zTP413)Adonv9><{+ow6mHN82@wFV)2%bRW$;X-M|QZ$t0jIX9xudN#|9l( zRmp_jAE_5&qgmHD_AGg{$ME*C(ih&f1xKOzm7$YPx*pgb+h&XX)f=4?ce4z=H9fB2 z7`b@}Io^;derBy@Xs)baC75q-bl5JN!K7pP9BCws<=r7}2Ku%#nI#1G$~#(@8B%eA zLJ3!w&Z*;r=frNq6PLefhDevCfy}NvOJu;5GEE>ib;(c3(W5Wdc`SG1^>VnI);J++ zvE0O`cJZ1Lkp|m~dKXq5M{mZfyu5qVm9Synd}*MLWm3c^wZX^j8h$1Z|Or>h07kg@$)8I!HVOJn7OfNqFCO& z!&^t|h8JCMl-Dw`C0e;>KX=#vP-r<+c8ANiuX9 zm5!>zjGCfi?Ec4BHWnr~kFaWTQ?brzKt!IUmfm9GyK_1z-JQuJTRi8osit4rE;UAcm0lLiCh8sivtS!ZaB7+zTA^hCy zwjw+}C2=9W`M&c-@0#>(uKwlGf5X8B}@~q73t)|eEsIDWNT=-=`a>Q_Iw&vS6NH7 zyZvLv>o;&w3A!qnWBYGj)f7wLNEBS5Ajm4)uOq=7xpyl1F)sydqdf8F6#txZ|6TH2b6I!kMMmZLP_vsyE zW)X+WPDF04TX=)q8FOlxw~Ma`9Gp|>b8Ju;*I)1xo^UhvDg-_F`}PCBZ5D7df|kkp ziwrzIt;+1)qCyZbKTJ7PPE@`3Ov7)bEh>{`^6oyN4zoXX41HR6jRZWRNcVo9G0SzL z%_LgS60WSyrO?@16H!D^wCQP%Ut<91vj?Au|SrWB%Jh7%oxt=QP|o<}H-#FXr{|Td!@~ zfysk6W~{-{ny3D)Z*lAXYh>IzgY{cR_VM3PwUejugt7+V$XAxw9v4Q%UzTgc<9@hb>i*!v2Xtv zWli2t_D`+?6Td%p;~>1S|2CQBmClp_uqcZZ*zBn-;0kFx*-q`@FTgS}5#*}9#9%ai z6P1Nj7gGrO(dKw#3RkzSl?EfxQ`@zxW#b_polK~}I{MojwBa}{4hb}$1bMM*+oG56 z;LKd}hIw0vHsHRu+vjq|aXh+T(3L1toeuFz&`xHfblb*MtCf;%sj$oPb~bzs!}Gqv+z7#+#d+Hy*?+WDM?7m2msVh##KA ziSx{0j=z|u!($-0z`7BNxBfI^08#lTff{Yw_yd3M*d!LD*UdcR)8UUl7R5mv!ywfM z@V!y3%b8LT_I_&?g_j>%*aIR9mZ+$WljD$a^|19+-{DnRR^TpVF}%rb{fMErJ+g_y zhfAq;4z=gly-?tg*ndxT;$Fu9`_y6^#iBS@@aIaFW$;&NMo-|m8R;(}or<|(_0J8p zaTHfDD5Sk}dP*bZ4!~lO{td4$s5NhYT)k+o!H-GXWNs@HH>|P~%j-ID-|+c|vGi-J z^w#c0hoXdV;F4QOWJ{j6U0xrfx+bGxj zQWs0)_%f*2=QT1@O!C`wY~Tmes#PQ@YoT!u{el?Y+c_G`uByrksCjU-*C}Ep+|6F9 zbqbp=ozxNPzU}FYWSu0Na1LwEGtU8+nE%Nd`PhFN5>-_6YolQ$;;hc1%K7@)4RBKub?{9 
zAtTU1-Qi{MfWq_K5&W;hT8dASbejCvd}w*jq5B-;8_N9`(+c>H{gJ-wK7q1_GH*5u zpV)bw*vh|U>_sOmg(#Sb;D>_Ma#<_wd{){h6gxMrP4^PM5N9*t((f~3r;t@eGyXiL z(m!G0mbN!t{)XuG-Ic~K7Lp;GeaF_ zsJOM1_FPPK<%(OdWwfRpB*$r@DcAk%EpC!!#V95(U?e3k3@27sPgQh3-8x1|6Eiz! zWdJxP^pBYP?$Y(tPb*l|(Ao5RNJ>#sOG%4G#L3doMqU-sTb!I-#4HS*fSh1C!yjQN z9F7S_WDqxXHg>XfaItp+K@bh63<9gT8o8W$i$xRwh)@MXTfpb)zwv(0EGl~HWR;DJ z3l3s~0d7^H9KiNBgoA?%;b0YkUf$5f$@0_@D?0}ugb-4i<7Y?@6O15C3GfdK_+35q z7K;g%umjwvBD}}`;*dkWdr1Zz^)*$8{kG37|oslxxiUiz-=s8!_vgX z0^kH<;Q#7@_OxNnj+*?_>c2GIDX-6KC!h{ChUU%ytNy0NM4XKgesy_)_5&lZBmS^K zfs?#|<6sBrQ*ME{AzYYX%^!c@hz$uDz^AaKv51|yjVVAlSjENERvi(LH}p8I5x@$d z-vSjpI~jZ?*mOYLfZqOpcrgF3IH24-AjAWM!$EMMM*t5d4-YSh3)qhY&MkvrFxdA8 zfWdh#Th{YdvSACLzT<^>jzPGJcA0KOkYU?Sd}fW?Wxg{a^ek%%(R z!iY3K!w4FH2N92WBg+2~_p^ehco6Yt`G7Ej##5XKyubsT>I6Z#zP}L+Kr|Wx58@5r zJ;jB<1IUkm2>A&>)a2(ML>5Gq&e|P87UDr903J-(57eiqe&+g41niVCr-cLMAkZR+ zMkG3`@Yx$6_9s!?!1^l;kO`-80P%?Q2pAj=1#u$i<>Wmp{W~r|a=;OJzEgyFW5NLW zg2>MUFyib%)Z%9&AsXtJ=ahN`7k>o;;nNmG5PkNX_68u`X^#QmzrIg7jesHIe|di3 z#6%EqR^i{@h@o)Wf&kIK#+{XW_Wlv~vp4*j7QvWv%Q$Of;Qi010KYC0h=3r7nLQwE zMM1zy!D&bg7?$5dl7O=L9+CqAa`ZG-9_R$$Lkb}1sT9QoBsTPiUQ-4E8tOEi3eXR~ zhtvTceh(Rfpg+nm0zrS2VGKGw>;bqG|E?ckKguydsGd_yra;5~2qAO{2x!?;_^GOa z{V2{71pA@fY=A!WJ%KF<_Me2T(HR5` z@Y5s+eFiMteh;51#M6H+AkH5Je%FD3b_C#-HZ}x75QxDd z(El|VpfLYQ2K!%=0Z!flB@CoI4G0SULhmmY)IXK@rQ7`jL@@3L<^KQ?Tsf=GcU6pV zNe?LXo0cXBn*zV@DY!&x|jeH$-&0n#m3SIm0Jc>Nd7csQAwVFHqj3lk8)KYoBo1P8jY z8R#brVW%N}AiEzh7?ca~tj0>;?e@_PiMC@-cVEy5D z7(feRup@pyX@T(aAY{$6vq8W8v#WCmicdG04HF6 z?^jSx_@8`%b0Bobzv+WRc>c%>2YT-x={ONm^0&O)f6~mw0jR~_(s9AK{)E911OMOn z0PXunyKy1Z?QiJ-6XOp)bMyQ$jsQgre8%)!ULIhi{{e#o9|8QHju)XT5x<{(5|9#q z(!vYl`ZFCDF!lO9FAqX%{tF+#Wne-6C(Omk(9*`#2@^5JQ?c|iJ)Mjx+uI|=<*Cw< iu`>fy4N#-+s>a#H(8=XgJi(wmFbF0+y@aA9=Kla7vv-*Q literal 0 HcmV?d00001 diff --git a/audit/dataset_statistics_plots/score_range_by_eval.pdf b/audit/dataset_statistics_plots/score_range_by_eval.pdf new file mode 100644 index 
0000000000000000000000000000000000000000..d668868c26ed88bb78d8a1a364055374c837f3fb GIT binary patch literal 21724 zcmb_^1z1!~)V~M_(hU;2bO`J&%hDm;A&r1iOLs|kmz0!%fGDL1NJ=Om-6E3GN`s05 zO8xJmzOS+0_xgO#AD`>aJ$Gi#nKLuz{ATXl!>leX%LV1;#bJIp4Jv(#0|h}qj;1y^ zA|fELmX{L(1eP*!GqH2D1cB8}ED^3C9w37TNK6a|;b4x;DDYDOSw{yq5bQJqSWnH= z24Us~;{SFl1N`F0P%h!sGGRCAzU0laO5oxSkubH+}gns#P|KFi=&w)!VP2q zu$7htL_v7Efxrs(fC!S`f2F?vDu9fB5(oNc1CVkfrFTI9(tQ&jtbuTKbaycW+JlrI z;ID!(w>FV<^aMCUfPega{M=BeAcT(x4&sCGaSQSYK!KzYpbnsoKt-+~!EcgEI5;>0 zcTi-hpY?)#{6mq-2nS0yD-iTMb6IOUpy?p6tR0{aX@r@hIRe=NS2q`gi9L>I)*AyA z*C_##rXAh6bU}=cy$DBVi-pMz3k%Z*4+2e14jBs)1Uq|$T_Mji|4VF97|iK4Y%vet zD6%6&gW|TZa)(z~RQ(6G3zH8Q4r3de`&8E)lMx-=l)hrbvdwcl*-kF;gN3p4Q=Csa zGzLUrBsXIAOHIPP=j(=^>+h^Da~`B;HwLFOUlbnXK~QNchu^4vkRx0g;`FxYWmymf z{wt`>YbQ>I;NG(4L@lgV!5-c-c}2|pf?q=Vy1OqES1l}YiQT|1q~?RIPs(J{m z5{)djyKCc|Wb4n}yML%5)SUBpc!Or#Vu`s;4JXx`S@A-WW}foCEl=-2S!FVRmJ|AB znOLfnvwQZD!gWZGKSMxbY3&49cjBOOaJkmdRiuA3^)1x9{4Mnp8Qc1_xA|DTj~>Xb z*!MDia_WuCpDt?R)a|UDmRAVlHAwATX-TDNDX#Vg_o@)4`sDUF7+lPYFmMpPJ+OhO z7jAybz9;O{b>#76T5$tsX3Ip-(Yr2u(CL+RQTv6@_x%%T$5KA7FIez#KL}VJGZ)kk zOuzHtP}AR6kui$O)z*XU(NI~ei`o~P&hn%+d+JQLb1VZhjmKkZOnB&xn`@W6B%*zN zvuAzvVjatfi4T2BC3Y85eD?*|t)NYVy(hB0s~fh7w>o;KhCO-e8!o8%2lhYY=W8mI z?Y8H;Ty=058gQ9ep^I0J*|j^|kI50%;#9w`nNCe7m-eH*(pF2wxgmdh|gF_E$`<)#0Hk(_9w zn>uYq7m~${*TtCLZhcsF_5U((^yJ3+e9f1#gW~ytnXHK(@5@EvmAEBGUm(X=$C=EB zYMA0}l{>~C&U3E>m`|7#7CMYw9d%lcVr5s|xX&iRwQf7iWA!vGG)WeRmGu6r2X zupNX`_~Le@h6qLG=2mdp;;k>6xBM;BJc^4-!W&%X=et3&w4x76EMISw)nyY<=i{tv z3~S@xR3(KrPcUKdMHNcxop|o_9@VYfm!vgEBN%pHv2RE z4hlQbmTkX*mDzLwYux+8j8*n5X|-d1r0n(_+@z+xH3dto!c*W+1MG4Ui1)4nCg;W}yb(w+L2JGnEnxmm; zx9ynT_r8}ra620y&%9V5;BlcduClwp^F+}mw$yZhrLzBJk$Fvh05iG&*uZx3DxFIm z&8n^v^N4bn)>ro{%y;Tlh6)=~i(kD<5aN3op6cEEsJggnK0(oe`EB}$@)s=)2YKe3 zAB`jncZ{SszATB11S*P&cESph)CL~m6)!cf;u0q@@*Y;I##MFux4P%~-DsX&J~yhf zqc~&+-rK$dF%5DIOU`qZw22M<-~gwv`occI45v<#m5SA9$Z~fn;zY|#uyc}KFJ~T` zNbHWyYBXC~mKm*lXcy)h+is3sKjZc!E%>TM73s9V4%8V5yUioG&hKBU 
z?pJCQW$QOoL^z@!L7N8BY6ZfQSFR5syud3A)3`Q4oa(92Ew-gbI zBL2dRO*wYAd#!ywur+#I6MOX9GN{Hz%Tv4Ps&3CQZmZ(>Bog&~$0^_O-zlg;0v#S6 z2>b^qJVirDq5;V0AKK4I@dqM;!T}icA8-gB+l!Abfi1PI8KyKhs6Hrrl|bQS3}U4# zzaPXU$7ID~$|(4L!F^yh>6Mpn+Jjj)$yaMF_fpT#+bvAM;KYvM5yMq zDiR2`80D5=+mK68Ene%v0Y_-CJ_Jq8nQw~36&@X7O^!45p=cxuGzWw8|F`U-NfJp* z5NwQXe@r!^v-GQ?Pb#1hGbcCrV{%qwT0rwXt=d!Jn{!ulYMM4~i60%SNra7&3I;qe z$VRx>(kczSQaSK`%~tAX-(GcX>a|O0{V01$TJ+QE8K@*3!c60WsP!XrmtxYXm#bf^ z<{Vsj;!9!0ujV%@=N?*wuB$?=b4Ee1ZQ=Yxf{g7d$$M>;TfBQZO}Cs8b*3|<(%mc9@6jk%C*LRr_dz=0Dr5PPxxPDfCpydp*7 zdI`F^LkZTM3Ik`Ayql?Ocw{4q6>?Nj^R0uQI#jiEXQO<)@zJD$$hPR7U3*+PCiex5 z$ru!P3nTnZ)H%AVb=f0lMj#Ah5TWx$w~iQRKSvmL@@}T>n{ntf3nOrs>1GdBIg4U# zNmE{w)%#Ej-<=fy$Sk@@G{pjuGM;?)eE;48I>n`Oi+%s&V=R)GSn5A}iwB9tf5YB9 zf`2j6m^BpQF`(ED%jCz;rF>KZRm|Auyyg0YNH!_P&b&W~ z%psjCg_bNqMzbzxy|k#3U7QWl#lk75SwLzI>C2{4uy=9d2f|_rWuV$#%4*03M3_9~R z(b{0?hz6ys@{&G&L6mF6V0Imkb5*)OCho%Q$((+``e(E)9m^dQ`u=&%^?hawK>vGk zsPtD52STthhjQ4d(9TL$-dfgYI;fQ#(4;;_k9K?~zrq+^k^5x6yL|6vL`FFCaofH+}hQlQ?UY1$}+snwYzIIpbf1)2;=1TkjOqzLmwReOQTDe3#F#FLM=SFd?$%k}JQi7*++C|y@` zIp#S(fKKaO<|lgbz`I7uuJ_vb zXD#_Squ~m6KBq;HU|ob^etFB5iE{bJ1PLYU>+il=O?XO&Fl&F(7EG-kWnCz%Z5?>% z$J0R)-OV}VvNn*Foh)He$^ORa3rU!$?zDYd%j4oSvxlxMA(pvpH*KIzcFvDtWK#tD z_{UPFv5qMF`^}$Ba#(N4nBlc8xd(H7j8AIM9=?Nr$=lY%{z2f|0f8Bj`n~N%AMt>@ z+n3x}{F+b<806-}Z`}ij{CDrbcxu&w0mFNaB%Fc*)qp_<;p-%}-EO&k?o2KiuN?(v zo~ds;HGOy<(o0!g*G++(bhj^i?2t@!EJ>Hs;gP zd5SrP=@+j;EYu1e%*OVw`Ern+X!S{2QCEwk6vzAPMBR3paE;s9ov!-&c=zqiwcV*A z+S0A)tgc;JI!4-tuQ5gRCnqlPcRzeVCpi{6g#*9M3FMy7Z%r5Y3*V{|SAqW=@U1=% zj4?v`r_g1t;`p5K1xBWXi+g$ocE?%nouyJkyCFGoH-XBITzl=?X)l1I>GVUWY_(tJ`Ek6jR~Z=j-qFfJ6OMo2N(SBj)B49nhzvJV?JH1`%yl+I zYc@Rby;v^o+tBX%9oz#R_Si`A-Fi|RNA}d3#J4X7-etTAnAx;>OxUBy!iqw66wryE z@4wXtk8MEqE+(>f&HO)#>7=yIJd0~7n5T>IoDt@86K-2Nb3rwD`lfUw=8|CnPl8r- z*})}nb)z#&#yx7vui25XamxD{Kw3SNfo3{e$~Z?cB|NRl_@9 zy(@QfD?4iDW@Iicsx;i|o}^pRp61uGv7C~*ehE{-e-uRM7;!z-`>NU7a3iF#FtHEad+9FiUAjUS!ViV 
zy95g{8(vwrhf1!N_sZ^*+?-~Q4YvRf&7Kv6;&5ZYJ=ddf5s7$SU^+UR$zG&_VL6+!kZ-wKb`UnI=g1&b@epEggxq` zBd(%Wx_9CpIrRJvT%?5Z1B{R7EfX_BKazRX;qr{1x09fa%03%n%yFq4{d%;t(w?u$ zjmz(}+TAMJ*hBR^jWi><>=F0lWq6xA6zx6ksF*I-Ek4GTh0U5I_HVtfz9Z;q4kOw0 zijl*F-A;1xG^?7oh$WiF-;%+c(fP1v_WFxdOIZU{wf!bShR{swu+wSYtZw}1%jL&+ z1GtBjO;IQX1r`A|mH&Gph$c2TqXV9Qr}Qjy;C|BQuEncPTjHO&RqtJCy-0aiuDoCR zKHX`<1G}X_28}-XVbkHP5Ncb@TQQp?oqTpry_i`-80NVT8^f{VTUtB}C7Srw zd>7WQ9ToYIUHB^DJt)DpxB2E?AVP?PM4pPgi8*yAsm^6?xbFFLLwl^#i!Sde zzR#mhihE~t_ga2&(z_9_1>Tjuy3OdVNk`5Dm1*X4#vzk=8KjQ8ad5_!p{dnHn+2;d zZj3txTpWt4?VDTt)LDj?t_LaIB-|VedHP&ZzCAl(rZd|qcHpeis*Xb(`YeajoSx;q zv$#4t@U7Zb=dr%$Tr3N(XOd;^T3u!+lkFwXWfH}G#hfA6<1zdEa?3!Bt^T%QC-nNe z$-XD&EhP`tU89);lgpgh=3}Jxb?kyinix6vGW#|(h+!Wz?$VvvY_83}ulDI3Pe#oh z<)C7=EYm@6&R0toruM7^Aylt>g1=rdvUT63cNnxyxw`nuvIRFj%Kt{3j$KC3l!0BY z3hiAXTMC)%eCCW^A3w+=Kig;49(o&VTNQ87%~A&(=j&M{_ZtJq%(}>|)XFVHy&E!I z8yE_oaH~;F_!w(t9-e3b%hNAk(R*|Y8c+-Y6!;Aa`3nFC2e3rj6+Cnwl@{Wh!8Ee| zMUd0{#;mNrK-ifDjEjsU=%wJs1iVJ;9XO4zy}i=~IWgUD#*w87r+OuUQdipPt;p1$y_dO za7*+e*7C7tj1LJ@72~7aH&FVXd;RK3p1ni$VY9i-x6QgX?i6ztjakT=Q94S`+DN^7 zVQWQEKyoc>EtESb(`7;9rfT{$XTQn(9T};S#Ok9sLp9Ugh4|dXNqy=edDT?WD76!_ z6y|lv_-uszbu(w}R%zQhS2^}5?y+qMo z6gZEU_b)6UJV_ym4mm&Fakz(wO^4iaww*kOn@?VYi+A&Jm@v#)`s(fUxDQ-vR}rv? 
zqmPBM6C4KzOCDiQ!Ua1bR1QR(!~IlZCbOw>?8R>l)~5LAji0|ay`1q%jKftkV6gvw z7P*dySaLIknR62x&6E|9w#Vc4SGIfh72Nfmrn9PX_b`^p(w0N|3<~Cx<*)*m#;Xdr zzIc7fXZ*NwIJq-<;e;{bj%$$YjB2Ssbf3xn+7)$8wtOy|E0vDnoRVnGl@(l#fhKux zvin7r{g@N0W(f<=-isyu=KBWeQ>!b)5+CbxrL;qfkk0@}sJ zn(^sa-ZAXlt^0gUx1aC22P$H$**|ezD#o#OpKtw|lN!L!rs9Spq3>v-F1_qSroub3 zX9*u~oaP7@wV5HUzY>@bDM&MlF>C)d?<@Z1?8%vq^+J9WBLD@I=7IkOFi=TUa(Hp& zyM>Ml;9qtoL(ESH+&Mca|D>GlD(`zIX%fwIeo^Z3L#YLh`U^B)o?%9;ZVh!bF>OeB z)v&jdWfj!0$0}TAx!(J!TMtx{nEiq%XmMf5y$gTV#ARc?DEP@dmG@nS+E^PZ#~~4`b@5w!T6A#RiAiqBVZ)(FqIHiAX)jOU z)c`}04vXFG!TbtF{RwA5?lERVHkM{r_3bq4W@{zJ{EhX4olBkfYYaqNWZDZni^uv_ z6u9Xhd9^d1Q&r)JUbffLlGSCgB%Wz$?TJ}^q16drcKq~)PP=-++oU@7k)uatLqbEy z<~1ua`Rgj=1+T$G-JhR|ZEUJfRN{ zM1P9Hi)tyt_ZJ&NaCInhuX23{N6bm%`IV%-4Qu!Ijgu*uw84Z_hp{jN0>{u_tG%}O zWy%dYvA7*Ivv3h~FKHuS5B1u^jxk7_#ZwCMH6rv*p<2%_op})zE&f8^(Q)@d3*Hy= z=@>%oD$kn~qzXH!le_+magQF!(_NuW9F7;|SakPX=u+C2z2nY9RY=#}g`#(WQ_;yHO7alvTe2BK~Alis*M`&Z51EG1c zVs?|f+iK&)lk?@7z8*($pY5C(-64AHb?Rr-(rIU@U3ILz5{EZ=?srV3#@sOAjc%u; z#%l7r*GRQ~MCf31!yJV=Q7t|GX0;Ai@3zN);vF>L@DW`pyyeZJw^XruLZdiiEJ9?W zRM6FTJ_zhv%m7d1-hWN}V5t7Rum*ll*MA^k^saLD-?dz6@{Fromm4N*ZgY zc&3G}{sWO0SM#4~@az1%+NBib~fkh zco7TTJMa;<5Pq~pQzLd2I)k_wsqSK}tEweCH@f{S3~)HhNV2c!XTN>rPEmyYE(lg@ z^rm?bUzy{~*6#Vw7sOu~d%I9MjmzNa@ryJa__?|cP_k&grPC_E6K(&cz|4OOABMfQ zKIt~&pEQ3_$eCyO!!y(3XUAtcCXI4X=n}<>g%|o4J0GGN21q=8a97hQNO3+p%d)gY z+~;I=vd`{*e=Ek>8u};Z3PrxfiX{r`iH=QG2ELA7H>e}Qy zNG{Ilb*-O~uZ)@*W7&S@r%JE>gWQTy&aS_zuCTIAB{HJLrn5k1a!Z{pH_OP z7PKi^K7equBM1eizQAE|=V8-_uiuL1%Yph(wS_m$9k1T}&s|j(iz{eCFuO(e!TkLk zG6I-JmU%K&%)1nl9Mek~d5;)z=1eE~q_5zY?&Qfo6%WW9xgU$7uTiX>pnS0ZUY)9_ zsZ<67E@h8Q@T|IjkCz)QrT59BRo&$;6xpHYNXoAf@y@=Astjw0*M2xlBZBdGIDl$W z8b@5iU%p2Y*F{JuF|Q7vT&_}LB3m@CjIH=l0aPW=UbjT0{yiV(!r{XvwX7xqwfn0J z$<8$?oW=?P;Pl716K3B2FCwSz!kaSmvR-4p6&rSk>}VYN+|oWYTV8o|oStRIFVeY) zuJuqv5k*^3EE0Hl|6*LrTvc#*0l{lHAV0i4PvQi3iSI6G4;vg(e$S_5x93PVt-}a`#xX65hgFB|#sJ;ih!s z(;{lk?YA#3y)X4naR~L0pqq~xr+rMMEBI_*kA5J(tC6F9^^;KEJps3y!PlNbKl8q^ 
z<#X(J5-bXfe02Np`aUda>7?k5yV4}qsB8z*M{9Zej5tHg!mJX`*J0*j)0&1*to;I+5xwW$TDp*8Z_|dq!t&w)6B2AO{ z(-sDN6jGvCZ@}Px0h2_NgySzl@j?y+!?+=V#c7*6hY|@4^#Kh9O?MPS?c=)CG)MV3 zr>|d{a+4?a%O8%o z+76nd4aKveg4VU|3vcjnMF(E>zhm?$keyGtF65ZrfEoPy2qPiLJj^j?{30K`&Pr!w zx?D|h8>Z2J2;#K4W@+`xh@gB)W6Xmnw`dWK#(D?BK$O5H$?a$3xpGdIkHwOmmc+m1SNr!+p8^$ypb_=FO1d1ma&&dY%|F*S_xgW zs@sb_Oi^SHnGRIXVp-##ylS}fHTQKR+Pd=2jj^v~gJADqab^@PM}g=0|6*MXNx~nn{CLA~r8y(7Sdu#ujihHjSV2+`)J>aS0=$ysE=Mzd`yf13T?RW9O}YY2 z&zkzoHVXv=g*YhCvEX0qp{T^RAb0H1Pcnju12G~tG`0N-4gzqkIX;IFmfQEez<6PW z{tDAgUjHCFDMJ6nC(sX(ok?lg)jeVkZLHn09)G)>*T({ zb(-4>g~?K1Pi!SPRzDesP{3RoPI| zBi-E0msK>C?VYqdgPeLTo>=N+Gi&-ff#}LKT8Ft5W*WtXcYT50nZ@YmCOx!G#|*d@ zN@SjFqy*R7V68f>S~BRvu6&*u84W*RJCFeXr zTcYSZTQBl|e7di<<)W7G^63ePu$Es*%0WhPNv?D5orQWet4t@B_Kqgv`K4X^PFk(@ zgSg!To^rbvP@>dOlkI zdWl}ksi&-9{3+SK9jp%-qVo20|PP zuL*W5wu3z$6@Bd3)#1U&aJ_WZ@l^)>lyYzhSaPv#C47OCE)kwJF`1?>bjLtgGK)rC z!ZCGmZr&6TQi~8NG!yEs*0UX4$wR+5xlm{T1rCD= z{Ke}st|Wy@+V6X0bJ$QtAvIyz0F&8(b*#}+X;7DW7pFc6%QG$JQSpz@e$2kUOO5YE1EsZS{qyv_BY_9fG|irAb#xP0wVjDE4MpWO0;<*L z4au3QdW|{HlU;}wQ=>wdF~Zv835A4&gAtqO-#4#taZqHfM_*#Z5^hF~=Y8z2FOEx@COO~FlC?(f>7kgmf5lbtnc$TTp(JqX8#ecb zB=6xpquw~X$?#Gu0iqn?NG9!35{YNg zxzs3UN*As$VJB6e2XA>t3GQO8Ae05G65L5Xn9A}tly~YpKhs*U2wm0MdQWQj zuGQy-Z;61h^1G*A-ML+P`2)6&4L*l>gpDfgzOMA(o+#ya+^|`Mt-iUQxCPf^bCRSb zQkP8Krgf(*bnMh|0V=x81jGt46=L2Ba$C(2k3v}tBVCV0p!+Sy1ENXwhjRCF>sNZ_ zE)X7%eLX~LiO_w3LTf0XB^3G>69BF%NPsSZw+5Jixv9lHJ2HFdmjIe=Hfy}rz({#W zs88ALyn+uT-fW9fPfeXJ#fos^&*N2J?Rd!f-ocPb|GiBlv&d13_x{cWir2}hdL#E( z$&acy3no3QXfL3d$(98xp8i>&$*it@fE!d#vcfFE&?>p@}GH#Hu+jYMdb*^hD`n2>G%4GD` zdb3fS*;>itNu)Ss-dmwJE2gALC1-PZyY98*@Tl)#ojzL3d{H1Ap!aS`xFzL42w1Z%0AGY6dMhuY~qZJK_R zI@M=#<5>NJ;O4Tk_`CJ-lTOAy*cYyy7RQUioin%Yn)v8>QsJ9=c#1m&ax^_`SIlNJu&zHd8F6{!kRTg-SL1>;lz?i4!EJ$3nz%&9-K}%(_@JK5V`7Hl zlU|s7MJC9C>QOQajvOM18sS!yo^QllY4AWQ36We2z8z^zyvqyCnxFV@d7uq5R}@G9#S6*7n130m|3Rt_paAmm_5S ztLASX;d_uLoh6O7mO;qX#jf<#a%94foxwU6hicOYr zXW3r6xdMK(p`|usQ?`fP*PzKlzZLy8#k>{lq2PjnxhG8f9=z}K<;U6I 
zujw21>nvb@#PDAC3<%2122rrK5yVkHYD+h#$tL50lDzNtSikaSPUdXZ^EB6UKDAjr zd_&Klcnpo;8E^>L8|+;qBlle`*>iQ9fK=q37Zcbl>?$wHY3Riuio%#bQBKf+MmFg^ zFm;T8nP*8)kz`8ONpuR~^bUNKZ4IwPOyG*(z01Vyu4t*b>G*(^U6=raxcdo5{E*T` zJ*lkiWU+M`;=2`JpOOrG+G-zI+iaL;!IK~zqtLn|caMJD?D3#m{!LuZIaTjRUVu*?;pnmdt^*} z8uY)sbe!?#X589}w*y^k{5awb@ypx!;Wb(V8u8}^9|ogwNH|frkb1O!CLH6>eyg2x zql`-Q07auw;AtMdzu4=8U+uMT2i9voIPu0un_kzieoEL1;JTlCg_dLZRskakjrW&m zd$#)hq|tfjL)G#RoKxrzHG?h~epP6(44bdKO21Zz)f#no^pXos(mCym^WDa>VAhqO z`j%)t(Ndq-Hy1?>=a=!qIbfvDWp82xUbt}2zFDNSCP^GX<(DGhPNd0yKcZ_g1JwzblqPxiXz-rcgDN64rUF1m&N zG9PmyK_4Q6>6@e?#k4C)5sbgU1tAF!C``h8rAviCZsS+-xCd?JoHZF}sCEV$_h_hj? zb#olX%DKU`p_AWElb5m+siWYI0&hYgf3b4}kAo5bZ)nhcuFtaul2$H>)?TwuJP7cK zPq}=i_qo8E`N#*aOB0~Z5zv|g_?IbtiX1s8_mJ&n)fb~J+=0=>t8|ylQWMY!jPQ)K zMhL5|pAanHWf>@FeSO|&cX9_^h=5ibg|sNZ8Wc109N->ds)GZFAkHk3bjYu zKOIk@u^54J>|#Y^Jj>rHQwj4ulZYelfs-8z#X69wQwa(4q8?Jkf$z{43%ICb-4If@fWZiUFoSe}-%m@29I6q|fLD2#f zP>=`s*6BY16Fi|e4miUnwOuBOPet_h@tOkB2L03tydXR41+-JGAfln(kRf3<7`3&5r^e@$mlzx`%&1OEJjq8>D!sFtAfn=-_=~ zukw<$ADg5cs%jxY6b{xd;H+}+U2mgN>)d9V?Wpjb-qhS$eqdHT#W|-1*d;D1G&QFSae_ShW~S>|E62&LxY?!|VoX?GHJeHbM_F z2Hv04i{JX%S>4^Yv!6dL@AG~=0jI7+P?v&0*rZP|sxxy<&vmLZlO8)K+-ZT1Utj%; zo4KKKSqq+m{=Ady8~Rw<7NCH;ee&BEi4c~)Cpvm2?G!efLg9jCT2hc3}8FM3E^VxXbwW2+VMuX zID){A4uIHhHyuH6AfE*goC5gY+5@1u0usT25(sNcD>odlgEcavnWLSf0|;mcSONqF zB4S8^kf25eXa>-B5I>*?1rS&f1XcpH2m-4BY5{@OfFKwk0ibpbKx-fYJ{+(P2&@Z2 zei-NOV2*HcHFI=9fCPaZDiaXc6yN~@n*(eCIY3|_{D&0?Yz=4&cx(#-+kwFLAg}`< zHqaCtC=^(O0|L7OEdVM5fjvOrn?Se=VETIj?E}661;Sf^b*;_atN_&^!NbpyGERH) z#|IYw4wIky`m}RUbu7?cI}=M+K(~L>#S*S&$PgevR1XjsSv&HEiwC*jl`?UXKh-aY z9}4J2@B0lLdD=k%2!CR2CgEUdhX4czYq}xqb&v@a6VKBY0XhL%@NYnr)Ak{oV*ug@ zUR3;lxN!cjb?^Y+G$1bkZGhmw7y>RF0Rce}A8^Mc6aWM` z%?ms~E%!a&KjnZj$h07^@pDQ;)(_P8T?icHFE20~kaZy&_(Mn}j~{7do`2Fv8GsA< z9Qljn|NYrN4Lq#}`TR#YAdQsqv`%Ebzy*9X2;u=&+sI#}0+5}CtOxlE)O%VNvL3+T z{42;m1Y}G8xj_~|Ht9#dBgH~q$PB=R1N&a}X;uG}`X&VIR57Q_0Y1oTkwPOg{b=xy 
zUqIM@gyIJ_reT0DJbeZTkIavx!2xT;iZj{edZZSACIacx9z+WL<2oG;K)%xv1JHl|J=HXlhJ60h^}S9UqyRq} z{QEC*DxCHpAoS1Ae(?SA`~BHJqv7YgNX4M!@uQQ0-_vj2kQU+8Yy7Zu5+Gn*f9k>{ zfywwSB?b7TZz&lNU_DQt$^u>gEhPs!{d^QK-wGg}Q!9!C*m%H^o#s;lM&GxTGN8F{ zDHRaUcOI%Bp6@)=K&PKS0kx=uV5dBA0FMB8r_)+AfiC@)(g6YecuLa+49K^X2?+L` zhbb^zzR}D;uu~p5-<=ZdJ0HNgeQyik@xHg^n~ep&$vCao0>t~B*Quui<}*OX`R4q9 z4c5~YHl@KJN!^aR9#toEbTN;0gjx5`IeoKKXQG4bb8@pU(Hag>E3= z@aAbAq%Q|fP@UG{4mw>j1G#_lw7_&Z&FBG4fN!arAYgi&(!4-?-kksN*uDOo7!6m$Lj9NE+#&n2KgX$G{Ri|6?}5qyL)? z_CIEWa08C>-;BKfOGW?y0U06O0s!luyeLvP-^KpdwIbE@2N9{U9}QFi@Z7gv{DxRa-ExOWBsXyMLnQ?s#$V_*nZPyAt~2_h^GVvaPUe!r)%|PRM)Jg>a_p zF3jGP2hg|=NgAwIy$LM)aIFa*4Rf<8FxMN*EFEws+-jk0luSJvQ#3ExAzeiz;#Org zycIsX1&I%tM~fDPWQBCgx_2n@4x{;Ds!II1AVQAbU$ylu?i*Oi+8haE0egxZp{J41 z+yMY}I%K}b0s#)K{99NGh-QWa>TCe2cCvGHv$Hk@dBC^<^$Buwif4LO65e;|kNG^n5;a0cOb z8VuO)`kls$w8hB3pUUz`G6I{uVsNlSAWz8<%c8f`Y$}7e1Zahv`v5?SU~)m58#V5aKDs=@CYD{@UJxF zocn{uhqU3pU&q3aIq=vQP*w($D^u zPXIWA_8SdYcK%7@5%{A%DDNM01`35EUG1+tfF;o%G@yHb*C~_-=%nAv@*tNZztsnW zBC*u3JYc-QX`bI{e30M!7YY+VBEetF^1}WY_fTN${!t&V01^rQS{4rZQ}=LQq;LE! 
zA0Pi8x`+Qc-=J{8KlAZHfsKdX>H{qCAF}ZA{n2m0R?N?RMlb~wi?r!1g2tk5Bv7s8)CgwJdRuG<(r>;(B8W!#l z1AwihG$4wFmpcS1=KzQxesY&MxywO}eiH}wX9GaFLFrvB0O@{;4^_8tbMkOC1KI=1 z5AavAFt;%gck%)_asmH%d3ZUwc=-{$NC*!X4<|1i32=c*0PO=Raf9&xlv2#m(Fynk z155m_6Y%jbB`R7tTDe<8U?+^FZR~+&L!i?3fG#90%$&?EK4$}c&1x}>j@)k@$!kIPpDpBS;f6p zo4%j>Ek~10L+fynX1ukhV$`>><>vNq_Oz{cC-G6#qy*=#)xqtB2$A-6+I^*&jfZi} zw5$}aHLGtGpbe0%jvB@*3kWYfb=UXm3AsEwfj%{}Q z@=-o~VZQMz4UcJ#q}7XFZTuS^2R9Bb`hSzmOK1H-_%h=KA?~AGtVJ6kPNMPj<@2Lf zPP^67-mHl&59diM{0xO(N)GczRnL)g2RmNNtSawFRTH1*eQWmI?bQ__4MT7D^(lG< z4VN#0hbbblc)R{PO$uAz0vg_3nrLv&yI(eOjficUQSe4hf;3ApE5=ohamBj$4V^c5 z1y&(BC7;N?P`}w;q-b3v7VvK=7=&XGnvgBz$BR2y5Ms-_+#QOlJJokK1Qpt^!2`n-WAnv zC!bEq)VJWILEpzHH>Peh8r)jcURNP4!aMl%Bg0>Co@{UY_RU-0a5EZoQ#}T&qNIc3 zZw^+}%DrX7W^|hiQ^v4wzsYj&Zg>BO_3md1vMc(`k)JHq)ep7?swBLyR}NY(6GTsZ z+`GO_t+eWXBPpZic2@>RlGOqhla;>;Qk1;c!;nW`@}*zXJp(?ybQMfy>Q&1xiG*>_ zOON8d4n{sx(Rj}BMj%w3IycP4SCss+|Fh*JnLO&F!{NCh$ebb1bat4!Q_8;D#r?>8 z<4Agy=F&XvQI?#ME97nS515Mzq!QX*m(-P%rz{zCuSk?&Wf!jP1rP1UQsUuhnjFsH z#ng6$tJHnW+EK=X@ut--tf5pMs$6}gPuccjI!n9}|! 
za+5rt2>ADEAF0}0%QjHjO}Zc>?4FDT!6Su_*|yPaHaNQYY7m#5MzU|_S&jOHF|cgErT^MLSl3Iv^<~AhYH`0LV|BXQd=aZsz4dW9 zvtEdyT09PnBGkV|lZ&<4XRfH_E%EhF2R@I#uwolf4=@uD-)-u=ZnZsdfh@O2bH*>& zmpi|~;XR%F0g>4?kpc=EN_DT*M&W{x6un@PyGOc_aI~f3D6?Vo&<9jJGf~RMvgFkI zg*)%>z4 zTZ7upE7U+u7%bj*XnTZrt11oMh|Rm2;5R0)K~Or^@F>dtBc(%I;dxvrzy0e1t*fC^ zxs?6rhK0}b89r=Fdjxl@u7z`H#k*^-eP!~Y?`5!cQoZD@;`!8P3gknjy*-FMyQb@b zrc)Tl#9vXoOqh|aZxYGNf+wM_^f94AEi`9jI6yxk$5!)(i&uvUy>c-{Za(c~{*@{D zs+LCl5ijj&gyt-fzW0SZxx(`M3i&;(Om>u}5R3;*QFv6>U5COniahlnI$!Q)_YQm< zy7n|+pOA;Otg7c}^Fn?Sp_rOp234N|>m|5Qa(1$Yt`q6-?d-jX76n{N7u)`G-sy>y z{kVELF3JzQZcKMJ=N#QjIkyQ=W8?X01}#S(1;^Hw9e8 zRdE<6bogu@I@0S0clSg$yb`c7?CPa9ippVmBFD&(q54UGT<2%YR<_Xkqb3)nm8J~2 z8ei{Y-<0lVW&6@aP*)T7!4?48K60inqU2CJAJjiMrcq2&~}mtcEu?B*={9$CWm?C zXC^p)G7Hmv;XTdvsl4F%{Wqf18i_89L1UNfM5_zW*Q-Y}7`@u2>?aYHL1LN@$bS-V zE}9}?%|N)Wv)e{%e!Hi2Lv4rH;;BH|*sw8WKj!tKErwh6WL6NkS{Mi zT)1?bp*)rjX>nY?bQN!N8&4LASMi9Ct{tc$Rfn8#Yf@iJZ3mp?$a(|;p|ag{U34fO zv9Oelra?4&UYHdppS=&D(;DRlC$KBjl-lI_!p)!AiWg4%$)ZujMD&e3`cSj;YML=L zny<<+?ftn3FH6POw^f00rtoc`@+)8~=6Ov+t8*|uUQ+{{g|sd`t#XR4_}Tq zRj}gbg;cm|jyq}Uc;Drx-2qDc^9l5;uLugnG`w+7dfQWsZ{fHjq)Fr(qisoq7Zf{u ziPZ$=dr2bI&)S4(u@YQBb3xJ_Sr2UgYK%m_Ug?ZJoA^_yQtbLt=QOAH)wK$@VjK@3 z!(Hbu4G?T_+?^F1$EskP@w@6PG0hkp%~G}Oo<+r@_HLF=D0y0;9{QCWvy~_R{+Fk2 z8s>wEj?#1LLH6#1yV4uxId9Q=DN?G0e%e*8z!UVd<6*$aL|}dy3D9nS#(n<2<3@gE zN$-o;*A~~yY`#u$&}kKUw;U__zGX$T!l*Tn{TY2=bfyrTV=7~y-rJFx+M+mPtf$f2 zePKf{)pvCJ`MSg8Xk)dX)BeO(!`cz==mX+3mZdMon-F-${`WgHw=-wFHZGDFk9|(L zOy6w0GQz{)KCv^3&yrXr$8*ru_D!SfJf50sj=`OCnEbZ|{hc@N-`S}2Vsl+^YV>vO zDJBi=H&b#NK03o8zbgdd+!J(mf_RUi90>2=+(_6j7<`NYL81Y~=pP!0eNW>+7!m+~ z{{aG#QExAyi(yHuYXmDyb*pwutKrHW4RTrQ$nFGk$S_zlnbPxrneli(8Q%o&r zck#aErtG8()AloCyjsyx&qNg7&{`N-KMz&TY?jC6Z!&sZh-FJ6Mn0?Gi46@^XMPG9 zoibk&jLtheJU=qT(1oIrzvKE742wYW{hpRU?1GWHmi&dc?WtxWGV3guVn zlrHSMNajLsMP5!bA*<<}VZ#o8A9bZQr7bW{R-QC&r4U`!vG9CKxq*vP=FOyK9OB-% zavAdQ>E`ZlZOWQDli|KTm(U~vi9b`n)Gw_Vl-Y%1(gz0I!w4AXT2BXj{{wNcd zgz?DBS3B7==%h4dmOHn9{5Ve-8AXAjw@}PD;QW7KX^iS}F&Hqcx&@LSuVhV_*9+9_ 
zaC?tlK9{4B$A`3e7HXCgP>RsK;YsAVZ6U?Wjj@iD>Tf>ku4Z(a%Mn{`K zu=uB%NfViW`zXa4`gxRJ<5PlKvTF5vKV{c$7w!fs+w{pAw*P!@;9^o`EyGS2W{`lE+N7?rMxX=sjU+9Q%%kAP^-iVkUYK)k z%7L&W%p@RhKa5d@TbNp(iAQovn_i|&7PlZ^kEI&=Mpc4S!hL25iyVnUB5>k+N>vDM z_`7{_3p6xa4zV|8_5b`0@~qmgQT3d4+<-vR66Wt ztK^3}-1i^cl=l`%__4mE@1-TvxuYq_U?8P%UB&eW{K9*5DxVTRp)317&m|1I^v5o& zysF73sbS0uO!h(|btbuu1%Rv5`P{XXUrq3pd zWgD?#3O3j0_pFD#B=0b4ebeGks_JK+DXDIL|I`oOMi|k-{=s$meQH{Qm`MfexbrSy zu#nD}LrYU>!2`3WZcKNq9<$uEg*Di_6h}%Y@^|qLCXSsyBzyPH{MiVb&1WeyoR&F{ zAdZcg_|~+ZluI-|b|wx_0zSRxn-F}ly*}$J;-9fjEJ& z2My*7p(T^_X@j}^D21+EF~G;QMp3CNP8VPAApw38nkKF^C1r|)l4zNztMRC74ZP;# znZQs=c@#o{Yec7nLUR4L$spj>s(?`3=LkbcNlyy}S}{80l6G9n{igfp&OC4CCedZ`Mdvv6$CfCyo?`FFx7ZV~sa{bYrmcnmPl~Y(N^4naO@uimt0H*~@JR zEhT1B>|E5v2Zb1JLFyXvZD;eQsT5_x^P71dmQPcWx=d=6nW)P`_V^yp>JPP~Vb&H+ zlTOi%z3#hXsgmbtHn^jIi;d_=vrF8XqDnBaAm+Aq_7zz`>=rQ%y(m|98s~NTddrDi`lL z!Y9%prSB)|Rj3jO?B7bIumdxoBt7)$r+2X#f8eFbOG_6QIMeR)m73 zMFX1BnO}^NWgLzgx69qRixFmWPHc}&C=N$(*z@Rm+1)zUO)g5b`+8kcO_0=cwy(5A zVSg_ zqmUg1VB+QZZ}lOg>Mp$kLW1uTV+^y~8^YR&%@Z%9n{uY9L%k-h@wi`WnLBe?IcV&r zWEke0VGcZ2Gooal23l3`!jzm%K^ELGj|ml|iH;%jLm?9?SnyZYtXAhB3By^LBsPQu ze6JGf#th{ietD@t0$buPdk;gyc9Zy(TgfIe#`~?+LaUP353tRhQiGQ<-%9Q`t$g+P zP(5|?ow8xdo=?TrT4TJLwpOE3*J&{2Zudj*oWc$ml5xjg zyPK?;Ud@scs<$EZpzDJgKonZN;yL>ViGD2c-N%(_0C*1O_MWpAZ-2yc$DMuj*8neH=@Y-1Ewr(#qR5)UB1o84fM z8;x)FZe_YtDW@fs(E36^5Zfw>sU^L^n2VTVFpn*WYR;6-Q(DJh(k6wbYsZ|m=!=tj z$fQ3>hUL>g)r&xZ0^nSK5p9K3`Qux-a8|x!3yaO5$*;>XJqzAA@)Yx73cmD^>UtSzgmE}n|-^(09xaJ)xgNe8w@9Je%Ic*t*KX&P}6y}8X*KMRiwPRJK`kmN8%_{`;F*HRsPF-#L}DMVqgX zWyqAhlg!0C`X*Z%pW*Rtn+6^4^IH`^&BPe_agEsWoTb7=q-RwxK zP;RCz2ojEKCx5KB_-5aJE`UzGOSZ?fC-n}69p=5rHNtit`&w^CraN@goCozGSTRja zo`zx#Jj=IcR`d_^eTgsciTQMkv23r6X9rjauo23VlQb|UZN}HQPW9BhdS!fF!1W3Q z+nZ1B+clxDdYdfnovQ4ZRgpzZ&Q!vL%OAgBb-TBnf4KZ(Bkub)4T+G=H53{~0j#+H z2KRC#C^!OBGRbW}Y)<@+TMijkSuBxrKSl)~K7H(9s7d19pTroqk;#_JIvR6pxg~rz zeo{v0P0T?ob-6{BizIJ(&_U-)kDpX=jYp5v{Rd-H&C74}W-c0%-CO?RbD+zd$+<6k zx6|ff{?%>yFPRka(ep+b`dJ0>^S$0P+>2c`YZ0GEoY?o3#u(2T-xJttpXZF(PR 
zr?+m{4!b@-()H|umH2_GTLfc3LWv8@bfmoGK4C*s$jn!GX zD&OYe$0oycy;CQr4uuJzfU&%P17nXTV_a|mQ6>F@ zsHC4Hi3S>SIau1~$F^WiTdS;fAl(97_!ir0D3Bp$Qf zUd%MSyK|9?L`%3=>57egZqVnMI6Z|weKmtt{jr=#GCya*Ya?cGDm-OThG#WV)<(mL zogBT&1QD!xy)$Xpv*TK2njKa;Zt(%l?l&?R!-PDAl}u_;XwmR3IfWk0DyBJ68n*EY zn-N1eTw{%GP6=PXMrYM=>-}4)$i);j0czi^?>Af9cdp!cIqf3x5QQ#KU|ks3Uvv#J zy8Kd$C=R->QWHUX_XFa0vk>R$)k*2we8FdCFs{%Oq8CByV{z(jHj$LK92}f4%Lv;f zy{b)k)I-L7UN%(He42}cM>ap>TR@iQCoVjpM@QOrwFt+r>Gi9X-FQ2Ll!qvUMS+UB zx&LCd0U0kBPYrq!4{Xj41g0O{)7PE7dK!-gpaol5*bErvOkFSRbvgH4tCuVgp@$oJ z(!*@~`*WVb&qDayLY4LfT|)enB1h85(;YJDAV9$QH66J~SM@b7+?n@XZB zD4ftpYUa|wLOE)UujN_V+Gn@zP|o?H-E>kpIvZnw_`$-RE`yxu1ex;zb3>Im9J}7T zS@au=2P2y!myhT}Q``cjCzOl$BDzd+s~1%@Sh6^5MJt>_*u~KrE6O?Q157f<)7}X# z_%X&+PU7XA&5j_rE5ji-KdYOWVKqWU=#QYGjS779b)&B`t6w9grbB_dFH+L+-M3Aw z7V{j;h1y$0{&w+FE^xkB!6BOC3kEwJtq@Y$pv3xCGLrBE96l9 z7+Z*t?F7*a(SX=6e#(A~Nr%16y-RD8M`zkr@_12L0E)Q<9QhYOQz=ft@wMnlED87L zU2q|`m>%;_IomD!tc*pC`-`(Ap~gACa8=n4NjXk>GnBh8F+-O=e`sr9Se5X8&e}?x zn)94BO71Gt^|#+TbRmUtX|M4EXJ_U--dvhAab2Cx4|+CD?vp{cAgUGT5KM$)fwZI8 zmwb1-SDr`Ue%#~E1{NzE3at4M+{a0Z^+~qT@prllT$;QZYSzDB#LVIzGbYaFR1Fid zkXTu=UG>D78lL^OgXq^Z(4DfS{GhvxIo=YGH&mFOKOAb?-@c5U!h7aCmJenbmEIaU zH4Z;f;F&iaQ#aj%D_fqC`iwXhrrvZ)yr?U3(a#c0&rcFxOQKafL3`#C+b?q(<5ek8q0j7=8%DU-4X(^!94g3!}@| zX0pHEomEb+H|)aCIml?p!qn)dy8ghX(MEwjYjtIRlcqiQxq(oVRBMh`!C=>-94BqD zcPsrlWhJ(V1qV${X&ojjf{CW)&d8ZRw=ww~y3}J}q$G#8h zw5_oh-@sv&s&768VSjSIoK+?o&u{1~eU5DCOqr_$I(j}G#@jPlLua0HUplkaLyC_= z6cp18IL}`!Diq|RPxw^ z&Zew1L2K3O*2NloQct|@9ZK6CN0E2mxCnVR2@#h|9T&ew*tOxWhP7H~VVr}Yc`;*l z5YB&YVaJiV1s}T}O`6Mc&W!p1J?c8eOA5&cXDQsYZM@@p*5J8qqe+oB47ej&$tcb@ z_+{6VuN>kz+TJinp-xoufxlV(L8^8*V8C$p>#%w7Mf2|YFzL>fFC9_JPZ$g0n<(VG z>ADaIy;VSmjN{xHAb9fO#g}X9mpV(f!=61h4P$&5;f&smIN;ln;tpRk$S|p(v~iAM zn0fPVU+}eBR;~UhJkm0*<<+3-Qp}=fbadi{g#OWC0V94js3h$$H9_j9gBGnc_JJ5d zOPv&CFUuWXv{_RlRwZhK=n09A0!=mL!p$2UewGH<>?MS0qIzkc`aDSUvE~C2)kfou z-Io;E&V1gw@cpt#pRtcCx$}?|jxMiY!@i%J+j}x5jZf5?WhoI3yE$gJKVL#%Ew7BY 
zPuz~5z9QfP@A>-DwBY5BGi@VA=_quG0{O#u5dXatq@Y*_~mB!ZlSLVJH`6N2t^%SF? z4j-Z_#n<6~xuZ+_KI=_ATkFy{fy`_^_nSfbwXpBp<90kw@0|JbgTspNA6(x-#Lpe& zk9#PLobQ)zW7x2fbx4jj#LP=AWFH7N7q$_IOnU-}XO-F}aT0|2+H%kutmeh5u}10{ z$#GrFR&ZTxXVK`tW=ETS)+!1?H&fWf+gkDEpl)#I^18@}=X%yCtOW%u=l`3~9xaCZ z3IZ&b)CV!ws8;^Svk@FRx^a~}o9y8#=Tp7;*B@Z)I)JSxmG-+YG05byl7ok%q{#wix z7|xx2{$Nh7fPx2Wn+IaCbT9ntavDa=kQPJqkF8aTv=4KxpSN*}tygRHUE?j0*rkzYrszdL}i+v+>K9}~1x z*ubG@w~-7vbLD%LG`IZCbSG8Veg z@l1zDD*GS-)^|Q2X?Fq@d?I&+9F{!_KD_6nGhN~##~8A5v7ZaeQFx*Cy;1WU5faYD^!9Zl}wtG>3=%b?>?W(ywRD_fUXbE9TignP^tVAUzJ&hyoN@tD>N9}^j z&t{41<@_|r6TV))#j#m`pAcJX!;^2ihH*ELP%1s>PWhEJV$FgJCf<&H2xZ!?s?=S) zQEy#}VKFjXXZOC(mYKbSs@@n&aZQ6C4_PJ>A~j>*P${p^b~1z$KV{fgK6~Eg(otVs z%HHFFdbAbA%^QPzCEZY;AQ46sEk^;Bk$(XT5Ij8Ez%B-lIv1Q@zn zEN2)RVdr#^oVYHegX&@7&l&a)Ib2K>yiwpw-oKcqA>iO%8W#ecFysl-uN@T8#^lTb z@jtn9mZ2GcdJ>YPDutDVCRvWAYeR8nor#p|PjP@9vp+X<{$h^%OB^&noIkeqPVJ&S z9Kv~kf-@>`_E+YoF6Rm?jMuG@+zP7ESeMI7kk~u26JuNYW_*WK7Qf7>u#)v`b(BO& zN~71iO_e*FnS-XA{`d9VNfeV54aGelnw#BX7D{57mz1TGQLV-iP8w-sez<}wv^a*= zW-fvGfb{ZwSAfr>0`zkuo?51Z1{^bmQqNWsgI?I4UvgfuqSHf&exK;=57}q=A<;Tk zC;sFOUGW(@vefPL0C_dhCZ7!27t!cV;nZHuS9mvScXU6ys>FV*J%Zp>^D0O4Aw)%G_N|4o zI4;Bct+wmg?`-_)7lvH8y;4axAcDgaX z*R)WFdC8l@m(h>}#fT~lc*Rf{y@4`K`U8yTV;)L}llb}@hRa`DQfc(WO^e7~+aCLV zc|NuG*>LsY#X>*P)^~hn?q%NZ?>Oe^^LNO%LOqM~H`=ze;TXwoG-^(L$+V-2L4{E9 z*_OqS8FuP8Wa{w913iHhgKOfcl&WG*NwZVarWSXqEd=w-1Ujm8?Yb8;(XZxWmx{fe zsMNE1B(iJhzw-4glddEO3Jst@Sa8H&OhkZPT40X>r*@xYPt=P2LdRF%kHbs3M=wcJ zzhQ*mJ)={l^tKXu^XjBF|JvyJBT0vSk`6X{naUS=Cb{D+b}o|~f-y|mH1#s3)FE=i z_7X2Lkxlbt&^OHGc+ya1+?0g<(NrHpNmpB+(V0N=GtD`(uqDmUUx*Cnn|)v3D&#X( zoUirnc>E?a>%E;*o$mn-UcFN5EjL5?G1bQsy8)!JmC5^uvx*N=1`Pza9#Cc*js!kd9hAb?Ae+!n@!;)@>03#ddq z2A9)?*_3*Mm3$=M;j||+N)*mL@s+cc+lz%Z2!AN5>*MHFnDr+s7}t{RVyzY3BJOSK ziru%k5;tl4zcUi+NsDg>(tR2Ph%mmb1Sy^5n^St znnhb5jMjUIsGpPy3K_~hWnWW#O`w-Ymp&V9%{<#~N@bvbs37{~ljVlqr}z5zXz&s( z2T&*&1;PW+{eQlz`0Jeu3~+A$;gu*jx9{o(=Jj*O^Lp;Y&MA5HGY_0`?xNs{3Z`)X 
zjVHTTM}W?SCE=^mgavt(;T``r*Lll{ewCQjzGFbhrKT?b+uSD@gV86OwR#ao8wJm2 zVIpLip9IDiO^M0Qgwf;M;a8`WI#V|&qG#S_`L)C6$-RCXGQYh0j91TUwP{0sw;KkAZ zR3}G*oGZN;mK3;V`#8+}&61!ipAWcZyG{?)R5fSxKl2V|uUp$NR-822Wpd+(>R0}? z1s9R|m7$|{`YyyS$7Yk$m22%2_cBa;b-l%kBo*#M)P98UMuP^sIM)AtWAd6LAR zC5o_-vXH5XTI{N3dxSiE7TWPlymK++TxztWjEAOm@U`x-dy686dg7|CcJn_P>nQf=BWZNwM3C zTHinXIMNR-ctU1BOw03ujLm-f;bY2rZNaC`>|RqE>WKP_`>%RLp9|^vHtTgIzpkDA zU|ghIK;f2aql1u?Q1Q6b6(c1%XLE;@&`%T7`Ug{sxa5G+le6aUhA^R?$;02eZ(p7J zkvx7gdU?^uk-9l%$YPw}B<&?hop0hnWA-Tk>^D(H~0a8Fv#{PW8+JORDR$M1>?AGIgSc3C-OZ4iD-`pXPiiYde`BaE@(;v7$}D}Liv z45QVcR|&^imT7VVAp#1udMew`cD;3$g3i3zl4n#BJ{E`-*(P$aTN|X6cYXD`5nG7( zc2L?=?=BX%Si*wyyiWq7a|!q0V+Wtq5&@>=8maDEb3|e5N|j!}Za|l`e(d!{ihPe7 zUza}S#N~XX_04^1gPFKNNc-r1>GuMgdqj&-Lwy14S+wUWo1=oie_S;hN|sfbv~m@y z!_~9DITw`3?`JP<&VGYvzQ*Se_&jmUcZAq4w6H6@dG`*fGY3Yu?w@zQum_=g565 zO2yW5FNdCx(q}iqf#tC zpVPWB-C-;ZWnK(?(G;O8ROA~qenrS|dI2Yd4MF5mG9JnI+Ld#1e3r}xH$mJ-d`0SW zxz~|BA)`fsxh11~?SLs&nqTNzR1;av?&Wu|8&D4#(a`>R2BSG$Y zD$FNdKZ%3Na|>ImMn=BrnPY~p>j6uF6}k;+GPz%ct>w+?uh7)z#a79TJ%txrH6%;~ z<;bHCqe@D4O z62vE*4=-Ar4R))UVl!5J97J_@VmGK=-{q%^W~f zF(*{5?+~}|?;DeN^~~E>eB;w$PX>x&VJ@Mt=ljUrQ9aUh85rjWyQ|8t`0lM$wpzAnEo|f>K^(^NQrxcwUNy5&GwF7*?*Gu`J{6coV+#ii&M?aOSr3D8bemyww)46bdnX%`K%Z|6^&zFbDSNS&^X9~ z`Z>BhW%WhT0u-PS4t!qopU?>z`!*VQHA!NIH_n{II{`GIqYQ5STSUu;YBZy7cXyqUmJFKxC!|&1az~n{$ zSSrIRqutG-193v+lVCh)v^3{HcX4I?$FO`|=IOdKJB0zO9Q}_>>ZEAKy5;Bw33!+j z93nj)$eb%x6iB1yoYzTzXrA9z6sQsgeDVIDs!@vhpDAL9i*KY0r9GeeF`3@M zV-Rqj*5FEUW|OVAJ~s_H~BocZ4q`Im~&kcbjrj z%4=epeHqjEFk$WaXm`5eJ<*MOD-B<*rGsx~dR7n;N;-AwvV1&$ErS25)|+>oS+&@& zd2%jaWhH(-XN~**5nn`Jg|QTl3sWa&cB9`AVam70I7zkKw^(+VgH^&zQB&ci?Q@oo z4$|rG=!vDh_*{5qt#*G#(j!`mM%>$;7$ zKcdA-4{@HM=G9Z(bvHLuENQ}#)0=jd9;c0>`h45TTH$g=v`W&#g~X(V;e_YUr=E2_ z{r&^}k%T3RwGiNYvj2sNpC>jPAHx6@H*vH0nUYbF)>GBHD&}fqVy~nLYTeb%UBcSL z6(|W+G&xDbkw|PPSU}Rk&CJ!t+1<$%0s}p-Is~fWVd{Q-7zD@w2tmpw4#1HL|E~9E zVey}5C~)xbAR!zG;MfE>7w}#U@NNwccz^;}y^@K$tIhE_3!Gel8wHUr*FPyEY$zz0 
zDscP*aIOOA%Yd^Pq#S``62QYD{&y(FFA_FX(#F!#0yx?MJj=lVf&ehGo3n`-aLfc? zUe?~u))tNssI!HujgvV9-0JtSaCL$}og4x2-ERUZcWYM*AZ-bp1_AtU<7ojw@?k^G zoa~((AwUzLVh|{B;DiJOgfmh=1AtaTpt8Wx6A-98a7qNAKL}I_PzVI70-P}cfvQ2E zpuT_;DTfhQkg)YrVywZ1Zoa}g1jM6;B*aZ2-F79 z6Ck2J1nK~R0%8N=IYVGTA3y=w+_0hUKur*+Cj@#EI8Fi>_1-{pfUgaK<0GItHshLv>GnAYp(;<$$9_Y|O+Qt?VrT!J!)N77p5ALdnGIxJ7_M z0Q>&mVI#-w1Dj(2;RTkr|36&V|F3nxf%oXa3ji4qB$5Zbu=)7-Aw0kf5iouj1c5;O zyZ{=KkNam{fcC50Nts_|Pv5bBJ^v>^PzPAg@dfZZ)(prDym;a8_KjnZjV45FT`8lS6^#k>t2!RdWxq+bo)&(~3myjTj zUuiJUKWR_~-~yk6caZ`CV3U6JJ17=-ff;}c8*x(gaaI46`Y8nBSTV=U0X|^0 zpwM8ZUk(0s2Za4cC|+P?8UeibdHf6z9?TEYkifnaHz+T#Uk=iKT}Z&KAi*;KqhTWe zUj&xt0~GM<0$cGD}`L7{(L$JPMkJGK~r{`>t{(;y9e{@Zm@ zCpIX+uLhsKgG1rC2LYjffA)*-ulvcff2`s6yr5!G^7z%s!2S3`CeR}s2a8{xP7DGA zQ6%7r#Ua3o%yCKr7?wX%Qb17pnUaP8e)RaM3}6I5Q?d~Fu@}V#d_4Rl{>ejtNOqi0 z5wOFbDJ4LMKU2yO_z4dc2>gVHD&+X<5FmP~K@cZA)WPs{T#E+Kxj$2y5THkoXS*9sWXKU3Ba?voas1Wq7e9%r$E06lRmhCR@!KWPpSh6(@aEAm-5of6=_GX=r+JQ9 z0+A31-N&mAKw~^0$1`Ox^}jI7bHdvb4C2RC-2{fh&y+U=7$^V@8;G93^3w6sV^{z< zHt@&)!VTD%UnV6DgB(Mx-=Bj_emxh1{SHt5IqeE6AqHfEnwdagV8u`|`2RH<5YbPv z{U!n^{RxZz$ptp#4m7c#>r6bVq!TJB>i{Rovty3-p67-7yl7$Pw2fW5~ipI?adYFIb z;|3SCPSbe)nR>^-al*z2ImI@1Niqpzkxu2Sm6|n8;sAVXn+qoP2-0B z(I0?c`@>FQ+<=k)QI`J?dx0V0f7lBQfRm@k8w|++$)K8a6%_fPV&N#7<6NXgQ9wa*mcxkmJyD97)~W XOEjs1TBrW5VE literal 0 HcmV?d00001 diff --git a/audit/dataset_statistics_summary.md b/audit/dataset_statistics_summary.md new file mode 100644 index 000000000..11ff1811b --- /dev/null +++ b/audit/dataset_statistics_summary.md @@ -0,0 +1,37 @@ +# Dataset Statistics Summary + +This report summarizes the latest Every Eval Ever datastore snapshot represented by `dataset_statistics.json`. In the statistics file, “dataset” is represented by the `benchmark` field, which comes from `evaluation_results[].source_data.dataset_name`. 
That naming is worth keeping in mind when reading the figures: a benchmark is the dataset or leaderboard family that supplied the result rows, while an evaluation name is the finer slice or metric label inside that benchmark. The corpus contains 40,495 result rows across 59 datasets, 178 evaluation names, 794 developers, and 5,299 models. The coverage plot (`dataset_statistics_plots/coverage_counts.pdf`) gives the first scale check: the datastore is broad in model count, but its row-level mass is still concentrated in a smaller number of repeated evaluation families. + +Normalization quality is strong for this snapshot. Of 40,495 result rows, 40,395 rows can be converted onto the shared zero-to-one scale, or 99.8% of all result rows. The only observed normalization exclusion is 100 out-of-range rows; missing scores, missing bounds, zero-width bounds, and incompatible score types are all zero. This means the normalized score summaries are a reasonable map of cross-benchmark score distributions. It does not make all metrics semantically identical, but it does put the numeric ranges on a common axis so that difficulty, saturation, and spread are easier to compare. The normalization quality plot (`dataset_statistics_plots/normalization_quality.pdf`) is therefore a guardrail figure: it says whether the rest of the normalized-score visuals are based on most of the corpus or on a narrow filtered subset. + +Coverage is uneven by design. The most-covered normalized summaries are GPQA, IFEval, BBH, MATH Level 5, MMLU-PRO, and MUSR. These heavily represented evaluations dominate aggregate descriptive patterns, so the top-coverage chart (`dataset_statistics_plots/top_evaluation_coverage.pdf`) should be read alongside any mean-score chart. A benchmark with thousands of rows provides a much steadier estimate than a niche evaluation with dozens or hundreds of rows, even if both appear as one row in the summary table. 
High row coverage can mean a benchmark has broad model participation, multiple reported submetrics, repeated submissions, or some combination of the three. The plot is intentionally row-count oriented, because the descriptive JSON is primarily row-oriented; it should not be read as a direct measure of benchmark popularity without checking model coverage separately. + +The new model-per-dataset histogram (`dataset_statistics_plots/models_per_dataset_histogram.pdf`) adds that missing model-coverage view. Across datasets, the median number of unique models is 37, and the largest dataset-level model count is 4,557. The highest-coverage datasets by unique model count are GPQA (4,557), IFEval (4,557), BBH (4,496), MATH Level 5 (4,496), MMLU-PRO (4,496), and MUSR (4,496). This distribution is important because a dataset with many models tells us more about the breadth of the ecosystem than a dataset with many rows from a smaller model set. A heavy right tail in this histogram means a few datasets act as common comparison hubs, while many others remain specialized or sparsely covered. That is not necessarily bad; specialized datasets are often where the datastore gets its texture. But it does mean corpus-wide summaries should avoid treating every benchmark as equally well sampled. + +The inference-engine spread plot (`dataset_statistics_plots/inference_engine_spread.pdf`) describes how result rows are distributed across recorded inference engines or serving platforms, depending on which runtime metadata is present in the datastore export. The leading runtime labels are `unknown` (39,618), `ollama` (450), `openai` (150), `google` (54), `anthropic` (47), and `gemini` (33). In this snapshot, 877 rows have a named runtime field and 39,618 rows fall under `unknown`. The `unknown` bucket is expected whenever source records report model identity but not the serving/runtime layer. Runtime spread should therefore be read as an observability diagnostic, not just as a usage ranking. 
A large `unknown` bucket says that many results are still useful for model and benchmark analysis, but they cannot support claims about vLLM, Ollama, hosted APIs, or other runtime-specific execution paths. Where runtime names are present, the chart gives a quick view of which execution backends are represented strongly enough for follow-up slicing. + +Mean normalized scores vary sharply across tasks. The lowest means include BFCL leaderboard CSV: bfcl.memory.kv_accuracy, MATH Level 5, WMT 2014, BFCL leaderboard CSV: bfcl.memory.vector_accuracy, while the highest means include BFCL leaderboard CSV: bfcl.overall.latency_mean_s, BFCL leaderboard CSV: bfcl.overall.latency_p95_s, helm_mmlu: Marketing, RewardBench: Chat. These values should not be interpreted as a leaderboard: they summarize all available submitted model results within each benchmark/evaluation pair, not matched model cohorts. They are best used to spot which evaluations are generally difficult, saturated, or mixed across the collected model population. A low mean can indicate a hard benchmark, a benchmark with many older or weaker systems, or a metric whose upper range is rarely reached. A high mean can indicate an easier task, a saturated benchmark, a curated set of strong submissions, or a metric where the lower-performing tail is missing. The summary plots do not decide among those explanations, but they point to where a closer paired analysis would be valuable. + +The variability plots add the most diagnostic texture. High-standard-deviation evaluations such as ARC Prize evaluations leaderboard JSON: v2_Public_Eval, ARC Prize evaluations leaderboard JSON: v2_Semi_Private, ARC Prize evaluations leaderboard JSON: v1_Semi_Private, ARC Prize evaluations leaderboard JSON: v1_Public_Eval indicate tasks where model results span a wide range, often because the benchmark separates weak and strong systems clearly or because the source data combines distinct regimes. 
The range plot (`dataset_statistics_plots/score_range_by_eval.pdf`) highlights the same issue from min-to-max spread, while the mean-versus-standard-deviation scatter (`dataset_statistics_plots/normalized_score_variability.pdf`) separates broad, high-confidence coverage from sparse or volatile summaries. Evaluations with both substantial coverage and high spread are especially useful for model comparison because they appear to discriminate among systems rather than clustering everyone near the same score. Evaluations with low spread can still matter, but they may be better suited for pass/fail checks, regression testing, or detecting severe failures than for fine-grained ranking. + +The PDF figures are meant to be inspected together rather than as standalone claims. The count and quality charts answer whether the data is large and clean enough to trust. The top-coverage and model-per-dataset charts separate result-row volume from unique-model breadth. The engine chart shows whether runtime metadata is available and how concentrated it is. The mean, variability, and range charts then answer where the benchmark landscape is concentrated, sparse, easy, hard, or discriminative. Keeping those questions separate avoids a common mistake: treating a high row count as evidence of broad participation, or treating a normalized mean as a direct model-quality claim. + +## Plot Notes + +`coverage_counts.pdf` is the high-level inventory plot. It compares the major corpus counts on a log-scaled axis: result rows, normalized rows, unique models, unique developers, unique datasets, and unique evaluation names. Its purpose is to make the shape of the datastore visible at a glance. The figure shows that the snapshot is much larger in row count than in benchmark or evaluation count, and that model coverage is broad relative to the number of datasets. Because the axis is logarithmic, the smaller categories remain readable instead of disappearing beside the result-row total. 
+ +`normalization_quality.pdf` explains how many rows can safely enter normalized-score analyses. It separates normalized rows from the different exclusion categories, including out-of-range scores, missing scores, missing bounds, zero-width bounds, and incompatible score types. This plot is a data-quality checkpoint rather than a performance result. When the normalized bar dominates, as it does here, the downstream normalized mean, range, and variability figures are based on almost the entire datastore. Any nonzero exclusion bar points to a specific normalization failure mode that may deserve follow-up. + +`top_evaluation_coverage.pdf` ranks the most-covered benchmark/evaluation pairs by normalized result-row count. It answers a different question from model breadth: which evaluation slices contribute the most rows to the descriptive statistics. The chart is useful for spotting which benchmarks can dominate aggregate impressions and which ones have enough observations to support more stable summaries. A long bar does not necessarily mean the benchmark covers many unique models, because repeated rows, submetrics, or source-specific reporting patterns can also increase row count. + +`models_per_dataset_histogram.pdf` shows the distribution of unique model counts across datasets, where dataset corresponds to the `benchmark` field in the JSON. Instead of listing individual benchmarks, it bins datasets by how many distinct model identifiers appear in them. This makes the coverage imbalance visible: a small number of datasets cover thousands of models, while many datasets cover far fewer. The median reference line helps distinguish the ordinary dataset from the high-coverage comparison hubs that dominate broad ecosystem-level coverage. + +`inference_engine_spread.pdf` is a horizontal bar chart of result rows by recorded inference engine or platform. The x-axis uses a log scale so smaller nonzero runtime categories remain visible next to the very large `unknown` bucket. 
The y-axis is ordered with the largest categories at the top, making the plot readable as a ranked metadata-coverage view. This chart should be interpreted primarily as runtime observability: it shows where execution-platform metadata exists and where it is absent, not a definitive ranking of which engines were actually used most often across the ecosystem. + +`normalized_score_mean_by_eval.pdf` ranks the evaluation slices with the lowest mean normalized score. It is a quick way to find benchmarks or metrics where the collected model population tends to score poorly on the zero-to-one scale. This should not be read as a model leaderboard, because the model set is not matched across evaluations. Instead, it is a difficulty and saturation diagnostic: low means may indicate hard tasks, older model coverage, sparse high-performing submissions, or metrics where good performance is rare in the collected data. + +`normalized_score_variability.pdf` plots each evaluation’s mean normalized score against its standard deviation, with point size reflecting normalized row coverage. This figure is designed to identify discriminative evaluations. Points with high coverage and high variability are especially interesting because they have enough rows to be credible and enough spread to separate stronger and weaker systems. Low-variability points may still be useful, but they are less likely to support fine-grained comparisons unless the goal is detecting failures, regressions, or saturation. + +`score_range_by_eval.pdf` ranks evaluation slices by their min-to-max normalized score range. It complements the standard-deviation plot by showing the full observed spread rather than the typical spread around the mean. A wide range can indicate that an evaluation distinguishes sharply between weak and strong models, but it can also reflect outliers, mixed subpopulations, or uneven source coverage. 
This plot is most useful as a triage tool for finding evaluations where a closer row-level or paired-model analysis may reveal meaningful structure. + +Overall, the datastore is large, mostly normalization-ready, and informative for benchmark-level descriptive analysis. The main caveat is comparability: normalized scores put different metrics on a common scale, but they do not control for which models appear in each benchmark. Use these figures as a map of datastore coverage, runtime observability, and score distribution, then rely on paired or coverage-aware analyses for direct model comparisons. The descriptive plots are best thought of as a scouting layer: they reveal where the datastore is rich, where metadata is thin, and where more careful model-by-model analysis is likely to pay off. diff --git a/every_eval_ever/helpers/dataset_statistics.py b/every_eval_ever/helpers/dataset_statistics.py index 8b2f2e0e8..69ac29aaf 100644 --- a/every_eval_ever/helpers/dataset_statistics.py +++ b/every_eval_ever/helpers/dataset_statistics.py @@ -24,6 +24,14 @@ STABILIZATION_WEIGHT = 5.0 BOOTSTRAP_ITERATIONS = 400 RANDOM_SEED = 20260429 +SCORE_GROUP_KEYS = ( + 'benchmark', + 'evaluation_name', + 'metric_id', + 'metric_name', + 'metric_kind', + 'metric_unit', +) def read_data(datastore: str) -> list[str]: @@ -144,8 +152,7 @@ def numeric_summary(values: Iterable[float]) -> dict[str, float | int | None]: def shared_evaluation_key(row: dict[str, Any]) -> str: parts = [ - row.get('benchmark'), - row.get('evaluation_name'), + *(row.get(key) for key in SCORE_GROUP_KEYS), row.get('score_type'), row.get('min_score'), row.get('max_score'), @@ -497,13 +504,13 @@ def descriptive_statistics( 'score_summaries': grouped_summaries( rows, 'score', - ('benchmark', 'evaluation_name'), + SCORE_GROUP_KEYS, summary_limit, ), 'normalized_score_summaries': grouped_summaries( valid_rows, 'normalized_score', - ('benchmark', 'evaluation_name'), + SCORE_GROUP_KEYS, summary_limit, ), } @@ -598,13 +605,29 
@@ def print_report(report: dict[str, Any], descriptive_only: bool) -> None: section('score summaries') print_table( descriptive['score_summaries'], - ['benchmark', 'evaluation_name', 'count', 'mean', 'median', 'stddev'], + [ + 'benchmark', + 'evaluation_name', + 'metric_id', + 'count', + 'mean', + 'median', + 'stddev', + ], ) section('normalized score summaries') print_table( descriptive['normalized_score_summaries'], - ['benchmark', 'evaluation_name', 'count', 'mean', 'median', 'stddev'], + [ + 'benchmark', + 'evaluation_name', + 'metric_id', + 'count', + 'mean', + 'median', + 'stddev', + ], ) if descriptive_only: diff --git a/misc/dataset_statistics_summary_writer.py b/misc/dataset_statistics_summary_writer.py new file mode 100644 index 000000000..ca9e19c51 --- /dev/null +++ b/misc/dataset_statistics_summary_writer.py @@ -0,0 +1,109 @@ +"""Preserved Markdown summary writer removed from plot_dataset_statistics.py.""" + +from __future__ import annotations + +import statistics +from pathlib import Path +from typing import Any + + +def label(row: dict[str, Any]) -> str: + benchmark = str(row['benchmark']) + evaluation = str(row['evaluation_name']) + if benchmark == evaluation: + return benchmark + return f'{benchmark}: {evaluation}' + + +def top_rows( + rows: list[dict[str, Any]], key: str, limit: int +) -> list[dict[str, Any]]: + return sorted(rows, key=lambda row: (-float(row[key]), label(row)))[:limit] + + +def pct(part: int, total: int) -> float: + return 100.0 * part / total if total else 0.0 + + +def write_summary( + stats: dict[str, Any], + rows: list[dict[str, Any]], + plot_paths: dict[str, Path], + output_path: Path, +) -> None: + descriptive = stats['descriptive'] + counts = stats['descriptive']['counts'] + quality = stats['descriptive']['quality'] + valid = stats['observational']['valid_normalized_rows'] + exclusions = stats['observational']['exclusions'] + out_of_range = exclusions.get('out_of_range', 0) + models_per_benchmark = 
descriptive.get('models_per_benchmark', []) + inference_engines = descriptive.get('inference_engines', []) + most_covered = top_rows(rows, 'count', 6) + highest_variance = sorted( + rows, key=lambda row: float(row['stddev'] or 0.0), reverse=True + )[:4] + hardest = sorted(rows, key=lambda row: float(row['mean']))[:4] + easiest = sorted(rows, key=lambda row: float(row['mean']), reverse=True)[:4] + model_counts = [ + int(row['unique_models']) + for row in models_per_benchmark + if int(row['unique_models']) > 0 + ] + median_models = statistics.median(model_counts) if model_counts else 0 + max_models = max(model_counts) if model_counts else 0 + top_model_datasets = models_per_benchmark[:6] + known_engine_rows = sum( + int(row['count']) + for row in inference_engines + if str(row['value']).strip().lower() != 'unknown' + ) + unknown_engine_rows = sum( + int(row['count']) + for row in inference_engines + if str(row['value']).strip().lower() == 'unknown' + ) + top_engines = inference_engines[:6] + + def names(items: list[dict[str, Any]]) -> str: + return ', '.join(label(item) for item in items) + + def benchmark_model_names(items: list[dict[str, Any]]) -> str: + return ', '.join( + f'{item["benchmark"]} ({int(item["unique_models"]):,})' + for item in items + ) + + def engine_names(items: list[dict[str, Any]]) -> str: + return ', '.join( + f'{item["value"]} ({int(item["count"]):,})' for item in items + ) + + relative_plots = { + name: path.relative_to(output_path.parent) + if path.is_relative_to(output_path.parent) + else path + for name, path in plot_paths.items() + } + text = f"""# Dataset Statistics Summary + +This report summarizes the latest Every Eval Ever datastore snapshot represented by `dataset_statistics.json`. In the statistics file, “dataset” is represented by the `benchmark` field, which comes from `evaluation_results[].source_data.dataset_name`. 
That naming is worth keeping in mind when reading the figures: a benchmark is the dataset or leaderboard family that supplied the result rows, while an evaluation name is the finer slice or metric label inside that benchmark. The corpus contains {counts['result_rows']:,} result rows across {counts['unique_benchmarks']:,} datasets, {counts['unique_evaluations']:,} evaluation names, {counts['unique_developers']:,} developers, and {counts['unique_models']:,} models. The coverage plot (`{relative_plots['coverage']}`) gives the first scale check: the datastore is broad in model count, but its row-level mass is still concentrated in a smaller number of repeated evaluation families. + +Normalization quality is strong for this snapshot. Of {quality['total_result_rows']:,} result rows, {valid:,} rows can be converted onto the shared zero-to-one scale, or {pct(valid, quality['total_result_rows']):.1f}% of the dataset. The only observed normalization exclusion is {out_of_range:,} out-of-range rows; missing scores, missing bounds, zero-width bounds, and incompatible score types are all zero. This means the normalized score summaries are a reasonable map of cross-benchmark score distributions. It does not make all metrics semantically identical, but it does put the numeric ranges on a common axis so that difficulty, saturation, and spread are easier to compare. The normalization quality plot (`{relative_plots['quality']}`) is therefore a guardrail figure: it says whether the rest of the normalized-score visuals are based on most of the corpus or on a narrow filtered subset. + +Coverage is uneven by design. The most-covered normalized summaries are {names(most_covered)}. These heavily represented evaluations dominate aggregate descriptive patterns, so the top-coverage chart (`{relative_plots['top_coverage']}`) should be read alongside any mean-score chart. 
A benchmark with thousands of rows provides a much steadier estimate than a niche evaluation with dozens or hundreds of rows, even if both appear as one row in the summary table. High row coverage can mean a benchmark has broad model participation, multiple reported submetrics, repeated submissions, or some combination of the three. The plot is intentionally row-count oriented, because the descriptive JSON is primarily row-oriented; it should not be read as a direct measure of benchmark popularity without checking model coverage separately. + +The new model-per-dataset histogram (`{relative_plots['models_per_dataset']}`) adds that missing model-coverage view. Across datasets, the median number of unique models is {median_models:g}, and the largest dataset-level model count is {max_models:,}. The highest-coverage datasets by unique model count are {benchmark_model_names(top_model_datasets)}. This distribution is important because a dataset with many models tells us more about the breadth of the ecosystem than a dataset with many rows from a smaller model set. A heavy right tail in this histogram means a few datasets act as common comparison hubs, while many others remain specialized or sparsely covered. That is not necessarily bad; specialized datasets are often where the datastore gets its texture. But it does mean corpus-wide summaries should avoid treating every benchmark as equally well sampled. + +The inference-engine spread plot (`{relative_plots['engine_spread']}`) describes how result rows are distributed across recorded running engines or inference platforms, depending on which runtime metadata is present in the datastore export. The leading runtime labels are {engine_names(top_engines)}. In this snapshot, {known_engine_rows:,} rows have a named runtime field and {unknown_engine_rows:,} rows fall under `unknown`. The `unknown` bucket is expected whenever source records report model identity but not the serving/runtime layer. 
Runtime spread should therefore be read as an observability diagnostic, not just as a usage ranking. A large `unknown` bucket says that many results are still useful for model and benchmark analysis, but they cannot support claims about vLLM, Ollama, hosted APIs, or other runtime-specific execution paths. Where runtime names are present, the chart gives a quick view of which execution backends are represented strongly enough for follow-up slicing. + +Mean normalized scores vary sharply across tasks. The lowest means include {names(hardest)}, while the highest means include {names(easiest)}. These values should not be interpreted as a leaderboard: they summarize all available submitted model results within each benchmark/evaluation pair, not matched model cohorts. They are best used to spot which evaluations are generally difficult, saturated, or mixed across the collected model population. A low mean can indicate a hard benchmark, a benchmark with many older or weaker systems, or a metric whose upper range is rarely reached. A high mean can indicate an easier task, a saturated benchmark, a curated set of strong submissions, or a metric where the lower-performing tail is missing. The summary plots do not decide among those explanations, but they point to where a closer paired analysis would be valuable. + +The variability plots add the most diagnostic texture. High-standard-deviation evaluations such as {names(highest_variance)} indicate tasks where model results span a wide range, often because the benchmark separates weak and strong systems clearly or because the source data combines distinct regimes. The range plot (`{relative_plots['range']}`) highlights the same issue from min-to-max spread, while the mean-versus-standard-deviation scatter (`{relative_plots['variability']}`) separates broad, high-confidence coverage from sparse or volatile summaries. 
Evaluations with both substantial coverage and high spread are especially useful for model comparison because they appear to discriminate among systems rather than clustering everyone near the same score. Evaluations with low spread can still matter, but they may be better suited for pass/fail checks, regression testing, or detecting severe failures than for fine-grained ranking. + +The PDF figures are meant to be inspected together rather than as standalone claims. The count and quality charts answer whether the data is large and clean enough to trust. The top-coverage and model-per-dataset charts separate result-row volume from unique-model breadth. The engine chart shows whether runtime metadata is available and how concentrated it is. The mean, variability, and range charts then answer where the benchmark landscape is concentrated, sparse, easy, hard, or discriminative. Keeping those questions separate avoids a common mistake: treating a high row count as evidence of broad participation, or treating a normalized mean as a direct model-quality claim. + +Overall, the datastore is large, mostly normalization-ready, and informative for benchmark-level descriptive analysis. The main caveat is comparability: normalized scores put different metrics on a common scale, but they do not control for which models appear in each benchmark. Use these figures as a map of datastore coverage, runtime observability, and score distribution, then rely on paired or coverage-aware analyses for direct model comparisons. The descriptive plots are best thought of as a scouting layer: they reveal where the datastore is rich, where metadata is thin, and where more careful model-by-model analysis is likely to pay off. 
+""" + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(text, encoding='utf-8') diff --git a/misc/eval_hierarchy.json b/misc/eval_hierarchy.json new file mode 100644 index 000000000..5ad8650ba --- /dev/null +++ b/misc/eval_hierarchy.json @@ -0,0 +1,4187 @@ +{ + "stats": { + "family_count": 20, + "composite_count": 20, + "standalone_benchmark_count": 10, + "single_benchmark_count": 108, + "slice_count": 58, + "metric_count": 208, + "metric_rows_scanned": 41616 + }, + "qa": { + "fallback_metric_count": 2231, + "fallback_metrics": [ + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": 
"appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": 
"browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "la_leaderboard", + "single_benchmark": "La leaderboard", + "metric_name": "Score" + }, + { + "composite_benchmark": "la_leaderboard", + "single_benchmark": "La leaderboard", + "metric_name": "Score" + }, + { + "composite_benchmark": "la_leaderboard", + "single_benchmark": "La leaderboard", + "metric_name": "Score" + }, + { + "composite_benchmark": "la_leaderboard", + "single_benchmark": "La leaderboard", + "metric_name": "Score" + }, + { + "composite_benchmark": "la_leaderboard", + "single_benchmark": "La leaderboard", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Chat", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Chat Hard", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Safety", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Reasoning", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Prior Sets (0.5 weight)", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Chat", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Chat Hard", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Safety", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Reasoning", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Prior Sets (0.5 weight)", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Chat", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Chat Hard", + "metric_name": "Score" + }, 
+ { + "composite_benchmark": "reward-bench", + "single_benchmark": "Safety", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Reasoning", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Prior Sets (0.5 weight)", + "metric_name": "Score" + } + ], + "metric_like_single_benchmark_count": 0, + "metric_like_single_benchmarks": [], + "single_equals_only_metric_count": 0, + "single_equals_only_metric": [] + }, + "families": [ + { + "key": "ace", + "display_name": "Ace", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "other", + "standalone_benchmarks": [], + "composites": [], + "benchmarks": [ + { + "key": "diy", + "display_name": "DIY", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "food", + "display_name": "Food", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "gaming", + "display_name": "Gaming", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "overall", + "display_name": "Overall", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "shopping", + "display_name": "Shopping", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + 
"slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + } + ] + }, + { + "key": "apex", + "display_name": "Apex", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "agentic", + "standalone_benchmarks": [], + "composites": [ + { + "key": "apex-agents", + "display_name": "Apex Agents", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "agentic", + "benchmarks": [ + { + "key": "corporate_law", + "display_name": "Corporate Law", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "pass_at_1", + "display_name": "Pass@1", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "corporate_lawyer", + "display_name": "Corporate Lawyer", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "mean_score", + "display_name": "Mean Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "investment_banking", + "display_name": "Investment Banking", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "pass_at_1", + "display_name": "Pass@1", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "management_consulting", + "display_name": "Management Consulting", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "pass_at_1", + "display_name": "Pass@1", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "overall", + "display_name": "Overall", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "mean_score", + "display_name": "Mean 
Score", + "sources": [ + "evaluation_name_suffix" + ] + }, + { + "key": "pass_at_1", + "display_name": "Pass@1", + "sources": [ + "evaluation_name_suffix" + ] + }, + { + "key": "pass_at_8", + "display_name": "Pass@8", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + } + ] + }, + { + "key": "apex-v1", + "display_name": "Apex V1", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "other", + "benchmarks": [ + { + "key": "big_law", + "display_name": "Big Law", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "consulting", + "display_name": "Consulting", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "investment_banking", + "display_name": "Investment Banking", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "medicine_md", + "display_name": "Medicine (MD)", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "overall", + "display_name": "Overall", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + } + ] + } + ] + }, + { + "key": "appworld", + "display_name": "Appworld", + "has_card": false, + 
"tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "agentic", + "standalone_benchmarks": [], + "composites": [], + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "arc-agi", + "display_name": "Arc Agi", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "other", + "standalone_benchmarks": [], + "composites": [], + "benchmarks": [ + { + "key": "v1_public_eval", + "display_name": "v1_Public_Eval", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "cost_per_task", + "display_name": "Cost per Task", + "sources": [ + "metric_config" + ] + }, + { + "key": "score", + "display_name": "Score", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "v1_semi_private", + "display_name": "v1_Semi_Private", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "cost_per_task", + "display_name": "Cost per Task", + "sources": [ + "metric_config" + ] + }, + { + "key": "score", + "display_name": "Score", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "v2_private_eval", + "display_name": "v2_Private_Eval", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "cost_per_task", + "display_name": "Cost per Task", + "sources": [ + "metric_config" + ] + }, + { + "key": "score", + "display_name": "Score", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "v2_public_eval", + "display_name": "v2_Public_Eval", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "cost_per_task", + "display_name": "Cost per Task", + "sources": [ + "metric_config" + ] + }, + { + "key": "score", + 
"display_name": "Score", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "v2_semi_private", + "display_name": "v2_Semi_Private", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "cost_per_task", + "display_name": "Cost per Task", + "sources": [ + "metric_config" + ] + }, + { + "key": "score", + "display_name": "Score", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "v3_semi_private", + "display_name": "v3_Semi_Private", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "cost", + "display_name": "Cost", + "sources": [ + "metric_config" + ] + }, + { + "key": "score", + "display_name": "Score", + "sources": [ + "metric_config" + ] + } + ] + } + ] + }, + { + "key": "bfcl", + "display_name": "Bfcl", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "other", + "standalone_benchmarks": [], + "composites": [], + "benchmarks": [ + { + "key": "format_sensitivity", + "display_name": "Format sensitivity", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "format_sensitivity_max_delta", + "display_name": "Format Sensitivity Max Delta", + "sources": [ + "metric_config" + ] + }, + { + "key": "format_sensitivity_stddev", + "display_name": "Format Sensitivity Standard Deviation", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "live", + "display_name": "Live", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "live_accuracy", + "display_name": "Live accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "live_multiple_ast_accuracy", + "display_name": "Live multiple AST accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": 
"live_parallel_ast_accuracy", + "display_name": "Live parallel AST accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "live_parallel_multiple_ast_accuracy", + "display_name": "Live parallel multiple AST accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "live_simple_ast_accuracy", + "display_name": "Live simple AST accuracy", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "memory", + "display_name": "Memory", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "kv_accuracy", + "display_name": "Memory KV accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "recursive_summarization_accuracy", + "display_name": "Memory recursive summarization accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "vector_accuracy", + "display_name": "Memory vector accuracy", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "multi_turn", + "display_name": "Multi turn", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "base_accuracy", + "display_name": "Multi-turn base accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "long_context_accuracy", + "display_name": "Multi-turn long-context accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "miss_function_accuracy", + "display_name": "Multi-turn missing function accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "miss_parameter_accuracy", + "display_name": "Multi-turn missing parameter accuracy", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "non_live", + "display_name": "Non live", + "has_card": false, + "tags": { + "domains": [], + "languages": 
[], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "ast_accuracy", + "display_name": "Non-live AST accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "multiple_ast_accuracy", + "display_name": "Non-live multiple AST accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "parallel_ast_accuracy", + "display_name": "Non-live parallel AST accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "parallel_multiple_ast_accuracy", + "display_name": "Non-live parallel multiple AST accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "simple_ast_accuracy", + "display_name": "Non-live simple AST accuracy", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "overall", + "display_name": "Overall", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "latency_p95", + "display_name": "Latency 95th Percentile", + "sources": [ + "metric_config" + ] + }, + { + "key": "latency_mean", + "display_name": "Latency Mean", + "sources": [ + "metric_config" + ] + }, + { + "key": "latency_std", + "display_name": "Latency Standard Deviation", + "sources": [ + "metric_config" + ] + }, + { + "key": "overall_accuracy", + "display_name": "Overall Accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "rank", + "display_name": "Rank", + "sources": [ + "metric_config" + ] + }, + { + "key": "total_cost", + "display_name": "Total Cost", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "relevance", + "display_name": "Relevance", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "irrelevance_detection_accuracy", + "display_name": "Irrelevance detection accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "relevance_detection_accuracy", + "display_name": "Relevance detection accuracy", + "sources": [ + "metric_config" + ] + } + ] + 
}, + { + "key": "web_search", + "display_name": "Web search", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "base_accuracy", + "display_name": "Multi-turn base accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "no_snippet_accuracy", + "display_name": "Web-search no-snippet accuracy", + "sources": [ + "metric_config" + ] + } + ] + } + ] + }, + { + "key": "browsecompplus", + "display_name": "browsecompplus", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "agentic", + "standalone_benchmarks": [], + "composites": [], + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "fibble", + "display_name": "Fibble", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "other", + "standalone_benchmarks": [ + { + "key": "fibble_arena", + "display_name": "Fibble arena", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "average_attempts", + "display_name": "Average Attempts", + "sources": [ + "evaluation_name_suffix" + ] + }, + { + "key": "average_latency_ms", + "display_name": "Average Latency (ms)", + "sources": [ + "evaluation_name_suffix" + ] + }, + { + "key": "win_rate", + "display_name": "Win Rate", + "sources": [ + "evaluation_name_suffix" + ] + } + ], + "category": "other" + }, + { + "key": "fibble1_arena", + "display_name": "Fibble1 arena", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "average_attempts", + "display_name": "Average Attempts", + "sources": [ + "metric_config" + ] + }, + { + "key": "win_rate", + "display_name": 
"Win Rate", + "sources": [ + "metric_config" + ] + } + ], + "category": "other" + }, + { + "key": "fibble2_arena", + "display_name": "Fibble2 arena", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "average_attempts", + "display_name": "Average Attempts", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + }, + { + "key": "average_latency_ms", + "display_name": "Average Latency (ms)", + "sources": [ + "evaluation_name_suffix" + ] + }, + { + "key": "win_rate", + "display_name": "Win Rate", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + } + ], + "category": "other" + }, + { + "key": "fibble3_arena", + "display_name": "Fibble3 arena", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "average_attempts", + "display_name": "Average Attempts", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + }, + { + "key": "average_latency_ms", + "display_name": "Average Latency (ms)", + "sources": [ + "evaluation_name_suffix" + ] + }, + { + "key": "win_rate", + "display_name": "Win Rate", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + } + ], + "category": "other" + }, + { + "key": "fibble4_arena", + "display_name": "Fibble4 arena", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "average_attempts", + "display_name": "Average Attempts", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + }, + { + "key": "average_latency_ms", + "display_name": "Average Latency (ms)", + "sources": [ + "evaluation_name_suffix" + ] + }, + { + "key": "win_rate", + "display_name": "Win Rate", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + } + ], + "category": "other" + }, + { + "key": "fibble5_arena", + "display_name": "Fibble5 arena", + "has_card": false, + 
"tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "average_attempts", + "display_name": "Average Attempts", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + }, + { + "key": "average_latency_ms", + "display_name": "Average Latency (ms)", + "sources": [ + "evaluation_name_suffix" + ] + }, + { + "key": "win_rate", + "display_name": "Win Rate", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + } + ], + "category": "other" + } + ], + "composites": [] + }, + { + "key": "global_mmlu_lite", + "display_name": "Global MMLU Lite", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "reasoning", + "standalone_benchmarks": [], + "composites": [], + "slices": [ + { + "key": "arabic", + "display_name": "Arabic", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "bengali", + "display_name": "Bengali", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "burmese", + "display_name": "Burmese", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "chinese", + "display_name": "Chinese", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "culturally_agnostic", + "display_name": "Culturally Agnostic", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "culturally_sensitive", + "display_name": "Culturally Sensitive", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "english", + "display_name": "English", + "metrics": [ + { + "key": 
"accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "french", + "display_name": "French", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "german", + "display_name": "German", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "hindi", + "display_name": "Hindi", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "indonesian", + "display_name": "Indonesian", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "italian", + "display_name": "Italian", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "japanese", + "display_name": "Japanese", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "korean", + "display_name": "Korean", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "portuguese", + "display_name": "Portuguese", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "spanish", + "display_name": "Spanish", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "swahili", + "display_name": "Swahili", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "yoruba", + "display_name": "Yoruba", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + 
"sources": [ + "benchmark_default" + ] + } + ] + } + ], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "helm", + "display_name": "HELM", + "has_card": true, + "tags": { + "domains": [ + "biology", + "physics", + "chemistry", + "open domain qa", + "open book qa", + "multiple choice qa", + "instruction following", + "STEM", + "humanities", + "social sciences", + "math", + "law", + "engineering", + "economics", + "health", + "psychology", + "business", + "philosophy", + "computer science", + "history", + "olympiads", + "Info Seeking", + "Math & Data", + "Reasoning & Planning", + "Creative Tasks", + "natural language understanding", + "reading comprehension", + "natural language inference", + "machine learning fairness", + "bias measurement", + "toxic comment classification", + "text classification", + "summarization", + "journalism", + "news media", + "commonsense reasoning", + "question answering", + "dialogue modeling", + "text generation", + "grade school mathematics", + "math word problems", + "legal", + "finance", + "medical knowledge", + "professional medical exams" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering", + "Question answering", + "Text generation", + "Following verifiable instructions (e.g., word counts, formatting rules, keyword mentions)", + "Multiple-choice question answering across a broad range of subjects", + "Solving Olympiad-level mathematical problems", + "Solving competition-level mathematical problems", + "Rule-based evaluation on a filtered subset of problems (Omni-MATH-Rule)", + "Open-ended text generation in response to diverse user queries", + "Yes/no question answering", + "Text-pair classification", + "Binary toxicity classification (toxic vs. 
non-toxic)", + "Analysis of performance across identity subgroups", + "Summarization", + "Four-way multiple-choice selection for event continuation", + "Commonsense inference", + "Extractive question answering", + "Fill mask", + "Solving grade school math word problems", + "Text generation for question answering", + "Text classification", + "Rule-application tasks", + "Free-form multiple-choice question answering", + "Open-domain question answering" + ] + }, + "category": "general", + "standalone_benchmarks": [], + "composites": [ + { + "key": "helm_capabilities", + "display_name": "Helm capabilities", + "has_card": true, + "tags": { + "domains": [ + "biology", + "physics", + "chemistry", + "open domain qa", + "open book qa", + "multiple choice qa", + "instruction following", + "STEM", + "humanities", + "social sciences", + "math", + "law", + "engineering", + "economics", + "health", + "psychology", + "business", + "philosophy", + "computer science", + "history", + "olympiads", + "Info Seeking", + "Math & Data", + "Reasoning & Planning", + "Creative Tasks" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering", + "Question answering", + "Text generation", + "Following verifiable instructions (e.g., word counts, formatting rules, keyword mentions)", + "Multiple-choice question answering across a broad range of subjects", + "Solving Olympiad-level mathematical problems", + "Solving competition-level mathematical problems", + "Rule-based evaluation on a filtered subset of problems (Omni-MATH-Rule)", + "Open-ended text generation in response to diverse user queries" + ] + }, + "category": "general", + "benchmarks": [ + { + "key": "helm_capabilities", + "display_name": "Capabilities", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [ + { + "key": "mean", + "display_name": "Mean", + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" 
+ ] + } + ] + } + ], + "metrics": [] + }, + { + "key": "gpqa", + "display_name": "GPQA", + "has_card": true, + "tags": { + "domains": [ + "biology", + "physics", + "chemistry", + "open domain qa", + "open book qa", + "multiple choice qa" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering", + "Question answering", + "Text generation" + ] + }, + "slices": [], + "metrics": [ + { + "key": "cot_correct", + "display_name": "COT correct", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "ifeval", + "display_name": "IFEval", + "has_card": true, + "tags": { + "domains": [ + "instruction following" + ], + "languages": [ + "English" + ], + "tasks": [ + "Text generation", + "Following verifiable instructions (e.g., word counts, formatting rules, keyword mentions)" + ] + }, + "slices": [], + "metrics": [ + { + "key": "ifeval_strict_acc", + "display_name": "IFEval Strict Acc", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "mmlu_pro", + "display_name": "MMLU-Pro", + "has_card": true, + "tags": { + "domains": [ + "STEM", + "humanities", + "social sciences", + "math", + "physics", + "chemistry", + "law", + "engineering", + "economics", + "health", + "psychology", + "business", + "biology", + "philosophy", + "computer science", + "history" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering across a broad range of subjects" + ] + }, + "slices": [], + "metrics": [ + { + "key": "cot_correct", + "display_name": "COT correct", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "omni_math", + "display_name": "Omni-MATH", + "has_card": true, + "tags": { + "domains": [ + "math", + "olympiads" + ], + "languages": [ + "English" + ], + "tasks": [ + "Solving Olympiad-level mathematical problems", + "Solving competition-level mathematical problems", + "Rule-based evaluation on a filtered subset of problems (Omni-MATH-Rule)" + ] + }, + "slices": [], + 
"metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "wildbench", + "display_name": "WildBench", + "has_card": true, + "tags": { + "domains": [ + "Info Seeking", + "Math & Data", + "Reasoning & Planning", + "Creative Tasks" + ], + "languages": [ + "English" + ], + "tasks": [ + "Open-ended text generation in response to diverse user queries" + ] + }, + "slices": [], + "metrics": [ + { + "key": "wb_score", + "display_name": "WB Score", + "sources": [ + "evaluation_description" + ] + } + ] + } + ] + }, + { + "key": "helm_classic", + "display_name": "Helm classic", + "has_card": true, + "tags": { + "domains": [ + "natural language understanding", + "reading comprehension", + "natural language inference", + "machine learning fairness", + "bias measurement", + "toxic comment classification", + "text classification", + "summarization", + "journalism", + "news media", + "commonsense reasoning", + "STEM", + "humanities", + "social sciences", + "question answering", + "dialogue modeling", + "text generation" + ], + "languages": [ + "English" + ], + "tasks": [ + "Yes/no question answering", + "Text-pair classification", + "Binary toxicity classification (toxic vs. 
non-toxic)", + "Analysis of performance across identity subgroups", + "Summarization", + "Four-way multiple-choice selection for event continuation", + "Commonsense inference", + "Multiple-choice question answering", + "Extractive question answering", + "Text generation", + "Fill mask" + ] + }, + "category": "general", + "benchmarks": [ + { + "key": "boolq", + "display_name": "BoolQ", + "has_card": true, + "tags": { + "domains": [ + "natural language understanding", + "reading comprehension", + "natural language inference" + ], + "languages": [ + "English" + ], + "tasks": [ + "Yes/no question answering", + "Text-pair classification" + ] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "civilcomments", + "display_name": "CivilComments", + "has_card": true, + "tags": { + "domains": [ + "machine learning fairness", + "bias measurement", + "toxic comment classification", + "text classification" + ], + "languages": [ + "English" + ], + "tasks": [ + "Binary toxicity classification (toxic vs. 
non-toxic)", + "Analysis of performance across identity subgroups" + ] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "helm_classic", + "display_name": "Classic", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [ + { + "key": "mean", + "display_name": "Mean", + "metrics": [ + { + "key": "win_rate", + "display_name": "Win Rate", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + } + ], + "metrics": [] + }, + { + "key": "cnn_dailymail", + "display_name": "CNN/DailyMail", + "has_card": true, + "tags": { + "domains": [ + "summarization", + "journalism", + "news media" + ], + "languages": [ + "English" + ], + "tasks": [ + "Summarization" + ] + }, + "slices": [], + "metrics": [ + { + "key": "rouge_2", + "display_name": "ROUGE-2", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "hellaswag", + "display_name": "HellaSwag", + "has_card": true, + "tags": { + "domains": [ + "commonsense reasoning", + "natural language inference" + ], + "languages": [ + "English" + ], + "tasks": [ + "Four-way multiple-choice selection for event continuation", + "Commonsense inference" + ] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "imdb", + "display_name": "IMDB", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "mmlu", + "display_name": "MMLU", + "has_card": true, + "tags": { + "domains": [ + "STEM", + "humanities", + "social sciences" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering" + ] + }, + "slices": [], + "metrics": [ + { + 
"key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "ms_marco_trec", + "display_name": "MS MARCO (TREC)", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "ndcg_10", + "display_name": "NDCG@10", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "narrativeqa", + "display_name": "NarrativeQA", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "f1", + "display_name": "F1", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "naturalquestions_open_book", + "display_name": "NaturalQuestions (open-book)", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "f1", + "display_name": "F1", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "openbookqa", + "display_name": "OpenbookQA", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "quac", + "display_name": "QuAC", + "has_card": true, + "tags": { + "domains": [ + "question answering", + "dialogue modeling", + "text generation" + ], + "languages": [ + "English" + ], + "tasks": [ + "Extractive question answering", + "Text generation", + "Fill mask" + ] + }, + "slices": [], + "metrics": [ + { + "key": "f1", + "display_name": "F1", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "raft", + "display_name": "RAFT", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" 
+ ] + } + ] + }, + { + "key": "truthfulqa", + "display_name": "TruthfulQA", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "xsum", + "display_name": "XSUM", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "rouge_2", + "display_name": "ROUGE-2", + "sources": [ + "evaluation_description" + ] + } + ] + } + ] + }, + { + "key": "helm_instruct", + "display_name": "Helm instruct", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "general", + "benchmarks": [ + { + "key": "anthropic_rlhf_dataset", + "display_name": "Anthropic RLHF dataset", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "harmlessness", + "display_name": "Harmlessness", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "best_chatgpt_prompts", + "display_name": "Best ChatGPT Prompts", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "harmlessness", + "display_name": "Harmlessness", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "helm_instruct", + "display_name": "Instruct", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [ + { + "key": "mean", + "display_name": "Mean", + "metrics": [ + { + "key": "win_rate", + "display_name": "Win Rate", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + } + ], + "metrics": [] + }, + { + "key": "koala_test_dataset", + "display_name": "Koala test dataset", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + 
{ + "key": "harmlessness", + "display_name": "Harmlessness", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "open_assistant", + "display_name": "Open Assistant", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "harmlessness", + "display_name": "Harmlessness", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "self_instruct", + "display_name": "Self Instruct", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "harmlessness", + "display_name": "Harmlessness", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "vicuna", + "display_name": "Vicuna", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "harmlessness", + "display_name": "Harmlessness", + "sources": [ + "evaluation_description" + ] + } + ] + } + ] + }, + { + "key": "helm_lite", + "display_name": "Helm lite", + "has_card": true, + "tags": { + "domains": [ + "grade school mathematics", + "math word problems", + "legal", + "law", + "finance", + "medical knowledge", + "professional medical exams", + "STEM", + "humanities", + "social sciences" + ], + "languages": [ + "English" + ], + "tasks": [ + "Solving grade school math word problems", + "Text generation for question answering", + "Text classification", + "Question answering", + "Text generation", + "Rule-application tasks", + "Free-form multiple-choice question answering", + "Open-domain question answering", + "Multiple-choice question answering" + ] + }, + "category": "general", + "benchmarks": [ + { + "key": "gsm8k", + "display_name": "GSM8K", + "has_card": true, + "tags": { + "domains": [ + "grade school mathematics", + "math word problems" + ], + "languages": [ + "English" + ], + "tasks": [ + "Solving grade school math word problems", + "Text 
generation for question answering" + ] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "legalbench", + "display_name": "LegalBench", + "has_card": true, + "tags": { + "domains": [ + "legal", + "law", + "finance" + ], + "languages": [ + "English" + ], + "tasks": [ + "Text classification", + "Question answering", + "Text generation", + "Rule-application tasks" + ] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "helm_lite", + "display_name": "Lite", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [ + { + "key": "mean", + "display_name": "Mean", + "metrics": [ + { + "key": "win_rate", + "display_name": "Win Rate", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + } + ], + "metrics": [] + }, + { + "key": "math", + "display_name": "MATH", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "score_details" + ] + } + ] + }, + { + "key": "medqa", + "display_name": "MedQA", + "has_card": true, + "tags": { + "domains": [ + "medical knowledge", + "professional medical exams" + ], + "languages": [ + "English" + ], + "tasks": [ + "Free-form multiple-choice question answering", + "Open-domain question answering" + ] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "mmlu", + "display_name": "MMLU", + "has_card": true, + "tags": { + "domains": [ + "STEM", + "humanities", + "social sciences" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering" + ] + }, + "slices": [], + "metrics": [ + { + "key": 
"exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "narrativeqa", + "display_name": "NarrativeQA", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "f1", + "display_name": "F1", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "naturalquestions_closed_book", + "display_name": "NaturalQuestions (closed-book)", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "f1", + "display_name": "F1", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "openbookqa", + "display_name": "OpenbookQA", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "wmt_2014", + "display_name": "WMT 2014", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "bleu_4", + "display_name": "BLEU-4", + "sources": [ + "evaluation_description" + ] + } + ] + } + ] + }, + { + "key": "helm_mmlu", + "display_name": "Helm mmlu", + "has_card": true, + "tags": { + "domains": [ + "STEM", + "humanities", + "social sciences" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering" + ] + }, + "category": "reasoning", + "slices": [ + { + "key": "abstract_algebra", + "display_name": "Abstract Algebra", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "anatomy", + "display_name": "Anatomy", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": 
"astronomy", + "display_name": "Astronomy", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "business_ethics", + "display_name": "Business Ethics", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "clinical_knowledge", + "display_name": "Clinical Knowledge", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "college_physics", + "display_name": "College Physics", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "computer_security", + "display_name": "Computer Security", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "conceptual_physics", + "display_name": "Conceptual Physics", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "econometrics", + "display_name": "Econometrics", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "electrical_engineering", + "display_name": "Electrical Engineering", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "elementary_mathematics", + "display_name": "Elementary Mathematics", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "formal_logic", + "display_name": "Formal Logic", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + 
"sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "global_facts", + "display_name": "Global Facts", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "high_school_world_history", + "display_name": "High School World History", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "human_sexuality", + "display_name": "Human Sexuality", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "international_law", + "display_name": "International Law", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "jurisprudence", + "display_name": "Jurisprudence", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "logical_fallacies", + "display_name": "Logical Fallacies", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "machine_learning", + "display_name": "Machine Learning", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "management", + "display_name": "Management", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "marketing", + "display_name": "Marketing", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "mean", + "display_name": "Mean", + "metrics": [ + { + "key": "win_rate", + "display_name": 
"Win Rate", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "medical_genetics", + "display_name": "Medical Genetics", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "miscellaneous", + "display_name": "Miscellaneous", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "mmlu_all_subjects", + "display_name": "MMLU All Subjects", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "moral_scenarios", + "display_name": "Moral Scenarios", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "nutrition", + "display_name": "Nutrition", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "philosophy", + "display_name": "Philosophy", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "prehistory", + "display_name": "Prehistory", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "professional_psychology", + "display_name": "Professional Psychology", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "public_relations", + "display_name": "Public Relations", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "security_studies", + "display_name": "Security Studies", + "metrics": [ + { + "key": 
"exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "sociology", + "display_name": "Sociology", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "us_foreign_policy", + "display_name": "Us Foreign Policy", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "virology", + "display_name": "Virology", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "world_religions", + "display_name": "World Religions", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + } + ], + "metrics": [] + } + ] + }, + { + "key": "hfopenllm_v2", + "display_name": "Hfopenllm v2", + "has_card": true, + "tags": { + "domains": [ + "biology", + "physics", + "chemistry", + "open domain qa", + "open book qa", + "multiple choice qa", + "instruction following", + "mathematics", + "explanation generation", + "STEM", + "humanities", + "social sciences", + "math", + "law", + "engineering", + "economics", + "health", + "psychology", + "business", + "philosophy", + "computer science", + "history", + "reasoning", + "commonsense reasoning", + "planning" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering", + "Question answering", + "Text generation", + "Following verifiable instructions (e.g., word counts, formatting rules, keyword mentions)", + "Mathematical problem solving", + "Step-by-step solution generation", + "Final answer generation", + "Multiple-choice question answering across a broad range of subjects", + "Solving murder mysteries", + "Solving object placement problems", + "Solving team allocation problems" + ] + }, + "category": 
"instruction_following", + "standalone_benchmarks": [], + "composites": [], + "benchmarks": [ + { + "key": "bbh", + "display_name": "BBH", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "gpqa", + "display_name": "GPQA", + "has_card": true, + "tags": { + "domains": [ + "biology", + "physics", + "chemistry", + "open domain qa", + "open book qa", + "multiple choice qa" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering", + "Question answering", + "Text generation" + ] + }, + "slices": [], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "ifeval", + "display_name": "IFEval", + "has_card": true, + "tags": { + "domains": [ + "instruction following" + ], + "languages": [ + "English" + ], + "tasks": [ + "Text generation", + "Following verifiable instructions (e.g., word counts, formatting rules, keyword mentions)" + ] + }, + "slices": [], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "math_level_5", + "display_name": "MATH Level 5", + "has_card": true, + "tags": { + "domains": [ + "mathematics", + "explanation generation" + ], + "languages": [ + "English" + ], + "tasks": [ + "Mathematical problem solving", + "Step-by-step solution generation", + "Final answer generation" + ] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "mmlu_pro", + "display_name": "MMLU-PRO", + "has_card": true, + "tags": { + "domains": [ + "STEM", + "humanities", + "social sciences", + "math", + "physics", + "chemistry", + "law", + "engineering", + "economics", + 
"health", + "psychology", + "business", + "biology", + "philosophy", + "computer science", + "history" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering across a broad range of subjects" + ] + }, + "slices": [], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "musr", + "display_name": "MUSR", + "has_card": true, + "tags": { + "domains": [ + "reasoning", + "commonsense reasoning", + "planning" + ], + "languages": [ + "English" + ], + "tasks": [ + "Question answering", + "Solving murder mysteries", + "Solving object placement problems", + "Solving team allocation problems" + ] + }, + "slices": [], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "evaluation_description" + ] + } + ] + } + ] + }, + { + "key": "la_leaderboard", + "display_name": "La leaderboard", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "other", + "standalone_benchmarks": [], + "composites": [], + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "livecodebenchpro", + "display_name": "Livecodebenchpro", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "agentic", + "standalone_benchmarks": [], + "composites": [], + "benchmarks": [ + { + "key": "easy_problems", + "display_name": "Easy Problems", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "pass_at_1", + "display_name": "Pass@1", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "hard_problems", + "display_name": "Hard Problems", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "pass_at_1", + 
"display_name": "Pass@1", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "medium_problems", + "display_name": "Medium Problems", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "pass_at_1", + "display_name": "Pass@1", + "sources": [ + "evaluation_description" + ] + } + ] + } + ] + }, + { + "key": "reward-bench", + "display_name": "Reward Bench", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "safety", + "standalone_benchmarks": [], + "composites": [], + "benchmarks": [ + { + "key": "chat", + "display_name": "Chat", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "chat_hard", + "display_name": "Chat Hard", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "prior_sets_0_5_weight", + "display_name": "Prior Sets (0.5 weight)", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "reasoning", + "display_name": "Reasoning", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "reward_bench", + "display_name": "Reward bench", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name" 
+ ] + } + ] + }, + { + "key": "safety", + "display_name": "Safety", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + } + ] + }, + { + "key": "reward-bench-2", + "display_name": "Reward Bench 2", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "safety", + "standalone_benchmarks": [], + "composites": [], + "benchmarks": [ + { + "key": "factuality", + "display_name": "Factuality", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "focus", + "display_name": "Focus", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "math", + "display_name": "Math", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "precise_if", + "display_name": "Precise IF", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "reward_bench_2", + "display_name": "Reward bench 2", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name" + ] + } + ] + }, + { + "key": "safety", + "display_name": "Safety", + "has_card": false, + "tags": { + "domains": [], + 
"languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "ties", + "display_name": "Ties", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + } + ] + }, + { + "key": "sciarena", + "display_name": "Sciarena", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "other", + "standalone_benchmarks": [], + "composites": [], + "slices": [], + "metrics": [ + { + "key": "cost_per_100_calls", + "display_name": "Cost per 100 Calls", + "sources": [ + "metric_config" + ] + }, + { + "key": "elo", + "display_name": "Elo Rating", + "sources": [ + "metric_config" + ] + }, + { + "key": "rank", + "display_name": "Rank", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "swe-bench", + "display_name": "Swe Bench", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "agentic", + "standalone_benchmarks": [], + "composites": [], + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "tau-bench-2", + "display_name": "Tau Bench 2", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "agentic", + "standalone_benchmarks": [], + "composites": [], + "benchmarks": [ + { + "key": "tau_bench_2_airline", + "display_name": "tau-bench-2/airline", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "tau_bench_2_retail", + "display_name": "tau-bench-2/retail", + "has_card": false, + "tags": { + "domains": [], + 
"languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "tau_bench_2_telecom", + "display_name": "tau-bench-2/telecom", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + } + ] + }, + { + "key": "terminal-bench-2.0", + "display_name": "Terminal Bench 2 0", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "agentic", + "standalone_benchmarks": [], + "composites": [], + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "theory_of_mind", + "display_name": "Theory of mind", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "other", + "standalone_benchmarks": [], + "composites": [], + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "wordle_arena", + "display_name": "Wordle arena", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "other", + "standalone_benchmarks": [], + "composites": [], + "slices": [], + "metrics": [ + { + "key": "average_attempts", + "display_name": "Average Attempts", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + }, + { + "key": "average_latency_ms", + "display_name": "Average Latency (ms)", + "sources": [ + "evaluation_name_suffix" + ] + }, + { + "key": "win_rate", + "display_name": "Win Rate", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/misc/eval_hierarchy.md b/misc/eval_hierarchy.md new file mode 100644 index 000000000..d6005f3c8 --- /dev/null +++ 
b/misc/eval_hierarchy.md @@ -0,0 +1,459 @@ +# EEE Eval Hierarchy + +## QA Summary +- Families: `20` +- Composite benchmarks: `20` +- Standalone benchmarks: `10` +- Benchmarks: `108` +- Slices: `58` +- Unique metrics: `208` +- Metric rows scanned: `41616` +- Fallback metrics: `2231` +- Benchmarks that still look metric-like: `0` +- Benchmarks where name matches the only metric: `0` + +### Fallback Metrics +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `la_leaderboard` -> `La leaderboard` -> `Score` +- 
`la_leaderboard` -> `La leaderboard` -> `Score` +- `la_leaderboard` -> `La leaderboard` -> `Score` +- `la_leaderboard` -> `La leaderboard` -> `Score` +- `la_leaderboard` -> `La leaderboard` -> `Score` +- `reward-bench` -> `Chat` -> `Score` +- `reward-bench` -> `Chat Hard` -> `Score` +- `reward-bench` -> `Safety` -> `Score` +- `reward-bench` -> `Reasoning` -> `Score` +- `reward-bench` -> `Prior Sets (0.5 weight)` -> `Score` +- `reward-bench` -> `Chat` -> `Score` +- `reward-bench` -> `Chat Hard` -> `Score` +- `reward-bench` -> `Safety` -> `Score` +- `reward-bench` -> `Reasoning` -> `Score` +- `reward-bench` -> `Prior Sets (0.5 weight)` -> `Score` +- `reward-bench` -> `Chat` -> `Score` +- `reward-bench` -> `Chat Hard` -> `Score` +- `reward-bench` -> `Safety` -> `Score` +- `reward-bench` -> `Reasoning` -> `Score` +- `reward-bench` -> `Prior Sets (0.5 weight)` -> `Score` + +## Hierarchy + +- [ ] Ace + - [ ] DIY + - Score + - [ ] Food + - Score + - [ ] Gaming + - Score + - [ ] Overall + - Score + - [ ] Shopping + - Score +- [ ] Apex + - [ ] Apex Agents + - [ ] Corporate Law + - Pass@1 + - [ ] Corporate Lawyer + - Mean Score + - [ ] Investment Banking + - Pass@1 + - [ ] Management Consulting + - Pass@1 + - [ ] Overall + - Mean Score + - Pass@1 + - Pass@8 + - [ ] Apex V1 + - [ ] Big Law + - Score + - [ ] Consulting + - Score + - [ ] Investment Banking + - Score + - [ ] Medicine (MD) + - Score + - [ ] Overall + - Score +- [ ] Appworld + - Score +- [ ] Arc Agi + - [ ] v1_Public_Eval + - Cost per Task + - Score + - [ ] v1_Semi_Private + - Cost per Task + - Score + - [ ] v2_Private_Eval + - Cost per Task + - Score + - [ ] v2_Public_Eval + - Cost per Task + - Score + - [ ] v2_Semi_Private + - Cost per Task + - Score + - [ ] v3_Semi_Private + - Cost + - Score +- [ ] Bfcl + - [ ] Format sensitivity + - Format Sensitivity Max Delta + - Format Sensitivity Standard Deviation + - [ ] Live + - Live accuracy + - Live multiple AST accuracy + - Live parallel AST accuracy + - Live 
parallel multiple AST accuracy + - Live simple AST accuracy + - [ ] Memory + - Accuracy + - Memory KV accuracy + - Memory recursive summarization accuracy + - Memory vector accuracy + - [ ] Multi turn + - Accuracy + - Multi-turn base accuracy + - Multi-turn long-context accuracy + - Multi-turn missing function accuracy + - Multi-turn missing parameter accuracy + - [ ] Non live + - Non-live AST accuracy + - Non-live multiple AST accuracy + - Non-live parallel AST accuracy + - Non-live parallel multiple AST accuracy + - Non-live simple AST accuracy + - [ ] Overall + - Latency 95th Percentile + - Latency Mean + - Latency Standard Deviation + - Overall Accuracy + - Rank + - Total Cost + - [ ] Relevance + - Irrelevance detection accuracy + - Relevance detection accuracy + - [ ] Web search + - Accuracy + - Multi-turn base accuracy + - Web-search no-snippet accuracy +- [ ] browsecompplus + - Score +- [ ] Fibble + - [ ] Fibble arena + - Average Attempts + - Average Latency (ms) + - Win Rate + - [ ] Fibble1 arena + - Average Attempts + - Win Rate + - [ ] Fibble2 arena + - Average Attempts + - Average Latency (ms) + - Win Rate + - [ ] Fibble3 arena + - Average Attempts + - Average Latency (ms) + - Win Rate + - [ ] Fibble4 arena + - Average Attempts + - Average Latency (ms) + - Win Rate + - [ ] Fibble5 arena + - Average Attempts + - Average Latency (ms) + - Win Rate +- [ ] Global MMLU Lite + - Arabic + - Accuracy + - Bengali + - Accuracy + - Burmese + - Accuracy + - Chinese + - Accuracy + - Culturally Agnostic + - Accuracy + - Culturally Sensitive + - Accuracy + - English + - Accuracy + - French + - Accuracy + - German + - Accuracy + - Hindi + - Accuracy + - Indonesian + - Accuracy + - Italian + - Accuracy + - Japanese + - Accuracy + - Korean + - Accuracy + - Portuguese + - Accuracy + - Spanish + - Accuracy + - Swahili + - Accuracy + - Yoruba + - Accuracy + - Accuracy +- [x] HELM + - [x] Helm capabilities + - [ ] Capabilities + - Mean + - Score + - [x] GPQA + - COT correct + 
- [x] IFEval + - IFEval Strict Acc + - [x] MMLU-Pro + - COT correct + - [x] Omni-MATH + - Accuracy + - [x] WildBench + - WB Score + - [x] Helm classic + - [x] BoolQ + - Exact Match + - [x] CivilComments + - Exact Match + - [ ] Classic + - Mean + - Win Rate + - [x] CNN/DailyMail + - ROUGE-2 + - [x] HellaSwag + - Exact Match + - [ ] IMDB + - Exact Match + - [x] MMLU + - Exact Match + - [ ] MS MARCO (TREC) + - NDCG@10 + - [ ] NarrativeQA + - F1 + - [ ] NaturalQuestions (open-book) + - F1 + - [ ] OpenbookQA + - Exact Match + - [x] QuAC + - F1 + - [ ] RAFT + - Exact Match + - [ ] TruthfulQA + - Exact Match + - [ ] XSUM + - ROUGE-2 + - [ ] Helm instruct + - [ ] Anthropic RLHF dataset + - Harmlessness + - [ ] Best ChatGPT Prompts + - Harmlessness + - [ ] Instruct + - Mean + - Win Rate + - [ ] Koala test dataset + - Harmlessness + - [ ] Open Assistant + - Harmlessness + - [ ] Self Instruct + - Harmlessness + - [ ] Vicuna + - Harmlessness + - [x] Helm lite + - [x] GSM8K + - Exact Match + - [x] LegalBench + - Exact Match + - [ ] Lite + - Mean + - Win Rate + - [ ] MATH + - Accuracy + - [x] MedQA + - Exact Match + - [x] MMLU + - Exact Match + - [ ] NarrativeQA + - F1 + - [ ] NaturalQuestions (closed-book) + - F1 + - [ ] OpenbookQA + - Exact Match + - [ ] WMT 2014 + - BLEU-4 + - [x] Helm mmlu + - Abstract Algebra + - Exact Match + - Anatomy + - Exact Match + - Astronomy + - Exact Match + - Business Ethics + - Exact Match + - Clinical Knowledge + - Exact Match + - College Physics + - Exact Match + - Computer Security + - Exact Match + - Conceptual Physics + - Exact Match + - Econometrics + - Exact Match + - Electrical Engineering + - Exact Match + - Elementary Mathematics + - Exact Match + - Formal Logic + - Exact Match + - Global Facts + - Exact Match + - High School World History + - Exact Match + - Human Sexuality + - Exact Match + - International Law + - Exact Match + - Jurisprudence + - Exact Match + - Logical Fallacies + - Exact Match + - Machine Learning + - Exact Match + 
- Management + - Exact Match + - Marketing + - Exact Match + - Mean + - Win Rate + - Medical Genetics + - Exact Match + - Miscellaneous + - Exact Match + - MMLU All Subjects + - Exact Match + - Moral Scenarios + - Exact Match + - Nutrition + - Exact Match + - Philosophy + - Exact Match + - Prehistory + - Exact Match + - Professional Psychology + - Exact Match + - Public Relations + - Exact Match + - Security Studies + - Exact Match + - Sociology + - Exact Match + - Us Foreign Policy + - Exact Match + - Virology + - Exact Match + - World Religions + - Exact Match +- [x] Hfopenllm v2 + - [ ] BBH + - Accuracy + - [x] GPQA + - Accuracy + - [x] IFEval + - Accuracy + - [x] MATH Level 5 + - Exact Match + - [x] MMLU-PRO + - Accuracy + - [x] MUSR + - Accuracy +- [ ] La leaderboard + - Score +- [ ] Livecodebenchpro + - [ ] Easy Problems + - Pass@1 + - [ ] Hard Problems + - Pass@1 + - [ ] Medium Problems + - Pass@1 +- [ ] Reward Bench + - [ ] Chat + - Score + - [ ] Chat Hard + - Score + - [ ] Prior Sets (0.5 weight) + - Score + - [ ] Reasoning + - Score + - [ ] Reward bench + - Score + - [ ] Safety + - Score +- [ ] Reward Bench 2 + - [ ] Factuality + - Score + - [ ] Focus + - Score + - [ ] Math + - Score + - [ ] Precise IF + - Score + - [ ] Reward bench 2 + - Score + - [ ] Safety + - Score + - [ ] Ties + - Score +- [ ] Sciarena + - Cost per 100 Calls + - Elo Rating + - Rank +- [ ] Swe Bench + - Score +- [ ] Tau Bench 2 + - [ ] tau-bench-2/airline + - Score + - [ ] tau-bench-2/retail + - Score + - [ ] tau-bench-2/telecom + - Score +- [ ] Terminal Bench 2 0 + - Score +- [ ] Theory of mind + - Score +- [ ] Wordle arena + - Average Attempts + - Average Latency (ms) + - Win Rate diff --git a/plan/backend-canonical-identity-plan.md b/plan/backend-canonical-identity-plan.md new file mode 100644 index 000000000..3cd00e9c5 --- /dev/null +++ b/plan/backend-canonical-identity-plan.md @@ -0,0 +1,115 @@ +# Backend Canonical Identity Plan (Data Audit + Actions) + +## Snapshot audited + +- 
**Code repo (`~/every_eval_ever`)** updated to `aa966f7cf` (origin/main). +- **Datastore (`evaleval/EEE_datastore`)** updated to `5edc7b9`. +- Audit scope: all aggregate JSON files under `data/**` (`6448` files, `49659` evaluation results). + +## What is happening (evidence from latest data) + +1. **Metric identity is mostly missing in production data** + - `metric_config.metric_name` missing in **37071 / 49659** results. + - `metric_config.metric_id` missing in **37071 / 49659** results. + - This is concentrated in major configs: `hfopenllm_v2`, `helm_*`, `reward-bench`, `global-mmlu-lite`, `fibble_arena`, `wordle_arena`, `terminal-bench-2.0`, etc. + - Concrete live examples: + - `global-mmlu-lite/xai_grok-3-mini/1773936496.366405` has **19** results and **0 / 19** populated `metric_name` or `metric_id` fields; the only explicit labels are `evaluation_name` values such as `Global MMLU Lite`, `Culturally Sensitive`, `Arabic`, `English`, etc. + - `wordle_arena/qwen/qwen3-8b/1776347262.820056` has **3** results and **0 / 3** populated `metric_name` or `metric_id` fields. + - Backend implication: cannot reliably group/compare metrics without string parsing heuristics. + +2. **`evaluation_name` is frequently carrying metric semantics** + - **615** results have metric-like `evaluation_name`. + - Confirmed examples: + - `apex-agents`: `evaluation_name: "Overall Pass@1"` (metric semantics in eval field). + - `bfcl`: `evaluation_name: "bfcl.memory.accuracy"` while metric fields are also populated (eval and metric axes collapsed). + - `theory_of_mind`: `evaluation_name: "accuracy on theory_of_mind for scorer ..."` (legacy converter style). + - `wordle_arena/qwen/qwen3-8b/1776347262.820056`: `evaluation_name` values are `wordle_arena_win_rate`, `wordle_arena_avg_attempts`, and `wordle_arena_avg_latency_ms`, so the eval axis is fully metric-shaped. 
+ - `global-mmlu-lite/xai_grok-3-mini/1773936496.366405`: `evaluation_name` is used for suite/slice labels (`Global MMLU Lite`, `Arabic`, `French`, etc.) while the implicit metric remains unstated, so eval and metric identity are still entangled even though the names are not metric-like. + - Backend implication: card grouping by evaluation name produces metric-shaped “benchmarks”. + +3. **`score_details.details` is overloaded as a nested telemetry dump** + - Found **52208** JSON-encoded values stored as strings inside `score_details.details`. + - HELM MMLU example (`Abstract Algebra` / `College Physics`) contains many cross-subject entries (e.g., College Chemistry/Biology stats inside College Physics row), mixing eval slice + telemetry dimensions. + - Backend implication: requires expensive post-parsing and risks accidental interpretation as benchmark/metric labels. + +4. **Benchmark/evaluation_id naming is not consistently aligned** + - **257** files where `evaluation_id` prefix does not match top-level folder benchmark codename. + - Main cases: + - `reward-bench` folder vs `evaluation_id` prefix `reward-bench-2`. + - `tau-bench-2_{domain}` and `appworld_test_normal` folders vs `evaluation_id` prefixes with hierarchical paths (`tau-bench-2/...`, `appworld/...`). + - Backend implication: any logic keyed on only one naming source (folder or `evaluation_id`) drifts. + +5. **Eval library naming is not standardized** + - **16 distinct `eval_library.name` values** including mixed casing and source-specific names (`lm-evaluation-harness`, `BFCL`, `Artificial Analysis`, `ARC Prize leaderboard`, `harbor`, `unknown`, etc.). + - Backend implication: harness-level analytics and joins need alias normalization today. + +6. **Fibble family note** + - Current snapshot no longer has `fibble1_arena`, `fibble2_arena` top-level folders; it is consolidated as `fibble_arena`. 
+ - But fibble still encodes both slice and metric in `evaluation_name` (`fibble_arena_1lie_win_rate`, `...avg_attempts`), with missing metric IDs. + +7. **`detailed_evaluation_results` coverage can be metric-selective inside one aggregate run** + - Current live example: `wordle_arena/qwen/qwen3-8b/1776347262.820056` exposes **3** aggregate metrics (`win_rate`, `avg_attempts`, `avg_latency_ms`) and links one sample file with **35** rows. + - All **35 / 35** current sample rows in `9a357c44-1c36-43dc-a764-de1f3e204fe1_samples.jsonl` carry `evaluation_name = "wordle_arena_win_rate"`. + - The same aggregate currently declares `detailed_evaluation_results.total_rows = 27`, so file-link metadata and actual sample-row counts can already disagree in production. + - Backend implication: a linked sample file does not imply run-wide instance coverage. Aggregate-to-instance linkage must remain metric-scoped, and instance-availability badges should be computed per metric or per eval-summary node, not per run. + +## Backend-centric recommendations (proposed) + +1. **Enforce canonical identity at ingestion (hard)** + - Persist canonical tuple (backend-owned): + `(run_id, model_id, benchmark_family_id, eval_slice_id, metric_id, harness_id, result_index)`. + - Keep raw fields in parallel (`raw_evaluation_name`, `raw_metric_description`, etc.) for audit/debug. + +2. **Add registry-backed resolution with confidence** + - Resolve benchmark/eval-slice/metric/harness via registry aliases (`exact`, `normalized`, `fuzzy`, `manual`). + - Store `strategy`, `confidence`, `review_status`; quarantine low-confidence rows from card generation. + +3. **Add semantic validation gates in ingestion CI** + - Reject or flag: + - metric-like `evaluation_name` without explicit metric identity, + - `evaluation_name == metric_name` collisions, + - benchmark-family naming drift (`folder` vs `evaluation_id` inconsistencies). 
+ - linked sample files whose rows cover only a strict subset of aggregate metrics without explicit metric-scoped coverage metadata. + - linked sample files whose observed row count disagrees with declared `detailed_evaluation_results.total_rows`. + - Keep structural schema validation, but add these semantic checks as a second gate. + +4. **Phase in stricter schema usage for metrics** + - Immediate: warn-only for missing `metric_name`/`metric_id`. + - Next: soft fail in bot with override. + - Final: hard fail (for new submissions) unless `metric_name` + `metric_id` are present. + +5. **Serve frontend from canonical IDs only** + - Frontend card grouping/filtering must use canonical IDs, never raw labels. + - Raw labels are display metadata only. + - Instance availability must be attached to canonical metric/eval-summary IDs, not inferred from the existence of any `detailed_evaluation_results` file on the parent run. + - This prevents recurring “benchmark cards that are actually metrics”. + +## Should we fix adapters and regenerate data? + +**Short answer: yes, but only for adapter-owned benchmark families.** + +### Good candidates for adapter-fix + regenerate + +Adapters exist in `utils/` for: +- `hfopenllm_v2` +- `helm` (`helm_lite`, `helm_mmlu`, `helm_capabilities`, `helm_classic`, `helm_instruct`) +- `rewardbench` +- `global-mmlu-lite` +- `terminal_bench_2` +- `exgentic` (used by tau/appworld/swe/browsecompplus in this dataset) + +These are high-leverage because they account for a large share of missing metric identity. + +### Not fully solved by adapter regeneration alone + +Several benchmark families in the data are not obviously sourced from current `utils/` adapters (or are manually/externally produced), for example: +- `apex-agents`, `apex-v1`, `bfcl`, `artificial-analysis-llms`, `arc-agi`, `sciarena`, `fibble_arena`, `wordle_arena`, `ace`, `la_leaderboard`.
+ +For these, you need a **backfill canonicalization migration** + submission template updates, not only adapter patches. + +### Practical plan + +1. Patch adapters to emit explicit `metric_name` + `metric_id` and metric-free `evaluation_name`. +2. Regenerate adapter-owned families in a controlled replay branch. +3. Run one-time migration for non-adapter/manual families. +4. Turn on semantic gating and canonical-ID-only serving. diff --git a/scripts/plot_dataset_statistics.py b/scripts/plot_dataset_statistics.py index b2479699b..e9b7a6c65 100644 --- a/scripts/plot_dataset_statistics.py +++ b/scripts/plot_dataset_statistics.py @@ -76,9 +76,14 @@ def import_plotting() -> tuple[Any, Any | None]: def label(row: dict[str, Any]) -> str: benchmark = str(row['benchmark']) evaluation = str(row['evaluation_name']) + metric = row.get('metric_id') or row.get('metric_name') if benchmark == evaluation: - return benchmark - return f'{benchmark}: {evaluation}' + base = benchmark + else: + base = f'{benchmark}: {evaluation}' + if metric: + return f'{base} [{metric}]' + return base def short_label(value: str, width: int = 46) -> str: diff --git a/tests/test_dataset_statistics.py b/tests/test_dataset_statistics.py index b9f2cbd6d..708572cf6 100644 --- a/tests/test_dataset_statistics.py +++ b/tests/test_dataset_statistics.py @@ -14,6 +14,10 @@ def row( lower_is_better: bool = False, score_type: str | None = 'continuous', inference_engine: str | None = None, + metric_id: str | None = 'score', + metric_name: str | None = 'Score', + metric_kind: str | None = 'accuracy', + metric_unit: str | None = 'proportion', ) -> dict: return { 'schema_version': '0.2.2', @@ -29,6 +33,10 @@ def row( 'lower_is_better': lower_is_better, 'score_type': score_type, 'has_uncertainty': False, + 'metric_id': metric_id, + 'metric_name': metric_name, + 'metric_kind': metric_kind, + 'metric_unit': metric_unit, } @@ -59,11 +67,24 @@ def test_invalid_rows_are_excluded_and_counted(): } -def 
test_shared_evaluation_key_includes_score_scale_and_direction(): +def test_shared_evaluation_key_includes_metric_scale_and_direction(): base = row('a', 'bench', 'eval', 0.8) + different_metric = row( + 'a', + 'bench', + 'eval', + 0.7, + metric_id='cost_per_task', + metric_name='Cost per task', + metric_kind='cost', + metric_unit='usd', + ) different_scale = row('a', 'bench', 'eval', 80.0, max_score=100.0) different_direction = row('a', 'bench', 'eval', 0.2, lower_is_better=True) + assert stats.shared_evaluation_key(base) != stats.shared_evaluation_key( + different_metric + ) assert stats.shared_evaluation_key(base) != stats.shared_evaluation_key( different_scale ) @@ -143,10 +164,51 @@ def test_json_report_shape(): assert report['descriptive']['counts']['result_rows'] == 2 assert 'inference_engines' in report['descriptive'] assert 'models_per_benchmark' in report['descriptive'] + assert 'metric_id' in report['descriptive']['score_summaries'][0] assert 'coverage_aware_model_summaries' in report['observational'] assert 'pairwise_model_comparisons' in report['observational'] +def test_score_summaries_group_by_metric_identity(): + rows = [ + row('model/a', 'arc', 'v1_Semi_Private', 0.98), + row( + 'model/a', + 'arc', + 'v1_Semi_Private', + 17.0, + max_score=77.2, + lower_is_better=True, + metric_id='cost_per_task', + metric_name='Cost per task', + metric_kind='cost', + metric_unit='usd', + ), + ] + + report = stats.build_statistics_report( + rows, + summary_limit=10, + comparison_limit=5, + top_model_limit=5, + min_shared_evals=1, + descriptive_only=True, + ) + + raw_summaries = report['descriptive']['score_summaries'] + normalized_summaries = report['descriptive']['normalized_score_summaries'] + + assert {item['metric_id'] for item in raw_summaries} == { + 'score', + 'cost_per_task', + } + assert {item['count'] for item in raw_summaries} == {1} + assert {item['metric_id'] for item in normalized_summaries} == { + 'score', + 'cost_per_task', + } + + def 
test_models_per_benchmark_dedupes_model_counts(): rows = [ row('model/a', 'bench-one', 'eval-a', 0.9), From 8c2c5783e0a26ef02f3077b696200fb40ffbbe70 Mon Sep 17 00:00:00 2001 From: Yanan Long Date: Thu, 30 Apr 2026 09:49:53 -0300 Subject: [PATCH 08/15] minor changes --- .gitignore | 1 + audit/audit_after.json | 262 ++ audit/audit_before.json | 426 ++ audit/dataset_statistics.json | 2407 ++++++++++ .../coverage_counts.pdf | Bin 0 -> 16555 bytes .../inference_engine_spread.pdf | Bin 0 -> 16123 bytes .../models_per_dataset_histogram.pdf | Bin 0 -> 13934 bytes .../normalization_quality.pdf | Bin 0 -> 15186 bytes .../normalized_score_mean_by_eval.pdf | Bin 0 -> 22251 bytes .../normalized_score_variability.pdf | Bin 0 -> 34031 bytes .../score_range_by_eval.pdf | Bin 0 -> 21724 bytes .../top_evaluation_coverage.pdf | Bin 0 -> 21142 bytes audit/dataset_statistics_summary.md | 37 + every_eval_ever/helpers/dataset_statistics.py | 35 +- misc/dataset_statistics_summary_writer.py | 109 + misc/eval_hierarchy.json | 4187 +++++++++++++++++ misc/eval_hierarchy.md | 459 ++ plan/backend-canonical-identity-plan.md | 115 + scripts/plot_dataset_statistics.py | 9 +- tests/test_dataset_statistics.py | 64 +- 20 files changed, 8102 insertions(+), 9 deletions(-) create mode 100644 audit/audit_after.json create mode 100644 audit/audit_before.json create mode 100644 audit/dataset_statistics.json create mode 100644 audit/dataset_statistics_plots/coverage_counts.pdf create mode 100644 audit/dataset_statistics_plots/inference_engine_spread.pdf create mode 100644 audit/dataset_statistics_plots/models_per_dataset_histogram.pdf create mode 100644 audit/dataset_statistics_plots/normalization_quality.pdf create mode 100644 audit/dataset_statistics_plots/normalized_score_mean_by_eval.pdf create mode 100644 audit/dataset_statistics_plots/normalized_score_variability.pdf create mode 100644 audit/dataset_statistics_plots/score_range_by_eval.pdf create mode 100644 
audit/dataset_statistics_plots/top_evaluation_coverage.pdf create mode 100644 audit/dataset_statistics_summary.md create mode 100644 misc/dataset_statistics_summary_writer.py create mode 100644 misc/eval_hierarchy.json create mode 100644 misc/eval_hierarchy.md create mode 100644 plan/backend-canonical-identity-plan.md diff --git a/.gitignore b/.gitignore index d493519d7..fa0573f49 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # Local data (generated by running adapters) # data/ +audit/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/audit/audit_after.json b/audit/audit_after.json new file mode 100644 index 000000000..0dda99be0 --- /dev/null +++ b/audit/audit_after.json @@ -0,0 +1,262 @@ +{ + "files_scanned": 6448, + "results_scanned": 49659, + "missing": { + "metric_id": 1021, + "metric_name": 1021, + "metric_kind": 1021, + "metric_unit": 1021 + }, + "malformed": {}, + "top_missing_by_benchmark": { + "evaluation_result_id": [], + "metric_id": [ + [ + "fibble_arena", + 336 + ], + [ + "helm_classic", + 201 + ], + [ + "helm_lite", + 182 + ], + [ + "livecodebenchpro", + 87 + ], + [ + "helm_capabilities", + 68 + ], + [ + "ace", + 32 + ], + [ + "apex-v1", + 19 + ], + [ + "appworld_test_normal", + 15 + ], + [ + "browsecompplus", + 15 + ], + [ + "swe-bench", + 15 + ], + [ + "tau-bench-2_airline", + 15 + ], + [ + "tau-bench-2_retail", + 15 + ], + [ + "tau-bench-2_telecom", + 15 + ], + [ + "la_leaderboard", + 5 + ], + [ + "theory_of_mind", + 1 + ] + ], + "metric_name": [ + [ + "fibble_arena", + 336 + ], + [ + "helm_classic", + 201 + ], + [ + "helm_lite", + 182 + ], + [ + "livecodebenchpro", + 87 + ], + [ + "helm_capabilities", + 68 + ], + [ + "ace", + 32 + ], + [ + "apex-v1", + 19 + ], + [ + "appworld_test_normal", + 15 + ], + [ + "browsecompplus", + 15 + ], + [ + "swe-bench", + 15 + ], + [ + "tau-bench-2_airline", + 15 + ], + [ + "tau-bench-2_retail", + 15 + ], + [ + "tau-bench-2_telecom", + 15 + ], + [ + "la_leaderboard", + 5 + ], + [ + 
"theory_of_mind", + 1 + ] + ], + "metric_kind": [ + [ + "fibble_arena", + 336 + ], + [ + "helm_classic", + 201 + ], + [ + "helm_lite", + 182 + ], + [ + "livecodebenchpro", + 87 + ], + [ + "helm_capabilities", + 68 + ], + [ + "ace", + 32 + ], + [ + "apex-v1", + 19 + ], + [ + "appworld_test_normal", + 15 + ], + [ + "browsecompplus", + 15 + ], + [ + "swe-bench", + 15 + ], + [ + "tau-bench-2_airline", + 15 + ], + [ + "tau-bench-2_retail", + 15 + ], + [ + "tau-bench-2_telecom", + 15 + ], + [ + "la_leaderboard", + 5 + ], + [ + "theory_of_mind", + 1 + ] + ], + "metric_unit": [ + [ + "fibble_arena", + 336 + ], + [ + "helm_classic", + 201 + ], + [ + "helm_lite", + 182 + ], + [ + "livecodebenchpro", + 87 + ], + [ + "helm_capabilities", + 68 + ], + [ + "ace", + 32 + ], + [ + "apex-v1", + 19 + ], + [ + "appworld_test_normal", + 15 + ], + [ + "browsecompplus", + 15 + ], + [ + "swe-bench", + 15 + ], + [ + "tau-bench-2_airline", + 15 + ], + [ + "tau-bench-2_retail", + 15 + ], + [ + "tau-bench-2_telecom", + 15 + ], + [ + "la_leaderboard", + 5 + ], + [ + "theory_of_mind", + 1 + ] + ] + } +} \ No newline at end of file diff --git a/audit/audit_before.json b/audit/audit_before.json new file mode 100644 index 000000000..7b432cb88 --- /dev/null +++ b/audit/audit_before.json @@ -0,0 +1,426 @@ +{ + "files_scanned": 6448, + "results_scanned": 49659, + "missing": { + "evaluation_result_id": 37071, + "metric_id": 37071, + "metric_name": 37071, + "metric_kind": 37071, + "metric_unit": 37071 + }, + "malformed": { + "evaluation_result_id_pattern": 12588 + }, + "top_missing_by_benchmark": { + "evaluation_result_id": [ + [ + "hfopenllm_v2", + 27444 + ], + [ + "helm_mmlu", + 2844 + ], + [ + "reward-bench", + 2404 + ], + [ + "helm_classic", + 1005 + ], + [ + "global-mmlu-lite", + 912 + ], + [ + "helm_lite", + 910 + ], + [ + "fibble_arena", + 559 + ], + [ + "helm_capabilities", + 408 + ], + [ + "wordle_arena", + 134 + ], + [ + "terminal-bench-2.0", + 115 + ], + [ + "livecodebenchpro", + 87 + ], + [ 
+ "apex-agents", + 74 + ], + [ + "ace", + 32 + ], + [ + "helm_instruct", + 28 + ], + [ + "apex-v1", + 19 + ], + [ + "appworld_test_normal", + 15 + ], + [ + "browsecompplus", + 15 + ], + [ + "swe-bench", + 15 + ], + [ + "tau-bench-2_airline", + 15 + ], + [ + "tau-bench-2_retail", + 15 + ] + ], + "metric_id": [ + [ + "hfopenllm_v2", + 27444 + ], + [ + "helm_mmlu", + 2844 + ], + [ + "reward-bench", + 2404 + ], + [ + "helm_classic", + 1005 + ], + [ + "global-mmlu-lite", + 912 + ], + [ + "helm_lite", + 910 + ], + [ + "fibble_arena", + 559 + ], + [ + "helm_capabilities", + 408 + ], + [ + "wordle_arena", + 134 + ], + [ + "terminal-bench-2.0", + 115 + ], + [ + "livecodebenchpro", + 87 + ], + [ + "apex-agents", + 74 + ], + [ + "ace", + 32 + ], + [ + "helm_instruct", + 28 + ], + [ + "apex-v1", + 19 + ], + [ + "appworld_test_normal", + 15 + ], + [ + "browsecompplus", + 15 + ], + [ + "swe-bench", + 15 + ], + [ + "tau-bench-2_airline", + 15 + ], + [ + "tau-bench-2_retail", + 15 + ] + ], + "metric_name": [ + [ + "hfopenllm_v2", + 27444 + ], + [ + "helm_mmlu", + 2844 + ], + [ + "reward-bench", + 2404 + ], + [ + "helm_classic", + 1005 + ], + [ + "global-mmlu-lite", + 912 + ], + [ + "helm_lite", + 910 + ], + [ + "fibble_arena", + 559 + ], + [ + "helm_capabilities", + 408 + ], + [ + "wordle_arena", + 134 + ], + [ + "terminal-bench-2.0", + 115 + ], + [ + "livecodebenchpro", + 87 + ], + [ + "apex-agents", + 74 + ], + [ + "ace", + 32 + ], + [ + "helm_instruct", + 28 + ], + [ + "apex-v1", + 19 + ], + [ + "appworld_test_normal", + 15 + ], + [ + "browsecompplus", + 15 + ], + [ + "swe-bench", + 15 + ], + [ + "tau-bench-2_airline", + 15 + ], + [ + "tau-bench-2_retail", + 15 + ] + ], + "metric_kind": [ + [ + "hfopenllm_v2", + 27444 + ], + [ + "helm_mmlu", + 2844 + ], + [ + "reward-bench", + 2404 + ], + [ + "helm_classic", + 1005 + ], + [ + "global-mmlu-lite", + 912 + ], + [ + "helm_lite", + 910 + ], + [ + "fibble_arena", + 559 + ], + [ + "helm_capabilities", + 408 + ], + [ + "wordle_arena", 
+ 134 + ], + [ + "terminal-bench-2.0", + 115 + ], + [ + "livecodebenchpro", + 87 + ], + [ + "apex-agents", + 74 + ], + [ + "ace", + 32 + ], + [ + "helm_instruct", + 28 + ], + [ + "apex-v1", + 19 + ], + [ + "appworld_test_normal", + 15 + ], + [ + "browsecompplus", + 15 + ], + [ + "swe-bench", + 15 + ], + [ + "tau-bench-2_airline", + 15 + ], + [ + "tau-bench-2_retail", + 15 + ] + ], + "metric_unit": [ + [ + "hfopenllm_v2", + 27444 + ], + [ + "helm_mmlu", + 2844 + ], + [ + "reward-bench", + 2404 + ], + [ + "helm_classic", + 1005 + ], + [ + "global-mmlu-lite", + 912 + ], + [ + "helm_lite", + 910 + ], + [ + "fibble_arena", + 559 + ], + [ + "helm_capabilities", + 408 + ], + [ + "wordle_arena", + 134 + ], + [ + "terminal-bench-2.0", + 115 + ], + [ + "livecodebenchpro", + 87 + ], + [ + "apex-agents", + 74 + ], + [ + "ace", + 32 + ], + [ + "helm_instruct", + 28 + ], + [ + "apex-v1", + 19 + ], + [ + "appworld_test_normal", + 15 + ], + [ + "browsecompplus", + 15 + ], + [ + "swe-bench", + 15 + ], + [ + "tau-bench-2_airline", + 15 + ], + [ + "tau-bench-2_retail", + 15 + ] + ] + } +} \ No newline at end of file diff --git a/audit/dataset_statistics.json b/audit/dataset_statistics.json new file mode 100644 index 000000000..dffd83d39 --- /dev/null +++ b/audit/dataset_statistics.json @@ -0,0 +1,2407 @@ +{ + "descriptive": { + "counts": { + "result_rows": 40495, + "unique_benchmarks": 59, + "unique_developers": 794, + "unique_evaluations": 178, + "unique_models": 5299 + }, + "inference_engines": [ + { + "count": 39618, + "value": "unknown" + }, + { + "count": 450, + "value": "ollama" + }, + { + "count": 150, + "value": "openai" + }, + { + "count": 54, + "value": "google" + }, + { + "count": 47, + "value": "anthropic" + }, + { + "count": 33, + "value": "gemini" + }, + { + "count": 30, + "value": "openrouter" + }, + { + "count": 26, + "value": "deepseek" + }, + { + "count": 18, + "value": "minimax" + }, + { + "count": 15, + "value": "moonshot" + }, + { + "count": 15, + "value": "ark" 
+ }, + { + "count": 12, + "value": "zhipu" + }, + { + "count": 12, + "value": "qwen" + }, + { + "count": 12, + "value": "aliyun" + }, + { + "count": 3, + "value": "kuaishou" + } + ], + "models_per_benchmark": [ + { + "benchmark": "GPQA", + "result_rows": 4635, + "unique_models": 4557 + }, + { + "benchmark": "IFEval", + "result_rows": 4635, + "unique_models": 4557 + }, + { + "benchmark": "BBH", + "result_rows": 4574, + "unique_models": 4496 + }, + { + "benchmark": "MATH Level 5", + "result_rows": 4574, + "unique_models": 4496 + }, + { + "benchmark": "MMLU-PRO", + "result_rows": 4574, + "unique_models": 4496 + }, + { + "benchmark": "MUSR", + "result_rows": 4574, + "unique_models": 4496 + }, + { + "benchmark": "RewardBench 2", + "result_rows": 1379, + "unique_models": 197 + }, + { + "benchmark": "RewardBench", + "result_rows": 1025, + "unique_models": 179 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "result_rows": 1020, + "unique_models": 139 + }, + { + "benchmark": "BFCL leaderboard CSV", + "result_rows": 3350, + "unique_models": 109 + }, + { + "benchmark": "GSM8K", + "result_rows": 91, + "unique_models": 91 + }, + { + "benchmark": "LegalBench", + "result_rows": 91, + "unique_models": 91 + }, + { + "benchmark": "MATH", + "result_rows": 91, + "unique_models": 91 + }, + { + "benchmark": "MMLU", + "result_rows": 91, + "unique_models": 91 + }, + { + "benchmark": "MedQA", + "result_rows": 91, + "unique_models": 91 + }, + { + "benchmark": "NarrativeQA", + "result_rows": 91, + "unique_models": 91 + }, + { + "benchmark": "NaturalQuestions (closed-book)", + "result_rows": 91, + "unique_models": 91 + }, + { + "benchmark": "OpenbookQA", + "result_rows": 91, + "unique_models": 91 + }, + { + "benchmark": "WMT 2014", + "result_rows": 91, + "unique_models": 91 + }, + { + "benchmark": "helm_lite", + "result_rows": 91, + "unique_models": 91 + }, + { + "benchmark": "helm_mmlu", + "result_rows": 2844, + "unique_models": 79 + }, + { + "benchmark": "MMLU-Pro", + 
"result_rows": 61, + "unique_models": 61 + }, + { + "benchmark": "Omni-MATH", + "result_rows": 61, + "unique_models": 61 + }, + { + "benchmark": "WildBench", + "result_rows": 61, + "unique_models": 61 + }, + { + "benchmark": "helm_capabilities", + "result_rows": 61, + "unique_models": 61 + }, + { + "benchmark": "Wordle Arena Word Set", + "result_rows": 75, + "unique_models": 43 + }, + { + "benchmark": "Fibble Arena (1 lie) Word Set", + "result_rows": 64, + "unique_models": 40 + }, + { + "benchmark": "SciArena leaderboard API", + "result_rows": 114, + "unique_models": 38 + }, + { + "benchmark": "Fibble2 Arena (2 lies) Word Set", + "result_rows": 46, + "unique_models": 38 + }, + { + "benchmark": "Fibble5 Arena (5 lies) Word Set", + "result_rows": 50, + "unique_models": 37 + }, + { + "benchmark": "Fibble3 Arena (3 lies) Word Set", + "result_rows": 40, + "unique_models": 37 + }, + { + "benchmark": "Fibble4 Arena (4 lies) Word Set", + "result_rows": 38, + "unique_models": 36 + }, + { + "benchmark": "wordle_arena_daily", + "result_rows": 92, + "unique_models": 32 + }, + { + "benchmark": "fibble4_arena_daily", + "result_rows": 84, + "unique_models": 28 + }, + { + "benchmark": "fibble5_arena_daily", + "result_rows": 84, + "unique_models": 28 + }, + { + "benchmark": "fibble_arena_daily", + "result_rows": 82, + "unique_models": 28 + }, + { + "benchmark": "global-mmlu-lite", + "result_rows": 912, + "unique_models": 27 + }, + { + "benchmark": "Easy Problems", + "result_rows": 29, + "unique_models": 27 + }, + { + "benchmark": "Hard Problems", + "result_rows": 29, + "unique_models": 27 + }, + { + "benchmark": "Medium Problems", + "result_rows": 29, + "unique_models": 27 + }, + { + "benchmark": "fibble3_arena_daily", + "result_rows": 75, + "unique_models": 25 + }, + { + "benchmark": "fibble2_arena_daily", + "result_rows": 66, + "unique_models": 22 + }, + { + "benchmark": "apex-agents", + "result_rows": 74, + "unique_models": 20 + }, + { + "benchmark": "ace", + "result_rows": 32, 
+ "unique_models": 12 + }, + { + "benchmark": "apex-v1", + "result_rows": 19, + "unique_models": 10 + }, + { + "benchmark": "La Leaderboard composite dataset", + "result_rows": 5, + "unique_models": 5 + }, + { + "benchmark": "Anthropic RLHF dataset", + "result_rows": 4, + "unique_models": 4 + }, + { + "benchmark": "Best ChatGPT Prompts", + "result_rows": 4, + "unique_models": 4 + }, + { + "benchmark": "Koala test dataset", + "result_rows": 4, + "unique_models": 4 + }, + { + "benchmark": "Open Assistant", + "result_rows": 4, + "unique_models": 4 + }, + { + "benchmark": "Self Instruct", + "result_rows": 4, + "unique_models": 4 + }, + { + "benchmark": "Vicuna", + "result_rows": 4, + "unique_models": 4 + }, + { + "benchmark": "helm_instruct", + "result_rows": 4, + "unique_models": 4 + }, + { + "benchmark": "appworld/test_normal", + "result_rows": 15, + "unique_models": 3 + }, + { + "benchmark": "browsecompplus", + "result_rows": 15, + "unique_models": 3 + }, + { + "benchmark": "swe-bench", + "result_rows": 15, + "unique_models": 3 + }, + { + "benchmark": "tau-bench-2/airline", + "result_rows": 15, + "unique_models": 3 + }, + { + "benchmark": "tau-bench-2/retail", + "result_rows": 15, + "unique_models": 3 + }, + { + "benchmark": "tau-bench-2/telecom", + "result_rows": 15, + "unique_models": 3 + } + ], + "normalization_exclusions": { + "incompatible_score_type": 0, + "missing_bounds": 0, + "missing_score": 0, + "out_of_range": 100, + "zero_width_bounds": 0 + }, + "normalized_score_summaries": [ + { + "benchmark": "GPQA", + "count": 4635, + "evaluation_name": "GPQA", + "max": 0.791, + "mean": 0.30281846817691477, + "median": 0.2953, + "min": 0.168, + "stddev": 0.04912650528590854 + }, + { + "benchmark": "IFEval", + "count": 4635, + "evaluation_name": "IFEval", + "max": 0.951, + "mean": 0.46067240560949296, + "median": 0.4545, + "min": 0.0, + "stddev": 0.20767533842318336 + }, + { + "benchmark": "BBH", + "count": 4574, + "evaluation_name": "BBH", + "max": 0.8269, + "mean": 
0.4867208351552252, + "median": 0.5038, + "min": 0.2178, + "stddev": 0.11398463853942328 + }, + { + "benchmark": "MATH Level 5", + "count": 4574, + "evaluation_name": "MATH Level 5", + "max": 0.7145, + "mean": 0.1555723874070835, + "median": 0.108, + "min": 0.0, + "stddev": 0.14625658002062183 + }, + { + "benchmark": "MMLU-PRO", + "count": 4574, + "evaluation_name": "MMLU-PRO", + "max": 0.7303, + "mean": 0.32874433756012245, + "median": 0.34475, + "min": 0.1026, + "stddev": 0.12833971558059434 + }, + { + "benchmark": "MUSR", + "count": 4574, + "evaluation_name": "MUSR", + "max": 0.6024, + "mean": 0.40635732400524704, + "median": 0.4091, + "min": 0.2929, + "stddev": 0.04536121071938266 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 266, + "evaluation_name": "v2_Semi_Private", + "max": 1.0, + "mean": 0.5578856391307715, + "median": 0.75515, + "min": 0.0, + "stddev": 0.44976366617156693 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 262, + "evaluation_name": "v1_Semi_Private", + "max": 0.9999805606556713, + "mean": 0.7136057730617251, + "median": 0.92835, + "min": 0.0, + "stddev": 0.3413295062389333 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 250, + "evaluation_name": "v2_Public_Eval", + "max": 1.0, + "mean": 0.5578486693149027, + "median": 0.8591871038330539, + "min": 0.0, + "stddev": 0.46020565690537485 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 228, + "evaluation_name": "v1_Public_Eval", + "max": 0.999984448524537, + "mean": 0.750460640659595, + "median": 0.9602438445183996, + "min": 0.0175, + "stddev": 0.3138616973551216 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Factuality", + "max": 0.8716, + "mean": 0.6400781725888325, + "median": 0.6779, + "min": 0.0274, + "stddev": 0.14060436598989037 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Focus", + "max": 0.9838, + "mean": 
0.6965137055837564, + "median": 0.7293, + "min": 0.0646, + "stddev": 0.1999740938960993 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Math", + "max": 0.898, + "mean": 0.6002578680203046, + "median": 0.6175, + "min": 0.0546, + "stddev": 0.11530869084864068 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Precise IF", + "max": 0.6625, + "mean": 0.3724553299492386, + "median": 0.375, + "min": 0.1313, + "stddev": 0.06683254610514013 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Safety", + "max": 0.9756, + "mean": 0.770956345177665, + "median": 0.8044, + "min": 0.0378, + "stddev": 0.16859961817216138 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Score", + "max": 0.8413, + "mean": 0.602605076142132, + "median": 0.6194, + "min": 0.0576, + "stddev": 0.13540270878209892 + }, + { + "benchmark": "RewardBench 2", + "count": 191, + "evaluation_name": "Ties", + "max": 0.9063, + "mean": 0.5524884816753927, + "median": 0.5604, + "min": 0.008, + "stddev": 0.19526001389051642 + }, + { + "benchmark": "RewardBench", + "count": 187, + "evaluation_name": "Chat Hard", + "max": 0.9145, + "mean": 0.6117941176470588, + "median": 0.6053, + "min": 0.2654, + "stddev": 0.1713479724227396 + }, + { + "benchmark": "RewardBench", + "count": 187, + "evaluation_name": "Chat", + "max": 0.9944, + "mean": 0.8923390374331551, + "median": 0.9413, + "min": 0.3547, + "stddev": 0.12437365150350695 + }, + { + "benchmark": "RewardBench", + "count": 187, + "evaluation_name": "Safety", + "max": 0.9514, + "mean": 0.75624064171123, + "median": 0.7946, + "min": 0.3743, + "stddev": 0.14897429003710377 + }, + { + "benchmark": "RewardBench", + "count": 187, + "evaluation_name": "Score", + "max": 0.9511, + "mean": 0.7524326203208556, + "median": 0.7455, + "min": 0.4727, + "stddev": 0.12766260032441618 + }, + { + "benchmark": "RewardBench", + "count": 172, + "evaluation_name": "Reasoning", + 
"max": 0.9912, + "mean": 0.779306976744186, + "median": 0.80125, + "min": 0.2821, + "stddev": 0.16510278548710738 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_accuracy", + "max": 0.9312, + "mean": 0.6721155963302752, + "median": 0.7076, + "min": 0.0, + "stddev": 0.16692855101327364 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_multiple_ast_accuracy", + "max": 0.9401999999999999, + "mean": 0.6615788990825688, + "median": 0.7104, + "min": 0.0, + "stddev": 0.17084967242914786 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_parallel_ast_accuracy", + "max": 0.9375, + "mean": 0.6427752293577982, + "median": 0.75, + "min": 0.0, + "stddev": 0.24460198666555008 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_parallel_multiple_ast_accuracy", + "max": 0.9582999999999999, + "mean": 0.5703339449541285, + "median": 0.625, + "min": 0.0, + "stddev": 0.2059801726435246 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_simple_ast_accuracy", + "max": 0.9031, + "mean": 0.726408256880734, + "median": 0.7636, + "min": 0.0, + "stddev": 0.1625125032958663 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.memory.accuracy", + "max": 0.7376, + "mean": 0.20235045871559632, + "median": 0.157, + "min": 0.0, + "stddev": 0.1699218603771948 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.memory.kv_accuracy", + "max": 0.7097, + "mean": 0.13904036697247707, + "median": 0.0839, + "min": 0.0, + "stddev": 0.1515138492137527 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.memory.recursive_summarization_accuracy", + "max": 0.8323, + "mean": 0.2820403669724771, + "median": 0.271, + "min": 0.0, + "stddev": 
0.208463795648454 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.memory.vector_accuracy", + "max": 0.7290000000000001, + "mean": 0.18597155963302753, + "median": 0.1161, + "min": 0.0, + "stddev": 0.18379301567138523 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.accuracy", + "max": 0.7737999999999999, + "mean": 0.23962385321100918, + "median": 0.165, + "min": 0.0, + "stddev": 0.21479676048452157 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.base_accuracy", + "max": 0.825, + "mean": 0.29009174311926605, + "median": 0.2, + "min": 0.0, + "stddev": 0.24897845144318115 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.long_context_accuracy", + "max": 0.76, + "mean": 0.24009174311926607, + "median": 0.175, + "min": 0.0, + "stddev": 0.2138372755020874 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.miss_function_accuracy", + "max": 0.77, + "mean": 0.21591743119266055, + "median": 0.14, + "min": 0.0, + "stddev": 0.2171396175036615 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.miss_parameter_accuracy", + "max": 0.74, + "mean": 0.21238532110091743, + "median": 0.15, + "min": 0.0, + "stddev": 0.194452693868985 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.non_live.ast_accuracy", + "max": 0.9065000000000001, + "mean": 0.7661733944954129, + "median": 0.83, + "min": 0.0, + "stddev": 0.18657086363085557 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.non_live.multiple_ast_accuracy", + "max": 0.97, + "mean": 0.8535779816513761, + "median": 0.92, + "min": 0.0, + "stddev": 0.182740318362281 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": 
"bfcl.non_live.parallel_ast_accuracy", + "max": 0.96, + "mean": 0.7979816513761467, + "median": 0.88, + "min": 0.0, + "stddev": 0.2273336991546167 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.non_live.parallel_multiple_ast_accuracy", + "max": 0.925, + "mean": 0.7347706422018349, + "median": 0.825, + "min": 0.0, + "stddev": 0.24427840192832814 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.non_live.simple_ast_accuracy", + "max": 0.8067, + "mean": 0.6783633027522936, + "median": 0.7258, + "min": 0.0, + "stddev": 0.14843039998882532 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.latency_mean_s", + "max": 0.9959969388355802, + "mean": 0.910949171600733, + "median": 0.9723906516748102, + "min": 0.0, + "stddev": 0.16788751393048792 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.latency_p95_s", + "max": 0.9983116129372659, + "mean": 0.9052860681766953, + "median": 0.9794227826729278, + "min": 0.0, + "stddev": 0.17750828285090742 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.latency_std_s", + "max": 0.9978872247523358, + "mean": 0.8712378709255851, + "median": 0.9528616366965585, + "min": 0.0, + "stddev": 0.18715211182331667 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.overall_accuracy", + "max": 0.7746999999999999, + "mean": 0.3809394495412844, + "median": 0.3552, + "min": 0.0717, + "stddev": 0.1568359888890471 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.rank", + "max": 1.0, + "mean": 0.5, + "median": 0.5, + "min": 0.0, + "stddev": 0.2926814601721238 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.total_cost_usd", + "max": 0.9987048455669116, + "mean": 0.8673404362764129, 
+ "median": 0.9486161556437762, + "min": 0.0, + "stddev": 0.2029161256124978 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.relevance.irrelevance_detection_accuracy", + "max": 1.0, + "mean": 0.7561073394495413, + "median": 0.8079000000000001, + "min": 0.06280000000000001, + "stddev": 0.16896574532662487 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.relevance.relevance_detection_accuracy", + "max": 1.0, + "mean": 0.7637614678899083, + "median": 0.8125, + "min": 0.0, + "stddev": 0.19862042242738473 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.web_search.accuracy", + "max": 0.845, + "mean": 0.24573394495412845, + "median": 0.105, + "min": 0.0, + "stddev": 0.28751797503234583 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.web_search.base_accuracy", + "max": 0.87, + "mean": 0.2646788990825688, + "median": 0.13, + "min": 0.0, + "stddev": 0.29552705211555524 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.web_search.no_snippet_accuracy", + "max": 0.85, + "mean": 0.22678899082568807, + "median": 0.09, + "min": 0.0, + "stddev": 0.28410639873751836 + }, + { + "benchmark": "RewardBench", + "count": 105, + "evaluation_name": "Prior Sets (0.5 weight)", + "max": 0.782, + "mean": 0.5625428571428571, + "median": 0.5757, + "min": 0.0, + "stddev": 0.17788750218625798 + }, + { + "benchmark": "LegalBench", + "count": 91, + "evaluation_name": "LegalBench", + "max": 0.757, + "mean": 0.5902087912087912, + "median": 0.629, + "min": 0.331, + "stddev": 0.11619442676283923 + }, + { + "benchmark": "MATH", + "count": 91, + "evaluation_name": "MATH", + "max": 0.92, + "mean": 0.5574065934065934, + "median": 0.656, + "min": 0.026, + "stddev": 0.2685588691111619 + }, + { + "benchmark": "MMLU", + "count": 91, + "evaluation_name": "MMLU", + "max": 0.809, + "mean": 
0.6220989010989011, + "median": 0.643, + "min": 0.243, + "stddev": 0.12023218786489331 + }, + { + "benchmark": "MedQA", + "count": 91, + "evaluation_name": "MedQA", + "max": 0.863, + "mean": 0.6103296703296703, + "median": 0.64, + "min": 0.229, + "stddev": 0.15792234765120447 + }, + { + "benchmark": "NarrativeQA", + "count": 91, + "evaluation_name": "NarrativeQA", + "max": 0.804, + "mean": 0.6938461538461539, + "median": 0.742, + "min": 0.111, + "stddev": 0.1228501275789075 + }, + { + "benchmark": "NaturalQuestions (closed-book)", + "count": 91, + "evaluation_name": "NaturalQuestions (closed-book)", + "max": 0.502, + "mean": 0.3627912087912088, + "median": 0.378, + "min": 0.028, + "stddev": 0.08850543190907255 + }, + { + "benchmark": "OpenbookQA", + "count": 91, + "evaluation_name": "OpenbookQA", + "max": 0.972, + "mean": 0.8312527472527472, + "median": 0.882, + "min": 0.222, + "stddev": 0.16911788087383792 + }, + { + "benchmark": "WMT 2014", + "count": 91, + "evaluation_name": "WMT 2014", + "max": 0.262, + "mean": 0.18178021978021977, + "median": 0.191, + "min": 0.023, + "stddev": 0.04641450975187302 + }, + { + "benchmark": "helm_lite", + "count": 91, + "evaluation_name": "Mean win rate", + "max": 0.938, + "mean": 0.499967032967033, + "median": 0.488, + "min": 0.041, + "stddev": 0.24004497034928224 + }, + { + "benchmark": "GSM8K", + "count": 90, + "evaluation_name": "GSM8K", + "max": 0.956, + "mean": 0.6740333333333334, + "median": 0.765, + "min": 0.028, + "stddev": 0.24790177694247365 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Abstract Algebra", + "max": 0.84, + "mean": 0.4692405063291139, + "median": 0.44, + "min": 0.21, + "stddev": 0.1566784405169303 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Anatomy", + "max": 0.911, + "mean": 0.7049620253164557, + "median": 0.719, + "min": 0.222, + "stddev": 0.12203524533321435 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Astronomy", + 
"max": 0.974, + "mean": 0.8196835443037974, + "median": 0.855, + "min": 0.342, + "stddev": 0.12503810130124515 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Business Ethics", + "max": 0.89, + "mean": 0.7354430379746836, + "median": 0.77, + "min": 0.24, + "stddev": 0.1177001565076888 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Clinical Knowledge", + "max": 0.928, + "mean": 0.7806329113924051, + "median": 0.8, + "min": 0.26, + "stddev": 0.10518545005348215 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "College Physics", + "max": 0.863, + "mean": 0.5205189873417722, + "median": 0.51, + "min": 0.196, + "stddev": 0.13341576241396605 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Computer Security", + "max": 0.89, + "mean": 0.7888607594936708, + "median": 0.8, + "min": 0.3, + "stddev": 0.07740978772295665 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Conceptual Physics", + "max": 0.949, + "mean": 0.7394050632911392, + "median": 0.774, + "min": 0.319, + "stddev": 0.1436847973853721 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Econometrics", + "max": 0.807, + "mean": 0.5924556962025317, + "median": 0.614, + "min": 0.307, + "stddev": 0.12405156056525753 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Electrical Engineering", + "max": 0.869, + "mean": 0.7012531645569621, + "median": 0.724, + "min": 0.29, + "stddev": 0.10967007262512768 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Elementary Mathematics", + "max": 0.942, + "mean": 0.6168481012658228, + "median": 0.622, + "min": 0.254, + "stddev": 0.17076712953141734 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Formal Logic", + "max": 0.786, + "mean": 0.5559240506329114, + "median": 0.571, + "min": 0.27, + "stddev": 0.11667484646986527 + }, + { + "benchmark": "helm_mmlu", + 
"count": 79, + "evaluation_name": "Global Facts", + "max": 0.8, + "mean": 0.49860759493670886, + "median": 0.5, + "min": 0.25, + "stddev": 0.11856767165669667 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "High School World History", + "max": 0.958, + "mean": 0.8590253164556962, + "median": 0.89, + "min": 0.253, + "stddev": 0.1104488482004626 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Human Sexuality", + "max": 0.939, + "mean": 0.7969367088607595, + "median": 0.84, + "min": 0.267, + "stddev": 0.14067149783040647 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "International Law", + "max": 0.959, + "mean": 0.8525189873417721, + "median": 0.884, + "min": 0.306, + "stddev": 0.09770414010589916 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Jurisprudence", + "max": 0.907, + "mean": 0.8231518987341773, + "median": 0.852, + "min": 0.25, + "stddev": 0.09722219971870344 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Logical Fallacies", + "max": 0.926, + "mean": 0.8139873417721519, + "median": 0.834, + "min": 0.264, + "stddev": 0.0972786763034739 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "MMLU All Subjects", + "max": 0.873, + "mean": 0.7308227848101266, + "median": 0.757, + "min": 0.295, + "stddev": 0.10005918242229046 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Machine Learning", + "max": 0.839, + "mean": 0.592126582278481, + "median": 0.616, + "min": 0.286, + "stddev": 0.12807703682255595 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Management", + "max": 0.942, + "mean": 0.8453037974683544, + "median": 0.864, + "min": 0.272, + "stddev": 0.09395052631917909 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Marketing", + "max": 0.962, + "mean": 0.9024556962025316, + "median": 0.923, + "min": 0.269, + "stddev": 
0.08556236254220637 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Mean win rate", + "max": 1.0, + "mean": 0.5000506329113924, + "median": 0.517, + "min": 0.014, + "stddev": 0.2741845671999428 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Medical Genetics", + "max": 0.98, + "mean": 0.8162025316455697, + "median": 0.84, + "min": 0.28, + "stddev": 0.11717074761250226 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Miscellaneous", + "max": 0.964, + "mean": 0.8688607594936709, + "median": 0.893, + "min": 0.292, + "stddev": 0.09859535722376811 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Moral Scenarios", + "max": 0.902, + "mean": 0.5793924050632911, + "median": 0.575, + "min": 0.231, + "stddev": 0.19478445797799818 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Nutrition", + "max": 0.928, + "mean": 0.7968987341772152, + "median": 0.82, + "min": 0.34, + "stddev": 0.1008295839442827 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Philosophy", + "max": 0.9, + "mean": 0.7844303797468355, + "median": 0.807, + "min": 0.325, + "stddev": 0.09312807331625374 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Prehistory", + "max": 0.951, + "mean": 0.824746835443038, + "median": 0.858, + "min": 0.318, + "stddev": 0.10757030716441658 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Professional Psychology", + "max": 0.922, + "mean": 0.7793291139240506, + "median": 0.812, + "min": 0.232, + "stddev": 0.1177310844427953 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Public Relations", + "max": 0.855, + "mean": 0.724873417721519, + "median": 0.736, + "min": 0.345, + "stddev": 0.0757594653625247 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Security Studies", + "max": 0.886, + "mean": 0.778126582278481, + "median": 0.804, + 
"min": 0.408, + "stddev": 0.09570378540441088 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Sociology", + "max": 0.96, + "mean": 0.8729493670886076, + "median": 0.9, + "min": 0.383, + "stddev": 0.08587676004752948 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Us Foreign Policy", + "max": 0.97, + "mean": 0.8918987341772152, + "median": 0.92, + "min": 0.26, + "stddev": 0.09360413026947771 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Virology", + "max": 0.602, + "mean": 0.5457215189873418, + "median": 0.56, + "min": 0.392, + "stddev": 0.047070851318166546 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "World Religions", + "max": 0.924, + "mean": 0.8426455696202532, + "median": 0.865, + "min": 0.234, + "stddev": 0.08472202480187987 + }, + { + "benchmark": "MMLU-Pro", + "count": 61, + "evaluation_name": "MMLU-Pro", + "max": 0.875, + "mean": 0.6609344262295082, + "median": 0.723, + "min": 0.169, + "stddev": 0.1866150109050233 + } + ], + "quality": { + "has_uncertainty": 1603, + "incompatible_score_type": 0, + "missing_bounds": 0, + "missing_metadata": 0, + "missing_score": 0, + "out_of_range": 100, + "total_result_rows": 40495, + "zero_width_bounds": 0 + }, + "schema_versions": [ + { + "count": 40495, + "value": "0.2.2" + } + ], + "score_summaries": [ + { + "benchmark": "GPQA", + "count": 4635, + "evaluation_name": "GPQA", + "max": 0.791, + "mean": 0.30281846817691477, + "median": 0.2953, + "min": 0.168, + "stddev": 0.04912650528590854 + }, + { + "benchmark": "IFEval", + "count": 4635, + "evaluation_name": "IFEval", + "max": 0.951, + "mean": 0.46067240560949296, + "median": 0.4545, + "min": 0.0, + "stddev": 0.20767533842318336 + }, + { + "benchmark": "BBH", + "count": 4574, + "evaluation_name": "BBH", + "max": 0.8269, + "mean": 0.4867208351552252, + "median": 0.5038, + "min": 0.2178, + "stddev": 0.11398463853942328 + }, + { + "benchmark": "MATH Level 5", + 
"count": 4574, + "evaluation_name": "MATH Level 5", + "max": 0.7145, + "mean": 0.1555723874070835, + "median": 0.108, + "min": 0.0, + "stddev": 0.14625658002062183 + }, + { + "benchmark": "MMLU-PRO", + "count": 4574, + "evaluation_name": "MMLU-PRO", + "max": 0.7303, + "mean": 0.32874433756012245, + "median": 0.34475, + "min": 0.1026, + "stddev": 0.12833971558059434 + }, + { + "benchmark": "MUSR", + "count": 4574, + "evaluation_name": "MUSR", + "max": 0.6024, + "mean": 0.40635732400524704, + "median": 0.4091, + "min": 0.2929, + "stddev": 0.04536121071938266 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 266, + "evaluation_name": "v2_Semi_Private", + "max": 77.16309638, + "mean": 1.3257351367669172, + "median": 0.09789999999999999, + "min": 0.0, + "stddev": 6.199066844791538 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 262, + "evaluation_name": "v1_Semi_Private", + "max": 44.25900135, + "mean": 0.8916221425572519, + "median": 0.30084999999999995, + "min": 0.0, + "stddev": 3.2441508923523688 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 250, + "evaluation_name": "v2_Public_Eval", + "max": 17.6, + "mean": 0.6595584, + "median": 0.08, + "min": 0.0, + "stddev": 2.152394923590425 + }, + { + "benchmark": "ARC Prize evaluations leaderboard JSON", + "count": 228, + "evaluation_name": "v1_Public_Eval", + "max": 7.7201, + "mean": 0.5021848684210526, + "median": 0.3319, + "min": 0.0012, + "stddev": 0.8240755952564907 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Factuality", + "max": 0.8716, + "mean": 0.6400781725888325, + "median": 0.6779, + "min": 0.0274, + "stddev": 0.14060436598989037 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Focus", + "max": 0.9838, + "mean": 0.6965137055837564, + "median": 0.7293, + "min": 0.0646, + "stddev": 0.1999740938960993 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + 
"evaluation_name": "Math", + "max": 0.898, + "mean": 0.6002578680203046, + "median": 0.6175, + "min": 0.0546, + "stddev": 0.11530869084864068 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Precise IF", + "max": 0.6625, + "mean": 0.3724553299492386, + "median": 0.375, + "min": 0.1313, + "stddev": 0.06683254610514013 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Safety", + "max": 0.9756, + "mean": 0.770956345177665, + "median": 0.8044, + "min": 0.0378, + "stddev": 0.16859961817216138 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Score", + "max": 0.8413, + "mean": 0.602605076142132, + "median": 0.6194, + "min": 0.0576, + "stddev": 0.13540270878209892 + }, + { + "benchmark": "RewardBench 2", + "count": 197, + "evaluation_name": "Ties", + "max": 0.9063, + "mean": 0.5353568527918782, + "median": 0.5529, + "min": -0.01, + "stddev": 0.21529016446306679 + }, + { + "benchmark": "RewardBench", + "count": 187, + "evaluation_name": "Chat Hard", + "max": 0.9145, + "mean": 0.6117941176470588, + "median": 0.6053, + "min": 0.2654, + "stddev": 0.1713479724227396 + }, + { + "benchmark": "RewardBench", + "count": 187, + "evaluation_name": "Chat", + "max": 0.9944, + "mean": 0.8923390374331551, + "median": 0.9413, + "min": 0.3547, + "stddev": 0.12437365150350695 + }, + { + "benchmark": "RewardBench", + "count": 187, + "evaluation_name": "Safety", + "max": 0.9514, + "mean": 0.75624064171123, + "median": 0.7946, + "min": 0.3743, + "stddev": 0.14897429003710377 + }, + { + "benchmark": "RewardBench", + "count": 187, + "evaluation_name": "Score", + "max": 0.9511, + "mean": 0.7524326203208556, + "median": 0.7455, + "min": 0.4727, + "stddev": 0.12766260032441618 + }, + { + "benchmark": "RewardBench", + "count": 172, + "evaluation_name": "Reasoning", + "max": 0.9912, + "mean": 0.779306976744186, + "median": 0.80125, + "min": 0.2821, + "stddev": 0.16510278548710738 + }, + { + "benchmark": "BFCL 
leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_accuracy", + "max": 93.12, + "mean": 67.21155963302752, + "median": 70.76, + "min": 0.0, + "stddev": 16.692855101327364 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_multiple_ast_accuracy", + "max": 94.02, + "mean": 66.15788990825688, + "median": 71.04, + "min": 0.0, + "stddev": 17.084967242914786 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_parallel_ast_accuracy", + "max": 93.75, + "mean": 64.27752293577981, + "median": 75.0, + "min": 0.0, + "stddev": 24.46019866655501 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_parallel_multiple_ast_accuracy", + "max": 95.83, + "mean": 57.03339449541284, + "median": 62.5, + "min": 0.0, + "stddev": 20.59801726435246 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.live.live_simple_ast_accuracy", + "max": 90.31, + "mean": 72.64082568807339, + "median": 76.36, + "min": 0.0, + "stddev": 16.25125032958663 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.memory.accuracy", + "max": 73.76, + "mean": 20.235045871559635, + "median": 15.7, + "min": 0.0, + "stddev": 16.99218603771948 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.memory.kv_accuracy", + "max": 70.97, + "mean": 13.904036697247706, + "median": 8.39, + "min": 0.0, + "stddev": 15.15138492137527 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.memory.recursive_summarization_accuracy", + "max": 83.23, + "mean": 28.204036697247705, + "median": 27.1, + "min": 0.0, + "stddev": 20.8463795648454 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.memory.vector_accuracy", + "max": 72.9, + "mean": 18.597155963302754, + "median": 11.61, + "min": 0.0, 
+ "stddev": 18.379301567138523 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.accuracy", + "max": 77.38, + "mean": 23.962385321100918, + "median": 16.5, + "min": 0.0, + "stddev": 21.479676048452156 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.base_accuracy", + "max": 82.5, + "mean": 29.009174311926607, + "median": 20.0, + "min": 0.0, + "stddev": 24.897845144318115 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.long_context_accuracy", + "max": 76.0, + "mean": 24.009174311926607, + "median": 17.5, + "min": 0.0, + "stddev": 21.38372755020874 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.miss_function_accuracy", + "max": 77.0, + "mean": 21.591743119266056, + "median": 14.0, + "min": 0.0, + "stddev": 21.713961750366153 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.multi_turn.miss_parameter_accuracy", + "max": 74.0, + "mean": 21.238532110091743, + "median": 15.0, + "min": 0.0, + "stddev": 19.445269386898502 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.non_live.ast_accuracy", + "max": 90.65, + "mean": 76.61733944954129, + "median": 83.0, + "min": 0.0, + "stddev": 18.657086363085554 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.non_live.multiple_ast_accuracy", + "max": 97.0, + "mean": 85.35779816513761, + "median": 92.0, + "min": 0.0, + "stddev": 18.274031836228097 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.non_live.parallel_ast_accuracy", + "max": 96.0, + "mean": 79.79816513761467, + "median": 88.0, + "min": 0.0, + "stddev": 22.733369915461672 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": 
"bfcl.non_live.parallel_multiple_ast_accuracy", + "max": 92.5, + "mean": 73.4770642201835, + "median": 82.5, + "min": 0.0, + "stddev": 24.427840192832814 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.non_live.simple_ast_accuracy", + "max": 80.67, + "mean": 67.83633027522936, + "median": 72.58, + "min": 0.0, + "stddev": 14.843039998882533 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.latency_mean_s", + "max": 169.87, + "mean": 15.127064220183486, + "median": 4.69, + "min": 0.68, + "stddev": 28.519051991371985 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.latency_p95_s", + "max": 568.59, + "mean": 53.85339449541284, + "median": 11.7, + "min": 0.96, + "stddev": 100.92943454619746 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.latency_std_s", + "max": 212.99, + "mean": 27.425045871559632, + "median": 10.04, + "min": 0.45, + "stddev": 39.86152829724822 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.overall_accuracy", + "max": 77.47, + "mean": 38.09394495412844, + "median": 35.52, + "min": 7.17, + "stddev": 15.683598888904708 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.rank", + "max": 109.0, + "mean": 55.0, + "median": 55.0, + "min": 1.0, + "stddev": 31.609597698589376 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.overall.total_cost_usd", + "max": 355.17, + "mean": 47.11669724770642, + "median": 18.25, + "min": 0.46, + "stddev": 72.06972033379084 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.relevance.irrelevance_detection_accuracy", + "max": 100.0, + "mean": 75.61073394495413, + "median": 80.79, + "min": 6.28, + "stddev": 16.896574532662488 + }, + { + "benchmark": "BFCL 
leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.relevance.relevance_detection_accuracy", + "max": 100.0, + "mean": 76.37614678899082, + "median": 81.25, + "min": 0.0, + "stddev": 19.86204224273847 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.web_search.accuracy", + "max": 84.5, + "mean": 24.573394495412845, + "median": 10.5, + "min": 0.0, + "stddev": 28.751797503234584 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.web_search.base_accuracy", + "max": 87.0, + "mean": 26.46788990825688, + "median": 13.0, + "min": 0.0, + "stddev": 29.552705211555523 + }, + { + "benchmark": "BFCL leaderboard CSV", + "count": 109, + "evaluation_name": "bfcl.web_search.no_snippet_accuracy", + "max": 85.0, + "mean": 22.678899082568808, + "median": 9.0, + "min": 0.0, + "stddev": 28.410639873751833 + }, + { + "benchmark": "RewardBench", + "count": 105, + "evaluation_name": "Prior Sets (0.5 weight)", + "max": 0.782, + "mean": 0.5625428571428571, + "median": 0.5757, + "min": 0.0, + "stddev": 0.17788750218625798 + }, + { + "benchmark": "GSM8K", + "count": 91, + "evaluation_name": "GSM8K", + "max": 0.956, + "mean": 0.6556373626373626, + "median": 0.762, + "min": -1.0, + "stddev": 0.30260192099278316 + }, + { + "benchmark": "LegalBench", + "count": 91, + "evaluation_name": "LegalBench", + "max": 0.757, + "mean": 0.5902087912087912, + "median": 0.629, + "min": 0.331, + "stddev": 0.11619442676283923 + }, + { + "benchmark": "MATH", + "count": 91, + "evaluation_name": "MATH", + "max": 0.92, + "mean": 0.5574065934065934, + "median": 0.656, + "min": 0.026, + "stddev": 0.2685588691111619 + }, + { + "benchmark": "MMLU", + "count": 91, + "evaluation_name": "MMLU", + "max": 0.809, + "mean": 0.6220989010989011, + "median": 0.643, + "min": 0.243, + "stddev": 0.12023218786489331 + }, + { + "benchmark": "MedQA", + "count": 91, + "evaluation_name": "MedQA", + "max": 0.863, + "mean": 0.6103296703296703, + 
"median": 0.64, + "min": 0.229, + "stddev": 0.15792234765120447 + }, + { + "benchmark": "NarrativeQA", + "count": 91, + "evaluation_name": "NarrativeQA", + "max": 0.804, + "mean": 0.6938461538461539, + "median": 0.742, + "min": 0.111, + "stddev": 0.1228501275789075 + }, + { + "benchmark": "NaturalQuestions (closed-book)", + "count": 91, + "evaluation_name": "NaturalQuestions (closed-book)", + "max": 0.502, + "mean": 0.3627912087912088, + "median": 0.378, + "min": 0.028, + "stddev": 0.08850543190907255 + }, + { + "benchmark": "OpenbookQA", + "count": 91, + "evaluation_name": "OpenbookQA", + "max": 0.972, + "mean": 0.8312527472527472, + "median": 0.882, + "min": 0.222, + "stddev": 0.16911788087383792 + }, + { + "benchmark": "WMT 2014", + "count": 91, + "evaluation_name": "WMT 2014", + "max": 0.262, + "mean": 0.18178021978021977, + "median": 0.191, + "min": 0.023, + "stddev": 0.04641450975187302 + }, + { + "benchmark": "helm_lite", + "count": 91, + "evaluation_name": "Mean win rate", + "max": 0.938, + "mean": 0.499967032967033, + "median": 0.488, + "min": 0.041, + "stddev": 0.24004497034928224 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Abstract Algebra", + "max": 0.84, + "mean": 0.4692405063291139, + "median": 0.44, + "min": 0.21, + "stddev": 0.1566784405169303 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Anatomy", + "max": 0.911, + "mean": 0.7049620253164557, + "median": 0.719, + "min": 0.222, + "stddev": 0.12203524533321435 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Astronomy", + "max": 0.974, + "mean": 0.8196835443037974, + "median": 0.855, + "min": 0.342, + "stddev": 0.12503810130124515 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Business Ethics", + "max": 0.89, + "mean": 0.7354430379746836, + "median": 0.77, + "min": 0.24, + "stddev": 0.1177001565076888 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Clinical 
Knowledge", + "max": 0.928, + "mean": 0.7806329113924051, + "median": 0.8, + "min": 0.26, + "stddev": 0.10518545005348215 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "College Physics", + "max": 0.863, + "mean": 0.5205189873417722, + "median": 0.51, + "min": 0.196, + "stddev": 0.13341576241396605 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Computer Security", + "max": 0.89, + "mean": 0.7888607594936708, + "median": 0.8, + "min": 0.3, + "stddev": 0.07740978772295665 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Conceptual Physics", + "max": 0.949, + "mean": 0.7394050632911392, + "median": 0.774, + "min": 0.319, + "stddev": 0.1436847973853721 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Econometrics", + "max": 0.807, + "mean": 0.5924556962025317, + "median": 0.614, + "min": 0.307, + "stddev": 0.12405156056525753 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Electrical Engineering", + "max": 0.869, + "mean": 0.7012531645569621, + "median": 0.724, + "min": 0.29, + "stddev": 0.10967007262512768 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Elementary Mathematics", + "max": 0.942, + "mean": 0.6168481012658228, + "median": 0.622, + "min": 0.254, + "stddev": 0.17076712953141734 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Formal Logic", + "max": 0.786, + "mean": 0.5559240506329114, + "median": 0.571, + "min": 0.27, + "stddev": 0.11667484646986527 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Global Facts", + "max": 0.8, + "mean": 0.49860759493670886, + "median": 0.5, + "min": 0.25, + "stddev": 0.11856767165669667 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "High School World History", + "max": 0.958, + "mean": 0.8590253164556962, + "median": 0.89, + "min": 0.253, + "stddev": 0.1104488482004626 + }, + { + "benchmark": 
"helm_mmlu", + "count": 79, + "evaluation_name": "Human Sexuality", + "max": 0.939, + "mean": 0.7969367088607595, + "median": 0.84, + "min": 0.267, + "stddev": 0.14067149783040647 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "International Law", + "max": 0.959, + "mean": 0.8525189873417721, + "median": 0.884, + "min": 0.306, + "stddev": 0.09770414010589916 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Jurisprudence", + "max": 0.907, + "mean": 0.8231518987341773, + "median": 0.852, + "min": 0.25, + "stddev": 0.09722219971870344 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Logical Fallacies", + "max": 0.926, + "mean": 0.8139873417721519, + "median": 0.834, + "min": 0.264, + "stddev": 0.0972786763034739 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "MMLU All Subjects", + "max": 0.873, + "mean": 0.7308227848101266, + "median": 0.757, + "min": 0.295, + "stddev": 0.10005918242229046 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Machine Learning", + "max": 0.839, + "mean": 0.592126582278481, + "median": 0.616, + "min": 0.286, + "stddev": 0.12807703682255595 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Management", + "max": 0.942, + "mean": 0.8453037974683544, + "median": 0.864, + "min": 0.272, + "stddev": 0.09395052631917909 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Marketing", + "max": 0.962, + "mean": 0.9024556962025316, + "median": 0.923, + "min": 0.269, + "stddev": 0.08556236254220637 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Mean win rate", + "max": 1.0, + "mean": 0.5000506329113924, + "median": 0.517, + "min": 0.014, + "stddev": 0.2741845671999428 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Medical Genetics", + "max": 0.98, + "mean": 0.8162025316455697, + "median": 0.84, + "min": 0.28, + "stddev": 
0.11717074761250226 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Miscellaneous", + "max": 0.964, + "mean": 0.8688607594936709, + "median": 0.893, + "min": 0.292, + "stddev": 0.09859535722376811 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Moral Scenarios", + "max": 0.902, + "mean": 0.5793924050632911, + "median": 0.575, + "min": 0.231, + "stddev": 0.19478445797799818 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Nutrition", + "max": 0.928, + "mean": 0.7968987341772152, + "median": 0.82, + "min": 0.34, + "stddev": 0.1008295839442827 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Philosophy", + "max": 0.9, + "mean": 0.7844303797468355, + "median": 0.807, + "min": 0.325, + "stddev": 0.09312807331625374 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Prehistory", + "max": 0.951, + "mean": 0.824746835443038, + "median": 0.858, + "min": 0.318, + "stddev": 0.10757030716441658 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Professional Psychology", + "max": 0.922, + "mean": 0.7793291139240506, + "median": 0.812, + "min": 0.232, + "stddev": 0.1177310844427953 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Public Relations", + "max": 0.855, + "mean": 0.724873417721519, + "median": 0.736, + "min": 0.345, + "stddev": 0.0757594653625247 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Security Studies", + "max": 0.886, + "mean": 0.778126582278481, + "median": 0.804, + "min": 0.408, + "stddev": 0.09570378540441088 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Sociology", + "max": 0.96, + "mean": 0.8729493670886076, + "median": 0.9, + "min": 0.383, + "stddev": 0.08587676004752948 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Us Foreign Policy", + "max": 0.97, + "mean": 0.8918987341772152, + "median": 0.92, + 
"min": 0.26, + "stddev": 0.09360413026947771 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "Virology", + "max": 0.602, + "mean": 0.5457215189873418, + "median": 0.56, + "min": 0.392, + "stddev": 0.047070851318166546 + }, + { + "benchmark": "helm_mmlu", + "count": 79, + "evaluation_name": "World Religions", + "max": 0.924, + "mean": 0.8426455696202532, + "median": 0.865, + "min": 0.234, + "stddev": 0.08472202480187987 + }, + { + "benchmark": "MMLU-Pro", + "count": 61, + "evaluation_name": "MMLU-Pro", + "max": 0.875, + "mean": 0.6609344262295082, + "median": 0.723, + "min": 0.169, + "stddev": 0.1866150109050233 + } + ] + }, + "observational": { + "exclusions": { + "incompatible_score_type": 0, + "missing_bounds": 0, + "missing_score": 0, + "out_of_range": 100, + "zero_width_bounds": 0 + }, + "valid_normalized_rows": 40395 + } +} diff --git a/audit/dataset_statistics_plots/coverage_counts.pdf b/audit/dataset_statistics_plots/coverage_counts.pdf new file mode 100644 index 0000000000000000000000000000000000000000..6ac4008ecd57c03cc296ec0e43acf30bc6503d4a GIT binary patch literal 16555 zcmb_^1yohf6R-#pzb^Qu!|P1j_xjIzI)Fn%;$F&;?u}xv0oJBwLg9D6^T7+0lr|I<&fGM_I}5DZ`ZHr#%A@P z*L>2wA(QJ2#u`iw%j6svxAUisRAXkANls3@-7xV-3D-ao$+%_fw$sz6TwNtc>J^gk0@j5t_o=jc-epWN*Dde7B*x%ewrj zt|=qR;RS}B1?{+G=|C3K@w!%{e?N|?w#4%*!mTe_JjiJB`PJ3jI}Pr>WmVPQXZ<7O z9qZ;FtgPSt_DbmSB_92U$~4q>3NJstq@Ut8F%!|#A+WmH^X0YgXHS6(4vM+Yi!8^_ zv*=>__>y;29H1-H5g!>=-!@e;Eo3Xa)vBU8QQu5C&+VEwHqkNWBa%LGJkg?W$4h-CV7)7^HfV7&k%cK~<_K@T zW&N}HXP>84V{bpbbYy{aALV^Eraf>Yos$y%1qRp)kK)MiS_BOh}u~KSZPY zwS7$^SLK`j*_#~Iq~3_>Pq*A6h~*q2DD#RM8jZuXFETa{H_)0%f7XoCaC&%8v8Av1 z{34SZM%!0=VED48y_Na!JUA4hiPW^OkquT?(%~8|eM7ArCva~%bN4K@4sG*YZhnH$ ztRpH-Pi)npRk%z0*CRX{8>_F}?mi)zb#A@eO++&X`m_i$+MZZTvDTPWbwMpGpd@Z5 zd-h3WV_V9noZiB|eVd-MoX1Y;-yN21l7lJF-Bdr-J=^U%ph~GNw-hnpkwf(8m54+C z#Ix6D!5M?1wI#WBMWy}u3mkfj-Q%|W6ZSu_M!L^K7u~D!(`^*xV4Wjs7y3uO@r(Jd z*bk!j+G(#AUo*zIgbX{YzI0j^VP6iiBzvs)Sr6t}FdQ(~kvoaA~xo>i3`pb1$ 
zge)C(xYyFnEO{y^Rm$1Nu?Z!|i6#Q>s3#AoKi{Gs5s~?tp^H^r*9>{3a{C3*H{+m(j3fqv_=53C;{>_V9enU!p-ne`u2qP5yxa1C1I2!H@Xi z2M#PB2-Jrf{)Bag*;j<1cXT9?Pde7uQqqys3H=wIsP7rmGN2E%6#xE_NSJQ8+{UZXs{=SUBn={;PS z@JY-r^VnL5K@;C%@}?Q5E{ll8i5dE_?K$ozk+$TC^?5IRXhiM~N_=4vnwlp}=zU+TPxAvWQ3;EMooDl^^;lq!9XlON-^AVl*CtpmK(C@0EfH+j6#s zElJP*%Th#8CSj~uPSQ?wFHZ}=1WuUW_n@Zl|^Wqa%PTlhj5&Z+s4Qhu1zy&sI* z0Sk2>-Q*scvHV0Bk`|2r_B0b?<_?}=Z0cvNC(TY>;Wkeq9Yw6HW_iGT?vo-833m7! zR&F89EOK*rYdW=}y~ET8cb?k{C+xQQ$?_>I-c)S%Zl@|^m@ zB)ZuB)6C?^&A)cba)c}ae&J(+Mv8h_yPv8{hcjO_oz3w}VgZSBeB}A{r1uP&%&rh| z&&jk!M4cJi|DfwXzlO7P$!Zk~zp=yyNHpL7<~Lfs*%?ogU|Wp>5^!h9v+v^51J?q%PG2Do1C9~xesnd#zB`#?DC5>8l?9vk6GdZm1lbyh7! zLQu0>fl+LL{cUk-RF=G0y*b$s;fStK5hl_%qJ&WAX2&8sqU>w42CY=NfN(u>`6&I}G} zoTE}jF*Ca7w#%g2HEH&6!yTs;W|@Nv?>!%HF4(vg7ed&y!bDC|XdCD9HeWNYc<-9O zJF$7s@$g|ka)Z#ilEu@LraDEcGzL;S;j5>qWag^M7V7rQBU{(5Z&+l%4TcXx-3 zn-p&Z;f0zUlicDIiz5;p@Yugn5LC&r3a7=nrQ0f73r#y_^Gf?XLNs-r-J&iX9k(KW zkM3ho^NZCJ+enVcP>J;y)WSOM*K$?ZSqgHTwrCxLOU3v{=rn_zi?G7 zuF@G7K_s)9dz9t+X8hWlNe!nZi8Wr;2j}a~p1vzz&?fWnZ5s@@Czsx; z`DrCaUl`GhEzJ5*leZ8%$oo)G^3s`1d%>G&GO8?0^m64+_cEgsLxOtWZ6Q2zc`|rj z=nZQ!MVC9P8b`yX7*1ctRScuIAg<79XoX*GPA^ieW6TZ|k87g7r#t&*+ivPQ)5TVW zF4L~G8#K1~F%gSoP5gEhUM#FPn8tZ`s)7lkYim6WB&+#9`A*F1?LP6LII|_`-66@o zu{iSJx`i+&nF2LsHA~WJe7VbquJTu}jIRj0oP|QX1QoxEh<(;wWp`^)=f*FI$Yph+ z6DRxBy+Y98)_DB(r@b$6Ya0xdVi*xD90t!xe{dM_FVzcMZqI$n{xap2K7BQTQ`TKbF7qU7#`8<``wwAvX65-85*09-j@;C^B_^nwT31?<20WTYiqmZg zGDx27Fns%Z8gXv)0*#e?T5d@0fUlxbmsSbu)OiNZks{2X0TQXX$T_7#s#kt_!C}4q zt!%{X9Svc6pY^q?4NaS8jb4sBOW(%A3-C<%2QT>e{$UTJ;}zrS5d<LA6 z8Ou(dT{ZiCuW7*47txI=tI1Wpfs&klinj}4*JRJ+u{mGl7|bn&Q&UvPQ+3_s{z!V8=!_prT=Ce+$44K8p9+%aQJkLC zxqsJckdDkB#lRRDkox({yW-4V&FJ!GCBAnNG7fEDR|#I5XW`FO_>%kE#>0lnGz>t9bmAA)~SE>(6cO z&ZtE&bXqO;RT|RhLoKW&q$2J$GAib;lPKbxoh+aVt;C-@x z?mCqXv2`D>+sa7t7hqR$gGlN+nyAan_)w_ujc!<>`>TdIgT-t{$zPnm9vdn|+lx15 zzjc3$cyVn1NaOrt0W2B-UIYB00Z2Z<|82$58cGiM2!f#*ikkBeuPZ9Fqk1(@FYcD1 zt2_`$!deE)wlKmgzPB0kOq?frN4mIC7t(Jkn!?t)D2~Na*5UMR!3$bv+jVi5bt{y 
z19sLLSM}v&ObtefIdfrtdzGQ-VX3}Yt!zV4=$5d4~!)NTZwB#vBIwcZPkKV49?2@*am!jR(DWqkM%`81IwJug5$VQ*Jl*eSC1Ei^3+H z&gylse{vI)vBh#zH)na{`hd`M-%r zt0Tb3bAA;f?sW0h`S^_mjC;ev{t!w=e?Yp?ScJ&}VsK5Xq3msUq7l7#)T)|UuqdvV zj3E#a>pg9J50A2}prZUlirXYy=K1E*xTuPjz+>Oue)kSbzWVXoc)|@Tuc{TLA2(`M zxUL3mx@fFyoAVfP*4~uUFm^dXGZ%a+Tv8=a?!1b1HT291cWvH1cP30heuDObmG6mi zJ1UQp9hwDFEXaXi`Zo5`k^UUL)4`yz+0 zL{U9tKW0Y1gBy8;<|U0x@=+SsOBk=Xu0`a-#-XGLBYnQ`hSN01tNk8SQP1z5bg(fp z$HGo*fjnC1U)b4fkB1=Iu7vQDo_`$U&8jn1IJZxGVboZZ)I=%kO)Fsl%r~0}9ml)b zcj{5+ixrWJ#4ULnq0jG`hO*oacf##J?Feql@`Wwx-!&o!8pw+TN$)*Y?w_njh)43hsp4f6FqvwnU5~_%uK0HhL|7{H(Auvg`9p z)9jafM;ZqWGqCUyOOVco_y=B!UDS5~RCD{Trc=O$@$@vS+#Cs?{jtGTyN7Lcct=Ya zpPMT_@y))Fqc|VuSY52|>*!@f6AGi$PA*_!V`2K}&dB_U;F@z|*73Z#h4_(+9x^-+ z!{_vPqlANn?M&0njFNp~nyyuMwm}?qxK@lxSxO&R5BEi#@^mJhqo#=4eFc%YfeJmn zyJHd<`uW7SGZWdK5pSRhZfNMEBUCe`5xujRg44HY_26M8Nhe6_UwgldqlS0-K8j^z-W5hU!O7A;c26+z7K~E7qcWd7V&6w( z!;8a)3}zgcZen!VR_`pa^rpSz^)_?uO|j+VERHbBZjz%Tb2Ed21a&dZJb^l5I*}~H zlp*E*GiJyK(?Ncj^TfHU_Z2E6{8M`#M*b~p6qb_&lJ6g6EvupEP!A`n+@{<)k7HZ1 zo568ROnKj*Ck1bo@syd;xdtsXYxif1;G)LVC`Ve+!s^c=++-e*yCy;=46x$1i zy?2V{z9IjBEiLvVx=srE#Tac$-#;v>R?~3n?9@uGccMd(ha~-YSU+7p>1Cmpn>vi` znQy8%8|JU!`*k&TpS}Dk^Tr8&mp@(=_rcuvEgCVUw9m^!4aWrs0bH%pyV>!#ID&Z zV$1m8s8uA2X(FdpprL4Gr}F*gnPrL2QeA5-9K>?u5&DM{txA+6$tx&u-%}OHR<2&W z_ZTDEzi-4v{owTN1>UE+)AgMQ?h%97Z#dcID-CiuSL<+0k>^X~s(SG0_ze9^F&Pz= z9N%JQ=iUX1N{Bq`SGP6PF1kQlEm2X+M2v+>EQcEu`X7P~v3QZ_vk0Oa+d?;a;n%a1 z7gu*AW0_v~S7ue;xe#O@^+rvzm!Et13d4|#eJ=I-2{mG$p2e{n>^`{*u-?+EwXsjB zUw2j=9Tq>?m-W>m`cWw6idVzT=UtxH+cn_^66pvbOj*Ok2+;|#_IGX5%pO^C+h$r8w^6pFr?n%>u!Qyw&>()0q z(Ttk`O5PFQ#Oy!0XLq&>GMUctQ{(kpIonNzW+}bUUaQgiN=_g+^hQLlRaMPki*c(# z*N`0=@x66pj(2a)_kmv2CT*s8Wi&zr&TCIgt3SV+et~_A8rv<)PLMdi3Xr! 
zH{5c2LsIvRn$#b5(UH;W<0OQtC|APrGZR%Dbv5_-ik)}}_nSmjpHEU$Df($rCwxBR z%d=W_iwvUu#Y1qsoaI{pnQTVjjl#2w6k6GYCSDHjP^yfrC28MI4teR&3`m|PadLaN zWO={my}B3PTwLw@?p^lLgb1zJH*~7YlP%}MD4w3%Ry}$gL%jd4^3K-1zABt~2BxH!pr3YExO;#Y72 z-2IfmQ`hlA7Bsc}vbOzCU^v%qoGh?!t;egk#(jtHrl7l>9v`AxKN!R5qkc2KnZSnr zPO*r#icykt#c;As7G)`I;>*09@scYaeUF6n&|bnqA(ki@{SOrWU>FX-Fu5NLGrqVY;dQs5@h`%f6DSZRWHJ241CrEGY z+e&iIeKo#8r9fI>m{ZJgv@B9O?@oFbeVY@v~A~rjW(`YV@pGFI4v+k_G zmx@iDB^R~W?uvcr$ua>YX$RR0Q}Rt8rY+S=*kqfIYp-frykc0lZ=%y`*p6D?Mi$sT zSGzDuQoowPPOU`NsA?aqrZ)N3LRE_7T>E7T=htZrx>BaO)Gm$pd{#<7Y%Liq+&+=xcfO%b(9EsCtNn(l(Y-wgccKObe)k>SC@0W5JSO7I`fcRcZm zMRbQ(Bp(P67lhSB==@E_{O6DN=E^|dn7`rHC1ZW5#nNj&e`{>ZJp5AgOZnY!2jC}A zxv+{!qs)j+U6)edv-(P(!gUE5-}YSJmZ7cjOfIJ%I9}Ei7g717pj1$`gwTMJh5D^A zHzCEDaB($i3o~X^L-a{uVUa+KMZ%SuSsqTRwE1ub=Hnu@pXdahX01!Q3YMO{cD5Rk zbWA5e{IT3+t_QDLIr`4_OPj_!KTMOu;I_0-4coutDO@9YXLC|2Pdo|rVTA1d(+7rc zqwEHQbFBqQGeknqX?L$bWxH+tV3}$-DQ-l92tWP7fRLf=fKow0mGorWgp8j@_X&Yb znlbv)-T}V6<2TFIY~$-X*k1~K_PUnQ8KnForZm30Ft(7=q_qPaYR8V0#IWhlea@~=t9j=k|D6g?2$wjUOA-j6*(OV%` z&0X~DFQpn>m@~i(fo8c{uXg)_IIaG@A7iLzTQK zeX!zyo%D%#e>wM~c za|Df*rz^ag@4dO7*>3At>9a$0vPz}F*Od|05~e)OiyE`A)iu|Vu;j@%CySpVcS*fm zJ@1r;OOVtrNKKy_YcYF@+G5-qeoM`fh)P);@3s!BVDm@LN2G)5PvsxndokPc;mpas z_ggzSwIP=uVc{B<(DI)yIMAv>B)F19p8yl^VQ6y0j=~=C&7XFe9Ya)iJyZc62n>!RM6^)qRUA9T9Mcvvid5^;(4SPLflOvax*)@fsNk z-vB4gwBJ$hJ=wLY8a1Tcqx_z^3DYme_sP2p^+NsF%!nvlxGFwG6j-MXm3tVANgkWG zAq|%-nAFN$?=)QQBBy_pFDhoB_>_B5x&D;yW2TJBC~LOKCQ~|nyv&sjmyd zzY>4^8l4F_1Csl_rjJ0>E3pM@zY>esAYIc*8rJB`=R3x)pHJqyB8|Kgc07n#vmkM; zpEN1^+&0DRm+t#T+~uwTeQI~L#S>VrrQ0paB5E)ZU+!4WNy`ouH=5PZrCXMU_OMgD zQ}77yZ50###4w+=;Xmu+OWZXJU$%^ICvD!XUpx0CG@$yvcLAq2F0m7F*uXI!+R0|_ zw@F{{jEy|=rHHo?mMw8vi}rp;{ViKD8Mm5WT*-`jufAp}rR}I6$>DB-7`+w3HTfuP zjQzBk-!)u+4}*_bxQHd*MDYEKn~kYFg$pN;_ED=PfWErx76103)4C(`0tJU%bDx-V zdF7L@Qy&rM3_oHPt0Y)3N*>QcB~IV}C_FN2N*imOX$i6e8=!VzeM0kP;_CIB#;GvVYnA+{$@_pMbI1kg<+W)rloGq z;(w$KVW(43J^Uqkj}1;d2wVggsW$VB%NUW;u#Ergt~{xZQ``dsT^5% 
zx26w!+b<|Q21cpA>fBT)OodufKTBYR$XmqIShzj8ooUEYr2j}+NV}?f;oQIU+H0xt_o8yLBbu?J=RQSxw%tc84eRNiG+2xp5XY93W3ei_9Ys*p=gAcS)!h?iSt%#52pLQCkNBsbkz(Ew16EdgIU|sH*0y;yi`_+G<7JX+ zY=lZ&BYCO*{WXPj9^a9(!nl{QV8#|z3;e^T0E{LH zE+bLpD#YLToM}?*hO+ir=ia_vNcN-Cb_0z3ou@hN#&6%Ft-2)o)QQ{kgXTq4)rsv_ zUFS>1bbab{TT|*QCOeIDb+Tz(A7U<}WTe&HiCd#(Wu`DUILQ38@U8b)UrWjKsXRJr z-qw!~^GF%^+Hs9*YA1Fy!Vcp_; zZMiv6;70fM&cy_+ThA_En|~dV!8I1MMW)Z`&k)6u+EONq4$C!2?l9xxAYV;n^2+DG z+cXzEsZk`Irgr&pQ)XFiQ1|0?9sM%q#fDqZj%dy24Qvyw?pq>fRD&1aBYovoCQjcm zg}UAiWvM#*^<@61g5#chS{n!)vv zUeDBm$#ZvzJJOhv&pdl~%m3<**Pf`61fg35mdg;}9RJrZTuA zXye^QuPYi~W0(AS9^N}o$JrH=#Y{%){Vm*{{l#W{@3`}hYQbmjA>5~$0cQ-h6l<++ zju&Y#ekwm+7k0Oo!37e3O#AG3v#}hEZ8qRVZMcqDu21C1Suun08KPiL6uEQWNQ7X$ z3-8#-sU3PF#Tg|oybs;wtS{TfX3eUI-S?}$yYF+YuVZTucTvUU z!r$8gjV-Je`iHrVQwzYUw)j`bX*CK>$Cwx0@-J` zxBP*`;B_4%VS<;Qdvopb{gcd!!D34U-I@3UvAS?seBXE#>2vE+RDr}3=X_8;)>2g@ z?;N)HU5TA{le3!>6id?(-tl@#JaiuG5bbjL3$@Q3?)tdwuxDH0VyIH6{fcZXua366}1KI+SjML35bO1IJ9hIFYi1Jyq zAS*zqT1t`z!?d%wiyeIDU(>2JEuR~2BYP>sW6po2!<3iyf)>8n%CKhVsI^DxxACHO za-*2}>d)44H?Q9JC?X@1act3H?>;UPF2tq%rmZEj0#eVPb%u+BqIAldq&-zI{Be=7 zERi#73-5y(zkaeiUu)uIG#aRElcluZunNHiJ(0|&@hxSF|O zoZK8;pa@WK7ojjscT=~63=wbwz+$MH*aLYMzvKNrSn7LX1rI+z8p?x0qM=AQ91i44 z!1=*c3J|@DiJJ@NAo+q94(vJ5)Wd()3?VR(Fm)g&1jw*B$Pj_RWF3Hn2{2>izf6@M zAP7tbV`*stY#9{Nxe5Yo6hnV12&C%|t}W{&puCQ$I(5^oC^M<~qE0ifFLDo}H? 
zcCi5JmOy3*@Gr*00*VII(bdAk!T|!az*t$kL17LUpgTWs?*^oy02wGUP?#(fyyjJb z3IelSgo2ln+E5_?VgeNcoRb+8W)6i}0E-1U1SHW|Ltq%-1!%X0!t9_hdjKUs0~F>2 zg*iiEz{IWqAyAk*6y^biT?M|S22w&`moesU*1+1qP5*0l$$=<-eD(34oPQD40e7)U z2H?A$iIpqBoPUv#q^lX2JtG823@R^86%)^c6#S?*)LN1^K@B1z_AF z;0!n!SOk#BgL3c=WH<=`gn=+%Fa!GfI3Ly0ZtGv zP*5m)&j z6tn=KP2iaSU=S2w7(v{E03&`Ba1}p=1PbbxazH)E#b1Fy{Xl{s(Lc(8HURw&GzNhG z`aIw?2m{-HDTg>AAOSxX{O1$g3I`Gdi2k+h$GktDhiyN#;n%((W3bNgL&(7Mzf_W6 z&-PMK1b8UE2%2EvD11?)wi-)jfH6i8kG!T%X_u5;*gmCfZ#%ZQ25=!eoupiDPhdP z*aFxi;F1qALfrwUctGS~o)q9Z{u?X}WOahk6FU&qa5j!#IOFMJVF>~36F&qntA~G}{Ae_wnU>I>FfiT%|3DoM zVF)w=)Y#u)d_tfS{s|KRe1G#9Ed*%KA7lAJpZq5b^_M*0{DPoW|9xHoI1r-z2?Jx%KVhi9 z^ban;C-fK10)l_Z7cPkSTb6=o{y%jCE-3hyKEr_^`7c_8`2N;sAuK!<68cLw07^h( z{u>Vvz*hK6|A58)4HE+Ha{d^L-~+wT@Awb`f5{R7B)?1efw`Z*;{(F~ zU-<0iVuG==a5?x-1vD|@4Pb^^ zmgn$R9#HWgpGw{yD!LvP9(XAF2SD4x!vjxnhGM`^2u#<`!rI>17K+`jCAe7W;ys`y z09i$4fEB#A2Nb5}1h62#{Z`n1t3l0vV+ZlK03hEW_XIq^-4FI*I(T;%Pl6R756C}2 zUjuJ#Zz1pE4N!yw|5zMG42eS^F;FxXE{2AqF(~lE3{VFkBY?;qiu=K-oU^kF@Pz<- z{jL|V`llk*@y@m$c2LANabjdb@*^k?F#*gY6F#zbuAd{h7+G1`Su&RzO?0-XEVKH?Z{=^*CZ_R?R7jo zT*jN-urfw^>c9$@GnsviwZFJX_xtEgeb`caX#vfZ#kOz$o!+}To(NcEDxe?TNf1wYnUB9>$b4u_gu8x#UZBdY#Ah>%d8vY9 zy+d2C!B@2%XZs);cX@Y1(>euTT8oJb%Trd(RUQeW7v72E{kv@O7syg+f;n8)AKO*H zi|dg8P4 zHFwi2V_nj1?h_MLBZmirkpx|pi(t>%Q?m=k1+$6AtK5)=)5zoMU3?jq(QX67_ng>n zM48duVwBjImHG*B{u7s{*EAAA#lmI(!@OV3>@ zH5~9aEG!VyTp3bOv$lL<&pdmT?qPYA;cSW`er! 
z^_J^!z^=BG(a8#X>6u6=7k@U_m?Gqnu@dq1CQ}$&;OxN+M^2rWx7{4=T^7v>PQRX_ zzkidHi(KcmjTtrjBl^?JFF#pUJJd1FH3&LVMRBaW{Pgj#Sb_6O#VzX>xQBkc^9yE= z9Of)l?V0_PHRjW;U8Y5Qqnc26SlVEcgV0UZZG47c@R(USzNs;WgGxEs z)ji9wukRw~0U0ATwI(Usk^9ci^+(=uQ^m$soo`o~WsnGaL_Wr69BcLpe^l)1fyIMn z4xOr~%jV=Ie90&Bhl}WMMHgU8uO~C>Fg2)KM4G?x5+VypW4?Ln$bJeT#)qX>f>}GL zahaj2x)jyQT?a95)frstmdIbzt_Y>OSu*w@YXvH>ZQ2 zNgt^7gqO?mZ}Sa%$~qNNR!X`9GP6%$1hAD>>u z>kjbsd*0xZy1-u<(6HKH>%O;*Q}X{*894d@V|i=JnG zJ@A{p(uT+9QN$dp`&!r5g*)bC4vdwagIrsMT@&X5-ltAvB+lLRd?e|#NdI~v3h%yn zva-zQsqxv#QiCZQH>`K%gU#J;P8#`Orm-Cxw{s3+AO(XfLIRHanTQZ0FbFh(n*D-- z*~gA3EP((b?|(#Kjh14@AUGA}oAe{>yUFAqUL7;yTPu}s)jhaL9_ccwI?fwfocU<5 zv1sM`wJ_?2c^+oEVux#l*M+**N;WS9deT~V?2YUXV0JF!dM(V+O+td-)28k800aMT z#!$#oZMb$3dJj`5D=TC))CO`S)YSZfSU4H=h-+;QRD>K#d-~e*>$TVS?8=1E)RUl0 zA~8Y2(f>_aN*fpV98NQ}1d)~-+I?b9P2kbxuUlSnz5D=PZqRjNFt-OVUw%3wFzOwj~?L&4`JIWo3NF%#sE_}F%{7j_c$rXQ# zvqwku>OG2|9|Utc(+p1*H<_beU?W_{z!09h5R`0(n)D~1CH0#gfmFTVX)*D<0t;)JLz8@{j z2o!sxZuz%TkYM(QxN$@qi5#Vx%WKIJ+qvm~Z@OSX=SEkmWK zMKPCtYf-f@F2l2B}#z6y@Z$(A9Q1jb3WA+Ovm<$A4e!em7p=KR- zUa<7F2KG#&@mFh(BSAbm&8qE|?WvazI#6DXnqg|dI#&4b3tZwE6kD$frHZMo^)i*K z#!mVVO&M?8^~l~ zsv%AGNq3K~oS4r#7#BTimSLQo7dP7BGlU**E}My*?R61Z)9B~lV}7YOE1B74IU2(| z{-$qY_}q})Wig7IIikWRChBKq#Sf;M9zJvNrx!4@rFCvy&Cf&yhTadvD`nUn5gSV0qciPtOi;1{! zHu|(<^2I(A$4rew8OI!0mC~~LlVAGg+iSMo!}6B1A$%0p0?p-)hGJ`k(g{2&Y~$-4K{w)looc^4Qy-4P2GQ5QaZkd)yl0} zh@;PrLEo#ezSCjFsaUMM!SbVK^eu`pmK$T2noV*B<5l(sjdYjfh<^9^p3VDpe7$#} z_rMl!*iH9~$^)ABB_f+Ga!bdxb%nA;&&fV?2^EpwUGuP5v?|CV>qArxh0-1se$PEwcP32SGNdU#*wG79CEGT)EGdW(IBn)jZB?s3zenL#_O$VX+qL^HTijly~Eo)&Pt z-j-d=Yt-Y06YJtP6%wd%*Pgp!Ut@ogH+y<&ZQ*c3?h_NKTBZ6N@4T+&aWygSe4l#W zJ(?QAkz-DJddda@wu}R{bxlzdFZ3EPV=jvyIQ1V7`C2@_lJDa6urjvt(u}bki|QGT z{WQ%TegnT2MnqRXP zJu6&$S^1PXVc)@tP{v3(jbLS24ZCXSfq9YIf=o{y`=Y{l{b^hOePxcUUi59>IL@^@ z%d{-QnXn5?r~`%0<6j~jUNTIF*W>jm_CR-g3sN>Rjm|z7p;7Ti_MeGn%@x{X#ko#? 
z<;=mS2NiFS9dti!?{lqv2ATV^FEQ$@2|BW#?cm<(fLm4UQycWo=gwM_P$#(gv{RiJ z+&|c(cB2yof@ZA}f@P4sbJbVCaHM!*i{r$Axg>+d$(&cs)E8m?c|4eFV&C2~-hNX) ze_V&QsbD4iQKn@$e^R6?c^hh7;+qmWV#XxH;vt8(ZWx8TX2gPN0r|*Vl8*40j5q$lb%qnEHABxqh0+`xr?ZgPWKRp-bYshb_&HXmFZD z4=6O|>7CNNw{W&Gz{Uh3QoxiZYn1l6)06cs)#yc3so95`Hd=MzU9-#7O9x~+&3y^% zuH8yBhT@XdYXR==t!x6ipE>o4ZbmwN&#}5NON*kKoa*%$xDYqUb<7Rf{^hA<-qX!p zFMG|>N$3*Xx!a*j1Qzw*y429pco+=K?G+Xrnf8SyACE(9zFYYd24mk@(?a$z6&W+2 zKXyhuyj&Th|KKBsBt>C+AbYPOL`LU=YLomv!m(r5vdU=pt2~tJNt4Pd5X#HXL1<(- z8Qjw-pT~*}tv{&NO0AaA%AFXBcYBf`Vy+elODf#gW956{MR;EZraIX$^*!Zh>2^=} zg3h|%RsD6VvGM%Pq*N<$$%bKay$6yfNXSZJS%Le96j~!%j_w(hiej!RSg=gHWb=-_ zWcSwDqwKfXlBUHT7>&MogAn;(8vA{BUfF!T^6v9>yDgEjkCm%BC=a2{0-xBYS5zMP zesz4JGgwmQcz(CGgPH!r6CBkt6}3FHB)BB8#z0~I!P`p39gpEc&|F%>T^54}<=vQB zSeJ|CDG#j7slIt4#3}lfmhL;ONdK9`eFUfb?92PKX#F~7K3)>?yFU$k_vCzS>;v}a zZ>q@prRm@2EaGEshufd=sUQ2YA^Lo+Cel+F=ckAJNk>XPRyZnTGpW==Zu|N`z38oex>0Ru~!U@fB9{O z`lt{G9hZTD-MM)^VY>&)?ZXd6mQ?p+i#hLZ?e91v7d+B3-4c>^LuidIS@oo{fBSH7 z?XlsK{cjwv_=u18H#2&a51+6e*GEJwckr|uYuP;y5iPW>%ouAUr?>3sE$M^?*{9JeCt!c8YFWH^-6ioRj7HOEr3n%v2J6uZ|lxYigOUaeu#;~WOsbHhI%EeFgroR#YlGxUE(TAz11M8`DmD>N-aQ_ zJ^sr9f6;}i>r4>+uU-;^W&GbSGAX48Un=IBVbRN@w(xQ8L}_w2KTiEl-{)g^ut$!K z&efxH)+Vc?OWTKH;#zIj+YO~;E+Tn{cpPmKNPkbP?+*hh5cW4 z50biRZjhi%DueqcBh^tO0J{g3Q~Ujc%XH_|?!+ssZaK&aPb`{WVpU}*GP_rDgsk+6 zLcz@%@0JCvOAA?DmJ5N`jXd_NC#svudnH+0`3p)V3XLkN@~CK+(nu%v)(9p|(MgT> z?|x~mKzW1pz-V)jZ%Q8do?b6~%Pteqp?gY?rW1n8&+VOXov`IGLdh--bi4~)6WUa$ z@2`}<{fZ}l7Y|$NN_x7;%40%Yr3B&zjIWC{dp)KcGT%)Dr@oL07%GaA&;Y574D}CA5Th+ecmb!WSlhoUYb!$C_{HyS&qVIl zQ`Xc)iinV129Gsfm%z>+{dgKT)3ZN%@SB7cQ-Hc!BOZ7EXB%Jf@8@gz{t1f34F`+j*&ultc=l8<@ zo`kvjXoXK>=BupiRtJ|#zjYe=Dvf%7cWziyg!-r$>LcF4$l6fGMzqkHDQ<+BkYZ3h z<(f)PmDnx8&Y2R6A7^C858A=6Yq`*{s!LNO845@=eipvX(5wAG$_{i3~rOq8mZh1f&Fj0!D!;>$mM7z0lWFsH@xD=;)M7f$T25mPg55Zt_lzFE4ZGO z^M%P*Xjq}b()I8aNfFvXn#ZSJJ`kCAHsv##KNrp~xsl-eZQ%gx`}jn|j$4BJH%dfu zdc8{y9oTK9ToA05j$qrco%wiAL%jMa;VsV_63mlGrjQc<;%~2=bOv0C;)*mSj6rt& 
z#4)YNOmJsNT3Dm~vWBl>3r#~Zzrwwd+kR^2)K+6*)zWY7S9XfFog5Bi%ln|O+I*x! zcA2H4wmEhU&lNXni4W^ozPWb6=Ss)-{$??1&skPMMrLfyZMBeKjH;WzIz zAAgmeD9~4l$lkwC{Qa?0CyQ8LQHrUA8@(o1h`M-F_{qTj=SxxM>xr)S)Xkl0XqTAi z&^^KjM+3-wH+v(b83tps#{M_6SO^;G?H_Kc#>S} zYTYjyKqbCp*JjhN97qVggqAZlr3j~rbZ1d_K~OH4o7A&1xP{2tTk@mJ?S_KN-=7V= ze~bLs)S;)Henpi1wllqjH1@SR#?=tDG;Nq!iJ4w$AwOSKantAWF?Utolq>JK1S&ri zcx?847?vu4y-I?U*wRLk%AEd1Kq)OXI$-y z!-Fg&=z6Ksnd2|t6t}%lVz5@;Hd^hvxXY<7nxjX{)TuaA4WWpu+JiKUX;XYp-Brm& z&Q&)=kmz)*$e|>>d(>3;p*jh26hR2mvl|h6&3QXj&s&2o&Fotm&uB+2(vEU}8~<^) zW<0gjzOBP&X4+?65+5SM{vJUzi3ACW`G>WYlC~NFObr3A_)IUk1v@vPCmfc(b(VKr zM6l63MDayk2<4f$y%!EyFds&Wbm{tuM?aIAwhMTSj}0-h96l%zV<6i$7$kcGeMSL! zGh%NDuWnJoQa3|l9^V?v_}8~t4@Jt{FTU5x(3g(qzmVoQqlBojkNSGkW}bwn(foCHPNm?dRZ9hZmf;ikI@fjC zg?F0eh-2e>DYvr9yNe^A5qxH!?PscF`0Y=6Jqnj$%ldrm!?-1L+&!{%^s8I7h9L~z zGM8W7S0D9s)>~IThgf31dq!mW7{&9vD0EER@|3yS5(=SRqJg|^X`NcN#4Vj2m$NUN zpE*qhG0pe*drjr+vU4F!U=X>Rgk&W0K{W6q*?<1dB^s|r;FY6N0(Vw3!>wOUND?G` z!JXAPUYMS?HGkl)7KEc=MRk9EVqdex*-h;)xS26GnbE24tp?s^)OXQ^S{JV44Fgv* zEc^_;*=a4kyk*>us}teZHe zUPled?XL#oDtNr;BaU0Gq_0M3r4ouz6QeV&iBHkd%QO%+i233qwPZnc$l>Z0%caSg zH;V~sylEwFdOd=M@(N;so9uIA9~UGiH*CU=gGrej zRi`B%xQcjx($zs#?OS`+F8f5v$gj?*Ir&A!@Eh~{hIt3wbL|aKiV9kuw9PR}iX--y zjxYu2QQB<^JeOB|uW_5qx}}>E=9S#D*mmLQ$Y%0~^U;&zzRsL=G2QqNjBl@Jhd$A3 z)rq0TJqX?{Ea%EfVD_q8qVEz<`>daSwt!t~jfBu75;hd}AGT&NaVLST8LE&UzlKRV zWsF@_HnN_5nnr)js3f!H{ypd;Yqg|xop`hXr z;=bvPHmjpYm=_Xwd}YWUh3b+3NvvC%ujpk`&`*fZk??|Pp7GVwO&TFzT;zuX7*lnGy%xKpkd zF6G%@WgfrM-5Dg3&Aq3j?#kt*x6@|b$*Ni(Z3$A9bViQnM}iY@0glSnB4?RL%X~M0 zQ+WO6%l>0ytvYL}Yc;*gAKrPV6b-8 z?7!?7pL*?753GOvATj0}Gd^cdEym6Uisoj@9ujW9n#0S);rl((NvQl=+`B=ybZyD_i`w!c_iGlZ~Cif9$yzo zRmMr3kK(RtE-1dGLCCd}v3ko{%-TA0@riB&A10;E_(MthV}e+&r?Q=aLu}5thE!HS zb;g$8h4*c%o8&VZ79{plfuDf>-aE$qLw=*7{oC(CyDmQSF_!W~RBtUOH%sR2v$mGT zmhBa=96U*Jl+6?(gM9C}IroXq^m!dOb-a(UXJhSL7Co<8sMIXg+ic37SR=R+rGK1; z0^hPcYcTB)pC8K4PQHq+(|Hx^PV2UDBd;-D?MW)aH_kXw^pMvwM88bsMC~Kz3_rrU zP@XNhJ!>+1z{7Jk=S#o9QlI48CiA!7+gGovU?Hi-9=&m;;Dyz~Lc{T!(PL>{-#81! 
z1EUq!NjL^lsVwpz_QEhRCxKL!N?|26nD@ncT#eY*wBpy`fRB$d32M%CGm zW1q@}GX>^|nwHG$vMEA0CS55`VI@!T6@FsW8JBi8Cp`X9m2^o{eyWvq0lL(!CUn|; z#U;MLAX$Bn@095S*=vF`&cnR}y6f4n0B5%cNBsVl0N986dn}RvFqW8@<4m9gei`yA zpB$za_~Lsk-M6rHY7kjv7i%4gV9R>19A`<-#RVx3wg(g|aH&?Q&aEcBL5jN~gP-9r z?>!H=ZpR`pTfkxjs_MK3UdYWwCoI@3n$eWrn&pJFga8Z*kM}3dgvMrFxFpc&>}GXhTJMCEzBjzSLTg{# z)y#=9LfB~0M-!Q*vpxa491>2%kx&hZR1*Oly8chqXv84sfTakz->_9m>1vOeV=|Qu7=)PE;r}f zH0-O<1+zUf*YlpFyQW=qJ?;7;q+PQ-rACHQ|HTvQYOM?R?l}8yIcdBVY@uQ*LTK8^F@(bOb3{s<{imLDXf@374Yn{nRTQx074vx5{O%t~NYwY+m@5-LL95KNSlpyN5Gir8{oXjEiVU znKX3oYe?ayN)L4%;uJU1{_bIIs$Nh_qh>Vds{Dcb%Awf{E_NpmWJGHvj!`Ehj`hSp zdGhH|(}Sf=@)QLd5}S62-&43Bmj;M80bueL?)V=i6)j~WZKI=d1bYid4L#6W2<{#V zb`}J{S-{jSw#!Hi1_A>+DB|6%2==ZXE(9n7%sh3VFkMee58{0UkO1ILG%cKfQ~iJ9 z{n1(e$1#3UEEWS5MFFSwk#IO1IG_*5g6H=^^cof(1bgC9e=#^5oa?~R(5%72j%m%og0Ng@= z!jz!mfV65*a7RoVDght|l>x;87%ZSTpn(+>W(|b_-EE*S;Fbn}(jE{NsCIzD9HB5L z2+SG450Hn#+@LT5Ko}qbg?T~|2%z_QD9jrQ^8xq(PE7)r5MT!O)*g0%;NX1u{XPS6 z9De@3`@cc{ZKQ~*B{eL7(j6^q-2u(~%K*u_TY(ofaKLcEKlb?ziTT*`9N_@pyEKt z`v34i{$Ft*#U-HNgTi2-7z`GCAQBQdC>ARYMc@!n6bkj@0bm#j^pCax>}S92K0o{J zd_#WL|7s850PzqXfF4mZATjU}MOXLa1J(rg`N0wRhM)ki2;!Ci6!G%`r63Lvz{k!vk$X^!J1U4H2r~Wi zAzA~_j%YCe{P#Ce(;y72|Lxhv2>}`SDe%rWI2DK^2r&J7-A}qd-`jP+tl{^zpkheU z_&LbH`+tYVe_xy{K@s4R5AeBaPz1P40ZQ7ywER)h0b zk^WKgfnv5P`a%KgCe{)|C9vfMe*DjqgWo^nk}?8HOh$gM2U+@AFNgR&fd9IZ2vd*) z8o{hApa>8#Ob+>fZ3bkz+s%Gs;kW$fe|-IBMgIf-wtetFASm8X4u3?}A6G45C+)4l zi~^V(pvc6_hn_$*Br>^u6$10ScnZJ;LNI3%0umfoM;8xAdrPPnN(=!P!-+ucJUm?8 zkHKJ0zv{$X2)0KcFd);h_Ot>nB>vvb)!GJXX<_96kok!MXi6kt@8YZoUgQ)~Jcfh= z?gI-~?1TZQA9upASU_Pr+F^mrX(vn^4%p5g?SK!!J??~|Kx6*19TEu|>Yp$; z5)RCzKVd*T08IN27y<dN7a$uVegJ;f-NS<5L3Bk3G!Bh|@bfEaDMS7bgh8$o literal 0 HcmV?d00001 diff --git a/audit/dataset_statistics_plots/models_per_dataset_histogram.pdf b/audit/dataset_statistics_plots/models_per_dataset_histogram.pdf new file mode 100644 index 0000000000000000000000000000000000000000..bb2102a3c78924d2a0507f8d0dcbffa6d8984a04 GIT binary patch literal 13934 
zcmb_D2{@Ep)Cr}rB%~5G_TB8p$ZqU}5VB-27`w3~loGO5C}kHSN=k|>l|4(*LS$dE zBuj)s{O>!_*C+q;_59C2p6lNCyyu>C?z#7#bKd)&mtPOBDvppqL-_A@L+_MA5KuVO z&Ds$nBLjsQo^~ffVafzAg0q`F6sAY8CwfAW0Kze-oE(JcY6~Jt{nkO%&D9Hvq9DLb zbgdnUHeOKd&r20wFEs-%f)^2r{t3_{czF@Yu22kk34s|n5Nt`V_E5?7yJR;T1ELqy z3}A~_1w$UQFtpT<8O&r9Z0|4a)r6&^s>3)h2J4W<$^CsH>;{oLd z`0EgDNdzS~Uw|VV_>;t8B+v*X28xDDNnnB60#F1nI)H>H6!%j`MORlh;0gia|E>}6 z_J#_ziLUlu4p788TUC-XFlZ=D)fvzPo@nD{O9W;2^db`pE)d`3+C(h_PApr<;?mv| z0`&A2vIXp>3hKAXMj{WQMd`2aquH^&wtvQbcG1(%FVxjrt6VX%@@;)$*QmjS2mgHg z)L`G+>e~7WuJ3b#4>0Cjmh`dq-;&kETV}rN#&%ba;vLMzRn#qU$V2-^t3%%22G$E;!_ z-XxxTW=#J*r~HapSN5FJtgM`U?A4uV#~IvCbHUFJCkWXd--+>r%7yQqC9|4~&fcyt zS~2o6pUOA#2v2OAO{ojy-man?!@FHWOY_LV3K{c=lrsb5TqL-7Y{)~j(@P(!A z{qeEZGH#h=3cIPX1x!;xMw~3TS1*7b=|@JdVSBU39)<5#E-tt;KE{0gaOjl4Sf;zcb5d(Az2b@57Zzutf;5mnQJ}e2#w9 z+l{h)x`+I=u~1jezlL}Idn*I;F7Fny>2|?)k%^du5D@$OEDmQhaPxcBGqT z#`32LsT~|vpbJ@0!F@4t)jqDY?SMo_)_#|x$77o2TW{PQ-fR27>~)>1bQ7}(`9KN6 zv_Z7phawtJs2bW^%Wh}a8+dzp$=OJ*k?p*48d z$iW47wcob)2~F3HykR!;1|E_Xlf=&3FNocBw)_QJgqWMmNb+_0kO*@sSFPh;KTIz1 z-6F-tB^!T4y4w`S8eieQsE*dU z`q3t+Xvj7R{6dy#C-}wdyc=F8RhyoiN@T-q$eP<_mpIHue8A9k!*R+>815sQY5yIs zB2o4DmWVX%^*PNPQ%U`}rjqCeO-n`g3{BE+r1b6v4}D}tJL zTummj{$eC2-15-bnosc9c_p;O&O=R?`OT)~(ln^&PV%+iXSu`keJ`_`g6)J3*+h%R zs^fVjezc6S_p(2-FD-P}JLIvsD3ac^R57||r1+C$*n6|S+10qRL8pCmW>T-sj`wye zjMpdcVxRI=vt`WQIbH6AJ~qsMhO@WkiB0j!*QJq9Ukj$OIeikt)Ai&t8B1n?8B6tp znp3ZI4T~RJtleEMc2E^$yn`jd-QyxAsLxF?CQcpP7Go0p>LoN|QssDZB)7mdKjHLL z0hUYw~PPA%*LbXPCX1lcxMVte!|o%a(j=sQc`owz6Bd^KCEaxz;W2x(}n6XV7FL zP*SK3$40U1AkYkI@e2levrV=gi9rJf{XYylMz6txMv-n=mkn}0Y09_#qgzON2G4snifG5jc2(v<2T6hiTw>n+s=fL#(qAx z^xLSDhg3p%dZcp8!)!#kjz?M^Nzi;zbKT?4F{=-#FQGoMIyV)&lnf7Zk<+r^?=L_! 
zrbu^!96dM%&|dvZF*-|4w#JsFYgdmcu8{u%zvv<_UMB>)Cjr>*$`EQbCs!r zPrCP|{phl)*SHW&6GqsfxFjMQ!ywb?vu1fexJ-Bsevs;#X|qZtGPb0Y%uSODj0(&PN3wAeZ3J7i?`xfpU ziIw_q*)jTME_ejPL?4?u^xXU-*%sQ1=+0-1xylgbV6oSH6^oThhXN7^YF5j&#j>CA zmMaijx8#sdv`zTs%84)Dt;PMPULG}1TJkGgNH45=+~2Emcu1!#v*82Rm{B*@(9yn2 z#quz%#@ROLZnv-%z67T38ZW}6wP=QlY&nU=o97+O460S@8!OTFo^^Fx>ST_q&N$Vb zZ07xK9BoV$d@*k-*rL;f5t@OAngn{ARbSw8Xv^M_YJ-)=?>DJ_V>#6=92M#kWW6k~ zcabKqIGLu1g{>VkZ*x*eqbs)RbgjUJA`K&zxY{SuG7$SHf$G!>D>xg+`y7!V&S7gF zA5~+sK2p-*=5MybcV^s|+vw zXW34d&?7r`IV*C8r)V2q#7;iS}fVurf9$}in*I{N`R6+DdnJ{&p5 zI?g1@lW*mBk~fFob{Bc~`5;yCn+L4;+gYQ&vzOlYhbN7xNq)YnPBL&4+fV(7fC@3R zY>e;pX*g1p;zXCql)rNL``3L4zFtiZd$r^{7w>ciXlT7Qd?YZea9E_Lko3VEi8PoL z8P;JwQDXerb>UT$O5|9QzBHHr!u+Y)`frC$J{#~*zOjWaz)*VA0zsn@f3g2!HDb9C zbofPYh(zli>sGJSX?Ne(PuJRExeFN(J{ntr<|oc363gV`aU$yCkJ%j6PLbv1VV{tP zp}4w>I?FO1p=Wd=KcwtWb&AUbXk=9jSt4A5+w~ zJ7;@l#GYU^agC87lbdPwA2?Y8QHQys&L@7EeOHvxW^k#bK@0sZ67Tx*>l|IRZ5Hil zX#i`WQ>;o4ODLnLNIUnZD$9#l>)^)xAZH&;xkAhIf{Y+S;JAg~E}KfDnH|a;(FAnF z^H|#57kjS=Ej)Bg8`X_G+-g7BUS`2z1|>SkD@CT)acdMTFlkU78hYH(9ff{Rmp*k% zvf|qF1@Cj3G@o1^dJgA8oV*9BmQv#bu|hgt5Jgirf*yX|f(#<8t$-cF1brU(*Emi>dgt=)2{FM{diP3VcQ2reEP9)En9Nd zW8o+bQ31=QuMH;9yqJ_4=JP{?!`?3#`v~OE1Gj@74(#_!;~7;jig5{HWgudlIF|7* z&%V`^l)e^|-dG`I&%i-963UbwuU#JR7#(||CD)_Ux1wZvo}M-X-EGB|DWMl8OH`g1 zcl_+b(BC=q^&6_q(9ERY@nEaTC)%Fsb2%M(skb{XI=0n+f+S(L?WFUgz0YYnNzKK8 zV?Dp^MMM87uaKhZhwOeIT=SAoxm~3wg&1#fH{>4VpLtLT(obVh( z&i&YPh1kW z?max(u9SW6CuZt;>0;j(#|LC|46!)@Z5`e%cIcXc8#L0jI?pRKlym9~OFid;zmfH4 zmTkpPid2TE>RXZNImScxL@4S6sVe9=R6q~Rh*jQ8_vRtpFNibxY#%^h;>_-|yXD(K z$Jeg%FTcWBB$F=%OH}>3OSek1Wbk>jISNk?6s0%CTL~7Xx?=m7Ldz>iIMo$z9+l& z$;^>sjE(me!ycwvhw3sA7wrr!9=yBHN}GcFiAG^7(fWb#D!~01QF5k0k&Q39+=srz+@ajDqJ8($a1Nr*PGwa-E ztJ~^6Sfp;DOYjtElPWxbDWNGP`nxTUp88?86C@Cy^(B# zm8l=1HoPmNxy4wwh|iJezgWvE`Wmh^aE7wwy(>c2MR)#g`*uEIyDl0*-Jihw!<`MJoKg`z5gpYMFw`S3Y8@l%TU# z7;CPMZW-ODwbu>}&)o65>Kg2$$Tbk&!CAm;jC=OYg!@&-i*k|L@vqW1Go`#v1)V5G z%%gjpB;8)R<8Ft9ow?mU3dJ*+*Nm+#oLMP* 
z|LwrEeCuOVhb=U?#Zv=jhr~;r8Or-k@2rq7ueOA?_1;*32!fsQofh;)c56_?uEIV@2%$oubF!7)sELT zSMs@GYZD+S8!t41SLadFD`t?3|L|1s#srh>ST|LjtupO(_5&l$=ll|Lsds$vF|vMd zCO(*_^6>N3peK$y$KA*6c}!6X^Sy7|LYIYBm211pl(Jv&}9H%OBLF$#Zq_FB~({;sEK$~iHYykLY-yi%iaitkVoi~)R7k7 z{L!UVTG=yBd$!Q-76BVV>Te!j^boP&EfNX>9y%#^8MMLAXBOcKp3p>Ji z&5j;Uy-xGE+gs;HAM=S>^G{!@lMkCJS>M@Du1oiydECD=W&ZgFecl;`+Lux`UiVMG zy5O2~0@t8f3-igpJzKYM97&Vld05}=T>^KPc2FKnX{dTEbWn^d29wpK$K?)R`dW%f$6Xm~~2Tpw1E?|Ne-$Dah z#agJpSYH@DMKWMeOPBX9DcFncYWU*+rgQw(+B5d#uQcJo+l(LSG!?;4iS`}GO?K^E z!@Df+Z4lvAD|(VcxYbkb{f*xWlD-$ z?n?GE$CDlXx(3hNZmY@~LW~l z08egMW4QK+1gekdWNK?7Zzo=0%Mv@xN=`Jcm~c;~ri<^8+Rv4Eg*dinKXJeTeofbn zfn8gU=7x!YRKrJ+Y~~Mochxe}pNuv3AK1P6erbiO@}hC}7FydPq(UJ6A`ZYD#W7JU zGJFEoO@G(WqBENd;(Or1X(19r)wwWrc(DJyYd5pLu=oiLDVJKiAC8g{V;o?3q+fSe zY{u1`&veEyj9=!*Rljd@2iV)=;!WOW3hw<;B$oBTw}|rqm5u7XAdOVSz906JeR(?A zif80EJgr*@zeNm#l==&620o<#CI!DJM+;+CSkaW$jYy9iUhR?-(Rm%Bb%bOlx+*LtVyop_7ZP~g7Y9wYT;;ZlVKwzrkdSC)x~Vn?ir7vC-9WfS(wnBA{--Uw)?HB%$GmaMXF{Oyd593 zp0&Ek`sTrjurq=-3~Xf4(*DT%4#{05K31}dJ0={NBNXos8QxiFwU~a*%9ULpBWtd4 zS8P(dW{+tOPwG&#gWyoTHK*B$A6g1CAMIvMKfXF~`S9+mcI{gzc#D7xiTR6wOhr$F z3_QO8LCVr|!-SWT)8mFB7gN_x@{WlKHdqDYYpQ~2En|0{R@W)0!m;LN; z<`MBqu&MPBhrlIcg_ePH3fIw=%E+Yfox!{Y_pi=(FvsWeEwhcyzPVW_R^oZST{q20 zE{^|fiu0rj;wdR|Hpy;WS*5F}&gLm+irwR|w?b_1)O{k_nq{Rw9iGTq3>@ILxi7f!er6MPZfKYFC@H)Bb@(HK&*CF#vRs~@q~-H4 zOn%?ZkJ3G3)~vC4+f&gmGAm7jnSJF$Ufj_h@pd&_QFTPjv);B8Tac!y&W%K0id~qn z(wIjf^on$l+19qn<@4Nf8L?UPVf?HqYKYH#kG?-Gn_6%!fC&sBskRu|79kk$O838g zx_F!hnOBic1w1H953_wSE<={`0}qO(d0~cnw)}zHULu@LN-MhaqQl{?+9tY(Aa6>|aen)E^{5?e^!# z2|(0{u^dFN+cz>S_zTUlwU0D?3iC1TVG+Bh-8)>1FD_6lciCaF{j*1xR*XT-nqnu|peF+Uz&Ym3MB`#pvk+%&D@< z;)F@n#=RcixZCy)S%V?=r!z)F?0iH`s7aTDaHTvxyTgyzET%4n>n4-$qsB)jU&TK| zM=vly_Cc(^e%d8-uOj$x#iI56r^(i@S2cK3iaZQE1x=KcB?4FXPmlJ^{h%+-N}&I9 zlF?g4u!z4p4ZPgM2v}L=>C_R3~7++l+;c|yVgUhb)O z-&`K#JCzS)W@C+-@e4|0N++I+?%5+1TK3%j`SGd~gj=mf+G1RvXz8hSc-0*ydin=@ 
ztaxA2^6Q&f6^0800PneCSD33kr>)bj(hB0nmp+xPd|4gj82z?NRk0`Ak1>ckk(;Upjrvvu9Lmp_GJex%qOAqYs*>D;njqmQ_-Ks z{9L)JE_D!l_Pn{45&GEh)v-&vaCd{KL=@fG$*ewA^Sj?;Q$8A{p1ikTc6p0IZxMf@ zB>zI6m{_e#hY)nZlmo*C8JD!t%c`cfQ_oU%OPdy@zr2$NeQ2w3W93+!;kA6@vlG>k zsiJ+Cmsre10uM*?CpH$VV8ZX1UvIGy6=t2g%5%CvGOd0*bV$EYIa${@r#_?jPVk$Y z1rxJk-pSf)(3VRE4{HSno4uEg95@;}`5qadIy1O0$r|dJ7RFyeKP>r{%=~oW62JX> z-+SWQg$DNTk=?_gU(V?`<@C19PR1RZiQlFh_a;~Ij^kChn1fl$gXS;3C$Hq8zOEF! zF85N58M`OM@c5lFP2)Jz%9{o$mGM2ac9ZNTO}41K+KES|Z6@cMRZm>89OJm{ee%or zo2;0uH{AZW?viM)&a!YPUMrZ-C0%A6i|TlHPAr3aM^RN&$o!kn799!dx_$O!*)k?m z=TpN$S8->YRc*yivW}Ga{Qxpwx~&!hMn_*ATUK9w`eC7`%{TG>5ML5wOES;(1NrZ+ z1)f?ty()83ewSAy{}vOsRoE=~7v~Td{U(=M;IYgfa>?q)(C>SMi_nvAv26n$D@X5t5$mG9YjFO6`I1JZeaJweKKG}RomJs!ZHLK_*d0cP z1{$nXVS;1lpHxPe$lmdf>NzB9J}}AlxZdVv1ApQ<_kjdf>a} z%)(F1#ZSHM4zk|8%Z$6+visphDq~r3kxvGkZ(E2+q~g=!2W9T{9}W9K1^cE ziJqd=M`Vdx-l`79PFJ$VbYyRysYqM%Kil52v`RgxL)hYU2k`EHoUS2oe-Sw8=xMpq z0w-CcY)=(#Z9D(`^a)w;Wzs@IvrI0%t*xA7$xZ?50WBKQedZ8(WV`d~bU%}Mbrtun zxYH-R8!D%7?&j49m7St{lR?{g#T2eW8xX6b%(tM#9>h4v=a2GtP%3A7=ei{6$>{Op zdTv9U#^Yp!U+jr^aZaBFh*61}X5~ZIG=K7nP_8{Si9KQenL#6e9% zTAP_}gVtULpTzG2g|ApoUY@A<;-DIG>ZVU23k%+@(M0IY&La^xQKJ_x8#78FHIi8e zM1|QN4?8fuN|cJoDYQ~y@DON}$b5RHgC*(f(-^$|ttL7r+V_v-1fu%G3+jjM5?4~n zIE=oeiaAPWC%l?jdm?{%ss2$z^W3+LZgu~ei7SwjJe)B*(-A^5F1$YRlZj_neIhqq zYN-1l7uHnoyO*uG_Pt654buU4)gJCB&Z)C*4q6A&qIKg(cU_Gi?TmZ;xc_0}-T77O zL}j}z9(jS+`Twl4pP7V|^gx&r!ISv2rKYQDs%I*yNG1`SbqvA4jO^*9>_8v`Jz?5} z^)?cNfxy5Hc%r8bndI)}MusBRlL!sGt-UBoiXa1E1s^520I7ceru(zA($CyJaY;!G zR2&5)^daGJIFP#smjqM$K=L{SFEWXe|0e;5gAN$@`_;05z(B$DfV4y)#R5i97f5P^g2yOE zP#gdxKw;M4fS@p2pa(z)xBzaFfC^yg428KsVXgpID9jxS^MJqrR-OQBD9jtkZvm1J zVa6m|F9%?BKNC3r$?&8od*d|zpPBq^Iw<%%l#ovar5e=9@vB2r#|3iWNUul4OBVa)R(~ZGMf(0Tag@Z~;Vxb5e0*XSReii_R zkwX8(1z;Qf*86Pq-CRR9?*GCEXn=Gm1>i@~49E;DSR|5CDAzz5Efx?AgaMr)7+_g} zaFC!BAOHmyxKHV~j`yn{&P(}((kSMxa8UQ@XiUHui*Az{IF!26wWt}Dj6kub(n`>|@P-YMi`uBSqd^hU# zd%sr0@3^31w&bxf$w2*2df)Hy03M0}PneE@<`#JPqO^2@ZTYjM2RM+QEh8ur^cDaf 
z0aye-Th>q{#fm}zTZ~-yRYWM@btrg0%{h|dML;`^T<1fv{K$1y6lZ{>aDxCwobsv$ z;Nc8Kt#fk$oeG7XD=^JJTW(NbZYeO(xuDkhxkFLw9?Jt*L_hD7p{R96fOTFU(@%#( zd87t7t-GsrZeCC{g&SnuS)tcCg8mEG=oFsbP)cYC^rbjB^v?nQV|h0WFX%2+5l~8m z@O$DsNM^&&C?bCMs=q!SfGH~iNH7}$6akuam?H9jMFafiKWM1`K!ZyFVIPGc`hUfQ zO8|ZpAPLrTa=&?$-y)rV2K(Dm`UeOqe|?1i071QO4Ev{h{rRN@Op9a-#ty&^0o6K+_iIkDAXRe}uqX<(5z9ImlBOxG^ad&p}awb_reNYkzxCBlN>fq(&?kNp}x%|2( z;YPL>hQNTB#@5>g_$uM|Ztk{rP-}vX6ToJJ1b|9mKyq`%gWquo;iZvqz(%2wI0PCl ziN=V-QG#%|pzv>+@g)=OAaEe&fdEEz{RfrAV1Sir2mJ*D<0E5Sz;VEf;LR|s)Sq}LaQB11UwjcrIJm?91p{OOck;hsNW`DxK%#)d@y&QxAhmKc z3?~IFxPQ|_fp7me!>~a1!zLISjsvXXzx$%mpxxU9!(uj%7mmgO;p!$l40x8e8HWCo zFGlhYUBNL};9P5SU#UNI50^xO_6z*|(lf9M;D7X$l=@>`!LgJlg@5b8`dD5ykjxxY_}p05nGwlkVw7AbU}^KLQCT9m3DAqN@t| EANzzB?f?J) literal 0 HcmV?d00001 diff --git a/audit/dataset_statistics_plots/normalization_quality.pdf b/audit/dataset_statistics_plots/normalization_quality.pdf new file mode 100644 index 0000000000000000000000000000000000000000..c6865465f2d27054d4c835addc9be4356a167b70 GIT binary patch literal 15186 zcmb_D1yod9v>>1`5=yHe14@X%Ff+^y-6<^~tqk2Ugrp!!DoCmzAtee*Nte$(2=xQ_SCzUS;bckg}9-IrBOT9zBm!v|q~It0xwhrpq{PzPgc zh_Emers?5m28BssU9h$e7EqWP*22sgiU0;&go=tn%CKG05zcTKQ!9H5D0=@c&cQ^(%mu0s zkd>ANSTS>Vfx;B*02U zg?^_mS+~dSyf{J(FTm^Z$`;AR=$*~Nef{Hzx>dr|U86WMEnC=|>d!e9osK$BO?h>) z#IB6cRiY>J&_<(@+%_*G=B+?z@L~1wvMXe&OzBh6OF8Z|$rn|Q9X8Ln&EB8Z{&=T^ zBmAtw?UMy3FJ`6+Xb=a?H`FI2#7&GgOP*fn*CH6~&)9fd{nh)LP<7$kesBMKxoL_* z$-}VA125+LZIZ%~QfZ=0(18^VIW-j6}o!jg1vgUzlbe2`)Z!SU2bkPK$ug@a$+Ioy2wjNVIv=}Y)$&F--(x~FFpCne8G%qiV*JJ#!b=#rXVULdOjesQE2zD{_jI<+%3$xe!2 zmor{tYEB*v_CEc(RoI^ODK~0%86guc<*3nJAGUgXRW@MT7`QzhF?Q_yrhya%}_hPkD zu3mLl`h;#Pfx70-zkg+l&}(m^TXVYPmAg)y$kn%VKL-uU2ZO@1ne5AOFWN89lugKsbB+&6YslP{~b z?kyLOT7R7NwJ)e%wu5x`1eH_H%c=H38Ob=t@XM8kXZ602&sp}*Hd6#irw5h4gVsEI zGyCq!X2f@XsC&qckdEX~k&H{PJYVqKlR!|uj0AbCTK?-tP=bVwGEpiQAM1H*5m(8c>KRrXD>EP@2FgPdp zh)ay~(CB4N*|)dgGUQ`A3~Cn?$puQ<{7Yl;h3Gc~c^B*(dNi<&a7bDCGMIN15 
zvaT_GMlxWxxE_=}y!^o(UF7iTbM^L{l$|LDD~_2FFtF@9!Tk`k=P^Mq3P)i0e#D(U zR|^95p@u(UV6gek-D1#yi~SE*i@IGw_L7&F$m@|2p~0q>nQOxsM2; zU4-f;@lL4t4_%cGCYm%zM?`3b=4>&+iYlGh;?hpi1++{N!GxK@!s&btVE)$txgP?M zd;)*LRFnj+o|l+#@mhoTOD#XVlyzIq*t)6RcckvlhZK29xfczOeXV7@z7T46gXx$l&<3s$}Jr^-=C}JP8XwC~P+*R<24^^C{`Gz3R_J(-xAX0YzhZ_x#M_-V(!Hw(ZO9R&*dQArc8Jx!M^m-kvagJO3@J~t}a_4PUv z0mY|Rsy>=A;TMKABJ$J6s&nQe26!^~B`%#vwc`&+kXB)3VU#U(e3%*@6LhnudlT-K z&7H(urZ=R)5?<=0ViXRWWTMk1PzYf(BP-WwXyd)wl31Wp%lyP&G_r~Qq3%rcmhGe; z%f&YNF5|9*8z*gu?%Z0SY(m?Xd$6+IU>WDxt_&m&uc>h}kf=g`^nO35xAWMG`oyM$ zXNLsG`oeIUpP3*hr93@N73pBw-@$gEfsQ>(3GvM%vOhI3s*90h78*Ik}2EIxwSRv7Vtb!pfN~! zOV}yUNBP!3B7Ksb*qx4&7%!bZlC+`exNcESXAR$u*BJ>kmxM*5t7%P~syI#!S{~DK z%WdekS+~pQDQhwwRS8QYoTk1veWOi3eLPz3h~H#iVLJDQ$42UzFEiT%%L6BN&jdYi zzAihWlFc95hRrCMQPbc^<+c_ta0uj*#IG*M=dSd_rVJ;(7M}KDjVv6c$UKx5di}CE~dpQVWe6#hLzpWdpQoLhvO_&|dcFE9_#H2>+^I@rzVOYMR%jjUUs`YMMw14XPysLI6sB7y}@ z^$?EQZKiCJEsXBsHO^&X4p;%drsn&e5P{=U9U-+4FPLHGTsjTb~gv!6N}tIp0#c@&>(piW3HK6hK; zJlo}0D=j+EtjNUrW7j9%Pr5dfjbd^09VQUqK1?frRsX+3Fca8(<6c#|xO$W*m2b$1I*msySj0?fZq|C< zjdZMkV&yBcM^j&C%=%QP&PSr*I={@mtfa^NLDoG@A0ZDgct?mmiSih97YG)V4sz6)gk7nj@T~lkio|T)Cle$etVD*eEk2dMwGguJrSIEaGJVW zJi|{xz8%@4LAS6|f~s_bBS>nP%wLC)T=u@toP%|G|5momZ#X1b{oY-=(KF$k6~VFc za(z!~EYyaH-o)2CqeSc2D+>B}gJw&^KdoyrqHOvHcqF?FI)jhRxy?&^xHswsvpl}U zW*gg)ntw*O-${Vy9jgHcTeY*=;ytTsE5$RZ^K)CvOidZZ`XV(l4e9Pr-nGpr@Gw8~ zXgG6NMVT{n+D=nbR-4V@_()A{>#f;(%_h{e!^$wDR?&M;Y|-s!4sHb%5fwKU^en06 zFDuie_ri{~td@(;FQ^R^ecZjWemi$*lgKYxSL^@MQsyquQi<1vCJ!4mi8OXI@InwU)Qo<7$O zbH4iV`-FlG$}g)Fq%s>d%bk~Re#NPO-ZJIB!dVj_t8RoldU7`Kc&LQ3zpS{jWfke)CPSip<^{I}FRL&bQbUBPRgX6FX<2J$V%O_u)l)iW9r1K>!kI1eCL`LJ_-CI}m z`9d4$P9CZ9Nvouv+o7FW%|9}~~KcWVT@G-1ndUG4eb(r@P7E~n9*WTlgI?Fx_T_kJGSr2~x&^Va&eyAfs2XdSLs|;Vr zf_^f#;FOhP_}TZ(ueXHj)l=!f?P=`q~_cI<( z_^@54mB`f_E^My-0QHvR1_pn^*zlY(qkh&BK&=ddK)iyF&^BD#Ep%j-C=G%WViN=74BAkm*Fl9I#m-#W7q) z3cj%Wxkvj=Jg?OU4<@6k;&c*vi9U#Sx$-Vw-1fSowQVvz^X%LG1QU#K(*%L$Q{f8- z$a=s|1MxRmy^OkoeLXK}#TL!BIKEBMPSUx&o*eH*xoFe1r{{M 
zhdG=Q&aYY$;-YkeBx41z8JXDmLOcGDcaGs)H{{aT6CysKY9*mxj8MjmeM7>k)eW(y zCqHL<#@OF0`(}rY-Q|s}6H}YI7xL!|aM+N*ce*Re!y5_$uG94cg(gqrl^U z;AgShm%k#TCU+kXyDAPG>5*+b`^8G$E-uV~C^I38t2e+@)JpJH;v;C(Ihl1D2VuCE zH8->Vd}fsTxm&u13cNyTiny634viim8|Jh_7PpZs@3Y!44F#XKE8cxQu_)GAtZR9I z1`pU53H(hm5GFzX5(;dKD*f3@)e665S_${geJ7l2*dSs9|4HfR z23h>8wfM#e@gmvEw?qtlhQ7sCN#zyiHtx*KcKZv92|epmvoX{vxNxdUth|PW>;Ntg z*cTvCe-Ug&qJ+Xv!%1&!2?X%)`aQX~u)Hl1!BXa1kzV!S!cDudW>t+IG}q8&ra_!t zHvQUBRWh%)3!^tUyt3zEJ;hgRBA(LMbygl45~b)(Uoi`R6l`_bqhWe!hr4d8I@CZc z5iVd=((rzm^r%RC_v@)MZt1e)F_#4$?pF}`_wEozTsIAHNa{O{MwMD#dOMV-dSMZH z@%1*8%UaK_eMs0`@eAo0E-UH4=*F)LBqyMe=k90flk@Kw&r3&+7fXzruoqBGa zPMJDL@K19g57=bDF_^z_L23$MbgY=B5x$O!Sa-?p35wsn(xjHr#XxyVA3r)+S*Ze+ zn;N6+psTUVSLn!1vfCuA@_d52Qo%=qK6>edH}`U7EG0ziiyQxVDeJ~{N|_}88~LXf zs5PIEU_I=+kt)n>MF|@egC063`z7ee9bLLVnWwycr{+O88(H)2%?`&%^exSZW(JkT ziPp0r)KAZDsT?|DMYh{r@nG{|ZzcYm((;ven>igYPk%Ai10+8nxyJm(?h1)OT;x^b zMG{a3K4ODR1)4+71saG~$E7@x3n=0tCM((Y_z-wuya6?isJomJN#e=A7DrJ2RlaBc!Cd+Ogh%2L%9f`WS1iAVrya# z|fR1F}2BUoC-)3OFFUtC*wl_Ak{~RLG2$+T68~;GA7Cx{O=qjXovK+^5Lsfjnc$X$*pX?YeowGvX%y93qi_vgq( zW`^(^O{IwL(Vm!U^Ye^fb#WbgR=S3JaS}C-M5K+(JpfAC5^M` zag7hXJ{ON|elnQ9e>BTSyx}#!iA$bG`wja{J%N@B4KTN7kH0joT|yAXIWws{bjL9d zD*0!@Bq!=-0^f5nMxqk>2kz+#KF}AEOgN<`;c$0iY~0xFMv0klrioxnk&aErObWsI z3`nlTtC2!oi+HgO1K+u&Lu@+I+y`jjfQ%8z{};Ol?kI%xGaX=t-)GnXzU+Y3$B9@Dwl(M{9POD!+tc0%p%y|^PAQa*7d_10HiT(S3QeZ`NV zx+Ki6-ui7xpQ?7hSIXFbq@*eGR>kAIVt$n(5(64m`d3C=B-AHDMOEp|OwJ%1!YKp= zh5XGHNIqB3aC6co%!M+YIU-c^kpc5GeNDodznJ3M=_>f$!#dYRGi9~Ur@d@D*L%88 z$~ej?BSF@RptY5D$gZ0^f0ex3nnE&1^e%F2m@?&Qn&GQ3+kwDrOa5a?LcwRX-mE=k zzi*kgNIP^la#)O%C^4;Hz)+@NF)y!DYU1^KX&<*YM=@VdjxrYa^z-E$2`E*yiK^}3 zc!62+xR%s;Q>pAuanzgqh5`cOE$K(=|iGO6LpbUYgMDXu2gbT!fS z6`USpF$R< zLjE9nYcRpnKpJQ5Ir#p%DPC>*1bkNW(`PD!samgk?<{^JrKxg{mWRzLsqHon6<*t< z6qU*i-p;%QPcbnVzH9^{s4$#^GsqbHHpHV6T z)7C4Gs~lVd`v+}M%nwlEjjIBAi3 z<5RBB_6BElKU)X03h%^teqBC6+Z%mX=WQB0%}ya#`ha^O!wGy7*&Kg`BskrU#lmQo zGN$SU?hQ-l0n8tebt3rxA~1nxP(XB$UKb^T9TVRMp5D-hw`$T7tO3@e5*&HpokW`wZm|j73#XQ<( 
zA|AP5-B$!sx2`|nEFPh$TfJqpeb+Hd$;hsnY?YFnub=bel+Pj0Z!)Wu)v5^DN4alj z-y451N}+m_uNUmYZbC|pJ6}F_E6*}vu++^+MB?zA^|4Tiya~+L9tfBs^ zgW&LtF;!I7p(MWMv>Kh8$K1sNnzNOrTRF5xntqBs=J-Njf90joeAbdg-{+lNr0Yi#Z7?Up;sS+-E@a3xpI_7CL5$nawRJ54jUetcN$cT)TqC+(-O64 z@$yJl`T23lsY!cBBT22i&eZ7p)P|6@+J@~ue!5;(X9d!Z^FcD#3ddu2$lPe64pD_# z$(YHN-kxbI;fzP^JP&GlF4;O0c(|c*gxz#&jr=UtYHe$e6o>4YU`AIikgzR z1=&^_Cs%>T7MAJn11vmJrvuF0&g&3Z-SHPFXK|wl5i(g%+6+-mz~v!lrxx1pzTgX6 zBZbgGj8;C92Uj4Yxxe}vrCb7^-v!7i@Qi{j-8~x(+=#)GHMFs1k9ur%Mf3qv&MHL)|IqY zJHPGf1wDC0XWP$=?xf?i9l!tZROKb%r;c3iV;UEcl}EQ;c8M2@=z7)aw#C(#Pjnh( z>pVH>oMEMnl$KI;C2I?pk)E`=agNeQlgRQLTb-nIukxcqrmy>mU~X~!D;?L)PkxIV zz8dy%#?zj$HoVVl`1qUH)WBlR_KV>p0#E(%IVBuvaa3-#s}%1ri669*uH?{*Y#ku< z0m(Q}L-XIy3@SP1|hIr-T2 z%F^tc^vLu#%w8E!t%zd2P%_8I=B_@mxGq3T@>|sdYr{S6@p%(xACPqbA!>;VWV!F1HTYPVya)=OfU6b1tg>%B}&}MR`HOjdm71E~~FZeDdYacqq=m z*>xxV4CN`$jZiy|vaeA+<4)Tuc}rY_1Wz@tpD@@|sIdqbFHmRxSbC&3B)NwP2Z=hY zb$YzTNEXIEbG@u4R7WJ+>-O+z5rgq*(m+lmP&_ewi@zSnGdes$XGI<@=_xrU^C{nb z*Orpi?1}WJw?qEM6eRl0m6wN}2jOdraC3f4)#osjhdG-@N;^Fay{$bUQ_pshO5g~q zRXJ4?bUG`YsEUQ(N$r8RoQlvkTY{{B$kkiSwQV{1Y09_^OEHr-jQOg<{%tv}iOvtCFqOUX*|Zu?ENGnv!Qy(cZv3dIR< z&nUgS+zf7O5UoJ8`pi}MOp7@Qn9s3GvWJkIV%RWK=1 zqGuPm(qszNd!AFZ9VQP@Ql$ z0-xNF)KK#+MUmBW^66x@_Vn6b62rBDWdcER2CW0QJt*Zx{0+C^isZnqS86@M|4jXM zR6?d!Ps7T$L3}ntXp-%1VHx+lDJ6O#^b3h#O5b#DgTe(o^E|$m`?vGsX$j&zIg53> zqY|&A2S-1z6fJChmSAF@)L$o;5rY@T^=7nPovP!DfXO%JLl!|IW# z&XJsw_eyQc3su^JNrn1WzB;7!a;^!T@ z)$DCaO4OqO3R(O_$LlVVDtZs$k9F9`EAYN%`OS0p#A7RDn1(tOSl%5+vq#(Aa=j;a zI9Ew9k&$OgJL$gZ<2rp?)YiCW5zYODz=)J0c~1xA#9sKNK^br9p}NOcxK|H zK*#rt7+tju7gGbJoElOE-El|RVdmQmpRPGrDxOFVQ@uM)5_5ODKf1Vh?0M_c)o%px zQsxI7^1y#})ZfdX_iCGAl2~W6?>#wHSzR^V^Ab2KtgW&pxLR<|E>f0Q9AFig5_Z3j zK%pQoaDud%vkA`1(ZvA=h3{8DYq%P_>{VEU1b`r*g0%zcl>d$Q`((-Qg~{A#Gz!X% zM4+GuUS3|H*q9d$)*^%Gm9Z{3tG(i79$vtWf=h$7Q7S4bU{>Bs%&L&`y zxd31o7>FJG!HobniBeccc{3{uOQ0Ygj)uT=_J5$@Ku)Cq6xdstNZ4E0nt_W)!^O<* z64+73y6;H@C;U;908J3*6tEw?FRZJYWvk7XX&~9t^|};M->c0$%xmWEjK+3iyLb 
zkj9TbIOb;`-@o@$z&$+R{U7syKFH%9P7p89kVrs*=Eczux;n_s4br-p|$W>sU}R2h#XqWZ?Qw1@y0ZwF(ptCaxDjrwbfn?DZ}J zkLC9s7KkF>d&W?}kM7+y0Tuz+gY0{G#CJ~$xM;+F&@zJp;b(8GIj|P@d!VaF?33E_ z{)jzl5Wu4&_NiGy_ukC{yjVez`_yc~$g+pa4lwigo;?(>?H+6|ZU7(O?6KnjMeY*= z!ws-3_J%k@k^5rdpvZkf&cJH=es@1)?Gtl>^6iWEJxuZK69gmI-fkCQ%@w-0Lj!tl zzytmr%T*}fJ~a<8=MT8@Xugqm4!q1vWZ{s|GK%Bfd3kS ze|{welac_C!c4GGIEWY~f%spC0WolY*e@*nlED1K*Ds6XA0Q~*er)^)2&(Ge0rUIU zUob^0Q!pg});uWk-j`6WK#<#Ga=%<31>XBHSqk`?2~2u8fDFaa*1^Tr${6Z~hi)82JWk2_eX54XR<`BT&ARvI7-2a23Q7B-knL~fV zz%&W`gWB)I;7EROA^ZkIqX1j~4g=nY{tm+c9`AP;KkuL8_`zrDU-*C@fY#&g|Az6xF+gSa?=a9A z{su!J!G#3={lo{D?@xJ=aLgZbfe`hFu8{no8~QgsKHfiU$;)?u{`k=Rf5^cn0D8-R z<3l0-lnuql`v+YVKj@tPJr~W(|0fJk!XM+1f2=EBG=}#N{?G#GKjvciK)?QPx)==T zeSd=;ARC~*-|frG&nNK5Tz&yyqxl;?AnE-Rh6F#;`8Qo)>hC-QaT+)U`}a6FFA}K3 z{2hh@87`^0>9eL1&6h=HN)+F`>J8(32a{wz-Gn4 m0d#MBeqX`f+yT1h-uAq?vkMmIvgb}fXM={YvdXB+LjDI_ct4x~ literal 0 HcmV?d00001 diff --git a/audit/dataset_statistics_plots/normalized_score_mean_by_eval.pdf b/audit/dataset_statistics_plots/normalized_score_mean_by_eval.pdf new file mode 100644 index 0000000000000000000000000000000000000000..14dea7df2f137c9e8488875c442b033217bc6d3c GIT binary patch literal 22251 zcmb_^1yoes7cU4X-Jt?9bTbUYP}1GqCEZ<;A}ApxB}z9UC?#DA2nZsf0@5i+S%4z> z?nV6mpEz%QthXMn>)hG*oU`NXd-m_dY({lySxy)iFE-=jNl0lUHVncIakX&378Zs; zwf)?zAy6rEFLNhXTL@I$+}7F?!UGi0gouh_Tf10+1qJ?9LDtp93j#kb0M%EsaIm)Y zg7E+PD&y-Vr{!htWewr|MNl{Q^0M}Dfgr%I*ibDyb1QonTL|CDQx8{5Eo(1`5x`bj z77)eS*9!twa0Wz>Jo%J5`BZ?I{7W3z-wJ?ogVK9g1JeBxAF65X>FVua3FrgL5Aav9 zwz4;uboB)|as&VP`S`iuFm6FOKLWzX&Buk{=Ho#CjG$70iacO1o(GuE(iD|5@^49_1p!vk$uC+M!8`RiG_f?F!}8W@C+wX6y75Q@0W>$w$PDv4r8oU1bjS_$kHhl>ddagc38m7UyxU zW<_8d#Wa6u5GfD+c+~TUWJjsjHm5`~MKty=G>dvNQfVs^%^3tR>NYfd+Z(&!)Ax)6 zS1Ffjr2IWsio4&=%_^wAQO`A{X6qng#jz@RSuA_J^|c-}%KYO~XJ~1z^ zQyeN{H*^FqFpE{Vs=asr*;~#zc8Oivfz?!8jbgo;T;gFj=Ewk-ztdA}MNtgC#R$lh_P85W8SK;6gJnMsbEznY2wtw=E56vhoHAKjq{m*m z!rj(}o;y!(@pwF}REPLW7VG(BQd;RkqaurmK@CLX(rD7kaB=Itt9QTcLQl1!rJ(|+ 
z72ky??ecFcCw+J*XT)}vHt_jP8S>{3Xo5S}9K1|b2AJ|!1p9lBwnV>#X9rP@&X+W% zDIZOmw?%$@-9Zz~c;lG}b!<2P+o2t`ogUpC`(~LM1wX8eJMDsN3*pQj$R$LRd|sN#wtThkKz?3<4MI;=LY;J1l0 zx(h$88tupKb@{(Y^c(3ooVf>~oP0;iQCm@S-pJka{rN`L+xirGxeYoBA8ifkYxVP+ zR@c3BV~ZAW+(iU4O|Bh&l|#&2lpik0l+fTGNcM?M(&OLW|K@cA-ZD+vM5>6vkLtY( zHEzT+739?J_qR+B{c^wFn=ef#-C|{-oiV>sEh}J1o~7<<|7xJu(uOUA|5wi1`MIWr z7Vm8V>D*}1*ee>Y8gX~yrVqDvr}l!&Va2%`)1T2|FFH2S)>iB+V2;aee^o)GUsEl! z-5=Fk@aHM*zm~1S^z5kd>8Et=trZV5`&cwboY5J*>zJK4yhU1PckzPxpLE@S8WkK| zQ8v#o7ae5p4iWVvS9W3Nklg&-#UajEU8tsES2)iXvbWN)hmhplrAbl|tgn@k3=00T zu~BSFL1BA4jPsVK1x_7T$?odbYDueK$MyQBd4eAh!8ODay<5)$## zZzgz~9E%4h0mG)%}O zBoJ(J`ZwC;*JjOo<6I~D$#V6FF^g~8`$I?#jFs6$Q@3*7-wmdi?OXPwN034}%A`RF z{~^`L+~8HEmmT$kr5gD8$)cLgE^}Sx176#jZWYtUUODx#_IaB_i}&B(oXltZBwL4pKe84d-MdyG)gZn=UGZ|vn=*fd zZOg+CajWi z;_})#w_bB<{GsTILZ({s4RQ4mkd3h*Q9xzktXYe0_(x+8roB%!TjX-BtYzth@%gRH z*AZdYS)@4?V+w{kf0i~E`l`4mRmcBQX45o@XU8me(9%g-E%FA&=Le9WFB%G?_nH<# zKK+`mt!sJrhh+RbG|ia_0c*P%c;`=?i&)#X3o7iNvccDp&I#;SdMEMDI61>K)39fj z`utGTIm@*7Li%rD3~FB+!|3~pD2%X0CD@xc`=@LRVK0XwZgj}5X-iss?<~s$4J~uW zUByZxBSyy!^o4;AOK25oV5p&&RDW-BfNZ&iq+2|^5am)b)h)v}_nT%<3_!e){xQ!I z-=HuL&?mF`3#iYW|1CUIAsX$WXrTq-^D|Y9o<_xpC@($Nj=AIxAu3s89Xh_RVPdFg zk#|VTjeG~=r-zl==KSgTvL%?4>)4|n<6#;JN{Sc;S>3NX*x@_z9N4>uOs%w52`uwK zrTR|p!DuMPg*8OOlJAokF8>7c@$qD}d$Uix&bLUr?)~xusja-hVy4!gazva-{9r8~ z$?x+ZD@$JN9zU8ZUVA&wSSjJ*HxF5I=s&GE3xLiewY8c-8f(fM$Li_A0AEkx&WB-N6ZvBtdMJ2u27 zq2c8IrWu=w6$*9!V{MryJX`7*n4A)UW_QQs}H(KYXO~#pD)f|%jve` zbvIl--AY*_dS4n(6PA4G3RbjMk=P_ZX=~gY-Do-z<@^zx1(97RKsYMP^3RVS!$tFxWvEA3<$~!JD$(xDNCmPAz4^^wbeoIQmoQAvw z6@%iT0;>cneN@8x$d)9#p5t!H(<}MV!dA|`V(u3Mbne&U-MSK%@8~tC_quWtZEA|) zsGG{YL1gDql-YI5)~ZzFn}ZVTUmsi`zBdiW zARxeL)<5hv>{5<1#oxRr=gLsc_|))ubrMzQT5!P&idWE-YaGw?55=nX#fMVqEbOje zEp}wvOdo!2kmD-gEewR1r?D0c z0RXK309uHcS9oX=m{Om$!jxtQ)CXiW@D+|mx$X4icS1Pj80?rV=mo#bd-u;IzVi#b zUo_(-`EIo>KLu;fX?~JlCsyXEn9@sHYZJTL2-VznMSQ_FlbjMv2QrC^3x>Ve&Cxs}14UhX)52#@{paAyM+*@c9HM!V&!cExSmfMB*YhCi>?a7;1#)Xjep@R>C5t 
zkFN1YXRSoHLFRhfbtb|$W;L>ETh{N29UQ7jgpHC220t~*wDxeMQtE%Fvgbd-QX1sk zS#3Bm;!)Z#%vy3ks9nwib6= zMT%?m%9r=tr3;~f(U&sKDe8OY*|6{Q$6V=5?+QtfS0qnZD?w9tDY=kdY2>bwdp%_p zm-KByrQF3kbL|6LU8>r8Gj{^~@ld5gNIy|OGc2zfmHP?Bpbxou7d?FH&UtE$`pmZ| zZ@KA4xrMNb?;g<4Y)2S(^KRVVv1Bt~ylRcJOg*!=!ciRKNRs@btRdj?)$DPxZ;T=f zgcD5MQfA}Np6}$(qmk28*zDXmJiI_09dj8;Zy`bTfOGzLZ!u~r#G%75o0iEApUayw ze}1Ili2wHJS*-$%A|a&9r$oDwh*FI1C0{b%4Qm;GUi8lhnc?=MUfxI=2DKnUve^}b z+;Fr3Xr%Y_4#&i@%_>$zMmWYR3Od@neRPxfw6EI5J#IrJBr$uDt#i8awhHc~L?SGW z36k#_KFhOguD9uh-oL1PPy$m--Ql?F`IJyLG1|%Mbt0+D1I}bB(s&uI`jEBK;wn}# z7Vei!9D-T}Bv#ygnHLqDT^47(IUmU5IL#=hQ|Bl2d8)~U$#W9=wBHhXX-klHyW>`w z+sGZUA(r3_OIZ@Dtv52P5uYXmwLTVFHtHu`+s(i2%>&M=PQBRzu>Ur<%Iiv^o7|WfgITN>QO`+M z-CZ_d*sGK5*SdU&7UjAm|B*htGUw@BPsNXW2@zP`U+75iD;*O&UX^PlR3F|5_Qtb% zdm(Bvh{UD-$}2X?_ecc$_bl_|wdLpiZ^7VjUTtpm3hr~nMK(p)OW`)ytl`FHH@G6t zVJ*6~6+k%PmQs*cgOgNutS#(N5)r(z7MCbfC9R1#;;G(mQI+AA@uI9?UOXNS{~RrO zz{%w)Q^9$8{#zuOM5QPO7_RgWKWvknw%v*4N4cpouMg@D7DKPRl&%sbq8*zuN-21C zzofdT+WBX`TJas{d%>fdihg2Aho4sreRbq|ceI5WjAWEd)jSS)u=>%c{L6wwuI%~O zN*VVVPGYS+uP-L6XM7lv>We_=&G4EDTh5xaOTO$9)yi}~ANJiaVwzJXH&toFzQUy_ zUt}cHE&TGH|GBKMnmrJBQs3oI!rV@T7w(nxhIQR#<9YWE)`Pv8beP&Cj+w_{6Czk2 zVVqad_Q_ni;#<6glD+BDuHBfg^esl6Egiv>nqlVovby&E$3Z+@#8EvQgC48>8JS5E z=2fgyZa;~`MD!+|JKD-0-M4(~$#ly$hvm8hti{RwNwjRTU?2Zz^5lgBir25Lo{qEG zf0D7p?O5~<<@^?x*qJ$$jz{C~XzpBe^FzPDv~a_Z&kF%!!P%c_yqJPokT?vuDsrlO z_|GuR!})nML5D#hwqcSz?J$?p(&;LdZ~A-GE35P+=o1<|ARVW3vJWQAx)pBSMvpWUdpxW$ zkz(H)NE}>QQ)&}Fyy#y_oeG}ba409}Rb*mDB0Cbq!_W8M=0n6Z;XMb&#egC+bjusx zM0Jzfr=P{P70gjb_)cHt^Sauxh;m6abn?1%B*vm~0Z+VkRM{R4w5HjeDK-BxMOe=g z22_|PHjW|)iA+e43=hA+8HCbka+ZeSuFa4dLvC)@hzwvv@QyvhdnAP|b(>>|q2))L z{ZVIIZ8 zc$DODMc~>)7nK#%(jvl(!9u_#O=w)=H5-Ktj5kxA-o5E90=jbZA@ zYBA#`y*&DmEtG1}g3d=)&uGRzou+Tcina8Mt7iC2Fj=E%4C;%5b>r(xfYd?fst!tfqYprEm8Rhg>aH;(^t`QU)FxSe`Q%qNSM--VNe zK)#>8e9kr@_4YSXzj_>=_gGF6RChAZ-7;gNsX#N0l2-Z=Xnu`-NxRdlvIAIF@-@+l z ziT6ncV_Ns?56h9CQf*~TFxAfM)-nW^+WXyZbC&h*4}P+jX9sf)DqH+*6gU!m0Ovi! 
zID$x&@OW{Io;w4d>)b@i{o%xRuVd+G6xY}DoDw%K=Vuebop$PwjVR{}*Zd2yzIT1T zxANr$dBjic;#qI!Aj4@6^-1;w;b;7G7_an+67}7nlO_5eNUn{(fu~l+r&U#kc3+ND zX7RqKNTz;6)25?uJXgm@UqjetUU&9MHpaAcbv>KTqPRWhEfLL^hRv{Xr?;gP3vtmR z)U0HD1|PMECIw1|HTmBV+$*NjUKhM!6O`SSekG%Yn#MA7Yz}L*8dc8XVZeBVhpue- zqaDZ%6S4b_6lCrJ9$$IVh{)fPGI}``U35inK$pD}H}JCG*b_Z-P}6D`>+ZG$Zd;{B zU`7|be71=!RGdO@@>ag(Gc{#Cxcqc6=>#QL+wCEO;B(8K-hotJHq9H@j47WqXx6po z+E&{Lwy2h>d#wiFqEnRZ3Wt5W>qbwo_N+*B;)ujQY2*iu6h;UEl3qdrmUzwr<3tjg z+|dANvRQhL@#aqA_R9qgw@+f*T&nrv?N=zW^ z#;wba7`?vl=Vqot9#SWdh0jO z`?0TnBkI$*E61M7B;$FrTjRN@B(C3M5%eDiCsr05=Iz3`(9;Vz*%VhgH$L%S&M>Ai z4NORb$NwqTT48twdu+sk61D+2K+hRE!tQ(GvnX7 zIMN%sD{kWG{hiijz%f~4;hk+8PTZXv*J5>@QbQ(;oN`pCvV|PUWis;^Q(pxHaX$%i zd}i;XzrOmZ@&lS>O26w|1C!)Vb1)X2vI%(TqL}@(-etElLp&G3Iu*aVn1Qz|cmuy-E7Ptu0wf zpHRV4i-aKsRbKFY!LQkfXgg^v5^0biOW0W^$|~^^F48besUL!2P<7Ue=X(5Z=y=0g zj%t?%l!wWn=rOtS%J14DF7ZTGy?7W)=DjfB%S^-nb20iO$qGI@-4iqaYxEDzT_5v2 zem;*{_ok9Wy0~E3_gmcP?49(Fa(wH_^7dM;92e1Q%;90$roAa+K0Q~fGaWIr@Sp6{ z{M@2|F-$97ww22$jVK)pR8Shyu3=gfr(v6_vL83*;nCV)TT~%5tk+w2`QHCZCi-K# zh7fhY_wDPQ-8)yVJ)3ivdVs_hNU$!9`wU%!h^@ry5XVIeP-!E|8n{pTdI91#w>~3# zLm&)g9{mbEFonfMaP;VeYqa}RxNRqKa%;>Jjl?%&< zTG0W-4At~ca;9Lkz4@=z6MbI|HiXUOwBECPxt{)rqj=Ot){??idd5L&>4l>mc>%Fu z#_DaZkTj2Z&FiWUCOKZ4&!x*qy-lb&h&5KT_&y((voLOOc~D+8MdXg!k!3RD8vJft z>~tK2*)qv4grZKKgzQmEy1T1adp==9q=)}b+9&vj-DJza?qvd(BB3`(^hJWCd3n#U z#DGXtNTdcwPH$|kK_ZJmudL5*zC*1?BhW|R^ROAvt(f{6obi4ggp%x?21s?6Lt>|Qi&eVyqM)Ic6XpIIY9qCR{rF2>N`<3Pp#mA*M%8my27GK zt>l*OEi9B1c7!@U<(=;we>hiiHFR6dsK(}_FO%M1zSU<`Fqb5E;pXD|>H^N6en0c* zzkS>v-yFYmL?4mv86rEaS}G9LXI@zMQC*8AkJCZC$~Bxr61BCelC$}ydG1u^YvJV} z#)Rq_f`{kwqlj+HaVjh==;vnJj#Ci_!)a(^Lejo|dsm$|tQA+^qs03zTH59H)+T0$ zRRP9wV<1VeW1`FyaX213+X(GyyJ-Z!#qHiFp-w)CX7Pc~--VfBOu8oiSe9)%U(QQi zjy30vdY*_u5HTIk^8Kkp_Oe^i@^0nN1{@wegBKoTHQ_sam*pA+F?Ymhi!Tc;LUf2=_uIB2}%K@Y+ywg8y5$~HP z*}_E}rb!yaZ^lOoQVyffIPd1};%&?vp>(Z1FD+@y)M&IjF5mmf?ia5b2x{P_$cV&&6dR|~_slwU1tCuv4O zEo+PdJCo_FtsZ?yNkZle!jOgeMemn*Gv*%abH$-g=PvqZ(=ChZBshnW;94UbFYige 
zzVTL(Pv~AkPHzi~E$(H^rEvV56y@d=huFkh1CQL>d|T>2Z{uL(@lKkN=5wh>idajn ztvIav;LeUMZ0*2@wT<*=9ViF&S23nKZa#cpl2tqw;V|62ik;4nasksHqk_s{1C1J2 zkR$}f=8B(56N>j*W9k4jB(d60Z5IOZ)kD8AQ?*>oPgr z8_27qHyCplB4Z+Iyo{UTAkCmR+}|sC8=Q{mpA)p1Ar{HN`jG+Ay>ul{Zx( zD;R+i_G~wbu5YN1*Q_30`w>(Abr<7il7ZOY2INIDmpRM8`+G44g*aUN&AUNO+Rk{K zmprg)^_L$)u|K$7%Bv7h6f|~|Jx}o-rNToB4Xv0C{S`{ydz8n#cqkh~K?d0Q z&ajqYd1a2nKd29fn-QUT<*|^KBBtFO|D|{2JxhMYtZ>X|02P9)jx2jL}29^r?db zS1yoe(lnO5kmbB3^*&mR>E&marSG4+?5S(-moT)!Q`j|;5`E2xH>#83@`aY5{N{^m z2Lvt-*Q}736A3!!L7Zils@~&_4#VAR!sa6se|XoQNq@0&<%m*o+Dw?xT&dt?A65u7 z@DUv%foo@ksA#a^%T-Oh-m)K&Pjf6H86QNsp$)+I1$Jb3?`#-ln^#fVyTvigzkIzX z{6ZtI(Qtw%+9sjn`KbCz+()0-*kr7v;fXOJ6G08AH0>BQQO1Y;4xLPnkvL%+y>!G| zmRtO&3l=7)s2@j6Bx;E}LeIs%1 zijX_c(AQ@ckDeW(bd8&2A+aSAn9B=0gDpigjlh}CUbdE7h~ivkhHYtySisTDc%M_@ z>vr^WwX{#I6p8~MDV8X#CAhX!8wI-hUAr6!CDXZI!N|-=_raT%eiieEdso4QvX%A3 zDNP?~&cdh_L#|k%a3Lp)Ov`Kc1MYU;Xc>Hs?W)iINUcY+ zL461zX$U#ypr6al+m~MG_KHfU?_Ay*{x%1n+#mTMl9iV&1b5>m&iwFZo{p1$+@2tN3KuPCahsZ@mmA>{|D z;5qdhy?$O)6#hp~R`iyCl4ssNPh4S0$UF1yPE}Y_oX+DJN@4W!q2P<-(%52}H{^RI zaXf^C5_0SD$mFUd#xg~6%UB*gDS)ZuI_s6FG<@OXnBRZgqL$GjpjNmtpX6Sf%weVw z41G|JGiK?3<3;2|Hlig}KVt;rgXoYq_on84z+Ihv%jJ(x4j*J#@(Xt_plLrAMzR#o zi)50(bC#7X84U%O7u>i_du03KsE#!U!1*;1<)dKERP>LuM3zbno!SVdH(#y8tGd!+ zT`7etTfSc9Aojt|HWxH6d}zL2?JRI-)GdzVmRvq_M*IgvyCmeB8N!14{iLv3Yv;Wy zi(g9plU;86NKnt+c~4bNs3-VrN1wJo?`1Pv=gO8)ZoYum^-#k`*f#H!BcJPQH^Jht z$S3#qO?Tjli$}#%-b&*ahGn}LzS+w=r^Xs%Jj^KJ7zwiywHJ!cEP^Dm%KRX66@~>k zaMBvBKTOnMjW#e*;J%u#vV7RyT1jD$vZ$7%evAHi1PuCFKRM3*Hc-uMlrJB|-AIGFA&4h<@ z>Ba9jYIp%}H)d|J1eC5rhik95#Xr8-G1z==Qj}n%V9PqLDAL~4uXFk90cXcvYm~8A zCQQ)2u5*407e}Q3-RmWKp90yrWK$v62Td5EBM0d5Ay#3oS?{m#A?odP-%eJjDSn1) zzTSs$I2bzjPYT;8eZ!^au$K-`>e|`BVrQYmzoMsS=kP_F&F-=6&_Wf*wmcbMC3W!; z*;`YI(8bs5uWx7GXW7F~l~UWTvcDDu*v^yb6{?R{0oL9Xc4fYGrS*-9~M1ODV?z9+L zpwytV-KxDsf+;ZZ@~UB^#x-xH{{<7IL+W-=4gAYJ=G zp+XuVgrvKWOb%dY*|$?s|GnWa9D<1d;Z-^uk#=;gTfK0IikQ*}H7Qa>xd~dHm#pGy zpmoGs?Z%09)Ge&~bb+*4AxP_D($`CYoSV(}h_Q9P`3TI_GyV)AmdOgeRe5EDRQnN@ 
zxu45BxGHU5O~y}x2|xYIV-ghjZeH&`+2p<*Rrf<*NoX5=bHFm46s;Zql1laSLNCJ| z(#H&Ys^>1);~l+gO5e>HX+~XB-n=%tTQ&go4;5oXqB#;gk2nJag!A#~0y}hkn%q3R zI^2BR=PWG4NTs-n<+&nJiF@Zl6(sc`JyZ`$eytuID&S-M%^TQ*{A+8D{|s|-I1jj4 z$_+;&4liQ*y*nu0nwtAa@^9{36{uz(io#RWWiV4vr7KbO?JuKzW+LZCA`TKbEO>_X z8LQU43Bo+vM(!_zp#lc` zYj^q7M(%_6xc8B8MuNw9&h&z4DtG|fQcY`QfuZ$UpA{Y^N$nmvO0cbLncX6nC#*0j zsb)P_7b8`c-s<~$Q|;Df?x@9P@I3=BGUXIyV@aO}R+fRxA}K6O((-h2>UFrHDdVlo z57zKSK2D-`SxI5sC%?4Rchf)Z5!(52A03NPBhL8}nWyW?p$!ffR@_!>=?viF+tY7{ z!}nMYr8*~@B#U0sJwc(P$oP?UQ&B^_%|DyAAr`Ie4z+Ll75;CHJNlnI)Z*VX9zh7| z_?4twWE2hzjvADzn+eQvABmfWGxN&K3U#9#1IK`Btl|cWC%#$4Cjqh)M1I> z9`ylsmKmY%oJfyV*~^1xv5+L?s2X&hs`rufMyfPf_t9%7y;Tlo2o1j(uYT>wpfQlN zD81;>l@su#c6RrZ@%jUtk|6QU*8-Ma6@LA-w(CgP!V1qv854 zm3Bfov;-=-(D5;Ro`X68kuf%Y-#{qc=&ED}rMiS`%EIiNh4rmEYvG5MLOnJ5jsqWa z(bx;I%Oze-R~y)-iTyMVUi*5ENne^1i3O11BKVosBf%X$+{Qh!U2$6ute&p{Z^l*% zkDifdY@y$|jiOhh@~Rqooqa}EaAV@ak+kz3Sq~e%Ty?`k^TMePNB0>{;W#E;nr1l* z>TrcIC#i;9MB5St^d)m8fh<%NKRs!0BE#QU+QY$rVm`zQrM+MQwxa#%3yJYkd%%mp z5&<*irAEJ=oR_(I{f@3p0sFWF%_^ONp0v>3JIYI3@EL1I11o(o8_seo;>1M~k2JlO zHMa~j%#`;67pc?Ytv?c7w4Sr$zNhAjORg-6{y?8epyvZy5#hM{W4ZjChL63omk16= zclS}-BJ_%oSPcmlg2B#mKtNRxA58*x74VzciG?3dq|UIP!IYm_>~Y&~M#^*F4k){q zTkw_GpJhR+(ZY=;MwkO{4!1_5>oLa{7h?v4FAkB6!UxIzJDZotN0L(X-{v!u9aM7^ zjQdtoT|%{#Eelo1f>9jUZp@UZ@V7kkctbacM0g|+iAUfp<1Mft2Mj6FKSVL0gyQ>( zLTXVt;AWO6o61m_iof)0-0oCHsglK_00jqy-FRq==wNBnJI(>6g0S&sEiB`l*GS$u=!^2q2KT> z4MDQa2oeJ$12t!f?J{bD_<(^Y4zO=PZhF7^tU%-?fAmb(L*0v5CAENs?txo-n`ej~ zMBr;)TOPrWOUdE4cqI&t(IYXVJV}*ZVHn$HMxEq@?zhG5Ef{$l?B;Jaj9d#Z%|{bj zqk8r(paNskc4NF8*S<~DumxKoQypqjZK7RQ&d3m5+54ek*;AfAEoS5jQ`1zL*Wtv} zf=C(k?{h3s43tB8_tc(WPxT3lvnR8i)iVHR3AM9j=y1{x5Y=^ zG#9`h$I2o}3QOm=y-}^V*D$cu;}1j{y=~*M!yp@Ayz1U( zjU|F}L}R)!^vN})3KsEq5Aq=aNW5p5>-=6TLs@(HW8t5V@d_6wP8Y@XZt0-Ds=nqO21CZ>eMqL;$GiW zytViN#$fWnexq58(O${tX`~oM?gyc%j}|0}CFinuU*@;z-zM}G3wv3ryyWeoy)Wwk z+a@VC<@hdy-tj1!H!kt}nwi2j9Hw5a!c$~nmC?LSE1H*BfD_3`k|_sUXQ=tv+VttW zOF5Kjjt3QqW=PNzFL3(&KQ-e_Qt+Ucz?1>^fN~640K%eQM+Uiz}!6>g` 
zPWp{4lTT76`pmB#s(%&SSaug%T6=%gP2UIq$+_9)dgW^O^xbUp0Da$!cosgsV(wQJ z%APnHnA>WqI*1m#TI_c!X0jOBH=vA1u)p2M&&2$$dC4AzfAKAy*F1$ZQHhMVSNH7U zUIVGm*fjZ;ewch^8pP(}lO!f=IqSsB)?URA@=O@3jEa=6zFLj+F_&4xJp2W|#&o2x$vd>8_ExNdwN$OM#<|REKHNoy{@7#YcKrzDT zsen7c9wBq1dhXr+-cC`Sc-G+0&4{fIdjTF^iHlwrq* z_e-Aqd)6**1hoogf$eBCvV=q7Hcwj}DNy-Bw{_@&qO?+xr=3K_} z`<~|m>N0xy2A@Cm8GMUt#3tZu^y3OCS>Q^^4^OW#?#di2QGtzzFDr_(nqHw3-a(%` zQjXVzMYiY{S-3{Ptumx1h|{F&CAx*MU-f^JZ4a-q9>WpFT}s30scfsg?pnmmdKDj? zsOKqL+@R7GeW{GkNuq0%MA?Wm8c>BD@Ib5!I zrF==sV5|1M&$An-(I2D8kaQPPC=!ntPB#*{m!4*p0(?R!bA!jI;NA zNW50WAfVl#FZD&^!k}5H{-evDh4y-IX(=^tyuLUY=|%fnti(au7s#E4wJNheT;4fRKmp49n&SjaSZ{z%(rYJI?H2!_QP>EB;e3XJL*w72D9= z0(xRf|DRFLEDbw}!*lNYsuf>3CeR*hg+ekXgF#sfR?yv~)m z9y3`e^T&{ewkUm((twz$D}sCl_2e&6BVO3F7tv(lSJ?WqkSVu zH)Oe_0HcLYz+F8(P)_ygK2wIQpvd)T+V;M(%6t`%LOU_bH`JBn{Tm^*THOqYv;o7Z z>Z}?Mu0n5FJ3Ytvf{!X9xj`-2M*%lR26hk8HdM@!EZPFwbARIu!Luw-tN$KTfIUx? zkmr7eBH$sH@2P#lk8rH4MEUA3Fj>qgp|evMV~g>OI30eT#W(I~r^QP0GxY9h`KHsr*>g5#Mp1NcQcu*%lU(2rT3Be7ZON$mOXU%@pr1(MV}L%sm2DyNCEWVL+ZG2jd!aWLJGL+{Se?>d^{t95W>z z72gEcMEfm4jr~*n7^>4_g^!9ES|u;;j%D3t+k&A4RUdR z%o>1110+z82l!>)e*z{%{Hs_1UrT*1lf=77_@R7NfpDF6;s{ZYnehVFtyU1x)L?Fi zr?!SgDIbv6R}rPW2bGEnZhm!6=lS=M_4BTJ!q{}Jw$2LiV_HAr1igpun>P$kW-zQX zI$kf`mn23Mg%QZ2X1axVNvay=z>4*m=bBJ)MJUI=Q8ZeE}@ zy7DBq&B4zJcWS3NV^5DbDp_>rL;SnhRH({+1sdq}mRV4U!qxPjWkWTV#LjX*j9ByuBxM?668i~p1xq48@d8(<9#QQTMP z-z<6P;(z3<@`m{}CUFH!)kcCa9I8{mQSB1A)FL|t$(xi9__7v{U0))oM~;8hyif2>ciO7H=R|iJEoN4@ z+dMVDf%;D`D`VxdHe3aRIXBrU+8C-&H(c$ME@j86r7UA5r!0>p)z;2F?R~s`h?XX0 zgJca0_)*FK!o;tfD#wXZppxdE*1zuL)MO3R4cH|-?9H82v_Y$TczQ|MnR@^=p~~hb z_dEy$HWaKNZS864VejVU>H&d)KuHq<)$+FRI!@ODG5|u5s<|_e^yJTWe^r+JmF0w! 
zj}HOigacVkc({Snh`@F6*EAD^pOt7J#VCq1E79i~j zh%12XDKaiV785Xi%m32G{3c;TrR{BOtbyb!V73(_2poV~o^IxrKq44GH#aaVi<7mD z*U7Diy{(-W1e{jd`MKFyy8vWs4|`WD2zZpp-`c|!0(Et<#)f)bcZGm6dK(~%3-G_a z51_0kATt73PyyP7tUw1PUkti0uY}0euApr1gYA0d>70P#*~NI*=6x z7#;io<$xbF16g38diGXcc7SEU7xLc;WsW=g_wSYd=SBZ=h2y?M*6)D6PUg0rfa%ZT z0TP~;V4@sBz#pJs>);no9&kP^W$q^b%eee7K5VG|$rl88{z3sr5@c^F;bQA#4G0d^ z^0Ic;1#eW$eUCK)Yy!Lo|4SQmtRJY35riMus`&qKVgJ9j0scw}yZ|r)K>!W}T-X8v zf)G9c-@yc75I7wE>jG#90p4F_0ow0+Cv|?;J^hLO`}sfRfi}Q)jxT`Uv1LGJ;KdL8 zkQ2O)zXO4T9}o=u4phcQ0PiVC2OAUs1UN1WJU^~?QtqF6Cv6A<>q5sguzjGt6Ctp{ zPhQ|{0NVl;{4FHNCQ2*Ft z0Q$c_k1Y+-z~}$EP8eZ>0{m9^^e6Z#9QPm~^uM3|=KK5e&o@b9vqVUY6p-O0e` z|MEHfd$KA4IZp2YjK`7?7&yxS?xcX%^4FaV5WIff$wGi}^!TY9-~_+!TNguqXD{0daZM?eAnfM|EZ2MCfU ze146)yeInDKmhMO=5-wUj!zE(6_2AK?+Hg+2=9qjc3|{8E`1VIf!;V4#UAL`Uk@B1 zh!d7hU|c;e>Yk_ugkdn+9=GEL0a8{R*ZE&O2jcJlD8It`@wz-9 z=1E9DUd0B!dH}BRs~t}WaLV+!78uKcqh7!6kI@3(i3DC?JU^y`7=Z6YS#JpNW&l*| ze+Oy46A^sCuzy_TI^=lv1u*r404EBLY5u?)adHRZ3;thR{}-vi@4w@&EDUmt=KdX+ zLAicEmw^2n0sWKm4k{%96oOisLttPCgi7%IUyA`?;H22Uf+eW%35);91#12m5j4VI zL{Poo68=ICzf$=@mF%rRNCv$8pe2s8+j#@g{#eM9EG$4)ga1{Q0y5Nrppykah;B}< zUQYHF5Fa=fjGIf417hdp<>n~_g*yN9jLX%-mK7TcKt(HWOCaOjzpJ@f*+4AJEgbP<%Gt=%?o<@DH;#xv!`jmNsrSsI4~lfrtyNAX-?JU1&512 zXuucHdr#5e;LH3cjfWo`LjI(2^KhS*kDG@d#D#y9g9(7)=rj!ujuC&B;n zeB7sL01xn;|Dzl?k03Y{{Yitv!FT5rjUOD!PSFH_b9tv|yxid6_h&g?-oN?s0`Blv zIeu^;KUG%%9FhK{A%IT*iv|PM>rR(L{M{c20WiG$*&ZLy-!|Yw05gMA<@jM>Dx*K^ z^7DWr>M0rm3|yyZe89xxG)?et-w@yh$Miqz3IH8^nkLBow|s&;V37K=t{?&&9#7E( z|MCqOH~cU8VBCCv`8|+H6O1K)v#oVULG*^ohk$=c&MDXo^hzcn!415#fqL; literal 0 HcmV?d00001 diff --git a/audit/dataset_statistics_plots/normalized_score_variability.pdf b/audit/dataset_statistics_plots/normalized_score_variability.pdf new file mode 100644 index 0000000000000000000000000000000000000000..9af42030016afaea7577f2332ebc577db172ddbb GIT binary patch literal 34031 zcmZVlbyOSS)(4ChcXy|_1a~b`pv5T`T!RLe;>9UmTHK|$B{-BKK??y26qn*o2@Zvp zo1SyNd*Aill|N=pGRf@O>)HF+KMA|;OI1E0eoe@dE?gmIL1qtI7hG9cW&wjh 
zPY|3|*u#-oK-b0*jJX# zWtRBwsY-yan!c}%FNj(6zi;T;`1*pp-I>Myo#G1UJK5MfyE`(A|L;?84?BI3FS8l) zwl7tYp@0H>nFU_EAwy95-%qqpuUQ(EskN>I_CcH?x2$7@3ThAUh9x(7*6~e7!+7 zZny#Y6QLSYZX;f6j}H{VTttu1pQ&)RB@SjuMSGHOeTeX$ZX+&40&)51tFX}dn)S-y zxy>{;lXPMVudI}mk_IZ2~_=)=Wc-AcMS}xVd4E8Rp?8!tm;H(-I|KjZF`sPN!qv#JcdRXF% zyT{Y|-CIHBfhXW`aB9XA>eBih7UN&l%{Sx0OLrVw)(_C1=8a*cLyzm;UoYs-{`~&6 zef^@*%Btv(WLjguAN;4MTHYSxQMz@kyDRB?G|k?x;0;6Vh#FznOHaw&{!^;$d4dm z4$A!_m-IOD-tzBbDHY^nPza^P29v@SDd({6o!v{1cVW}66K6gtj?)-_pH~FEJ`=lr zp?Y<`PxWV)^zo30zyC>U+N}1hI^(J1U|PUO`fVR@(s)pNR!uVnj(GX^ctqe`=%3#! zXY!%<6AMqb+r|qXFG3#HMtZ|lC5EyQh8`mF&IX95E7F;t3o|r!p#mcC^;!DMpENq> z0^FgnTq!(`hpQR9znmIHN<$cB z3DM!sAZ;YN3;w9poL;Mza-w_gEb?`7oj)M(F7JCH+H2pJ5FAu2#z(4Gh<5S0v~|Z{ z#JbesthC;Y&a`NG3n)!e^qMi!GTtTHN_m^EP+(-qH)M8P=gmQ(%Yu@T*F&CBmA-jx&QUq-g*iT*yPLv9CY1w+Ax zlfso*x_WbQTWY~dCZc>Hl3%bFY*o^90e`(iPNrwKf*YH~r6qSbMkf1b1z#xFhT`T6 zX)1Ci6AJdGnmSb_39Ui0>@@B0eK?YBylqjhh#`|`OlK$$a=z~aW2I7x$$PV)OnRli zrXN!_Cnh#0`WI8xd9v-)se}50NU9ZV&@Sr@!Skxh*~8Jt*c*H0TN#lwco=#g%Mzo! 
zuae&e)r)mC3PtYsRcWec_hT6$go-*gW+>fubqjeZLrw_4O5T%GfVe_LIipp^VX@8e z2mmGd{otYi=CIw5myas-bLqHkm;o)#n2%Zn+;Lo<(G4Q`i~%Xoid$Xf_|-GItxuw~ zrKl$uBzEmyT8VCbBJS805$$i`oRaZ(w8L&+Z%}+$$hpyY>q_r-aSE1UN1ltkM0AQu zGcItltxDjvv` zNmpt~t?@&xjl+wcm;$Wn@%QAs#AD z_l`n?ao$|hJ4lp+zOKUa2vJzftgR1Em8GM1ET&F*pHHo8`-P7+TG7zPA^asinl0s{ z%{cJ+n|(y;qKI3jgJcj;kzwpR`HO<&=#9l1HK>~gw(<>iwb%%zn3(I>9uRx>hihCO zx?W01#(Z|O5~<#>;-SSf3G69%ZNWk21AmHO`$8-NAPT|ZPDbOj^N8VfvpkCxY1>2> zsZcr#gNt`+vAXwp=R*MB6KR3NZWcpFnb2|T=YzN?ICL*|D}Hib;kl;7`vpi6za*iY zvw!!_x^A=VjfGN+j}`2fMMgx#SDX19c(o*)WhT-CYt52nz7M@p;#pe?E+ICmC-oS; z8~z`0QGA__Mg3|Rib7RC8TAMXs<)<7lGCnuUqnoEq%*%aPL(wA-NCW0T!I|FBptoO zkfj=+^(0XbLCu2EqubwwCX8VekMox%Y6wk~^q2%OEOi37omeY?@V zh-O6zXau}b{nnHqhBr~ek&#o}hv`El&~lKn+l+yzEPb0>-9m*uye_YIERka9ESfMX}k8RLBe+Siz zN@jeZ$28aP-g-+?XuKH11@i}di4`t=MP2e8D>LeQ$D?7&EzWEt=2aZGFqV9*?;@=f zQ}Gl2>~<015VOwWTwWTntdFX7pLvSl-*_iq)e%f%-Kvq0P!MgcXLth3y(B8?F7Zpr ziX%xba(NP;aH2XyWK_XzKaNbguwEU?oywG2&L6qklo-9vSmRhjUJ+SR90s1>o_F(w zWskZoO(jNp6{!S<3az)b)0;;5-(fqSU!NhMwnw)K`n0j$dig%>*6(i}xG1}GR&o!4 zic%O{@|4}d(X2;UmDDxcVF91t(74NAPrMqjgoY6u{@MTUQ%Np zs;_ZM7?{Wy(Tp~O`KtonSR4#DqTWRRwev@u79<4Ml8LH1>;(9DKJNwJZg! 
z+u1aL`RE7UMe_P2OzB7^yd$Nk@f1b;AKIU5W9U&}K17BsK6w2#q_ed#uoJHY{V)G9)Z6bp(6ePB*?HOzM~rD zIEo*DIAW(#98a=nXt^xq6$`}Csn&2n)6Iy@I`0l6GDe)a)yCmrMkz9-0oSA@ZG1ay z*;la99O_)5&*J29f1!JhGK3A((Tm}oT2U_HOfVfNgd-8NGtUA4P~J?0{z4OeCL9Tu|Yc=7b2!#5xl7r6(~%Pz2qdS8&WQ>~N779be6EY?daL9lIrk0Q{%hLzO%OGhU#zUAz$O>JEa_^MTRL=KnxGcJ zrrxe4=D%~~`$8e#wY*GI z%VwEiN^N9jwXD-C<+#7TZdHCI(*sdjx?CvGwaDd+zGvE}=WR8xJ*YV)t3WY^cpe<7 zSa`p&$yd55NWbu{LBlLjTS>)`DS_#fd$-JaSNXKmJdr{L=5}_LniM(~7q~zi^J8>H z%0cQSzc^wjU_i-nvk!5cqc!r3Xj{bpq(QT7i)XQ4ykBTDXr*L+=OYeNlO=U4!8$hJ zCB!;T^={-)ex8klcm=RJ1awCwlDzzk7W}>3AW&A%?H2=Kwge60=gn=NN%ZsQ_2s+o zF^I>S1K%2WDv&<=iIZ0KKq)M)`7Vj51ec;eI!Qbor8s*p`(lhYGlC>|!-Jt3-GBf) zq=nre)shHp5;M-Eb%9cY@4=QFOhjEqFNC2yH~KkT2sQPRheIVFC9klL;*wMR4k(*6 z;a>e7PA|)09rjFix{%nC6@k5Ib3P^6v6Y-6^h@^_1wN62 zk`0%D-!F9afIf|Zx`e{yer(j)@hyLw+!62283BWPOn5GdHmsTgM4v7SVQ!aPnOy*m zZv+XVC~-RmHTi|561jt7l*hcZQl%@%zeYzUNR!KVk-0?@ROUL zpKBAF*_umGvtH8Pe&X9fJGO96Xq+g%#82B_L(!{H30frUeATFrK=}#o4CS?imoH9W ztlmg=sie8hgTqAUyK#mq)B?XtA@wkJU!Z5vjUOIc!uQKr4cqh9r246YmI$Gg2WK_4}E>k1+8& z+GX5pwF4j?UpyadQ|B05@+Qf2T-|-!WKNne2ly9v&B=1d_U9oZd*^%>R79X_;#6cV*$|5|@@H^A%cMq;WERiK) zLUB+))8YWnR3HOWrg+RXi}7wQSLvletoniOblgDu4gGGt^T_`huJBWu53*m#9V*+P zByrnLx2j+rTei#+^#=|c65o0?5z+l@MeOi;?^fEylv8jVjT>SmWz*1gZZ?0g2uY%# zYA`mVBM$3WD#0Cax&>7~yxHUQ0++DQ7qg3*8eg z3V23H#dg7S-_M^(fAT^sxRczFk1Nc#!oEEy1FJR06Oc)^IXL>0iD+M!y63rr(4{A> zs>M`y^KP}E?CT`O!eR#cXt#b_Bh>FpBF0LktjX3`PLKgE+4|29F)yO^giW1v#PN~Ha^_`BTD8(F4;+Of6 z^N?N?d6IAMUP1L{!r#45P7^WjZAem84Jl0$BFy5JZ;oKV;CJRKbUt`Cd3A-#Q*(su z+B)1!DtL4*6uTy1u8a7uR>>(M{vCd{fv(~So8d;|sJjBzWqd-slU4_;FUOtAJf@Q8 z!%E7N8s$DWLWp|@DA5o z{uMvUsgb_nusqFx!qs_Y;9FL+o^L92#|yhH&}opcbY{}GD^~y(vqT{mC!@=v55aC^ z`z!xW5n6}lA6AexCzPm zFtb`XnPESasHZX#!PFwHgLkeX=9^h~xc`)Vr;{;6bR@U z5Qc|1$0_~PtcFUJN>VQ18r>(x2`kThNJ7}yeIFU4w!qJ3Wai8$98IP?2)1wS5Kdars=EEU%Cr9y zu_b{Cy)Tfl+cTR|8oU&c9yt*$#aS8`^)Y-`+9vSU>Qang?*!p1)E=m2UX;_fI_yvK zaXb540q-Y6d!J&it&N&CWn;sT4g{u@Z0&tYh|8gInQdy0YBMur>iATce 
z0MYdE8)Jk)>^sYkuhrWVR-)o{KxVMIU^S3zD>-*R&lXn5r>U9$CICBSp%Z!xR4`$U