diff --git a/.gitignore b/.gitignore index d493519d7..8e12943bc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ # Local data (generated by running adapters) # data/ +audit/ +plan/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/every_eval_ever/helpers/dataset_statistics.py b/every_eval_ever/helpers/dataset_statistics.py new file mode 100644 index 000000000..349d041fc --- /dev/null +++ b/every_eval_ever/helpers/dataset_statistics.py @@ -0,0 +1,1221 @@ +"""Descriptive and uncertainty-aware summaries for Every Eval Ever.""" + +from __future__ import annotations + +import argparse +import json +import math +import random +import re +import statistics +import sys +from collections import Counter, defaultdict +from pathlib import Path +from typing import Any, Callable, Iterable + +SEP = '=' * 72 +SUB = '-' * 72 + +REPO_ID = 'evaleval/EEE_datastore' +FOLDER_PATH = 'viewer_parquets' +HUGGING_FACE_DATASTORE = f'datasets/{REPO_ID}/{FOLDER_PATH}/**/*.parquet' + +CONTINUOUS_SCORE_TYPE = 'continuous' +STABILIZATION_WEIGHT = 5.0 +BOOTSTRAP_ITERATIONS = 400 +RANDOM_SEED = 20260429 +SCORE_GROUP_KEYS = ( + 'benchmark', + 'evaluation_name', + 'metric_id', + 'metric_name', + 'metric_kind', + 'metric_unit', +) +METADATA_FIELD_GROUP_ORDER = ( + 'eval metadata', + 'benchmark metadata', + 'model metadata', +) +REQUIRED_METADATA_FIELDS = ('inference_engine',) +METADATA_FIELD_CANDIDATES = ( + { + 'key': 'generation_config_present', + 'label': 'generation config', + 'group': 'eval metadata', + }, + { + 'key': 'generation_temperature', + 'label': 'temperature', + 'group': 'eval metadata', + }, + { + 'key': 'generation_max_tokens', + 'label': 'max tokens', + 'group': 'eval metadata', + }, + { + 'key': 'generation_agentic_config_present', + 'label': 'agentic config', + 'group': 'eval metadata', + }, + { + 'key': 'inference_engine', + 'label': 'inference engine/platform', + 'group': 'eval metadata', + }, + { + 'key': 'source_locator', + 'label': 'source URL / HF 
def read_data(datastore: str) -> list[str]:
    """List the ``dataset.parquet`` files that match a Hugging Face glob.

    ``huggingface_hub`` is imported inside the function, so importing this
    module does not require that package.

    Args:
        datastore: Glob pattern handed to ``HfFileSystem.glob``.

    Returns:
        One ``hf://``-prefixed URL for every match whose path ends with
        ``dataset.parquet``.
    """
    from huggingface_hub import HfFileSystem

    matches = HfFileSystem().glob(datastore)
    return [
        f'hf://{path}' for path in matches if path.endswith('dataset.parquet')
    ]


def load_schema_table(con: Any, table: str) -> None:
    """Create or replace *table* from every datastore parquet file.

    Args:
        con: Connection object exposing ``execute(sql, params)``.
        table: Destination table name. It is interpolated into the SQL text
            (identifiers cannot be bound as parameters), so it must be a
            plain identifier.

    Raises:
        ValueError: If *table* is not a plain SQL identifier.
        RuntimeError: If no parquet files are found.
    """
    # Validate before doing any I/O: the name ends up inside the statement
    # text, so anything other than a bare identifier must be rejected here.
    if not re.fullmatch(r'[A-Za-z_][A-Za-z0-9_]*', table):
        raise ValueError(f'Invalid SQL table name: {table!r}')
    schema_urls = read_data(HUGGING_FACE_DATASTORE)
    if not schema_urls:
        raise RuntimeError('No schema parquet files found')
    con.execute(
        f"""
        CREATE OR REPLACE TABLE {table} AS
        SELECT * FROM read_parquet(?, union_by_name=true, filename=true)
        """,
        [schema_urls],
    )
score_type, + er.metric_config.lower_is_better AS lower_is_better, + TRY_CAST(er.metric_config.min_score AS DOUBLE) AS min_score, + TRY_CAST(er.metric_config.max_score AS DOUBLE) AS max_score, + TRY_CAST(er.score_details.score AS DOUBLE) AS score, + er.score_details.uncertainty IS NOT NULL AS has_uncertainty, + er.metric_config.metric_id AS metric_id, + er.metric_config.metric_name AS metric_name, + er.metric_config.metric_kind AS metric_kind, + er.metric_config.metric_unit AS metric_unit, + source_metadata.source_organization_name AS source_organization, + er.generation_config IS NOT NULL AS generation_config_present, + TRY_CAST( + er.generation_config.generation_args.temperature AS DOUBLE + ) AS generation_temperature, + TRY_CAST( + er.generation_config.generation_args.max_tokens AS BIGINT + ) AS generation_max_tokens, + er.generation_config.generation_args.agentic_eval_config IS NOT NULL + AS generation_agentic_config_present, + er.source_data.source_type AS source_data_type, + er.source_data.hf_repo AS source_hf_repo, + er.source_data.url AS source_urls, + source_metadata.source_organization_url + AS source_organization_url, + source_metadata.evaluator_relationship AS evaluator_relationship, + detailed_evaluation_results.file_path AS detailed_results_file, + TRY_CAST( + er.score_details.uncertainty.num_samples AS BIGINT + ) AS uncertainty_num_samples, + to_json(model_info.additional_details) + AS model_additional_details_json + FROM {schema_table}, + LATERAL UNNEST(evaluation_results) AS t(er) + """ + ).fetchall() + columns = [ + 'schema_version', + 'evaluation_id', + 'model_id', + 'model_developer', + 'inference_engine', + 'evaluation_name', + 'benchmark', + 'score_type', + 'lower_is_better', + 'min_score', + 'max_score', + 'score', + 'has_uncertainty', + 'metric_id', + 'metric_name', + 'metric_kind', + 'metric_unit', + 'source_organization', + 'generation_config_present', + 'generation_temperature', + 'generation_max_tokens', + 
def parse_json_mapping(value: Any) -> dict[str, Any]:
    """Coerce *value* into a dict, falling back to ``{}``.

    Dicts are returned as-is. Strings are parsed as JSON and kept only when
    the result is a JSON object. Falsy values, invalid JSON, non-object JSON
    and every other type yield an empty dict.
    """
    if not value:
        return {}
    if isinstance(value, dict):
        return value
    if isinstance(value, str):
        try:
            parsed = json.loads(value)
        except json.JSONDecodeError:
            return {}
        if isinstance(parsed, dict):
            return parsed
    return {}


def normalize_score(
    score: float,
    min_score: float,
    max_score: float,
    lower_is_better: bool,
) -> float:
    """Rescale *score* linearly so that the bounds map onto 0 and 1.

    When *lower_is_better* is true the result is flipped (``1 - x``) so that
    larger normalized values are always better.

    Raises:
        ZeroDivisionError: If ``min_score == max_score``; callers are
            expected to filter zero-width bounds beforehand.
    """
    normalized = (score - min_score) / (max_score - min_score)
    if lower_is_better:
        normalized = 1.0 - normalized
    return normalized


def percentile(values: list[float], pct: float) -> float | None:
    """Return the linearly interpolated percentile of *values*.

    Args:
        values: Samples in any order; the list is not modified.
        pct: Fraction in ``[0, 1]`` (``0.5`` is the median).

    Returns:
        The interpolated value, or ``None`` when *values* is empty.

    Raises:
        ValueError: If *pct* lies outside ``[0, 1]`` (this includes NaN).
    """
    # A negative fraction would silently index from the end of the list and a
    # fraction above one would raise IndexError, so reject both up front.
    if not 0.0 <= pct <= 1.0:
        raise ValueError(f'pct must be between 0 and 1, got {pct!r}')
    if not values:
        return None
    ordered = sorted(values)
    index = (len(ordered) - 1) * pct
    lower = math.floor(index)
    upper = math.ceil(index)
    if lower == upper:
        return ordered[lower]
    weight = index - lower
    return ordered[lower] * (1.0 - weight) + ordered[upper] * weight
def shared_evaluation_key(row: dict[str, Any]) -> str:
    """Build the string that identifies which evaluation a row belongs to.

    The key is a compact JSON array holding the ``SCORE_GROUP_KEYS`` values
    followed by the score type, both score bounds, and the
    ``lower_is_better`` flag coerced to ``bool``.
    """
    identity = [row.get(name) for name in SCORE_GROUP_KEYS]
    identity += [
        row.get('score_type'),
        row.get('min_score'),
        row.get('max_score'),
        bool(row.get('lower_is_better')),
    ]
    return json.dumps(identity, sort_keys=False, separators=(',', ':'))


def quality_counts(rows: list[dict[str, Any]]) -> dict[str, int]:
    """Tally data-quality flags over all result rows.

    A row can raise several flags at once. The bound checks are mutually
    exclusive: missing bounds, else zero-width bounds, else an out-of-range
    score (only evaluated when a score is present).
    """
    tally = {'total_result_rows': len(rows)}
    tally.update(
        dict.fromkeys(
            (
                'missing_score',
                'missing_bounds',
                'zero_width_bounds',
                'incompatible_score_type',
                'out_of_range',
                'missing_metadata',
                'has_uncertainty',
            ),
            0,
        )
    )
    for record in rows:
        value = record.get('score')
        low = record.get('min_score')
        high = record.get('max_score')

        flagged = []
        if value is None:
            flagged.append('missing_score')
        if low is None or high is None:
            flagged.append('missing_bounds')
        elif low == high:
            flagged.append('zero_width_bounds')
        elif value is not None and not (low <= value <= high):
            flagged.append('out_of_range')
        if record.get('score_type') != CONTINUOUS_SCORE_TYPE:
            flagged.append('incompatible_score_type')
        if not (record.get('model_id') and record.get('benchmark')):
            flagged.append('missing_metadata')
        if record.get('has_uncertainty'):
            flagged.append('has_uncertainty')

        for name in flagged:
            tally[name] += 1
    return tally
def distinct_count(rows: list[dict[str, Any]], key: str) -> int:
    """Count the distinct non-``None`` values stored under *key*."""
    seen = set()
    for row in rows:
        value = row.get(key)
        if value is not None:
            seen.add(value)
    return len(seen)


def count_values(
    rows: list[dict[str, Any]], key: str
) -> list[dict[str, int | str]]:
    """Return value frequencies for *key*, most common first.

    Values are stringified before counting; rows where the value is missing
    or ``None`` are ignored.
    """
    tally = Counter()
    for row in rows:
        value = row.get(key)
        if value is not None:
            tally[str(value)] += 1
    return [
        {'value': name, 'count': total} for name, total in tally.most_common()
    ]


def count_values_with_unknown(
    rows: list[dict[str, Any]], key: str, unknown: str = 'unknown'
) -> list[dict[str, int | str]]:
    """Return value frequencies for *key*, bucketing blanks as *unknown*.

    Every row is counted. Values are stringified and stripped; ``None`` and
    values that are empty after stripping are counted under *unknown*.
    """
    tally = Counter()
    for row in rows:
        value = row.get(key)
        text = '' if value is None else str(value).strip()
        tally[text if text else unknown] += 1
    return [
        {'value': name, 'count': total} for name, total in tally.most_common()
    ]
def has_value(value: Any) -> bool:
    """Report whether a metadata cell counts as populated.

    ``None``, ``False``, blank strings and empty list/tuple/set/dict values
    are treated as missing; everything else (including ``0``) is present.
    """
    if value is None:
        return False
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        return bool(value.strip())
    # Tuple of types rather than ``list | tuple | ...``: isinstance() only
    # accepts PEP 604 unions on Python 3.10+, the tuple works everywhere.
    if isinstance(value, (list, tuple, set, dict)):
        return bool(value)
    return True


def _clean_label(value: Any) -> str:
    """Return *value* as stripped text, or ``''`` when it is ``None``."""
    return '' if value is None else str(value).strip()


def benchmark_name(row: dict[str, Any]) -> str:
    """Return the row's stripped benchmark name, or ``'unknown'`` if blank."""
    return _clean_label(row.get('benchmark')) or 'unknown'


def format_group_label(group: str, result_rows: int) -> str:
    """Format a display label such as ``'mmlu (n=1,234)'``."""
    return f'{group} (n={result_rows:,})'


def format_benchmark_label(benchmark: str, result_rows: int) -> str:
    """Format a benchmark display label; same layout as group labels."""
    return format_group_label(benchmark, result_rows)


def field_present_rate(rows: list[dict[str, Any]], field: str) -> float:
    """Return the fraction of *rows* where *field* is populated.

    Population is decided by :func:`has_value`; an empty *rows* list yields
    ``0.0``.
    """
    if not rows:
        return 0.0
    return sum(has_value(row.get(field)) for row in rows) / len(rows)


def model_family_name(row: dict[str, Any]) -> str:
    """Return the grouping name for a row's model family.

    The developer is preferred, with the model id as fallback and
    ``'unknown'`` as last resort.
    """
    developer = row.get('model_developer')
    if developer:
        text = str(developer).strip()
        # A whitespace-only developer is as good as missing; fall through to
        # the model id instead of reporting 'unknown'.
        if text:
            return text
    return _clean_label(row.get('model_id')) or 'unknown'
field_present_rate(items, key) for items in rows_by_group.values() + ] + present_rate = field_present_rate(rows, key) + missing_rate = 1.0 - present_rate + group_stddev = ( + statistics.pstdev(group_rates) + if len(group_rates) > 1 + else 0.0 + ) + selection_score = missing_rate * max(group_stddev, 0.05) + field_summaries.append( + { + 'key': key, + 'label': str(field['label']), + 'group': str(field['group']), + 'missing_rate': missing_rate, + 'benchmark_stddev': group_stddev, + 'group_stddev': group_stddev, + 'selection_score': selection_score, + } + ) + field_summaries.sort( + key=lambda item: ( + -float(item['selection_score']), + -float(item['missing_rate']), + str(item['label']), + ) + ) + selected_fields = field_summaries[:top_fields] + selected_field_keys_set = {field['key'] for field in selected_fields} + required_fields = [ + field + for field in field_summaries + if field['key'] in REQUIRED_METADATA_FIELDS + and field['key'] not in selected_field_keys_set + ] + selected_fields.extend(required_fields) + group_rank = { + group: index for index, group in enumerate(METADATA_FIELD_GROUP_ORDER) + } + selected_fields.sort( + key=lambda item: ( + group_rank.get(str(item['group']), len(group_rank)), + -float(item['selection_score']), + str(item['label']), + ) + ) + + top_group_names = [ + group + for group, _ in sorted( + ( + (group, len(items)) + for group, items in rows_by_group.items() + ), + key=lambda item: (-item[1], item[0]), + )[:top_groups] + ] + selected_field_keys = [field['key'] for field in selected_fields] + + group_summaries = [] + selected_groups: dict[str, list[dict[str, Any]]] = {} + for group in top_group_names: + items = rows_by_group[group] + selected_groups[group] = items + group_summaries.append( + { + group_key: group, + 'label': format_group_label(group, len(items)), + 'result_rows': len(items), + 'overall_completeness': average_completeness( + items, selected_field_keys + ), + } + ) + + other_rows = [ + row + for group, items in 
rows_by_group.items() + if group not in top_group_names + for row in items + ] + if other_rows: + selected_groups['Other'] = other_rows + group_summaries.append( + { + group_key: 'Other', + 'label': format_group_label('Other', len(other_rows)), + 'result_rows': len(other_rows), + 'overall_completeness': average_completeness( + other_rows, selected_field_keys + ), + } + ) + + group_summaries.sort( + key=lambda item: ( + item[group_key] == 'Other', + float(item['overall_completeness']), + str(item[group_key]), + ) + ) + + matrix = [] + selected_fields_by_key = { + str(field['key']): field for field in selected_fields + } + for group_summary in group_summaries: + group = str(group_summary[group_key]) + items = selected_groups[group] + for field_key in selected_field_keys: + present_rate = field_present_rate(items, str(field_key)) + field = selected_fields_by_key[str(field_key)] + matrix.append( + { + group_key: group, + f'{group_key}_label': group_summary['label'], + 'benchmark': group, + 'benchmark_label': group_summary['label'], + 'field': str(field_key), + 'field_label': field['label'], + 'present_rate': present_rate, + 'missing_rate': 1.0 - present_rate, + 'result_rows': len(items), + } + ) + + return { + 'fields': selected_fields, + groups_key: group_summaries, + 'matrix': matrix, + top_count_key: top_groups, + other_count_key: len(other_rows), + selection_key: selection_description, + 'field_group_order': list(METADATA_FIELD_GROUP_ORDER), + } + + +def metadata_completeness( + rows: list[dict[str, Any]], + top_benchmarks: int = 20, + top_fields: int = 12, +) -> dict[str, Any]: + return metadata_completeness_by_group( + rows, + group_name=benchmark_name, + group_key='benchmark', + groups_key='benchmarks', + top_count_key='top_benchmark_count', + other_count_key='other_result_rows', + selection_key='benchmark_selection', + selection_description=( + 'Top benchmarks by result-row count, with remaining benchmarks ' + 'aggregated as Other; rows are sorted by overall 
def model_family_metadata_completeness(
    rows: list[dict[str, Any]],
    top_model_families: int = 20,
    top_fields: int = 12,
) -> dict[str, Any]:
    """Summarize metadata completeness grouped by model family/developer.

    Thin wrapper around ``metadata_completeness_by_group`` that groups rows
    with ``model_family_name`` and supplies the model-family output keys.
    """
    description = (
        'Top model families/developers by result-row count, with '
        'remaining families aggregated as Other; rows are sorted by '
        'overall metadata completeness.'
    )
    return metadata_completeness_by_group(
        rows,
        group_name=model_family_name,
        group_key='model_family',
        groups_key='model_families',
        top_count_key='top_model_family_count',
        other_count_key='other_result_rows',
        selection_key='model_family_selection',
        selection_description=description,
        top_groups=top_model_families,
        top_fields=top_fields,
    )


def average_completeness(
    rows: list[dict[str, Any]], fields: list[str]
) -> float:
    """Return the share of populated (row, field) cells.

    Yields ``0.0`` when either *rows* or *fields* is empty.
    """
    if not rows or not fields:
        return 0.0
    present = sum(
        has_value(row.get(field)) for row in rows for field in fields
    )
    return present / (len(rows) * len(fields))


def grouped_summaries(
    rows: list[dict[str, Any]],
    value_key: str,
    group_keys: tuple[str, ...],
    limit: int,
) -> list[dict[str, Any]]:
    """Summarize *value_key* per group and keep the *limit* largest groups.

    Rows whose value is ``None`` are skipped. Groups are ordered by
    descending count, ties broken by the string form of the summary dict.
    """
    grouped: dict[tuple[Any, ...], list[float]] = defaultdict(list)
    for row in rows:
        value = row.get(value_key)
        if value is None:
            continue
        grouped[tuple(row.get(key) for key in group_keys)].append(float(value))

    summaries = [
        {**dict(zip(group_keys, group)), **numeric_summary(values)}
        for group, values in grouped.items()
    ]
    summaries.sort(key=lambda item: (-int(item['count']), str(item)))
    return summaries[:limit]


def bootstrap_interval_and_support(
    values: list[float],
    threshold: float,
    iterations: int = BOOTSTRAP_ITERATIONS,
) -> tuple[list[float | None], float | None]:
    """Bootstrap the mean of *values*.

    Args:
        values: Observations to resample with replacement.
        threshold: Reference value for the support statistic.
        iterations: Number of bootstrap resamples.

    Returns:
        ``([low, high], support)``, where the interval spans the 2.5th to
        97.5th percentile of the resampled means and *support* is the share
        of resampled means strictly above *threshold*. Returns
        ``([None, None], None)`` when *values* is empty or *iterations* is
        below one.
    """
    # With no resamples there is nothing to summarize; bail out instead of
    # dividing by zero below.
    if not values or iterations < 1:
        return [None, None], None
    # Seed depends only on the sample size, so results are reproducible.
    rng = random.Random(RANDOM_SEED + len(values))
    size = len(values)
    estimates = []
    for _ in range(iterations):
        sample = [values[rng.randrange(size)] for _ in range(size)]
        estimates.append(statistics.mean(sample))
    interval = [percentile(estimates, 0.025), percentile(estimates, 0.975)]
    support = sum(estimate > threshold for estimate in estimates) / len(
        estimates
    )
    return interval, support


def stabilized_estimate(
    mean_score: float,
    count: int,
    corpus_mean: float,
    weight: float = STABILIZATION_WEIGHT,
) -> float:
    """Shrink a group mean toward the corpus mean.

    Computes the weighted average of *mean_score* (weight *count*) and
    *corpus_mean* (weight *weight*), so groups with few observations stay
    close to the corpus mean.

    Raises:
        ZeroDivisionError: If ``count + weight`` is zero.
    """
    return (count * mean_score + weight * corpus_mean) / (count + weight)
def pairwise_model_comparisons(
    rows: list[dict[str, Any]],
    min_shared_evals: int,
    top_model_limit: int,
    comparison_limit: int,
) -> list[dict[str, Any]]:
    """Compare the most-covered models on the evaluations they share.

    Only the *top_model_limit* models with the most rows take part. For each
    pair with at least *min_shared_evals* shared evaluation keys, the paired
    per-key mean differences are bootstrapped with one seeded generator to
    produce a 95% interval and the share of resamples where ``model_a``
    comes out higher. The result is sorted by shared-evaluation count, then
    by absolute mean difference, and truncated to *comparison_limit*.
    """
    coverage = Counter(row['model_id'] for row in rows if row.get('model_id'))
    eligible = {
        name
        for name, _ in coverage.most_common(top_model_limit)
        if name is not None
    }

    raw_scores: dict[str, dict[str, list[float]]] = defaultdict(
        lambda: defaultdict(list)
    )
    for row in rows:
        owner = row.get('model_id')
        if owner in eligible:
            raw_scores[owner][row['shared_evaluation_key']].append(
                row['normalized_score']
            )

    # Collapse repeated results for the same (model, evaluation) to a mean.
    per_model = {
        owner: {key: statistics.mean(vals) for key, vals in by_key.items()}
        for owner, by_key in raw_scores.items()
    }
    ordered = sorted(per_model)
    rng = random.Random(RANDOM_SEED)
    results = []
    for position, left in enumerate(ordered):
        left_scores = per_model[left]
        for right in ordered[position + 1 :]:
            right_scores = per_model[right]
            common = sorted(set(left_scores) & set(right_scores))
            if len(common) < min_shared_evals:
                continue
            diffs = [left_scores[key] - right_scores[key] for key in common]
            size = len(diffs)
            resampled = []
            for _ in range(BOOTSTRAP_ITERATIONS):
                draw = [diffs[rng.randrange(size)] for _ in range(size)]
                resampled.append(statistics.mean(draw))
            wins = sum(value > 0 for value in resampled)
            results.append(
                {
                    'model_a': left,
                    'model_b': right,
                    'shared_evaluation_count': len(common),
                    'mean_paired_difference': statistics.mean(diffs),
                    'uncertainty_interval': [
                        percentile(resampled, 0.025),
                        percentile(resampled, 0.975),
                    ],
                    'support_model_a_higher': wins / len(resampled),
                }
            )

    def rank(entry: dict[str, Any]) -> tuple[int, float, str, str]:
        return (
            -int(entry['shared_evaluation_count']),
            -abs(float(entry['mean_paired_difference'])),
            str(entry['model_a']),
            str(entry['model_b']),
        )

    results.sort(key=rank)
    return results[:comparison_limit]


def descriptive_statistics(
    rows: list[dict[str, Any]],
    summary_limit: int,
    metadata_top_benchmarks: int,
    metadata_top_model_families: int,
) -> dict[str, Any]:
    """Assemble the descriptive half of the statistics report.

    Combines corpus counts, value frequencies, metadata-completeness tables,
    quality tallies and grouped score summaries (raw scores over all rows,
    normalized scores over the rows that pass normalization).
    """
    valid_rows, exclusions = valid_normalized_rows(rows)
    counts = {
        'result_rows': len(rows),
        'unique_models': distinct_count(rows, 'model_id'),
        'unique_developers': distinct_count(rows, 'model_developer'),
        'unique_benchmarks': distinct_count(rows, 'benchmark'),
        'unique_evaluations': distinct_count(rows, 'evaluation_name'),
    }
    benchmark_completeness = metadata_completeness(
        rows, top_benchmarks=metadata_top_benchmarks
    )
    family_completeness = model_family_metadata_completeness(
        rows, top_model_families=metadata_top_model_families
    )
    raw_summaries = grouped_summaries(
        rows, 'score', SCORE_GROUP_KEYS, summary_limit
    )
    normalized_summaries = grouped_summaries(
        valid_rows, 'normalized_score', SCORE_GROUP_KEYS, summary_limit
    )
    return {
        'counts': counts,
        'schema_versions': count_values(rows, 'schema_version'),
        'inference_engines': count_values_with_unknown(
            rows, 'inference_engine'
        ),
        'models_per_benchmark': models_per_benchmark(rows),
        'metadata_completeness': benchmark_completeness,
        'model_family_metadata_completeness': family_completeness,
        'quality': quality_counts(rows),
        'normalization_exclusions': exclusions,
        'score_summaries': raw_summaries,
        'normalized_score_summaries': normalized_summaries,
    }
summary_limit, + metadata_top_benchmarks, + metadata_top_model_families, + ), + 'observational': { + 'valid_normalized_rows': len(valid_rows), + 'exclusions': exclusions, + }, + } + if descriptive_only: + return report + report['observational'].update( + { + 'coverage_aware_model_summaries': coverage_aware_model_summaries( + valid_rows, top_model_limit + ), + 'pairwise_model_comparisons': pairwise_model_comparisons( + valid_rows, + min_shared_evals, + top_model_limit, + comparison_limit, + ), + } + ) + return report + + +def _group_rows( + rows: list[dict[str, Any]], key: str +) -> dict[Any, list[dict[str, Any]]]: + grouped: dict[Any, list[dict[str, Any]]] = defaultdict(list) + for row in rows: + grouped[row.get(key)].append(row) + return grouped + + +def section(title: str) -> None: + print(f'\n{SEP}') + print(f' {title.upper()}') + print(SUB) + + +def print_table(items: list[dict[str, Any]], columns: list[str]) -> None: + for item in items: + parts = [] + for column in columns: + value = item.get(column) + if isinstance(value, float): + value = f'{value:.4f}' + parts.append(f'{column}={value}') + print(' ' + ' '.join(parts)) + + +def print_report(report: dict[str, Any], descriptive_only: bool) -> None: + descriptive = report['descriptive'] + section('dataset counts') + for key, value in descriptive['counts'].items(): + print(f' {key:<32} {value:>10,}') + + section('quality diagnostics') + for key, value in descriptive['quality'].items(): + print(f' {key:<32} {value:>10,}') + + section('normalization exclusions') + for key, value in report['observational']['exclusions'].items(): + print(f' {key:<32} {value:>10,}') + + section('inference engines') + print_table( + descriptive['inference_engines'][:10], + ['value', 'count'], + ) + + section('models per benchmark') + print_table( + descriptive['models_per_benchmark'][:10], + ['benchmark', 'unique_models', 'result_rows'], + ) + + section('score summaries') + print_table( + descriptive['score_summaries'], + [ + 
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse and validate the command-line options.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        The populated namespace. Invalid values are reported through
        ``parser.error``, which exits the process.
    """
    # (flag, default, help) for every integer option; each must be >= 1.
    int_options = (
        (
            '--summary-limit',
            10,
            'Number of descriptive summary rows to print',
        ),
        (
            '--metadata-top-benchmarks',
            20,
            'Number of largest benchmarks to show in metadata completeness; '
            'remaining benchmarks are aggregated as Other',
        ),
        (
            '--metadata-top-model-families',
            20,
            'Number of largest model families/developers to show in metadata '
            'completeness; remaining families are aggregated as Other',
        ),
        (
            '--comparison-limit',
            50,
            'Number of pairwise comparison rows to print',
        ),
        (
            '--top-model-limit',
            50,
            'Number of most-covered models to include in comparisons',
        ),
        (
            '--min-shared-evals',
            5,
            'Minimum shared evaluation keys for pairwise comparisons',
        ),
    )

    parser = argparse.ArgumentParser(
        description='Generate Every Eval Ever dataset statistics.'
    )
    parser.add_argument(
        '--table', default='eee', help='Table name for the in-memory database'
    )
    parser.add_argument(
        '--stats-output',
        type=Path,
        help='Optional JSON output path for the statistics report',
    )
    for flag, default, help_text in int_options:
        parser.add_argument(flag, default=default, type=int, help=help_text)
    parser.add_argument(
        '--descriptive-only',
        action='store_true',
        help='Skip observational comparison summaries',
    )

    args = parser.parse_args(argv)
    # The table name is later interpolated into SQL, so only accept a plain
    # identifier.
    if not re.fullmatch(r'[A-Za-z_][A-Za-z0-9_]*', args.table):
        parser.error('--table must be a valid SQL identifier')
    for flag, _, _ in int_options:
        attribute = flag.lstrip('-').replace('-', '_')
        if getattr(args, attribute) < 1:
            parser.error(f'{flag} must be at least 1')
    return args


def main(argv: list[str] | None = None) -> None:
    """Load the datastore, build the report, print it and optionally save it.

    ``duckdb`` is imported after argument parsing, so invalid arguments are
    reported before the dependency is needed.
    """
    args = parse_args(argv)
    import duckdb

    schema_table = f'{args.table}_schema'
    with duckdb.connect(':memory:') as con:
        try:
            con.execute('LOAD httpfs;')
        except duckdb.Error:
            # Extension not available yet: install it, then retry the load.
            con.execute('INSTALL httpfs;')
            con.execute('LOAD httpfs;')

        load_schema_table(con, schema_table)
        rows = extract_result_rows(con, schema_table)

    report = build_statistics_report(
        rows,
        summary_limit=args.summary_limit,
        metadata_top_benchmarks=args.metadata_top_benchmarks,
        metadata_top_model_families=args.metadata_top_model_families,
        comparison_limit=args.comparison_limit,
        top_model_limit=args.top_model_limit,
        min_shared_evals=args.min_shared_evals,
        descriptive_only=args.descriptive_only,
    )
    print_report(report, args.descriptive_only)

    destination = args.stats_output
    if destination:
        destination.parent.mkdir(parents=True, exist_ok=True)
        payload = json.dumps(report, indent=2, sort_keys=True) + '\n'
        destination.write_text(payload, encoding='utf-8')
        print(f'\nWrote statistics JSON to {destination}')


if __name__ == '__main__':
    try:
        main()
    except RuntimeError as exc:
        print(str(exc), file=sys.stderr)
        sys.exit(1)
sorted(rows, key=lambda row: (-float(row[key]), label(row)))[:limit] + + +def pct(part: int, total: int) -> float: + return 100.0 * part / total if total else 0.0 + + +def write_summary( + stats: dict[str, Any], + rows: list[dict[str, Any]], + plot_paths: dict[str, Path], + output_path: Path, +) -> None: + descriptive = stats['descriptive'] + counts = stats['descriptive']['counts'] + quality = stats['descriptive']['quality'] + valid = stats['observational']['valid_normalized_rows'] + exclusions = stats['observational']['exclusions'] + out_of_range = exclusions.get('out_of_range', 0) + models_per_benchmark = descriptive.get('models_per_benchmark', []) + inference_engines = descriptive.get('inference_engines', []) + most_covered = top_rows(rows, 'count', 6) + highest_variance = sorted( + rows, key=lambda row: float(row['stddev'] or 0.0), reverse=True + )[:4] + hardest = sorted(rows, key=lambda row: float(row['mean']))[:4] + easiest = sorted(rows, key=lambda row: float(row['mean']), reverse=True)[:4] + model_counts = [ + int(row['unique_models']) + for row in models_per_benchmark + if int(row['unique_models']) > 0 + ] + median_models = statistics.median(model_counts) if model_counts else 0 + max_models = max(model_counts) if model_counts else 0 + top_model_datasets = models_per_benchmark[:6] + known_engine_rows = sum( + int(row['count']) + for row in inference_engines + if str(row['value']).strip().lower() != 'unknown' + ) + unknown_engine_rows = sum( + int(row['count']) + for row in inference_engines + if str(row['value']).strip().lower() == 'unknown' + ) + top_engines = inference_engines[:6] + + def names(items: list[dict[str, Any]]) -> str: + return ', '.join(label(item) for item in items) + + def benchmark_model_names(items: list[dict[str, Any]]) -> str: + return ', '.join( + f'{item["benchmark"]} ({int(item["unique_models"]):,})' + for item in items + ) + + def engine_names(items: list[dict[str, Any]]) -> str: + return ', '.join( + f'{item["value"]} 
({int(item["count"]):,})' for item in items + ) + + relative_plots = { + name: path.relative_to(output_path.parent) + if path.is_relative_to(output_path.parent) + else path + for name, path in plot_paths.items() + } + text = f"""# Dataset Statistics Summary + +This report summarizes the latest Every Eval Ever datastore snapshot represented by `dataset_statistics.json`. In the statistics file, “dataset” is represented by the `benchmark` field, which comes from `evaluation_results[].source_data.dataset_name`. That naming is worth keeping in mind when reading the figures: a benchmark is the dataset or leaderboard family that supplied the result rows, while an evaluation name is the finer slice or metric label inside that benchmark. The corpus contains {counts['result_rows']:,} result rows across {counts['unique_benchmarks']:,} datasets, {counts['unique_evaluations']:,} evaluation names, {counts['unique_developers']:,} developers, and {counts['unique_models']:,} models. The coverage plot (`{relative_plots['coverage']}`) gives the first scale check: the datastore is broad in model count, but its row-level mass is still concentrated in a smaller number of repeated evaluation families. + +Normalization quality is strong for this snapshot. Of {quality['total_result_rows']:,} result rows, {valid:,} rows can be converted onto the shared zero-to-one scale, or {pct(valid, quality['total_result_rows']):.1f}% of the dataset. The only observed normalization exclusion is {out_of_range:,} out-of-range rows; missing scores, missing bounds, zero-width bounds, and incompatible score types are all zero. This means the normalized score summaries are a reasonable map of cross-benchmark score distributions. It does not make all metrics semantically identical, but it does put the numeric ranges on a common axis so that difficulty, saturation, and spread are easier to compare. 
The normalization quality plot (`{relative_plots['quality']}`) is therefore a guardrail figure: it says whether the rest of the normalized-score visuals are based on most of the corpus or on a narrow filtered subset. + +Coverage is uneven by design. The most-covered normalized summaries are {names(most_covered)}. These heavily represented evaluations dominate aggregate descriptive patterns, so the top-coverage chart (`{relative_plots['top_coverage']}`) should be read alongside any mean-score chart. A benchmark with thousands of rows provides a much steadier estimate than a niche evaluation with dozens or hundreds of rows, even if both appear as one row in the summary table. High row coverage can mean a benchmark has broad model participation, multiple reported submetrics, repeated submissions, or some combination of the three. The plot is intentionally row-count oriented, because the descriptive JSON is primarily row-oriented; it should not be read as a direct measure of benchmark popularity without checking model coverage separately. + +The new model-per-dataset histogram (`{relative_plots['models_per_dataset']}`) adds that missing model-coverage view. Across datasets, the median number of unique models is {median_models:g}, and the largest dataset-level model count is {max_models:,}. The highest-coverage datasets by unique model count are {benchmark_model_names(top_model_datasets)}. This distribution is important because a dataset with many models tells us more about the breadth of the ecosystem than a dataset with many rows from a smaller model set. A heavy right tail in this histogram means a few datasets act as common comparison hubs, while many others remain specialized or sparsely covered. That is not necessarily bad; specialized datasets are often where the datastore gets its texture. But it does mean corpus-wide summaries should avoid treating every benchmark as equally well sampled. 
+ +The inference-engine spread plot (`{relative_plots['engine_spread']}`) describes how result rows are distributed across recorded running engines or inference platforms, depending on which runtime metadata is present in the datastore export. The leading runtime labels are {engine_names(top_engines)}. In this snapshot, {known_engine_rows:,} rows have a named runtime field and {unknown_engine_rows:,} rows fall under `unknown`. The `unknown` bucket is expected whenever source records report model identity but not the serving/runtime layer. Runtime spread should therefore be read as an observability diagnostic, not just as a usage ranking. A large `unknown` bucket says that many results are still useful for model and benchmark analysis, but they cannot support claims about vLLM, Ollama, hosted APIs, or other runtime-specific execution paths. Where runtime names are present, the chart gives a quick view of which execution backends are represented strongly enough for follow-up slicing. + +Mean normalized scores vary sharply across tasks. The lowest means include {names(hardest)}, while the highest means include {names(easiest)}. These values should not be interpreted as a leaderboard: they summarize all available submitted model results within each benchmark/evaluation pair, not matched model cohorts. They are best used to spot which evaluations are generally difficult, saturated, or mixed across the collected model population. A low mean can indicate a hard benchmark, a benchmark with many older or weaker systems, or a metric whose upper range is rarely reached. A high mean can indicate an easier task, a saturated benchmark, a curated set of strong submissions, or a metric where the lower-performing tail is missing. The summary plots do not decide among those explanations, but they point to where a closer paired analysis would be valuable. + +The variability plots add the most diagnostic texture. 
High-standard-deviation evaluations such as {names(highest_variance)} indicate tasks where model results span a wide range, often because the benchmark separates weak and strong systems clearly or because the source data combines distinct regimes. The range plot (`{relative_plots['range']}`) highlights the same issue from min-to-max spread, while the mean-versus-standard-deviation scatter (`{relative_plots['variability']}`) separates broad, high-confidence coverage from sparse or volatile summaries. Evaluations with both substantial coverage and high spread are especially useful for model comparison because they appear to discriminate among systems rather than clustering everyone near the same score. Evaluations with low spread can still matter, but they may be better suited for pass/fail checks, regression testing, or detecting severe failures than for fine-grained ranking. + +The PDF figures are meant to be inspected together rather than as standalone claims. The count and quality charts answer whether the data is large and clean enough to trust. The top-coverage and model-per-dataset charts separate result-row volume from unique-model breadth. The engine chart shows whether runtime metadata is available and how concentrated it is. The mean, variability, and range charts then answer where the benchmark landscape is concentrated, sparse, easy, hard, or discriminative. Keeping those questions separate avoids a common mistake: treating a high row count as evidence of broad participation, or treating a normalized mean as a direct model-quality claim. + +Overall, the datastore is large, mostly normalization-ready, and informative for benchmark-level descriptive analysis. The main caveat is comparability: normalized scores put different metrics on a common scale, but they do not control for which models appear in each benchmark. 
Use these figures as a map of datastore coverage, runtime observability, and score distribution, then rely on paired or coverage-aware analyses for direct model comparisons. The descriptive plots are best thought of as a scouting layer: they reveal where the datastore is rich, where metadata is thin, and where more careful model-by-model analysis is likely to pay off. +""" + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(text, encoding='utf-8') diff --git a/misc/eval_hierarchy.json b/misc/eval_hierarchy.json new file mode 100644 index 000000000..5ad8650ba --- /dev/null +++ b/misc/eval_hierarchy.json @@ -0,0 +1,4187 @@ +{ + "stats": { + "family_count": 20, + "composite_count": 20, + "standalone_benchmark_count": 10, + "single_benchmark_count": 108, + "slice_count": 58, + "metric_count": 208, + "metric_rows_scanned": 41616 + }, + "qa": { + "fallback_metric_count": 2231, + "fallback_metrics": [ + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + 
"composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "appworld", + "single_benchmark": "appworld/test_normal", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + 
"composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "browsecompplus", + "single_benchmark": "browsecompplus", + "metric_name": "Score" + }, + { + "composite_benchmark": "la_leaderboard", + "single_benchmark": "La leaderboard", + "metric_name": "Score" + }, + { + "composite_benchmark": "la_leaderboard", + "single_benchmark": "La leaderboard", + "metric_name": "Score" + }, + { + "composite_benchmark": "la_leaderboard", + "single_benchmark": "La leaderboard", + "metric_name": "Score" + }, + { + "composite_benchmark": "la_leaderboard", + "single_benchmark": "La leaderboard", + "metric_name": "Score" + }, + { + "composite_benchmark": "la_leaderboard", + "single_benchmark": "La leaderboard", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Chat", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Chat Hard", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Safety", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Reasoning", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Prior Sets (0.5 weight)", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Chat", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Chat Hard", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Safety", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": 
"Reasoning", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Prior Sets (0.5 weight)", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Chat", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Chat Hard", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Safety", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Reasoning", + "metric_name": "Score" + }, + { + "composite_benchmark": "reward-bench", + "single_benchmark": "Prior Sets (0.5 weight)", + "metric_name": "Score" + } + ], + "metric_like_single_benchmark_count": 0, + "metric_like_single_benchmarks": [], + "single_equals_only_metric_count": 0, + "single_equals_only_metric": [] + }, + "families": [ + { + "key": "ace", + "display_name": "Ace", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "other", + "standalone_benchmarks": [], + "composites": [], + "benchmarks": [ + { + "key": "diy", + "display_name": "DIY", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "food", + "display_name": "Food", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "gaming", + "display_name": "Gaming", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "overall", + "display_name": 
"Overall", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "shopping", + "display_name": "Shopping", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + } + ] + }, + { + "key": "apex", + "display_name": "Apex", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "agentic", + "standalone_benchmarks": [], + "composites": [ + { + "key": "apex-agents", + "display_name": "Apex Agents", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "agentic", + "benchmarks": [ + { + "key": "corporate_law", + "display_name": "Corporate Law", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "pass_at_1", + "display_name": "Pass@1", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "corporate_lawyer", + "display_name": "Corporate Lawyer", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "mean_score", + "display_name": "Mean Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "investment_banking", + "display_name": "Investment Banking", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "pass_at_1", + "display_name": "Pass@1", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "management_consulting", + "display_name": "Management Consulting", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + 
"tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "pass_at_1", + "display_name": "Pass@1", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "overall", + "display_name": "Overall", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "mean_score", + "display_name": "Mean Score", + "sources": [ + "evaluation_name_suffix" + ] + }, + { + "key": "pass_at_1", + "display_name": "Pass@1", + "sources": [ + "evaluation_name_suffix" + ] + }, + { + "key": "pass_at_8", + "display_name": "Pass@8", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + } + ] + }, + { + "key": "apex-v1", + "display_name": "Apex V1", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "other", + "benchmarks": [ + { + "key": "big_law", + "display_name": "Big Law", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "consulting", + "display_name": "Consulting", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "investment_banking", + "display_name": "Investment Banking", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "medicine_md", + "display_name": "Medicine (MD)", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } 
+ ] + }, + { + "key": "overall", + "display_name": "Overall", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + } + ] + } + ] + }, + { + "key": "appworld", + "display_name": "Appworld", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "agentic", + "standalone_benchmarks": [], + "composites": [], + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "arc-agi", + "display_name": "Arc Agi", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "other", + "standalone_benchmarks": [], + "composites": [], + "benchmarks": [ + { + "key": "v1_public_eval", + "display_name": "v1_Public_Eval", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "cost_per_task", + "display_name": "Cost per Task", + "sources": [ + "metric_config" + ] + }, + { + "key": "score", + "display_name": "Score", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "v1_semi_private", + "display_name": "v1_Semi_Private", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "cost_per_task", + "display_name": "Cost per Task", + "sources": [ + "metric_config" + ] + }, + { + "key": "score", + "display_name": "Score", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "v2_private_eval", + "display_name": "v2_Private_Eval", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "cost_per_task", + "display_name": "Cost per Task", + "sources": [ + "metric_config" + ] + }, + { + "key": "score", + 
"display_name": "Score", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "v2_public_eval", + "display_name": "v2_Public_Eval", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "cost_per_task", + "display_name": "Cost per Task", + "sources": [ + "metric_config" + ] + }, + { + "key": "score", + "display_name": "Score", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "v2_semi_private", + "display_name": "v2_Semi_Private", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "cost_per_task", + "display_name": "Cost per Task", + "sources": [ + "metric_config" + ] + }, + { + "key": "score", + "display_name": "Score", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "v3_semi_private", + "display_name": "v3_Semi_Private", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "cost", + "display_name": "Cost", + "sources": [ + "metric_config" + ] + }, + { + "key": "score", + "display_name": "Score", + "sources": [ + "metric_config" + ] + } + ] + } + ] + }, + { + "key": "bfcl", + "display_name": "Bfcl", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "other", + "standalone_benchmarks": [], + "composites": [], + "benchmarks": [ + { + "key": "format_sensitivity", + "display_name": "Format sensitivity", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "format_sensitivity_max_delta", + "display_name": "Format Sensitivity Max Delta", + "sources": [ + "metric_config" + ] + }, + { + "key": "format_sensitivity_stddev", + "display_name": "Format Sensitivity Standard Deviation", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "live", + "display_name": 
"Live", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "live_accuracy", + "display_name": "Live accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "live_multiple_ast_accuracy", + "display_name": "Live multiple AST accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "live_parallel_ast_accuracy", + "display_name": "Live parallel AST accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "live_parallel_multiple_ast_accuracy", + "display_name": "Live parallel multiple AST accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "live_simple_ast_accuracy", + "display_name": "Live simple AST accuracy", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "memory", + "display_name": "Memory", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "kv_accuracy", + "display_name": "Memory KV accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "recursive_summarization_accuracy", + "display_name": "Memory recursive summarization accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "vector_accuracy", + "display_name": "Memory vector accuracy", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "multi_turn", + "display_name": "Multi turn", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "base_accuracy", + "display_name": "Multi-turn base accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "long_context_accuracy", + "display_name": "Multi-turn long-context accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": 
"miss_function_accuracy", + "display_name": "Multi-turn missing function accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "miss_parameter_accuracy", + "display_name": "Multi-turn missing parameter accuracy", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "non_live", + "display_name": "Non live", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "ast_accuracy", + "display_name": "Non-live AST accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "multiple_ast_accuracy", + "display_name": "Non-live multiple AST accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "parallel_ast_accuracy", + "display_name": "Non-live parallel AST accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "parallel_multiple_ast_accuracy", + "display_name": "Non-live parallel multiple AST accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "simple_ast_accuracy", + "display_name": "Non-live simple AST accuracy", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "overall", + "display_name": "Overall", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "latency_p95", + "display_name": "Latency 95th Percentile", + "sources": [ + "metric_config" + ] + }, + { + "key": "latency_mean", + "display_name": "Latency Mean", + "sources": [ + "metric_config" + ] + }, + { + "key": "latency_std", + "display_name": "Latency Standard Deviation", + "sources": [ + "metric_config" + ] + }, + { + "key": "overall_accuracy", + "display_name": "Overall Accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "rank", + "display_name": "Rank", + "sources": [ + "metric_config" + ] + }, + { + "key": "total_cost", + "display_name": "Total Cost", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "relevance", + "display_name": "Relevance", + 
"has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "irrelevance_detection_accuracy", + "display_name": "Irrelevance detection accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "relevance_detection_accuracy", + "display_name": "Relevance detection accuracy", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "web_search", + "display_name": "Web search", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "base_accuracy", + "display_name": "Multi-turn base accuracy", + "sources": [ + "metric_config" + ] + }, + { + "key": "no_snippet_accuracy", + "display_name": "Web-search no-snippet accuracy", + "sources": [ + "metric_config" + ] + } + ] + } + ] + }, + { + "key": "browsecompplus", + "display_name": "browsecompplus", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "agentic", + "standalone_benchmarks": [], + "composites": [], + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "fibble", + "display_name": "Fibble", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "other", + "standalone_benchmarks": [ + { + "key": "fibble_arena", + "display_name": "Fibble arena", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "average_attempts", + "display_name": "Average Attempts", + "sources": [ + "evaluation_name_suffix" + ] + }, + { + "key": "average_latency_ms", + "display_name": "Average Latency (ms)", + "sources": [ + "evaluation_name_suffix" + ] + }, + { + "key": "win_rate", + "display_name": "Win Rate", + "sources": [ + 
"evaluation_name_suffix" + ] + } + ], + "category": "other" + }, + { + "key": "fibble1_arena", + "display_name": "Fibble1 arena", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "average_attempts", + "display_name": "Average Attempts", + "sources": [ + "metric_config" + ] + }, + { + "key": "win_rate", + "display_name": "Win Rate", + "sources": [ + "metric_config" + ] + } + ], + "category": "other" + }, + { + "key": "fibble2_arena", + "display_name": "Fibble2 arena", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "average_attempts", + "display_name": "Average Attempts", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + }, + { + "key": "average_latency_ms", + "display_name": "Average Latency (ms)", + "sources": [ + "evaluation_name_suffix" + ] + }, + { + "key": "win_rate", + "display_name": "Win Rate", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + } + ], + "category": "other" + }, + { + "key": "fibble3_arena", + "display_name": "Fibble3 arena", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "average_attempts", + "display_name": "Average Attempts", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + }, + { + "key": "average_latency_ms", + "display_name": "Average Latency (ms)", + "sources": [ + "evaluation_name_suffix" + ] + }, + { + "key": "win_rate", + "display_name": "Win Rate", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + } + ], + "category": "other" + }, + { + "key": "fibble4_arena", + "display_name": "Fibble4 arena", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "average_attempts", + "display_name": "Average Attempts", + "sources": [ + 
"evaluation_name_suffix", + "metric_config" + ] + }, + { + "key": "average_latency_ms", + "display_name": "Average Latency (ms)", + "sources": [ + "evaluation_name_suffix" + ] + }, + { + "key": "win_rate", + "display_name": "Win Rate", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + } + ], + "category": "other" + }, + { + "key": "fibble5_arena", + "display_name": "Fibble5 arena", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "average_attempts", + "display_name": "Average Attempts", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + }, + { + "key": "average_latency_ms", + "display_name": "Average Latency (ms)", + "sources": [ + "evaluation_name_suffix" + ] + }, + { + "key": "win_rate", + "display_name": "Win Rate", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + } + ], + "category": "other" + } + ], + "composites": [] + }, + { + "key": "global_mmlu_lite", + "display_name": "Global MMLU Lite", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "reasoning", + "standalone_benchmarks": [], + "composites": [], + "slices": [ + { + "key": "arabic", + "display_name": "Arabic", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "bengali", + "display_name": "Bengali", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "burmese", + "display_name": "Burmese", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "chinese", + "display_name": "Chinese", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "culturally_agnostic", + "display_name": "Culturally 
Agnostic", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "culturally_sensitive", + "display_name": "Culturally Sensitive", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "english", + "display_name": "English", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "french", + "display_name": "French", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "german", + "display_name": "German", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "hindi", + "display_name": "Hindi", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "indonesian", + "display_name": "Indonesian", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "italian", + "display_name": "Italian", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "japanese", + "display_name": "Japanese", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "korean", + "display_name": "Korean", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "portuguese", + "display_name": "Portuguese", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "spanish", + "display_name": "Spanish", + 
"metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "swahili", + "display_name": "Swahili", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "yoruba", + "display_name": "Yoruba", + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + } + ], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "benchmark_default" + ] + } + ] + }, + { + "key": "helm", + "display_name": "HELM", + "has_card": true, + "tags": { + "domains": [ + "biology", + "physics", + "chemistry", + "open domain qa", + "open book qa", + "multiple choice qa", + "instruction following", + "STEM", + "humanities", + "social sciences", + "math", + "law", + "engineering", + "economics", + "health", + "psychology", + "business", + "philosophy", + "computer science", + "history", + "olympiads", + "Info Seeking", + "Math & Data", + "Reasoning & Planning", + "Creative Tasks", + "natural language understanding", + "reading comprehension", + "natural language inference", + "machine learning fairness", + "bias measurement", + "toxic comment classification", + "text classification", + "summarization", + "journalism", + "news media", + "commonsense reasoning", + "question answering", + "dialogue modeling", + "text generation", + "grade school mathematics", + "math word problems", + "legal", + "finance", + "medical knowledge", + "professional medical exams" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering", + "Question answering", + "Text generation", + "Following verifiable instructions (e.g., word counts, formatting rules, keyword mentions)", + "Multiple-choice question answering across a broad range of subjects", + "Solving Olympiad-level mathematical problems", + "Solving competition-level mathematical 
problems", + "Rule-based evaluation on a filtered subset of problems (Omni-MATH-Rule)", + "Open-ended text generation in response to diverse user queries", + "Yes/no question answering", + "Text-pair classification", + "Binary toxicity classification (toxic vs. non-toxic)", + "Analysis of performance across identity subgroups", + "Summarization", + "Four-way multiple-choice selection for event continuation", + "Commonsense inference", + "Extractive question answering", + "Fill mask", + "Solving grade school math word problems", + "Text generation for question answering", + "Text classification", + "Rule-application tasks", + "Free-form multiple-choice question answering", + "Open-domain question answering" + ] + }, + "category": "general", + "standalone_benchmarks": [], + "composites": [ + { + "key": "helm_capabilities", + "display_name": "Helm capabilities", + "has_card": true, + "tags": { + "domains": [ + "biology", + "physics", + "chemistry", + "open domain qa", + "open book qa", + "multiple choice qa", + "instruction following", + "STEM", + "humanities", + "social sciences", + "math", + "law", + "engineering", + "economics", + "health", + "psychology", + "business", + "philosophy", + "computer science", + "history", + "olympiads", + "Info Seeking", + "Math & Data", + "Reasoning & Planning", + "Creative Tasks" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering", + "Question answering", + "Text generation", + "Following verifiable instructions (e.g., word counts, formatting rules, keyword mentions)", + "Multiple-choice question answering across a broad range of subjects", + "Solving Olympiad-level mathematical problems", + "Solving competition-level mathematical problems", + "Rule-based evaluation on a filtered subset of problems (Omni-MATH-Rule)", + "Open-ended text generation in response to diverse user queries" + ] + }, + "category": "general", + "benchmarks": [ + { + "key": "helm_capabilities", + "display_name": 
"Capabilities", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [ + { + "key": "mean", + "display_name": "Mean", + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + } + ], + "metrics": [] + }, + { + "key": "gpqa", + "display_name": "GPQA", + "has_card": true, + "tags": { + "domains": [ + "biology", + "physics", + "chemistry", + "open domain qa", + "open book qa", + "multiple choice qa" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering", + "Question answering", + "Text generation" + ] + }, + "slices": [], + "metrics": [ + { + "key": "cot_correct", + "display_name": "COT correct", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "ifeval", + "display_name": "IFEval", + "has_card": true, + "tags": { + "domains": [ + "instruction following" + ], + "languages": [ + "English" + ], + "tasks": [ + "Text generation", + "Following verifiable instructions (e.g., word counts, formatting rules, keyword mentions)" + ] + }, + "slices": [], + "metrics": [ + { + "key": "ifeval_strict_acc", + "display_name": "IFEval Strict Acc", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "mmlu_pro", + "display_name": "MMLU-Pro", + "has_card": true, + "tags": { + "domains": [ + "STEM", + "humanities", + "social sciences", + "math", + "physics", + "chemistry", + "law", + "engineering", + "economics", + "health", + "psychology", + "business", + "biology", + "philosophy", + "computer science", + "history" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering across a broad range of subjects" + ] + }, + "slices": [], + "metrics": [ + { + "key": "cot_correct", + "display_name": "COT correct", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "omni_math", + "display_name": "Omni-MATH", + "has_card": true, + "tags": { + "domains": [ + "math", 
+ "olympiads" + ], + "languages": [ + "English" + ], + "tasks": [ + "Solving Olympiad-level mathematical problems", + "Solving competition-level mathematical problems", + "Rule-based evaluation on a filtered subset of problems (Omni-MATH-Rule)" + ] + }, + "slices": [], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "wildbench", + "display_name": "WildBench", + "has_card": true, + "tags": { + "domains": [ + "Info Seeking", + "Math & Data", + "Reasoning & Planning", + "Creative Tasks" + ], + "languages": [ + "English" + ], + "tasks": [ + "Open-ended text generation in response to diverse user queries" + ] + }, + "slices": [], + "metrics": [ + { + "key": "wb_score", + "display_name": "WB Score", + "sources": [ + "evaluation_description" + ] + } + ] + } + ] + }, + { + "key": "helm_classic", + "display_name": "Helm classic", + "has_card": true, + "tags": { + "domains": [ + "natural language understanding", + "reading comprehension", + "natural language inference", + "machine learning fairness", + "bias measurement", + "toxic comment classification", + "text classification", + "summarization", + "journalism", + "news media", + "commonsense reasoning", + "STEM", + "humanities", + "social sciences", + "question answering", + "dialogue modeling", + "text generation" + ], + "languages": [ + "English" + ], + "tasks": [ + "Yes/no question answering", + "Text-pair classification", + "Binary toxicity classification (toxic vs. 
non-toxic)", + "Analysis of performance across identity subgroups", + "Summarization", + "Four-way multiple-choice selection for event continuation", + "Commonsense inference", + "Multiple-choice question answering", + "Extractive question answering", + "Text generation", + "Fill mask" + ] + }, + "category": "general", + "benchmarks": [ + { + "key": "boolq", + "display_name": "BoolQ", + "has_card": true, + "tags": { + "domains": [ + "natural language understanding", + "reading comprehension", + "natural language inference" + ], + "languages": [ + "English" + ], + "tasks": [ + "Yes/no question answering", + "Text-pair classification" + ] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "civilcomments", + "display_name": "CivilComments", + "has_card": true, + "tags": { + "domains": [ + "machine learning fairness", + "bias measurement", + "toxic comment classification", + "text classification" + ], + "languages": [ + "English" + ], + "tasks": [ + "Binary toxicity classification (toxic vs. 
non-toxic)", + "Analysis of performance across identity subgroups" + ] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "helm_classic", + "display_name": "Classic", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [ + { + "key": "mean", + "display_name": "Mean", + "metrics": [ + { + "key": "win_rate", + "display_name": "Win Rate", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + } + ], + "metrics": [] + }, + { + "key": "cnn_dailymail", + "display_name": "CNN/DailyMail", + "has_card": true, + "tags": { + "domains": [ + "summarization", + "journalism", + "news media" + ], + "languages": [ + "English" + ], + "tasks": [ + "Summarization" + ] + }, + "slices": [], + "metrics": [ + { + "key": "rouge_2", + "display_name": "ROUGE-2", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "hellaswag", + "display_name": "HellaSwag", + "has_card": true, + "tags": { + "domains": [ + "commonsense reasoning", + "natural language inference" + ], + "languages": [ + "English" + ], + "tasks": [ + "Four-way multiple-choice selection for event continuation", + "Commonsense inference" + ] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "imdb", + "display_name": "IMDB", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "mmlu", + "display_name": "MMLU", + "has_card": true, + "tags": { + "domains": [ + "STEM", + "humanities", + "social sciences" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering" + ] + }, + "slices": [], + "metrics": [ + { + 
"key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "ms_marco_trec", + "display_name": "MS MARCO (TREC)", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "ndcg_10", + "display_name": "NDCG@10", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "narrativeqa", + "display_name": "NarrativeQA", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "f1", + "display_name": "F1", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "naturalquestions_open_book", + "display_name": "NaturalQuestions (open-book)", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "f1", + "display_name": "F1", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "openbookqa", + "display_name": "OpenbookQA", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "quac", + "display_name": "QuAC", + "has_card": true, + "tags": { + "domains": [ + "question answering", + "dialogue modeling", + "text generation" + ], + "languages": [ + "English" + ], + "tasks": [ + "Extractive question answering", + "Text generation", + "Fill mask" + ] + }, + "slices": [], + "metrics": [ + { + "key": "f1", + "display_name": "F1", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "raft", + "display_name": "RAFT", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" 
+ ] + } + ] + }, + { + "key": "truthfulqa", + "display_name": "TruthfulQA", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "xsum", + "display_name": "XSUM", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "rouge_2", + "display_name": "ROUGE-2", + "sources": [ + "evaluation_description" + ] + } + ] + } + ] + }, + { + "key": "helm_instruct", + "display_name": "Helm instruct", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "general", + "benchmarks": [ + { + "key": "anthropic_rlhf_dataset", + "display_name": "Anthropic RLHF dataset", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "harmlessness", + "display_name": "Harmlessness", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "best_chatgpt_prompts", + "display_name": "Best ChatGPT Prompts", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "harmlessness", + "display_name": "Harmlessness", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "helm_instruct", + "display_name": "Instruct", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [ + { + "key": "mean", + "display_name": "Mean", + "metrics": [ + { + "key": "win_rate", + "display_name": "Win Rate", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + } + ], + "metrics": [] + }, + { + "key": "koala_test_dataset", + "display_name": "Koala test dataset", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + 
{ + "key": "harmlessness", + "display_name": "Harmlessness", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "open_assistant", + "display_name": "Open Assistant", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "harmlessness", + "display_name": "Harmlessness", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "self_instruct", + "display_name": "Self Instruct", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "harmlessness", + "display_name": "Harmlessness", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "vicuna", + "display_name": "Vicuna", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "harmlessness", + "display_name": "Harmlessness", + "sources": [ + "evaluation_description" + ] + } + ] + } + ] + }, + { + "key": "helm_lite", + "display_name": "Helm lite", + "has_card": true, + "tags": { + "domains": [ + "grade school mathematics", + "math word problems", + "legal", + "law", + "finance", + "medical knowledge", + "professional medical exams", + "STEM", + "humanities", + "social sciences" + ], + "languages": [ + "English" + ], + "tasks": [ + "Solving grade school math word problems", + "Text generation for question answering", + "Text classification", + "Question answering", + "Text generation", + "Rule-application tasks", + "Free-form multiple-choice question answering", + "Open-domain question answering", + "Multiple-choice question answering" + ] + }, + "category": "general", + "benchmarks": [ + { + "key": "gsm8k", + "display_name": "GSM8K", + "has_card": true, + "tags": { + "domains": [ + "grade school mathematics", + "math word problems" + ], + "languages": [ + "English" + ], + "tasks": [ + "Solving grade school math word problems", + "Text 
generation for question answering" + ] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "legalbench", + "display_name": "LegalBench", + "has_card": true, + "tags": { + "domains": [ + "legal", + "law", + "finance" + ], + "languages": [ + "English" + ], + "tasks": [ + "Text classification", + "Question answering", + "Text generation", + "Rule-application tasks" + ] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "helm_lite", + "display_name": "Lite", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [ + { + "key": "mean", + "display_name": "Mean", + "metrics": [ + { + "key": "win_rate", + "display_name": "Win Rate", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + } + ], + "metrics": [] + }, + { + "key": "math", + "display_name": "MATH", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "score_details" + ] + } + ] + }, + { + "key": "medqa", + "display_name": "MedQA", + "has_card": true, + "tags": { + "domains": [ + "medical knowledge", + "professional medical exams" + ], + "languages": [ + "English" + ], + "tasks": [ + "Free-form multiple-choice question answering", + "Open-domain question answering" + ] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "mmlu", + "display_name": "MMLU", + "has_card": true, + "tags": { + "domains": [ + "STEM", + "humanities", + "social sciences" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering" + ] + }, + "slices": [], + "metrics": [ + { + "key": 
"exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "narrativeqa", + "display_name": "NarrativeQA", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "f1", + "display_name": "F1", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "naturalquestions_closed_book", + "display_name": "NaturalQuestions (closed-book)", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "f1", + "display_name": "F1", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "openbookqa", + "display_name": "OpenbookQA", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "wmt_2014", + "display_name": "WMT 2014", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "bleu_4", + "display_name": "BLEU-4", + "sources": [ + "evaluation_description" + ] + } + ] + } + ] + }, + { + "key": "helm_mmlu", + "display_name": "Helm mmlu", + "has_card": true, + "tags": { + "domains": [ + "STEM", + "humanities", + "social sciences" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering" + ] + }, + "category": "reasoning", + "slices": [ + { + "key": "abstract_algebra", + "display_name": "Abstract Algebra", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "anatomy", + "display_name": "Anatomy", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": 
"astronomy", + "display_name": "Astronomy", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "business_ethics", + "display_name": "Business Ethics", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "clinical_knowledge", + "display_name": "Clinical Knowledge", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "college_physics", + "display_name": "College Physics", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "computer_security", + "display_name": "Computer Security", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "conceptual_physics", + "display_name": "Conceptual Physics", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "econometrics", + "display_name": "Econometrics", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "electrical_engineering", + "display_name": "Electrical Engineering", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "elementary_mathematics", + "display_name": "Elementary Mathematics", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "formal_logic", + "display_name": "Formal Logic", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + 
"sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "global_facts", + "display_name": "Global Facts", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "high_school_world_history", + "display_name": "High School World History", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "human_sexuality", + "display_name": "Human Sexuality", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "international_law", + "display_name": "International Law", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "jurisprudence", + "display_name": "Jurisprudence", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "logical_fallacies", + "display_name": "Logical Fallacies", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "machine_learning", + "display_name": "Machine Learning", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "management", + "display_name": "Management", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "marketing", + "display_name": "Marketing", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "mean", + "display_name": "Mean", + "metrics": [ + { + "key": "win_rate", + "display_name": 
"Win Rate", + "sources": [ + "evaluation_name_suffix" + ] + } + ] + }, + { + "key": "medical_genetics", + "display_name": "Medical Genetics", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "miscellaneous", + "display_name": "Miscellaneous", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "mmlu_all_subjects", + "display_name": "MMLU All Subjects", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "moral_scenarios", + "display_name": "Moral Scenarios", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "nutrition", + "display_name": "Nutrition", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "philosophy", + "display_name": "Philosophy", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "prehistory", + "display_name": "Prehistory", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "professional_psychology", + "display_name": "Professional Psychology", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "public_relations", + "display_name": "Public Relations", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "security_studies", + "display_name": "Security Studies", + "metrics": [ + { + "key": 
"exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "sociology", + "display_name": "Sociology", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "us_foreign_policy", + "display_name": "Us Foreign Policy", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "virology", + "display_name": "Virology", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "world_religions", + "display_name": "World Religions", + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + } + ], + "metrics": [] + } + ] + }, + { + "key": "hfopenllm_v2", + "display_name": "Hfopenllm v2", + "has_card": true, + "tags": { + "domains": [ + "biology", + "physics", + "chemistry", + "open domain qa", + "open book qa", + "multiple choice qa", + "instruction following", + "mathematics", + "explanation generation", + "STEM", + "humanities", + "social sciences", + "math", + "law", + "engineering", + "economics", + "health", + "psychology", + "business", + "philosophy", + "computer science", + "history", + "reasoning", + "commonsense reasoning", + "planning" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering", + "Question answering", + "Text generation", + "Following verifiable instructions (e.g., word counts, formatting rules, keyword mentions)", + "Mathematical problem solving", + "Step-by-step solution generation", + "Final answer generation", + "Multiple-choice question answering across a broad range of subjects", + "Solving murder mysteries", + "Solving object placement problems", + "Solving team allocation problems" + ] + }, + "category": 
"instruction_following", + "standalone_benchmarks": [], + "composites": [], + "benchmarks": [ + { + "key": "bbh", + "display_name": "BBH", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "gpqa", + "display_name": "GPQA", + "has_card": true, + "tags": { + "domains": [ + "biology", + "physics", + "chemistry", + "open domain qa", + "open book qa", + "multiple choice qa" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering", + "Question answering", + "Text generation" + ] + }, + "slices": [], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "ifeval", + "display_name": "IFEval", + "has_card": true, + "tags": { + "domains": [ + "instruction following" + ], + "languages": [ + "English" + ], + "tasks": [ + "Text generation", + "Following verifiable instructions (e.g., word counts, formatting rules, keyword mentions)" + ] + }, + "slices": [], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "math_level_5", + "display_name": "MATH Level 5", + "has_card": true, + "tags": { + "domains": [ + "mathematics", + "explanation generation" + ], + "languages": [ + "English" + ], + "tasks": [ + "Mathematical problem solving", + "Step-by-step solution generation", + "Final answer generation" + ] + }, + "slices": [], + "metrics": [ + { + "key": "exact_match", + "display_name": "Exact Match", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "mmlu_pro", + "display_name": "MMLU-PRO", + "has_card": true, + "tags": { + "domains": [ + "STEM", + "humanities", + "social sciences", + "math", + "physics", + "chemistry", + "law", + "engineering", + "economics", + 
"health", + "psychology", + "business", + "biology", + "philosophy", + "computer science", + "history" + ], + "languages": [ + "English" + ], + "tasks": [ + "Multiple-choice question answering across a broad range of subjects" + ] + }, + "slices": [], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "musr", + "display_name": "MUSR", + "has_card": true, + "tags": { + "domains": [ + "reasoning", + "commonsense reasoning", + "planning" + ], + "languages": [ + "English" + ], + "tasks": [ + "Question answering", + "Solving murder mysteries", + "Solving object placement problems", + "Solving team allocation problems" + ] + }, + "slices": [], + "metrics": [ + { + "key": "accuracy", + "display_name": "Accuracy", + "sources": [ + "evaluation_description" + ] + } + ] + } + ] + }, + { + "key": "la_leaderboard", + "display_name": "La leaderboard", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "other", + "standalone_benchmarks": [], + "composites": [], + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "livecodebenchpro", + "display_name": "Livecodebenchpro", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "agentic", + "standalone_benchmarks": [], + "composites": [], + "benchmarks": [ + { + "key": "easy_problems", + "display_name": "Easy Problems", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "pass_at_1", + "display_name": "Pass@1", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "hard_problems", + "display_name": "Hard Problems", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "pass_at_1", + 
"display_name": "Pass@1", + "sources": [ + "evaluation_description" + ] + } + ] + }, + { + "key": "medium_problems", + "display_name": "Medium Problems", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "pass_at_1", + "display_name": "Pass@1", + "sources": [ + "evaluation_description" + ] + } + ] + } + ] + }, + { + "key": "reward-bench", + "display_name": "Reward Bench", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "safety", + "standalone_benchmarks": [], + "composites": [], + "benchmarks": [ + { + "key": "chat", + "display_name": "Chat", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "chat_hard", + "display_name": "Chat Hard", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "prior_sets_0_5_weight", + "display_name": "Prior Sets (0.5 weight)", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "reasoning", + "display_name": "Reasoning", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "reward_bench", + "display_name": "Reward bench", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name" 
+ ] + } + ] + }, + { + "key": "safety", + "display_name": "Safety", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + } + ] + }, + { + "key": "reward-bench-2", + "display_name": "Reward Bench 2", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "safety", + "standalone_benchmarks": [], + "composites": [], + "benchmarks": [ + { + "key": "factuality", + "display_name": "Factuality", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "focus", + "display_name": "Focus", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "math", + "display_name": "Math", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "precise_if", + "display_name": "Precise IF", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "reward_bench_2", + "display_name": "Reward bench 2", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "evaluation_name" + ] + } + ] + }, + { + "key": "safety", + "display_name": "Safety", + "has_card": false, + "tags": { + "domains": [], + 
"languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "ties", + "display_name": "Ties", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + } + ] + }, + { + "key": "sciarena", + "display_name": "Sciarena", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "other", + "standalone_benchmarks": [], + "composites": [], + "slices": [], + "metrics": [ + { + "key": "cost_per_100_calls", + "display_name": "Cost per 100 Calls", + "sources": [ + "metric_config" + ] + }, + { + "key": "elo", + "display_name": "Elo Rating", + "sources": [ + "metric_config" + ] + }, + { + "key": "rank", + "display_name": "Rank", + "sources": [ + "metric_config" + ] + } + ] + }, + { + "key": "swe-bench", + "display_name": "Swe Bench", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "agentic", + "standalone_benchmarks": [], + "composites": [], + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "tau-bench-2", + "display_name": "Tau Bench 2", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "agentic", + "standalone_benchmarks": [], + "composites": [], + "benchmarks": [ + { + "key": "tau_bench_2_airline", + "display_name": "tau-bench-2/airline", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "tau_bench_2_retail", + "display_name": "tau-bench-2/retail", + "has_card": false, + "tags": { + "domains": [], + 
"languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "tau_bench_2_telecom", + "display_name": "tau-bench-2/telecom", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + } + ] + }, + { + "key": "terminal-bench-2.0", + "display_name": "Terminal Bench 2 0", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "agentic", + "standalone_benchmarks": [], + "composites": [], + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "theory_of_mind", + "display_name": "Theory of mind", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "other", + "standalone_benchmarks": [], + "composites": [], + "slices": [], + "metrics": [ + { + "key": "score", + "display_name": "Score", + "sources": [ + "fallback" + ] + } + ] + }, + { + "key": "wordle_arena", + "display_name": "Wordle arena", + "has_card": false, + "tags": { + "domains": [], + "languages": [], + "tasks": [] + }, + "category": "other", + "standalone_benchmarks": [], + "composites": [], + "slices": [], + "metrics": [ + { + "key": "average_attempts", + "display_name": "Average Attempts", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + }, + { + "key": "average_latency_ms", + "display_name": "Average Latency (ms)", + "sources": [ + "evaluation_name_suffix" + ] + }, + { + "key": "win_rate", + "display_name": "Win Rate", + "sources": [ + "evaluation_name_suffix", + "metric_config" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/misc/eval_hierarchy.md b/misc/eval_hierarchy.md new file mode 100644 index 000000000..d6005f3c8 --- /dev/null +++ 
b/misc/eval_hierarchy.md @@ -0,0 +1,459 @@ +# EEE Eval Hierarchy + +## QA Summary +- Families: `20` +- Composite benchmarks: `20` +- Standalone benchmarks: `10` +- Benchmarks: `108` +- Slices: `58` +- Unique metrics: `208` +- Metric rows scanned: `41616` +- Fallback metrics: `2231` +- Benchmarks that still look metric-like: `0` +- Benchmarks where name matches the only metric: `0` + +### Fallback Metrics +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `appworld` -> `appworld/test_normal` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `browsecompplus` -> `browsecompplus` -> `Score` +- `la_leaderboard` -> `La leaderboard` -> `Score` +- 
`la_leaderboard` -> `La leaderboard` -> `Score` +- `la_leaderboard` -> `La leaderboard` -> `Score` +- `la_leaderboard` -> `La leaderboard` -> `Score` +- `la_leaderboard` -> `La leaderboard` -> `Score` +- `reward-bench` -> `Chat` -> `Score` +- `reward-bench` -> `Chat Hard` -> `Score` +- `reward-bench` -> `Safety` -> `Score` +- `reward-bench` -> `Reasoning` -> `Score` +- `reward-bench` -> `Prior Sets (0.5 weight)` -> `Score` +- `reward-bench` -> `Chat` -> `Score` +- `reward-bench` -> `Chat Hard` -> `Score` +- `reward-bench` -> `Safety` -> `Score` +- `reward-bench` -> `Reasoning` -> `Score` +- `reward-bench` -> `Prior Sets (0.5 weight)` -> `Score` +- `reward-bench` -> `Chat` -> `Score` +- `reward-bench` -> `Chat Hard` -> `Score` +- `reward-bench` -> `Safety` -> `Score` +- `reward-bench` -> `Reasoning` -> `Score` +- `reward-bench` -> `Prior Sets (0.5 weight)` -> `Score` + +## Hierarchy + +- [ ] Ace + - [ ] DIY + - Score + - [ ] Food + - Score + - [ ] Gaming + - Score + - [ ] Overall + - Score + - [ ] Shopping + - Score +- [ ] Apex + - [ ] Apex Agents + - [ ] Corporate Law + - Pass@1 + - [ ] Corporate Lawyer + - Mean Score + - [ ] Investment Banking + - Pass@1 + - [ ] Management Consulting + - Pass@1 + - [ ] Overall + - Mean Score + - Pass@1 + - Pass@8 + - [ ] Apex V1 + - [ ] Big Law + - Score + - [ ] Consulting + - Score + - [ ] Investment Banking + - Score + - [ ] Medicine (MD) + - Score + - [ ] Overall + - Score +- [ ] Appworld + - Score +- [ ] Arc Agi + - [ ] v1_Public_Eval + - Cost per Task + - Score + - [ ] v1_Semi_Private + - Cost per Task + - Score + - [ ] v2_Private_Eval + - Cost per Task + - Score + - [ ] v2_Public_Eval + - Cost per Task + - Score + - [ ] v2_Semi_Private + - Cost per Task + - Score + - [ ] v3_Semi_Private + - Cost + - Score +- [ ] Bfcl + - [ ] Format sensitivity + - Format Sensitivity Max Delta + - Format Sensitivity Standard Deviation + - [ ] Live + - Live accuracy + - Live multiple AST accuracy + - Live parallel AST accuracy + - Live 
parallel multiple AST accuracy + - Live simple AST accuracy + - [ ] Memory + - Accuracy + - Memory KV accuracy + - Memory recursive summarization accuracy + - Memory vector accuracy + - [ ] Multi turn + - Accuracy + - Multi-turn base accuracy + - Multi-turn long-context accuracy + - Multi-turn missing function accuracy + - Multi-turn missing parameter accuracy + - [ ] Non live + - Non-live AST accuracy + - Non-live multiple AST accuracy + - Non-live parallel AST accuracy + - Non-live parallel multiple AST accuracy + - Non-live simple AST accuracy + - [ ] Overall + - Latency 95th Percentile + - Latency Mean + - Latency Standard Deviation + - Overall Accuracy + - Rank + - Total Cost + - [ ] Relevance + - Irrelevance detection accuracy + - Relevance detection accuracy + - [ ] Web search + - Accuracy + - Multi-turn base accuracy + - Web-search no-snippet accuracy +- [ ] browsecompplus + - Score +- [ ] Fibble + - [ ] Fibble arena + - Average Attempts + - Average Latency (ms) + - Win Rate + - [ ] Fibble1 arena + - Average Attempts + - Win Rate + - [ ] Fibble2 arena + - Average Attempts + - Average Latency (ms) + - Win Rate + - [ ] Fibble3 arena + - Average Attempts + - Average Latency (ms) + - Win Rate + - [ ] Fibble4 arena + - Average Attempts + - Average Latency (ms) + - Win Rate + - [ ] Fibble5 arena + - Average Attempts + - Average Latency (ms) + - Win Rate +- [ ] Global MMLU Lite + - Arabic + - Accuracy + - Bengali + - Accuracy + - Burmese + - Accuracy + - Chinese + - Accuracy + - Culturally Agnostic + - Accuracy + - Culturally Sensitive + - Accuracy + - English + - Accuracy + - French + - Accuracy + - German + - Accuracy + - Hindi + - Accuracy + - Indonesian + - Accuracy + - Italian + - Accuracy + - Japanese + - Accuracy + - Korean + - Accuracy + - Portuguese + - Accuracy + - Spanish + - Accuracy + - Swahili + - Accuracy + - Yoruba + - Accuracy + - Accuracy +- [x] HELM + - [x] Helm capabilities + - [ ] Capabilities + - Mean + - Score + - [x] GPQA + - COT correct + 
- [x] IFEval + - IFEval Strict Acc + - [x] MMLU-Pro + - COT correct + - [x] Omni-MATH + - Accuracy + - [x] WildBench + - WB Score + - [x] Helm classic + - [x] BoolQ + - Exact Match + - [x] CivilComments + - Exact Match + - [ ] Classic + - Mean + - Win Rate + - [x] CNN/DailyMail + - ROUGE-2 + - [x] HellaSwag + - Exact Match + - [ ] IMDB + - Exact Match + - [x] MMLU + - Exact Match + - [ ] MS MARCO (TREC) + - NDCG@10 + - [ ] NarrativeQA + - F1 + - [ ] NaturalQuestions (open-book) + - F1 + - [ ] OpenbookQA + - Exact Match + - [x] QuAC + - F1 + - [ ] RAFT + - Exact Match + - [ ] TruthfulQA + - Exact Match + - [ ] XSUM + - ROUGE-2 + - [ ] Helm instruct + - [ ] Anthropic RLHF dataset + - Harmlessness + - [ ] Best ChatGPT Prompts + - Harmlessness + - [ ] Instruct + - Mean + - Win Rate + - [ ] Koala test dataset + - Harmlessness + - [ ] Open Assistant + - Harmlessness + - [ ] Self Instruct + - Harmlessness + - [ ] Vicuna + - Harmlessness + - [x] Helm lite + - [x] GSM8K + - Exact Match + - [x] LegalBench + - Exact Match + - [ ] Lite + - Mean + - Win Rate + - [ ] MATH + - Accuracy + - [x] MedQA + - Exact Match + - [x] MMLU + - Exact Match + - [ ] NarrativeQA + - F1 + - [ ] NaturalQuestions (closed-book) + - F1 + - [ ] OpenbookQA + - Exact Match + - [ ] WMT 2014 + - BLEU-4 + - [x] Helm mmlu + - Abstract Algebra + - Exact Match + - Anatomy + - Exact Match + - Astronomy + - Exact Match + - Business Ethics + - Exact Match + - Clinical Knowledge + - Exact Match + - College Physics + - Exact Match + - Computer Security + - Exact Match + - Conceptual Physics + - Exact Match + - Econometrics + - Exact Match + - Electrical Engineering + - Exact Match + - Elementary Mathematics + - Exact Match + - Formal Logic + - Exact Match + - Global Facts + - Exact Match + - High School World History + - Exact Match + - Human Sexuality + - Exact Match + - International Law + - Exact Match + - Jurisprudence + - Exact Match + - Logical Fallacies + - Exact Match + - Machine Learning + - Exact Match + 
- Management + - Exact Match + - Marketing + - Exact Match + - Mean + - Win Rate + - Medical Genetics + - Exact Match + - Miscellaneous + - Exact Match + - MMLU All Subjects + - Exact Match + - Moral Scenarios + - Exact Match + - Nutrition + - Exact Match + - Philosophy + - Exact Match + - Prehistory + - Exact Match + - Professional Psychology + - Exact Match + - Public Relations + - Exact Match + - Security Studies + - Exact Match + - Sociology + - Exact Match + - Us Foreign Policy + - Exact Match + - Virology + - Exact Match + - World Religions + - Exact Match +- [x] Hfopenllm v2 + - [ ] BBH + - Accuracy + - [x] GPQA + - Accuracy + - [x] IFEval + - Accuracy + - [x] MATH Level 5 + - Exact Match + - [x] MMLU-PRO + - Accuracy + - [x] MUSR + - Accuracy +- [ ] La leaderboard + - Score +- [ ] Livecodebenchpro + - [ ] Easy Problems + - Pass@1 + - [ ] Hard Problems + - Pass@1 + - [ ] Medium Problems + - Pass@1 +- [ ] Reward Bench + - [ ] Chat + - Score + - [ ] Chat Hard + - Score + - [ ] Prior Sets (0.5 weight) + - Score + - [ ] Reasoning + - Score + - [ ] Reward bench + - Score + - [ ] Safety + - Score +- [ ] Reward Bench 2 + - [ ] Factuality + - Score + - [ ] Focus + - Score + - [ ] Math + - Score + - [ ] Precise IF + - Score + - [ ] Reward bench 2 + - Score + - [ ] Safety + - Score + - [ ] Ties + - Score +- [ ] Sciarena + - Cost per 100 Calls + - Elo Rating + - Rank +- [ ] Swe Bench + - Score +- [ ] Tau Bench 2 + - [ ] tau-bench-2/airline + - Score + - [ ] tau-bench-2/retail + - Score + - [ ] tau-bench-2/telecom + - Score +- [ ] Terminal Bench 2 0 + - Score +- [ ] Theory of mind + - Score +- [ ] Wordle arena + - Average Attempts + - Average Latency (ms) + - Win Rate diff --git a/plan/backend-canonical-identity-plan.md b/plan/backend-canonical-identity-plan.md new file mode 100644 index 000000000..3cd00e9c5 --- /dev/null +++ b/plan/backend-canonical-identity-plan.md @@ -0,0 +1,115 @@ +# Backend Canonical Identity Plan (Data Audit + Actions) + +## Snapshot audited + +- 
**Code repo (`~/every_eval_ever`)** updated to `aa966f7cf` (origin/main). +- **Datastore (`evaleval/EEE_datastore`)** updated to `5edc7b9`. +- Audit scope: all aggregate JSON files under `data/**` (`6448` files, `49659` evaluation results). + +## What is happening (evidence from latest data) + +1. **Metric identity is mostly missing in production data** + - `metric_config.metric_name` missing in **37071 / 49659** results. + - `metric_config.metric_id` missing in **37071 / 49659** results. + - This is concentrated in major configs: `hfopenllm_v2`, `helm_*`, `reward-bench`, `global-mmlu-lite`, `fibble_arena`, `wordle_arena`, `terminal-bench-2.0`, etc. + - Concrete live examples: + - `global-mmlu-lite/xai_grok-3-mini/1773936496.366405` has **19** results and **0 / 19** populated `metric_name` or `metric_id` fields; the only explicit labels are `evaluation_name` values such as `Global MMLU Lite`, `Culturally Sensitive`, `Arabic`, `English`, etc. + - `wordle_arena/qwen/qwen3-8b/1776347262.820056` has **3** results and **0 / 3** populated `metric_name` or `metric_id` fields. + - Backend implication: cannot reliably group/compare metrics without string parsing heuristics. + +2. **`evaluation_name` is frequently carrying metric semantics** + - **615** results have metric-like `evaluation_name`. + - Confirmed examples: + - `apex-agents`: `evaluation_name: "Overall Pass@1"` (metric semantics in eval field). + - `bfcl`: `evaluation_name: "bfcl.memory.accuracy"` while metric fields are also populated (eval and metric axes collapsed). + - `theory_of_mind`: `evaluation_name: "accuracy on theory_of_mind for scorer ..."` (legacy converter style). + - `wordle_arena/qwen/qwen3-8b/1776347262.820056`: `evaluation_name` values are `wordle_arena_win_rate`, `wordle_arena_avg_attempts`, and `wordle_arena_avg_latency_ms`, so the eval axis is fully metric-shaped. 
+ - `global-mmlu-lite/xai_grok-3-mini/1773936496.366405`: `evaluation_name` is used for suite/slice labels (`Global MMLU Lite`, `Arabic`, `French`, etc.) while the implicit metric remains unstated, so eval and metric identity are still entangled even though the names are not metric-like. + - Backend implication: card grouping by evaluation name produces metric-shaped “benchmarks”. + +3. **`score_details.details` is overloaded as a nested telemetry dump** + - Found **52208** JSON-encoded values stored as strings inside `score_details.details`. + - HELM MMLU example (`Abstract Algebra` / `College Physics`) contains many cross-subject entries (e.g., College Chemistry/Biology stats inside College Physics row), mixing eval slice + telemetry dimensions. + - Backend implication: requires expensive post-parsing and risks accidental interpretation as benchmark/metric labels. + +4. **Benchmark/evaluation_id naming is not consistently aligned** + - **257** files where `evaluation_id` prefix does not match top-level folder benchmark codename. + - Main cases: + - `reward-bench` folder vs `evaluation_id` prefix `reward-bench-2`. + - `tau-bench-2_{domain}` and `appworld_test_normal` folders vs `evaluation_id` prefixes with hierarchical paths (`tau-bench-2/...`, `appworld/...`). + - Backend implication: any logic keyed on only one naming source (folder or `evaluation_id`) drifts. + +5. **Eval library naming is not standardized** + - **16 distinct `eval_library.name` values** including mixed casing and source-specific names (`lm-evaluation-harness`, `BFCL`, `Artificial Analysis`, `ARC Prize leaderboard`, `harbor`, `unknown`, etc.). + - Backend implication: harness-level analytics and joins need alias normalization today. + +6. **Fibble family note** + - Current snapshot no longer has `fibble1_arena`, `fibble2_arena` top-level folders; it is consolidated as `fibble_arena`. 
+ - But fibble still encodes both slice and metric in `evaluation_name` (`fibble_arena_1lie_win_rate`, `...avg_attempts`), with missing metric IDs. + +7. **`detailed_evaluation_results` coverage can be metric-selective inside one aggregate run** + - Current live example: `wordle_arena/qwen/qwen3-8b/1776347262.820056` exposes **3** aggregate metrics (`win_rate`, `avg_attempts`, `avg_latency_ms`) and links one sample file with **35** rows. + - All **35 / 35** current sample rows in `9a357c44-1c36-43dc-a764-de1f3e204fe1_samples.jsonl` carry `evaluation_name = "wordle_arena_win_rate"`. + - The same aggregate currently declares `detailed_evaluation_results.total_rows = 27`, so file-link metadata and actual sample-row counts can already disagree in production. + - Backend implication: a linked sample file does not imply run-wide instance coverage. Aggregate-to-instance linkage must remain metric-scoped, and instance-availability badges should be computed per metric or per eval-summary node, not per run. + +## Backend-centric recommendations (proposed) + +1. **Enforce canonical identity at ingestion (hard)** + - Persist canonical tuple (backend-owned): + `(run_id, model_id, benchmark_family_id, eval_slice_id, metric_id, harness_id, result_index)`. + - Keep raw fields in parallel (`raw_evaluation_name`, `raw_metric_description`, etc.) for audit/debug. + +2. **Add registry-backed resolution with confidence** + - Resolve benchmark/eval-slice/metric/harness via registry aliases (`exact`, `normalized`, `fuzzy`, `manual`). + - Store `strategy`, `confidence`, `review_status`; quarantine low-confidence rows from card generation. + +3. **Add semantic validation gates in ingestion CI** + - Reject or flag: + - metric-like `evaluation_name` without explicit metric identity, + - `evaluation_name == metric_name` collisions, + - benchmark-family naming drift (`folder` vs `evaluation_id` inconsistencies). 
+ - linked sample files whose rows cover only a strict subset of aggregate metrics without explicit metric-scoped coverage metadata. + - linked sample files whose observed row count disagrees with declared `detailed_evaluation_results.total_rows`. + - Keep structural schema validation, but add these semantic checks as a second gate. + +4. **Phase-in stricter schema usage for metrics** + - Immediate: warn-only for missing `metric_name`/`metric_id`. + - Next: soft fail in bot with override. + - Final: hard fail (for new submissions) unless `metric_name` + `metric_id` present. + +5. **Serve frontend from canonical IDs only** + - Frontend card grouping/filtering must use canonical IDs, never raw labels. + - Raw labels are display metadata only. + - Instance availability must be attached to canonical metric/eval-summary IDs, not inferred from the existence of any `detailed_evaluation_results` file on the parent run. + - This prevents recurring “benchmark cards that are actually metrics”. + +## Should we fix adapters and regenerate data? + +**Short answer: yes, but only for adapter-owned benchmark families.** + +### Good candidates for adapter-fix + regenerate + +Adapters exist in `utils/` for: +- `hfopenllm_v2` +- `helm` (`helm_lite`, `helm_mmlu`, `helm_capabilities`, `helm_classic`, `helm_instruct`) +- `rewardbench` +- `global-mmlu-lite` +- `terminal_bench_2` +- `exgentic` (used by tau/appworld/swe/browsecompplus in this dataset) + +These are high-leverage because they account for a large share of missing metric identity. + +### Not fully solved by adapter regeneration alone + +Several benchmark families in the data are not obviously sourced from current `utils/` adapters (or are manually/externally produced), including: +- `apex-agents`, `apex-v1`, `bfcl`, `artificial-analysis-llms`, `arc-agi`, `sciarena`, `fibble_arena`, `wordle_arena`, `ace`, `la_leaderboard`.
+
+For these, you need a **backfill canonicalization migration** + submission template updates, not only adapter patches.
+
+### Practical plan
+
+1. Patch adapters to emit explicit `metric_name` + `metric_id` and metric-free `evaluation_name`.
+2. Regenerate adapter-owned families in a controlled replay branch.
+3. Run one-time migration for non-adapter/manual families.
+4. Turn on semantic gating and canonical-ID-only serving.
diff --git a/scripts/plot_dataset_statistics.py b/scripts/plot_dataset_statistics.py
new file mode 100644
index 000000000..e2eb04f13
--- /dev/null
+++ b/scripts/plot_dataset_statistics.py
@@ -0,0 +1,770 @@
+"""Generate PDF plots from dataset statistics JSON."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import statistics
+import textwrap
+from pathlib import Path
+from typing import Any
+
+# Output file name for each generated plot.
+PLOT_FILES = {
+    'coverage': 'coverage_counts.pdf',
+    'quality': 'normalization_quality.pdf',
+    'top_coverage': 'top_evaluation_coverage.pdf',
+    'mean': 'normalized_score_mean_by_eval.pdf',
+    'variability': 'normalized_score_variability.pdf',
+    'range': 'score_range_by_eval.pdf',
+    'models_per_dataset': 'models_per_dataset_histogram.pdf',
+    'engine_spread': 'inference_engine_spread.pdf',
+    'writeup_overview': 'writeup_dataset_statistics_overview.pdf',
+}
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the script's command-line arguments."""
+    parser = argparse.ArgumentParser(
+        description='Generate dataset-statistics PDF plots.'
+    )
+    parser.add_argument(
+        '--input',
+        type=Path,
+        default=Path('audit/dataset_statistics.json'),
+        help='Path to dataset_statistics.json.',
+    )
+    parser.add_argument(
+        '--output-dir',
+        type=Path,
+        default=Path('audit/dataset_statistics_plots'),
+        help='Directory for generated PDF plots.',
+    )
+    parser.add_argument(
+        '--top-n',
+        type=int,
+        default=25,
+        help='Number of evaluation rows to show in ranked plots.',
+    )
+    return parser.parse_args()
+
+
+def load_statistics(path: Path) -> dict[str, Any]:
+    """Read the statistics JSON document at *path*."""
+    with path.open(encoding='utf-8') as handle:
+        return json.load(handle)
+
+
+def import_plotting() -> tuple[Any, Any | None]:
+    """Import matplotlib (required) and seaborn (optional) lazily.
+
+    Returns ``(plt, sns)`` where ``sns`` is ``None`` when seaborn is not
+    installed. Exits via ``SystemExit`` when matplotlib is missing.
+    """
+    try:
+        import matplotlib.pyplot as plt
+    except ModuleNotFoundError as exc:
+        raise SystemExit(
+            'matplotlib is required to generate plots. Install matplotlib '
+            'or seaborn in the active environment and rerun this script.'
+        ) from exc
+
+    try:
+        import seaborn as sns
+    except ModuleNotFoundError:
+        sns = None
+
+    if sns is not None:
+        sns.set_theme(style='whitegrid', context='talk')
+    else:
+        plt.style.use('ggplot')
+    return plt, sns
+
+
+def label(row: dict[str, Any]) -> str:
+    """Build a display label from a row's benchmark, evaluation and metric.
+
+    The evaluation name is omitted when it equals the benchmark name, and
+    the metric (``metric_id``, else ``metric_name``) is appended in square
+    brackets when present.
+    """
+    benchmark = str(row['benchmark'])
+    evaluation = str(row['evaluation_name'])
+    metric = row.get('metric_id') or row.get('metric_name')
+    if benchmark == evaluation:
+        base = benchmark
+    else:
+        base = f'{benchmark}: {evaluation}'
+    if metric:
+        return f'{base} [{metric}]'
+    return base
+
+
+def short_label(value: str, width: int = 46) -> str:
+    """Collapse whitespace and truncate *value* to at most *width* chars."""
+    return textwrap.shorten(value, width=width, placeholder='...')
+
+
+def wrapped_label(value: str, width: int = 16) -> str:
+    """Wrap *value* onto lines of about *width* chars at word boundaries."""
+    return '\n'.join(textwrap.wrap(value, width=width, break_long_words=False))
+
+
+def columns(
+    rows: list[dict[str, Any]], keys: tuple[str, ...]
+) -> dict[str, list[Any]]:
+    """Pivot row dicts into a column-oriented mapping for the given keys."""
+    return {key: [row[key] for row in rows] for key in keys}
+
+
+def save(fig: Any, path: Path) -> None:
+    """Tighten the layout of *fig* and write it to *path* as a PDF."""
+    fig.tight_layout()
+    fig.savefig(path, format='pdf', bbox_inches='tight')
+
+
+def plot_coverage_counts(
+    stats: dict[str, Any], output_dir: Path, plt: Any, sns: Any | None
+) -> Path:
+    """Plot headline corpus counts as a log-scale bar chart."""
+    counts = stats['descriptive']['counts']
+    order = [
+        'result_rows',
+        'unique_models',
+        'unique_developers',
+        'unique_evaluations',
+        'unique_benchmarks',
+    ]
+    rows = [
+        {'metric': key.replace('_', ' ').title(), 'count': counts[key]}
+        for key in order
+    ]
+
+    fig, ax = plt.subplots(figsize=(10, 5.5))
+    if sns is not None:
+        sns.barplot(
+            data=columns(rows, ('metric', 'count')),
+            x='metric',
+            y='count',
+            hue='metric',
+            ax=ax,
+            legend=False,
+        )
+    else:
+        ax.bar([row['metric'] for row in rows], [row['count'] for row in rows])
+    ax.set_yscale('log')
+    ax.set_xlabel('')
+    ax.set_ylabel('Count, log scale')
+    ax.set_title('Dataset Coverage')
+    ax.tick_params(axis='x', rotation=25)
+    for index, row in enumerate(rows):
+        ax.text(
+            index, row['count'], f'{row["count"]:,}', ha='center', va='bottom'
+        )
+
+    path = output_dir / PLOT_FILES['coverage']
+    save(fig, path)
+    plt.close(fig)
+    return path
+
+
+def plot_normalization_quality(
+    stats: dict[str, Any], output_dir: Path, plt: Any, sns: Any | None
+) -> Path:
+    """Plot valid normalized rows next to each exclusion category."""
+    valid = stats['observational']['valid_normalized_rows']
+    exclusions = stats['observational']['exclusions']
+    rows = [{'category': 'valid normalized rows', 'count': valid}] + [
+        {'category': key.replace('_', ' '), 'count': value}
+        for key, value in exclusions.items()
+    ]
+
+    fig, ax = plt.subplots(figsize=(11, 5.5))
+    if sns is not None:
+        sns.barplot(
+            data=columns(rows, ('category', 'count')),
+            x='category',
+            y='count',
+            hue='category',
+            ax=ax,
+            legend=False,
+        )
+    else:
+        ax.bar(
+            [row['category'] for row in rows], [row['count'] for row in rows]
+        )
+    ax.set_yscale('symlog', linthresh=1)
+    ax.set_xlabel('')
+    ax.set_ylabel('Rows, symmetric log scale')
+    ax.set_title('Normalization Quality')
+    ax.tick_params(axis='x', rotation=25)
+    for index, row in enumerate(rows):
+        ax.text(
+            index,
+            max(row['count'], 1),
+            f'{row["count"]:,}',
+            ha='center',
+            va='bottom',
+        )
+
+    path = output_dir / PLOT_FILES['quality']
+    save(fig, path)
+    plt.close(fig)
+    return path
+
+
+def top_rows(
+    rows: list[dict[str, Any]], key: str, limit: int
+) -> list[dict[str, Any]]:
+    """Return the *limit* rows with the largest *key*, ties by label."""
+    return sorted(rows, key=lambda row: (-float(row[key]), label(row)))[:limit]
+
+
+def plot_top_evaluation_coverage(
+    rows: list[dict[str, Any]],
+    output_dir: Path,
+    plt: Any,
+    sns: Any | None,
+    top_n: int,
+) -> Path:
+    """Plot the evaluations with the most normalized result rows."""
+    selected = list(reversed(top_rows(rows, 'count', top_n)))
+    labels = [short_label(label(row), 58) for row in selected]
+    counts = [row['count'] for row in selected]
+
+    # Size the figure by the bars actually drawn so a large --top-n with few
+    # rows does not produce an oversized, mostly empty page.
+    fig, ax = plt.subplots(figsize=(11, max(6, len(selected) * 0.35)))
+    if sns is not None:
+        sns.barplot(x=counts, y=labels, hue=labels, ax=ax, legend=False)
+    else:
+        ax.barh(labels, counts)
+    ax.set_xlabel('Normalized result rows')
+    ax.set_ylabel('')
+    ax.set_title(f'Top {len(selected)} Evaluations By Coverage')
+
+    path = output_dir / PLOT_FILES['top_coverage']
+    save(fig, path)
+    plt.close(fig)
+    return path
+
+
+def plot_normalized_score_means(
+    rows: list[dict[str, Any]],
+    output_dir: Path,
+    plt: Any,
+    sns: Any | None,
+    top_n: int,
+) -> Path:
+    """Plot the evaluations with the lowest mean normalized score."""
+    selected = sorted(rows, key=lambda row: (float(row['mean']), label(row)))[
+        :top_n
+    ]
+    labels = [short_label(label(row), 58) for row in selected]
+    means = [row['mean'] for row in selected]
+
+    # Height follows the number of bars drawn, not the requested top_n.
+    fig, ax = plt.subplots(figsize=(11, max(6, len(selected) * 0.35)))
+    if sns is not None:
+        sns.barplot(x=means, y=labels, hue=labels, ax=ax, legend=False)
+    else:
+        ax.barh(labels, means)
+    ax.set_xlim(0, 1)
+    ax.set_xlabel('Mean normalized score')
+    ax.set_ylabel('')
+    ax.set_title(f'Lowest {len(selected)} Mean Normalized Scores')
+
+    path = output_dir / PLOT_FILES['mean']
+    save(fig, path)
+    plt.close(fig)
+    return path
+
+
+def plot_score_variability(
+    rows: list[dict[str, Any]], output_dir: Path, plt: Any, sns: Any | None
+) -> Path:
+    """Plot mean versus standard deviation of normalized scores."""
+    fig, ax = plt.subplots(figsize=(10, 7))
+    draw_score_landscape(ax, rows, sns)
+    ax.set_title('Normalized Score Level vs. Variability')
+
+    path = output_dir / PLOT_FILES['variability']
+    save(fig, path)
+    plt.close(fig)
+    return path
+
+
+def draw_score_landscape(
+    ax: Any,
+    rows: list[dict[str, Any]],
+    sns: Any | None,
+    annotation_limit: int = 8,
+) -> None:
+    """Scatter mean against standard deviation for each score summary.
+
+    Rows without a mean are skipped and a missing standard deviation is
+    drawn as 0. Marker size grows with the row count, and the
+    *annotation_limit* points ranked highest by ``stddev * log1p(count)``
+    are labelled.
+    """
+    plot_rows = [
+        {
+            'mean': row['mean'],
+            'stddev': row['stddev'] or 0.0,
+            'count': row['count'],
+            'label': label(row),
+        }
+        for row in rows
+        if row.get('mean') is not None
+    ]
+    if not plot_rows:
+        ax.text(
+            0.5,
+            0.5,
+            'No normalized score summaries available',
+            ha='center',
+            va='center',
+            transform=ax.transAxes,
+        )
+        ax.set_axis_off()
+        return
+
+    if sns is not None:
+        sns.scatterplot(
+            data=columns(plot_rows, ('mean', 'stddev', 'count')),
+            x='mean',
+            y='stddev',
+            size='count',
+            sizes=(45, 500),
+            alpha=0.75,
+            legend=False,
+            ax=ax,
+        )
+    else:
+        # Marker sizes are only needed on this path; ``or 1`` keeps the
+        # scaling defined when every summary reports a zero count.
+        max_count = max(row['count'] for row in plot_rows) or 1
+        sizes = [
+            45 + 455 * math.sqrt(row['count'] / max_count)
+            for row in plot_rows
+        ]
+        ax.scatter(
+            [row['mean'] for row in plot_rows],
+            [row['stddev'] for row in plot_rows],
+            s=sizes,
+            alpha=0.75,
+        )
+    ax.set_xlim(0, 1)
+    ax.set_xlabel('Mean normalized score')
+    ax.set_ylabel('Standard deviation')
+
+    notable = sorted(
+        plot_rows,
+        key=lambda row: (
+            row['stddev'] * math.log1p(row['count']),
+            row['count'],
+            abs(row['mean'] - 0.5),
+        ),
+        reverse=True,
+    )[:annotation_limit]
+    for row in notable:
+        ax.annotate(
+            short_label(row['label'], 24),
+            (row['mean'], row['stddev']),
+            xytext=(5, 4),
+            textcoords='offset points',
+            fontsize=8,
+        )
+
+
+def plot_score_ranges(
+    rows: list[dict[str, Any]],
+    output_dir: Path,
+    plt: Any,
+    top_n: int,
+) -> Path:
+    """Plot the min-max span and mean for the widest score ranges."""
+    selected = sorted(
+        rows,
+        key=lambda row: (float(row['max']) - float(row['min']), label(row)),
+    )[-top_n:]
+    labels = [short_label(label(row), 58) for row in selected]
+    mins = [row['min'] for row in selected]
+    maxes = [row['max'] for row in selected]
+    means = [row['mean'] for row in selected]
+    ypos = list(range(len(selected)))
+
+    # Height follows the number of ranges drawn, not the requested top_n.
+    fig, ax = plt.subplots(figsize=(11, max(6, len(selected) * 0.35)))
+    for y, low, high in zip(ypos, mins, maxes, strict=True):
+        ax.hlines(y, low, high, color='#5b6770', linewidth=2.0, alpha=0.9)
+    ax.scatter(means, ypos, color='#0072b2', s=32, zorder=3, label='mean')
+    ax.set_yticks(ypos)
+    ax.set_yticklabels(labels)
+    ax.set_xlim(0, 1)
+    ax.set_xlabel('Normalized score range')
+    ax.set_ylabel('')
+    ax.set_title(f'Widest {len(selected)} Normalized Score Ranges')
+    ax.legend(loc='lower right')
+
+    path = output_dir / PLOT_FILES['range']
+    save(fig, path)
+    plt.close(fig)
+    return path
+
+
+def plot_models_per_dataset_histogram(
+    stats: dict[str, Any], output_dir: Path, plt: Any, sns: Any | None
+) -> Path:
+    """Plot a histogram of unique-model counts per benchmark."""
+    rows = stats['descriptive'].get('models_per_benchmark', [])
+    values = [row['unique_models'] for row in rows if row['unique_models'] > 0]
+
+    fig, ax = plt.subplots(figsize=(10, 6))
+    if values:
+        bins = min(30, max(8, len(set(values))))
+        if sns is not None:
+            sns.histplot(values, bins=bins, ax=ax, color='#0072b2')
+        else:
+            ax.hist(values, bins=bins, color='#0072b2', edgecolor='white')
+        median_value = statistics.median(values)
+        ax.axvline(
+            median_value,
+            color='#d55e00',
+            linestyle='--',
+            linewidth=2,
+            label=f'median={median_value:g}',
+        )
+        ax.legend()
+    else:
+        ax.text(
+            0.5,
+            0.5,
+            'No model-per-dataset summary available',
+            ha='center',
+            va='center',
+            transform=ax.transAxes,
+        )
+    ax.set_xlabel('Unique models per dataset')
+    ax.set_ylabel('Datasets')
+    ax.set_title('Distribution Of Unique Models Per Dataset')
+
+    path = output_dir / PLOT_FILES['models_per_dataset']
+    save(fig, path)
+    plt.close(fig)
+    return path
+
+
+def plot_inference_engine_spread(
+    stats: dict[str, Any],
+    output_dir: Path,
+    plt: Any,
+    sns: Any | None,
+    top_n: int,
+) -> Path:
+    """Plot result rows per recorded inference engine.
+
+    The first *top_n* entries are drawn as given and any remaining
+    entries are summed into a single ``'other'`` bar.
+    """
+    rows = stats['descriptive'].get('inference_engines', [])
+    selected = rows[:top_n]
+    remaining = rows[top_n:]
+    if remaining:
+        selected = selected + [
+            {
+                'value': 'other',
+                'count': sum(int(row['count']) for row in remaining),
+            }
+        ]
+    labels = [short_label(str(row['value']), 48) for row in selected]
+    counts = [row['count'] for row in selected]
+
+    fig, ax = plt.subplots(figsize=(10, max(5, len(selected) * 0.45)))
+    if selected:
+        if sns is not None:
+            sns.barplot(x=counts, y=labels, hue=labels, ax=ax, legend=False)
+        else:
+            ax.barh(labels, counts)
+        ax.set_xscale('log')
+        ax.set_xlim(left=1)
+        if ax.get_ylim()[0] < ax.get_ylim()[1]:
+            ax.invert_yaxis()
+    else:
+        ax.text(
+            0.5,
+            0.5,
+            'No inference-engine summary available',
+            ha='center',
+            va='center',
+            transform=ax.transAxes,
+        )
+    ax.set_xlabel('Result rows (log scale)')
+    ax.set_ylabel('')
+    ax.set_title('Recorded Inference Engine/Platform Spread')
+
+    path = output_dir / PLOT_FILES['engine_spread']
+    save(fig, path)
+    plt.close(fig)
+    return path
+
+
+def plot_writeup_overview(
+    stats: dict[str, Any], output_dir: Path, plt: Any, sns: Any | None
+) -> Path:
+    """Plot the three-panel overview: two completeness maps and scores."""
+    fig, (ax_benchmark, ax_model_family, ax_score) = plt.subplots(
+        1,
+        3,
+        figsize=(22.5, 9.6),
+        gridspec_kw={'width_ratios': [1.55, 1.35, 1.0], 'wspace': 0.34},
+    )
+    draw_metadata_completeness(
+        ax_benchmark,
+        stats,
+        plt,
+        sns,
+        completeness_key='metadata_completeness',
+        rows_key='benchmarks',
+        row_key='benchmark',
+        title='A. Reporting completeness by benchmark',
+        show_colorbar=False,
+    )
+    draw_metadata_completeness(
+        ax_model_family,
+        stats,
+        plt,
+        sns,
+        completeness_key='model_family_metadata_completeness',
+        rows_key='model_families',
+        row_key='model_family',
+        title='B. Reporting completeness by model family',
+        show_colorbar=False,
+    )
+    draw_score_landscape(
+        ax_score,
+        stats['descriptive'].get('normalized_score_summaries', []),
+        sns,
+        annotation_limit=7,
+    )
+    ax_score.set_title('C. Score landscape by metric')
+    ax_score.title.set_fontsize(15)
+
+    path = output_dir / PLOT_FILES['writeup_overview']
+    save(fig, path)
+    plt.close(fig)
+    return path
+
+
+def draw_metadata_completeness(
+    ax: Any,
+    stats: dict[str, Any],
+    plt: Any,
+    sns: Any | None,
+    completeness_key: str,
+    rows_key: str,
+    row_key: str,
+    title: str,
+    show_colorbar: bool,
+) -> None:
+    """Draw a field-by-group heatmap of metadata presence percentages.
+
+    Reads ``stats['descriptive'][completeness_key]``; *rows_key* names its
+    list of row groups and *row_key* the identifier inside each group and
+    matrix entry. Cells without a matrix entry are shown as 0. When the
+    summary is absent or empty a placeholder message is drawn instead.
+    """
+    completeness = stats['descriptive'].get(completeness_key, {})
+    fields = completeness.get('fields', [])
+    row_groups = completeness.get(rows_key, [])
+    matrix_rows = completeness.get('matrix', [])
+    if not fields or not row_groups or not matrix_rows:
+        ax.text(
+            0.5,
+            0.5,
+            'No metadata completeness summary available',
+            ha='center',
+            va='center',
+            transform=ax.transAxes,
+        )
+        ax.set_axis_off()
+        return
+
+    field_order = [field['key'] for field in fields]
+    field_labels = [wrapped_label(str(field['label']), 13) for field in fields]
+    field_groups = [str(field.get('group', 'metadata')) for field in fields]
+    row_order = [row_group[row_key] for row_group in row_groups]
+    row_labels = [
+        short_label(str(row_group['label']), 38) for row_group in row_groups
+    ]
+    # Fall back to 'benchmark' for matrix entries that lack *row_key*.
+    value_by_cell = {
+        (row.get(row_key, row.get('benchmark')), row['field']): (
+            100.0 * row['present_rate']
+        )
+        for row in matrix_rows
+    }
+    values = [
+        [
+            value_by_cell.get((row_group, field), 0.0)
+            for field in field_order
+        ]
+        for row_group in row_order
+    ]
+
+    if sns is not None:
+        cbar_kws = (
+            {'label': '% present', 'fraction': 0.05, 'pad': 0.05}
+            if show_colorbar
+            else {}
+        )
+        sns.heatmap(
+            values,
+            ax=ax,
+            vmin=0,
+            vmax=100,
+            cmap='RdYlGn',
+            xticklabels=field_labels,
+            yticklabels=row_labels,
+            linewidths=0.35,
+            linecolor='white',
+            alpha=0.68,
+            cbar=show_colorbar,
+            cbar_kws=cbar_kws,
+        )
+    else:
+        image = ax.imshow(
+            values,
+            vmin=0,
+            vmax=100,
+            cmap='RdYlGn',
+            alpha=0.68,
+            extent=(0, len(field_labels), len(row_labels), 0),
+        )
+        if show_colorbar:
+            colorbar = plt.colorbar(image, ax=ax, fraction=0.046, pad=0.04)
+            colorbar.set_label('% present')
+        ax.set_xticks([index + 0.5 for index in range(len(field_labels))])
+        ax.set_xticklabels(field_labels)
+        ax.set_yticks(
+            [index + 0.5 for index in range(len(row_labels))]
+        )
+        ax.set_yticklabels(row_labels)
+    draw_heatmap_values(ax, values)
+    draw_metadata_field_groups(ax, field_groups)
+
+    ax.set_title(title, pad=28)
+    ax.title.set_fontsize(15)
+    ax.set_xlabel('')
+    ax.set_ylabel('')
+    ax.tick_params(axis='x', labelrotation=60, labelsize=9, pad=2)
+    ax.tick_params(axis='y', labelsize=10)
+    for tick in ax.get_xticklabels():
+        tick.set_ha('right')
+        tick.set_rotation_mode('anchor')
+
+
+def draw_heatmap_values(ax: Any, values: list[list[float]]) -> None:
+    """Write each cell's rounded value at the centre of its heatmap cell."""
+    for y_index, row in enumerate(values):
+        for x_index, value in enumerate(row):
+            ax.text(
+                x_index + 0.5,
+                y_index + 0.5,
+                f'{value:.0f}',
+                ha='center',
+                va='center',
+                color='#111111',
+                fontsize=6.2,
+                bbox={
+                    'facecolor': 'white',
+                    'edgecolor': 'none',
+                    'boxstyle': 'round,pad=0.12',
+                    'alpha': 0.58,
+                },
+            )
+
+
+def draw_metadata_field_groups(ax: Any, field_groups: list[str]) -> None:
+    """Label each run of equal field groups above the heatmap columns."""
+    if not field_groups:
+        return
+
+    start = 0
+    while start < len(field_groups):
+        group = field_groups[start]
+        end = start + 1
+        while end < len(field_groups) and field_groups[end] == group:
+            end += 1
+
+        center = (start + end) / 2
+        ax.text(
+            center,
+            1.03,
+            group,
+            transform=ax.get_xaxis_transform(),
+            ha='center',
+            va='bottom',
+            fontsize=8.5,
+            fontweight='bold',
+            color='#303030',
+            clip_on=False,
+        )
+        ax.plot(
+            [start + 0.1, end - 0.1],
+            [1.025, 1.025],
+            transform=ax.get_xaxis_transform(),
+            color='#303030',
+            linewidth=0.8,
+            clip_on=False,
+        )
+        if end < len(field_groups):
+            ax.axvline(end, color='white', linewidth=2.0)
+            ax.axvline(end, color='#303030', linewidth=0.55, alpha=0.55)
+        start = end
+
+
+def main() -> None:
+    """Load the statistics JSON and write every plot as a PDF."""
+    args = parse_args()
+    if args.top_n < 1:
+        raise SystemExit('--top-n must be at least 1')
+
+    stats = load_statistics(args.input)
+    rows = stats['descriptive']['normalized_score_summaries']
+    if not rows:
+        raise SystemExit(
+            'No normalized_score_summaries found in statistics JSON.'
+        )
+
+    output_dir = args.output_dir
+    output_dir.mkdir(parents=True, exist_ok=True)
+    plt, sns = import_plotting()
+
+    plot_paths = {
+        'coverage': plot_coverage_counts(stats, output_dir, plt, sns),
+        'quality': plot_normalization_quality(stats, output_dir, plt, sns),
+        'top_coverage': plot_top_evaluation_coverage(
+            rows, output_dir, plt, sns, args.top_n
+        ),
+        'mean': plot_normalized_score_means(
+            rows, output_dir, plt, sns, args.top_n
+        ),
+        'variability': plot_score_variability(rows, output_dir, plt, sns),
+        'range': plot_score_ranges(rows, output_dir, plt, args.top_n),
+        'models_per_dataset': plot_models_per_dataset_histogram(
+            stats, output_dir, plt, sns
+        ),
+        'engine_spread': plot_inference_engine_spread(
+            stats, output_dir, plt, sns, args.top_n
+        ),
+        'writeup_overview': plot_writeup_overview(
+            stats, output_dir, plt, sns
+        ),
+    }
+    print(f'Wrote {len(plot_paths)} PDF plots to {output_dir}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tests/test_dataset_statistics.py b/tests/test_dataset_statistics.py
new file mode 100644
index 000000000..cc8266315
--- /dev/null
+++ b/tests/test_dataset_statistics.py
@@ -0,0 +1,424 @@
+from __future__ import annotations
+
+from every_eval_ever.helpers import dataset_statistics as stats
+
+
+def row(
+    model_id: str,
+    benchmark: str,
+    evaluation_name: str,
+    score: float | None,
+    *,
+    min_score: float | None = 0.0,
+    max_score: float | None = 1.0,
+    lower_is_better: bool = False,
+    score_type: str | None = 'continuous',
+    inference_engine: str | None = None,
+    metric_id: str | None = 'score',
+    metric_name: str | None = 'Score',
+    metric_kind: str | None = 'accuracy',
+    metric_unit: str | None = 'proportion',
+    **metadata,
+) -> dict:
+    result = {
+        'schema_version': '0.2.2',
+        'evaluation_id': f'{model_id}/{benchmark}/{evaluation_name}',
+        'model_id': model_id,
+        'model_developer': model_id.split('/')[0],
+        'inference_engine': inference_engine,
+        'benchmark': benchmark,
+        'evaluation_name': evaluation_name,
+        'score': score,
+        'min_score': min_score,
+        'max_score': max_score,
+        'lower_is_better': lower_is_better,
+        'score_type': score_type,
+        'has_uncertainty': False,
+        'metric_id': metric_id,
+        'metric_name': metric_name,
+        'metric_kind': metric_kind,
+        'metric_unit': metric_unit,
+    }
+    result.update(metadata)
+    return result
+
+
+def test_normalization_respects_lower_is_better():
+    assert stats.normalize_score(0.8, 0.0, 1.0, False) == 0.8
+    assert stats.normalize_score(0.2, 0.0, 1.0, True) == 0.8
+
+
+def test_invalid_rows_are_excluded_and_counted():
+    rows = [
+        row('a', 'bench', 'eval', None),
+        row('a', 'bench', 'eval', 0.5, min_score=None),
+        row('a', 'bench', 'eval', 0.5, min_score=1.0, max_score=1.0),
+        row('a', 'bench', 'eval', 0.5, score_type='binary'),
+        row('a', 'bench', 'eval', 2.0),
+        row('a', 'bench', 'eval', 0.8),
+    ]
+
+    valid, exclusions = stats.valid_normalized_rows(rows)
+
+    assert len(valid) == 1
+    assert exclusions == {
+        'missing_score': 1,
+        'missing_bounds': 1,
+        'zero_width_bounds': 1,
+        'incompatible_score_type': 1,
+        'out_of_range': 1,
+    }
+
+
+def test_shared_evaluation_key_includes_metric_scale_and_direction():
+    base = row('a', 'bench', 'eval', 0.8)
+    different_metric = row(
+        'a',
+        'bench',
+        'eval',
+        0.7,
+        metric_id='cost_per_task',
+        metric_name='Cost per task',
+        metric_kind='cost',
+        metric_unit='usd',
+    )
+    different_scale = row('a', 'bench', 'eval', 80.0, max_score=100.0)
+    different_direction = row('a', 'bench', 'eval', 0.2, lower_is_better=True)
+
+    assert stats.shared_evaluation_key(base) != stats.shared_evaluation_key(
+        different_metric
+    )
+    assert stats.shared_evaluation_key(base) != stats.shared_evaluation_key(
+        different_scale
+    )
+    assert stats.shared_evaluation_key(base) != stats.shared_evaluation_key(
+        different_direction
+    )
+
+
+def test_min_shared_evals_filters_pairwise_comparisons():
+    rows = [
+        row('model/a', 'b1', 'e1', 0.9),
+        row('model/b', 'b1', 'e1', 0.7),
+        row('model/a', 'b2', 'e2', 0.8),
+        row('model/b', 'b2', 'e2', 0.6),
+    ]
+    valid, _ = stats.valid_normalized_rows(rows)
+
+    assert (
+        stats.pairwise_model_comparisons(
+            valid, min_shared_evals=3, top_model_limit=10, comparison_limit=10
+        )
+        == []
+    )
+    comparisons = stats.pairwise_model_comparisons(
+        valid, min_shared_evals=2, top_model_limit=10, comparison_limit=10
+    )
+    assert len(comparisons) == 1
+    assert comparisons[0]['shared_evaluation_count'] == 2
+
+
+def test_stabilized_estimates_move_sparse_models_toward_corpus_mean():
+    estimate = stats.stabilized_estimate(
+        mean_score=1.0, count=1, corpus_mean=0.5, weight=5.0
+    )
+
+    assert 0.5 < estimate < 1.0
+
+
+def test_probability_style_support_outputs_are_bounded():
+    rows = [
+        row('model/a', 'b1', 'e1', 0.9),
+        row('model/b', 'b1', 'e1', 0.7),
+        row('model/a', 'b2', 'e2', 0.8),
+        row('model/b', 'b2', 'e2', 0.6),
+        row('model/a', 'b3', 'e3', 0.7),
+        row('model/b', 'b3', 'e3', 0.6),
+    ]
+    valid, _ = stats.valid_normalized_rows(rows)
+
+    model_summaries = stats.coverage_aware_model_summaries(valid, limit=10)
+    comparisons = stats.pairwise_model_comparisons(
+        valid, min_shared_evals=2, top_model_limit=10, comparison_limit=10
+    )
+
+    for summary in model_summaries:
+        support = summary['support_above_corpus_average']
+        assert 0.0 <= support <= 1.0
+    assert 0.0 <= comparisons[0]['support_model_a_higher'] <= 1.0
+
+
+def test_json_report_shape():
+    rows = [
+        row('model/a', 'b1', 'e1', 0.9),
+        row('model/b', 'b1', 'e1', 0.7),
+    ]
+
+    report = stats.build_statistics_report(
+        rows,
+        summary_limit=5,
+        metadata_top_benchmarks=12,
+        metadata_top_model_families=20,
+        comparison_limit=5,
+        top_model_limit=5,
+        min_shared_evals=1,
+        descriptive_only=False,
+    )
+
+    assert set(report) == {'descriptive', 'observational'}
+    assert report['descriptive']['counts']['result_rows'] == 2
+    assert 'inference_engines' in report['descriptive']
+    assert 'models_per_benchmark' in report['descriptive']
+    assert 'metadata_completeness' in report['descriptive']
+    assert 'model_family_metadata_completeness' in report['descriptive']
+    assert 'metric_id' in report['descriptive']['score_summaries'][0]
+    assert 'coverage_aware_model_summaries' in report['observational']
+    assert 'pairwise_model_comparisons' in report['observational']
+
+
+def test_score_summaries_group_by_metric_identity():
+    rows = [
+        row('model/a', 'arc', 'v1_Semi_Private', 0.98),
+        row(
+            'model/a',
+            'arc',
+            'v1_Semi_Private',
+            17.0,
+            max_score=77.2,
+            lower_is_better=True,
+            metric_id='cost_per_task',
+            metric_name='Cost per task',
+            metric_kind='cost',
+            metric_unit='usd',
+        ),
+    ]
+
+    report = stats.build_statistics_report(
+        rows,
+        summary_limit=10,
+        metadata_top_benchmarks=12,
+        metadata_top_model_families=20,
+        comparison_limit=5,
+        top_model_limit=5,
+        min_shared_evals=1,
+        descriptive_only=True,
+    )
+
+    raw_summaries = report['descriptive']['score_summaries']
+    normalized_summaries = report['descriptive']['normalized_score_summaries']
+
+    assert {item['metric_id'] for item in raw_summaries} == {
+        'score',
+        'cost_per_task',
+    }
+    assert {item['count'] for item in raw_summaries} == {1}
+    assert {item['metric_id'] for item in normalized_summaries} == {
+        'score',
+        'cost_per_task',
+    }
+
+
+def test_models_per_benchmark_dedupes_model_counts():
+    rows = [
+        row('model/a', 'bench-one', 'eval-a', 0.9),
+        row('model/a', 'bench-one', 'eval-b', 0.8),
+        row('model/b', 'bench-one', 'eval-a', 0.7),
+        row('model/c', 'bench-two', 'eval-a', 0.6),
+    ]
+
+    summaries = stats.models_per_benchmark(rows)
+
+    assert summaries == [
+        {
+            'benchmark': 'bench-one',
+            'unique_models': 2,
+            'result_rows': 3,
+        },
+        {
+            'benchmark': 'bench-two',
+            'unique_models': 1,
+            'result_rows': 1,
+        },
+    ]
+
+
+def test_inference_engine_counts_group_missing_as_unknown():
+    rows = [
+        row('model/a', 'bench', 'eval', 0.9, inference_engine='vllm'),
+        row('model/b', 'bench', 'eval', 0.8, inference_engine=''),
+        row('model/c', 'bench', 'eval', 0.7, inference_engine=None),
+        row('model/d', 'bench', 'eval', 0.6, inference_engine='ollama'),
+        row('model/e', 'bench', 'eval', 0.5, inference_engine='vllm'),
+    ]
+
+    assert stats.count_values_with_unknown(rows, 'inference_engine') == [
+        {'value': 'vllm', 'count': 2},
+        {'value': 'unknown', 'count': 2},
+        {'value': 'ollama', 'count': 1},
+    ]
+
+
+def test_metadata_completeness_counts_present_and_missing_values():
+    rows = [
+        row(
+            'model/a',
+            'bench-a',
+            'eval',
+            0.9,
+            inference_engine='vllm',
+            generation_temperature=0.2,
+            source_locator='https://example.test/data',
+            has_uncertainty=True,
+        ),
+        row(
+            'model/b',
+            'bench-a',
+            'eval',
+            0.8,
+            inference_engine=' ',
+            generation_temperature=None,
+            source_locator=[],
+            has_uncertainty=False,
+        ),
+    ]
+
+    summary = stats.metadata_completeness(
+        rows, top_benchmarks=1, top_fields=16
+    )
+    matrix = {
+        (item['benchmark'], item['field']): item
+        for item in summary['matrix']
+    }
+
+    assert matrix[('bench-a', 'inference_engine')]['present_rate'] == 0.5
+    assert matrix[('bench-a', 'generation_temperature')][
+        'present_rate'
+    ] == 0.5
+    assert matrix[('bench-a', 'source_locator')]['present_rate'] == 0.5
+    assert matrix[('bench-a', 'has_uncertainty')]['present_rate'] == 0.5
+
+
+def test_metadata_completeness_aggregates_other_benchmarks():
+    rows = [
+        row('model/a', 'bench-a', 'eval', 0.9, generation_temperature=0.1),
+        row('model/b', 'bench-a', 'eval', 0.8, generation_temperature=0.2),
+        row('model/c', 'bench-b', 'eval', 0.7, generation_temperature=None),
+        row('model/d', 'bench-b', 'eval', 0.6, generation_temperature=None),
+        row('model/e', 'bench-c', 'eval', 0.5, generation_temperature=0.3),
+    ]
+
+    summary = stats.metadata_completeness(
+        rows, top_benchmarks=2, top_fields=16
+    )
+
+    assert summary['other_result_rows'] == 1
+    assert summary['benchmarks'][-1]['benchmark'] == 'Other'
+    assert summary['benchmarks'][-1]['label'] == 'Other (n=1)'
+    assert summary['benchmarks'][-1]['result_rows'] == 1
+    assert any(
+        item['benchmark'] == 'Other'
+        and item['field'] == 'generation_temperature'
+        and item['present_rate'] == 1.0
+        for item in summary['matrix']
+    )
+
+
+def test_metadata_top_benchmarks_argument_controls_report_shape():
+    rows = [
+        row('model/a', 'bench-a', 'eval', 0.9, generation_temperature=0.1),
+        row('model/b', 'bench-a', 'eval', 0.8, generation_temperature=0.2),
+        row('model/c', 'bench-b', 'eval', 0.7, generation_temperature=None),
+        row('model/d', 'bench-c', 'eval', 0.6, generation_temperature=None),
+    ]
+
+    report = stats.build_statistics_report(
+        rows,
+        summary_limit=5,
+        metadata_top_benchmarks=1,
+        metadata_top_model_families=20,
+        comparison_limit=5,
+        top_model_limit=5,
+        min_shared_evals=1,
+        descriptive_only=True,
+    )
+    completeness = report['descriptive']['metadata_completeness']
+
+    assert completeness['top_benchmark_count'] == 1
+    assert [item['benchmark'] for item in completeness['benchmarks']] == [
+        'bench-a',
+        'Other',
+    ]
+    assert completeness['other_result_rows'] == 2
+
+
+def test_metadata_top_model_families_argument_controls_report_shape():
+    rows = [
+        row('family-a/model-1', 'bench', 'eval', 0.9, model_license='mit'),
+        row('family-a/model-2', 'bench', 'eval', 0.8, model_license='apache'),
+        row('family-b/model-1', 'bench', 'eval', 0.7, model_license=None),
+        row('family-c/model-1', 'bench', 'eval', 0.6, model_license=None),
+    ]
+
+    report = stats.build_statistics_report(
+        rows,
+        summary_limit=5,
+        metadata_top_benchmarks=20,
+        metadata_top_model_families=1,
+        comparison_limit=5,
+        top_model_limit=5,
+        min_shared_evals=1,
+        descriptive_only=True,
+    )
+    completeness = report['descriptive'][
+        'model_family_metadata_completeness'
+    ]
+
+    assert completeness['top_model_family_count'] == 1
+    assert [
+        item['model_family'] for item in completeness['model_families']
+    ] == ['family-a', 'Other']
+    assert completeness['other_result_rows'] == 2
+
+
+def test_metadata_field_selection_favors_missing_and_uneven_fields():
+    rows = [
+        row(
+            f'model/a-{index}',
+            'bench-complete',
+            'eval',
+            0.9,
+            generation_temperature=0.2,
+            source_locator=None,
+        )
+        for index in range(5)
+    ] + [
+        row(
+            f'model/b-{index}',
+            'bench-missing',
+            'eval',
+            0.8,
+            generation_temperature=None,
+            source_locator=None,
+        )
+        for index in range(5)
+    ]
+
+    summary = stats.metadata_completeness(
+        rows, top_benchmarks=2, top_fields=1
+    )
+
+    assert summary['fields'][0]['key'] == 'generation_temperature'
+    assert summary['fields'][0]['selection_score'] > 0.05
+
+
+def test_cli_help_uses_summary_limit_not_top_n(capsys):
+    try:
+        stats.parse_args(['--help'])
+    except SystemExit:
+        pass
+
+    output = capsys.readouterr().out
+    assert '--summary-limit' in output
+    assert '--metadata-top-benchmarks' in output
+    assert '--metadata-top-model-families' in output
+    assert '--top-n' not in output