diff --git a/every_eval_ever/converters/helm/adapter.py b/every_eval_ever/converters/helm/adapter.py index e43eb01b9..d57d7f30f 100644 --- a/every_eval_ever/converters/helm/adapter.py +++ b/every_eval_ever/converters/helm/adapter.py @@ -3,7 +3,7 @@ import os import uuid from pathlib import Path -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List, Union, cast _HELM_IMPORT_ERROR: Exception | None = None try: @@ -29,14 +29,18 @@ Exception ) as ex: # pragma: no cover - exercised only when optional deps missing _HELM_IMPORT_ERROR = ex - DaciteConfig = from_dict = None # type: ignore[assignment] - PerInstanceStats = AdapterSpec = RequestState = ScenarioState = Stat = ( - RunSpec - ) = Any # type: ignore[assignment] - get_model_deployment = register_builtin_configs_from_helm_package = ( - from_json - ) = None # type: ignore[assignment] - ModelDeploymentNotFoundError = Exception # type: ignore[assignment] + DaciteConfig = cast(Any, None) + from_dict = cast(Any, None) + PerInstanceStats = cast(Any, None) + AdapterSpec = cast(Any, None) + RequestState = cast(Any, None) + ScenarioState = cast(Any, None) + Stat = cast(Any, None) + RunSpec = cast(Any, None) + get_model_deployment = cast(Any, None) + register_builtin_configs_from_helm_package = cast(Any, None) + from_json = cast(Any, None) + ModelDeploymentNotFoundError = cast(Any, Exception) from every_eval_ever.converters import SCHEMA_VERSION from every_eval_ever.converters.common.adapter import ( @@ -45,8 +49,11 @@ SupportedLibrary, ) from every_eval_ever.converters.common.utils import sha256_file +from every_eval_ever.converters.helm.metrics import is_core_metric from every_eval_ever.converters.helm.instance_level_adapter import ( HELMInstanceLevelDataAdapter, + _evaluation_result_id, + _score_from_stat, ) from every_eval_ever.converters.helm.utils import extract_reasoning from every_eval_ever.eval_types import ( @@ -67,7 +74,6 @@ SourceType, Uncertainty, ) -from 
every_eval_ever.instance_level_types import InstanceLevelEvaluationLog def _require_helm_dependencies() -> None: @@ -106,6 +112,7 @@ def metadata(self) -> AdapterMetadata: return AdapterMetadata( name='HELMAdapter', version='0.0.1', + supported_library_versions=['helm'], description='HELM adapter with dynamic metrics and unified JSONL instance logging', ) @@ -129,7 +136,8 @@ def _split_model_id(self, model_id: str | None) -> tuple[str, str]: if not model_id: return ('unknown', 'unknown') if '/' in model_id: - return tuple(model_id.split('/', 1)) + developer, name = model_id.split('/', 1) + return (developer, name) return ('unknown', model_id) def _extract_model_info(self, adapter_spec: AdapterSpec) -> ModelInfo: @@ -191,6 +199,7 @@ def _load_file_if_exists(self, dir_path, file_name) -> Any: return None def _load_evaluation_run_logfiles(self, dir_path) -> Dict: + """Load the HELM files needed for aggregate and detail conversion.""" scenario_state_dict = self._load_file_if_exists( dir_path, self.SCENARIO_STATE_FILE ) @@ -210,14 +219,18 @@ def _load_evaluation_run_logfiles(self, dir_path) -> Dict: } def transform_from_directory( - self, dir_path: str, output_path: str, metadata_args: Dict[str, Any] - ): + self, + dir_path: str | Path, + metadata_args: Dict[str, Any] | None = None, + output_path: str | None = None, + ) -> List[EvaluationLog]: """ Transforms HELM results into one aggregate EvaluationLog and one instance-level JSONL file containing all samples. 
""" - # all_instance_logs: List[InstanceLevelEvaluationLog] = [] aggregate_logs: List[EvaluationLog] = [] + metadata_args = metadata_args or {} + dir_path = str(dir_path) file_uuids = metadata_args.get('file_uuids') @@ -260,11 +273,6 @@ def transform_from_directory( aggregate_logs.append(agg) converted_idx += 1 - # # Write all consolidated instance logs to JSONL - # with open(output_path, 'w', encoding='utf-8') as f: - # for log in all_instance_logs: - # f.write(json.dumps(log.model_dump(), ensure_ascii=False) + '\n') - return aggregate_logs def _extract_generation_args( @@ -318,6 +326,7 @@ def _extract_evaluation_time( def _extract_dataset_name( self, run_spec_name: str, scenario_name: str | None ) -> str: + """Prefer scenario metadata, falling back to HELM run-spec names.""" if scenario_name: return scenario_name @@ -332,20 +341,16 @@ def _extract_dataset_name( return run_spec_name.split(':')[0] - def _extract_metric_names(self, run_spec: RunSpec) -> List[str]: - metric_names = [] - for metric_spec in run_spec.metric_specs: - names = metric_spec.args.get('names') - if names: - metric_names.extend(names) - else: - metric_names.append(metric_spec.class_name.split('.')[-1]) - - return metric_names - def _transform_single( self, raw_data: Dict, metadata_args: Dict[str, Any] - ) -> Tuple[EvaluationLog, List[InstanceLevelEvaluationLog]]: + ) -> EvaluationLog: + """Convert one HELM run into aggregate JSON plus detail JSONL. + + The aggregate ``evaluation_result_id`` values are generated from + core metrics in ``stats.json`` with the same helper used by the + instance converter so every metric-specific detail row can join + back to an aggregate result. + """ run_spec = from_dict(data_class=RunSpec, data=raw_data['run_spec_dict']) # cast=[str] coerces int instance IDs to str; newer HELM versions # (e.g. long-context suite) store instance.id as int in the JSON. 
@@ -402,80 +407,112 @@ def _transform_single( evaluation_id = f'{source_data.dataset_name}/{model_info.id.replace("/", "_")}/{evaluation_timestamp}' - metric_names = self._extract_metric_names(run_spec) - + # Build aggregate results from core HELM stats themselves, not + # only from run_spec.metric_specs. The instance-level converter emits + # one row per core per-instance stat, so aggregate IDs must cover + # the same core namespace for detailed rows to be joinable. + # TODO: Consider promoting bookkeeping telemetry into structured + # fields such as token_usage, performance, metadata, or + # additional_details in a separate follow-up. evaluation_results: List[EvaluationResult] = [] + seen_evaluation_result_ids: set[str] = set() + + for stat in stats_raw: + # The ID helper mirrors the instance-level converter. This is the + # key invariant: detail rows should never introduce metric IDs that + # are absent from aggregate evaluation_results. + metric_name = getattr(getattr(stat, 'name', None), 'name', None) + if not is_core_metric(metric_name): + continue + score = _score_from_stat(stat) + if metric_name is None or score is None: + continue + + stat_count = getattr(stat, 'count', None) + + evaluation_result_id = _evaluation_result_id( + metric_name, + getattr(stat.name, 'split', None), + getattr(stat.name, 'perturbation', None), + ) + if evaluation_result_id is None: + continue + if evaluation_result_id in seen_evaluation_result_ids: + continue + seen_evaluation_result_ids.add(evaluation_result_id) - for metric_name in set(metric_names): metric_config = MetricConfig( evaluation_description=metric_name, lower_is_better=False, # TODO schema.json check score_type=ScoreType.continuous, min_score=0, - max_score=1, + max_score=1.0, ) - matching_stats = [ - s - for s in stats_raw - if s.name.name == metric_name and not s.name.perturbation - ] - - for stat in matching_stats: - evaluation_name = ( - f'{metric_name} on {source_data.dataset_name}' - if not stat.name.split - 
else f'{metric_name} {stat.name.split} on {source_data.dataset_name}' - ) + split = getattr(stat.name, 'split', None) + perturbation = getattr(stat.name, 'perturbation', None) + name_parts = [metric_name] + if split: + name_parts.append(str(split)) + if perturbation: + name_parts.append(str(perturbation)) + evaluation_name = ( + f'{" ".join(name_parts)} on {source_data.dataset_name}' + ) - evaluation_results.append( - EvaluationResult( - evaluation_name=evaluation_name, - source_data=source_data, - evaluation_timestamp=evaluation_timestamp, - metric_config=metric_config, - score_details=ScoreDetails( - score=stat.mean - or (stat.sum / stat.count if stat.count else 0.0), - uncertainty=Uncertainty( - standard_deviation=stat.stddev, - num_samples=adapter_spec.max_eval_instances - or len(request_states), + evaluation_results.append( + EvaluationResult( + evaluation_result_id=evaluation_result_id, + evaluation_name=evaluation_name, + source_data=source_data, + evaluation_timestamp=evaluation_timestamp, + metric_config=metric_config, + score_details=ScoreDetails( + score=score, + uncertainty=Uncertainty( + standard_deviation=getattr(stat, 'stddev', None), + # Split-specific HELM stats may cover fewer + # examples than the full run, so use the stat's + # own count when it is available. 
+ num_samples=( + stat_count + if stat_count is not None + else adapter_spec.max_eval_instances + or len(request_states) ), - details={ - 'count': str(stat.count), - 'split': str(stat.name.split) - if stat.name.split - else '', - 'perturbation': str(stat.name.perturbation) - if stat.name.perturbation - else '', - }, ), - generation_config=GenerationConfig( - generation_args=self._extract_generation_args( - adapter_spec=adapter_spec, - request_state=request_states[0], - ), - additional_details={ - 'stop_sequences': json.dumps( - request_states[0].request.stop_sequences - ) - if request_states[0].request.stop_sequences - else '[]', - 'presence_penalty': str( - request_states[0].request.presence_penalty - ), - 'frequency_penalty': str( - request_states[0].request.frequency_penalty - ), - 'num_completions': str( - request_states[0].request.num_completions - ), - }, + details={ + 'count': str(getattr(stat, 'count', '')), + 'split': str(split) if split else '', + 'perturbation': str(perturbation) + if perturbation + else '', + }, + ), + generation_config=GenerationConfig( + generation_args=self._extract_generation_args( + adapter_spec=adapter_spec, + request_state=request_states[0], ), - ) + additional_details={ + 'stop_sequences': json.dumps( + request_states[0].request.stop_sequences + ) + if request_states[0].request.stop_sequences + else '[]', + 'presence_penalty': str( + request_states[0].request.presence_penalty + ), + 'frequency_penalty': str( + request_states[0].request.frequency_penalty + ), + 'num_completions': str( + request_states[0].request.num_completions + ), + }, + ), ) + ) if request_states: parent_eval_output_dir = metadata_args.get('parent_eval_output_dir') diff --git a/every_eval_ever/converters/helm/instance_level_adapter.py b/every_eval_ever/converters/helm/instance_level_adapter.py index 037237ecc..bd0442a89 100644 --- a/every_eval_ever/converters/helm/instance_level_adapter.py +++ b/every_eval_ever/converters/helm/instance_level_adapter.py @@ -1,6 
+1,6 @@ import json from pathlib import Path -from typing import Any, List, Tuple +from typing import Any, List, Tuple, cast _HELM_IMPORT_ERROR: Exception | None = None try: @@ -9,7 +9,7 @@ Exception ) as ex: # pragma: no cover - exercised only when optional deps missing _HELM_IMPORT_ERROR = ex - RequestState = Any # type: ignore[assignment] + RequestState = cast(Any, None) def _require_helm_dependencies() -> None: @@ -22,6 +22,7 @@ def _require_helm_dependencies() -> None: from every_eval_ever.converters import SCHEMA_VERSION from every_eval_ever.converters.common.utils import sha256_string +from every_eval_ever.converters.helm.metrics import is_core_metric from every_eval_ever.converters.helm.utils import extract_all_reasonings from every_eval_ever.instance_level_types import ( AnswerAttributionItem, @@ -35,6 +36,98 @@ def _require_helm_dependencies() -> None: ) +def _score_from_stat(stat) -> float | None: + """Return a scalar HELM stat value, or None for empty/bad stats. + + HELM usually provides ``mean``; some stats only have ``sum`` and + ``count``. Returning None lets callers skip stat rows that do not + contain a usable scalar value. 
+ """ + value = getattr(stat, 'mean', None) + if value is None: + count = getattr(stat, 'count', None) + total = getattr(stat, 'sum', None) + if count: + try: + value = total / count + except (TypeError, ValueError, ZeroDivisionError): + return None + if value is None: + return None + try: + return float(value) + except (TypeError, ValueError): + return None + + +def _stat_name_part(value) -> str | None: + """Normalize HELM split/perturbation labels for stable IDs.""" + if value is None: + return None + if isinstance(value, str): + return value or None + if isinstance(value, dict): + return value.get('name') or str(value) + return getattr(value, 'name', None) or str(value) + + +def _evaluation_result_id( + metric_name: str | None, + split=None, + perturbation=None, +) -> str | None: + """Build the join key shared by aggregate and instance rows. + + Split and perturbation labels are included so two HELM stats with the + same metric name do not collide in ``evaluation_result_id``. + """ + if metric_name is None: + return None + parts = [metric_name] + split_part = _stat_name_part(split) + perturbation_part = _stat_name_part(perturbation) + if split_part: + parts.append(split_part) + if perturbation_part: + parts.append(perturbation_part) + return ':'.join(parts) + + +# Metric names whose per-instance score is a correctness signal in [0, 1] +# where ``score > 0`` reasonably maps to ``is_correct=True``. Keep this list +# tight: graded core metrics such as rouge/bleu/f1 should stay out of it +# because a positive score is not the same as a binary correctness claim. 
+_BINARY_CORRECTNESS_METRIC_NAMES: frozenset[str] = frozenset({ + 'exact_match', + 'quasi_exact_match', + 'prefix_exact_match', + 'quasi_prefix_exact_match', + 'exact_match@5', + 'quasi_exact_match@5', + 'prefix_exact_match@5', + 'quasi_prefix_exact_match@5', + 'ifeval_strict_accuracy', + 'chain_of_thought_correctness', + 'math_equiv', + 'math_equiv_chain_of_thought', +}) + + +def _is_correct_for_metric(metric_name: str | None, score: float) -> bool: + """Decide ``is_correct`` honestly per metric name. + + For correctness metrics in the allowlist, the HELM convention is that + score==1.0 means correct and 0.0 means wrong, so any positive score + rounds up to "correct". For graded metrics like rouge_l/bleu/f1 where + >0 is not a correctness signal, we deliberately do not claim correctness. + """ + if metric_name is None: + return False + if metric_name in _BINARY_CORRECTNESS_METRIC_NAMES: + return score > 0 + return False + + class HELMInstanceLevelDataAdapter: def __init__( self, @@ -51,6 +144,7 @@ def __init__( self.path = f'{evaluation_dir}/{evaulation_id}.{format}' def _save_json(self, items: List[InstanceLevelEvaluationLog]): + """Write one validated instance-level log per JSONL line.""" eval_dir_path = Path(self.evaluation_dir) eval_dir_path.mkdir(parents=True, exist_ok=True) path = Path(self.path) @@ -73,6 +167,12 @@ def convert_instance_level_logs( request_states: List[RequestState], per_instance_stats_list: List, ) -> Tuple[str, int]: + """Convert HELM request states into per-(sample, metric) rows. + + When HELM per-instance stats are present, each core metric gets + its own detail row. If core metrics are absent, keep the legacy + one-row exact-match fallback so older or partial logs still convert. 
+ """ instance_level_logs: List[InstanceLevelEvaluationLog] = [] for state in request_states: inst_stats = next( @@ -97,28 +197,41 @@ def convert_instance_level_logs( reasoning_traces = extract_all_reasonings(state) if isinstance(reasoning_traces, str): reasoning_traces = [reasoning_traces] + if reasoning_traces is None: + reasoning_traces = [] + reasoning_traces = [ + trace for trace in reasoning_traces if isinstance(trace, str) + ] - is_correct = False - score = 0.0 - if inst_stats: - em_stat = next( - ( - s - for s in inst_stats.stats - if s.name.name == 'exact_match' - ), - None, + all_stats = list(inst_stats.stats) if inst_stats else [] + metric_stats = [ + stat + for stat in all_stats + if is_core_metric( + getattr(getattr(stat, 'name', None), 'name', None) ) - if em_stat: - score = em_stat.mean - is_correct = em_stat.mean > 0 - else: # TODO check for more specific tasks - correct_completions = sum( - 1 for c in completions if c.strip() in correct_refs - ) - score = correct_completions / len(completions) - is_correct = score > 0 + ] + if not metric_stats: + # Preserve the legacy exact-match proxy instead of dropping + # samples that have no per-instance stats or no recognized + # core metric rows. + correct_completions = sum( + 1 for c in completions if c.strip() in correct_refs + ) + fallback_score = ( + correct_completions / len(completions) + if completions + else 0.0 + ) + metric_stats = [None] + + # Scope control: only core HELM metrics become metric rows. + # TODO: Consider preserving additional bookkeeping telemetry in + # token_usage, performance.additional_details, or metadata. + # Token usage is copied to every row for the same sample. This is + # intentionally denormalized so each core metric row is + # independently useful when filtered by metric. 
token_usage = None if inst_stats: p_tokens = next( @@ -155,56 +268,82 @@ def convert_instance_level_logs( total_tokens=int(p_tokens + c_tokens), ) - instance_level_logs.append( - InstanceLevelEvaluationLog( - schema_version=SCHEMA_VERSION, - evaluation_id=self.evaluation_id, - model_id=model_id, - evaluation_name=evaluation_name, - sample_id=str(state.instance.id), - sample_hash=sha256_string( - state.request.prompt + (correct_refs[0] if correct_refs else '') - ), # TODO use all references - interaction_type=InteractionType.single_turn, - input=Input( - raw=state.request.prompt, - reference=correct_refs if correct_refs else [], - choices=( - list(state.output_mapping.values()) - if state.output_mapping - else [ - ref.output.text - for ref in state.instance.references - ] + for stat in metric_stats: + if stat is None: + metric_name = None + evaluation_result_id = None + score = fallback_score + # Fallback path: ``score`` here is an exact-match + # proxy from completion-vs-reference matching, so + # the correctness claim is honest in the same sense + # as the legacy single-row behavior. 
+ is_correct = score > 0 + else: + stat_name = getattr(stat, 'name', None) + metric_name = getattr(stat_name, 'name', None) + evaluation_result_id = _evaluation_result_id( + metric_name, + getattr(stat_name, 'split', None), + getattr(stat_name, 'perturbation', None), + ) + score = _score_from_stat(stat) + if score is None: + continue + is_correct = _is_correct_for_metric(metric_name, score) + instance_level_logs.append( + InstanceLevelEvaluationLog( + schema_version=SCHEMA_VERSION, + evaluation_id=self.evaluation_id, + model_id=model_id, + evaluation_name=evaluation_name, + evaluation_result_id=evaluation_result_id, + sample_id=str(state.instance.id), + sample_hash=sha256_string( + state.request.prompt + (correct_refs[0] if correct_refs else '') + ), # TODO use all references + interaction_type=InteractionType.single_turn, + input=Input( + raw=state.request.prompt, + reference=correct_refs if correct_refs else [], + choices=( + list(state.output_mapping.values()) + if state.output_mapping + else [ + ref.output.text + for ref in state.instance.references + ] + ), ), - ), - output=Output( - raw=completions, reasoning_trace=reasoning_traces - ), - answer_attribution=[ - AnswerAttributionItem( - turn_idx=0, - source='output.raw', - extracted_value=state.result.completions[ - 0 - ].text.strip() - if state.result and state.result.completions - else '', - extraction_method='exact_match', - is_terminal=True, - ) - ], - evaluation=Evaluation( - score=float(score), is_correct=is_correct - ), - token_usage=token_usage, - performance=Performance( - generation_time_ms=state.result.request_time * 1000 - if state.result.request_time - else None - ), + output=Output( + raw=completions, reasoning_trace=reasoning_traces + ), + answer_attribution=[ + AnswerAttributionItem( + turn_idx=0, + source='output.raw', + extracted_value=state.result.completions[ + 0 + ].text.strip() + if state.result and state.result.completions + else '', + extraction_method='exact_match', + is_terminal=True, 
"""HELM metric filtering helpers."""

from typing import Optional


# HELM emits both benchmark metrics and bookkeeping telemetry in stats.json /
# per_instance_stats.json. In this PR, only benchmark-quality metrics become
# EEE aggregate/detail metric rows. Bookkeeping can be mapped to token_usage,
# performance, metadata, or additional_details in a future follow-up.
#
# Entries are matched as *prefixes*, so 'exact_match' also admits variants
# such as 'exact_match@5', and 'bleu_' admits 'bleu_1', 'bleu_4', ...
CORE_METRIC_PREFIXES: tuple[str, ...] = (
    'exact_match',
    'quasi_exact_match',
    'prefix_exact_match',
    'quasi_prefix_exact_match',
    'classification_micro_f1',
    'classification_macro_f1',
    'f1_score',
    'rouge_l',
    'bleu_',
    'ifeval_strict_accuracy',
    'chain_of_thought_correctness',
    'math_equiv',
    'math_equiv_chain_of_thought',
)


def is_core_metric(metric_name: Optional[str]) -> bool:
    """Return True when a HELM stat should become an EEE metric row.

    A name qualifies when it is a string starting with one of
    ``CORE_METRIC_PREFIXES``. ``None``, the empty string and non-string
    values are never core metrics.
    """
    # ``str.startswith`` accepts a tuple of prefixes, so one call covers the
    # whole table. The isinstance guard keeps unexpected non-string names
    # from raising AttributeError.
    return isinstance(metric_name, str) and metric_name.startswith(
        CORE_METRIC_PREFIXES
    )
' + 'Install with: uv sync --extra helm' + ), +) + + def _load_eval(adapter, filepath, metadata_args): + """Run the HELM aggregate adapter against one fixture directory.""" eval_dirpath = Path(filepath) with tempfile.TemporaryDirectory() as tmpdir: @@ -39,6 +47,16 @@ def _load_eval(adapter, filepath, metadata_args): return converted_eval +def _assert_unique_evaluation_result_ids(converted_eval): + """Aggregate result IDs must be stable join targets for sample rows.""" + result_ids = [ + result.evaluation_result_id + for result in converted_eval.evaluation_results + ] + assert all(result_ids) + assert len(result_ids) == len(set(result_ids)) + + def test_mmlu_eval(): adapter = HELMAdapter() metadata_args = { @@ -73,10 +91,14 @@ def test_mmlu_eval(): assert len(results) > 0 assert any('mmlu' in r.evaluation_name.lower() for r in results) assert all(r.metric_config is not None for r in results) + _assert_unique_evaluation_result_ids(converted_eval) assert converted_eval.detailed_evaluation_results is not None assert converted_eval.detailed_evaluation_results.format is not None - assert converted_eval.detailed_evaluation_results.total_rows == 10 + # Per-(sample, metric) emission: each of the 10 samples produces one + # row per non-empty stat, so total_rows is much larger than the + # legacy "one row per sample" count. + assert converted_eval.detailed_evaluation_results.total_rows >= 10 def test_hellswag_eval(): @@ -114,10 +136,12 @@ def test_hellswag_eval(): assert len(results) > 0 assert results[0].score_details.score is not None assert any('hellaswag' in r.evaluation_name.lower() for r in results) + _assert_unique_evaluation_result_ids(converted_eval) assert converted_eval.detailed_evaluation_results is not None assert converted_eval.detailed_evaluation_results.format is not None - assert converted_eval.detailed_evaluation_results.total_rows == 10 + # Per-(sample, core metric): >= sample count, not equal to it. 
+ assert converted_eval.detailed_evaluation_results.total_rows >= 10 def test_narrativeqa_eval(): @@ -151,10 +175,12 @@ def test_narrativeqa_eval(): assert len(results) > 0 assert any('narrativeqa' in r.evaluation_name.lower() for r in results) assert all(r.metric_config is not None for r in results) + _assert_unique_evaluation_result_ids(converted_eval) assert converted_eval.detailed_evaluation_results is not None assert converted_eval.detailed_evaluation_results.format is not None - assert converted_eval.detailed_evaluation_results.total_rows == 5 + # Per-(sample, core metric): >= sample count, not equal to it. + assert converted_eval.detailed_evaluation_results.total_rows >= 5 def test_missing_model_deployment_falls_back_to_model(): diff --git a/tests/test_helm_instance_level_adapter.py b/tests/test_helm_instance_level_adapter.py index 4ee46c6c2..98e21fcc3 100644 --- a/tests/test_helm_instance_level_adapter.py +++ b/tests/test_helm_instance_level_adapter.py @@ -1,14 +1,20 @@ -import pytest - -pytest.importorskip( - 'helm', reason='crfm-helm not installed; install with: uv sync --extra helm' -) - import json import tempfile from pathlib import Path +from types import SimpleNamespace + +import pytest +from every_eval_ever.converters.helm import adapter as helm_adapter_module from every_eval_ever.converters.helm.adapter import HELMAdapter +from every_eval_ever.converters.helm.metrics import is_core_metric +from every_eval_ever.converters.helm.instance_level_adapter import ( + HELMInstanceLevelDataAdapter, + _BINARY_CORRECTNESS_METRIC_NAMES, + _evaluation_result_id, + _is_correct_for_metric, + _score_from_stat, +) from every_eval_ever.eval_types import EvaluatorRelationship from every_eval_ever.instance_level_types import ( InstanceLevelEvaluationLog, @@ -16,7 +22,18 @@ ) +def _require_helm(): + """Skip HELM fixture tests when the optional converter deps are absent.""" + import_error = getattr(helm_adapter_module, '_HELM_IMPORT_ERROR', None) + if import_error is not 
None: + pytest.skip( + 'HELM converter dependencies are missing: ' + f'{import_error!r}. Install with: uv sync --extra helm' + ) + + def _load_instance_level_data(adapter, filepath, metadata_args): + """Run the HELM adapter and read back the generated JSONL detail rows.""" eval_dirpath = Path(filepath) converted_eval_list = adapter.transform_from_directory( eval_dirpath, @@ -43,7 +60,57 @@ def _load_instance_level_data(adapter, filepath, metadata_args): return converted_eval, instance_logs +def _by_sample_and_metric( + instance_logs, +) -> dict[tuple[str, str | None], InstanceLevelEvaluationLog]: + """Index detail rows by the two fields that should be unique together.""" + return { + (log.sample_id, log.evaluation_result_id): log + for log in instance_logs + } + + +def _metric_name_from_result_id(result_id: str | None) -> str | None: + """Strip split/perturbation suffixes from deterministic result IDs.""" + if result_id is None: + return None + return result_id.split(':', 1)[0] + + +def _json_score_from_stat(stat: dict) -> float | None: + """Mirror converter score extraction for raw JSON fixtures.""" + value = stat.get('mean') + if value is None: + count = stat.get('count') + total = stat.get('sum') + if count: + try: + value = total / count + except (TypeError, ValueError, ZeroDivisionError): + return None + if value is None: + return None + try: + return float(value) + except (TypeError, ValueError): + return None + + +def _expected_core_instance_stat_rows(filepath): + """Count core fixture stats that should become detail rows.""" + per_instance_path = Path(filepath) / 'per_instance_stats.json' + per_instance_stats = json.loads(per_instance_path.read_text()) + return sum( + 1 + for item in per_instance_stats + for stat in item.get('stats', []) + if is_core_metric(stat.get('name', {}).get('name')) + and _json_score_from_stat(stat) is not None + ) + + def test_mmlu_instance_level(): + _require_helm() adapter = HELMAdapter() with tempfile.TemporaryDirectory() as 
tmpdir: @@ -60,9 +127,22 @@ def test_mmlu_instance_level(): metadata_args, ) - assert len(instance_logs) == 10 - log = instance_logs[0] + # The converter now emits many rows per sample. Count distinct samples + # separately from rows so this test stays focused on sample coverage. + sample_ids = sorted({log.sample_id for log in instance_logs}) + assert len(sample_ids) == 10 + em_rows = [ + log + for log in instance_logs + if _metric_name_from_result_id(log.evaluation_result_id) + == 'exact_match' + ] + assert len(em_rows) == 10 + # Pick a specific metric row instead of relying on JSONL order. + log = _by_sample_and_metric(instance_logs)[ + ('id147', 'exact_match:test') + ] assert log.schema_version == '0.2.2' assert log.evaluation_id == 'test_mmlu_samples' assert log.model_id == 'openai/gpt2' @@ -91,8 +171,11 @@ def test_mmlu_instance_level(): assert log.token_usage.output_tokens > 0 assert log.token_usage.total_tokens > 0 + assert converted_eval.evaluation_results + def test_hellaswag_instance_level(): + _require_helm() adapter = HELMAdapter() with tempfile.TemporaryDirectory() as tmpdir: @@ -103,14 +186,23 @@ def test_hellaswag_instance_level(): 'file_uuid': 'test_hellaswag', } - converted_eval, instance_logs = _load_instance_level_data( + _, instance_logs = _load_instance_level_data( adapter, 'tests/data/helm/commonsense:dataset=hellaswag,method=multiple_choice_joint,model=eleutherai_pythia-1b-v0', metadata_args, ) - assert len(instance_logs) == 10 - log = instance_logs[0] + sample_ids = sorted({log.sample_id for log in instance_logs}) + assert len(sample_ids) == 10 + + em_rows = [ + log + for log in instance_logs + if _metric_name_from_result_id(log.evaluation_result_id) + == 'exact_match' + ] + assert len(em_rows) == 10 + log = em_rows[0] assert log.schema_version == '0.2.2' assert log.model_id == 'eleutherai/pythia-1b-v0' @@ -129,6 +221,7 @@ def test_hellaswag_instance_level(): def test_narrativeqa_instance_level(): + _require_helm() adapter = HELMAdapter() 
with tempfile.TemporaryDirectory() as tmpdir: @@ -139,14 +232,23 @@ def test_narrativeqa_instance_level(): 'file_uuid': 'test_narrativeqa', } - converted_eval, instance_logs = _load_instance_level_data( + _, instance_logs = _load_instance_level_data( adapter, 'tests/data/helm/narrative_qa:model=openai_gpt2', metadata_args, ) - assert len(instance_logs) == 5 - log = instance_logs[0] + sample_ids = sorted({log.sample_id for log in instance_logs}) + assert len(sample_ids) == 5 + + em_rows = [ + log + for log in instance_logs + if _metric_name_from_result_id(log.evaluation_result_id) + == 'exact_match' + ] + assert len(em_rows) == 5 + log = em_rows[0] assert log.schema_version == '0.2.2' assert log.model_id == 'openai/gpt2' @@ -166,3 +268,332 @@ def test_narrativeqa_instance_level(): assert len(log.answer_attribution) == 1 assert log.answer_attribution[0].extraction_method == 'exact_match' + + +def test_per_sample_core_metric_rows_are_emitted(): + _require_helm() + adapter = HELMAdapter() + with tempfile.TemporaryDirectory() as tmpdir: + metadata_args = { + 'source_organization_name': 'TestOrg', + 'evaluator_relationship': EvaluatorRelationship.first_party, + 'parent_eval_output_dir': tmpdir, + 'file_uuid': 'test_grain', + } + _, instance_logs = _load_instance_level_data( + adapter, + 'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2', + metadata_args, + ) + rows_for_id147 = [ + log for log in instance_logs if log.sample_id == 'id147' + ] + metric_ids = sorted(log.evaluation_result_id for log in rows_for_id147) + # This guards the core PR behavior: HELM evaluation metrics survive + # as separate rows, while bookkeeping stats remain out-of-band. 
+ assert 'exact_match:test' in metric_ids + assert 'quasi_exact_match:test' in metric_ids + assert 'num_prompt_tokens:test' not in metric_ids + assert 'inference_runtime:test' not in metric_ids + assert all(log.evaluation_result_id is not None for log in rows_for_id147) + assert len(metric_ids) == len(set(metric_ids)) + + +def test_bookkeeping_stats_are_not_emitted_as_metric_rows(): + _require_helm() + adapter = HELMAdapter() + with tempfile.TemporaryDirectory() as tmpdir: + metadata_args = { + 'source_organization_name': 'TestOrg', + 'evaluator_relationship': EvaluatorRelationship.first_party, + 'parent_eval_output_dir': tmpdir, + 'file_uuid': 'test_correctness', + } + _, instance_logs = _load_instance_level_data( + adapter, + 'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2', + metadata_args, + ) + + # Positive bookkeeping values are not correctness claims. A token + # count or runtime can be > 0 without the answer being correct, so + # these stats should not become metric rows at all. 
+ bookkeeping_names = { + 'num_references', + 'num_prompt_tokens', + 'num_completion_tokens', + 'num_output_tokens', + 'inference_runtime', + 'max_prob', + 'finish_reason_unknown', + 'logprob', + 'num_bytes', + 'batch_size', + } + emitted_metric_names = { + _metric_name_from_result_id(log.evaluation_result_id) + for log in instance_logs + } + assert emitted_metric_names.isdisjoint(bookkeeping_names) + + +def test_graded_core_metrics_are_not_binary_correctness(): + _require_helm() + adapter = HELMAdapter() + with tempfile.TemporaryDirectory() as tmpdir2: + metadata_args2 = { + 'source_organization_name': 'TestOrg', + 'evaluator_relationship': EvaluatorRelationship.first_party, + 'parent_eval_output_dir': tmpdir2, + 'file_uuid': 'test_correctness_graded', + } + _, narr_logs = _load_instance_level_data( + adapter, + 'tests/data/helm/narrative_qa:model=openai_gpt2', + metadata_args2, + ) + # Graded generation metrics also should not be coerced into a + # binary correctness label just because their scores are positive. 
+ graded = [ + log + for log in narr_logs + if _metric_name_from_result_id(log.evaluation_result_id) + in {'rouge_l', 'f1_score', 'bleu_1', 'bleu_4'} + ] + assert graded, 'expected graded score rows in narrative_qa fixture' + assert all( + log.evaluation.is_correct is False for log in graded + ), ( + 'graded metrics (rouge_l/f1_score/bleu_*) must not be ' + 'treated as binary correctness' + ) + + +def test_is_correct_is_true_for_correct_exact_match_rows(): + _require_helm() + adapter = HELMAdapter() + with tempfile.TemporaryDirectory() as tmpdir: + metadata_args = { + 'source_organization_name': 'TestOrg', + 'evaluator_relationship': EvaluatorRelationship.first_party, + 'parent_eval_output_dir': tmpdir, + 'file_uuid': 'test_exact_match_true', + } + _, instance_logs = _load_instance_level_data( + adapter, + 'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2', + metadata_args, + ) + # Correctness metrics are the exception: positive exact-match rows + # should still carry is_correct=True. + positive_em_rows = [ + log + for log in instance_logs + if _metric_name_from_result_id(log.evaluation_result_id) + == 'exact_match' + and log.evaluation.score > 0 + ] + assert positive_em_rows + for log in positive_em_rows: + assert log.evaluation.is_correct is True + + +def test_is_correct_for_metric_helper(): + for name in _BINARY_CORRECTNESS_METRIC_NAMES: + assert _is_correct_for_metric(name, 1.0) is True + assert _is_correct_for_metric(name, 0.0) is False + for name in ( + 'num_prompt_tokens', + 'num_references', + 'inference_runtime', + 'rouge_l', + 'f1_score', + 'bleu_1', + 'logprob', + None, + ): + assert _is_correct_for_metric(name, 1.0) is False + assert _is_correct_for_metric(name, 0.0) is False + + +def test_score_from_stat_helper_edge_cases(): + # Empty or malformed HELM stats should be skipped, not crash conversion. 
+ assert _score_from_stat(SimpleNamespace(mean=0.25, sum=10, count=2)) == 0.25 + assert _score_from_stat(SimpleNamespace(mean=None, sum=3, count=2)) == 1.5 + assert _score_from_stat(SimpleNamespace(mean=None, sum=0, count=0)) is None + assert _score_from_stat(SimpleNamespace(mean=None, sum=None, count=1)) is None + assert _score_from_stat(SimpleNamespace(mean='bad', sum=1, count=1)) is None + + +def test_evaluation_result_id_helper_disambiguates_split_and_perturbation(): + # Split and perturbation suffixes prevent same-named HELM stats from + # colliding when they are used as join keys. + assert _evaluation_result_id('exact_match') == 'exact_match' + assert _evaluation_result_id('exact_match', 'test') == 'exact_match:test' + assert ( + _evaluation_result_id( + 'exact_match', + 'test', + SimpleNamespace(name='robustness'), + ) + == 'exact_match:test:robustness' + ) + + +def test_total_rows_matches_core_per_instance_stats(): + _require_helm() + fixture = 'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2' + adapter = HELMAdapter() + with tempfile.TemporaryDirectory() as tmpdir: + metadata_args = { + 'source_organization_name': 'TestOrg', + 'evaluator_relationship': EvaluatorRelationship.first_party, + 'parent_eval_output_dir': tmpdir, + 'file_uuid': 'test_exact_total_rows', + } + converted_eval, instance_logs = _load_instance_level_data( + adapter, fixture, metadata_args + ) + + # Count expected core metric rows from the fixture itself so + # duplication or accidental filtering changes are caught precisely. 
+ expected_rows = _expected_core_instance_stat_rows(fixture) + assert converted_eval.detailed_evaluation_results.total_rows == expected_rows + assert len(instance_logs) == expected_rows + assert len({ + (log.sample_id, log.evaluation_result_id) + for log in instance_logs + }) == len(instance_logs) + + +def test_instance_evaluation_result_ids_join_to_aggregate_results(): + _require_helm() + adapter = HELMAdapter() + with tempfile.TemporaryDirectory() as tmpdir: + metadata_args = { + 'source_organization_name': 'TestOrg', + 'evaluator_relationship': EvaluatorRelationship.first_party, + 'parent_eval_output_dir': tmpdir, + 'file_uuid': 'test_join_keys', + } + converted_eval, instance_logs = _load_instance_level_data( + adapter, + 'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2', + metadata_args, + ) + + # This is the most important schema invariant: every metric-specific + # detail row should be joinable to one aggregate evaluation result. + aggregate_ids = { + result.evaluation_result_id + for result in converted_eval.evaluation_results + if result.evaluation_result_id is not None + } + detail_ids = { + log.evaluation_result_id + for log in instance_logs + if log.evaluation_result_id is not None + } + + assert aggregate_ids + assert detail_ids + assert detail_ids <= aggregate_ids + + +def test_aggregate_evaluation_result_ids_are_unique_and_non_null(): + _require_helm() + adapter = HELMAdapter() + with tempfile.TemporaryDirectory() as tmpdir: + metadata_args = { + 'source_organization_name': 'TestOrg', + 'evaluator_relationship': EvaluatorRelationship.first_party, + 'parent_eval_output_dir': tmpdir, + 'file_uuid': 'test_aggregate_ids', + } + converted_eval, _ = _load_instance_level_data( + adapter, + 'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2', + metadata_args, + ) + + # Aggregate IDs are the target side of the join, so they must be + # present and unique. 
+ result_ids = [ + result.evaluation_result_id + for result in converted_eval.evaluation_results + ] + assert all(result_ids) + assert len(result_ids) == len(set(result_ids)) + assert 'exact_match:test' in result_ids + assert 'num_prompt_tokens:test' not in result_ids + + +def test_missing_inst_stats_uses_legacy_exact_match_fallback(): + _require_helm() + # Some old or partial HELM logs may lack per-instance stats. The adapter + # should still emit the legacy one-row exact-match fallback. + completion = SimpleNamespace(text='answer') + state = SimpleNamespace( + instance=SimpleNamespace( + id='sample0', + references=[ + SimpleNamespace( + output=SimpleNamespace(text='answer'), + tags=['correct'], + ) + ], + ), + result=SimpleNamespace(completions=[completion], request_time=0.1), + request=SimpleNamespace(prompt='question'), + output_mapping={}, + ) + + with tempfile.TemporaryDirectory() as tmpdir: + adapter = HELMInstanceLevelDataAdapter( + 'fallback_samples', + 'jsonl', + 'sha256', + tmpdir, + ) + path, count = adapter.convert_instance_level_logs( + 'tiny', + 'dev/model', + [state], + [], + ) + + assert count == 1 + data = json.loads(Path(path).read_text().strip()) + log = InstanceLevelEvaluationLog.model_validate(data) + assert log.evaluation_result_id is None + assert log.evaluation.score == 1.0 + assert log.evaluation.is_correct is True + + +def test_reasoning_traces_none_does_not_break_conversion(monkeypatch): + _require_helm() + from every_eval_ever.converters.helm import instance_level_adapter as mod + + # HELM reasoning extraction may legitimately return None. The converter + # normalizes that case instead of passing an invalid value to the schema. 
+ monkeypatch.setattr(mod, 'extract_all_reasonings', lambda state: None) + + adapter = HELMAdapter() + with tempfile.TemporaryDirectory() as tmpdir: + metadata_args = { + 'source_organization_name': 'TestOrg', + 'evaluator_relationship': EvaluatorRelationship.first_party, + 'parent_eval_output_dir': tmpdir, + 'file_uuid': 'test_reasoning_none', + } + _, instance_logs = _load_instance_level_data( + adapter, + 'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2', + metadata_args, + ) + assert instance_logs + for log in instance_logs: + trace = log.output.reasoning_trace + assert trace is None or trace == [] or all( + isinstance(t, str) for t in trace + )