diff --git a/every_eval_ever/converters/helm/adapter.py b/every_eval_ever/converters/helm/adapter.py
index e43eb01b9..d57d7f30f 100644
--- a/every_eval_ever/converters/helm/adapter.py
+++ b/every_eval_ever/converters/helm/adapter.py
@@ -3,7 +3,7 @@
 import os
 import uuid
 from pathlib import Path
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Union, cast
 
 _HELM_IMPORT_ERROR: Exception | None = None
 try:
@@ -29,14 +29,18 @@
     Exception
 ) as ex:  # pragma: no cover - exercised only when optional deps missing
     _HELM_IMPORT_ERROR = ex
-    DaciteConfig = from_dict = None  # type: ignore[assignment]
-    PerInstanceStats = AdapterSpec = RequestState = ScenarioState = Stat = (
-        RunSpec
-    ) = Any  # type: ignore[assignment]
-    get_model_deployment = register_builtin_configs_from_helm_package = (
-        from_json
-    ) = None  # type: ignore[assignment]
-    ModelDeploymentNotFoundError = Exception  # type: ignore[assignment]
+    DaciteConfig = cast(Any, None)
+    from_dict = cast(Any, None)
+    PerInstanceStats = cast(Any, None)
+    AdapterSpec = cast(Any, None)
+    RequestState = cast(Any, None)
+    ScenarioState = cast(Any, None)
+    Stat = cast(Any, None)
+    RunSpec = cast(Any, None)
+    get_model_deployment = cast(Any, None)
+    register_builtin_configs_from_helm_package = cast(Any, None)
+    from_json = cast(Any, None)
+    ModelDeploymentNotFoundError = cast(Any, Exception)
 
 from every_eval_ever.converters import SCHEMA_VERSION
 from every_eval_ever.converters.common.adapter import (
@@ -45,8 +49,11 @@
     SupportedLibrary,
 )
 from every_eval_ever.converters.common.utils import sha256_file
+from every_eval_ever.converters.helm.metrics import is_core_metric
 from every_eval_ever.converters.helm.instance_level_adapter import (
     HELMInstanceLevelDataAdapter,
+    _evaluation_result_id,
+    _score_from_stat,
 )
 from every_eval_ever.converters.helm.utils import extract_reasoning
 from every_eval_ever.eval_types import (
@@ -67,7 +74,6 @@
     SourceType,
     Uncertainty,
 )
-from every_eval_ever.instance_level_types import InstanceLevelEvaluationLog
 
 
 def _require_helm_dependencies() -> None:
@@ -106,6 +112,7 @@ def metadata(self) -> AdapterMetadata:
         return AdapterMetadata(
             name='HELMAdapter',
             version='0.0.1',
+            supported_library_versions=['helm'],
             description='HELM adapter with dynamic metrics and unified JSONL instance logging',
         )
 
@@ -129,7 +136,8 @@ def _split_model_id(self, model_id: str | None) -> tuple[str, str]:
         if not model_id:
             return ('unknown', 'unknown')
         if '/' in model_id:
-            return tuple(model_id.split('/', 1))
+            developer, name = model_id.split('/', 1)
+            return (developer, name)
         return ('unknown', model_id)
 
     def _extract_model_info(self, adapter_spec: AdapterSpec) -> ModelInfo:
@@ -191,6 +199,7 @@ def _load_file_if_exists(self, dir_path, file_name) -> Any:
         return None
 
     def _load_evaluation_run_logfiles(self, dir_path) -> Dict:
+        """Load the HELM files needed for aggregate and detail conversion."""
         scenario_state_dict = self._load_file_if_exists(
             dir_path, self.SCENARIO_STATE_FILE
         )
@@ -210,14 +219,18 @@ def _load_evaluation_run_logfiles(self, dir_path) -> Dict:
         }
 
     def transform_from_directory(
-        self, dir_path: str, output_path: str, metadata_args: Dict[str, Any]
-    ):
+        self,
+        dir_path: str | Path,
+        metadata_args: Dict[str, Any] | None = None,
+        output_path: str | None = None,
+    ) -> List[EvaluationLog]:
         """
         Transforms HELM results into one aggregate EvaluationLog and one
         instance-level JSONL file containing all samples.
         """
-        # all_instance_logs: List[InstanceLevelEvaluationLog] = []
         aggregate_logs: List[EvaluationLog] = []
+        metadata_args = metadata_args or {}
+        dir_path = str(dir_path)
 
         file_uuids = metadata_args.get('file_uuids')
 
@@ -260,11 +273,6 @@ def transform_from_directory(
                     aggregate_logs.append(agg)
                     converted_idx += 1
 
-        # # Write all consolidated instance logs to JSONL
-        # with open(output_path, 'w', encoding='utf-8') as f:
-        #     for log in all_instance_logs:
-        #         f.write(json.dumps(log.model_dump(), ensure_ascii=False) + '\n')
-
         return aggregate_logs
 
     def _extract_generation_args(
@@ -318,6 +326,7 @@ def _extract_evaluation_time(
     def _extract_dataset_name(
         self, run_spec_name: str, scenario_name: str | None
     ) -> str:
+        """Prefer scenario metadata, falling back to HELM run-spec names."""
         if scenario_name:
             return scenario_name
 
@@ -332,20 +341,16 @@ def _extract_dataset_name(
 
         return run_spec_name.split(':')[0]
 
-    def _extract_metric_names(self, run_spec: RunSpec) -> List[str]:
-        metric_names = []
-        for metric_spec in run_spec.metric_specs:
-            names = metric_spec.args.get('names')
-            if names:
-                metric_names.extend(names)
-            else:
-                metric_names.append(metric_spec.class_name.split('.')[-1])
-
-        return metric_names
-
     def _transform_single(
         self, raw_data: Dict, metadata_args: Dict[str, Any]
-    ) -> Tuple[EvaluationLog, List[InstanceLevelEvaluationLog]]:
+    ) -> EvaluationLog:
+        """Convert one HELM run into aggregate JSON plus detail JSONL.
+
+        The aggregate ``evaluation_result_id`` values are generated from
+        core metrics in ``stats.json`` with the same helper used by the
+        instance converter so every metric-specific detail row can join
+        back to an aggregate result.
+        """
         run_spec = from_dict(data_class=RunSpec, data=raw_data['run_spec_dict'])
         # cast=[str] coerces int instance IDs to str; newer HELM versions
         # (e.g. long-context suite) store instance.id as int in the JSON.
@@ -402,80 +407,112 @@ def _transform_single(
 
         evaluation_id = f'{source_data.dataset_name}/{model_info.id.replace("/", "_")}/{evaluation_timestamp}'
 
-        metric_names = self._extract_metric_names(run_spec)
-
+        # Build aggregate results from core HELM stats themselves, not
+        # only from run_spec.metric_specs. The instance-level converter emits
+        # one row per core per-instance stat, so aggregate IDs must cover
+        # the same core namespace for detailed rows to be joinable.
+        # TODO: Consider promoting bookkeeping telemetry into structured
+        # fields such as token_usage, performance, metadata, or
+        # additional_details in a separate follow-up.
         evaluation_results: List[EvaluationResult] = []
+        seen_evaluation_result_ids: set[str] = set()
+
+        for stat in stats_raw:
+            # The ID helper mirrors the instance-level converter. This is the
+            # key invariant: detail rows should never introduce metric IDs that
+            # are absent from aggregate evaluation_results.
+            metric_name = getattr(getattr(stat, 'name', None), 'name', None)
+            if not is_core_metric(metric_name):
+                continue
+            score = _score_from_stat(stat)
+            if metric_name is None or score is None:
+                continue
+
+            stat_count = getattr(stat, 'count', None)
+
+            evaluation_result_id = _evaluation_result_id(
+                metric_name,
+                getattr(stat.name, 'split', None),
+                getattr(stat.name, 'perturbation', None),
+            )
+            if evaluation_result_id is None:
+                continue
+            if evaluation_result_id in seen_evaluation_result_ids:
+                continue
+            seen_evaluation_result_ids.add(evaluation_result_id)
 
-        for metric_name in set(metric_names):
             metric_config = MetricConfig(
                 evaluation_description=metric_name,
                 lower_is_better=False,  # TODO schema.json check
                 score_type=ScoreType.continuous,
                 min_score=0,
-                max_score=1,
+                max_score=1.0,
             )
 
-            matching_stats = [
-                s
-                for s in stats_raw
-                if s.name.name == metric_name and not s.name.perturbation
-            ]
-
-            for stat in matching_stats:
-                evaluation_name = (
-                    f'{metric_name} on {source_data.dataset_name}'
-                    if not stat.name.split
-                    else f'{metric_name} {stat.name.split} on {source_data.dataset_name}'
-                )
+            split = getattr(stat.name, 'split', None)
+            perturbation = getattr(stat.name, 'perturbation', None)
+            name_parts = [metric_name]
+            if split:
+                name_parts.append(str(split))
+            if perturbation:
+                name_parts.append(str(perturbation))
+            evaluation_name = (
+                f'{" ".join(name_parts)} on {source_data.dataset_name}'
+            )
 
-                evaluation_results.append(
-                    EvaluationResult(
-                        evaluation_name=evaluation_name,
-                        source_data=source_data,
-                        evaluation_timestamp=evaluation_timestamp,
-                        metric_config=metric_config,
-                        score_details=ScoreDetails(
-                            score=stat.mean
-                            or (stat.sum / stat.count if stat.count else 0.0),
-                            uncertainty=Uncertainty(
-                                standard_deviation=stat.stddev,
-                                num_samples=adapter_spec.max_eval_instances
-                                or len(request_states),
+            evaluation_results.append(
+                EvaluationResult(
+                    evaluation_result_id=evaluation_result_id,
+                    evaluation_name=evaluation_name,
+                    source_data=source_data,
+                    evaluation_timestamp=evaluation_timestamp,
+                    metric_config=metric_config,
+                    score_details=ScoreDetails(
+                        score=score,
+                        uncertainty=Uncertainty(
+                            standard_deviation=getattr(stat, 'stddev', None),
+                            # Split-specific HELM stats may cover fewer
+                            # examples than the full run, so use the stat's
+                            # own count when it is available.
+                            num_samples=(
+                                stat_count
+                                if stat_count is not None
+                                else adapter_spec.max_eval_instances
+                                or len(request_states)
                             ),
-                            details={
-                                'count': str(stat.count),
-                                'split': str(stat.name.split)
-                                if stat.name.split
-                                else '',
-                                'perturbation': str(stat.name.perturbation)
-                                if stat.name.perturbation
-                                else '',
-                            },
                         ),
-                        generation_config=GenerationConfig(
-                            generation_args=self._extract_generation_args(
-                                adapter_spec=adapter_spec,
-                                request_state=request_states[0],
-                            ),
-                            additional_details={
-                                'stop_sequences': json.dumps(
-                                    request_states[0].request.stop_sequences
-                                )
-                                if request_states[0].request.stop_sequences
-                                else '[]',
-                                'presence_penalty': str(
-                                    request_states[0].request.presence_penalty
-                                ),
-                                'frequency_penalty': str(
-                                    request_states[0].request.frequency_penalty
-                                ),
-                                'num_completions': str(
-                                    request_states[0].request.num_completions
-                                ),
-                            },
+                        details={
+                            'count': str(getattr(stat, 'count', '')),
+                            'split': str(split) if split else '',
+                            'perturbation': str(perturbation)
+                            if perturbation
+                            else '',
+                        },
+                    ),
+                    generation_config=GenerationConfig(
+                        generation_args=self._extract_generation_args(
+                            adapter_spec=adapter_spec,
+                            request_state=request_states[0],
                         ),
-                    )
+                        additional_details={
+                            'stop_sequences': json.dumps(
+                                request_states[0].request.stop_sequences
+                            )
+                            if request_states[0].request.stop_sequences
+                            else '[]',
+                            'presence_penalty': str(
+                                request_states[0].request.presence_penalty
+                            ),
+                            'frequency_penalty': str(
+                                request_states[0].request.frequency_penalty
+                            ),
+                            'num_completions': str(
+                                request_states[0].request.num_completions
+                            ),
+                        },
+                    ),
                 )
+            )
 
         if request_states:
             parent_eval_output_dir = metadata_args.get('parent_eval_output_dir')
diff --git a/every_eval_ever/converters/helm/instance_level_adapter.py b/every_eval_ever/converters/helm/instance_level_adapter.py
index 037237ecc..bd0442a89 100644
--- a/every_eval_ever/converters/helm/instance_level_adapter.py
+++ b/every_eval_ever/converters/helm/instance_level_adapter.py
@@ -1,6 +1,6 @@
 import json
 from pathlib import Path
-from typing import Any, List, Tuple
+from typing import Any, List, Tuple, cast
 
 _HELM_IMPORT_ERROR: Exception | None = None
 try:
@@ -9,7 +9,7 @@
     Exception
 ) as ex:  # pragma: no cover - exercised only when optional deps missing
     _HELM_IMPORT_ERROR = ex
-    RequestState = Any  # type: ignore[assignment]
+    RequestState = cast(Any, None)
 
 
 def _require_helm_dependencies() -> None:
@@ -22,6 +22,7 @@ def _require_helm_dependencies() -> None:
 
 from every_eval_ever.converters import SCHEMA_VERSION
 from every_eval_ever.converters.common.utils import sha256_string
+from every_eval_ever.converters.helm.metrics import is_core_metric
 from every_eval_ever.converters.helm.utils import extract_all_reasonings
 from every_eval_ever.instance_level_types import (
     AnswerAttributionItem,
@@ -35,6 +36,98 @@ def _require_helm_dependencies() -> None:
 )
 
 
+def _score_from_stat(stat) -> float | None:
+    """Return a finite scalar HELM stat value, or None for unusable stats.
+
+    Prefers ``mean``; when it is missing, falls back to ``sum / count``
+    for a non-zero ``count``. Non-numeric, NaN and infinite values yield
+    None so callers can skip the row instead of emitting a bogus score.
+    """
+    import math  # local import: only this helper needs it
+
+    try:
+        value = getattr(stat, 'mean', None)
+        if value is None:
+            count = getattr(stat, 'count', None)
+            if not count:
+                return None
+            value = getattr(stat, 'sum', None) / count
+        value = float(value)
+    except (TypeError, ValueError, ZeroDivisionError):
+        return None
+    # NaN/inf are not representable in strict JSON and are not usable
+    # scores, so treat them like a missing value.
+    return value if math.isfinite(value) else None
+
+
+def _stat_name_part(value) -> str | None:
+    """Normalize a HELM split/perturbation label into a string ID part."""
+    if value is None or isinstance(value, str):
+        return value or None  # None and '' both mean "no label"
+    if isinstance(value, dict):
+        label = value.get('name')
+    else:
+        label = getattr(value, 'name', None)
+    return label or str(value)
+
+
+def _evaluation_result_id(
+    metric_name: str | None,
+    split=None,
+    perturbation=None,
+) -> str | None:
+    """Compose the ``evaluation_result_id`` for one HELM stat.
+
+    The ID is ``metric_name`` followed by the normalized split and
+    perturbation labels (when non-empty), joined with ``:``. Including
+    the labels keeps stats that share a metric name from colliding.
+
+    Returns None when ``metric_name`` is None.
+    """
+    if metric_name is None:
+        return None
+    labels = (
+        _stat_name_part(split),
+        _stat_name_part(perturbation),
+    )
+    return ':'.join([metric_name, *(label for label in labels if label)])
+
+
+# Allowlist of metric names whose per-instance score can be read as a
+# binary correctness signal (``score > 0`` -> ``is_correct=True``). Graded
+# core metrics such as rouge/bleu/f1 are left out on purpose: a positive
+# score there is not a correctness claim. Keep this list tight.
+_BINARY_CORRECTNESS_METRIC_NAMES: frozenset[str] = frozenset({
+    'exact_match',
+    'quasi_exact_match',
+    'prefix_exact_match',
+    'quasi_prefix_exact_match',
+    'exact_match@5',
+    'quasi_exact_match@5',
+    'prefix_exact_match@5',
+    'quasi_prefix_exact_match@5',
+    'ifeval_strict_accuracy',
+    'chain_of_thought_correctness',
+    'math_equiv',
+    'math_equiv_chain_of_thought',
+})
+
+
+def _is_correct_for_metric(metric_name: str | None, score: float) -> bool:
+    """Map a per-instance score to ``is_correct`` for the given metric.
+
+    Only names in ``_BINARY_CORRECTNESS_METRIC_NAMES`` can yield True,
+    and then only for a positive score. Every other name, including
+    None, is reported as not correct, because a positive graded score
+    (rouge_l, bleu, f1, ...) is not a correctness signal.
+    """
+    # None is never a member of the allowlist, so the membership test
+    # also covers the "no metric name" case without a separate guard.
+    allowlisted = metric_name in _BINARY_CORRECTNESS_METRIC_NAMES
+    return allowlisted and score > 0
+
+
 class HELMInstanceLevelDataAdapter:
     def __init__(
         self,
@@ -51,6 +144,7 @@ def __init__(
         self.path = f'{evaluation_dir}/{evaulation_id}.{format}'
 
     def _save_json(self, items: List[InstanceLevelEvaluationLog]):
+        """Write one validated instance-level log per JSONL line."""
         eval_dir_path = Path(self.evaluation_dir)
         eval_dir_path.mkdir(parents=True, exist_ok=True)
         path = Path(self.path)
@@ -73,6 +167,12 @@ def convert_instance_level_logs(
         request_states: List[RequestState],
         per_instance_stats_list: List,
     ) -> Tuple[str, int]:
+        """Convert HELM request states into per-(sample, metric) rows.
+
+        When HELM per-instance stats are present, each core metric gets
+        its own detail row. If core metrics are absent, keep the legacy
+        one-row exact-match fallback so older or partial logs still convert.
+        """
         instance_level_logs: List[InstanceLevelEvaluationLog] = []
         for state in request_states:
             inst_stats = next(
@@ -97,28 +197,41 @@ def convert_instance_level_logs(
             reasoning_traces = extract_all_reasonings(state)
             if isinstance(reasoning_traces, str):
                 reasoning_traces = [reasoning_traces]
+            if reasoning_traces is None:
+                reasoning_traces = []
+            reasoning_traces = [
+                trace for trace in reasoning_traces if isinstance(trace, str)
+            ]
 
-            is_correct = False
-            score = 0.0
-            if inst_stats:
-                em_stat = next(
-                    (
-                        s
-                        for s in inst_stats.stats
-                        if s.name.name == 'exact_match'
-                    ),
-                    None,
+            all_stats = list(inst_stats.stats) if inst_stats else []
+            metric_stats = [
+                stat
+                for stat in all_stats
+                if is_core_metric(
+                    getattr(getattr(stat, 'name', None), 'name', None)
                 )
-                if em_stat:
-                    score = em_stat.mean
-                    is_correct = em_stat.mean > 0
-                else:  # TODO check for more specific tasks
-                    correct_completions = sum(
-                        1 for c in completions if c.strip() in correct_refs
-                    )
-                    score = correct_completions / len(completions)
-                    is_correct = score > 0
+            ]
+            if not metric_stats:
+                # Preserve the legacy exact-match proxy instead of dropping
+                # samples that have no per-instance stats or no recognized
+                # core metric rows.
+                correct_completions = sum(
+                    1 for c in completions if c.strip() in correct_refs
+                )
+                fallback_score = (
+                    correct_completions / len(completions)
+                    if completions
+                    else 0.0
+                )
+                metric_stats = [None]
+
+            # Scope control: only core HELM metrics become metric rows.
+            # TODO: Consider preserving additional bookkeeping telemetry in
+            # token_usage, performance.additional_details, or metadata.
 
+            # Token usage is copied to every row for the same sample. This is
+            # intentionally denormalized so each core metric row is
+            # independently useful when filtered by metric.
             token_usage = None
             if inst_stats:
                 p_tokens = next(
@@ -155,56 +268,82 @@ def convert_instance_level_logs(
                     total_tokens=int(p_tokens + c_tokens),
                 )
 
-            instance_level_logs.append(
-                InstanceLevelEvaluationLog(
-                    schema_version=SCHEMA_VERSION,
-                    evaluation_id=self.evaluation_id,
-                    model_id=model_id,
-                    evaluation_name=evaluation_name,
-                    sample_id=str(state.instance.id),
-                    sample_hash=sha256_string(
-                        state.request.prompt + (correct_refs[0] if correct_refs else '')
-                    ),  # TODO use all references
-                    interaction_type=InteractionType.single_turn,
-                    input=Input(
-                        raw=state.request.prompt,
-                        reference=correct_refs if correct_refs else [],
-                        choices=(
-                            list(state.output_mapping.values())
-                            if state.output_mapping
-                            else [
-                                ref.output.text
-                                for ref in state.instance.references
-                            ]
+            for stat in metric_stats:
+                if stat is None:
+                    metric_name = None
+                    evaluation_result_id = None
+                    score = fallback_score
+                    # Fallback path: ``score`` here is an exact-match
+                    # proxy from completion-vs-reference matching, so
+                    # the correctness claim is honest in the same sense
+                    # as the legacy single-row behavior.
+                    is_correct = score > 0
+                else:
+                    stat_name = getattr(stat, 'name', None)
+                    metric_name = getattr(stat_name, 'name', None)
+                    evaluation_result_id = _evaluation_result_id(
+                        metric_name,
+                        getattr(stat_name, 'split', None),
+                        getattr(stat_name, 'perturbation', None),
+                    )
+                    score = _score_from_stat(stat)
+                    if score is None:
+                        continue
+                    is_correct = _is_correct_for_metric(metric_name, score)
+                instance_level_logs.append(
+                    InstanceLevelEvaluationLog(
+                        schema_version=SCHEMA_VERSION,
+                        evaluation_id=self.evaluation_id,
+                        model_id=model_id,
+                        evaluation_name=evaluation_name,
+                        evaluation_result_id=evaluation_result_id,
+                        sample_id=str(state.instance.id),
+                        sample_hash=sha256_string(
+                            state.request.prompt + (correct_refs[0] if correct_refs else '')
+                        ),  # TODO use all references
+                        interaction_type=InteractionType.single_turn,
+                        input=Input(
+                            raw=state.request.prompt,
+                            reference=correct_refs if correct_refs else [],
+                            choices=(
+                                list(state.output_mapping.values())
+                                if state.output_mapping
+                                else [
+                                    ref.output.text
+                                    for ref in state.instance.references
+                                ]
+                            ),
                         ),
-                    ),
-                    output=Output(
-                        raw=completions, reasoning_trace=reasoning_traces
-                    ),
-                    answer_attribution=[
-                        AnswerAttributionItem(
-                            turn_idx=0,
-                            source='output.raw',
-                            extracted_value=state.result.completions[
-                                0
-                            ].text.strip()
-                            if state.result and state.result.completions
-                            else '',
-                            extraction_method='exact_match',
-                            is_terminal=True,
-                        )
-                    ],
-                    evaluation=Evaluation(
-                        score=float(score), is_correct=is_correct
-                    ),
-                    token_usage=token_usage,
-                    performance=Performance(
-                        generation_time_ms=state.result.request_time * 1000
-                        if state.result.request_time
-                        else None
-                    ),
+                        output=Output(
+                            raw=completions, reasoning_trace=reasoning_traces
+                        ),
+                        answer_attribution=[
+                            AnswerAttributionItem(
+                                turn_idx=0,
+                                source='output.raw',
+                                extracted_value=state.result.completions[
+                                    0
+                                ].text.strip()
+                                if state.result and state.result.completions
+                                else '',
+                                extraction_method='exact_match',
+                                is_terminal=True,
+                            )
+                        ],
+                        evaluation=Evaluation(
+                            score=float(score), is_correct=is_correct
+                        ),
+                        token_usage=token_usage,
+                        performance=Performance(
+                            generation_time_ms=(
+                                state.result.request_time * 1000
+                                if state.result
+                                and state.result.request_time
+                                else None
+                            )
+                        ),
+                    )
                 )
-            )
 
         self._save_json(instance_level_logs)
         return self.path, len(instance_level_logs)
diff --git a/every_eval_ever/converters/helm/metrics.py b/every_eval_ever/converters/helm/metrics.py
new file mode 100644
index 000000000..10a509697
--- /dev/null
+++ b/every_eval_ever/converters/helm/metrics.py
@@ -0,0 +1,33 @@
+"""HELM metric filtering helpers."""
+
+from __future__ import annotations
+
+from typing import Optional
+
+
+# HELM emits benchmark metrics and bookkeeping telemetry in stats.json /
+# per_instance_stats.json. Only names starting with one of these prefixes
+# become EEE aggregate/detail metric rows; bookkeeping may later be mapped
+# to token_usage, performance, metadata, or additional_details.
+CORE_METRIC_PREFIXES: tuple[str, ...] = (
+    'exact_match',
+    'quasi_exact_match',
+    'prefix_exact_match',
+    'quasi_prefix_exact_match',
+    'classification_micro_f1',
+    'classification_macro_f1',
+    'f1_score',
+    'rouge_l',
+    'bleu_',
+    'ifeval_strict_accuracy',
+    'chain_of_thought_correctness',
+    'math_equiv',
+    'math_equiv_chain_of_thought',
+)
+
+
+def is_core_metric(metric_name: Optional[str]) -> bool:
+    """Return True when ``metric_name`` starts with a core metric prefix."""
+    if not metric_name:
+        return False  # None and '' never name a metric row
+    return metric_name.startswith(CORE_METRIC_PREFIXES)  # any prefix matches
diff --git a/tests/test_helm_adapter.py b/tests/test_helm_adapter.py
index e08f48dcc..5d2bbb9a0 100644
--- a/tests/test_helm_adapter.py
+++ b/tests/test_helm_adapter.py
@@ -1,12 +1,9 @@
 import pytest
 
-pytest.importorskip(
-    'helm', reason='crfm-helm not installed; install with: uv sync --extra helm'
-)
-
 import tempfile
 from pathlib import Path
 
+from every_eval_ever.converters.helm import adapter as helm_adapter_module
 from every_eval_ever.converters.helm.adapter import HELMAdapter
 from every_eval_ever.eval_types import (
     EvaluationLog,
@@ -16,7 +13,18 @@
 )
 
 
+pytestmark = pytest.mark.skipif(
+    helm_adapter_module._HELM_IMPORT_ERROR is not None,
+    reason=(
+        'HELM converter dependencies are missing: '
+        f'{helm_adapter_module._HELM_IMPORT_ERROR!r}. '
+        'Install with: uv sync --extra helm'
+    ),
+)
+
+
 def _load_eval(adapter, filepath, metadata_args):
+    """Run the HELM aggregate adapter against one fixture directory."""
     eval_dirpath = Path(filepath)
 
     with tempfile.TemporaryDirectory() as tmpdir:
@@ -39,6 +47,16 @@ def _load_eval(adapter, filepath, metadata_args):
     return converted_eval
 
 
+def _assert_unique_evaluation_result_ids(converted_eval):
+    """Every aggregate result needs a non-empty, unique result ID."""
+    seen_ids = set()
+    for result in converted_eval.evaluation_results:
+        result_id = result.evaluation_result_id
+        assert result_id
+        assert result_id not in seen_ids
+        seen_ids.add(result_id)
+
+
 def test_mmlu_eval():
     adapter = HELMAdapter()
     metadata_args = {
@@ -73,10 +91,14 @@ def test_mmlu_eval():
     assert len(results) > 0
     assert any('mmlu' in r.evaluation_name.lower() for r in results)
     assert all(r.metric_config is not None for r in results)
+    _assert_unique_evaluation_result_ids(converted_eval)
 
     assert converted_eval.detailed_evaluation_results is not None
     assert converted_eval.detailed_evaluation_results.format is not None
-    assert converted_eval.detailed_evaluation_results.total_rows == 10
+    # Per-(sample, core metric) emission: each of the 10 samples produces one
+    # row per scored core-metric stat, so total_rows is at least the legacy
+    # "one row per sample" count.
+    assert converted_eval.detailed_evaluation_results.total_rows >= 10
 
 
 def test_hellswag_eval():
@@ -114,10 +136,12 @@ def test_hellswag_eval():
     assert len(results) > 0
     assert results[0].score_details.score is not None
     assert any('hellaswag' in r.evaluation_name.lower() for r in results)
+    _assert_unique_evaluation_result_ids(converted_eval)
 
     assert converted_eval.detailed_evaluation_results is not None
     assert converted_eval.detailed_evaluation_results.format is not None
-    assert converted_eval.detailed_evaluation_results.total_rows == 10
+    # Per-(sample, core metric): >= sample count, not equal to it.
+    assert converted_eval.detailed_evaluation_results.total_rows >= 10
 
 
 def test_narrativeqa_eval():
@@ -151,10 +175,12 @@ def test_narrativeqa_eval():
     assert len(results) > 0
     assert any('narrativeqa' in r.evaluation_name.lower() for r in results)
     assert all(r.metric_config is not None for r in results)
+    _assert_unique_evaluation_result_ids(converted_eval)
 
     assert converted_eval.detailed_evaluation_results is not None
     assert converted_eval.detailed_evaluation_results.format is not None
-    assert converted_eval.detailed_evaluation_results.total_rows == 5
+    # Per-(sample, core metric): >= sample count, not equal to it.
+    assert converted_eval.detailed_evaluation_results.total_rows >= 5
 
 
 def test_missing_model_deployment_falls_back_to_model():
diff --git a/tests/test_helm_instance_level_adapter.py b/tests/test_helm_instance_level_adapter.py
index 4ee46c6c2..98e21fcc3 100644
--- a/tests/test_helm_instance_level_adapter.py
+++ b/tests/test_helm_instance_level_adapter.py
@@ -1,14 +1,20 @@
-import pytest
-
-pytest.importorskip(
-    'helm', reason='crfm-helm not installed; install with: uv sync --extra helm'
-)
-
 import json
 import tempfile
 from pathlib import Path
+from types import SimpleNamespace
+
+import pytest
 
+from every_eval_ever.converters.helm import adapter as helm_adapter_module
 from every_eval_ever.converters.helm.adapter import HELMAdapter
+from every_eval_ever.converters.helm.metrics import is_core_metric
+from every_eval_ever.converters.helm.instance_level_adapter import (
+    HELMInstanceLevelDataAdapter,
+    _BINARY_CORRECTNESS_METRIC_NAMES,
+    _evaluation_result_id,
+    _is_correct_for_metric,
+    _score_from_stat,
+)
 from every_eval_ever.eval_types import EvaluatorRelationship
 from every_eval_ever.instance_level_types import (
     InstanceLevelEvaluationLog,
@@ -16,7 +22,18 @@
 )
 
 
+def _require_helm():
+    """Skip the calling test if the HELM adapter recorded an import error."""
+    helm_error = getattr(helm_adapter_module, '_HELM_IMPORT_ERROR', None)
+    if helm_error is None:
+        return
+    # Include the original import failure so the skip reason is actionable.
+    hint = f'{helm_error!r}. Install with: uv sync --extra helm'
+    pytest.skip('HELM converter dependencies are missing: ' + hint)
+
+
 def _load_instance_level_data(adapter, filepath, metadata_args):
+    """Run the HELM adapter and read back the generated JSONL detail rows."""
     eval_dirpath = Path(filepath)
     converted_eval_list = adapter.transform_from_directory(
         eval_dirpath,
@@ -43,7 +60,57 @@ def _load_instance_level_data(adapter, filepath, metadata_args):
     return converted_eval, instance_logs
 
 
+def _by_sample_and_metric(
+    instance_logs,
+) -> dict[tuple[str, str | None], InstanceLevelEvaluationLog]:
+    """Map (sample_id, evaluation_result_id) pairs to their detail row."""
+    indexed = {}
+    for row in instance_logs:
+        indexed[(row.sample_id, row.evaluation_result_id)] = row
+    return indexed
+
+
+def _metric_name_from_result_id(result_id: str | None) -> str | None:
+    """Return the result ID's text before its first ':'; pass None through."""
+    if result_id is None:
+        return None
+    return result_id.partition(':')[0]
+
+
+def _json_score_from_stat(stat: dict) -> float | None:
+    """Extract a float score from a raw stat dict, or None if unusable.
+
+    Prefers 'mean'; otherwise falls back to sum / count for a truthy count.
+    """
+    score = stat.get('mean')
+    if score is None and stat.get('count'):
+        try:
+            score = stat.get('sum') / stat['count']
+        except (TypeError, ValueError, ZeroDivisionError):
+            return None
+    if score is None:
+        return None
+    try:
+        return float(score)
+    except (TypeError, ValueError):
+        return None
+
+
+def _expected_core_instance_stat_rows(filepath):
+    """Count scored core-metric stats in the fixture's per-instance file."""
+    stats_path = Path(filepath) / 'per_instance_stats.json'
+    # JSON is UTF-8 by spec; don't rely on the platform default encoding.
+    entries = json.loads(stats_path.read_text(encoding='utf-8'))
+    return sum(
+        is_core_metric(stat.get('name', {}).get('name'))
+        and _json_score_from_stat(stat) is not None
+        for entry in entries
+        for stat in entry.get('stats', [])
+    )
+
+
 def test_mmlu_instance_level():
+    _require_helm()
     adapter = HELMAdapter()
 
     with tempfile.TemporaryDirectory() as tmpdir:
@@ -60,9 +127,22 @@ def test_mmlu_instance_level():
             metadata_args,
         )
 
-        assert len(instance_logs) == 10
-        log = instance_logs[0]
+        # The converter now emits many rows per sample. Count distinct samples
+        # separately from rows so this test stays focused on sample coverage.
+        sample_ids = sorted({log.sample_id for log in instance_logs})
+        assert len(sample_ids) == 10
+        em_rows = [
+            log
+            for log in instance_logs
+            if _metric_name_from_result_id(log.evaluation_result_id)
+            == 'exact_match'
+        ]
+        assert len(em_rows) == 10
 
+        # Pick a specific metric row instead of relying on JSONL order.
+        log = _by_sample_and_metric(instance_logs)[
+            ('id147', 'exact_match:test')
+        ]
         assert log.schema_version == '0.2.2'
         assert log.evaluation_id == 'test_mmlu_samples'
         assert log.model_id == 'openai/gpt2'
@@ -91,8 +171,11 @@ def test_mmlu_instance_level():
         assert log.token_usage.output_tokens > 0
         assert log.token_usage.total_tokens > 0
 
+        assert converted_eval.evaluation_results
+
 
 def test_hellaswag_instance_level():
+    _require_helm()
     adapter = HELMAdapter()
 
     with tempfile.TemporaryDirectory() as tmpdir:
@@ -103,14 +186,23 @@ def test_hellaswag_instance_level():
             'file_uuid': 'test_hellaswag',
         }
 
-        converted_eval, instance_logs = _load_instance_level_data(
+        _, instance_logs = _load_instance_level_data(
             adapter,
             'tests/data/helm/commonsense:dataset=hellaswag,method=multiple_choice_joint,model=eleutherai_pythia-1b-v0',
             metadata_args,
         )
 
-        assert len(instance_logs) == 10
-        log = instance_logs[0]
+        sample_ids = sorted({log.sample_id for log in instance_logs})
+        assert len(sample_ids) == 10
+
+        em_rows = [
+            log
+            for log in instance_logs
+            if _metric_name_from_result_id(log.evaluation_result_id)
+            == 'exact_match'
+        ]
+        assert len(em_rows) == 10
+        log = em_rows[0]
 
         assert log.schema_version == '0.2.2'
         assert log.model_id == 'eleutherai/pythia-1b-v0'
@@ -129,6 +221,7 @@ def test_hellaswag_instance_level():
 
 
 def test_narrativeqa_instance_level():
+    _require_helm()
     adapter = HELMAdapter()
 
     with tempfile.TemporaryDirectory() as tmpdir:
@@ -139,14 +232,23 @@ def test_narrativeqa_instance_level():
             'file_uuid': 'test_narrativeqa',
         }
 
-        converted_eval, instance_logs = _load_instance_level_data(
+        _, instance_logs = _load_instance_level_data(
             adapter,
             'tests/data/helm/narrative_qa:model=openai_gpt2',
             metadata_args,
         )
 
-        assert len(instance_logs) == 5
-        log = instance_logs[0]
+        sample_ids = sorted({log.sample_id for log in instance_logs})
+        assert len(sample_ids) == 5
+
+        em_rows = [
+            log
+            for log in instance_logs
+            if _metric_name_from_result_id(log.evaluation_result_id)
+            == 'exact_match'
+        ]
+        assert len(em_rows) == 5
+        log = em_rows[0]
 
         assert log.schema_version == '0.2.2'
         assert log.model_id == 'openai/gpt2'
@@ -166,3 +268,332 @@ def test_narrativeqa_instance_level():
 
         assert len(log.answer_attribution) == 1
         assert log.answer_attribution[0].extraction_method == 'exact_match'
+
+
+def test_per_sample_core_metric_rows_are_emitted():
+    """Sample id147 gets unique core-metric rows and no bookkeeping rows."""
+    _require_helm()
+    adapter = HELMAdapter()
+    with tempfile.TemporaryDirectory() as tmpdir:
+        metadata_args = {
+            'source_organization_name': 'TestOrg',
+            'evaluator_relationship': EvaluatorRelationship.first_party,
+            'parent_eval_output_dir': tmpdir,
+            'file_uuid': 'test_grain',
+        }
+        _, instance_logs = _load_instance_level_data(
+            adapter,
+            'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2',
+            metadata_args,
+        )
+        rows_for_id147 = [
+            log for log in instance_logs if log.sample_id == 'id147'
+        ]
+        # Check for None first: sorted() can raise TypeError on a None ID.
+        assert all(log.evaluation_result_id is not None for log in rows_for_id147)
+        metric_ids = sorted(log.evaluation_result_id for log in rows_for_id147)
+        assert 'exact_match:test' in metric_ids
+        assert 'quasi_exact_match:test' in metric_ids
+        assert 'num_prompt_tokens:test' not in metric_ids
+        assert 'inference_runtime:test' not in metric_ids
+        assert len(metric_ids) == len(set(metric_ids))
+
+
+def test_bookkeeping_stats_are_not_emitted_as_metric_rows():
+    """Bookkeeping stats (token counts, runtime, ...) must not become rows."""
+    _require_helm()
+    adapter = HELMAdapter()
+    with tempfile.TemporaryDirectory() as tmpdir:
+        metadata_args = {
+            'source_organization_name': 'TestOrg',
+            'evaluator_relationship': EvaluatorRelationship.first_party,
+            'parent_eval_output_dir': tmpdir,
+            'file_uuid': 'test_correctness',
+        }
+        _, instance_logs = _load_instance_level_data(
+            adapter,
+            'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2',
+            metadata_args,
+        )
+
+        # A token count or runtime can be > 0 without the answer being correct.
+        bookkeeping_names = {
+            'num_references',
+            'num_prompt_tokens',
+            'num_completion_tokens',
+            'num_output_tokens',
+            'inference_runtime',
+            'max_prob',
+            'finish_reason_unknown',
+            'logprob',
+            'num_bytes',
+            'batch_size',
+        }
+        emitted_metric_names = {
+            _metric_name_from_result_id(log.evaluation_result_id)
+            for log in instance_logs
+        }
+        assert emitted_metric_names, 'conversion emitted no metric rows'
+        assert emitted_metric_names.isdisjoint(bookkeeping_names)
+
+
+def test_graded_core_metrics_are_not_binary_correctness():
+    """Graded-metric rows in narrative_qa must report is_correct=False."""
+    _require_helm()
+    adapter = HELMAdapter()
+    with tempfile.TemporaryDirectory() as tmpdir:
+        metadata_args = {
+            'source_organization_name': 'TestOrg',
+            'evaluator_relationship': EvaluatorRelationship.first_party,
+            'parent_eval_output_dir': tmpdir,
+            'file_uuid': 'test_correctness_graded',
+        }
+        _, instance_logs = _load_instance_level_data(
+            adapter,
+            'tests/data/helm/narrative_qa:model=openai_gpt2',
+            metadata_args,
+        )
+        # A positive graded score is not a binary correctness claim.
+        graded_metric_names = {'rouge_l', 'f1_score', 'bleu_1', 'bleu_4'}
+        graded_rows = [
+            row
+            for row in instance_logs
+            if _metric_name_from_result_id(row.evaluation_result_id)
+            in graded_metric_names
+        ]
+        assert graded_rows, 'expected graded score rows in narrative_qa fixture'
+        for row in graded_rows:
+            assert row.evaluation.is_correct is False, (
+                'graded metrics (rouge_l/f1_score/bleu_*) must not be '
+                'treated as binary correctness'
+            )
+
+
+def test_is_correct_is_true_for_correct_exact_match_rows():
+    """exact_match rows with a positive score must carry is_correct=True."""
+    _require_helm()
+    adapter = HELMAdapter()
+    with tempfile.TemporaryDirectory() as tmpdir:
+        metadata_args = {
+            'source_organization_name': 'TestOrg',
+            'evaluator_relationship': EvaluatorRelationship.first_party,
+            'parent_eval_output_dir': tmpdir,
+            'file_uuid': 'test_exact_match_true',
+        }
+        _, instance_logs = _load_instance_level_data(
+            adapter,
+            'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2',
+            metadata_args,
+        )
+        # Narrow to exact_match rows first so the score comparison only
+        # ever touches rows of that metric.
+        exact_match_rows = [
+            row
+            for row in instance_logs
+            if _metric_name_from_result_id(row.evaluation_result_id)
+            == 'exact_match'
+        ]
+        scored_rows = [r for r in exact_match_rows if r.evaluation.score > 0]
+        assert scored_rows
+        assert all(r.evaluation.is_correct is True for r in scored_rows)
+
+
+def test_is_correct_for_metric_helper():
+    for binary_name in _BINARY_CORRECTNESS_METRIC_NAMES:
+        assert _is_correct_for_metric(binary_name, 1.0) is True
+        assert _is_correct_for_metric(binary_name, 0.0) is False
+    for other_name in (
+        'num_prompt_tokens',
+        'num_references',
+        'inference_runtime',
+        'rouge_l',
+        'f1_score',
+        'bleu_1',
+        'logprob',
+        None,
+    ):
+        for score in (1.0, 0.0):
+            assert _is_correct_for_metric(other_name, score) is False
+
+
+def test_score_from_stat_helper_edge_cases():
+    # mean wins; sum / count is the fallback; unusable stats give None, no crash.
+    assert _score_from_stat(SimpleNamespace(mean=0.25, sum=10, count=2)) == 0.25
+    assert _score_from_stat(SimpleNamespace(mean=None, sum=3, count=2)) == 1.5
+    assert _score_from_stat(SimpleNamespace(mean=None, sum=0, count=0)) is None
+    assert _score_from_stat(SimpleNamespace(mean=None, sum=None, count=1)) is None
+    assert _score_from_stat(SimpleNamespace(mean='bad', sum=1, count=1)) is None
+
+
+def test_evaluation_result_id_helper_disambiguates_split_and_perturbation():
+    """Split/perturbation suffixes keep same-named stat IDs from colliding."""
+    perturbation = SimpleNamespace(name='robustness')
+    cases = [
+        (('exact_match',), 'exact_match'),
+        (('exact_match', 'test'), 'exact_match:test'),
+        (
+            ('exact_match', 'test', perturbation),
+            'exact_match:test:robustness',
+        ),
+    ]
+    for args, expected in cases:
+        assert _evaluation_result_id(*args) == expected
+
+
+def test_total_rows_matches_core_per_instance_stats():
+    """Detail row count must equal the fixture's scored core-metric stats."""
+    _require_helm()
+    fixture = 'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2'
+    adapter = HELMAdapter()
+    with tempfile.TemporaryDirectory() as tmpdir:
+        metadata_args = {
+            'source_organization_name': 'TestOrg',
+            'evaluator_relationship': EvaluatorRelationship.first_party,
+            'parent_eval_output_dir': tmpdir,
+            'file_uuid': 'test_exact_total_rows',
+        }
+        converted_eval, instance_logs = _load_instance_level_data(
+            adapter, fixture, metadata_args
+        )
+        # The expectation comes from the fixture, so dupes or over-filtering fail.
+        expected_rows = _expected_core_instance_stat_rows(fixture)
+        details = converted_eval.detailed_evaluation_results
+        assert details.total_rows == expected_rows
+        assert len(instance_logs) == expected_rows
+        row_keys = {
+            (row.sample_id, row.evaluation_result_id) for row in instance_logs
+        }
+        assert len(row_keys) == len(instance_logs)
+
+
+def test_instance_evaluation_result_ids_join_to_aggregate_results():
+    """Every detail-row result ID must also exist on an aggregate result."""
+    _require_helm()
+    adapter = HELMAdapter()
+    with tempfile.TemporaryDirectory() as tmpdir:
+        metadata_args = {
+            'source_organization_name': 'TestOrg',
+            'evaluator_relationship': EvaluatorRelationship.first_party,
+            'parent_eval_output_dir': tmpdir,
+            'file_uuid': 'test_join_keys',
+        }
+        converted_eval, instance_logs = _load_instance_level_data(
+            adapter,
+            'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2',
+            metadata_args,
+        )
+
+        def present_ids(items):
+            """Collect the non-None evaluation_result_id values of items."""
+            ids = {item.evaluation_result_id for item in items}
+            ids.discard(None)
+            return ids
+
+        aggregate_ids = present_ids(converted_eval.evaluation_results)
+        detail_ids = present_ids(instance_logs)
+
+        # Join invariant: both sides must be non-empty, and every metric
+        # detail row must point at an aggregate evaluation result that
+        # actually exists.
+        assert aggregate_ids
+        assert detail_ids
+        assert detail_ids.issubset(aggregate_ids)
+
+
+def test_aggregate_evaluation_result_ids_are_unique_and_non_null():
+    """Aggregate result IDs must all be present and distinct."""
+    _require_helm()
+    adapter = HELMAdapter()
+    with tempfile.TemporaryDirectory() as tmpdir:
+        metadata_args = {
+            'source_organization_name': 'TestOrg',
+            'evaluator_relationship': EvaluatorRelationship.first_party,
+            'parent_eval_output_dir': tmpdir,
+            'file_uuid': 'test_aggregate_ids',
+        }
+        converted_eval, _ = _load_instance_level_data(
+            adapter,
+            'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2',
+            metadata_args,
+        )
+
+        # Aggregate IDs are the target side of the detail-row join, so a
+        # missing or repeated ID would make that join ambiguous.
+        aggregate_results = converted_eval.evaluation_results
+        result_ids = [r.evaluation_result_id for r in aggregate_results]
+        assert all(result_ids)
+        assert len(set(result_ids)) == len(result_ids)
+        # A core metric is kept; a bookkeeping stat is not.
+        assert 'exact_match:test' in result_ids
+        assert 'num_prompt_tokens:test' not in result_ids
+
+
+def test_missing_inst_stats_uses_legacy_exact_match_fallback():
+    """Without per-instance stats, one exact-match fallback row is written."""
+    _require_helm()
+    # Minimal stand-in for a HELM request state: a single completion whose
+    # text equals the only reference, which is tagged 'correct'.
+    correct_reference = SimpleNamespace(
+        output=SimpleNamespace(text='answer'),
+        tags=['correct'],
+    )
+    request_state = SimpleNamespace(
+        instance=SimpleNamespace(id='sample0', references=[correct_reference]),
+        result=SimpleNamespace(
+            completions=[SimpleNamespace(text='answer')],
+            request_time=0.1,
+        ),
+        request=SimpleNamespace(prompt='question'),
+        output_mapping={},
+    )
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        adapter = HELMInstanceLevelDataAdapter(
+            'fallback_samples',
+            'jsonl',
+            'sha256',
+            tmpdir,
+        )
+        path, count = adapter.convert_instance_level_logs(
+            'tiny',
+            'dev/model',
+            [request_state],
+            [],
+        )
+
+        assert count == 1
+        row = InstanceLevelEvaluationLog.model_validate(
+            json.loads(Path(path).read_text().strip())
+        )
+        assert row.evaluation_result_id is None
+        assert row.evaluation.score == 1.0
+        assert row.evaluation.is_correct is True
+
+
+def test_reasoning_traces_none_does_not_break_conversion(monkeypatch):
+    """Conversion must still produce rows when reasoning extraction is None."""
+    _require_helm()
+    from every_eval_ever.converters.helm import instance_level_adapter as mod
+
+    # Simulate an extractor that finds nothing; the converter is expected to
+    # normalize None rather than hand an invalid value to the schema.
+    monkeypatch.setattr(mod, 'extract_all_reasonings', lambda state: None)
+
+    adapter = HELMAdapter()
+    with tempfile.TemporaryDirectory() as tmpdir:
+        metadata_args = {
+            'source_organization_name': 'TestOrg',
+            'evaluator_relationship': EvaluatorRelationship.first_party,
+            'parent_eval_output_dir': tmpdir,
+            'file_uuid': 'test_reasoning_none',
+        }
+        _, instance_logs = _load_instance_level_data(
+            adapter,
+            'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2',
+            metadata_args,
+        )
+        assert instance_logs
+        traces = [row.output.reasoning_trace for row in instance_logs]
+        for trace in traces:
+            # An empty list passes too: all() over no items is True.
+            assert trace is None or all(isinstance(t, str) for t in trace)