1 change: 0 additions & 1 deletion scripts/heim_human_eval.py
@@ -171,7 +171,6 @@ def write_out_examples_to_jsonl(final_examples: List[Dict[str, Any]], examples_p

for question_type, question_info in QUESTION_TYPE_TO_INFOS.items():
with htrack_block(f"Processing question type {question_type}"):

# Keep track of the examples for this question type. Use the image url as the key
examples: Dict[str, Dict[str, Any]] = {}

401 changes: 401 additions & 0 deletions src/helm/benchmark/annotation/health_bench_annotator.py

Large diffs are not rendered by default.

60 changes: 43 additions & 17 deletions src/helm/benchmark/annotation/model_as_judge.py
@@ -179,27 +179,53 @@ def _interpolate_prompt(

return Template(tmpl_text).substitute(replacements)

def _validate_annotation(self, annotator_criteria: Dict[str, Any], annotator_name: str) -> bool:
def _validate_annotation(
self,
annotator_criteria: Dict[str, Any],
annotator_name: str,
) -> bool:
"""
Validate the annotation meets expected criteria.

:param annotator_criteria: Annotation dictionary to validate
:param annotator_name: Name of the annotator model
:return: Whether the annotation is valid
Recursively validate the annotation meets expected criteria.
"""
for key, value in self._annotation_criteria.items():
if key not in annotator_criteria:
hwarn(f"Annotator did not find the expected key " f"'{key}' in the response from {annotator_name}.")
return False

for subkey in value:
if subkey not in annotator_criteria[key]:
hwarn(
f"Annotator did not find the expected subkey "
f"'{subkey}' in the response from {annotator_name}."
)
def validate(schema: Dict[str, Any], data: Dict[str, Any], path: str = "") -> bool:
for key, expected in schema.items():
current_path = f"{path}.{key}" if path else key

if key not in data:
hwarn(f"Missing key '{current_path}' in response from {annotator_name}.")
return False
return True

value = data[key]

# Case 1: nested dict → recurse
if isinstance(expected, dict):
if not isinstance(value, dict):
hwarn(f"Expected dict at '{current_path}' but got {type(value)} " f"from {annotator_name}.")
return False
if not validate(expected, value, current_path):
return False

# Case 2: list of required subkeys
elif isinstance(expected, list):
for subkey in expected:
if subkey not in value:
hwarn(f"Missing subkey '{current_path}.{subkey}' " f"in response from {annotator_name}.")
return False

# Case 3: type checking (optional but useful)
elif isinstance(expected, type):
if not isinstance(value, expected):
hwarn(
f"Invalid type at '{current_path}'. "
f"Expected {expected}, got {type(value)} "
f"from {annotator_name}."
)
return False

return True

return validate(self._annotation_criteria, annotator_criteria)

def annotate(self, request_state: RequestState) -> Dict[str, Any]:
"""
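The rewritten `_validate_annotation` replaces the old flat key/subkey check with a recursive walk over the criteria schema, so nested dicts, lists of required subkeys, and expected types are all validated. Below is a minimal standalone sketch of the same recursion that can run outside HELM; `hwarn` is swapped for `print`, and the criteria keys and response values are invented for illustration rather than taken from this PR.

```python
# Minimal, self-contained sketch of the recursive validation added above.
# Assumptions: hwarn is replaced by print; the schema and response are invented
# placeholders, not HealthBench's actual annotation criteria.
from typing import Any, Dict


def validate(schema: Dict[str, Any], data: Dict[str, Any], path: str = "") -> bool:
    for key, expected in schema.items():
        current_path = f"{path}.{key}" if path else key
        if key not in data:
            print(f"Missing key '{current_path}'")
            return False
        value = data[key]
        if isinstance(expected, dict):  # Case 1: nested dict -> recurse
            if not isinstance(value, dict):
                print(f"Expected dict at '{current_path}', got {type(value)}")
                return False
            if not validate(expected, value, current_path):
                return False
        elif isinstance(expected, list):  # Case 2: list of required subkeys
            if any(subkey not in value for subkey in expected):
                print(f"Missing subkey under '{current_path}'")
                return False
        elif isinstance(expected, type):  # Case 3: type check
            if not isinstance(value, expected):
                print(f"Invalid type at '{current_path}'")
                return False
    return True


# Hypothetical criteria: a nested dict, a list of required subkeys, and a type.
criteria = {
    "prompt_injection": {"score": float, "explanation": str},
    "rubric": ["criterion", "met"],
    "model": str,
}
response = {
    "prompt_injection": {"score": 1.0, "explanation": "ok"},
    "rubric": {"criterion": "accuracy", "met": True},
    "model": "example-judge",
}
print(validate(criteria, response))  # True; drop any key above to see it fail
```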
1 change: 0 additions & 1 deletion src/helm/benchmark/annotation_executor.py
@@ -24,7 +24,6 @@ class AnnotationExecutorError(Exception):

@dataclass(frozen=True)
class AnnotationExecutionSpec:

local_path: str
"""Path where API credentials and cache is stored.

1 change: 0 additions & 1 deletion src/helm/benchmark/executor.py
@@ -25,7 +25,6 @@ class ExecutorError(Exception):

@dataclass(frozen=True)
class ExecutionSpec:

url: Optional[str]
"""If non-empty, URL of the proxy server we send requests to (e.g., http://localhost:1959)."""

1 change: 0 additions & 1 deletion src/helm/benchmark/metrics/evaluate_instances_metric.py
@@ -30,7 +30,6 @@ def evaluate(
global_stats: Dict[MetricName, Stat] = {}

for train_trial_index in range(adapter_spec.num_train_trials):

# Aggregate these stats
trial_stats: Dict[MetricName, Stat] = {} # Statistics just for this trial

@@ -24,7 +24,6 @@ def evaluate(
eval_cache_path: str,
parallelism: int,
) -> MetricResult:

instance_to_min_request_times: Dict[Instance, float] = defaultdict(lambda: math.inf)
for request_state in tqdm(scenario_state.request_states):
assert request_state.result is not None
@@ -14,7 +14,6 @@


class FractalDimensionMetric(Metric):

# From https://www.nature.com/articles/35065154, "participants in the perception study consistently
# preferred fractals with D values in the range of 1.3 to 1.5, irrespective of the pattern's origin.
# Significantly, many of the fractal patterns surrounding us in nature have D values in this range.
@@ -91,7 +91,6 @@ def skin_pixel_from_image(image_path: str) -> List:
and (Cr <= ((-1.15 * Cb) + 301.75))
and (Cr <= ((-2.2857 * Cb) + 432.85))
):

blue.append(img_rgba[i, j].item(2))
green.append(img_rgba[i, j].item(1))
red.append(img_rgba[i, j].item(0))
4 changes: 2 additions & 2 deletions src/helm/benchmark/metrics/llm_jury_metrics.py
@@ -33,12 +33,12 @@ def evaluate_generation(
) -> List[Stat]:
assert request_state.annotations
annotations: Dict[str, Any] = request_state.annotations[self.scenario_name]
scores: List[int] = []
scores: List[float] = []
score = self.default_score
for annotation_key, annotation_dict in annotations.items():
if annotation_key in self.annotator_models.keys() and annotation_dict is not None:
for val in annotation_dict.values():
scores.append(int(val["score"]))
scores.append(float(val["score"]))
if scores:
score = sum(scores) / len(scores)
return [
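The change from `List[int]` to `List[float]` keeps fractional rubric scores intact when the jury mean is computed, where the previous `int(...)` cast would have floored them first. A toy sketch of the aggregation loop above, with an invented `annotations` payload and placeholder judge names:

```python
# Toy version of the aggregation loop in LLMJuryMetric.evaluate_generation.
# The judge names and per-criterion payloads below are invented for illustration.
from typing import Any, Dict, List

annotator_models = {"judge-a", "judge-b"}
annotations: Dict[str, Any] = {
    "judge-a": {"criterion_1": {"score": 0.5}, "criterion_2": {"score": 1.0}},
    "judge-b": {"criterion_1": {"score": 0.25}},
    "other-model": None,  # annotators outside the jury, or failed calls, are skipped
}

scores: List[float] = []
for annotation_key, annotation_dict in annotations.items():
    if annotation_key in annotator_models and annotation_dict is not None:
        for val in annotation_dict.values():
            scores.append(float(val["score"]))  # int(...) would floor 0.5 and 0.25 to 0

score = sum(scores) / len(scores) if scores else 0.0
print(score)  # ~0.583 with float(); the old int() cast would give ~0.333
```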
1 change: 0 additions & 1 deletion src/helm/benchmark/metrics/lmkt_metrics.py
@@ -30,7 +30,6 @@ def evaluate_generation(
metric_service: MetricService,
eval_cache_path: str,
) -> List[Stat]:

assert request_state.result

completions = [c.text for c in request_state.result.completions]
1 change: 0 additions & 1 deletion src/helm/benchmark/metrics/seahelm_metrics.py
@@ -180,7 +180,6 @@ def evaluate_generation(
metric_service: MetricService,
eval_cache_path: str,
) -> List[Stat]:

stats: List[Stat] = []
if len(request_state.instance.references) > 0:
golds = [reference for reference in request_state.instance.references if reference.is_correct]
@@ -19,7 +19,6 @@ def evaluate_instances(self, request_states: List[RequestState], eval_cache_path
y_pred_quasi: List[str] = []
y_true: List[str] = []
for request_state in request_states: # one request state per instance

for reference in request_state.instance.references:
if reference.tags == [CORRECT_TAG]:
true_label = reference.output.text
1 change: 0 additions & 1 deletion src/helm/benchmark/run.py
@@ -280,7 +280,6 @@ def validate_args(args):

@htrack(None)
def helm_run(args):

validate_args(args)
register_builtin_configs_from_helm_package()
register_configs_from_directory(args.local_path)
3 changes: 0 additions & 3 deletions src/helm/benchmark/run_specs/capabilities_run_specs.py
@@ -190,7 +190,6 @@ def get_gpqa_spec(subset: str, use_chain_of_thought: str = "true", use_few_shot:

@run_spec_function("ifeval")
def get_ifeval_spec() -> RunSpec:

scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.ifeval_scenario.IFEvalScenario")

adapter_spec = AdapterSpec(
@@ -246,7 +245,6 @@ def get_wildbench_spec(subset: str) -> RunSpec:
# TODO: Remove BigCodeBench from capabilities_run_specs.py because it is no longer part of HELM Capabilities
@run_spec_function("bigcodebench")
def get_bigcodebench_spec(version: str) -> RunSpec:

scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.bigcodebench_scenario.BigCodeBenchScenario", args={"version": version}
)
@@ -281,7 +279,6 @@ def get_bigcodebench_spec(version: str) -> RunSpec:

@run_spec_function("omni_math")
def get_omni_math_spec() -> RunSpec:

scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.omni_math_scenario.OmniMATHScenario")

adapter_spec = AdapterSpec(
1 change: 0 additions & 1 deletion src/helm/benchmark/run_specs/long_context_run_specs.py
@@ -145,7 +145,6 @@ def get_infinite_bench_en_mc_spec(max_num_words: int = 131072) -> RunSpec:

@run_spec_function("infinite_bench_en_sum")
def get_infinite_bench_en_sum_spec(max_num_words: int = 131072) -> RunSpec:

scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.infinite_bench_en_sum_scenario.InfiniteBenchEnSumScenario",
args={
120 changes: 118 additions & 2 deletions src/helm/benchmark/run_specs/medhelm_run_specs.py
@@ -12,6 +12,8 @@

from helm.benchmark.adaptation.adapter_spec import (
ADAPT_MULTIPLE_CHOICE_JOINT,
ADAPT_CHAT,
AdapterSpec,
)
from helm.benchmark.adaptation.common_adapter_specs import (
get_generation_adapter_spec,
@@ -132,7 +134,6 @@ def get_sct_bench_spec(reason: bool = False, few_shot: bool = False) -> RunSpec:

@run_spec_function("medcalc_bench")
def get_medcalc_bench_spec(version: Optional[str] = None) -> RunSpec:

scenario_args = {} if version is None else {"version": version}

scenario_spec = ScenarioSpec(
@@ -1403,7 +1404,8 @@ def get_medhallu_spec() -> RunSpec:
Type: Methodological and Evidence Fabrication - Inventing false research methods, statistical data, or specific clinical outcomes

Do not return anything else, just the answer.
Return just an integer value, '0' if the answer is factual and '1' if the answer is hallucinated. No letter or word, just the integer value.""", # noqa: E501
Return just an integer value, '0' if the answer is factual and '1' if the answer is hallucinated. No letter or word, just the integer value.""",
# noqa: E501
input_noun=None,
output_noun=(
"""Return just an integer value, '0' if the answer is factual and '1' if the answer is hallucinated.
@@ -1616,3 +1618,117 @@ def get_shc_proxy_spec(data_path: str) -> RunSpec:
metric_specs=get_exact_match_metric_specs(),
groups=["shc_proxy_med"],
)


@run_spec_function("health_bench")
def get_health_bench_run_spec(jury_config_path: Optional[str] = None) -> RunSpec:
scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.health_bench_scenario.HealthBenchScenario",
args={},
)

adapter_spec = AdapterSpec(
method=ADAPT_CHAT,
global_prefix="",
global_suffix="",
instructions="You are a helpful assistant.",
input_prefix="",
input_suffix="",
output_prefix="",
output_suffix="",
instance_prefix="",
max_train_instances=0,
num_outputs=1,
max_tokens=512,
temperature=0.0,
stop_sequences=[],
)

annotator_models = get_annotator_models_from_config(jury_config_path)

annotator_specs = [
AnnotatorSpec(
class_name="helm.benchmark.annotation.health_bench_annotator.HealthBenchAnnotator",
args={
"annotator_models": annotator_models,
},
)
]

metric_specs = [
MetricSpec(
class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
args={
"metric_name": "health_bench_score",
"scenario_name": "health_bench",
"annotator_models": annotator_models,
"default_score": 0.0,
},
)
]

return RunSpec(
name="health_bench",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
annotators=annotator_specs,
metric_specs=metric_specs,
groups=["health_bench"],
)


@run_spec_function("health_bench_professional")
def get_health_bench_professional_run_spec(jury_config_path: Optional[str] = None) -> RunSpec:
scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.health_bench_scenario.HealthBenchProfessionalScenario",
args={},
)

adapter_spec = AdapterSpec(
method=ADAPT_CHAT,
global_prefix="",
global_suffix="",
instructions="You are a helpful assistant.",
input_prefix="",
input_suffix="",
output_prefix="",
output_suffix="",
instance_prefix="",
max_train_instances=0,
num_outputs=1,
max_tokens=512,
temperature=0.0,
stop_sequences=[],
)

annotator_models = get_annotator_models_from_config(jury_config_path)

annotator_specs = [
AnnotatorSpec(
class_name="helm.benchmark.annotation.health_bench_annotator.HealthBenchProfessionalAnnotator",
args={
"annotator_models": annotator_models,
},
)
]

metric_specs = [
MetricSpec(
class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
args={
"metric_name": "health_bench_professional_score",
"scenario_name": "health_bench_professional",
"annotator_models": annotator_models,
"default_score": 0.0,
},
)
]

return RunSpec(
name="health_bench_professional",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
annotators=annotator_specs,
metric_specs=metric_specs,
groups=["health_bench_professional"],
)
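Both new run spec functions wire the same zero-shot chat `AdapterSpec` (temperature 0.0, 512 max tokens) to a HealthBench scenario, a HealthBench annotator driven by the configured jury models, and an `LLMJuryMetric`. A hedged sketch of inspecting the result in a Python session, assuming a HELM checkout that includes this PR; calling the decorated function directly is only for inspection, not how helm-run normally builds run specs:

```python
# Sketch: build the RunSpec directly and confirm what the function above wires together.
# Assumes this PR's code is importable; jury_config_path=None falls back to the
# default annotator models resolved by get_annotator_models_from_config.
from helm.benchmark.run_specs.medhelm_run_specs import get_health_bench_run_spec

run_spec = get_health_bench_run_spec()
print(run_spec.name)                      # "health_bench"
print(run_spec.scenario_spec.class_name)  # ...health_bench_scenario.HealthBenchScenario
print(run_spec.adapter_spec.method)       # ADAPT_CHAT
print([a.class_name for a in run_spec.annotators])              # HealthBenchAnnotator
print([m.args["metric_name"] for m in run_spec.metric_specs])   # ["health_bench_score"]
```

Passing a `jury_config_path` argument would instead resolve the jury models from that config file before they are handed to both the annotator and the metric.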
@@ -46,7 +46,6 @@ def get_instances(self, output_path: str) -> List[Instance]:
split: str = TEST_SPLIT

for idx, row in enumerate(tqdm(dataset["train"])):

label = row["disorder_class"]
transcription = row["transcription"]

@@ -46,7 +46,6 @@ def get_instances(self, output_path: str) -> List[Instance]:

# Find all pairs of audio and JSON files
for idx, row in enumerate(tqdm(dataset["train"])):

# Load the annotation
# Load the annotation
label = row["disorder_class"]
@@ -48,7 +48,6 @@ def get_instances(self, output_path: str) -> List[Instance]:
split: str = TEST_SPLIT

for idx, row in enumerate(tqdm(dataset["train"])):

# Load the annotation
label = row["disorder_class"]
transcription = row["transcription"]