1 change: 0 additions & 1 deletion scripts/heim_human_eval.py
@@ -171,7 +171,6 @@ def write_out_examples_to_jsonl(final_examples: List[Dict[str, Any]], examples_p

for question_type, question_info in QUESTION_TYPE_TO_INFOS.items():
with htrack_block(f"Processing question type {question_type}"):

# Keep track of the examples for this question type. Use the image url as the key
examples: Dict[str, Dict[str, Any]] = {}

401 changes: 401 additions & 0 deletions src/helm/benchmark/annotation/health_bench_annotator.py

Large diffs are not rendered by default.

60 changes: 43 additions & 17 deletions src/helm/benchmark/annotation/model_as_judge.py
@@ -179,27 +179,53 @@ def _interpolate_prompt(

return Template(tmpl_text).substitute(replacements)

def _validate_annotation(self, annotator_criteria: Dict[str, Any], annotator_name: str) -> bool:
def _validate_annotation(
self,
annotator_criteria: Dict[str, Any],
annotator_name: str,
) -> bool:
"""
Validate the annotation meets expected criteria.

:param annotator_criteria: Annotation dictionary to validate
:param annotator_name: Name of the annotator model
:return: Whether the annotation is valid
Recursively validate the annotation meets expected criteria.
"""
for key, value in self._annotation_criteria.items():
if key not in annotator_criteria:
hwarn(f"Annotator did not find the expected key " f"'{key}' in the response from {annotator_name}.")
return False

for subkey in value:
if subkey not in annotator_criteria[key]:
hwarn(
f"Annotator did not find the expected subkey "
f"'{subkey}' in the response from {annotator_name}."
)
def validate(schema: Dict[str, Any], data: Dict[str, Any], path: str = "") -> bool:
for key, expected in schema.items():
current_path = f"{path}.{key}" if path else key

if key not in data:
hwarn(f"Missing key '{current_path}' in response from {annotator_name}.")
return False
return True

value = data[key]

# Case 1: nested dict → recurse
if isinstance(expected, dict):
if not isinstance(value, dict):
hwarn(f"Expected dict at '{current_path}' but got {type(value)} " f"from {annotator_name}.")
return False
if not validate(expected, value, current_path):
return False

# Case 2: list of required subkeys
elif isinstance(expected, list):
for subkey in expected:
if subkey not in value:
hwarn(f"Missing subkey '{current_path}.{subkey}' " f"in response from {annotator_name}.")
return False

# Case 3: type checking (optional but useful)
elif isinstance(expected, type):
if not isinstance(value, expected):
hwarn(
f"Invalid type at '{current_path}'. "
f"Expected {expected}, got {type(value)} "
f"from {annotator_name}."
)
return False

return True

return validate(self._annotation_criteria, annotator_criteria)

def annotate(self, request_state: RequestState) -> Dict[str, Any]:
"""
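The rewritten `_validate_annotation` replaces the old flat key/subkey check with a recursive walk over the criteria schema, so nested dicts, lists of required subkeys, and expected types are all validated. Below is a minimal standalone sketch of the same recursion that can run outside HELM; `hwarn` is swapped for `print`, and the criteria keys and response values are invented for illustration rather than taken from this PR.

```python
# Minimal, self-contained sketch of the recursive validation added above.
# Assumptions: hwarn is replaced by print; the schema and response are invented
# placeholders, not HealthBench's actual annotation criteria.
from typing import Any, Dict


def validate(schema: Dict[str, Any], data: Dict[str, Any], path: str = "") -> bool:
    for key, expected in schema.items():
        current_path = f"{path}.{key}" if path else key
        if key not in data:
            print(f"Missing key '{current_path}'")
            return False
        value = data[key]
        if isinstance(expected, dict):  # Case 1: nested dict -> recurse
            if not isinstance(value, dict):
                print(f"Expected dict at '{current_path}', got {type(value)}")
                return False
            if not validate(expected, value, current_path):
                return False
        elif isinstance(expected, list):  # Case 2: list of required subkeys
            if any(subkey not in value for subkey in expected):
                print(f"Missing subkey under '{current_path}'")
                return False
        elif isinstance(expected, type):  # Case 3: type check
            if not isinstance(value, expected):
                print(f"Invalid type at '{current_path}'")
                return False
    return True


# Hypothetical criteria: a nested dict, a list of required subkeys, and a type.
criteria = {
    "prompt_injection": {"score": float, "explanation": str},
    "rubric": ["criterion", "met"],
    "model": str,
}
response = {
    "prompt_injection": {"score": 1.0, "explanation": "ok"},
    "rubric": {"criterion": "accuracy", "met": True},
    "model": "example-judge",
}
print(validate(criteria, response))  # True; drop any key above to see it fail
```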
1 change: 0 additions & 1 deletion src/helm/benchmark/annotation_executor.py
@@ -24,7 +24,6 @@ class AnnotationExecutorError(Exception):

@dataclass(frozen=True)
class AnnotationExecutionSpec:

local_path: str
"""Path where API credentials and cache is stored.

1 change: 0 additions & 1 deletion src/helm/benchmark/executor.py
@@ -25,7 +25,6 @@ class ExecutorError(Exception):

@dataclass(frozen=True)
class ExecutionSpec:

url: Optional[str]
"""If non-empty, URL of the proxy server we send requests to (e.g., http://localhost:1959)."""

1 change: 0 additions & 1 deletion src/helm/benchmark/metrics/evaluate_instances_metric.py
@@ -30,7 +30,6 @@ def evaluate(
global_stats: Dict[MetricName, Stat] = {}

for train_trial_index in range(adapter_spec.num_train_trials):

# Aggregate these stats
trial_stats: Dict[MetricName, Stat] = {} # Statistics just for this trial

@@ -24,7 +24,6 @@ def evaluate(
eval_cache_path: str,
parallelism: int,
) -> MetricResult:

instance_to_min_request_times: Dict[Instance, float] = defaultdict(lambda: math.inf)
for request_state in tqdm(scenario_state.request_states):
assert request_state.result is not None
@@ -14,7 +14,6 @@


class FractalDimensionMetric(Metric):

# From https://www.nature.com/articles/35065154, "participants in the perception study consistently
# preferred fractals with D values in the range of 1.3 to 1.5, irrespective of the pattern's origin.
# Significantly, many of the fractal patterns surrounding us in nature have D values in this range.
@@ -91,7 +91,6 @@ def skin_pixel_from_image(image_path: str) -> List:
and (Cr <= ((-1.15 * Cb) + 301.75))
and (Cr <= ((-2.2857 * Cb) + 432.85))
):

blue.append(img_rgba[i, j].item(2))
green.append(img_rgba[i, j].item(1))
red.append(img_rgba[i, j].item(0))
4 changes: 2 additions & 2 deletions src/helm/benchmark/metrics/llm_jury_metrics.py
@@ -33,12 +33,12 @@ def evaluate_generation(
) -> List[Stat]:
assert request_state.annotations
annotations: Dict[str, Any] = request_state.annotations[self.scenario_name]
scores: List[int] = []
scores: List[float] = []
score = self.default_score
for annotation_key, annotation_dict in annotations.items():
if annotation_key in self.annotator_models.keys() and annotation_dict is not None:
for val in annotation_dict.values():
scores.append(int(val["score"]))
scores.append(float(val["score"]))
if scores:
score = sum(scores) / len(scores)
return [
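The change from `List[int]` to `List[float]` keeps fractional rubric scores intact when the jury mean is computed, where the previous `int(...)` cast would have floored them first. A toy sketch of the aggregation loop above, with an invented `annotations` payload and placeholder judge names:

```python
# Toy version of the aggregation loop in LLMJuryMetric.evaluate_generation.
# The judge names and per-criterion payloads below are invented for illustration.
from typing import Any, Dict, List

annotator_models = {"judge-a", "judge-b"}
annotations: Dict[str, Any] = {
    "judge-a": {"criterion_1": {"score": 0.5}, "criterion_2": {"score": 1.0}},
    "judge-b": {"criterion_1": {"score": 0.25}},
    "other-model": None,  # annotators outside the jury, or failed calls, are skipped
}

scores: List[float] = []
for annotation_key, annotation_dict in annotations.items():
    if annotation_key in annotator_models and annotation_dict is not None:
        for val in annotation_dict.values():
            scores.append(float(val["score"]))  # int(...) would floor 0.5 and 0.25 to 0

score = sum(scores) / len(scores) if scores else 0.0
print(score)  # ~0.583 with float(); the old int() cast would give ~0.333
```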
1 change: 0 additions & 1 deletion src/helm/benchmark/metrics/lmkt_metrics.py
@@ -30,7 +30,6 @@ def evaluate_generation(
metric_service: MetricService,
eval_cache_path: str,
) -> List[Stat]:

assert request_state.result

completions = [c.text for c in request_state.result.completions]
1 change: 0 additions & 1 deletion src/helm/benchmark/metrics/seahelm_metrics.py
@@ -180,7 +180,6 @@ def evaluate_generation(
metric_service: MetricService,
eval_cache_path: str,
) -> List[Stat]:

stats: List[Stat] = []
if len(request_state.instance.references) > 0:
golds = [reference for reference in request_state.instance.references if reference.is_correct]
@@ -19,7 +19,6 @@ def evaluate_instances(self, request_states: List[RequestState], eval_cache_path
y_pred_quasi: List[str] = []
y_true: List[str] = []
for request_state in request_states: # one request state per instance

for reference in request_state.instance.references:
if reference.tags == [CORRECT_TAG]:
true_label = reference.output.text
1 change: 0 additions & 1 deletion src/helm/benchmark/run.py
@@ -280,7 +280,6 @@ def validate_args(args):

@htrack(None)
def helm_run(args):

validate_args(args)
register_builtin_configs_from_helm_package()
register_configs_from_directory(args.local_path)
3 changes: 0 additions & 3 deletions src/helm/benchmark/run_specs/capabilities_run_specs.py
@@ -190,7 +190,6 @@ def get_gpqa_spec(subset: str, use_chain_of_thought: str = "true", use_few_shot:

@run_spec_function("ifeval")
def get_ifeval_spec() -> RunSpec:

scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.ifeval_scenario.IFEvalScenario")

adapter_spec = AdapterSpec(
@@ -246,7 +245,6 @@ def get_wildbench_spec(subset: str) -> RunSpec:
# TODO: Remove BigCodeBench from capabilities_run_specs.py because it is no longer part of HELM Capabilities
@run_spec_function("bigcodebench")
def get_bigcodebench_spec(version: str) -> RunSpec:

scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.bigcodebench_scenario.BigCodeBenchScenario", args={"version": version}
)
@@ -281,7 +279,6 @@ def get_bigcodebench_spec(version: str) -> RunSpec:

@run_spec_function("omni_math")
def get_omni_math_spec() -> RunSpec:

scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.omni_math_scenario.OmniMATHScenario")

adapter_spec = AdapterSpec(
1 change: 0 additions & 1 deletion src/helm/benchmark/run_specs/long_context_run_specs.py
@@ -145,7 +145,6 @@ def get_infinite_bench_en_mc_spec(max_num_words: int = 131072) -> RunSpec:

@run_spec_function("infinite_bench_en_sum")
def get_infinite_bench_en_sum_spec(max_num_words: int = 131072) -> RunSpec:

scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.infinite_bench_en_sum_scenario.InfiniteBenchEnSumScenario",
args={
120 changes: 118 additions & 2 deletions src/helm/benchmark/run_specs/medhelm_run_specs.py
@@ -12,6 +12,8 @@

from helm.benchmark.adaptation.adapter_spec import (
ADAPT_MULTIPLE_CHOICE_JOINT,
ADAPT_CHAT,
AdapterSpec,
)
from helm.benchmark.adaptation.common_adapter_specs import (
get_generation_adapter_spec,
@@ -132,7 +134,6 @@ def get_sct_bench_spec(reason: bool = False, few_shot: bool = False) -> RunSpec:

@run_spec_function("medcalc_bench")
def get_medcalc_bench_spec(version: Optional[str] = None) -> RunSpec:

scenario_args = {} if version is None else {"version": version}

scenario_spec = ScenarioSpec(
@@ -1403,7 +1404,8 @@ def get_medhallu_spec() -> RunSpec:
Type: Methodological and Evidence Fabrication - Inventing false research methods, statistical data, or specific clinical outcomes

Do not return anything else, just the answer.
Return just an integer value, '0' if the answer is factual and '1' if the answer is hallucinated. No letter or word, just the integer value.""", # noqa: E501
Return just an integer value, '0' if the answer is factual and '1' if the answer is hallucinated. No letter or word, just the integer value.""",
# noqa: E501
input_noun=None,
output_noun=(
"""Return just an integer value, '0' if the answer is factual and '1' if the answer is hallucinated.
@@ -1616,3 +1618,117 @@ def get_shc_proxy_spec(data_path: str) -> RunSpec:
metric_specs=get_exact_match_metric_specs(),
groups=["shc_proxy_med"],
)


@run_spec_function("health_bench")
def get_health_bench_run_spec(jury_config_path: Optional[str] = None) -> RunSpec:
scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.health_bench_scenario.HealthBenchScenario",
args={},
)

adapter_spec = AdapterSpec(
method=ADAPT_CHAT,
global_prefix="",
global_suffix="",
instructions="You are a helpful assistant.",
input_prefix="",
input_suffix="",
output_prefix="",
output_suffix="",
instance_prefix="",
max_train_instances=0,
num_outputs=1,
max_tokens=512,
temperature=0.0,
stop_sequences=[],
)

annotator_models = get_annotator_models_from_config(jury_config_path)

annotator_specs = [
AnnotatorSpec(
class_name="helm.benchmark.annotation.health_bench_annotator.HealthBenchAnnotator",
args={
"annotator_models": annotator_models,
},
)
]

metric_specs = [
MetricSpec(
class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
args={
"metric_name": "health_bench_score",
"scenario_name": "health_bench",
"annotator_models": annotator_models,
"default_score": 0.0,
},
)
]

return RunSpec(
name="health_bench",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
annotators=annotator_specs,
metric_specs=metric_specs,
groups=["health_bench"],
)


@run_spec_function("health_bench_professional")
def get_health_bench_professional_run_spec(jury_config_path: Optional[str] = None) -> RunSpec:
scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.health_bench_scenario.HealthBenchProfessionalScenario",
args={},
)

adapter_spec = AdapterSpec(
method=ADAPT_CHAT,
global_prefix="",
global_suffix="",
instructions="You are a helpful assistant.",
input_prefix="",
input_suffix="",
output_prefix="",
output_suffix="",
instance_prefix="",
max_train_instances=0,
num_outputs=1,
max_tokens=512,
temperature=0.0,
stop_sequences=[],
)

annotator_models = get_annotator_models_from_config(jury_config_path)

annotator_specs = [
AnnotatorSpec(
class_name="helm.benchmark.annotation.health_bench_annotator.HealthBenchProfessionalAnnotator",
args={
"annotator_models": annotator_models,
},
)
]

metric_specs = [
MetricSpec(
class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric",
args={
"metric_name": "health_bench_professional_score",
"scenario_name": "health_bench_professional",
"annotator_models": annotator_models,
"default_score": 0.0,
},
)
]

return RunSpec(
name="health_bench_professional",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
annotators=annotator_specs,
metric_specs=metric_specs,
groups=["health_bench_professional"],
)
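Both new run spec functions wire the same zero-shot chat `AdapterSpec` (temperature 0.0, 512 max tokens) to a HealthBench scenario, a HealthBench annotator driven by the configured jury models, and an `LLMJuryMetric`. A hedged sketch of inspecting the result in a Python session, assuming a HELM checkout that includes this PR; calling the decorated function directly is only for inspection, not how helm-run normally builds run specs:

```python
# Sketch: build the RunSpec directly and confirm what the function above wires together.
# Assumes this PR's code is importable; jury_config_path=None falls back to the
# default annotator models resolved by get_annotator_models_from_config.
from helm.benchmark.run_specs.medhelm_run_specs import get_health_bench_run_spec

run_spec = get_health_bench_run_spec()
print(run_spec.name)                      # "health_bench"
print(run_spec.scenario_spec.class_name)  # ...health_bench_scenario.HealthBenchScenario
print(run_spec.adapter_spec.method)       # ADAPT_CHAT
print([a.class_name for a in run_spec.annotators])              # HealthBenchAnnotator
print([m.args["metric_name"] for m in run_spec.metric_specs])   # ["health_bench_score"]
```

Passing a `jury_config_path` argument would instead resolve the jury models from that config file before they are handed to both the annotator and the metric.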
@@ -46,7 +46,6 @@ def get_instances(self, output_path: str) -> List[Instance]:
split: str = TEST_SPLIT

for idx, row in enumerate(tqdm(dataset["train"])):

label = row["disorder_class"]
transcription = row["transcription"]

@@ -46,7 +46,6 @@ def get_instances(self, output_path: str) -> List[Instance]:

# Find all pairs of audio and JSON files
for idx, row in enumerate(tqdm(dataset["train"])):

# Load the annotation
# Load the annotation
label = row["disorder_class"]
@@ -48,7 +48,6 @@ def get_instances(self, output_path: str) -> List[Instance]:
split: str = TEST_SPLIT

for idx, row in enumerate(tqdm(dataset["train"])):

# Load the annotation
label = row["disorder_class"]
transcription = row["transcription"]