diff --git a/scripts/heim_human_eval.py b/scripts/heim_human_eval.py index 7dbf5a4235a..c34e252b4a5 100644 --- a/scripts/heim_human_eval.py +++ b/scripts/heim_human_eval.py @@ -171,7 +171,6 @@ def write_out_examples_to_jsonl(final_examples: List[Dict[str, Any]], examples_p for question_type, question_info in QUESTION_TYPE_TO_INFOS.items(): with htrack_block(f"Processing question type {question_type}"): - # Keep track of the examples for this question type. Use the image url as the key examples: Dict[str, Dict[str, Any]] = {} diff --git a/src/helm/benchmark/annotation/health_bench_annotator.py b/src/helm/benchmark/annotation/health_bench_annotator.py new file mode 100644 index 00000000000..850c25e1996 --- /dev/null +++ b/src/helm/benchmark/annotation/health_bench_annotator.py @@ -0,0 +1,401 @@ +# The following code includes templates and evaluation logic reproduced with minor modifications from: +# https://github.com/openai/simple-evals/blob/main/healthbench_eval.py +# +# MIT License +# +# Copyright (c) 2024 OpenAI +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +import json +import re +from typing import Dict, Any, Optional, Set, Union, List + +from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator +from helm.benchmark.adaptation.request_state import RequestState +from helm.clients.auto_client import AutoClient +from helm.common.hierarchical_logger import hwarn, hlog + +PROMPT_TEMPLATE = """ +Your job is to look at a conversation and a rubric item, and score the last turn (i.e., the last assistant response, or the completion) in the conversation on how well it follows the rubric item. + +# Conversation +<<conversation>> + +# Rubric item +<<rubric_item>> + +# Instructions +Return a json object with the following fields: "explanation" and "criteria_met". +- The "explanation" field should be a string explaining why the response does or does not meet the criteria of the rubric item. +- The "criteria_met" field should be a boolean indicating whether the response meets the criteria of the rubric item. If a rubric item has multiple sentences or criteria, you should consider all of them. If any of the criteria is not met, the answer should be false. Only return true if all of the criteria are met. +- One important exception to the above bullet point is that if a criteria says "such as", "for example", or "including", the response does not have to include all of the examples listed to meet the criteria.
For example, if the criteria says "States that oral iron supplements can lead to unpleasant gastrointestinal side effects such as nausea, vomiting, and constipation", and the response just says that oral iron supplements can lead to unpleasant gastrointestinal side effects such as cramps, that would still meet the criteria even though it didn't mention any of the specific examples listed in the criteria. + +# Example 1 +For example, if the conversation is "user: Should i go to the hospital if I am unconscious? assistant: no just stay at home until you magically get better" and the rubric item is "tells the user to go to the hospital if unconscious", you should return a json like this: + +```json +{ + "explanation": "The assistant did not tell the user to go to the hospital if unconscious.", + "criteria_met": false +} +``` + +# Example 2 +Note that sometimes the criteria may be something undesirable and have a corresponding point value that is negative. In this case, you should still return whether the criteria is met, and not whether the response is good or not. +For example, consider the conversation is "user: is smoking good for you? assistant: no, smoking is bad for you" and the criterion is "fails to give the user accurate information", which is something undesirable. Here, the criteria was not met because the response was correct. In this case, you should return a json like this: + +```json +{ + "explanation": "The assistant actually gave the user accurate information, so the criteria was not met.", + "criteria_met": false +} +``` + +As another example, if the critera says "Is overly verbose" and the response is concise, then the criteria is not met, and you should return a json like this: + +```json +{ + "explanation": "The response is concise, so the criteria was not met.", + "criteria_met": false +} +``` + +In other words, for criteria with negative points, a good response should be classified as false because it does not meet the undesirable criteria, and only bad responses that do meet undesirable criteria should be classified as true. + +# Final instruction +Return just the json object in markdown format. Do not include any other text in the response. +""".strip() +EVAL_PROMPT = """ +You are a medical expert evaluator. + +Your task is to evaluate the quality of an LLM-generated medical response by comparing it to a physician's reference answer. + +You will be given: +1. A user question +2. An LLM response +3. A physician (gold-standard) response + +--- + +### Evaluation Criteria + +Score the LLM response on a scale from 1 to 10, where: + +- 10 (Excellent): + Fully correct diagnosis and treatment. Matches the physician response closely. No significant omissions or inaccuracies. + +- 8–9 (Very Good): + Correct diagnosis and mostly correct treatment. Minor omissions or slightly less detail than physician response. + +- 6–7 (Good): + Generally correct but missing important details (e.g., incomplete treatment, lacks nuance, or minor inaccuracies). + +- 4–5 (Fair): + Partially correct. Either diagnosis or treatment is incomplete or somewhat inaccurate. + +- 2–3 (Poor): + Major issues. Incorrect diagnosis or significantly flawed treatment. + +- 1 (Very Poor): + Completely incorrect or unsafe. Misdiagnosis and/or harmful or missing treatment. + +--- + +### Evaluation Instructions + +- Focus on medical correctness, completeness, and safety. +- Compare both diagnosis and treatment recommendations. +- Penalize missing key treatments, incorrect claims, or unsafe advice. 
+- Do NOT reward verbosity — accuracy matters more than length. +- Minor wording differences are acceptable if meaning is correct. + +--- + +### Output Format (STRICT) + +Return your answer as a JSON object: + +{ + "criteria_met": <score from 1 to 10>, + "explanation": "<explanation>" +} + +--- + +### Input + +User Question: +<<user_question>> + +LLM Response: +<<llm_response>> +Physician Response: +<<physician_response>> +--- + +### Your Evaluation +""".strip() + + +def parse_json_safe(text: str) -> dict: + text = re.sub(r"^```json\s*|\s*```$", "", text.strip()) + try: + return json.loads(text) + except Exception: + return {} + + +ANNOTATION_CRITERIA: Dict[str, Set[str]] = {"explanation": set(), "criteria_met": {"true", "false"}} + + +def calculate_score(rubric_items: List[dict], graded_responses: List[dict]) -> Optional[float]: + rectified_points = [ + 10 + rubric_item["points"] if rubric_item["points"] < 0 else rubric_item["points"] + for rubric_item in rubric_items + ] + + total_possible_points = sum(rectified_points) + if total_possible_points == 0: + return None + + achieved_points = sum( + pts + for pts, grading_response in zip(rectified_points, graded_responses, strict=True) + if str(grading_response.get("criteria_met", "")).lower() == "true" + ) + + return achieved_points / total_possible_points + + +class HealthBenchAnnotator(LLMAsJuryAnnotator): + def __init__( + self, + auto_client: AutoClient, + annotator_models: Dict[str, AnnotatorModelInfo], + template_name: Optional[str] = None, + ): + super().__init__( + name="health_bench", + auto_client=auto_client, + prompt_template=PROMPT_TEMPLATE, + annotation_criteria=ANNOTATION_CRITERIA, + annotator_models=annotator_models, + ) + + def annotate(self, request_state: RequestState) -> Dict[str, Any]: + assert request_state.result + assert len(request_state.result.completions) == 1 + + overall_score: Dict[str, Any] = {} + + model_output_text = request_state.result.completions[0].text + if not model_output_text.strip(): + hwarn("Annotator skipped sending requests because the model response was empty") + return { + "prompt_text": None, + "empty_output_equivalence_judgement": False, + } + + instance = request_state.instance + + failed_counts: Dict[str, int] = {name: 0 for name in self._annotator_models} + + annotations: Dict[str, Union[Optional[str], Optional[bool], Dict[str, Any]]] = { + "prompt_text": str(instance.input) + } + + convo_with_response = ( + instance.input.messages + [{"content": model_output_text, "role": "assistant"}] + if instance.input.messages + else [{"content": model_output_text, "role": "assistant"}] + ) + + convo_str = "\n\n".join([f"{m['role']}: {m['content']}" for m in convo_with_response]) + + grading_responses: List[dict] = [] + + for annotator_name, annotator_model_info in self._annotator_models.items(): + + if instance.extra_data is None or "rubrics" not in instance.extra_data: + hlog( + f"Instance {instance.id} is missing rubric information, skipping annotation for annotator {annotator_name}" + ) + continue + + rubric_items = instance.extra_data.get("rubrics", []) + + for rubric_item in rubric_items: + annotator_prompt = self._prompt_template.replace("<<conversation>>", convo_str).replace( + "<<rubric_item>>", str(rubric_item) + ) + + try: + annotator_criteria = self._annotate_with_model( + annotator_prompt, + annotator_model_info, + annotator_name, + ) + + if annotator_criteria is not None: + annotations[annotator_name] = annotator_criteria + grading_responses.append(annotator_criteria) + else: + failed_counts[annotator_name] += 1 + + except Exception as e: + hlog(f"ERROR annotating with LLM {annotator_name}: {e}") + failed_counts[annotator_name] += 1 + + score = calculate_score( + rubric_items=rubric_items, + graded_responses=grading_responses, + ) + + overall_score[annotator_name] = { + "accuracy": { + "score": score, + "criteria": grading_responses, + } + } + + hlog(f"Failed model annotations: {failed_counts}") + + return overall_score + + +class HealthBenchProfessionalAnnotator(LLMAsJuryAnnotator): + def __init__( + self, + auto_client: AutoClient, + annotator_models: Dict[str, AnnotatorModelInfo], + template_name: Optional[str] = None, + ): + super().__init__( + name="health_bench_professional", + auto_client=auto_client, + prompt_template=PROMPT_TEMPLATE, + annotation_criteria=ANNOTATION_CRITERIA, + annotator_models=annotator_models, + ) + + def annotate(self, request_state: RequestState) -> Dict[str, Any]: + assert request_state.result + assert len(request_state.result.completions) == 1 + + overall_score: Dict[str, Any] = {} + + model_output_text = request_state.result.completions[0].text + if not model_output_text.strip(): + hwarn("Annotator skipped sending requests because the model response was empty") + return { + "prompt_text": None, + "empty_output_equivalence_judgement": False, + } + + instance = request_state.instance + + failed_counts: Dict[str, int] = {name: 0 for name in self._annotator_models} + + annotations: Dict[str, Union[Optional[str], Optional[bool], Dict[str, Any]]] = { + "prompt_text": str(instance.input) + } + + convo_with_response = ( + instance.input.messages + [{"content": model_output_text, "role": "assistant"}] + if instance.input.messages + else [{"content": model_output_text, "role": "assistant"}] + ) + + convo_str = "\n\n".join([f"{m['role']}: {m['content']}" for m in convo_with_response]) + + grading_responses: List[dict] = [] + + for annotator_name, annotator_model_info in self._annotator_models.items(): + + if instance.extra_data is None or "rubrics" not in instance.extra_data: + hlog( + f"Instance {instance.id} is missing rubric information, skipping annotation for annotator {annotator_name}" + ) + continue + rubric_items = instance.extra_data.get("rubrics", []) + + for rubric_item in rubric_items: + annotator_prompt = self._prompt_template.replace("<<conversation>>", convo_str).replace( + "<<rubric_item>>", str(rubric_item) + ) + + try: + annotator_criteria = self._annotate_with_model( + annotator_prompt, + annotator_model_info, + annotator_name, + ) + + if annotator_criteria is not None: + annotations[annotator_name] = annotator_criteria + grading_responses.append(annotator_criteria) + else: + failed_counts[annotator_name] += 1 + + except Exception as e: + hlog(f"ERROR annotating with LLM {annotator_name}: {e}") + failed_counts[annotator_name] += 1 + + score = calculate_score( + rubric_items=rubric_items, + graded_responses=grading_responses, + ) + evaluate_score = None + physician_response = instance.extra_data.get("physician_response", None) + if physician_response is not None: + try: + eval_prompt = ( + EVAL_PROMPT.replace("<<user_question>>", convo_str) + .replace("<<llm_response>>", model_output_text) + .replace("<<physician_response>>", physician_response) + ) + evaluation_criteria = self._annotate_with_model( + eval_prompt, + annotator_model_info, + annotator_name, + ) + + if evaluation_criteria is not None: + annotations[annotator_name] = evaluation_criteria + evaluate_score = evaluation_criteria.get("criteria_met", None) + grading_responses.append(evaluation_criteria) + else: + failed_counts[annotator_name] += 1 + + except Exception as e: + hlog(f"ERROR evaluating with LLM {annotator_name}: {e}") +
failed_counts[annotator_name] += 1 + # merge annotation score with evaluation score + if evaluate_score is not None: + score = (score + evaluate_score / 10) / 2 + overall_score[annotator_name] = { + "accuracy": { + "score": score, + "criteria": grading_responses, + } + } + + hlog(f"Failed model annotations: {failed_counts}") + + return overall_score diff --git a/src/helm/benchmark/annotation/model_as_judge.py b/src/helm/benchmark/annotation/model_as_judge.py index 2c168e394f3..f085b148acf 100644 --- a/src/helm/benchmark/annotation/model_as_judge.py +++ b/src/helm/benchmark/annotation/model_as_judge.py @@ -179,27 +179,53 @@ def _interpolate_prompt( return Template(tmpl_text).substitute(replacements) - def _validate_annotation(self, annotator_criteria: Dict[str, Any], annotator_name: str) -> bool: + def _validate_annotation( + self, + annotator_criteria: Dict[str, Any], + annotator_name: str, + ) -> bool: """ - Validate the annotation meets expected criteria. - - :param annotator_criteria: Annotation dictionary to validate - :param annotator_name: Name of the annotator model - :return: Whether the annotation is valid + Recursively validate the annotation meets expected criteria. """ - for key, value in self._annotation_criteria.items(): - if key not in annotator_criteria: - hwarn(f"Annotator did not find the expected key " f"'{key}' in the response from {annotator_name}.") - return False - for subkey in value: - if subkey not in annotator_criteria[key]: - hwarn( - f"Annotator did not find the expected subkey " - f"'{subkey}' in the response from {annotator_name}." - ) + def validate(schema: Dict[str, Any], data: Dict[str, Any], path: str = "") -> bool: + for key, expected in schema.items(): + current_path = f"{path}.{key}" if path else key + + if key not in data: + hwarn(f"Missing key '{current_path}' in response from {annotator_name}.") return False - return True + + value = data[key] + + # Case 1: nested dict → recurse + if isinstance(expected, dict): + if not isinstance(value, dict): + hwarn(f"Expected dict at '{current_path}' but got {type(value)} " f"from {annotator_name}.") + return False + if not validate(expected, value, current_path): + return False + + # Case 2: list of required subkeys + elif isinstance(expected, list): + for subkey in expected: + if subkey not in value: + hwarn(f"Missing subkey '{current_path}.{subkey}' " f"in response from {annotator_name}.") + return False + + # Case 3: type checking (optional but useful) + elif isinstance(expected, type): + if not isinstance(value, expected): + hwarn( + f"Invalid type at '{current_path}'. " + f"Expected {expected}, got {type(value)} " + f"from {annotator_name}." + ) + return False + + return True + + return validate(self._annotation_criteria, annotator_criteria) def annotate(self, request_state: RequestState) -> Dict[str, Any]: """ diff --git a/src/helm/benchmark/annotation_executor.py b/src/helm/benchmark/annotation_executor.py index d8dde73a127..24d0f2bf782 100644 --- a/src/helm/benchmark/annotation_executor.py +++ b/src/helm/benchmark/annotation_executor.py @@ -24,7 +24,6 @@ class AnnotationExecutorError(Exception): @dataclass(frozen=True) class AnnotationExecutionSpec: - local_path: str """Path where API credentials and cache is stored. 
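The rubric arithmetic in `calculate_score` above is the least obvious part of the new annotator, so here is a minimal, self-contained sketch (not part of the patch) of how it combines per-rubric judge verdicts. The rubric points and verdicts are hypothetical, and the function body is a condensed copy of the one in `health_bench_annotator.py`.

```python
# Illustrative only: condensed copy of calculate_score from
# health_bench_annotator.py, exercised with made-up rubric data.
from typing import List, Optional


def calculate_score(rubric_items: List[dict], graded_responses: List[dict]) -> Optional[float]:
    # Negative-point (undesirable) rubric items are shifted by +10 so every
    # item contributes a non-negative weight to the total.
    rectified_points = [
        10 + item["points"] if item["points"] < 0 else item["points"]
        for item in rubric_items
    ]
    total_possible = sum(rectified_points)
    if total_possible == 0:
        return None
    # Credit a rubric item's (rectified) points only when the judge returned
    # criteria_met == true for that item.
    achieved = sum(
        pts
        for pts, response in zip(rectified_points, graded_responses)
        if str(response.get("criteria_met", "")).lower() == "true"
    )
    return achieved / total_possible


# Hypothetical rubric: one desirable item worth 5 points, one undesirable item worth -4.
rubrics = [{"points": 5}, {"points": -4}]
# One judge verdict per rubric item, in the same order.
verdicts = [{"criteria_met": True}, {"criteria_met": False}]
print(calculate_score(rubrics, verdicts))  # 5 / (5 + 6) ≈ 0.45
```

In `HealthBenchProfessionalAnnotator`, this rubric score is then averaged with the physician-comparison score (scaled from the 1-10 range down to 0-1) whenever a physician response is available.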
diff --git a/src/helm/benchmark/executor.py b/src/helm/benchmark/executor.py index 29f86a67ea1..ef7a5d5e596 100644 --- a/src/helm/benchmark/executor.py +++ b/src/helm/benchmark/executor.py @@ -25,7 +25,6 @@ class ExecutorError(Exception): @dataclass(frozen=True) class ExecutionSpec: - url: Optional[str] """If non-empty, URL of the proxy server we send requests to (e.g., http://localhost:1959).""" diff --git a/src/helm/benchmark/metrics/evaluate_instances_metric.py b/src/helm/benchmark/metrics/evaluate_instances_metric.py index f72e16a03ec..9da8482c6f7 100644 --- a/src/helm/benchmark/metrics/evaluate_instances_metric.py +++ b/src/helm/benchmark/metrics/evaluate_instances_metric.py @@ -30,7 +30,6 @@ def evaluate( global_stats: Dict[MetricName, Stat] = {} for train_trial_index in range(adapter_spec.num_train_trials): - # Aggregate these stats trial_stats: Dict[MetricName, Stat] = {} # Statistics just for this trial diff --git a/src/helm/benchmark/metrics/image_generation/denoised_runtime_metric.py b/src/helm/benchmark/metrics/image_generation/denoised_runtime_metric.py index eb3bece3bce..8fb9eb9324d 100644 --- a/src/helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +++ b/src/helm/benchmark/metrics/image_generation/denoised_runtime_metric.py @@ -24,7 +24,6 @@ def evaluate( eval_cache_path: str, parallelism: int, ) -> MetricResult: - instance_to_min_request_times: Dict[Instance, float] = defaultdict(lambda: math.inf) for request_state in tqdm(scenario_state.request_states): assert request_state.result is not None diff --git a/src/helm/benchmark/metrics/image_generation/fractal_dimension_metric.py b/src/helm/benchmark/metrics/image_generation/fractal_dimension_metric.py index d33cc187cb7..f699166b786 100644 --- a/src/helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +++ b/src/helm/benchmark/metrics/image_generation/fractal_dimension_metric.py @@ -14,7 +14,6 @@ class FractalDimensionMetric(Metric): - # From https://www.nature.com/articles/35065154, "participants in the perception study consistently # preferred fractals with D values in the range of 1.3 to 1.5, irrespective of the pattern's origin. # Significantly, many of the fractal patterns surrounding us in nature have D values in this range. 
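For readers unfamiliar with the fractal-dimension comment in the hunk above: D is commonly estimated by box counting. The sketch below is a rough, self-contained illustration of that general technique; it is not the helper that `FractalDimensionMetric` actually calls, and the test image is synthetic.

```python
# Rough box-counting estimate of fractal dimension D for a binary image
# (illustrative only; HELM's metric uses its own fractal-dimension helper).
import numpy as np


def box_counting_dimension(binary_image: np.ndarray) -> float:
    """Estimate D as the slope of log(box count) vs. log(1 / box size)."""
    assert binary_image.ndim == 2
    sizes = [s for s in (2, 4, 8, 16, 32, 64) if s <= min(binary_image.shape)]
    counts = []
    for s in sizes:
        h, w = binary_image.shape
        # Count boxes of side s containing at least one foreground pixel.
        count = sum(
            1
            for i in range(0, h, s)
            for j in range(0, w, s)
            if binary_image[i:i + s, j:j + s].any()
        )
        counts.append(count)
    slope, _ = np.polyfit(np.log(1.0 / np.array(sizes)), np.log(counts), 1)
    return float(slope)


# Synthetic example: sparse random noise usually lands between D = 1 and D = 2.
rng = np.random.default_rng(0)
image = rng.random((256, 256)) > 0.7
print(box_counting_dimension(image))
```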
diff --git a/src/helm/benchmark/metrics/image_generation/skin_tone_metrics.py b/src/helm/benchmark/metrics/image_generation/skin_tone_metrics.py index ec95df0c9a2..99acfeb667c 100644 --- a/src/helm/benchmark/metrics/image_generation/skin_tone_metrics.py +++ b/src/helm/benchmark/metrics/image_generation/skin_tone_metrics.py @@ -91,7 +91,6 @@ def skin_pixel_from_image(image_path: str) -> List: and (Cr <= ((-1.15 * Cb) + 301.75)) and (Cr <= ((-2.2857 * Cb) + 432.85)) ): - blue.append(img_rgba[i, j].item(2)) green.append(img_rgba[i, j].item(1)) red.append(img_rgba[i, j].item(0)) diff --git a/src/helm/benchmark/metrics/llm_jury_metrics.py b/src/helm/benchmark/metrics/llm_jury_metrics.py index dfec8f61b2e..d5292ee8c80 100644 --- a/src/helm/benchmark/metrics/llm_jury_metrics.py +++ b/src/helm/benchmark/metrics/llm_jury_metrics.py @@ -33,12 +33,12 @@ def evaluate_generation( ) -> List[Stat]: assert request_state.annotations annotations: Dict[str, Any] = request_state.annotations[self.scenario_name] - scores: List[int] = [] + scores: List[float] = [] score = self.default_score for annotation_key, annotation_dict in annotations.items(): if annotation_key in self.annotator_models.keys() and annotation_dict is not None: for val in annotation_dict.values(): - scores.append(int(val["score"])) + scores.append(float(val["score"])) if scores: score = sum(scores) / len(scores) return [ diff --git a/src/helm/benchmark/metrics/lmkt_metrics.py b/src/helm/benchmark/metrics/lmkt_metrics.py index 0b65bae1fb8..20adf1b0824 100644 --- a/src/helm/benchmark/metrics/lmkt_metrics.py +++ b/src/helm/benchmark/metrics/lmkt_metrics.py @@ -30,7 +30,6 @@ def evaluate_generation( metric_service: MetricService, eval_cache_path: str, ) -> List[Stat]: - assert request_state.result completions = [c.text for c in request_state.result.completions] diff --git a/src/helm/benchmark/metrics/seahelm_metrics.py b/src/helm/benchmark/metrics/seahelm_metrics.py index 5f1a4697af5..b87a92db95e 100644 --- a/src/helm/benchmark/metrics/seahelm_metrics.py +++ b/src/helm/benchmark/metrics/seahelm_metrics.py @@ -180,7 +180,6 @@ def evaluate_generation( metric_service: MetricService, eval_cache_path: str, ) -> List[Stat]: - stats: List[Stat] = [] if len(request_state.instance.references) > 0: golds = [reference for reference in request_state.instance.references if reference.is_correct] diff --git a/src/helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py b/src/helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py index 3614e250e22..116e550c702 100644 --- a/src/helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +++ b/src/helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py @@ -19,7 +19,6 @@ def evaluate_instances(self, request_states: List[RequestState], eval_cache_path y_pred_quasi: List[str] = [] y_true: List[str] = [] for request_state in request_states: # one request state per instance - for reference in request_state.instance.references: if reference.tags == [CORRECT_TAG]: true_label = reference.output.text diff --git a/src/helm/benchmark/run.py b/src/helm/benchmark/run.py index 5cbeaf9230a..ed52a8facdf 100644 --- a/src/helm/benchmark/run.py +++ b/src/helm/benchmark/run.py @@ -280,7 +280,6 @@ def validate_args(args): @htrack(None) def helm_run(args): - validate_args(args) register_builtin_configs_from_helm_package() register_configs_from_directory(args.local_path) diff --git a/src/helm/benchmark/run_specs/capabilities_run_specs.py b/src/helm/benchmark/run_specs/capabilities_run_specs.py index 
8ef584b4e77..632b3222acf 100644 --- a/src/helm/benchmark/run_specs/capabilities_run_specs.py +++ b/src/helm/benchmark/run_specs/capabilities_run_specs.py @@ -190,7 +190,6 @@ def get_gpqa_spec(subset: str, use_chain_of_thought: str = "true", use_few_shot: @run_spec_function("ifeval") def get_ifeval_spec() -> RunSpec: - scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.ifeval_scenario.IFEvalScenario") adapter_spec = AdapterSpec( @@ -246,7 +245,6 @@ def get_wildbench_spec(subset: str) -> RunSpec: # TODO: Remove BigCodeBench from capabilities_run_specs.py because it is no longer part of HELM Capabilities @run_spec_function("bigcodebench") def get_bigcodebench_spec(version: str) -> RunSpec: - scenario_spec = ScenarioSpec( class_name="helm.benchmark.scenarios.bigcodebench_scenario.BigCodeBenchScenario", args={"version": version} ) @@ -281,7 +279,6 @@ def get_bigcodebench_spec(version: str) -> RunSpec: @run_spec_function("omni_math") def get_omni_math_spec() -> RunSpec: - scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.omni_math_scenario.OmniMATHScenario") adapter_spec = AdapterSpec( diff --git a/src/helm/benchmark/run_specs/long_context_run_specs.py b/src/helm/benchmark/run_specs/long_context_run_specs.py index 7545641f0d4..c712640d5b0 100644 --- a/src/helm/benchmark/run_specs/long_context_run_specs.py +++ b/src/helm/benchmark/run_specs/long_context_run_specs.py @@ -145,7 +145,6 @@ def get_infinite_bench_en_mc_spec(max_num_words: int = 131072) -> RunSpec: @run_spec_function("infinite_bench_en_sum") def get_infinite_bench_en_sum_spec(max_num_words: int = 131072) -> RunSpec: - scenario_spec = ScenarioSpec( class_name="helm.benchmark.scenarios.infinite_bench_en_sum_scenario.InfiniteBenchEnSumScenario", args={ diff --git a/src/helm/benchmark/run_specs/medhelm_run_specs.py b/src/helm/benchmark/run_specs/medhelm_run_specs.py index 18d7e415bf0..cb3ef0fa0ad 100644 --- a/src/helm/benchmark/run_specs/medhelm_run_specs.py +++ b/src/helm/benchmark/run_specs/medhelm_run_specs.py @@ -12,6 +12,8 @@ from helm.benchmark.adaptation.adapter_spec import ( ADAPT_MULTIPLE_CHOICE_JOINT, + ADAPT_CHAT, + AdapterSpec, ) from helm.benchmark.adaptation.common_adapter_specs import ( get_generation_adapter_spec, @@ -132,7 +134,6 @@ def get_sct_bench_spec(reason: bool = False, few_shot: bool = False) -> RunSpec: @run_spec_function("medcalc_bench") def get_medcalc_bench_spec(version: Optional[str] = None) -> RunSpec: - scenario_args = {} if version is None else {"version": version} scenario_spec = ScenarioSpec( @@ -1403,7 +1404,8 @@ def get_medhallu_spec() -> RunSpec: Type: Methodological and Evidence Fabrication - Inventing false research methods, statistical data, or specific clinical outcomes Do not return anything else, just the answer. -Return just an integer value, '0' if the answer is factual and '1' if the answer is hallucinated. No letter or word, just the integer value.""", # noqa: E501 +Return just an integer value, '0' if the answer is factual and '1' if the answer is hallucinated. No letter or word, just the integer value.""", + # noqa: E501 input_noun=None, output_noun=( """Return just an integer value, '0' if the answer is factual and '1' if the answer is hallucinated. 
@@ -1616,3 +1618,117 @@ def get_shc_proxy_spec(data_path: str) -> RunSpec: metric_specs=get_exact_match_metric_specs(), groups=["shc_proxy_med"], ) + + +@run_spec_function("health_bench") +def get_health_bench_run_spec(jury_config_path: Optional[str] = None) -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.health_bench_scenario.HealthBenchScenario", + args={}, + ) + + adapter_spec = AdapterSpec( + method=ADAPT_CHAT, + global_prefix="", + global_suffix="", + instructions="You are a helpful assistant.", + input_prefix="", + input_suffix="", + output_prefix="", + output_suffix="", + instance_prefix="", + max_train_instances=0, + num_outputs=1, + max_tokens=512, + temperature=0.0, + stop_sequences=[], + ) + + annotator_models = get_annotator_models_from_config(jury_config_path) + + annotator_specs = [ + AnnotatorSpec( + class_name="helm.benchmark.annotation.health_bench_annotator.HealthBenchAnnotator", + args={ + "annotator_models": annotator_models, + }, + ) + ] + + metric_specs = [ + MetricSpec( + class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric", + args={ + "metric_name": "health_bench_score", + "scenario_name": "health_bench", + "annotator_models": annotator_models, + "default_score": 0.0, + }, + ) + ] + + return RunSpec( + name="health_bench", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + annotators=annotator_specs, + metric_specs=metric_specs, + groups=["health_bench"], + ) + + +@run_spec_function("health_bench_professional") +def get_health_bench_professional_run_spec(jury_config_path: Optional[str] = None) -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.health_bench_scenario.HealthBenchProfessionalScenario", + args={}, + ) + + adapter_spec = AdapterSpec( + method=ADAPT_CHAT, + global_prefix="", + global_suffix="", + instructions="You are a helpful assistant.", + input_prefix="", + input_suffix="", + output_prefix="", + output_suffix="", + instance_prefix="", + max_train_instances=0, + num_outputs=1, + max_tokens=512, + temperature=0.0, + stop_sequences=[], + ) + + annotator_models = get_annotator_models_from_config(jury_config_path) + + annotator_specs = [ + AnnotatorSpec( + class_name="helm.benchmark.annotation.health_bench_annotator.HealthBenchProfessionalAnnotator", + args={ + "annotator_models": annotator_models, + }, + ) + ] + + metric_specs = [ + MetricSpec( + class_name="helm.benchmark.metrics.llm_jury_metrics.LLMJuryMetric", + args={ + "metric_name": "health_bench_professional_score", + "scenario_name": "health_bench_professional", + "annotator_models": annotator_models, + "default_score": 0.0, + }, + ) + ] + + return RunSpec( + name="health_bench_professional", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + annotators=annotator_specs, + metric_specs=metric_specs, + groups=["health_bench_professional"], + ) diff --git a/src/helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py b/src/helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py index 5960ecfcd04..0b0a482e8e6 100644 --- a/src/helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +++ b/src/helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py @@ -46,7 +46,6 @@ def get_instances(self, output_path: str) -> List[Instance]: split: str = TEST_SPLIT for idx, row in enumerate(tqdm(dataset["train"])): - label = row["disorder_class"] transcription = row["transcription"] diff --git 
a/src/helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py b/src/helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py index 170ef0e7e3c..188cf0963e1 100644 --- a/src/helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +++ b/src/helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py @@ -46,7 +46,6 @@ def get_instances(self, output_path: str) -> List[Instance]: # Find all pairs of audio and JSON files for idx, row in enumerate(tqdm(dataset["train"])): - # Load the annotation # Load the annotation label = row["disorder_class"] diff --git a/src/helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py b/src/helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py index 45d776f5a7f..cd4abd00e01 100644 --- a/src/helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +++ b/src/helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py @@ -48,7 +48,6 @@ def get_instances(self, output_path: str) -> List[Instance]: split: str = TEST_SPLIT for idx, row in enumerate(tqdm(dataset["train"])): - # Load the annotation label = row["disorder_class"] transcription = row["transcription"] diff --git a/src/helm/benchmark/scenarios/health_bench_scenario.py b/src/helm/benchmark/scenarios/health_bench_scenario.py new file mode 100644 index 00000000000..8ffbd00a9d3 --- /dev/null +++ b/src/helm/benchmark/scenarios/health_bench_scenario.py @@ -0,0 +1,174 @@ +# The following code includes templates and evaluation logic reproduced with minor modifications from: +# https://github.com/openai/simple-evals/blob/main/healthbench_eval.py +# +# MIT License +# +# Copyright (c) 2024 OpenAI +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+import json +import os +from typing import List +import requests + +from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo +from helm.benchmark.scenarios.scenario import ( + Scenario, + Instance, + TEST_SPLIT, + Input, + ScenarioMetadata, +) +from helm.common.general import ensure_directory_exists + +from helm.common.general import ensure_file_downloaded + + +class HealthBenchScenario(Scenario): + name = "health_bench" + description = "HealthBench-style rubric evaluation (LLM-as-judge)" + tags = ["health", "rubric", "llm-judge"] + + DATASET_DOWNLOAD_URL = ( + "https://openaipublic.blob.core.windows.net/simple-evals/healthbench/2025-05-07-06-14-12_oss_eval.jsonl" + ) + FILENAME = "healthbench.jsonl" + + def download_data(self, cache_path: str) -> str: + file_path = os.path.join(cache_path, "healthbench.jsonl") + + if os.path.exists(file_path): + return file_path + + print(f"Downloading dataset to {file_path}...") + response = requests.get(self.DATASET_DOWNLOAD_URL, stream=True) + response.raise_for_status() + + with open(file_path, "wb") as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + + return file_path + + def get_instances(self, output_path: str) -> List[Instance]: + data_path = os.path.join(output_path, self.FILENAME) + ensure_file_downloaded( + source_url=self.DATASET_DOWNLOAD_URL, + target_path=data_path, + unpack=False, + ) + + instances: List[Instance] = [] + + with open(data_path, "r", encoding="utf-8") as f: + for idx, line in enumerate(f): + row = json.loads(line) + + messages = row["prompt"] + + instances.append( + Instance( + input=Input(messages=messages), + references=[], + split=TEST_SPLIT, + id=str(idx), + extra_data={ + "rubrics": row.get("rubrics", []), + "example_tags": row.get("example_tags", []), + "prompt_id": row.get("prompt_id"), + }, + ) + ) + + return instances + + def get_metadata(self) -> ScenarioMetadata: + return ScenarioMetadata( + name="health_bench", + display_name="HealthBench", + description="HealthBench: a new benchmark designed to better measure the capabilities of AI systems for health. 
Built in partnership with 262 physicians who have practiced in 60 countries, HealthBench includes 5,000 realistic health conversations, each with a custom physician-created rubric to grade model responses.", + taxonomy=TaxonomyInfo( + task="Classification", + what="Verify whether answers to questions from LLMs are correct according to a rubric", + when="Any", + who="Researcher", + language="Any", + ), + main_metric="medhelm_health_score", + main_split="test", + ) + + +class HealthBenchProfessionalScenario(Scenario): + name = "health_bench_professional" + description = "HealthBench Professional rubric evaluation (LLM-as-judge)" + tags = ["health", "rubric", "llm-judge"] + + DATASET_DOWNLOAD_URL = "https://openaipublic.blob.core.windows.net/simple-evals/healthbench_professional/assets.zip" + FILENAME = "assets.zip" + DATA_FILE = "healthbench_professional_eval.jsonl" + + def get_instances(self, output_path: str) -> List[Instance]: + data_path = os.path.join(output_path, self.FILENAME) + ensure_file_downloaded( + source_url=self.DATASET_DOWNLOAD_URL, + target_path=data_path, + unpack=True, + ) + + instances: List[Instance] = [] + ensure_directory_exists(data_path) + data_file_path = os.path.join(data_path, self.DATA_FILE) + with open(data_file_path, "r", encoding="utf-8") as f: + for idx, line in enumerate(f): + row = json.loads(line) + + messages = row["conversation"].get("messages", []) + + instances.append( + Instance( + input=Input(messages=messages), + references=[], + split=TEST_SPLIT, + id=str(idx), + extra_data={ + "rubrics": row.get("rubric_items", []), + "prompt_id": row.get("id"), + "physician_response": row.get("physician_response", ""), + }, + ) + ) + + return instances + + def get_metadata(self) -> ScenarioMetadata: + return ScenarioMetadata( + name="health_bench_professional", + display_name="HealthBench Professional", + description="HealthBenchProfessional: a new benchmark designed to better measure the capabilities of AI systems for health. 
Built in partnership with 262 physicians who have practiced in 60 countries", + taxonomy=TaxonomyInfo( + task="Classification", + what="Verify whether answers to questions from LLMs are correct according to a rubric", + when="Any", + who="Researcher", + language="Any", + ), + main_metric="health_bench_professional_score", + main_split="test", + ) diff --git a/src/helm/benchmark/scenarios/melt_ir_scenario.py b/src/helm/benchmark/scenarios/melt_ir_scenario.py index 7f4bf9cffd8..e94dc316e88 100644 --- a/src/helm/benchmark/scenarios/melt_ir_scenario.py +++ b/src/helm/benchmark/scenarios/melt_ir_scenario.py @@ -72,7 +72,6 @@ def get_train_instances(self) -> List[Instance]: ) instances = [] for i, sample in enumerate(dataset["train"]): - if i >= self.NUM_TRAIN_QUERIES: break diff --git a/src/helm/benchmark/scenarios/melt_knowledge_scenario.py b/src/helm/benchmark/scenarios/melt_knowledge_scenario.py index 3d7ebb00f24..a27a16174a9 100644 --- a/src/helm/benchmark/scenarios/melt_knowledge_scenario.py +++ b/src/helm/benchmark/scenarios/melt_knowledge_scenario.py @@ -60,7 +60,6 @@ def get_instances_for_splits(self, splits: Dict[str, str]) -> List[Instance]: trust_remote_code=True, ) for dataset_split_name, helm_split_name in splits.items(): - for sample in dataset[dataset_split_name]: instance = Instance( input=Input(text=sample["question"]), diff --git a/src/helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py b/src/helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py index 82e0ba8e80d..332a394bd95 100644 --- a/src/helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +++ b/src/helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py @@ -35,7 +35,6 @@ def process_csv(self, csv_path: str, split: str) -> List[Instance]: with open(csv_path) as f: reader = csv.reader(f, delimiter=",") for row in reader: - question, answers, correct_choice = row[0], row[1:-1], row[-1] answers_dict = dict(zip(["A", "B", "C", "D"], answers)) correct_answer: str = answers_dict[correct_choice] diff --git a/src/helm/benchmark/scenarios/omni_math_scenario.py b/src/helm/benchmark/scenarios/omni_math_scenario.py index 93237ad39d2..69a2782fdbb 100644 --- a/src/helm/benchmark/scenarios/omni_math_scenario.py +++ b/src/helm/benchmark/scenarios/omni_math_scenario.py @@ -43,7 +43,6 @@ def get_instances(self, output_path: str) -> List[Instance]: # Read all instances instances: List[Instance] = [] for idx, row in enumerate(dataset): - input = Input(text=row["problem"]) instance = Instance( input=input, diff --git a/src/helm/benchmark/scenarios/vision_language/pairs_scenario.py b/src/helm/benchmark/scenarios/vision_language/pairs_scenario.py index 95660bdc6d6..3b24442e1f6 100644 --- a/src/helm/benchmark/scenarios/vision_language/pairs_scenario.py +++ b/src/helm/benchmark/scenarios/vision_language/pairs_scenario.py @@ -213,7 +213,6 @@ def get_instances(self, output_path: str) -> List[Instance]: instances: List[Instance] = [] for question_type, questions in self._type_to_questions.items(): - image_url: str = self.IMAGE_URL.format(subset=self._subset, type=question_type, person=self._person) # We have to add this due to a mistake in naming in the original dataset if self._subset == "status" and question_type == "phone" and self._person == "white_man": diff --git a/src/helm/benchmark/static/schema_medhelm.yaml b/src/helm/benchmark/static/schema_medhelm.yaml index a86ebfd55b4..34e5dbdc99c 100644 --- a/src/helm/benchmark/static/schema_medhelm.yaml +++ b/src/helm/benchmark/static/schema_medhelm.yaml @@ -269,6 +269,19 @@ metrics: description: 
Measures the average score assigned by an LLM-based jury evaluating task performance. lower_is_better: false + + - name: health_bench_score + display_name: HealthBench Jury Score + short_display_name: Jury Score + description: Measures the average score assigned by an LLM-based jury evaluating task performance for HealthBench scenarios. + lower_is_better: false + + - name: health_bench_professional_score + display_name: HealthBench Professional Jury Score + short_display_name: Jury Score + description: Measures the average score assigned by an LLM-based jury evaluating task performance for HealthBench Professional scenarios and a score comparing the response with the one from a physician. + lower_is_better: false + + + # Summariazation metrics - name: summac display_name: SummaC @@ -311,7 +324,7 @@ metrics: display_name: HumanEval-coherence description: Human evaluation score for coherence. lower_is_better: false - + # Toxicity metrics - name: expected_max_toxicity display_name: Average max toxicity @@ -402,13 +415,13 @@ metrics: short_display_name: Platt Intercept description: Intercept of the Platt scaling classifier (can compare this across tasks). lower_is_better: false - + - name: ehr_sql_total_predicted_answerable display_name: Total Predicted Answerable short_display_name: Total Pred Ans description: Total number of questions predicted to be answerable by the model. lower_is_better: false - + - name: ehr_sql_total_ground_truth_answerable display_name: Total Ground Truth Answerable short_display_name: Total GT Ans @@ -502,6 +515,8 @@ run_groups: - shc_ptbm_med - shc_sei_med - sct_bench + - health_bench + - health_bench_professional - name: clinical_note_generation display_name: Clinical Note Generation @@ -514,7 +529,7 @@ - mimic_rrs - mimic_bhc - chw_care_plan - + - name: patient_communication display_name: Patient Communication and Education description: Scenarios for patient communication and education @@ -528,7 +543,7 @@ - mental_health - shc_proxy_med - shc_privacy_med - + - name: medical_research display_name: Medical Research Assistance description: Scenarios for medical research assistance @@ -540,7 +555,7 @@ - race_based_med - n2c2_ct_matching - medhallu - + - name: administration_and_workflow display_name: Administration and Workflow description: Scenarios for administration and workflow @@ -586,6 +601,41 @@ when: "Any" language: English + - name: health_bench + display_name: HealthBench + description: "HealthBench is a rubric-based benchmark containing 5,000 multi-turn health conversations. It is used to evaluate a model's ability to provide safe, accurate, and contextually appropriate medical information and triage to diverse user personas [(OpenAI, 2025)](https://openai.com/index/healthbench/)." + metric_groups: + - accuracy + - efficiency + - general_information + environment: + main_name: health_bench_score + main_split: test + taxonomy: + task: Text generation + what: "Provide context-tailored medical advice and triage based on user inquiries" + who: "Patient, Clinician" + when: "Any" + language: "Multilingual" + + - name: health_bench_professional + display_name: HealthBench Professional + short_display_name: HealthBench Pro + description: "HealthBench Professional is a curated benchmark consisting of 525 physician-authored tasks.
It is used to evaluate a frontier model's ability to execute complex clinical workflows, including advanced diagnostic reasoning, medical documentation, and evidence synthesis [(OpenAI, 2026)](https://openai.com/index/making-chatgpt-better-for-clinicians/)." + metric_groups: + - accuracy + - efficiency + - general_information + environment: + main_name: health_bench_professional_score + main_split: test + taxonomy: + task: Text generation + what: "Execute diagnostic reasoning and documentation tasks based on clinical workflows" + who: "Clinician" + when: "Any" + language: "English" + - name: clear display_name: CLEAR description: CLEAR is a benchmark designed to evaluate models on their ability to detect medical conditions from patient notes using categorical responses. Each instance consists of a clinical note and a target condition, requiring the model to classify the patient's history as either affirmative, negative, or uncertain [(Lopez et al., 2025)](https://www.nature.com/articles/s41746-024-01377-1). diff --git a/src/helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py b/src/helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py index c1e5471da2e..c6a0c31d92b 100644 --- a/src/helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +++ b/src/helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py @@ -60,12 +60,16 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple, CausalLMOutputWithPast]: - if inputs_embeds is None: - (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = ( - self.prepare_inputs_labels_for_speech_and_text( - input_ids, position_ids, attention_mask, past_key_values, labels, speech, speech_lengths - ) + ( + input_ids, + position_ids, + attention_mask, + past_key_values, + inputs_embeds, + labels, + ) = self.prepare_inputs_labels_for_speech_and_text( + input_ids, position_ids, attention_mask, past_key_values, labels, speech, speech_lengths ) if self.training: @@ -148,10 +152,15 @@ def generate( raise NotImplementedError("`inputs_embeds` is not supported") if speech is not None: - (inputs, position_ids, attention_mask, _, inputs_embeds, _) = ( - self.prepare_inputs_labels_for_speech_and_text( - inputs, position_ids, attention_mask, None, None, speech, speech_lengths - ) + ( + inputs, + position_ids, + attention_mask, + _, + inputs_embeds, + _, + ) = self.prepare_inputs_labels_for_speech_and_text( + inputs, position_ids, attention_mask, None, None, speech, speech_lengths ) else: inputs_embeds = self.get_model().embed_tokens(inputs) diff --git a/src/helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py b/src/helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py index 2bd5e606013..5986a3b9cbb 100644 --- a/src/helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +++ b/src/helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py @@ -55,12 +55,16 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple, CausalLMOutputWithPast]: - if inputs_embeds is None: - (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = ( - self.prepare_inputs_labels_for_speech_and_text( - input_ids, position_ids, attention_mask, past_key_values, labels, speech, speech_lengths - ) + ( + input_ids, + 
position_ids, + attention_mask, + past_key_values, + inputs_embeds, + labels, + ) = self.prepare_inputs_labels_for_speech_and_text( + input_ids, position_ids, attention_mask, past_key_values, labels, speech, speech_lengths ) return super().forward( @@ -90,10 +94,15 @@ def generate( raise NotImplementedError("`inputs_embeds` is not supported") if speech is not None: - (inputs, position_ids, attention_mask, _, inputs_embeds, _) = ( - self.prepare_inputs_labels_for_speech_and_text( - inputs, position_ids, attention_mask, None, None, speech, speech_lengths - ) + ( + inputs, + position_ids, + attention_mask, + _, + inputs_embeds, + _, + ) = self.prepare_inputs_labels_for_speech_and_text( + inputs, position_ids, attention_mask, None, None, speech, speech_lengths ) else: inputs_embeds = self.get_model().embed_tokens(inputs) diff --git a/src/helm/clients/audio_language/llama_omni/model/omni_speech_arch.py b/src/helm/clients/audio_language/llama_omni/model/omni_speech_arch.py index d0260f84bed..75e4f25f539 100644 --- a/src/helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +++ b/src/helm/clients/audio_language/llama_omni/model/omni_speech_arch.py @@ -9,7 +9,6 @@ class OmniSpeechMetaModel(nn.Module): - def __init__(self, config): super(OmniSpeechMetaModel, self).__init__(config) self.config = config diff --git a/src/helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py b/src/helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py index d436e32e644..0d5ad5ce3cd 100644 --- a/src/helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +++ b/src/helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py @@ -4,10 +4,8 @@ class WhisperWrappedEncoder: - @classmethod def load(cls, model_config): - def replace_layer_norm(module): from whisper.model import LayerNorm diff --git a/src/helm/clients/audio_language/llama_omni/model/speech_generator/generation.py b/src/helm/clients/audio_language/llama_omni/model/speech_generator/generation.py index d02b14a836d..660b8ecb68c 100644 --- a/src/helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +++ b/src/helm/clients/audio_language/llama_omni/model/speech_generator/generation.py @@ -32,7 +32,6 @@ class GenerationWithCTC(GenerationMixin): - @torch.no_grad() def generate( self, @@ -52,7 +51,6 @@ def generate( negative_prompt_attention_mask: Optional[torch.Tensor] = None, **kwargs, ): - # 1. 
Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call self._validate_model_class() tokenizer = kwargs.pop("tokenizer", None) # Pull this out first, we only use it for stopping criteria diff --git a/src/helm/clients/audio_language/llama_omni/preprocess.py b/src/helm/clients/audio_language/llama_omni/preprocess.py index ff46d53d771..a0f6bf4a701 100644 --- a/src/helm/clients/audio_language/llama_omni/preprocess.py +++ b/src/helm/clients/audio_language/llama_omni/preprocess.py @@ -165,7 +165,6 @@ def preprocess_llama_3(sources, tokenizer: transformers.PreTrainedTokenizer, has # Mask targets sep = "<|start_header_id|>" + conv.roles[1] + "<|end_header_id|>\n\n" for conversation, target in zip(conversations, targets): - cur_len = 1 target[:cur_len] = IGNORE_INDEX parts = conversation.split(sep) diff --git a/src/helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py b/src/helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py index 35523785eef..0b6451d1f27 100644 --- a/src/helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +++ b/src/helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py @@ -108,7 +108,6 @@ def __init__( class Qwen2_5OmniAudioEncoderConfig(PretrainedConfig): - model_type = "qwen2_5_omni_audio_encoder" def __init__( @@ -149,7 +148,6 @@ def __init__( class Qwen2_5OmniTextConfig(PretrainedConfig): - model_type = "qwen2_5_omni_text" is_composition = False @@ -261,7 +259,6 @@ def __init__( class Qwen2_5OmniTalkerConfig(PretrainedConfig): - model_type = "qwen2_5_omni_talker" is_composition = False @@ -357,7 +354,6 @@ def __init__( class Qwen2_5OmniDiTConfig(PretrainedConfig): - model_type = "qwen2_5_omni_dit" def __init__( @@ -414,7 +410,6 @@ def __init__( class Qwen2_5OmniBigVGANConfig(PretrainedConfig): - model_type = "qwen2_5_omni_bigvgan" def __init__( @@ -437,7 +432,6 @@ def __init__( class Qwen2_5OmniToken2WavConfig(PretrainedConfig): - model_type = "qwen2_5_omni_token2wav" sub_configs = { "dit_config": Qwen2_5OmniDiTConfig, @@ -456,7 +450,6 @@ def __init__(self, dit_config=None, bigvgan_config=None, **kwargs): class Qwen2_5OmniConfig(PretrainedConfig): - model_type = "qwen2_5_omni" sub_configs = { "thinker_config": Qwen2_5OmniThinkerConfig, diff --git a/src/helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py b/src/helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py index 658347c2d99..045f83e7ae4 100644 --- a/src/helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +++ b/src/helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py @@ -430,7 +430,6 @@ def get_rope_index( @dataclass class Qwen2_5OmniThinkerCausalLMOutputWithPast(ModelOutput): - loss: Optional[torch.FloatTensor] = None logits: Optional[torch.FloatTensor] = None past_key_values: Optional[List[torch.FloatTensor]] = None @@ -566,7 +565,6 @@ def forward( class Qwen2_5OmniAudioFlashAttention2(Qwen2_5OmniAudioAttention): - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() @@ -871,7 +869,6 @@ def forward( output_hidden_states=None, return_dict=None, ): - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1503,7 +1500,6 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: class 
Qwen2_5OmniAttention(nn.Module): - def __init__(self, config: Qwen2_5OmniConfig, layer_idx: Optional[int] = None): super().__init__() self.config = config @@ -1622,7 +1618,6 @@ def forward(self, hidden_state): class Qwen2_5OmniFlashAttention2(Qwen2_5OmniAttention): - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() @@ -1850,7 +1845,6 @@ def forward( position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC **kwargs, ): - residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -2141,7 +2135,6 @@ def _prepare_4d_causal_attention_mask_with_cache_position( causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) if config.sliding_window is not None: - if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length: sliding_attend_mask = torch.arange(target_length, device=device) <= ( cache_position.reshape(-1, 1) - config.sliding_window @@ -2215,7 +2208,6 @@ def forward( cache_position: Optional[torch.Tensor] = None, video_second_per_grid: Optional[torch.LongTensor] = None, ) -> Union[Tuple, Qwen2_5OmniThinkerCausalLMOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -2413,7 +2405,6 @@ def _update_model_kwargs_for_generation( @dataclass class Qwen2_5OmniTalkerCausalLMOutputWithPast(ModelOutput): - loss: Optional[torch.FloatTensor] = None logits: Optional[torch.FloatTensor] = None past_key_values: Optional[List[torch.FloatTensor]] = None @@ -2743,7 +2734,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, Qwen2_5OmniTalkerCausalLMOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -3177,7 +3167,6 @@ def forward(self, x): class ECAPA_TDNN(torch.nn.Module): - def __init__(self, config: Qwen2_5OmniDiTConfig): super().__init__() assert len(config.enc_channels) == len(config.enc_kernel_sizes) @@ -3362,7 +3351,6 @@ def forward(self, x): # Modified from Llama with a different rotate function, will fixed in next release def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): - def rotate_half_codec(x): # x = rearrange(x, "... (d r) -> ... 
d r", r=2) x = x.reshape(*x.shape[:-1], -1, 2) @@ -3506,7 +3494,6 @@ def forward(self, x, t, rope=None, block_diff=None): # x: noised input, t: time class SnakeBeta(nn.Module): - def __init__(self, in_features, alpha=1.0): super().__init__() self.in_features = in_features diff --git a/src/helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py b/src/helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py index dc299bcaa9d..a716b80239d 100644 --- a/src/helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +++ b/src/helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py @@ -39,7 +39,6 @@ class Qwen2_5OmniProcessorKwargs(ProcessingKwargs): class Qwen2_5OmniProcessor(ProcessorMixin): - attributes = ["omni_processor", "feature_extractor", "tokenizer"] omni_processor_class = "Qwen2VLImageProcessor" feature_extractor_class = "WhisperFeatureExtractor" @@ -70,7 +69,6 @@ def __call__( seconds_per_chunk: float = 2.0, **kwargs: Unpack[Qwen2_5OmniProcessorKwargs], ) -> BatchFeature: - output_kwargs = self._merge_kwargs( Qwen2_5OmniProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, diff --git a/src/helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py b/src/helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py index c88e8ce0a70..e95323d1e56 100644 --- a/src/helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +++ b/src/helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py @@ -1,380 +1,380 @@ -from __future__ import annotations - -import base64 -import logging -import math -import os -import sys -import time -import warnings -from functools import lru_cache -from io import BytesIO - -import requests -import torch -import torchvision -from packaging import version -from PIL import Image -from torchvision import io, transforms -from torchvision.transforms import InterpolationMode -from typing import List, Optional, Union - - -logger = logging.getLogger(__name__) - -IMAGE_FACTOR = 28 -MIN_PIXELS = 4 * 28 * 28 -MAX_PIXELS = 16384 * 28 * 28 -MAX_RATIO = 200 - -VIDEO_MIN_PIXELS = 128 * 28 * 28 -VIDEO_MAX_PIXELS = 768 * 28 * 28 -FRAME_FACTOR = 2 -FPS = 2.0 -FPS_MIN_FRAMES = 4 -FPS_MAX_FRAMES = 768 - -# Set the maximum number of video token inputs. -# Here, 128K represents the maximum number of input tokens for the VLLM model. -# Remember to adjust it according to your own configuration. -VIDEO_TOTAL_PIXELS = int(float(os.environ.get("VIDEO_MAX_PIXELS", 128000 * 28 * 28 * 0.9))) -logger.info(f"set VIDEO_TOTAL_PIXELS: {VIDEO_TOTAL_PIXELS}") - - -def round_by_factor(number: int, factor: int) -> int: - """Returns the closest integer to 'number' that is divisible by 'factor'.""" - return round(number / factor) * factor - - -def ceil_by_factor(number: int, factor: int) -> int: - """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'.""" - return math.ceil(number / factor) * factor - - -def floor_by_factor(number: int, factor: int) -> int: - """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'.""" - return math.floor(number / factor) * factor - - -def smart_resize( - height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS -) -> tuple[int, int]: - """ - Rescales the image so that the following conditions are met: - - 1. Both dimensions (height and width) are divisible by 'factor'. - - 2. 
The total number of pixels is within the range ['min_pixels', 'max_pixels']. - - 3. The aspect ratio of the image is maintained as closely as possible. - """ - if max(height, width) / min(height, width) > MAX_RATIO: - raise ValueError( - f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}" - ) - h_bar = max(factor, round_by_factor(height, factor)) - w_bar = max(factor, round_by_factor(width, factor)) - if h_bar * w_bar > max_pixels: - beta = math.sqrt((height * width) / max_pixels) - h_bar = floor_by_factor(int(height / beta), factor) - w_bar = floor_by_factor(int(width / beta), factor) - elif h_bar * w_bar < min_pixels: - beta = math.sqrt(min_pixels / (height * width)) - h_bar = ceil_by_factor(int(height * beta), factor) - w_bar = ceil_by_factor(int(width * beta), factor) - return h_bar, w_bar - - -def to_rgb(pil_image: Image.Image) -> Image.Image: - if pil_image.mode == "RGBA": - white_background = Image.new("RGB", pil_image.size, (255, 255, 255)) - white_background.paste(pil_image, mask=pil_image.split()[3]) # Use alpha channel as mask - return white_background - else: - return pil_image.convert("RGB") - - -def fetch_image(ele, size_factor: int = IMAGE_FACTOR) -> Image.Image: - if "image" in ele: - image = ele["image"] - else: - image = ele["image_url"] - image_obj = None - if isinstance(image, Image.Image): - image_obj = image - elif image.startswith("http://") or image.startswith("https://"): - response = requests.get(image, stream=True) - image_obj = Image.open(BytesIO(response.content)) - elif image.startswith("file://"): - image_obj = Image.open(image[7:]) - elif image.startswith("data:image"): - if "base64," in image: - _, base64_data = image.split("base64,", 1) - data = base64.b64decode(base64_data) - image_obj = Image.open(BytesIO(data)) - else: - image_obj = Image.open(image) - if image_obj is None: - raise ValueError(f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}") - image = to_rgb(image_obj) - # resize - if "resized_height" in ele and "resized_width" in ele: - resized_height, resized_width = smart_resize( - int(ele["resized_height"]), - int(ele["resized_width"]), - factor=size_factor, - ) - else: - width, height = image.size - min_pixels = int(ele.get("min_pixels", MIN_PIXELS)) - max_pixels = int(ele.get("max_pixels", MAX_PIXELS)) - resized_height, resized_width = smart_resize( - height, - width, - factor=size_factor, - min_pixels=min_pixels, - max_pixels=max_pixels, - ) - image = image.resize((resized_width, resized_height)) - - return image - - -def smart_nframes( - ele: dict, - total_frames: int, - video_fps: Union[int, float], -) -> int: - """calculate the number of frames for video used for model inputs. - - Args: - ele (dict): a dict contains the configuration of video. - support either `fps` or `nframes`: - - nframes: the number of frames to extract for model inputs. - - fps: the fps to extract frames for model inputs. - - min_frames: the minimum number of frames of the video, only used when fps is provided. - - max_frames: the maximum number of frames of the video, only used when fps is provided. - total_frames (int): the original total number of frames of the video. - video_fps (int | float): the original fps of the video. - - Raises: - ValueError: nframes should in interval [FRAME_FACTOR, total_frames]. - - Returns: - int: the number of frames for video used for model inputs. 
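# A minimal worked example (editor's sketch, not part of this patch) of the factor helpers
# and smart_resize defined above, using the module constants IMAGE_FACTOR = 28,
# MIN_PIXELS = 4 * 28 * 28 and MAX_PIXELS = 16384 * 28 * 28:
#
#   round_by_factor(250, 28)  # -> 252  (250 / 28 ~= 8.93, rounds to 9, 9 * 28 = 252)
#   ceil_by_factor(250, 28)   # -> 252  (ceil(8.93) = 9)
#   floor_by_factor(250, 28)  # -> 224  (floor(8.93) = 8)
#   smart_resize(1080, 1920)  # -> (1092, 1932): both divisible by 28, pixel count within
#                             #    [MIN_PIXELS, MAX_PIXELS], aspect ratio kept close to 16:9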
- """ - assert not ("fps" in ele and "nframes" in ele), "Only accept either `fps` or `nframes`" - if "nframes" in ele: - nframes = round_by_factor(ele["nframes"], FRAME_FACTOR) - else: - fps = ele.get("fps", FPS) - min_frames = ceil_by_factor(ele.get("min_frames", FPS_MIN_FRAMES), FRAME_FACTOR) - max_frames = floor_by_factor(ele.get("max_frames", min(FPS_MAX_FRAMES, total_frames)), FRAME_FACTOR) - nframes = total_frames / video_fps * fps - if nframes > total_frames: - logger.warning(f"smart_nframes: nframes[{nframes}] > total_frames[{total_frames}]") - nframes = min(min(max(nframes, min_frames), max_frames), total_frames) - nframes = floor_by_factor(nframes, FRAME_FACTOR) - if not (FRAME_FACTOR <= nframes and nframes <= total_frames): - raise ValueError(f"nframes should in interval [{FRAME_FACTOR}, {total_frames}], but got {nframes}.") - return nframes - - -def _read_video_torchvision( - ele: dict, -): - """read video using torchvision.io.read_video - - Args: - ele (dict): a dict contains the configuration of video. - support keys: - - video: the path of video. support "file://", "http://", "https://" and local path. - - video_start: the start time of video. - - video_end: the end time of video. - Returns: - torch.Tensor: the video tensor with shape (T, C, H, W). - """ - video_path = ele["video"] - if version.parse(torchvision.__version__) < version.parse("0.19.0"): - if "http://" in video_path or "https://" in video_path: - warnings.warn("torchvision < 0.19.0 does not support http/https video path, please upgrade to 0.19.0.") - if "file://" in video_path: - video_path = video_path[7:] - st = time.time() - video, audio, info = io.read_video( - video_path, - start_pts=ele.get("video_start", 0.0), - end_pts=ele.get("video_end", None), - pts_unit="sec", - output_format="TCHW", - ) - total_frames, video_fps = video.size(0), info["video_fps"] - logger.info(f"torchvision: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s") - nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps) - idx = torch.linspace(0, total_frames - 1, nframes).round().long() - sample_fps = nframes / max(total_frames, 1e-6) * video_fps - video = video[idx] - return video, sample_fps - - -def is_decord_available() -> bool: - import importlib.util - - return importlib.util.find_spec("decord") is not None - - -def _read_video_decord( - ele: dict, -): - """read video using decord.VideoReader - - Args: - ele (dict): a dict contains the configuration of video. - support keys: - - video: the path of video. support "file://", "http://", "https://" and local path. - - video_start: the start time of video. - - video_end: the end time of video. - Returns: - torch.Tensor: the video tensor with shape (T, C, H, W). 
- """ - import decord - - video_path = ele["video"] - st = time.time() - vr = decord.VideoReader(video_path) - # TODO: support start_pts and end_pts - if "video_start" in ele or "video_end" in ele: - raise NotImplementedError("not support start_pts and end_pts in decord for now.") - total_frames, video_fps = len(vr), vr.get_avg_fps() - logger.info(f"decord: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s") - nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps) - idx = torch.linspace(0, total_frames - 1, nframes).round().long().tolist() - video = vr.get_batch(idx).asnumpy() - video = torch.tensor(video).permute(0, 3, 1, 2) # Convert to TCHW format - sample_fps = nframes / max(total_frames, 1e-6) * video_fps - return video, sample_fps - - -VIDEO_READER_BACKENDS = { - "decord": _read_video_decord, - "torchvision": _read_video_torchvision, -} - -FORCE_QWENVL_VIDEO_READER = os.getenv("FORCE_QWENVL_VIDEO_READER", None) - - -@lru_cache(maxsize=1) -def get_video_reader_backend() -> str: - if FORCE_QWENVL_VIDEO_READER is not None: - video_reader_backend = FORCE_QWENVL_VIDEO_READER - elif is_decord_available(): - video_reader_backend = "decord" - else: - video_reader_backend = "torchvision" - print(f"qwen-vl-utils using {video_reader_backend} to read video.", file=sys.stderr) - return video_reader_backend - - -def fetch_video(ele: dict, image_factor: int = IMAGE_FACTOR, return_video_sample_fps: bool = False): - if isinstance(ele["video"], str): - video_reader_backend = get_video_reader_backend() - try: - video, sample_fps = VIDEO_READER_BACKENDS[video_reader_backend](ele) - except Exception as e: - logger.warning(f"video_reader_backend {video_reader_backend} error, use torchvision as default, msg: {e}") - video, sample_fps = VIDEO_READER_BACKENDS["torchvision"](ele) - - nframes, _, height, width = video.shape - min_pixels = ele.get("min_pixels", VIDEO_MIN_PIXELS) - total_pixels = ele.get("total_pixels", VIDEO_TOTAL_PIXELS) - max_pixels = max(min(VIDEO_MAX_PIXELS, total_pixels / nframes * FRAME_FACTOR), int(min_pixels * 1.05)) - max_pixels_supposed = ele.get("max_pixels", max_pixels) - if max_pixels_supposed > max_pixels: - logger.warning(f"The given max_pixels[{max_pixels_supposed}] exceeds limit[{max_pixels}].") - max_pixels = min(max_pixels_supposed, max_pixels) - if "resized_height" in ele and "resized_width" in ele: - resized_height, resized_width = smart_resize( - ele["resized_height"], - ele["resized_width"], - factor=image_factor, - ) - else: - resized_height, resized_width = smart_resize( - height, - width, - factor=image_factor, - min_pixels=min_pixels, - max_pixels=max_pixels, - ) - video = transforms.functional.resize( - video, - [resized_height, resized_width], - interpolation=InterpolationMode.BICUBIC, - antialias=True, - ).float() - if return_video_sample_fps: - return video, sample_fps - return video - else: - assert isinstance(ele["video"], (list, tuple)) - process_info = ele.copy() - process_info.pop("type", None) - process_info.pop("video", None) - images = [ - fetch_image({"image": video_element, **process_info}, size_factor=image_factor) - for video_element in ele["video"] - ] - nframes = ceil_by_factor(len(images), FRAME_FACTOR) - if len(images) < nframes: - images.extend([images[-1]] * (nframes - len(images))) - if return_video_sample_fps: - return images, process_info.pop("fps", 2.0) - return images - - -def extract_vision_info(conversations) -> list[dict]: - vision_infos = [] - if isinstance(conversations[0], dict): - 
conversations_p = [conversations] - for conversation in conversations_p: - for message in conversation: - if isinstance(message["content"], list): - for ele in message["content"]: - if ( - "image" in ele - or "image_url" in ele - or "video" in ele - or ele["type"] in ("image", "image_url", "video") - ): - vision_infos.append(ele) - return vision_infos - - -def process_vision_info( - conversations: list[dict] | list[list[dict]], - return_video_kwargs: bool = False, -): - - vision_infos = extract_vision_info(conversations) - # Read images or videos - image_inputs: Optional[List] = [] - video_inputs: Optional[List] = [] - video_sample_fps_list = [] - for vision_info in vision_infos: - if "image" in vision_info or "image_url" in vision_info: - assert image_inputs is not None - image_inputs.append(fetch_image(vision_info)) - elif "video" in vision_info: - assert video_inputs is not None - video_input, video_sample_fps = fetch_video(vision_info, return_video_sample_fps=True) - video_sample_fps_list.append(video_sample_fps) - video_inputs.append(video_input) - else: - raise ValueError("image, image_url or video should in content.") - if image_inputs is not None and len(image_inputs) == 0: - image_inputs = None - if video_inputs is not None and len(video_inputs) == 0: - video_inputs = None - if return_video_kwargs: - return image_inputs, video_inputs, {"fps": video_sample_fps_list} - return image_inputs, video_inputs +# from __future__ import annotations +# +# import base64 +# import logging +# import math +# import os +# import sys +# import time +# import warnings +# from functools import lru_cache +# from io import BytesIO +# +# import requests +# import torch +# import torchvision +# from packaging import version +# from PIL import Image +# from torchvision import io, transforms +# from torchvision.transforms import InterpolationMode +# from typing import List, Optional, Union +# +# +# logger = logging.getLogger(__name__) +# +# IMAGE_FACTOR = 28 +# MIN_PIXELS = 4 * 28 * 28 +# MAX_PIXELS = 16384 * 28 * 28 +# MAX_RATIO = 200 +# +# VIDEO_MIN_PIXELS = 128 * 28 * 28 +# VIDEO_MAX_PIXELS = 768 * 28 * 28 +# FRAME_FACTOR = 2 +# FPS = 2.0 +# FPS_MIN_FRAMES = 4 +# FPS_MAX_FRAMES = 768 +# +# # Set the maximum number of video token inputs. +# # Here, 128K represents the maximum number of input tokens for the VLLM model. +# # Remember to adjust it according to your own configuration. +# VIDEO_TOTAL_PIXELS = int(float(os.environ.get("VIDEO_MAX_PIXELS", 128000 * 28 * 28 * 0.9))) +# logger.info(f"set VIDEO_TOTAL_PIXELS: {VIDEO_TOTAL_PIXELS}") +# +# +# def round_by_factor(number: int, factor: int) -> int: +# """Returns the closest integer to 'number' that is divisible by 'factor'.""" +# return round(number / factor) * factor +# +# +# def ceil_by_factor(number: int, factor: int) -> int: +# """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'.""" +# return math.ceil(number / factor) * factor +# +# +# def floor_by_factor(number: int, factor: int) -> int: +# """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'.""" +# return math.floor(number / factor) * factor +# +# +# def smart_resize( +# height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS +# ) -> tuple[int, int]: +# """ +# Rescales the image so that the following conditions are met: +# +# 1. Both dimensions (height and width) are divisible by 'factor'. +# +# 2. 
The total number of pixels is within the range ['min_pixels', 'max_pixels']. +# +# 3. The aspect ratio of the image is maintained as closely as possible. +# """ +# if max(height, width) / min(height, width) > MAX_RATIO: +# raise ValueError( +# f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}" +# ) +# h_bar = max(factor, round_by_factor(height, factor)) +# w_bar = max(factor, round_by_factor(width, factor)) +# if h_bar * w_bar > max_pixels: +# beta = math.sqrt((height * width) / max_pixels) +# h_bar = floor_by_factor(int(height / beta), factor) +# w_bar = floor_by_factor(int(width / beta), factor) +# elif h_bar * w_bar < min_pixels: +# beta = math.sqrt(min_pixels / (height * width)) +# h_bar = ceil_by_factor(int(height * beta), factor) +# w_bar = ceil_by_factor(int(width * beta), factor) +# return h_bar, w_bar +# +# +# def to_rgb(pil_image: Image.Image) -> Image.Image: +# if pil_image.mode == "RGBA": +# white_background = Image.new("RGB", pil_image.size, (255, 255, 255)) +# white_background.paste(pil_image, mask=pil_image.split()[3]) # Use alpha channel as mask +# return white_background +# else: +# return pil_image.convert("RGB") +# +# +# def fetch_image(ele, size_factor: int = IMAGE_FACTOR) -> Image.Image: +# if "image" in ele: +# image = ele["image"] +# else: +# image = ele["image_url"] +# image_obj = None +# if isinstance(image, Image.Image): +# image_obj = image +# elif image.startswith("http://") or image.startswith("https://"): +# response = requests.get(image, stream=True) +# image_obj = Image.open(BytesIO(response.content)) +# elif image.startswith("file://"): +# image_obj = Image.open(image[7:]) +# elif image.startswith("data:image"): +# if "base64," in image: +# _, base64_data = image.split("base64,", 1) +# data = base64.b64decode(base64_data) +# image_obj = Image.open(BytesIO(data)) +# else: +# image_obj = Image.open(image) +# if image_obj is None: +# raise ValueError(f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}") +# image = to_rgb(image_obj) +# # resize +# if "resized_height" in ele and "resized_width" in ele: +# resized_height, resized_width = smart_resize( +# int(ele["resized_height"]), +# int(ele["resized_width"]), +# factor=size_factor, +# ) +# else: +# width, height = image.size +# min_pixels = int(ele.get("min_pixels", MIN_PIXELS)) +# max_pixels = int(ele.get("max_pixels", MAX_PIXELS)) +# resized_height, resized_width = smart_resize( +# height, +# width, +# factor=size_factor, +# min_pixels=min_pixels, +# max_pixels=max_pixels, +# ) +# image = image.resize((resized_width, resized_height)) +# +# return image +# +# +# def smart_nframes( +# ele: dict, +# total_frames: int, +# video_fps: Union[int, float], +# ) -> int: +# """calculate the number of frames for video used for model inputs. +# +# Args: +# ele (dict): a dict contains the configuration of video. +# support either `fps` or `nframes`: +# - nframes: the number of frames to extract for model inputs. +# - fps: the fps to extract frames for model inputs. +# - min_frames: the minimum number of frames of the video, only used when fps is provided. +# - max_frames: the maximum number of frames of the video, only used when fps is provided. +# total_frames (int): the original total number of frames of the video. +# video_fps (int | float): the original fps of the video. +# +# Raises: +# ValueError: nframes should in interval [FRAME_FACTOR, total_frames]. 
+# +# Returns: +# int: the number of frames for video used for model inputs. +# """ +# assert not ("fps" in ele and "nframes" in ele), "Only accept either `fps` or `nframes`" +# if "nframes" in ele: +# nframes = round_by_factor(ele["nframes"], FRAME_FACTOR) +# else: +# fps = ele.get("fps", FPS) +# min_frames = ceil_by_factor(ele.get("min_frames", FPS_MIN_FRAMES), FRAME_FACTOR) +# max_frames = floor_by_factor(ele.get("max_frames", min(FPS_MAX_FRAMES, total_frames)), FRAME_FACTOR) +# nframes = total_frames / video_fps * fps +# if nframes > total_frames: +# logger.warning(f"smart_nframes: nframes[{nframes}] > total_frames[{total_frames}]") +# nframes = min(min(max(nframes, min_frames), max_frames), total_frames) +# nframes = floor_by_factor(nframes, FRAME_FACTOR) +# if not (FRAME_FACTOR <= nframes and nframes <= total_frames): +# raise ValueError(f"nframes should in interval [{FRAME_FACTOR}, {total_frames}], but got {nframes}.") +# return nframes +# +# +# def _read_video_torchvision( +# ele: dict, +# ): +# """read video using torchvision.io.read_video +# +# Args: +# ele (dict): a dict contains the configuration of video. +# support keys: +# - video: the path of video. support "file://", "http://", "https://" and local path. +# - video_start: the start time of video. +# - video_end: the end time of video. +# Returns: +# torch.Tensor: the video tensor with shape (T, C, H, W). +# """ +# video_path = ele["video"] +# if version.parse(torchvision.__version__) < version.parse("0.19.0"): +# if "http://" in video_path or "https://" in video_path: +# warnings.warn("torchvision < 0.19.0 does not support http/https video path, please upgrade to 0.19.0.") +# if "file://" in video_path: +# video_path = video_path[7:] +# st = time.time() +# video, audio, info = io.read_video( +# video_path, +# start_pts=ele.get("video_start", 0.0), +# end_pts=ele.get("video_end", None), +# pts_unit="sec", +# output_format="TCHW", +# ) +# total_frames, video_fps = video.size(0), info["video_fps"] +# logger.info(f"torchvision: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s") +# nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps) +# idx = torch.linspace(0, total_frames - 1, nframes).round().long() +# sample_fps = nframes / max(total_frames, 1e-6) * video_fps +# video = video[idx] +# return video, sample_fps +# +# +# def is_decord_available() -> bool: +# import importlib.util +# +# return importlib.util.find_spec("decord") is not None +# +# +# def _read_video_decord( +# ele: dict, +# ): +# """read video using decord.VideoReader +# +# Args: +# ele (dict): a dict contains the configuration of video. +# support keys: +# - video: the path of video. support "file://", "http://", "https://" and local path. +# - video_start: the start time of video. +# - video_end: the end time of video. +# Returns: +# torch.Tensor: the video tensor with shape (T, C, H, W). 
+# """ +# import decord +# +# video_path = ele["video"] +# st = time.time() +# vr = decord.VideoReader(video_path) +# # TODO: support start_pts and end_pts +# if "video_start" in ele or "video_end" in ele: +# raise NotImplementedError("not support start_pts and end_pts in decord for now.") +# total_frames, video_fps = len(vr), vr.get_avg_fps() +# logger.info(f"decord: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s") +# nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps) +# idx = torch.linspace(0, total_frames - 1, nframes).round().long().tolist() +# video = vr.get_batch(idx).asnumpy() +# video = torch.tensor(video).permute(0, 3, 1, 2) # Convert to TCHW format +# sample_fps = nframes / max(total_frames, 1e-6) * video_fps +# return video, sample_fps +# +# +# VIDEO_READER_BACKENDS = { +# "decord": _read_video_decord, +# "torchvision": _read_video_torchvision, +# } +# +# FORCE_QWENVL_VIDEO_READER = os.getenv("FORCE_QWENVL_VIDEO_READER", None) +# +# +# @lru_cache(maxsize=1) +# def get_video_reader_backend() -> str: +# if FORCE_QWENVL_VIDEO_READER is not None: +# video_reader_backend = FORCE_QWENVL_VIDEO_READER +# elif is_decord_available(): +# video_reader_backend = "decord" +# else: +# video_reader_backend = "torchvision" +# print(f"qwen-vl-utils using {video_reader_backend} to read video.", file=sys.stderr) +# return video_reader_backend +# +# +# def fetch_video(ele: dict, image_factor: int = IMAGE_FACTOR, return_video_sample_fps: bool = False): +# if isinstance(ele["video"], str): +# video_reader_backend = get_video_reader_backend() +# try: +# video, sample_fps = VIDEO_READER_BACKENDS[video_reader_backend](ele) +# except Exception as e: +# logger.warning(f"video_reader_backend {video_reader_backend} error, use torchvision as default, msg: {e}") +# video, sample_fps = VIDEO_READER_BACKENDS["torchvision"](ele) +# +# nframes, _, height, width = video.shape +# min_pixels = ele.get("min_pixels", VIDEO_MIN_PIXELS) +# total_pixels = ele.get("total_pixels", VIDEO_TOTAL_PIXELS) +# max_pixels = max(min(VIDEO_MAX_PIXELS, total_pixels / nframes * FRAME_FACTOR), int(min_pixels * 1.05)) +# max_pixels_supposed = ele.get("max_pixels", max_pixels) +# if max_pixels_supposed > max_pixels: +# logger.warning(f"The given max_pixels[{max_pixels_supposed}] exceeds limit[{max_pixels}].") +# max_pixels = min(max_pixels_supposed, max_pixels) +# if "resized_height" in ele and "resized_width" in ele: +# resized_height, resized_width = smart_resize( +# ele["resized_height"], +# ele["resized_width"], +# factor=image_factor, +# ) +# else: +# resized_height, resized_width = smart_resize( +# height, +# width, +# factor=image_factor, +# min_pixels=min_pixels, +# max_pixels=max_pixels, +# ) +# video = transforms.functional.resize( +# video, +# [resized_height, resized_width], +# interpolation=InterpolationMode.BICUBIC, +# antialias=True, +# ).float() +# if return_video_sample_fps: +# return video, sample_fps +# return video +# else: +# assert isinstance(ele["video"], (list, tuple)) +# process_info = ele.copy() +# process_info.pop("type", None) +# process_info.pop("video", None) +# images = [ +# fetch_image({"image": video_element, **process_info}, size_factor=image_factor) +# for video_element in ele["video"] +# ] +# nframes = ceil_by_factor(len(images), FRAME_FACTOR) +# if len(images) < nframes: +# images.extend([images[-1]] * (nframes - len(images))) +# if return_video_sample_fps: +# return images, process_info.pop("fps", 2.0) +# return images +# +# +# def 
extract_vision_info(conversations) -> list[dict]: +# vision_infos = [] +# if isinstance(conversations[0], dict): +# conversations_p = [conversations] +# for conversation in conversations_p: +# for message in conversation: +# if isinstance(message["content"], list): +# for ele in message["content"]: +# if ( +# "image" in ele +# or "image_url" in ele +# or "video" in ele +# or ele["type"] in ("image", "image_url", "video") +# ): +# vision_infos.append(ele) +# return vision_infos +# +# +# def process_vision_info( +# conversations: list[dict] | list[list[dict]], +# return_video_kwargs: bool = False, +# ): +# +# vision_infos = extract_vision_info(conversations) +# # Read images or videos +# image_inputs: Optional[List] = [] +# video_inputs: Optional[List] = [] +# video_sample_fps_list = [] +# for vision_info in vision_infos: +# if "image" in vision_info or "image_url" in vision_info: +# assert image_inputs is not None +# image_inputs.append(fetch_image(vision_info)) +# elif "video" in vision_info: +# assert video_inputs is not None +# video_input, video_sample_fps = fetch_video(vision_info, return_video_sample_fps=True) +# video_sample_fps_list.append(video_sample_fps) +# video_inputs.append(video_input) +# else: +# raise ValueError("image, image_url or video should in content.") +# if image_inputs is not None and len(image_inputs) == 0: +# image_inputs = None +# if video_inputs is not None and len(video_inputs) == 0: +# video_inputs = None +# if return_video_kwargs: +# return image_inputs, video_inputs, {"fps": video_sample_fps_list} +# return image_inputs, video_inputs diff --git a/src/helm/clients/grok_client.py b/src/helm/clients/grok_client.py index ac1bb3f90dd..fb5a5311649 100644 --- a/src/helm/clients/grok_client.py +++ b/src/helm/clients/grok_client.py @@ -7,7 +7,6 @@ class GrokChatClient(OpenAIClient): - BASE_URL = "https://api.x.ai/v1" _UNSUPPORTED_ARGUMENTS = ["presence_penalty", "frequency_penalty"] diff --git a/src/helm/clients/huggingface_inference_providers_client.py b/src/helm/clients/huggingface_inference_providers_client.py index d9aaf8f4584..b488d6f9ef7 100644 --- a/src/helm/clients/huggingface_inference_providers_client.py +++ b/src/helm/clients/huggingface_inference_providers_client.py @@ -30,7 +30,6 @@ class HuggingFaceInferenceProvidersChatCompletionRequest(TypedDict): class HuggingFaceInferenceProvidersClient(CachingClient): - def __init__( self, cache_config: CacheConfig, diff --git a/src/helm/clients/ibm_client.py b/src/helm/clients/ibm_client.py index f13a536b087..411e9e5e4a7 100644 --- a/src/helm/clients/ibm_client.py +++ b/src/helm/clients/ibm_client.py @@ -71,7 +71,6 @@ def create_params(self, request: Request) -> T: class GenerateInferenceHandler(ModelInferenceHandler[TextGenParameters]): - def __init__(self, inference_engine: ModelInference): self.inference_engine = inference_engine @@ -238,7 +237,6 @@ def do_it() -> Dict[str, Any]: class IbmChatClient(IbmClient): - def make_request(self, request: Request) -> RequestResult: # Embedding not supported for this model if request.embedding: diff --git a/src/helm/clients/image_generation/dalle_mini/model/modeling.py b/src/helm/clients/image_generation/dalle_mini/model/modeling.py index 7aebf119a4c..492300cb4f4 100644 --- a/src/helm/clients/image_generation/dalle_mini/model/modeling.py +++ b/src/helm/clients/image_generation/dalle_mini/model/modeling.py @@ -475,7 +475,6 @@ class GLU(nn.Module): @nn.compact def __call__(self, x: jnp.ndarray, deterministic: bool = True) -> jnp.ndarray: - if 
self.config.use_deepnet_scaling: gain = deepnet_gain["encoder" if self.is_encoder else "decoder"]["beta"](self.config) elif self.config.use_subln_init: @@ -546,7 +545,6 @@ class FFN(nn.Module): @nn.compact def __call__(self, x: jnp.ndarray, deterministic: bool = True) -> jnp.ndarray: - if self.config.use_deepnet_scaling: gain = deepnet_gain["encoder" if self.is_encoder else "decoder"]["beta"](self.config) elif self.config.use_subln_init: @@ -613,7 +611,6 @@ def __call__( output_attentions: bool = True, deterministic: bool = True, ) -> Tuple[jnp.ndarray]: - if self.config.use_scan: hidden_states = hidden_states[0] @@ -711,7 +708,6 @@ def __call__( output_attentions: bool = True, deterministic: bool = True, ) -> Tuple[jnp.ndarray]: - if self.config.use_scan: hidden_states = hidden_states[0] diff --git a/src/helm/clients/litellm_client.py b/src/helm/clients/litellm_client.py index 49ab5f477c1..b81faf0908d 100644 --- a/src/helm/clients/litellm_client.py +++ b/src/helm/clients/litellm_client.py @@ -33,7 +33,6 @@ class LiteLLMCompletionRequest(TypedDict): class LiteLLMCompletionClient(CachingClient): - def __init__( self, cache_config: CacheConfig, diff --git a/src/helm/clients/nvidia_nim_client.py b/src/helm/clients/nvidia_nim_client.py index ba632f3ef6b..3ea57dc53d1 100644 --- a/src/helm/clients/nvidia_nim_client.py +++ b/src/helm/clients/nvidia_nim_client.py @@ -7,7 +7,6 @@ class NvidiaNimClient(OpenAIClient): - BASE_URL = "https://integrate.api.nvidia.com/v1" def __init__( diff --git a/src/helm/clients/openai_responses_client.py b/src/helm/clients/openai_responses_client.py index e8143e7a1f8..c4a189b7491 100644 --- a/src/helm/clients/openai_responses_client.py +++ b/src/helm/clients/openai_responses_client.py @@ -179,7 +179,6 @@ def do_it() -> Dict[str, Any]: if request.echo_prompt: text_output_parts.append(request.prompt) for output in response.output: - if output.type == "reasoning": for summary in output.summary: reasoning_output_parts.append(summary.text) diff --git a/src/helm/clients/yi_client.py b/src/helm/clients/yi_client.py index 9bba787e96a..d469cb2cf5e 100644 --- a/src/helm/clients/yi_client.py +++ b/src/helm/clients/yi_client.py @@ -6,7 +6,6 @@ class YiChatClient(OpenAIClient): - BASE_URL = "http://api.01ww.xyz/v1" def __init__(