-
Notifications
You must be signed in to change notification settings - Fork 35
[Feature] Add text-to-image modality support #137
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,20 +1,13 @@ | ||
| # generated by datamodel-codegen: | ||
| # filename: instance_level_eval.schema.json | ||
| # timestamp: 2026-03-19T20:30:15+00:00 | ||
| # timestamp: 2026-05-18T09:44:23+00:00 | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| from enum import Enum | ||
| from typing import Any | ||
|
|
||
| from pydantic import ( | ||
| BaseModel, | ||
| ConfigDict, | ||
| Field, | ||
| confloat, | ||
| conint, | ||
| model_validator, | ||
| ) | ||
| from pydantic import BaseModel, ConfigDict, Field, confloat, conint, model_validator | ||
|
|
||
|
|
||
| class InteractionType(Enum): | ||
|
|
@@ -23,6 +16,11 @@ class InteractionType(Enum): | |
| agentic = 'agentic' | ||
|
|
||
|
|
||
| class Modality(Enum): | ||
| text = 'text' | ||
| text_to_image = 'text_to_image' | ||
|
|
||
|
|
||
| class Input(BaseModel): | ||
| raw: str = Field(..., description='The raw input as defined in the eval') | ||
| formatted: str | None = Field( | ||
|
|
@@ -39,14 +37,6 @@ class Input(BaseModel): | |
| ) | ||
|
|
||
|
|
||
| class Output(BaseModel): | ||
| raw: list[str] = Field(..., description='Complete model responses') | ||
| reasoning_trace: list[str] | None = Field( | ||
| None, | ||
| description='Reasoning traces of the model if applicable (e.g. chain-of-thought tokens)', | ||
| ) | ||
|
|
||
|
|
||
| class ToolCall(BaseModel): | ||
| id: str = Field(..., description='Unique identifier for the tool call') | ||
| name: str = Field(..., description='Name of tool/function') | ||
|
|
@@ -104,8 +94,9 @@ class AnswerAttributionItem(BaseModel): | |
|
|
||
| class Evaluation(BaseModel): | ||
| score: float = Field(..., description='Instance-level score') | ||
| is_correct: bool = Field( | ||
| ..., description='Whether the final answer is correct' | ||
| is_correct: bool | None = Field( | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is there a way to express this that generalizes better? I don't have an immediate idea, but this seems like the start of a boolean explosion to me. |
||
| ..., | ||
| description='Whether the final answer is correct. Required to be present, but may be null when correctness is not well-defined (e.g. continuous T2I metrics like FID or CLIPScore).', | ||
| ) | ||
| num_turns: conint(ge=1) | None = Field( | ||
| None, description='Number of turns in the interaction' | ||
|
|
@@ -150,6 +141,42 @@ class Performance(BaseModel): | |
| ) | ||
|
|
||
|
|
||
| class MediaType(Enum): | ||
| image = 'image' | ||
| video = 'video' | ||
| audio = 'audio' | ||
|
|
||
|
|
||
| class MediaRef(BaseModel): | ||
| model_config = ConfigDict( | ||
| extra='forbid', | ||
| ) | ||
| media_type: MediaType | ||
| uri: str = Field( | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Similar URI comment |
||
| ..., | ||
| description="Location of the artifact: 'file://...', 'https://...', 'hf://...', 's3://...', or 'data:...;base64,...' for inline.", | ||
| ) | ||
| additional_details: dict[str, str] | None = Field( | ||
| None, | ||
| description='Per-artifact extras (key-value pairs, all values must be strings). Use for sha256, mime_type, width/height, seed, index, etc.', | ||
| ) | ||
|
|
||
|
|
||
| class Output(BaseModel): | ||
| raw: list[str] = Field( | ||
| ..., | ||
| description="Complete model responses. For text_to_image modality, populate with one placeholder string per generated artifact (e.g. '<image:0>') so indexes align with output.media[].", | ||
| ) | ||
| reasoning_trace: list[str] | None = Field( | ||
| None, | ||
| description='Reasoning traces of the model if applicable (e.g. chain-of-thought tokens)', | ||
| ) | ||
| media: list[MediaRef] | None = Field( | ||
| None, | ||
| description="Generated media artifacts (image, video, audio). Required when modality == 'text_to_image'.", | ||
| ) | ||
|
|
||
|
|
||
| class InstanceLevelEvaluationLog(BaseModel): | ||
| model_config = ConfigDict( | ||
| extra='forbid', | ||
|
|
@@ -183,7 +210,11 @@ class InstanceLevelEvaluationLog(BaseModel): | |
| ) | ||
| interaction_type: InteractionType = Field( | ||
| ..., | ||
| description='Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents', | ||
| description="Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents. Text-to-image evaluations use 'single_turn'; modality is the orthogonal axis.", | ||
| ) | ||
| modality: Modality | None = Field( | ||
| None, | ||
| description="Modality of this sample. Should match the modality on the linked aggregate evaluation_result. Absent means 'text' for backwards compatibility.", | ||
| ) | ||
| input: Input = Field( | ||
| ..., description='Input data for the evaluation sample' | ||
|
|
@@ -220,22 +251,31 @@ class InstanceLevelEvaluationLog(BaseModel): | |
|
|
||
| # --- validators (added by post_codegen.py) --- | ||
|
|
||
| @model_validator(mode='after') | ||
| @model_validator(mode="after") | ||
| def validate_interaction_type_consistency(self): | ||
| if self.interaction_type == InteractionType.single_turn: | ||
| if self.output is None: | ||
| raise ValueError('single_turn interaction_type requires output') | ||
| raise ValueError("single_turn interaction_type requires output") | ||
| if self.messages is not None: | ||
| raise ValueError( | ||
| 'single_turn interaction_type must not have messages' | ||
| "single_turn interaction_type must not have messages" | ||
| ) | ||
| else: | ||
| if self.messages is None: | ||
| raise ValueError( | ||
| f'{self.interaction_type.value} interaction_type requires messages' | ||
| f"{self.interaction_type.value} interaction_type requires messages" | ||
| ) | ||
| if self.output is not None: | ||
| raise ValueError( | ||
| f'{self.interaction_type.value} interaction_type must not have output' | ||
| f"{self.interaction_type.value} interaction_type must not have output" | ||
| ) | ||
| return self | ||
|
|
||
| @model_validator(mode="after") | ||
| def validate_modality_consistency(self): | ||
| if self.modality == Modality.text_to_image: | ||
| if self.output is None or not self.output.media: | ||
| raise ValueError( | ||
| "modality 'text_to_image' requires output.media to be a non-empty list" | ||
| ) | ||
| return self | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -46,7 +46,12 @@ | |
| "interaction_type": { | ||
| "type": "string", | ||
| "enum": ["single_turn", "multi_turn", "agentic"], | ||
| "description": "Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents" | ||
| "description": "Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents. Text-to-image evaluations use 'single_turn'; modality is the orthogonal axis." | ||
| }, | ||
| "modality": { | ||
| "type": "string", | ||
| "enum": ["text", "text_to_image"], | ||
| "description": "Modality of this sample. Should match the modality on the linked aggregate evaluation_result. Absent means 'text' for backwards compatibility." | ||
| }, | ||
| "input": { | ||
| "type": "object", | ||
|
|
@@ -82,13 +87,18 @@ | |
| "properties": { | ||
| "raw": { | ||
| "type": "array", | ||
| "description": "Complete model responses", | ||
| "description": "Complete model responses. For text_to_image modality, populate with one placeholder string per generated artifact (e.g. '<image:0>') so indexes align with output.media[].", | ||
| "items": { "type": "string" } | ||
| }, | ||
| "reasoning_trace": { | ||
| "type": ["array", "null"], | ||
| "description": "Reasoning traces of the model if applicable (e.g. chain-of-thought tokens)", | ||
| "items": { "type": "string" } | ||
| }, | ||
| "media": { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think this doesn't correspond with the Pydantic validator. I'm not sure if it should. If it did, it would get a bit wordy, i.e. […]. The larger issue is having two sources of truth for the schema. Does it make sense to go all in on Pydantic and then have it generate the JSON Schema? |
||
| "type": ["array", "null"], | ||
| "description": "Generated media artifacts (image, video, audio). Required when modality == 'text_to_image'.", | ||
| "items": { "$ref": "#/$defs/media_ref" } | ||
| } | ||
| } | ||
| }, | ||
|
|
@@ -192,8 +202,8 @@ | |
| "description": "Instance-level score" | ||
| }, | ||
| "is_correct": { | ||
| "type": "boolean", | ||
| "description": "Whether the final answer is correct" | ||
| "type": ["boolean", "null"], | ||
| "description": "Whether the final answer is correct. Required to be present, but may be null when correctness is not well-defined (e.g. continuous T2I metrics like FID or CLIPScore)." | ||
| }, | ||
| "num_turns": { | ||
| "type": ["integer", "null"], | ||
|
|
@@ -334,5 +344,28 @@ | |
| } | ||
| } | ||
| } | ||
| ] | ||
| ], | ||
| "$defs": { | ||
| "media_ref": { | ||
| "type": "object", | ||
| "description": "Reference to a generated media artifact. Required fields are intentionally minimal; record extras (mime_type, sha256, width/height, seed, index, ...) in additional_details.", | ||
| "required": ["media_type", "uri"], | ||
| "additionalProperties": false, | ||
| "properties": { | ||
| "media_type": { | ||
| "type": "string", | ||
| "enum": ["image", "video", "audio"] | ||
| }, | ||
| "uri": { | ||
| "type": "string", | ||
| "description": "Location of the artifact: 'file://...', 'https://...', 'hf://...', 's3://...', or 'data:...;base64,...' for inline." | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think the […]. I think one URI is also a bad idea. There should be a list of suggested ways to access the data; otherwise you run into a dead-URL issue. It happens too often, and it limits the reproducibility value. URLs rot, and they can be changed to point to something else (which is where the hash is important). I would also suggest thinking about allowing distributed, content-addressed references like IPFS CIDs or BitTorrent magnet URIs here, which avoid the problem where the content at an address can change, but do not address the link-rot issue (which is just always going to be a fundamental limitation of any reference-based scheme). |
||
| }, | ||
| "additional_details": { | ||
| "type": "object", | ||
| "description": "Per-artifact extras (key-value pairs, all values must be strings). Use for sha256, mime_type, width/height, seed, index, etc.", | ||
| "additionalProperties": {"type": "string"} | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -66,6 +66,21 @@ def validate_score_type_requirements(self): | |
| if self.max_score is None: | ||
| raise ValueError("score_type 'continuous' requires max_score") | ||
| return self | ||
| """, | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Probably not an issue with this PR, but Python code in a string can be a smell. I don't quite understand the purpose of this file yet, but we may want to revisit the design that requires this at the moment. |
||
| }, | ||
| { | ||
| 'file': 'every_eval_ever/instance_level_types.py', | ||
| 'import_add': 'model_validator', | ||
| 'class_name': 'InstanceLevelEvaluationLog', | ||
| 'validator': """ | ||
| @model_validator(mode="after") | ||
| def validate_modality_consistency(self): | ||
| if self.modality == Modality.text_to_image: | ||
| if self.output is None or not self.output.media: | ||
| raise ValueError( | ||
| "modality 'text_to_image' requires output.media to be a non-empty list" | ||
| ) | ||
| return self | ||
| """, | ||
| }, | ||
| ] | ||
|
|
@@ -126,9 +141,9 @@ def patch_file(patch: dict) -> None: | |
| path = Path(__file__).parent / patch['file'] | ||
| content = path.read_text() | ||
|
|
||
| # Check if already patched | ||
| if 'post_codegen.py' in content: | ||
| print(f' {patch["file"]}: already patched, skipping') | ||
| validator_def = re.search(r'def (\w+)\(self', patch['validator']) | ||
| if validator_def and f'def {validator_def.group(1)}(self' in content: | ||
| print(f' {patch["file"]}: {patch["class_name"]}.{validator_def.group(1)} already patched, skipping') | ||
| return | ||
|
|
||
| content = add_import(content, patch['import_add']) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should just use `ruff format` and settle on a quote style (I like single quotes) to avoid diffs like these. Not a blocker here, but something that should happen soon as the repo grows.