evaleval · felifri · May 18, 2026 · May 18, 2026 · May 18, 2026 · Erotemic
diff --git a/README.md b/README.md
@@ -155,6 +155,18 @@ For agentic evaluations (e.g., SWE-Bench, GAIA), the aggregate schema captures c
 
 At the instance level, agentic evaluations use `interaction_type: "agentic"` with full tool call traces recorded in the `messages` array. See the [Inspect AI test fixture](tests/data/inspect/) for a GAIA example with docker sandbox and tool usage.
 
+### Text-to-Image Evaluations
+
+The schema supports text-to-image (T2I) generation models (FLUX, SDXL, Imagen, …) alongside LLMs. Three small additions cover it; everything else (sampler args, image dimensions, sha256, rater pools, …) goes through the existing `additional_details` escape hatches.
+
+- **`modality`** — optional enum (`"text"` | `"text_to_image"`) on each `evaluation_results[]` entry and on each instance record. Absent means `"text"` (backwards compatibility).
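+- **`output.media: MediaRef[]`** — generated artifacts on the instance record. A `MediaRef` is just `{media_type, uri}` plus an `additional_details` bag (sha256, mime_type, width/height, seed, index, …). Required, and must be a non-empty list, when `modality == "text_to_image"`. `output.raw` is still required: populate it with one placeholder string per generated artifact (e.g. `<image:0>`) so its indexes align with `output.media[]`.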
+- **`evaluation.is_correct`** is now `boolean | null` — the key is still required, but set its value to `null` when correctness is not well-defined because the metric is continuous (FID, CLIPScore, ImageReward, etc.).
+
+T2I uses `interaction_type: "single_turn"`; `modality` is the orthogonal axis. Sampler args (`num_inference_steps`, `guidance_scale`, `width/height`, `scheduler`, `seed`, …) go in `generation_config.additional_details` as stringified key-value pairs. Human-rater pools (MTurk Likert critique, pairwise photorealism comparisons à la HEIM) go in `metric_config.additional_details` until a follow-up PR adds first-class structure for them.
+
+See [`tests/data/t2i/`](tests/data/t2i/) for a GenEval / SDXL worked example.
+
 ## ✅ Data Validation
 
 Validation uses Pydantic models generated from the JSON schemas. This validates aggregate `.json` files against `EvaluationLog` and instance-level `_samples.jsonl` files line-by-line against `InstanceLevelEvaluationLog`. Requires [uv](https://docs.astral.sh/uv/).

diff --git a/every_eval_ever/eval_types.py b/every_eval_ever/eval_types.py
@@ -1,21 +1,13 @@
 # generated by datamodel-codegen:
 #   filename:  eval.schema.json
-#   timestamp: 2026-03-19T20:30:15+00:00
+#   timestamp: 2026-05-18T09:44:21+00:00
 
 from __future__ import annotations
 
 from enum import Enum
 from typing import Annotated, Literal
 
-from pydantic import (
-    BaseModel,
-    ConfigDict,
-    Discriminator,
-    Field,
-    confloat,
-    conint,
-    model_validator,
-)
+from pydantic import BaseModel, ConfigDict, Field, confloat, conint, model_validator, Discriminator
 
 
 class SourceType(Enum):
@@ -73,6 +65,11 @@ class EvalLibrary(BaseModel):
     )
 
 
+class Modality(Enum):
+    text = 'text'
+    text_to_image = 'text_to_image'
+
+
 class ScoreType(Enum):
     binary = 'binary'
     continuous = 'continuous'
@@ -443,33 +440,31 @@ class MetricConfig(BaseModel):
 
     # --- validators (added by post_codegen.py) ---
 
-    @model_validator(mode='after')
+    @model_validator(mode="after")
     def validate_score_type_requirements(self):
         if self.score_type == ScoreType.levels:
             if self.level_names is None:
                 raise ValueError("score_type 'levels' requires level_names")
             if self.has_unknown_level is None:
-                raise ValueError(
-                    "score_type 'levels' requires has_unknown_level"
-                )
+                raise ValueError("score_type 'levels' requires has_unknown_level")
         elif self.score_type == ScoreType.continuous:
             if self.min_score is None:
                 raise ValueError("score_type 'continuous' requires min_score")
             if self.max_score is None:
                 raise ValueError("score_type 'continuous' requires max_score")
         return self
 
-
 class EvaluationResult(BaseModel):
     evaluation_result_id: str | None = Field(
         None,
         description='Stable identifier for this metric result inside an evaluation run. Recommended deterministic join key for instance-level records.',
     )
     evaluation_name: str = Field(..., description='Name of the evaluation')
-    source_data: Annotated[
-        SourceDataUrl | SourceDataHf | SourceDataPrivate,
-        Discriminator('source_type'),
-    ] = Field(
+    modality: Modality | None = Field(
+        None,
+        description="Modality of the task being evaluated. Absent means 'text' for backwards compatibility. Use 'text_to_image' for prompt-to-image generation tasks. Future modalities will be added to this enum.",
+    )
+    source_data: Annotated[SourceDataUrl | SourceDataHf | SourceDataPrivate, Discriminator("source_type")] = Field(
         ...,
         description='Source of dataset for this evaluation: URL, HuggingFace dataset, or private/custom dataset.',
     )

diff --git a/every_eval_ever/instance_level_types.py b/every_eval_ever/instance_level_types.py
@@ -1,20 +1,13 @@
 # generated by datamodel-codegen:
 #   filename:  instance_level_eval.schema.json
-#   timestamp: 2026-03-19T20:30:15+00:00
+#   timestamp: 2026-05-18T09:44:23+00:00
 
 from __future__ import annotations
 
 from enum import Enum
 from typing import Any
 
-from pydantic import (
-    BaseModel,
-    ConfigDict,
-    Field,
-    confloat,
-    conint,
-    model_validator,
-)
+from pydantic import BaseModel, ConfigDict, Field, confloat, conint, model_validator
 
 
 class InteractionType(Enum):
@@ -23,6 +16,11 @@ class InteractionType(Enum):
     agentic = 'agentic'
 
 
+class Modality(Enum):
+    text = 'text'
+    text_to_image = 'text_to_image'
+
+
 class Input(BaseModel):
     raw: str = Field(..., description='The raw input as defined in the eval')
     formatted: str | None = Field(
@@ -39,14 +37,6 @@ class Input(BaseModel):
     )
 
 
-class Output(BaseModel):
-    raw: list[str] = Field(..., description='Complete model responses')
-    reasoning_trace: list[str] | None = Field(
-        None,
-        description='Reasoning traces of the model if applicable (e.g. chain-of-thought tokens)',
-    )
-
-
 class ToolCall(BaseModel):
     id: str = Field(..., description='Unique identifier for the tool call')
     name: str = Field(..., description='Name of tool/function')
@@ -104,8 +94,9 @@ class AnswerAttributionItem(BaseModel):
 
 class Evaluation(BaseModel):
     score: float = Field(..., description='Instance-level score')
-    is_correct: bool = Field(
-        ..., description='Whether the final answer is correct'
+    is_correct: bool | None = Field(
+        ...,
+        description='Whether the final answer is correct. Required to be present, but may be null when correctness is not well-defined (e.g. continuous T2I metrics like FID or CLIPScore).',
     )
     num_turns: conint(ge=1) | None = Field(
         None, description='Number of turns in the interaction'
@@ -150,6 +141,42 @@ class Performance(BaseModel):
     )
 
 
+class MediaType(Enum):
+    image = 'image'
+    video = 'video'
+    audio = 'audio'
+
+
+class MediaRef(BaseModel):
+    model_config = ConfigDict(
+        extra='forbid',
+    )
+    media_type: MediaType
+    uri: str = Field(
+        ...,
+        description="Location of the artifact: 'file://...', 'https://...', 'hf://...', 's3://...', or 'data:...;base64,...' for inline.",
+    )
+    additional_details: dict[str, str] | None = Field(
+        None,
+        description='Per-artifact extras (key-value pairs, all values must be strings). Use for sha256, mime_type, width/height, seed, index, etc.',
+    )
+
+
+class Output(BaseModel):
+    raw: list[str] = Field(
+        ...,
+        description="Complete model responses. For text_to_image modality, populate with one placeholder string per generated artifact (e.g. '<image:0>') so indexes align with output.media[].",
+    )
+    reasoning_trace: list[str] | None = Field(
+        None,
+        description='Reasoning traces of the model if applicable (e.g. chain-of-thought tokens)',
+    )
+    media: list[MediaRef] | None = Field(
+        None,
+        description="Generated media artifacts (image, video, audio). Required when modality == 'text_to_image'.",
+    )
+
+
 class InstanceLevelEvaluationLog(BaseModel):
     model_config = ConfigDict(
         extra='forbid',
@@ -183,7 +210,11 @@ class InstanceLevelEvaluationLog(BaseModel):
     )
     interaction_type: InteractionType = Field(
         ...,
-        description='Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents',
+        description="Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents. Text-to-image evaluations use 'single_turn'; modality is the orthogonal axis.",
+    )
+    modality: Modality | None = Field(
+        None,
+        description="Modality of this sample. Should match the modality on the linked aggregate evaluation_result. Absent means 'text' for backwards compatibility.",
     )
     input: Input = Field(
         ..., description='Input data for the evaluation sample'
@@ -220,22 +251,31 @@ class InstanceLevelEvaluationLog(BaseModel):
 
     # --- validators (added by post_codegen.py) ---
 
-    @model_validator(mode='after')
+    @model_validator(mode="after")
     def validate_interaction_type_consistency(self):
         if self.interaction_type == InteractionType.single_turn:
             if self.output is None:
-                raise ValueError('single_turn interaction_type requires output')
+                raise ValueError("single_turn interaction_type requires output")
             if self.messages is not None:
                 raise ValueError(
-                    'single_turn interaction_type must not have messages'
+                    "single_turn interaction_type must not have messages"
                 )
         else:
             if self.messages is None:
                 raise ValueError(
-                    f'{self.interaction_type.value} interaction_type requires messages'
+                    f"{self.interaction_type.value} interaction_type requires messages"
                 )
             if self.output is not None:
                 raise ValueError(
-                    f'{self.interaction_type.value} interaction_type must not have output'
+                    f"{self.interaction_type.value} interaction_type must not have output"
+                )
+        return self
+
+    @model_validator(mode="after")
+    def validate_modality_consistency(self):
+        if self.modality == Modality.text_to_image:
+            if self.output is None or not self.output.media:
+                raise ValueError(
+                    "modality 'text_to_image' requires output.media to be a non-empty list"
                 )
         return self
diff --git a/every_eval_ever/schemas/eval.schema.json b/every_eval_ever/schemas/eval.schema.json
@@ -130,6 +130,14 @@
                         "type": "string",
                         "description": "Name of the evaluation"
                     },
+                    "modality": {
+                        "type": "string",
+                        "enum": [
+                            "text",
+                            "text_to_image"
+                        ],
+                        "description": "Modality of the task being evaluated. Absent means 'text' for backwards compatibility. Use 'text_to_image' for prompt-to-image generation tasks. Future modalities will be added to this enum."
+                    },
                     "source_data": {
                         "description": "Source of dataset for this evaluation: URL, HuggingFace dataset, or private/custom dataset.",
                         "oneOf": [

diff --git a/every_eval_ever/schemas/instance_level_eval.schema.json b/every_eval_ever/schemas/instance_level_eval.schema.json
@@ -46,7 +46,12 @@
         "interaction_type": {
             "type": "string",
             "enum": ["single_turn", "multi_turn", "agentic"],
-            "description": "Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents"
+            "description": "Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents. Text-to-image evaluations use 'single_turn'; modality is the orthogonal axis."
+        },
+        "modality": {
+            "type": "string",
+            "enum": ["text", "text_to_image"],
+            "description": "Modality of this sample. Should match the modality on the linked aggregate evaluation_result. Absent means 'text' for backwards compatibility."
         },
         "input": {
             "type": "object",
@@ -82,13 +87,18 @@
             "properties": {
                 "raw": {
                     "type": "array",
-                    "description": "Complete model responses",
+                    "description": "Complete model responses. For text_to_image modality, populate with one placeholder string per generated artifact (e.g. '<image:0>') so indexes align with output.media[].",
                     "items": { "type": "string" }
                 },
                 "reasoning_trace": {
                     "type": ["array", "null"],
                     "description": "Reasoning traces of the model if applicable (e.g. chain-of-thought tokens)",
                     "items": { "type": "string" }
+                },
+                "media": {
+                    "type": ["array", "null"],
+                    "description": "Generated media artifacts (image, video, audio). Required when modality == 'text_to_image'.",
+                    "items": { "$ref": "#/$defs/media_ref" }
                 }
             }
         },
@@ -192,8 +202,8 @@
                     "description": "Instance-level score"
                 },
                 "is_correct": {
-                    "type": "boolean",
-                    "description": "Whether the final answer is correct"
+                    "type": ["boolean", "null"],
+                    "description": "Whether the final answer is correct. Required to be present, but may be null when correctness is not well-defined (e.g. continuous T2I metrics like FID or CLIPScore)."
                 },
                 "num_turns": {
                     "type": ["integer", "null"],
@@ -334,5 +344,28 @@
                 }
             }
         }
-    ]
+    ],
+    "$defs": {
+        "media_ref": {
+            "type": "object",
+            "description": "Reference to a generated media artifact. Required fields are intentionally minimal; record extras (mime_type, sha256, width/height, seed, index, ...) in additional_details.",
+            "required": ["media_type", "uri"],
+            "additionalProperties": false,
+            "properties": {
+                "media_type": {
+                    "type": "string",
+                    "enum": ["image", "video", "audio"]
+                },
+                "uri": {
+                    "type": "string",
+                    "description": "Location of the artifact: 'file://...', 'https://...', 'hf://...', 's3://...', or 'data:...;base64,...' for inline."
+                },
+                "additional_details": {
+                    "type": "object",
+                    "description": "Per-artifact extras (key-value pairs, all values must be strings). Use for sha256, mime_type, width/height, seed, index, etc.",
+                    "additionalProperties": {"type": "string"}
+                }
+            }
+        }
+    }
 }
diff --git a/post_codegen.py b/post_codegen.py
@@ -66,6 +66,21 @@ def validate_score_type_requirements(self):
             if self.max_score is None:
                 raise ValueError("score_type 'continuous' requires max_score")
         return self
+""",
+    },
+    {
+        'file': 'every_eval_ever/instance_level_types.py',
+        'import_add': 'model_validator',
+        'class_name': 'InstanceLevelEvaluationLog',
+        'validator': """
+    @model_validator(mode="after")
+    def validate_modality_consistency(self):
+        if self.modality == Modality.text_to_image:
+            if self.output is None or not self.output.media:
+                raise ValueError(
+                    "modality 'text_to_image' requires output.media to be a non-empty list"
+                )
+        return self
 """,
     },
 ]
@@ -126,9 +141,9 @@ def patch_file(patch: dict) -> None:
     path = Path(__file__).parent / patch['file']
     content = path.read_text()
 
-    # Check if already patched
-    if 'post_codegen.py' in content:
-        print(f'  {patch["file"]}: already patched, skipping')
+    validator_def = re.search(r'def (\w+)\(self', patch['validator'])
+    if validator_def and f'def {validator_def.group(1)}(self' in content:
+        print(f'  {patch["file"]}: {patch["class_name"]}.{validator_def.group(1)} already patched, skipping')
         return
 
     content = add_import(content, patch['import_add'])