Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,18 @@ For agentic evaluations (e.g., SWE-Bench, GAIA), the aggregate schema captures c

At the instance level, agentic evaluations use `interaction_type: "agentic"` with full tool call traces recorded in the `messages` array. See the [Inspect AI test fixture](tests/data/inspect/) for a GAIA example with docker sandbox and tool usage.

### Text-to-Image Evaluations

The schema supports text-to-image (T2I) generation models (FLUX, SDXL, Imagen, …) alongside LLMs. Three small additions cover it; everything else (sampler args, image dimensions, sha256, rater pools, …) goes through the existing `additional_details` escape hatches.

- **`modality`** — optional enum (`"text"` | `"text_to_image"`) on each `evaluation_results[]` entry and on each instance record. Absent means `"text"` (backwards compatibility).
- **`output.media: MediaRef[]`** — generated artifacts on the instance record. A `MediaRef` is just `{media_type, uri}` plus an `additional_details` bag (sha256, mime_type, width/height, seed, index, …). Required when `modality == "text_to_image"`.
- **`evaluation.is_correct`** is now `boolean | null` — set to `null` when the metric is continuous (FID, CLIPScore, ImageReward, etc.).

T2I uses `interaction_type: "single_turn"`; `modality` is the orthogonal axis. Sampler args (`num_inference_steps`, `guidance_scale`, `width/height`, `scheduler`, `seed`, …) go in `generation_config.additional_details` as stringified key-value pairs. Human-rater pools (MTurk Likert critique, pairwise photorealism comparisons à la HEIM) go in `metric_config.additional_details` until a follow-up PR adds first-class structure for them.

See [`tests/data/t2i/`](tests/data/t2i/) for a GenEval / SDXL worked example.

## ✅ Data Validation

Validation uses Pydantic models generated from the JSON schemas. This validates aggregate `.json` files against `EvaluationLog` and instance-level `_samples.jsonl` files line-by-line against `InstanceLevelEvaluationLog`. Requires [uv](https://docs.astral.sh/uv/).
Expand Down
33 changes: 14 additions & 19 deletions every_eval_ever/eval_types.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,13 @@
# generated by datamodel-codegen:
# filename: eval.schema.json
# timestamp: 2026-03-19T20:30:15+00:00
# timestamp: 2026-05-18T09:44:21+00:00

from __future__ import annotations

from enum import Enum
from typing import Annotated, Literal

from pydantic import (
BaseModel,
ConfigDict,
Discriminator,
Field,
confloat,
conint,
model_validator,
)
from pydantic import BaseModel, ConfigDict, Field, confloat, conint, model_validator, Discriminator


class SourceType(Enum):
Expand Down Expand Up @@ -73,6 +65,11 @@ class EvalLibrary(BaseModel):
)


class Modality(Enum):
    """Modality of the task being evaluated."""

    # Plain text evaluation.
    text = 'text'
    # Prompt-to-image generation tasks.
    text_to_image = 'text_to_image'


class ScoreType(Enum):
binary = 'binary'
continuous = 'continuous'
Expand Down Expand Up @@ -443,33 +440,31 @@ class MetricConfig(BaseModel):

# --- validators (added by post_codegen.py) ---

@model_validator(mode='after')
@model_validator(mode="after")
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should just use ruff format and settle on a quote style (I like single quotes) to avoid diffs like these. Not a blocker here, but something that should happen soon as the repo grows.

def validate_score_type_requirements(self):
    """Check that the fields required by ``score_type`` are present.

    Raises:
        ValueError: If ``score_type`` is ``levels`` and ``level_names`` or
            ``has_unknown_level`` is ``None``, or if ``score_type`` is
            ``continuous`` and ``min_score`` or ``max_score`` is ``None``.

    Returns:
        The validated model instance, unchanged.
    """
    if self.score_type == ScoreType.levels:
        if self.level_names is None:
            raise ValueError("score_type 'levels' requires level_names")
        if self.has_unknown_level is None:
            raise ValueError("score_type 'levels' requires has_unknown_level")
    elif self.score_type == ScoreType.continuous:
        if self.min_score is None:
            raise ValueError("score_type 'continuous' requires min_score")
        if self.max_score is None:
            raise ValueError("score_type 'continuous' requires max_score")
    # Other score types carry no extra field requirements here.
    return self


class EvaluationResult(BaseModel):
evaluation_result_id: str | None = Field(
None,
description='Stable identifier for this metric result inside an evaluation run. Recommended deterministic join key for instance-level records.',
)
evaluation_name: str = Field(..., description='Name of the evaluation')
source_data: Annotated[
SourceDataUrl | SourceDataHf | SourceDataPrivate,
Discriminator('source_type'),
] = Field(
modality: Modality | None = Field(
None,
description="Modality of the task being evaluated. Absent means 'text' for backwards compatibility. Use 'text_to_image' for prompt-to-image generation tasks. Future modalities will be added to this enum.",
)
source_data: Annotated[SourceDataUrl | SourceDataHf | SourceDataPrivate, Discriminator("source_type")] = Field(
...,
description='Source of dataset for this evaluation: URL, HuggingFace dataset, or private/custom dataset.',
)
Expand Down
90 changes: 65 additions & 25 deletions every_eval_ever/instance_level_types.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,13 @@
# generated by datamodel-codegen:
# filename: instance_level_eval.schema.json
# timestamp: 2026-03-19T20:30:15+00:00
# timestamp: 2026-05-18T09:44:23+00:00

from __future__ import annotations

from enum import Enum
from typing import Any

from pydantic import (
BaseModel,
ConfigDict,
Field,
confloat,
conint,
model_validator,
)
from pydantic import BaseModel, ConfigDict, Field, confloat, conint, model_validator


class InteractionType(Enum):
Expand All @@ -23,6 +16,11 @@ class InteractionType(Enum):
agentic = 'agentic'


class Modality(Enum):
    """Modality of an evaluation sample."""

    # Plain text sample.
    text = 'text'
    # Prompt-to-image generation sample.
    text_to_image = 'text_to_image'


class Input(BaseModel):
raw: str = Field(..., description='The raw input as defined in the eval')
formatted: str | None = Field(
Expand All @@ -39,14 +37,6 @@ class Input(BaseModel):
)


class Output(BaseModel):
    """Model output for a sample: raw responses plus optional reasoning traces."""

    raw: list[str] = Field(..., description='Complete model responses')
    reasoning_trace: list[str] | None = Field(
        None,
        description='Reasoning traces of the model if applicable (e.g. chain-of-thought tokens)',
    )


class ToolCall(BaseModel):
id: str = Field(..., description='Unique identifier for the tool call')
name: str = Field(..., description='Name of tool/function')
Expand Down Expand Up @@ -104,8 +94,9 @@ class AnswerAttributionItem(BaseModel):

class Evaluation(BaseModel):
score: float = Field(..., description='Instance-level score')
is_correct: bool = Field(
..., description='Whether the final answer is correct'
is_correct: bool | None = Field(
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a better way that this can be expressed that generalizes better? I don't have an immediate idea, but this seems like the start of a boolean explosion to me.

...,
description='Whether the final answer is correct. Required to be present, but may be null when correctness is not well-defined (e.g. continuous T2I metrics like FID or CLIPScore).',
)
num_turns: conint(ge=1) | None = Field(
None, description='Number of turns in the interaction'
Expand Down Expand Up @@ -150,6 +141,42 @@ class Performance(BaseModel):
)


class MediaType(Enum):
    """Kind of generated media artifact."""

    image = 'image'
    video = 'video'
    audio = 'audio'


class MediaRef(BaseModel):
model_config = ConfigDict(
extra='forbid',
)
media_type: MediaType
uri: str = Field(
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similar URI comment

...,
description="Location of the artifact: 'file://...', 'https://...', 'hf://...', 's3://...', or 'data:...;base64,...' for inline.",
)
additional_details: dict[str, str] | None = Field(
None,
description='Per-artifact extras (key-value pairs, all values must be strings). Use for sha256, mime_type, width/height, seed, index, etc.',
)


class Output(BaseModel):
    """Model output for a sample: raw responses, reasoning traces, and media."""

    # Required; one entry per model response.
    raw: list[str] = Field(
        ...,
        description="Complete model responses. For text_to_image modality, populate with one placeholder string per generated artifact (e.g. '<image:0>') so indexes align with output.media[].",
    )
    # Optional; defaults to None.
    reasoning_trace: list[str] | None = Field(
        None,
        description='Reasoning traces of the model if applicable (e.g. chain-of-thought tokens)',
    )
    # Optional at the field level; defaults to None.
    media: list[MediaRef] | None = Field(
        None,
        description="Generated media artifacts (image, video, audio). Required when modality == 'text_to_image'.",
    )


class InstanceLevelEvaluationLog(BaseModel):
model_config = ConfigDict(
extra='forbid',
Expand Down Expand Up @@ -183,7 +210,11 @@ class InstanceLevelEvaluationLog(BaseModel):
)
interaction_type: InteractionType = Field(
...,
description='Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents',
description="Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents. Text-to-image evaluations use 'single_turn'; modality is the orthogonal axis.",
)
modality: Modality | None = Field(
None,
description="Modality of this sample. Should match the modality on the linked aggregate evaluation_result. Absent means 'text' for backwards compatibility.",
)
input: Input = Field(
..., description='Input data for the evaluation sample'
Expand Down Expand Up @@ -220,22 +251,31 @@ class InstanceLevelEvaluationLog(BaseModel):

# --- validators (added by post_codegen.py) ---

@model_validator(mode="after")
def validate_interaction_type_consistency(self):
    """Check that ``output`` and ``messages`` agree with ``interaction_type``.

    ``single_turn`` records must carry ``output`` and no ``messages``; every
    other interaction type must carry ``messages`` and no ``output``.

    Raises:
        ValueError: If the field required for the interaction type is
            ``None`` or the field it excludes is set.

    Returns:
        The validated model instance, unchanged.
    """
    if self.interaction_type == InteractionType.single_turn:
        if self.output is None:
            raise ValueError("single_turn interaction_type requires output")
        if self.messages is not None:
            raise ValueError(
                "single_turn interaction_type must not have messages"
            )
    else:
        if self.messages is None:
            raise ValueError(
                f"{self.interaction_type.value} interaction_type requires messages"
            )
        if self.output is not None:
            raise ValueError(
                f"{self.interaction_type.value} interaction_type must not have output"
            )
    return self

@model_validator(mode="after")
def validate_modality_consistency(self):
    """Require generated media when ``modality`` is ``text_to_image``.

    Raises:
        ValueError: If ``modality`` is ``text_to_image`` and ``output`` is
            ``None`` or ``output.media`` is ``None`` or empty.

    Returns:
        The validated model instance, unchanged.
    """
    if self.modality == Modality.text_to_image:
        # `not self.output.media` rejects both None and an empty list.
        if self.output is None or not self.output.media:
            raise ValueError(
                "modality 'text_to_image' requires output.media to be a non-empty list"
            )
    return self
8 changes: 8 additions & 0 deletions every_eval_ever/schemas/eval.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,14 @@
"type": "string",
"description": "Name of the evaluation"
},
"modality": {
"type": "string",
"enum": [
"text",
"text_to_image"
],
"description": "Modality of the task being evaluated. Absent means 'text' for backwards compatibility. Use 'text_to_image' for prompt-to-image generation tasks. Future modalities will be added to this enum."
},
"source_data": {
"description": "Source of dataset for this evaluation: URL, HuggingFace dataset, or private/custom dataset.",
"oneOf": [
Expand Down
43 changes: 38 additions & 5 deletions every_eval_ever/schemas/instance_level_eval.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,12 @@
"interaction_type": {
"type": "string",
"enum": ["single_turn", "multi_turn", "agentic"],
"description": "Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents"
"description": "Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents. Text-to-image evaluations use 'single_turn'; modality is the orthogonal axis."
},
"modality": {
"type": "string",
"enum": ["text", "text_to_image"],
"description": "Modality of this sample. Should match the modality on the linked aggregate evaluation_result. Absent means 'text' for backwards compatibility."
},
"input": {
"type": "object",
Expand Down Expand Up @@ -82,13 +87,18 @@
"properties": {
"raw": {
"type": "array",
"description": "Complete model responses",
"description": "Complete model responses. For text_to_image modality, populate with one placeholder string per generated artifact (e.g. '<image:0>') so indexes align with output.media[].",
"items": { "type": "string" }
},
"reasoning_trace": {
"type": ["array", "null"],
"description": "Reasoning traces of the model if applicable (e.g. chain-of-thought tokens)",
"items": { "type": "string" }
},
"media": {
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this doesn't correspond with the pydantic validator. I'm not sure if it should. If it did it would get a bit wordy, i.e.

{
  "if": {
    "required": ["modality"],
    "properties": {
      "modality": { "const": "text_to_image" }
    }
  },
  "then": {
    "required": ["output"],
    "properties": {
      "output": {
        "type": "object",
        "required": ["media"],
        "properties": {
          "media": {
            "type": "array",
            "minItems": 1
          }
        }
      }
    }
  }
}

The larger issue is having two sources of truth for the schema. Does it make sense to go all in on pydantic and then have it generate the jsonschema?

"type": ["array", "null"],
"description": "Generated media artifacts (image, video, audio). Required when modality == 'text_to_image'.",
"items": { "$ref": "#/$defs/media_ref" }
}
}
},
Expand Down Expand Up @@ -192,8 +202,8 @@
"description": "Instance-level score"
},
"is_correct": {
"type": "boolean",
"description": "Whether the final answer is correct"
"type": ["boolean", "null"],
"description": "Whether the final answer is correct. Required to be present, but may be null when correctness is not well-defined (e.g. continuous T2I metrics like FID or CLIPScore)."
},
"num_turns": {
"type": ["integer", "null"],
Expand Down Expand Up @@ -334,5 +344,28 @@
}
}
}
]
],
"$defs": {
"media_ref": {
"type": "object",
"description": "Reference to a generated media artifact. Required fields are intentionally minimal; record extras (mime_type, sha256, width/height, seed, index, ...) in additional_details.",
"required": ["media_type", "uri"],
"additionalProperties": false,
"properties": {
"media_type": {
"type": "string",
"enum": ["image", "video", "audio"]
},
"uri": {
"type": "string",
"description": "Location of the artifact: 'file://...', 'https://...', 'hf://...', 's3://...', or 'data:...;base64,...' for inline."
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the data:... option is a BAD idea here. This overscopes what this is. This should always be a reference to the data, not the data itself. If you want to support embedding the data itself, it should be a separate optional field.

I think one URI is also a bad idea. There should be a list of suggested ways to access the data. Otherwise you run into a dead URL issue. It happens too often, and it limits the reproducibility value. URLs rot, they can be changed to point to something else (where the hash is important). I would also suggest thinking about allowing distributed content addressed references like IPFS CIDs or BitTorrent magnet URIs here, which avoid the problem where the content at an address can change, but do not address the link-rot issue (which is just always going to be a fundamental limitation of any reference based scheme).

},
"additional_details": {
"type": "object",
"description": "Per-artifact extras (key-value pairs, all values must be strings). Use for sha256, mime_type, width/height, seed, index, etc.",
"additionalProperties": {"type": "string"}
}
}
}
}
}
21 changes: 18 additions & 3 deletions post_codegen.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,21 @@ def validate_score_type_requirements(self):
if self.max_score is None:
raise ValueError("score_type 'continuous' requires max_score")
return self
""",
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably not an issue with this PR, but python code in a string can be a smell. I don't quite understand the purpose of this file yet, but we may want to revisit the design that requires this ATM.

},
{
'file': 'every_eval_ever/instance_level_types.py',
'import_add': 'model_validator',
'class_name': 'InstanceLevelEvaluationLog',
'validator': """
@model_validator(mode="after")
def validate_modality_consistency(self):
if self.modality == Modality.text_to_image:
if self.output is None or not self.output.media:
raise ValueError(
"modality 'text_to_image' requires output.media to be a non-empty list"
)
return self
""",
},
]
Expand Down Expand Up @@ -126,9 +141,9 @@ def patch_file(patch: dict) -> None:
path = Path(__file__).parent / patch['file']
content = path.read_text()

# Check if already patched
if 'post_codegen.py' in content:
print(f' {patch["file"]}: already patched, skipping')
validator_def = re.search(r'def (\w+)\(self', patch['validator'])
if validator_def and f'def {validator_def.group(1)}(self' in content:
print(f' {patch["file"]}: {patch["class_name"]}.{validator_def.group(1)} already patched, skipping')
return

content = add_import(content, patch['import_add'])
Expand Down
Loading