From 6808f926ad34d36f0c90aea5d25ff6430b60599a Mon Sep 17 00:00:00 2001 From: Codex Date: Fri, 19 Jun 2026 16:30:00 +0000 Subject: [PATCH] Add truncated reasoning response error --- tests/test_error_chain.py | 15 ++++++++++++++- tests/test_renderer_client.py | 15 ++++++++++++++- verifiers/clients/renderer_client.py | 10 +++++++++- verifiers/errors.py | 6 ++++++ verifiers/utils/error_utils.py | 1 + 5 files changed, 44 insertions(+), 3 deletions(-) diff --git a/tests/test_error_chain.py b/tests/test_error_chain.py index 1b88ea5cfe..4a095c1fbb 100644 --- a/tests/test_error_chain.py +++ b/tests/test_error_chain.py @@ -1,7 +1,12 @@ """Tests for verifiers.utils.error_utils.ErrorChain.""" import verifiers as vf -from verifiers.utils.error_utils import ErrorChain, get_vf_error_chain +from verifiers.utils.error_utils import ( + ErrorChain, + error_data, + error_from_data, + get_vf_error_chain, +) class TestErrorChain: @@ -147,3 +152,11 @@ def test_hashable_for_counter(self): # error1 and error2 have same type, should be counted together assert counter[ErrorChain(ValueError("any"))] == 2 assert counter[ErrorChain(TypeError("any"))] == 1 + + def test_truncated_reasoning_error_round_trips(self): + error = vf.TruncatedReasoningError("truncated reasoning") + + rebuilt = error_from_data(error_data(error)) + + assert isinstance(rebuilt, vf.TruncatedReasoningError) + assert isinstance(rebuilt, vf.EmptyModelResponseError) diff --git a/tests/test_renderer_client.py b/tests/test_renderer_client.py index 39efe2cd95..5f9b7c955f 100644 --- a/tests/test_renderer_client.py +++ b/tests/test_renderer_client.py @@ -16,7 +16,7 @@ _step_token_ids, _to_renderer_message, ) -from verifiers.errors import EmptyModelResponseError +from verifiers.errors import EmptyModelResponseError, TruncatedReasoningError from verifiers.types import ( AssistantMessage, SystemMessage, @@ -363,6 +363,19 @@ async def test_renderer_client_rejects_reasoning_only_native_response(): await client.raise_from_native_response({"reasoning_content": "hidden chain"}) +@pytest.mark.asyncio +async def test_renderer_client_rejects_truncated_reasoning_native_response(): + client = object.__new__(RendererClient) + + with pytest.raises( + TruncatedReasoningError, match="length limit after reasoning" + ) as exc_info: + await client.raise_from_native_response( + {"reasoning_content": "hidden chain", "finish_reason": "length"} + ) + assert isinstance(exc_info.value, EmptyModelResponseError) + + @pytest.mark.asyncio async def test_from_native_response_uses_request_id_and_token_lengths(): """vLLM's /inference/v1/generate returns ``request_id`` (not ``id``) and diff --git a/verifiers/clients/renderer_client.py b/verifiers/clients/renderer_client.py index 64ca4ec89d..c32c3f6bb6 100644 --- a/verifiers/clients/renderer_client.py +++ b/verifiers/clients/renderer_client.py @@ -39,7 +39,11 @@ from verifiers.clients.openai_chat_completions_client import ( handle_openai_overlong_prompt, ) -from verifiers.errors import EmptyModelResponseError, OverlongPromptError +from verifiers.errors import ( + EmptyModelResponseError, + OverlongPromptError, + TruncatedReasoningError, +) from verifiers.types import ( AssistantMessage, ClientConfig, @@ -643,6 +647,10 @@ async def raise_from_native_response(self, response: dict[str, Any]) -> None: has_reasoning = bool(response.get("reasoning_content")) if not (has_content or has_tool_calls): if has_reasoning: + if response.get("finish_reason") == "length": + raise TruncatedReasoningError( + "Model hit length limit after reasoning but before content or tool calls" + ) raise EmptyModelResponseError( "Model returned reasoning but no content and did not call any tools" ) diff --git a/verifiers/errors.py b/verifiers/errors.py index e725580e48..f55fae4f0b 100644 --- a/verifiers/errors.py +++ b/verifiers/errors.py @@ -20,6 +20,12 @@ class EmptyModelResponseError(InvalidModelResponseError): pass +class TruncatedReasoningError(EmptyModelResponseError): + """Model hit a length limit before returning content or tool calls.""" + + pass + + class OverlongPromptError(Error): """Used to catch overlong prompt errors (e.g. prompt + requested number of tokens exceeds model context length)""" diff --git a/verifiers/utils/error_utils.py b/verifiers/utils/error_utils.py index 37ac1a4058..ed93a13730 100644 --- a/verifiers/utils/error_utils.py +++ b/verifiers/utils/error_utils.py @@ -115,6 +115,7 @@ def vf_error_types() -> tuple[type[vf.Error], ...]: vf.SandboxError, vf.TunnelError, vf.InfraError, + vf.TruncatedReasoningError, vf.EmptyModelResponseError, vf.InvalidModelResponseError, vf.ModelError,