vectorize-io · r266-tech · Jun 17, 2026
diff --git a/hindsight-api-slim/hindsight_api/engine/reflect/agent.py b/hindsight-api-slim/hindsight_api/engine/reflect/agent.py
@@ -109,6 +109,72 @@ def _clean_answer_text(text: str) -> str:
     return cleaned if cleaned else text
 
 
+# Raw, unparsed tool-call scaffolding that some providers leak as answer text
+# when they fail to turn a model's tool-call turn into structured tool_calls
+# (notably gpt-oss / harmony-format models served via litellm Vertex MaaS).
+# See issue #2222.
+#
+# Harmony frame tokens delimit a real harmony turn. A frame token must be present
+# before anything is flagged, so prose that merely quotes the routing namespace
+# (``to=functions.x``) or a lone control token while explaining the format is not
+# mistaken for a leak.
+_HARMONY_FRAME_TOKEN = re.compile(r"<\|(?:start|channel|message)\|>")
+# Evidence of an actual tool call inside that frame: the ``<|call|>`` commit token
+# that terminates a call, or the tool-routing namespace (e.g. ``to=functions.recall``).
+_HARMONY_CALL_TOKEN = re.compile(r"<\|call\|>")
+_HARMONY_TOOL_ROUTING = re.compile(r"(?:^|[\s>])to=(?:functions|tool)\b", re.IGNORECASE)
+# The done() tool's internal citation fields. They never appear in a legitimate
+# free-text answer, so a JSON object carrying any of them is an unambiguous
+# leaked done() payload (vs. a plain ``{"answer": ...}`` blob with no id fields,
+# which is intentionally left alone since it is indistinguishable from a valid
+# JSON answer).
+_DONE_TOOL_ID_FIELDS = ("memory_ids", "mental_model_ids", "observation_ids")
+
+
+def _looks_like_unparsed_tool_call(text: str) -> bool:
+    """Return True if ``text`` is raw, unparsed tool-call scaffolding.
+
+    Some providers intermittently fail to parse a model's tool-call turn into
+    structured ``tool_calls`` (notably gpt-oss / harmony-format models served
+    through litellm Vertex MaaS). The raw harmony tool-call structure
+    (``<|channel|>commentary to=functions.recall<|message|>{...}<|call|>``), or a
+    done() payload emitted as bare JSON that carries the tool's internal citation
+    id fields, then land in ``message.content`` with no tool executed. Surfacing
+    that verbatim returns corrupt, unsynthesized output to the user (issue
+    #2222), so it must be treated as a failed parse and routed to final synthesis
+    instead of accepted as an answer.
+
+    Detection is intentionally narrow to avoid rejecting valid answers. Harmony
+    scaffolding is only flagged when a harmony frame token is present together
+    with call evidence (a ``<|call|>`` commit token or the ``to=functions/tool``
+    routing namespace); an answer that merely quotes one of those tokens while
+    explaining the format has no such pairing and is left untouched. A bare
+    ``{"answer": ...}`` object with no id fields is likewise left alone, being
+    indistinguishable from a valid JSON answer. (An answer that embeds a full
+    harmony tool-call example verbatim would be re-synthesized rather than echoed
+    — an accepted, non-destructive trade-off given how rare that is for a
+    memory-synthesis answer and that the fallback reuses the same evidence.)
+    """
+    if not text:
+        return False
+    stripped = text.strip()
+    if _HARMONY_FRAME_TOKEN.search(stripped) and (
+        _HARMONY_CALL_TOKEN.search(stripped) or _HARMONY_TOOL_ROUTING.search(stripped)
+    ):
+        return True
+    # done() emitted as bare JSON text instead of a structured tool call. Key on
+    # the internal id fields (which never appear in a real answer); the ``answer``
+    # field is optional so a leaked payload that omits it is still caught.
+    if stripped.startswith("{") and stripped.endswith("}"):
+        try:
+            obj = json.loads(stripped)
+        except (ValueError, TypeError):
+            return False
+        if isinstance(obj, dict) and any(f in obj for f in _DONE_TOOL_ID_FIELDS):
+            return True
+    return False
+
+
 def _clean_done_answer(text: str) -> str:
     """Clean up the answer field from a done() tool call.
 
@@ -750,7 +816,19 @@ def _log_completion(answer: str, iterations: int, forced: bool = False):
                 bool(available_memory_ids) or bool(available_mental_model_ids) or bool(available_observation_ids)
             )
             directive_leak_risk = directives and not has_gathered_evidence
-            if result.content and not directive_leak_risk:
+            # Some providers (e.g. gpt-oss / harmony models via litellm Vertex
+            # MaaS) intermittently fail to parse a tool-call turn into structured
+            # ``tool_calls`` and leave raw harmony scaffolding (or a done() payload
+            # carrying internal citation ids) in ``content`` with ``tool_calls``
+            # empty. Accepting that as the answer surfaces corrupt, unsynthesized
+            # text to the user (issue #2222), so route it to final synthesis.
+            unparsed_tool_call = bool(result.content) and _looks_like_unparsed_tool_call(result.content)
+            if unparsed_tool_call:
+                logger.warning(
+                    f"[REFLECT {reflect_id}] Discarding unparsed tool-call scaffolding in answer "
+                    f"text on iteration {iteration + 1}; forcing final synthesis."
+                )
+            if result.content and not directive_leak_risk and not unparsed_tool_call:
                 answer = _clean_answer_text(result.content.strip())
 
                 # The call_with_tools call above is intentionally uncapped so the

diff --git a/hindsight-api-slim/tests/test_reflect_agent.py b/hindsight-api-slim/tests/test_reflect_agent.py
@@ -20,6 +20,7 @@
     _count_messages_tokens,
     _is_context_overflow_error,
     _is_done_tool,
+    _looks_like_unparsed_tool_call,
     _normalize_tool_name,
     run_reflect_agent,
 )
@@ -142,6 +143,53 @@ def test_clean_answer_multiline_with_markdown(self):
         assert "mental_model_ids" not in cleaned
 
 
+class TestUnparsedToolCallDetection:
+    """Detect raw harmony / tool-call scaffolding leaking as answer text (issue #2222)."""
+
+    @pytest.mark.parametrize(
+        "text",
+        [
+            # Full leaked harmony tool call: frame tokens + routing + commit.
+            "<|start|>assistant<|channel|>commentary to=functions.recall<|message|>{\"query\": \"x\"}<|call|>",
+            # Frame token + <|call|> commit (no routing namespace visible).
+            "<|channel|>commentary<|message|>{\"name\": \"recall\"}<|call|>",
+            # Frame token + routing, truncated before the commit token.
+            "<|channel|>commentary to=functions.recall<|message|>{\"query\": \"x\"}",
+            # done() emitted as bare JSON carrying the tool's internal id fields.
+            '{"answer": "Confirmed.", "memory_ids": ["mem-1"]}',
+            '  {"answer": "no data", "observation_ids": [], "mental_model_ids": []}  ',
+            # Leaked done payload that omits "answer" but keeps the id fields.
+            '{"memory_ids": ["mem-1"], "mental_model_ids": []}',
+        ],
+    )
+    def test_detects_scaffolding(self, text):
+        assert _looks_like_unparsed_tool_call(text) is True
+
+    @pytest.mark.parametrize(
+        "text",
+        [
+            "",
+            None,
+            "The team meets every Tuesday to review the roadmap.",
+            "Here is the answer: the launch is on Friday.",
+            "I found that the user prefers concise answers and dislikes long emails.",
+            "The function to=string mapping was discussed, but no tool was called.",
+            # Legitimate answers that merely *explain* harmony syntax must not be
+            # flagged: a frame token alone, or the routing namespace alone, is not
+            # a leaked call.
+            "The `<|channel|>` token marks a section boundary in the harmony format.",
+            "Use `<|start|>` and `<|message|>` to frame a turn; they are control tokens.",
+            "Use `to=functions.recall` for the recall tool and `to=tool.foo` for foo.",
+            # A bare answer object with no internal id fields is indistinguishable
+            # from a valid JSON answer, so it is deliberately NOT flagged.
+            '{"answer": "I\'m sorry, but I don\'t have that information."}',
+            '{"result": 42, "status": "ok"}',
+        ],
+    )
+    def test_passes_clean_prose(self, text):
+        assert _looks_like_unparsed_tool_call(text) is False
+
+
 class TestToolNameNormalization:
     """Test tool name normalization for various LLM output formats."""
 
@@ -750,6 +798,65 @@ async def test_short_circuit_answer_under_cap_is_not_rewritten(self, mock_llm, m
         assert result.text == short_answer
         mock_llm.call.assert_not_called()
 
+    @pytest.mark.asyncio
+    async def test_unparsed_harmony_scaffolding_is_not_returned_as_answer(self, mock_llm, mock_functions):
+        """If a provider leaves raw harmony tool-call scaffolding in ``content`` with empty
+        ``tool_calls`` (litellm failing to parse a gpt-oss tool turn, issue #2222), the agent
+        must NOT surface that scaffolding; it routes to final synthesis instead.
+        """
+        leaked = (
+            "<|start|>assistant<|channel|>commentary "
+            'to=functions.recall<|message|>{"query": "recent activity"}<|call|>'
+        )
+        # Every agent turn leaks scaffolding instead of a parsed tool call.
+        mock_llm.call_with_tools.return_value = LLMToolCallResult(
+            tool_calls=[],
+            content=leaked,
+            finish_reason="stop",
+            input_tokens=10,
+            output_tokens=40,
+        )
+
+        result = await run_reflect_agent(
+            llm_config=mock_llm,
+            bank_id="test-bank",
+            query="summarize recent activity",
+            bank_profile={"name": "Test", "mission": "Testing"},
+            **mock_functions,
+        )
+
+        # The corrupt scaffolding must never reach the user-visible answer.
+        assert "<|" not in result.text
+        assert "to=functions" not in result.text
+        # Instead the agent fell through to the forced final-synthesis path.
+        assert result.text == "Fallback answer from final iteration"
+        mock_llm.call.assert_called()
+
+    @pytest.mark.asyncio
+    async def test_leaked_done_payload_with_ids_is_not_returned_as_answer(self, mock_llm, mock_functions):
+        """A done() payload emitted as bare JSON carrying the tool's internal id fields
+        (no real tool call) must be rejected and routed to final synthesis, not surfaced.
+        """
+        mock_llm.call_with_tools.return_value = LLMToolCallResult(
+            tool_calls=[],
+            content='{"answer": "I\'m sorry, but I don\'t have that information.", "memory_ids": []}',
+            finish_reason="stop",
+            input_tokens=10,
+            output_tokens=15,
+        )
+
+        result = await run_reflect_agent(
+            llm_config=mock_llm,
+            bank_id="test-bank",
+            query="what is the status?",
+            bank_profile={"name": "Test", "mission": "Testing"},
+            **mock_functions,
+        )
+
+        assert "memory_ids" not in result.text
+        assert result.text == "Fallback answer from final iteration"
+        mock_llm.call.assert_called()
+
     @pytest.mark.asyncio
     async def test_max_iterations_reached(self, mock_llm, mock_functions):
         """Test that agent stops after max iterations even with errors."""