From 25299065fd1e815e96860fd63b4f7d10f0e9ba4b Mon Sep 17 00:00:00 2001 From: Jeongseok Kang Date: Mon, 27 Apr 2026 12:45:34 +0900 Subject: [PATCH 1/7] fix(BA-1929): Send Accept: application/json from AppProxy client The AppProxy coordinator's exception middleware returned an HTML error page when the request did not specify an Accept header, which caused endpoint create/delete failures to surface as unparseable HTML in the manager logs (see issue #5228). The status check path already sets Accept: application/json correctly; this aligns the create/delete and bulk endpoint calls with the same behavior by routing all requests through a shared header helper. --- .../manager/clients/appproxy/client.py | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/src/ai/backend/manager/clients/appproxy/client.py b/src/ai/backend/manager/clients/appproxy/client.py index 6e2e1306481..2a5bbe947e8 100644 --- a/src/ai/backend/manager/clients/appproxy/client.py +++ b/src/ai/backend/manager/clients/appproxy/client.py @@ -97,6 +97,12 @@ async def fetch_status(self) -> AppProxyStatusResponse: extra_msg=f"Invalid response from AppProxy at {self._address}" ) from e + def _auth_headers(self) -> dict[str, str]: + return { + "Accept": "application/json", + "X-BackendAI-Token": self._token, + } + @appproxy_client_resilience.apply() async def create_endpoint( self, @@ -106,9 +112,7 @@ async def create_endpoint( async with self._client_session.post( f"/v2/endpoints/{endpoint_id}", json=body.model_dump(mode="json"), - headers={ - "X-BackendAI-Token": self._token, - }, + headers=self._auth_headers(), ) as resp: resp.raise_for_status() result: dict[str, Any] = await resp.json() @@ -129,9 +133,7 @@ async def create_endpoints_bulk( async with self._client_session.post( "/v2/endpoints/bulk", json=body.model_dump(mode="json"), - headers={ - "X-BackendAI-Token": self._token, - }, + headers=self._auth_headers(), ) as resp: resp.raise_for_status() payload = await resp.json() @@ -144,9 +146,7 @@ async def delete_endpoint( ) -> None: async with self._client_session.delete( f"/v2/endpoints/{endpoint_id}", - headers={ - "X-BackendAI-Token": self._token, - }, + headers=self._auth_headers(), ): pass @@ -165,9 +165,7 @@ async def delete_endpoints_bulk( "DELETE", "/v2/endpoints/bulk", json=body.model_dump(mode="json"), - headers={ - "X-BackendAI-Token": self._token, - }, + headers=self._auth_headers(), ) as resp: resp.raise_for_status() payload = await resp.json() From 67be543748ea9f1f72b0ba1ecc75efad11b9a77c Mon Sep 17 00:00:00 2001 From: Jeongseok Kang Date: Mon, 27 Apr 2026 12:46:50 +0900 Subject: [PATCH 2/7] docs(BA-1929): Add news fragment for #11328 --- changes/11328.fix.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/11328.fix.md diff --git a/changes/11328.fix.md b/changes/11328.fix.md new file mode 100644 index 00000000000..c614d5362b0 --- /dev/null +++ b/changes/11328.fix.md @@ -0,0 +1 @@ +Send `Accept: application/json` from the manager's AppProxy client so endpoint create/delete failures return parseable JSON instead of HTML error pages. From 9e7255019dd4cea33fa872695e8deb82032b30f4 Mon Sep 17 00:00:00 2001 From: Jeongseok Kang Date: Mon, 27 Apr 2026 13:10:02 +0900 Subject: [PATCH 3/7] refactor(BA-1929): Inline Accept header per endpoint method The previous commit grouped Accept and X-BackendAI-Token in a _auth_headers helper, but Accept is content negotiation rather than authentication, and the grouping is misleading. Drop the helper and inline the headers dict at each of the four endpoint methods so the intent at each call site is local and explicit. --- .../manager/clients/appproxy/client.py | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/src/ai/backend/manager/clients/appproxy/client.py b/src/ai/backend/manager/clients/appproxy/client.py index 2a5bbe947e8..f57d975e769 100644 --- a/src/ai/backend/manager/clients/appproxy/client.py +++ b/src/ai/backend/manager/clients/appproxy/client.py @@ -97,12 +97,6 @@ async def fetch_status(self) -> AppProxyStatusResponse: extra_msg=f"Invalid response from AppProxy at {self._address}" ) from e - def _auth_headers(self) -> dict[str, str]: - return { - "Accept": "application/json", - "X-BackendAI-Token": self._token, - } - @appproxy_client_resilience.apply() async def create_endpoint( self, @@ -112,7 +106,10 @@ async def create_endpoint( async with self._client_session.post( f"/v2/endpoints/{endpoint_id}", json=body.model_dump(mode="json"), - headers=self._auth_headers(), + headers={ + "Accept": "application/json", + "X-BackendAI-Token": self._token, + }, ) as resp: resp.raise_for_status() result: dict[str, Any] = await resp.json() @@ -133,7 +130,10 @@ async def create_endpoints_bulk( async with self._client_session.post( "/v2/endpoints/bulk", json=body.model_dump(mode="json"), - headers=self._auth_headers(), + headers={ + "Accept": "application/json", + "X-BackendAI-Token": self._token, + }, ) as resp: resp.raise_for_status() payload = await resp.json() @@ -146,7 +146,10 @@ async def delete_endpoint( ) -> None: async with self._client_session.delete( f"/v2/endpoints/{endpoint_id}", - headers=self._auth_headers(), + headers={ + "Accept": "application/json", + "X-BackendAI-Token": self._token, + }, ): pass @@ -165,7 +168,10 @@ async def delete_endpoints_bulk( "DELETE", "/v2/endpoints/bulk", json=body.model_dump(mode="json"), - headers=self._auth_headers(), + headers={ + "Accept": "application/json", + "X-BackendAI-Token": self._token, + }, ) as resp: resp.raise_for_status() payload = await resp.json() From caafcc19c8d435240fbaaea2ce5e3befcc2527c0 Mon Sep 17 00:00:00 2001 From: Jeongseok Kang Date: Mon, 27 Apr 2026 13:01:42 +0900 Subject: [PATCH 4/7] fix(manager): Surface AppProxy client endpoint errors as domain exceptions The four mutating methods on AppProxyClient (create_endpoint, create_endpoints_bulk, delete_endpoint, delete_endpoints_bulk) either silently swallowed non-2xx responses (delete_endpoint) or leaked raw aiohttp.ClientResponseError / ContentTypeError to callers, neither of which inherits from BackendAIError. As a result, deletion failures were lost and other failures arrived at the deployment executor as non-domain exceptions, dropping the AppProxy error context. Introduce a shared `_request` async context manager that wraps ClientConnectorError into AppProxyConnectionError and any non-2xx status into AppProxyResponseError, attaching the upstream response body (parsed JSON when possible, raw text otherwise) as `extra_data` so a structured BackendAIError payload from the coordinator survives the translation. Add a `_parse_json` helper for the success path that maps ContentTypeError / JSONDecodeError to AppProxyResponseError. `fetch_status` keeps its existing handler since it talks to a different endpoint and is already aligned with the domain exceptions. Refs #11331, builds on #11328. --- .../manager/clients/appproxy/client.py | 122 +++++++++++++----- 1 file changed, 93 insertions(+), 29 deletions(-) diff --git a/src/ai/backend/manager/clients/appproxy/client.py b/src/ai/backend/manager/clients/appproxy/client.py index f57d975e769..de983d64ea7 100644 --- a/src/ai/backend/manager/clients/appproxy/client.py +++ b/src/ai/backend/manager/clients/appproxy/client.py @@ -2,6 +2,8 @@ import json import logging +from collections.abc import AsyncIterator +from contextlib import asynccontextmanager from typing import Any from uuid import UUID @@ -97,22 +99,93 @@ async def fetch_status(self) -> AppProxyStatusResponse: extra_msg=f"Invalid response from AppProxy at {self._address}" ) from e + @asynccontextmanager + async def _request( + self, + method: str, + path: str, + *, + operation: str, + json_body: Any = None, + ) -> AsyncIterator[aiohttp.ClientResponse]: + """Issue an authenticated request and translate transport errors. + + Connection failures become ``AppProxyConnectionError``. Non-2xx + responses become ``AppProxyResponseError`` with the upstream body + attached as ``extra_data`` so a structured ``BackendAIError`` + payload returned by the coordinator survives the translation. + """ + try: + async with self._client_session.request( + method, + path, + headers={ + "Accept": "application/json", + "X-BackendAI-Token": self._token, + }, + json=json_body, + ) as resp: + if resp.status >= 400: + text = await resp.text() + try: + error_body: Any = json.loads(text) if text else None + except json.JSONDecodeError: + error_body = text + log.error( + "AppProxy at {} returned {} during {}: {!r}", + self._address, + resp.status, + operation, + error_body, + ) + raise AppProxyResponseError( + extra_msg=(f"AppProxy returned HTTP {resp.status} during {operation}"), + extra_data={"status": resp.status, "body": error_body}, + ) + yield resp + except aiohttp.ClientConnectorError as e: + log.error( + "Failed to connect to AppProxy at {} during {}: {}", + self._address, + operation, + e, + ) + raise AppProxyConnectionError( + extra_msg=f"Failed to connect to AppProxy at {self._address}" + ) from e + + async def _parse_json( + self, + resp: aiohttp.ClientResponse, + *, + operation: str, + ) -> Any: + try: + return await resp.json() + except (aiohttp.ContentTypeError, json.JSONDecodeError) as e: + log.error( + "Failed to parse AppProxy {} response from {}: {}", + operation, + self._address, + e, + ) + raise AppProxyResponseError( + extra_msg=(f"Invalid response from AppProxy at {self._address} during {operation}"), + ) from e + @appproxy_client_resilience.apply() async def create_endpoint( self, endpoint_id: UUID, body: CreateEndpointRequestBody, ) -> dict[str, Any]: - async with self._client_session.post( + async with self._request( + "POST", f"/v2/endpoints/{endpoint_id}", - json=body.model_dump(mode="json"), - headers={ - "Accept": "application/json", - "X-BackendAI-Token": self._token, - }, + operation="create_endpoint", + json_body=body.model_dump(mode="json"), ) as resp: - resp.raise_for_status() - result: dict[str, Any] = await resp.json() + result: dict[str, Any] = await self._parse_json(resp, operation="create_endpoint") return result @appproxy_client_resilience.apply() @@ -127,16 +200,13 @@ async def create_endpoints_bulk( so this is the preferred way to register many deployments at once (e.g. from the deployment provisioning handler). """ - async with self._client_session.post( + async with self._request( + "POST", "/v2/endpoints/bulk", - json=body.model_dump(mode="json"), - headers={ - "Accept": "application/json", - "X-BackendAI-Token": self._token, - }, + operation="create_endpoints_bulk", + json_body=body.model_dump(mode="json"), ) as resp: - resp.raise_for_status() - payload = await resp.json() + payload = await self._parse_json(resp, operation="create_endpoints_bulk") return BulkCreateEndpointResponse.model_validate(payload) @appproxy_client_resilience.apply() @@ -144,12 +214,10 @@ async def delete_endpoint( self, endpoint_id: UUID, ) -> None: - async with self._client_session.delete( + async with self._request( + "DELETE", f"/v2/endpoints/{endpoint_id}", - headers={ - "Accept": "application/json", - "X-BackendAI-Token": self._token, - }, + operation="delete_endpoint", ): pass @@ -164,15 +232,11 @@ async def delete_endpoints_bulk( per-endpoint result in input order, so the caller can decide how to treat partial failures (retry, log, etc.). """ - async with self._client_session.request( + async with self._request( "DELETE", "/v2/endpoints/bulk", - json=body.model_dump(mode="json"), - headers={ - "Accept": "application/json", - "X-BackendAI-Token": self._token, - }, + operation="delete_endpoints_bulk", + json_body=body.model_dump(mode="json"), ) as resp: - resp.raise_for_status() - payload = await resp.json() + payload = await self._parse_json(resp, operation="delete_endpoints_bulk") return BulkDeleteEndpointResponse.model_validate(payload) From 295882c7848b872fdbd5709c3a8f3b2dc6d3fd43 Mon Sep 17 00:00:00 2001 From: Jeongseok Kang Date: Mon, 27 Apr 2026 13:02:27 +0900 Subject: [PATCH 5/7] docs: Add news fragment for #11333 --- changes/11333.fix.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/11333.fix.md diff --git a/changes/11333.fix.md b/changes/11333.fix.md new file mode 100644 index 00000000000..97cde474555 --- /dev/null +++ b/changes/11333.fix.md @@ -0,0 +1 @@ +Surface AppProxy coordinator failures from `AppProxyClient` as `AppProxyConnectionError` / `AppProxyResponseError` instead of silently dropping deletion errors or leaking raw `aiohttp` exceptions, and preserve the upstream error body as `extra_data` for diagnostics. From d78292bffb88d219d1ae1ddf656d990a241c6322 Mon Sep 17 00:00:00 2001 From: Jeongseok Kang Date: Wed, 13 May 2026 12:47:23 +0900 Subject: [PATCH 6/7] Update src/ai/backend/manager/clients/appproxy/client.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bokyum Kim | 김보겸 --- src/ai/backend/manager/clients/appproxy/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ai/backend/manager/clients/appproxy/client.py b/src/ai/backend/manager/clients/appproxy/client.py index de983d64ea7..9db0cfaee4e 100644 --- a/src/ai/backend/manager/clients/appproxy/client.py +++ b/src/ai/backend/manager/clients/appproxy/client.py @@ -3,7 +3,7 @@ import json import logging from collections.abc import AsyncIterator -from contextlib import asynccontextmanager +from contextlib import asynccontextmanager as actxmgr from typing import Any from uuid import UUID From ac0a57413586973f58485718a4b1f3be9f369e87 Mon Sep 17 00:00:00 2001 From: Jeongseok Kang Date: Tue, 26 May 2026 12:44:26 +0900 Subject: [PATCH 7/7] Delete changes/11328.fix.md --- changes/11328.fix.md | 1 - 1 file changed, 1 deletion(-) delete mode 100644 changes/11328.fix.md diff --git a/changes/11328.fix.md b/changes/11328.fix.md deleted file mode 100644 index c614d5362b0..00000000000 --- a/changes/11328.fix.md +++ /dev/null @@ -1 +0,0 @@ -Send `Accept: application/json` from the manager's AppProxy client so endpoint create/delete failures return parseable JSON instead of HTML error pages.