lm-sys · wenze2527 · Apr 18, 2026
diff --git a/gemini_proxy.py b/gemini_proxy.py
@@ -0,0 +1,102 @@
+"""
+OpenAI-compatible proxy that routes requests to the Gemini CLI.
+Runs on http://localhost:8080/v1 — RouteLLM uses this as its strong model.
+"""
+
+import asyncio
+import json
+import shutil
+import subprocess
+import time
+import uuid
+from fastapi import FastAPI, Request
+from fastapi.responses import JSONResponse
+import uvicorn
+
+# Seconds allowed for a single Gemini CLI invocation.
+REQUEST_TIMEOUT = 120
+
+# Prefer the absolute path found on PATH (this also picks up Windows .cmd
+# wrappers); fall back to the bare command name when nothing is found.
+GEMINI_CMD = shutil.which("gemini") or "gemini"
+
+app = FastAPI(title="Gemini CLI Proxy")
+
+
+def messages_to_prompt(messages: list[dict]) -> str:
+    """Flatten an OpenAI-style message list into one prompt string for Gemini CLI.
+
+    Each message becomes one line, in order:
+
+    * ``system`` / ``developer`` -> ``[System instructions: ...]``
+    * ``user`` -> ``User: ...``
+    * ``assistant`` -> ``Assistant: ...``
+    * any other role (e.g. ``tool``) -> ``<Role>: ...`` so that its text is
+      not silently dropped from the conversation.
+
+    Args:
+        messages: Message dicts with optional ``role`` (default ``"user"``)
+            and ``content`` keys. ``content`` may be a string, ``None``, or a
+            list of content parts; only the non-empty ``text`` of dict parts
+            is used.
+
+    Returns:
+        The rendered lines joined with newlines; ``""`` for an empty list.
+    """
+    parts = []
+    for msg in messages:
+        role = msg.get("role", "user")
+        content = msg.get("content")
+        if content is None:
+            # OpenAI allows ``"content": null`` (e.g. assistant tool calls);
+            # avoid rendering it as the literal text "None".
+            content = ""
+        elif isinstance(content, list):
+            # Multi-part content: keep only the non-empty text parts.
+            content = " ".join(
+                str(p["text"])
+                for p in content
+                if isinstance(p, dict) and p.get("text")
+            )
+        if role in ("system", "developer"):
+            parts.append(f"[System instructions: {content}]")
+        else:
+            # "user" -> "User", "assistant" -> "Assistant", "tool" -> "Tool", ...
+            parts.append(f"{str(role).capitalize()}: {content}")
+    return "\n".join(parts)
+
+
+@app.post("/v1/chat/completions")
+async def chat_completions(request: Request):
+    """Serve an OpenAI-style chat completion by invoking the Gemini CLI once.
+
+    The request's ``messages`` are flattened with ``messages_to_prompt`` and
+    passed to ``gemini -p``; the CLI's stdout becomes the assistant message.
+
+    Returns:
+        A ``chat.completion`` JSON payload on success, otherwise an
+        ``{"error": {...}}`` payload with status 400 (malformed or empty
+        request), 502 (CLI could not be started or exited non-zero) or
+        504 (CLI timed out).
+    """
+    def error(message: str, kind: str, status: int) -> JSONResponse:
+        # Single place that shapes every error payload of this endpoint.
+        return JSONResponse(
+            {"error": {"message": message, "type": kind}}, status_code=status
+        )
+
+    # A malformed body would otherwise surface as an unhandled 500.
+    try:
+        body = await request.json()
+    except ValueError:
+        return error("Request body is not valid JSON", "invalid_request_error", 400)
+    if not isinstance(body, dict):
+        return error("Request body must be a JSON object", "invalid_request_error", 400)
+
+    messages = body.get("messages") or []
+    if not isinstance(messages, list) or not all(isinstance(m, dict) for m in messages):
+        return error("'messages' must be a list of objects", "invalid_request_error", 400)
+
+    prompt = messages_to_prompt(messages)
+    if not prompt.strip():
+        # Nothing to ask: do not spawn the CLI with an empty prompt.
+        return error("'messages' contains no prompt text", "invalid_request_error", 400)
+
+    def run_gemini() -> subprocess.CompletedProcess:
+        # Blocking call; executed in a worker thread via asyncio.to_thread.
+        return subprocess.run(
+            [GEMINI_CMD, "-p", prompt],
+            capture_output=True,
+            timeout=REQUEST_TIMEOUT,
+            shell=False,
+        )
+
+    try:
+        # The outer timeout is a safety net slightly above the subprocess one.
+        result = await asyncio.wait_for(
+            asyncio.to_thread(run_gemini),
+            timeout=REQUEST_TIMEOUT + 5,
+        )
+    except (asyncio.TimeoutError, subprocess.TimeoutExpired):
+        return error("Gemini CLI timed out", "timeout", 504)
+    except OSError as exc:
+        # e.g. the gemini executable is missing or not runnable.
+        return error(f"Failed to launch Gemini CLI: {exc}", "upstream_error", 502)
+
+    if result.returncode != 0:
+        # Do not present a failed run's stdout as a valid completion.
+        detail = result.stderr.decode("utf-8", errors="replace").strip()
+        return error(
+            f"Gemini CLI exited with code {result.returncode}: "
+            f"{detail or 'no error output'}",
+            "upstream_error",
+            502,
+        )
+
+    response_text = result.stdout.decode("utf-8", errors="replace").strip()
+    # Whitespace-split word counts are only a rough stand-in for token counts.
+    prompt_tokens = len(prompt.split())
+    completion_tokens = len(response_text.split())
+
+    return JSONResponse({
+        "id": f"chatcmpl-{uuid.uuid4().hex[:12]}",
+        "object": "chat.completion",
+        "created": int(time.time()),
+        "model": "gemini-cli",
+        "choices": [{
+            "index": 0,
+            "message": {"role": "assistant", "content": response_text},
+            "finish_reason": "stop",
+        }],
+        "usage": {
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "total_tokens": prompt_tokens + completion_tokens,
+        },
+    })
+
+
+@app.get("/v1/models")
+async def list_models():
+    """Return an OpenAI-style model list containing the single ``gemini-cli`` entry."""
+    gemini_entry = {
+        "id": "gemini-cli",
+        "object": "model",
+        "created": int(time.time()),
+        "owned_by": "google",
+    }
+    payload = {"object": "list", "data": [gemini_entry]}
+    return JSONResponse(payload)
+
+
+@app.get("/health")
+async def health():
+    """Liveness probe: always answers ``{"status": "ok"}``."""
+    status_payload = {"status": "ok"}
+    return status_payload
+
+
+if __name__ == "__main__":
+    # Serve on the loopback interface only; uvicorn logs warnings and above.
+    uvicorn.run(
+        app,
+        host="127.0.0.1",
+        port=8080,
+        log_level="warning",
+    )
diff --git a/start.bat b/start.bat
@@ -0,0 +1,6 @@
+@echo off
+:: Launch start.py from this script's directory, forwarding all arguments.
+setlocal
+cd /d "%~dp0"
+
+set "VENV_PYTHON=%LOCALAPPDATA%\hermes\hermes-agent\venv\Scripts\python.exe"
+:: Same fallback as start.py: use the Python on PATH when the hermes venv is missing.
+if not exist "%VENV_PYTHON%" set "VENV_PYTHON=python"
+
+"%VENV_PYTHON%" start.py %*
+:: Hand the launcher's exit code back to the caller.
+exit /b %errorlevel%
diff --git a/start.py b/start.py
@@ -0,0 +1,117 @@
+"""
+RouteLLM launcher — starts Gemini proxy then RouteLLM server.
+
+  Strong model: Gemini CLI via local proxy on :8080
+  Weak model:   Ollama phi4 on :11434 (change WEAK_MODEL to swap)
+  Router:       mf (matrix factorization, best accuracy)
+  RouteLLM:     OpenAI-compatible server on :6060
+
+Usage:
+    python start.py                   # defaults
+    python start.py --threshold 0.2   # less Gemini, more Ollama (lower = more Gemini)
+    python start.py --weak ollama_chat/llama3   # pick a different Ollama model (LiteLLM format)
+"""
+
+import argparse
+import os
+import subprocess
+import sys
+import time
+import urllib.request
+
+# Interpreter from the hermes venv (expected to have routellm + fastapi
+# installed); when that file is absent, use the interpreter running this script.
+HERMES_PYTHON = os.path.join(
+    os.environ.get("LOCALAPPDATA", ""),
+    "hermes",
+    "hermes-agent",
+    "venv",
+    "Scripts",
+    "python.exe",
+)
+if os.path.exists(HERMES_PYTHON):
+    VENV_PYTHON = HERMES_PYTHON
+else:
+    VENV_PYTHON = sys.executable
+
+# ── tunables ────────────────────────────────────────────────────────────────
+# Local ports: Gemini CLI proxy and the RouteLLM server.
+PROXY_PORT = 8080
+ROUTELLM_PORT = 6060
+# Strong model goes through the OpenAI provider (OPENAI_API_BASE is set in main).
+STRONG_MODEL = "openai/gemini-cli"
+WEAK_MODEL_DEFAULT = "ollama_chat/phi4"
+ROUTER = "mf"
+# ~50 % strong-model calls; lower = more Gemini.
+THRESHOLD = 0.11593
+# ────────────────────────────────────────────────────────────────────────────
+
+
+def wait_for_http(url: str, timeout: int = 30) -> bool:
+    """Poll *url* about once per second until it answers or *timeout* elapses.
+
+    Args:
+        url: Address requested with ``urllib.request.urlopen``.
+        timeout: Overall budget in seconds; each attempt is capped at 2 s.
+
+    Returns:
+        ``True`` as soon as one request succeeds, ``False`` if none did
+        before the deadline (including when ``timeout`` is zero or negative).
+
+    Raises:
+        ValueError: If *url* is malformed; retrying could never succeed.
+    """
+    import http.client
+
+    # monotonic() is immune to wall-clock adjustments while we wait.
+    deadline = time.monotonic() + timeout
+    while time.monotonic() < deadline:
+        try:
+            # Close the response instead of leaving the socket to the GC.
+            with urllib.request.urlopen(url, timeout=2):
+                return True
+        except (OSError, http.client.HTTPException):
+            # Not reachable yet (refused, timed out, HTTP error): retry,
+            # but never sleep past the deadline.
+            time.sleep(min(1.0, max(0.0, deadline - time.monotonic())))
+    return False
+
+
+def main():
+    """Start the Gemini CLI proxy, then the RouteLLM server, and wait on it.
+
+    Command-line options: ``--threshold`` (only echoed in the banner and in
+    the suggested ``router-<router>-<threshold>`` model string), ``--weak``
+    (weak model passed to RouteLLM) and ``--port`` (RouteLLM server port).
+
+    Side effects: sets ``OPENAI_API_KEY`` (if unset) and ``OPENAI_API_BASE``
+    in this process's environment, which the child processes inherit. Both
+    children are terminated when the server exits, on Ctrl+C, or on error.
+
+    Raises:
+        SystemExit: If the proxy does not answer ``/health`` within 15 s.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--threshold", type=float, default=THRESHOLD,
+                        help="Cost threshold (lower = more strong-model calls)")
+    parser.add_argument("--weak", default=WEAK_MODEL_DEFAULT,
+                        help="Weak (cheap) model in LiteLLM format")
+    parser.add_argument("--port", type=int, default=ROUTELLM_PORT,
+                        help="RouteLLM server port")
+    args = parser.parse_args()
+
+    # Gemini proxy must resolve via OpenAI provider in LiteLLM
+    os.environ.setdefault("OPENAI_API_KEY", "no-key")
+    os.environ["OPENAI_API_BASE"] = f"http://127.0.0.1:{PROXY_PORT}/v1"
+
+    here = os.path.dirname(os.path.abspath(__file__))
+
+    # ── 1. Start Gemini CLI proxy ─────────────────────────────────────────
+    print(f"Starting Gemini CLI proxy on port {PROXY_PORT}...")
+    # Let the proxy inherit this console: a PIPE that nobody reads can stall
+    # the child once the OS pipe buffer fills, and it hides startup errors.
+    proxy = subprocess.Popen(
+        [VENV_PYTHON, os.path.join(here, "gemini_proxy.py")],
+    )
+    server = None
+
+    try:
+        if not wait_for_http(f"http://127.0.0.1:{PROXY_PORT}/health", timeout=15):
+            sys.exit("ERROR: Gemini proxy failed to start.")
+        print(f"  Gemini proxy ready at http://127.0.0.1:{PROXY_PORT}/v1")
+
+        # ── 2. Start RouteLLM OpenAI-compatible server ────────────────────
+        print(f"Starting RouteLLM on port {args.port} (router={ROUTER}, threshold={args.threshold})...")
+        config_path = os.path.join(here, "config.example.yaml")
+        routellm_cmd = [
+            VENV_PYTHON, "-m", "routellm.openai_server",
+            "--routers", ROUTER,
+            "--strong-model", STRONG_MODEL,
+            "--weak-model", args.weak,
+            "--config", config_path,
+            "--port", str(args.port),
+        ]
+        server = subprocess.Popen(routellm_cmd)
+
+        print(f"""
+╔══════════════════════════════════════════════════════╗
+║  RouteLLM is running                                 ║
+╠══════════════════════════════════════════════════════╣
+║  Endpoint:     http://localhost:{args.port}/v1          ║
+║  Strong model: Gemini CLI (via proxy :{PROXY_PORT})  ║
+║  Weak model:   {args.weak:<36} ║
+║  Router:       {ROUTER} (threshold={args.threshold})                ║
+╠══════════════════════════════════════════════════════╣
+║  Use model string in your client:                    ║
+║    router-{ROUTER}-{args.threshold}                         ║
+╚══════════════════════════════════════════════════════╝
+Press Ctrl+C to stop.
+""")
+
+        server.wait()
+    except KeyboardInterrupt:
+        print("\nShutting down...")
+    finally:
+        # Runs on every exit path (including a failed RouteLLM launch) so
+        # neither child is left behind.
+        for child in (server, proxy):
+            if child is not None and child.poll() is None:
+                child.terminate()
+
+# Run the launcher only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
diff --git a/start_and_test.bat b/start_and_test.bat
@@ -0,0 +1,51 @@
+@echo off
+:: Start RouteLLM (unless it is already answering) and run the routing smoke test.
+setlocal
+cd /d "%~dp0"
+
+set "VENV_PYTHON=%LOCALAPPDATA%\hermes\hermes-agent\venv\Scripts\python.exe"
+:: Same fallback as start.py: use the Python on PATH when the hermes venv is missing.
+if not exist "%VENV_PYTHON%" set "VENV_PYTHON=python"
+set "ROUTELLM_URL=http://localhost:6060/v1/models"
+
+echo ============================================
+echo  RouteLLM 啟動中...
+echo ============================================
+
+:: Check if already running (curl exits 0 on any HTTP response)
+curl -s --max-time 2 %ROUTELLM_URL% >nul 2>&1
+if %errorlevel% == 0 (
+    echo [OK] RouteLLM 已在運行，跳過啟動
+    goto :test
+)
+
+:: Start RouteLLM in a new window
+start "RouteLLM" "%VENV_PYTHON%" start.py
+
+:: Wait up to 30 seconds (15 attempts, 2 s apart) for RouteLLM to be ready
+echo 等待 RouteLLM 就緒...
+set /a attempts=0
+:wait_loop
+    timeout /t 2 /nobreak >nul
+    curl -s --max-time 2 %ROUTELLM_URL% >nul 2>&1
+    if %errorlevel% == 0 goto :ready
+    set /a attempts+=1
+    echo   [%attempts%] 還在啟動中...
+    if %attempts% lss 15 goto :wait_loop
+
+echo [ERROR] RouteLLM 啟動逾時，請檢查 RouteLLM 視窗的錯誤訊息
+pause
+exit /b 1
+
+:ready
+echo [OK] RouteLLM 已就緒！
+
+:test
+echo.
+echo ============================================
+echo  執行路由測試...
+echo ============================================
+"%VENV_PYTHON%" test_routing.py
+:: Remember the test result; the echo/pause below would overwrite errorlevel.
+set "TEST_EXIT=%errorlevel%"
+
+echo.
+echo ============================================
+echo  完成！按任意鍵關閉
+echo ============================================
+pause
+exit /b %TEST_EXIT%
diff --git a/test_routing.py b/test_routing.py
@@ -0,0 +1,38 @@
+"""
+Quick smoke-test for the RouteLLM setup.
+Run AFTER start.py is running.
+
+  python test_routing.py
+"""
+
+import openai
+import time
+
+# Client pointed at the local RouteLLM server; the key is a placeholder.
+client = openai.OpenAI(
+    base_url="http://localhost:6060/v1",
+    api_key="no-key",
+)
+
+ROUTER = "mf"
+THRESHOLD = 0.11593
+# Model string in the "router-<router>-<threshold>" form.
+MODEL = f"router-{ROUTER}-{THRESHOLD}"
+
+# (label, prompt) pairs of increasing difficulty.
+TESTS = [
+    ("easy",   "What is 2+2?"),
+    ("medium", "Explain what a REST API is in one sentence."),
+    ("hard",   "Describe three subtle differences between Python's GIL and Java's memory model in terms of concurrency guarantees."),
+]
+
+
+def main() -> int:
+    """Send every prompt in ``TESTS`` through RouteLLM and print the outcome.
+
+    Returns:
+        0 when every request succeeded, 1 when at least one failed.
+    """
+    print(f"Testing RouteLLM at http://localhost:6060 with model={MODEL}\n")
+    failures = 0
+    for label, prompt in TESTS:
+        t0 = time.perf_counter()
+        try:
+            resp = client.chat.completions.create(
+                model=MODEL,
+                messages=[{"role": "user", "content": prompt}],
+            )
+        except openai.OpenAIError as exc:
+            # Keep going so one failing route does not hide the other results.
+            failures += 1
+            print(f"[{label:6s}] ({time.perf_counter() - t0:.1f}s) FAILED: {exc}\n")
+            continue
+        elapsed = time.perf_counter() - t0
+        # content may be None; do not crash on .strip().
+        answer = (resp.choices[0].message.content or "").strip()
+        routed_to = getattr(resp, "model", "unknown")
+        print(f"[{label:6s}] ({elapsed:.1f}s) → {routed_to}")
+        print(f"  Q: {prompt[:60]}")
+        print(f"  A: {answer[:80]}\n")
+    return 1 if failures else 0
+
+
+# Only hit the network when run as a script, and report failures via exit code.
+if __name__ == "__main__":
+    raise SystemExit(main())