diff --git a/gemini_proxy.py b/gemini_proxy.py
new file mode 100644
index 0000000..9b39f39
--- /dev/null
+++ b/gemini_proxy.py
@@ -0,0 +1,161 @@
+"""
+OpenAI-compatible proxy that routes requests to the Gemini CLI.
+Runs on http://localhost:8080/v1 — RouteLLM uses this as its strong model.
+
+Only non-streaming chat completions are implemented: the request's "stream"
+flag is never read, and one complete JSON response is always returned.
+"""
+
+import asyncio
+import json
+import shutil
+import subprocess
+import time
+import uuid
+from fastapi import FastAPI, Request
+from fastapi.responses import JSONResponse
+import uvicorn
+
+app = FastAPI(title="Gemini CLI Proxy")
+
+# Resolve the full path to gemini (handles Windows .cmd wrappers)
+GEMINI_CMD = shutil.which("gemini") or "gemini"
+REQUEST_TIMEOUT = 120  # seconds before subprocess.run gives up on the CLI
+
+# Prompt-line template per chat role; messages with any other role are skipped.
+ROLE_TEMPLATES = {
+    "system": "[System instructions: {}]",
+    "developer": "[System instructions: {}]",
+    "user": "User: {}",
+    "assistant": "Assistant: {}",
+}
+
+
+def error_response(status_code: int, message: str, error_type: str) -> JSONResponse:
+    """Build an OpenAI-style error body with the given HTTP status."""
+    return JSONResponse(
+        {"error": {"message": message, "type": error_type}},
+        status_code=status_code,
+    )
+
+
+def messages_to_prompt(messages: list[dict]) -> str:
+    """Flatten OpenAI message list into a single prompt string for Gemini CLI.
+
+    String content is used as-is. List content (multi-part messages) is reduced
+    to the "text" fields of its dict parts, joined by spaces. Missing or null
+    content becomes an empty string. Entries that are not dicts, or whose role
+    has no template, are skipped.
+    """
+    parts = []
+    for msg in messages:
+        if not isinstance(msg, dict):
+            continue
+        role = msg.get("role", "user")
+        template = ROLE_TEMPLATES.get(role) if isinstance(role, str) else None
+        if template is None:
+            continue
+        content = msg.get("content")
+        if content is None:
+            # e.g. {"role": "assistant", "content": null}; avoid emitting "None"
+            content = ""
+        elif isinstance(content, list):
+            content = " ".join(
+                str(p.get("text") or "") for p in content if isinstance(p, dict)
+            )
+        parts.append(template.format(content))
+    return "\n".join(parts)
+
+
+@app.post("/v1/chat/completions")
+async def chat_completions(request: Request):
+    """Answer an OpenAI chat-completion request with one Gemini CLI run.
+
+    Responds 400 for an unusable request body, 502 when the CLI cannot be
+    launched or exits non-zero, 504 when it times out, and otherwise a
+    "chat.completion" object whose usage numbers are whitespace word counts
+    (an approximation, not real token counts).
+    """
+    try:
+        body = await request.json()
+    except ValueError:
+        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
+        return error_response(400, "Request body is not valid JSON", "invalid_request_error")
+    messages = body.get("messages") if isinstance(body, dict) else None
+    if not isinstance(messages, list):
+        return error_response(400, "'messages' must be a list", "invalid_request_error")
+    prompt = messages_to_prompt(messages)
+    if not prompt:
+        return error_response(400, "No usable messages in request", "invalid_request_error")
+
+    def run_gemini() -> subprocess.CompletedProcess:
+        return subprocess.run(
+            [GEMINI_CMD, "-p", prompt],
+            capture_output=True,
+            timeout=REQUEST_TIMEOUT,
+            shell=False,
+        )
+
+    try:
+        result = await asyncio.wait_for(
+            asyncio.to_thread(run_gemini),
+            timeout=REQUEST_TIMEOUT + 5,
+        )
+    except (asyncio.TimeoutError, subprocess.TimeoutExpired):
+        return error_response(504, "Gemini CLI timed out", "timeout")
+    except (OSError, ValueError) as exc:
+        # OSError: executable missing or not launchable; ValueError: the
+        # prompt cannot be passed as an argument (e.g. embedded NUL byte).
+        return error_response(502, f"Could not run Gemini CLI: {exc}", "upstream_error")
+
+    if result.returncode != 0:
+        detail = result.stderr.decode("utf-8", errors="replace").strip()
+        return error_response(
+            502,
+            f"Gemini CLI exited with code {result.returncode}: {detail}",
+            "upstream_error",
+        )
+
+    response_text = result.stdout.decode("utf-8", errors="replace").strip()
+    prompt_tokens = len(prompt.split())
+    completion_tokens = len(response_text.split())
+    return JSONResponse({
+        "id": f"chatcmpl-{uuid.uuid4().hex[:12]}",
+        "object": "chat.completion",
+        "created": int(time.time()),
+        "model": "gemini-cli",
+        "choices": [{
+            "index": 0,
+            "message": {"role": "assistant", "content": response_text},
+            "finish_reason": "stop",
+        }],
+        "usage": {
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "total_tokens": prompt_tokens + completion_tokens,
+        },
+    })
+
+
+@app.get("/v1/models")
+async def list_models():
+    """List the single model this proxy exposes ("gemini-cli")."""
+    return JSONResponse({
+        "object": "list",
+        "data": [{
+            "id": "gemini-cli",
+            "object": "model",
+            "created": int(time.time()),
+            "owned_by": "google",
+        }],
+    })
+
+
+@app.get("/health")
+async def health():
+    """Liveness probe polled by start.py while the proxy boots."""
+    return {"status": "ok"}
+
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="127.0.0.1", port=8080, log_level="warning")
diff --git a/start.bat b/start.bat
new file mode 100644
index 0000000..2608c98
--- /dev/null
+++ b/start.bat
@@ -0,0 +1,8 @@
+@echo off
+setlocal
+cd /d "%~dp0"
+
+set "VENV_PYTHON=%LOCALAPPDATA%\hermes\hermes-agent\venv\Scripts\python.exe"
+:: Same fallback as start.py: use the Python on PATH when the venv is missing
+if not exist "%VENV_PYTHON%" set "VENV_PYTHON=python"
+"%VENV_PYTHON%" start.py %*
diff --git a/start.py b/start.py
new file mode 100644
index 0000000..fa3a1b0
--- /dev/null
+++ b/start.py
@@ -0,0 +1,162 @@
+"""
+RouteLLM launcher — starts Gemini proxy then RouteLLM server.
+
+  Strong model: Gemini CLI via local proxy on :8080
+  Weak model:   Ollama phi4 on :11434  (change WEAK_MODEL_DEFAULT or pass --weak)
+  Router:       mf  (matrix factorization, best accuracy)
+  RouteLLM:     OpenAI-compatible server on :6060
+
+The threshold is only echoed in the printed model string; it is not passed to
+the RouteLLM server. Clients choose it via the model name they request.
+
+Usage:
+  python start.py                            # defaults
+  python start.py --threshold 0.2            # less Gemini, more Ollama
+  python start.py --weak ollama_chat/llama3  # pick a different Ollama model
+"""
+
+import argparse
+import os
+import subprocess
+import sys
+import time
+import urllib.request
+
+# Use the hermes venv Python which has routellm + fastapi installed
+HERMES_PYTHON = os.path.join(
+    os.environ.get("LOCALAPPDATA", ""),
+    "hermes", "hermes-agent", "venv", "Scripts", "python.exe",
+)
+VENV_PYTHON = HERMES_PYTHON if os.path.exists(HERMES_PYTHON) else sys.executable
+
+# ── tunables ────────────────────────────────────────────────────────────────
+PROXY_PORT = 8080
+ROUTELLM_PORT = 6060
+STRONG_MODEL = "openai/gemini-cli"   # resolved via OPENAI_API_BASE below
+WEAK_MODEL_DEFAULT = "ollama_chat/phi4"
+ROUTER = "mf"
+THRESHOLD = 0.11593   # ~50 % strong-model calls; lower = more Gemini
+# ────────────────────────────────────────────────────────────────────────────
+
+
+def wait_for_http(url: str, timeout: int = 30, proc=None) -> bool:
+    """Poll `url` about once a second until it answers or `timeout` expires.
+
+    `proc` is an optional subprocess.Popen serving the URL; once it has exited
+    there is nothing left to wait for, so False is returned immediately.
+    """
+    deadline = time.monotonic() + timeout
+    while time.monotonic() < deadline:
+        if proc is not None and proc.poll() is not None:
+            return False
+        try:
+            # Only reachability matters; close the response straight away.
+            with urllib.request.urlopen(url, timeout=2):
+                return True
+        except Exception:
+            # Best effort: any failure just means "not ready yet".
+            time.sleep(1)
+    return False
+
+
+def stop_process(proc, grace: float = 5.0) -> None:
+    """Terminate `proc` if it is still running; kill it after `grace` seconds."""
+    if proc is None or proc.poll() is not None:
+        return
+    proc.terminate()
+    try:
+        proc.wait(timeout=grace)
+    except subprocess.TimeoutExpired:
+        proc.kill()
+        proc.wait()
+
+
+def format_banner(port: int, weak: str, threshold: float) -> str:
+    """Render the startup box with every row padded to the same width."""
+    width = 54
+    rows = [
+        "  RouteLLM is running",
+        None,  # None marks a horizontal divider
+        f"  Endpoint:     http://localhost:{port}/v1",
+        f"  Strong model: Gemini CLI (via proxy :{PROXY_PORT})",
+        f"  Weak model:   {weak}",
+        f"  Router:       {ROUTER} (threshold={threshold})",
+        None,
+        "  Use model string in your client:",
+        f"    router-{ROUTER}-{threshold}",
+    ]
+    rule = "═" * width
+    lines = [f"╔{rule}╗"]
+    lines += [f"╠{rule}╣" if row is None else f"║{row:<{width}}║" for row in rows]
+    lines.append(f"╚{rule}╝")
+    return "\n".join(lines)
+
+
+def main() -> int:
+    """Start the Gemini proxy and the RouteLLM server, then wait for the server.
+
+    Both children are stopped on every exit path (normal exit, Ctrl+C, startup
+    failure). Returns the RouteLLM server's exit code, or 0 after Ctrl+C.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--threshold", type=float, default=THRESHOLD,
+                        help="Cost threshold (lower = more strong-model calls)")
+    parser.add_argument("--weak", default=WEAK_MODEL_DEFAULT,
+                        help="Weak (cheap) model in LiteLLM format")
+    parser.add_argument("--port", type=int, default=ROUTELLM_PORT,
+                        help="RouteLLM server port")
+    args = parser.parse_args()
+
+    here = os.path.dirname(os.path.abspath(__file__))
+    config_path = os.path.join(here, "config.example.yaml")
+    if not os.path.isfile(config_path):
+        sys.exit(f"ERROR: RouteLLM config not found: {config_path}")
+
+    # Gemini proxy must resolve via OpenAI provider in LiteLLM
+    os.environ.setdefault("OPENAI_API_KEY", "no-key")
+    os.environ["OPENAI_API_BASE"] = f"http://127.0.0.1:{PROXY_PORT}/v1"
+
+    proxy = None
+    server = None
+    try:
+        # ── 1. Start Gemini CLI proxy ─────────────────────────────────────────
+        print(f"Starting Gemini CLI proxy on port {PROXY_PORT}...")
+        proxy = subprocess.Popen(
+            [VENV_PYTHON, os.path.join(here, "gemini_proxy.py")],
+            # Nobody reads the proxy's stdout, so discard it: an undrained
+            # PIPE can fill up and block the child. stderr stays inherited
+            # so startup errors are visible.
+            stdout=subprocess.DEVNULL,
+        )
+
+        health_url = f"http://127.0.0.1:{PROXY_PORT}/health"
+        if not wait_for_http(health_url, timeout=15, proc=proxy):
+            sys.exit("ERROR: Gemini proxy failed to start.")
+        print(f" Gemini proxy ready at http://127.0.0.1:{PROXY_PORT}/v1")
+
+        # ── 2. Start RouteLLM OpenAI-compatible server ────────────────────────
+        print(f"Starting RouteLLM on port {args.port} (router={ROUTER}, threshold={args.threshold})...")
+        routellm_cmd = [
+            VENV_PYTHON, "-m", "routellm.openai_server",
+            "--routers", ROUTER,
+            "--strong-model", STRONG_MODEL,
+            "--weak-model", args.weak,
+            "--config", config_path,
+            "--port", str(args.port),
+        ]
+        server = subprocess.Popen(routellm_cmd)
+
+        banner = format_banner(args.port, args.weak, args.threshold)
+        print(f"\n{banner}\nPress Ctrl+C to stop.\n")
+
+        return server.wait()
+    except KeyboardInterrupt:
+        print("\nShutting down...")
+        return 0
+    finally:
+        stop_process(server)
+        stop_process(proxy)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/start_and_test.bat b/start_and_test.bat
new file mode 100644
index 0000000..d0ce9f6
--- /dev/null
+++ b/start_and_test.bat
@@ -0,0 +1,55 @@
+@echo off
+setlocal
+cd /d "%~dp0"
+
+set "VENV_PYTHON=%LOCALAPPDATA%\hermes\hermes-agent\venv\Scripts\python.exe"
+:: Same fallback as start.py: use the Python on PATH when the venv is missing
+if not exist "%VENV_PYTHON%" set "VENV_PYTHON=python"
+set "ROUTELLM_URL=http://localhost:6060/v1/models"
+
+echo ============================================
+echo RouteLLM 啟動中...
+echo ============================================
+
+:: Check if already running
+curl -s --max-time 2 "%ROUTELLM_URL%" >nul 2>&1
+if %errorlevel% == 0 (
+    echo [OK] RouteLLM 已在運行,跳過啟動
+    goto :test
+)
+
+:: Start RouteLLM in a new window
+start "RouteLLM" "%VENV_PYTHON%" start.py
+
+:: Poll every 2 seconds, up to 15 attempts, until RouteLLM answers
+echo 等待 RouteLLM 就緒...
+set /a attempts=0
+:wait_loop
+    timeout /t 2 /nobreak >nul
+    curl -s --max-time 2 "%ROUTELLM_URL%" >nul 2>&1
+    if %errorlevel% == 0 goto :ready
+    set /a attempts+=1
+    echo [%attempts%] 還在啟動中...
+    if %attempts% lss 15 goto :wait_loop
+
+echo [ERROR] RouteLLM 啟動逾時,請檢查 RouteLLM 視窗的錯誤訊息
+pause
+exit /b 1
+
+:ready
+echo [OK] RouteLLM 已就緒!
+
+:test
+echo.
+echo ============================================
+echo 執行路由測試...
+echo ============================================
+"%VENV_PYTHON%" test_routing.py
+set TEST_RC=%errorlevel%
+
+echo.
+echo ============================================
+echo 完成!按任意鍵關閉
+echo ============================================
+pause
+exit /b %TEST_RC%
diff --git a/test_routing.py b/test_routing.py
new file mode 100644
index 0000000..ec3158d
--- /dev/null
+++ b/test_routing.py
@@ -0,0 +1,60 @@
+"""
+Quick smoke-test for the RouteLLM setup.
+Run AFTER start.py is running.
+
+  python test_routing.py
+
+Exits with status 1 if any request fails.
+"""
+
+import sys
+import time
+
+import openai
+
+ROUTER = "mf"
+THRESHOLD = 0.11593
+MODEL = f"router-{ROUTER}-{THRESHOLD}"
+
+TESTS = [
+    ("easy", "What is 2+2?"),
+    ("medium", "Explain what a REST API is in one sentence."),
+    ("hard", "Describe three subtle differences between Python's GIL and Java's memory model in terms of concurrency guarantees."),
+]
+
+
+def run_tests() -> int:
+    """Send each test prompt through RouteLLM and print the model that answered.
+
+    Returns the number of requests that failed.
+    """
+    client = openai.OpenAI(
+        base_url="http://localhost:6060/v1",
+        api_key="no-key",
+    )
+    failures = 0
+    print(f"Testing RouteLLM at http://localhost:6060 with model={MODEL}\n")
+    for label, prompt in TESTS:
+        t0 = time.perf_counter()
+        try:
+            resp = client.chat.completions.create(
+                model=MODEL,
+                messages=[{"role": "user", "content": prompt}],
+            )
+        except openai.OpenAIError as exc:
+            # Report and move on so one failure does not hide the other cases.
+            failures += 1
+            print(f"[{label:6s}] FAILED: {exc}\n")
+            continue
+        elapsed = time.perf_counter() - t0
+        # content may be None; fall back to "" instead of crashing on .strip()
+        answer = (resp.choices[0].message.content or "").strip()
+        routed_to = getattr(resp, "model", "unknown")
+        print(f"[{label:6s}] ({elapsed:.1f}s) → {routed_to}")
+        print(f" Q: {prompt[:60]}")
+        print(f" A: {answer[:80]}\n")
+    return failures
+
+
+if __name__ == "__main__":
+    sys.exit(1 if run_tests() else 0)