Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 102 additions & 0 deletions gemini_proxy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
"""
OpenAI-compatible proxy that routes requests to the Gemini CLI.
Runs on http://localhost:8080/v1 — RouteLLM uses this as its strong model.
"""

import asyncio
import json
import shutil
import subprocess
import time
import uuid
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
import uvicorn

# FastAPI application that hosts the OpenAI-compatible routes defined below.
app = FastAPI(title="Gemini CLI Proxy")

# Resolve the full path to gemini (handles Windows .cmd wrappers); fall back
# to the bare command name when nothing is found on PATH.
GEMINI_CMD = shutil.which("gemini") or "gemini"
# Seconds the Gemini CLI subprocess may run before the request is abandoned.
REQUEST_TIMEOUT = 120


def messages_to_prompt(messages: list[dict]) -> str:
    """Flatten an OpenAI message list into a single prompt string for Gemini CLI.

    Args:
        messages: Chat messages, each a dict with optional ``role`` (defaults
            to ``"user"``) and ``content`` keys. ``content`` may be a string,
            ``None``, or a list of content parts; only the ``text`` value of
            dict parts is used.

    Returns:
        One line per system/user/assistant message, joined with newlines.
        Messages with any other role are omitted.
    """
    parts = []
    for msg in messages:
        role = msg.get("role", "user")
        content = msg.get("content")
        if content is None:
            # OpenAI permits ``"content": null`` (e.g. assistant tool-call
            # turns); render it as empty text, not the literal string "None".
            content = ""
        elif isinstance(content, list):
            # Multi-part content: keep the text parts, tolerate parts whose
            # ``text`` is missing or null so str.join never sees a non-string.
            content = " ".join(
                str(p.get("text") or "") for p in content if isinstance(p, dict)
            )
        if role == "system":
            parts.append(f"[System instructions: {content}]")
        elif role == "user":
            parts.append(f"User: {content}")
        elif role == "assistant":
            parts.append(f"Assistant: {content}")
    return "\n".join(parts)


def _error_response(message: str, error_type: str, status_code: int) -> JSONResponse:
    """Build an OpenAI-style ``{"error": {...}}`` JSON response."""
    return JSONResponse(
        {"error": {"message": message, "type": error_type}},
        status_code=status_code,
    )


@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """Serve an OpenAI-style chat completion by invoking the Gemini CLI once.

    The request's ``messages`` are flattened into one prompt, passed to
    ``gemini -p`` in a worker thread, and the CLI's stdout is returned as the
    assistant message.

    Returns:
        A ``chat.completion`` JSON response on success. On failure an
        ``{"error": ...}`` payload with status 400 (malformed request),
        502 (CLI could not be started or exited non-zero) or 504 (timeout).
    """
    try:
        body = await request.json()
    except ValueError:
        # Covers json.JSONDecodeError and undecodable request bytes.
        return _error_response(
            "Request body is not valid JSON", "invalid_request_error", 400
        )
    if not isinstance(body, dict):
        return _error_response(
            "Request body must be a JSON object", "invalid_request_error", 400
        )

    messages = body.get("messages", [])
    if not isinstance(messages, list) or not all(
        isinstance(m, dict) for m in messages
    ):
        return _error_response(
            "'messages' must be a list of objects", "invalid_request_error", 400
        )
    prompt = messages_to_prompt(messages)

    def run_gemini() -> subprocess.CompletedProcess:
        # Blocking call; executed off the event loop via asyncio.to_thread.
        return subprocess.run(
            [GEMINI_CMD, "-p", prompt],
            capture_output=True,
            timeout=REQUEST_TIMEOUT,
            shell=False,
        )

    try:
        # The outer timeout is a safety net slightly above the subprocess's own.
        result = await asyncio.wait_for(
            asyncio.to_thread(run_gemini),
            timeout=REQUEST_TIMEOUT + 5,
        )
    except (asyncio.TimeoutError, subprocess.TimeoutExpired):
        return JSONResponse({"error": {"message": "Gemini CLI timed out", "type": "timeout"}}, status_code=504)
    except OSError as exc:
        # E.g. the gemini executable is missing or not runnable.
        return _error_response(
            f"Gemini CLI could not be started: {exc}", "upstream_error", 502
        )

    if result.returncode != 0:
        # Surface the CLI failure instead of answering 200 with empty content.
        stderr_text = result.stderr.decode("utf-8", errors="replace").strip()
        detail = f": {stderr_text[-500:]}" if stderr_text else ""
        return _error_response(
            f"Gemini CLI exited with code {result.returncode}{detail}",
            "upstream_error",
            502,
        )

    response_text = result.stdout.decode("utf-8", errors="replace").strip()

    # Whitespace-separated word counts stand in for real token counts.
    prompt_tokens = len(prompt.split())
    completion_tokens = len(response_text.split())

    return JSONResponse({
        "id": f"chatcmpl-{uuid.uuid4().hex[:12]}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": "gemini-cli",
        "choices": [{
            "index": 0,
            "message": {"role": "assistant", "content": response_text},
            "finish_reason": "stop",
        }],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens,
        },
    })


@app.get("/v1/models")
async def list_models():
    """Advertise the single model this proxy serves, in OpenAI list format."""
    model_entry = {
        "id": "gemini-cli",
        "object": "model",
        "created": int(time.time()),
        "owned_by": "google",
    }
    listing = {"object": "list", "data": [model_entry]}
    return JSONResponse(listing)


@app.get("/health")
async def health():
    """Liveness probe: returns a fixed ``ok`` status payload."""
    payload = {"status": "ok"}
    return payload


# Script entry point: serve the proxy on the loopback interface only.
if __name__ == "__main__":
    uvicorn.run(
        app,
        host="127.0.0.1",
        port=8080,
        log_level="warning",
    )
6 changes: 6 additions & 0 deletions start.bat
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
@echo off
setlocal
cd /d "%~dp0"

:: Prefer the hermes venv interpreter; fall back to python on PATH when the
:: venv is absent, as start.py itself falls back to the current interpreter.
set "VENV_PYTHON=%LOCALAPPDATA%\hermes\hermes-agent\venv\Scripts\python.exe"
if not exist "%VENV_PYTHON%" set "VENV_PYTHON=python"

:: Forward all arguments to the launcher and hand its exit code to the caller.
"%VENV_PYTHON%" start.py %*
exit /b %errorlevel%
117 changes: 117 additions & 0 deletions start.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
"""
RouteLLM launcher — starts Gemini proxy then RouteLLM server.

Strong model: Gemini CLI via local proxy on :8080
Weak model: Ollama phi4 on :11434 (change WEAK_MODEL_DEFAULT or pass --weak to swap)
Router: mf (matrix factorization, best accuracy)
RouteLLM: OpenAI-compatible server on :6060

Usage:
python start.py # defaults
python start.py --threshold 0.2 # above the default: less Gemini, more Ollama
python start.py --weak ollama_chat/llama3 # pick a different Ollama model (LiteLLM format)
"""

import argparse
import os
import subprocess
import sys
import time
import urllib.request

# Use the hermes venv Python which has routellm + fastapi installed.
# The path is built from %LOCALAPPDATA%; when that interpreter does not exist
# the interpreter running this script is used instead.
HERMES_PYTHON = os.path.join(
os.environ.get("LOCALAPPDATA", ""),
"hermes", "hermes-agent", "venv", "Scripts", "python.exe",
)
VENV_PYTHON = HERMES_PYTHON if os.path.exists(HERMES_PYTHON) else sys.executable

# ── tunables ────────────────────────────────────────────────────────────────
# Port the Gemini CLI proxy is expected to listen on (used for the health
# check and for OPENAI_API_BASE).
PROXY_PORT = 8080
# Default port for the RouteLLM server; overridable with --port.
ROUTELLM_PORT = 6060
STRONG_MODEL = "openai/gemini-cli" # resolved via OPENAI_API_BASE below
# Default weak model in LiteLLM format; overridable with --weak.
WEAK_MODEL_DEFAULT = "ollama_chat/phi4"
# Router name passed to RouteLLM's --routers option.
ROUTER = "mf"
THRESHOLD = 0.11593 # ~50 % strong-model calls; lower = more Gemini
# ────────────────────────────────────────────────────────────────────────────


def wait_for_http(url: str, timeout: int = 30) -> bool:
    """Poll *url* until it answers or *timeout* seconds have elapsed.

    Args:
        url: Address to probe with ``urllib.request.urlopen``.
        timeout: Overall time budget in seconds; each probe is limited to
            two seconds on its own.

    Returns:
        ``True`` as soon as one probe succeeds, ``False`` if the budget runs
        out first (including immediately when *timeout* is zero or negative).
    """
    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            # Close the response right away; only reachability matters.
            with urllib.request.urlopen(url, timeout=2):
                return True
        except Exception:
            # Deliberately broad: any failure just means "not ready yet".
            remaining = deadline - time.monotonic()
            # Never sleep past the deadline.
            time.sleep(min(1.0, max(0.0, remaining)))
    return False


def _stop_process(proc, grace: float = 5.0) -> None:
    """Terminate *proc* and reap it, escalating to kill after *grace* seconds."""
    if proc.poll() is not None:
        return  # already exited
    proc.terminate()
    try:
        proc.wait(timeout=grace)
    except subprocess.TimeoutExpired:
        proc.kill()
        proc.wait()


def main():
    """Start the Gemini CLI proxy, then the RouteLLM server, and block on it.

    Parses ``--threshold``, ``--weak`` and ``--port``, points LiteLLM's OpenAI
    provider at the local proxy through environment variables, launches both
    child processes, and waits until the RouteLLM server exits or Ctrl+C is
    pressed. Both children are stopped on every exit path.

    Raises:
        SystemExit: If the proxy's health endpoint does not answer in time.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--threshold", type=float, default=THRESHOLD,
                        help="Cost threshold (lower = more strong-model calls)")
    parser.add_argument("--weak", default=WEAK_MODEL_DEFAULT,
                        help="Weak (cheap) model in LiteLLM format")
    parser.add_argument("--port", type=int, default=ROUTELLM_PORT,
                        help="RouteLLM server port")
    args = parser.parse_args()

    # Gemini proxy must resolve via OpenAI provider in LiteLLM
    os.environ.setdefault("OPENAI_API_KEY", "no-key")
    os.environ["OPENAI_API_BASE"] = f"http://127.0.0.1:{PROXY_PORT}/v1"

    here = os.path.dirname(os.path.abspath(__file__))

    # ── 1. Start Gemini CLI proxy ─────────────────────────────────────────
    print(f"Starting Gemini CLI proxy on port {PROXY_PORT}...")
    # Do not attach pipes nobody reads: an unread PIPE can fill up and block
    # the child. stdout is discarded; stderr is inherited so startup errors
    # remain visible.
    proxy = subprocess.Popen(
        [VENV_PYTHON, os.path.join(here, "gemini_proxy.py")],
        stdout=subprocess.DEVNULL,
    )
    server = None

    try:
        if not wait_for_http(f"http://127.0.0.1:{PROXY_PORT}/health", timeout=15):
            sys.exit("ERROR: Gemini proxy failed to start.")
        print(f" Gemini proxy ready at http://127.0.0.1:{PROXY_PORT}/v1")

        # ── 2. Start RouteLLM OpenAI-compatible server ────────────────────
        print(f"Starting RouteLLM on port {args.port} (router={ROUTER}, threshold={args.threshold})...")
        config_path = os.path.join(here, "config.example.yaml")
        routellm_cmd = [
            VENV_PYTHON, "-m", "routellm.openai_server",
            "--routers", ROUTER,
            "--strong-model", STRONG_MODEL,
            "--weak-model", args.weak,
            "--config", config_path,
            "--port", str(args.port),
        ]
        server = subprocess.Popen(routellm_cmd)

        print(f"""
╔══════════════════════════════════════════════════════╗
║ RouteLLM is running ║
╠══════════════════════════════════════════════════════╣
║ Endpoint: http://localhost:{args.port}/v1 ║
║ Strong model: Gemini CLI (via proxy :{PROXY_PORT}) ║
║ Weak model: {args.weak:<36} ║
║ Router: {ROUTER} (threshold={args.threshold}) ║
╠══════════════════════════════════════════════════════╣
║ Use model string in your client: ║
║ router-{ROUTER}-{args.threshold} ║
╚══════════════════════════════════════════════════════╝
Press Ctrl+C to stop.
""")

        server.wait()
    except KeyboardInterrupt:
        print("\nShutting down...")
    finally:
        # Runs on every exit path (including sys.exit and a failed Popen) so
        # neither child is left running or unreaped.
        if server is not None:
            _stop_process(server)
        _stop_process(proxy)


if __name__ == "__main__":
    main()
51 changes: 51 additions & 0 deletions start_and_test.bat
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
@echo off
setlocal
cd /d "%~dp0"

:: Prefer the hermes venv interpreter; fall back to python on PATH when the
:: venv is absent.
set "VENV_PYTHON=%LOCALAPPDATA%\hermes\hermes-agent\venv\Scripts\python.exe"
if not exist "%VENV_PYTHON%" set "VENV_PYTHON=python"
:: URL probed with curl to decide whether RouteLLM is answering.
set "ROUTELLM_URL=http://localhost:6060/v1/models"

echo ============================================
echo RouteLLM 啟動中...
echo ============================================

:: Check if already running
curl -s --max-time 2 %ROUTELLM_URL% >nul 2>&1
if %errorlevel% == 0 (
echo [OK] RouteLLM 已在運行,跳過啟動
goto :test
)

:: Start RouteLLM in a new window
start "RouteLLM" "%VENV_PYTHON%" start.py

:: Poll every 2 seconds, up to 15 attempts (about 30 seconds plus probe time)
echo 等待 RouteLLM 就緒...
set /a attempts=0
:wait_loop
timeout /t 2 /nobreak >nul
curl -s --max-time 2 %ROUTELLM_URL% >nul 2>&1
if %errorlevel% == 0 goto :ready
set /a attempts+=1
echo [%attempts%] 還在啟動中...
if %attempts% lss 15 goto :wait_loop

echo [ERROR] RouteLLM 啟動逾時,請檢查 RouteLLM 視窗的錯誤訊息
pause
exit /b 1

:ready
echo [OK] RouteLLM 已就緒!

:test
echo.
echo ============================================
echo 執行路由測試...
echo ============================================
"%VENV_PYTHON%" test_routing.py
:: Remember the test result so it survives the pause below.
set "TEST_RC=%errorlevel%"

echo.
echo ============================================
echo 完成!按任意鍵關閉
echo ============================================
pause
exit /b %TEST_RC%
38 changes: 38 additions & 0 deletions test_routing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
"""
Quick smoke-test for the RouteLLM setup.
Run AFTER start.py is running.

python test_routing.py
"""

import openai
import time

# Client aimed at the local RouteLLM server; the API key is a placeholder.
client = openai.OpenAI(
    base_url="http://localhost:6060/v1",
    api_key="no-key",
)

# Model string has the form router-<router>-<threshold>.
ROUTER = "mf"
THRESHOLD = 0.11593
MODEL = f"router-{ROUTER}-{THRESHOLD}"

# (label, prompt) pairs of increasing difficulty.
TESTS = [
    ("easy", "What is 2+2?"),
    ("medium", "Explain what a REST API is in one sentence."),
    ("hard", "Describe three subtle differences between Python's GIL and Java's memory model in terms of concurrency guarantees."),
]

print(f"Testing RouteLLM at http://localhost:6060 with model={MODEL}\n")
for label, prompt in TESTS:
    # perf_counter is monotonic, so elapsed time cannot go negative or jump.
    t0 = time.perf_counter()
    resp = client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
    )
    elapsed = time.perf_counter() - t0
    # message.content may be None (no text returned); treat it as empty
    # instead of crashing on None.strip().
    answer = (resp.choices[0].message.content or "").strip()
    routed_to = getattr(resp, "model", "unknown")
    print(f"[{label:6s}] ({elapsed:.1f}s) → {routed_to}")
    print(f" Q: {prompt[:60]}")
    print(f" A: {answer[:80]}\n")