
Commit 4289368

Switch AI inference path from Modal to Lightning-ready runtime
1 parent fc91390 commit 4289368

10 files changed: 512 additions & 202 deletions

.github/workflows/ai_trading_smoke.yml

Lines changed: 6 additions & 20 deletions

```diff
@@ -6,49 +6,35 @@ on:
 jobs:
   ai-smoke:
     runs-on: ubuntu-latest
-    timeout-minutes: 90
+    timeout-minutes: 30
     env:
       PYTHONUNBUFFERED: "1"
-      MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
-      MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
       TWELVEDATA_API_KEYS: ${{ secrets.TWELVEDATA_API_KEYS }}
       ALPHAVANTAGE_API_KEYS: ${{ secrets.ALPHAVANTAGE_API_KEYS }}
       NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }}
+      TRAINED_MODEL_INFERENCE_URL: ${{ secrets.TRAINED_MODEL_INFERENCE_URL }}
+      TRAINED_MODEL_API_KEY: ${{ secrets.TRAINED_MODEL_API_KEY }}
       AI_SMOKE_TICKERS: "AAPL"
-      TRAINED_MODEL_BASE_MODEL: "Qwen/Qwen2.5-7B-Instruct"
-      TRAINED_MODEL_ADAPTER_PATH: "_smoke_artifacts/lora_solid_adapter"
-      HF_HUB_DISABLE_TELEMETRY: "1"
-      HF_HUB_ENABLE_HF_TRANSFER: "1"
-      TRAINED_MODEL_CPU_THREADS: "4"
     steps:
       - name: Checkout
         uses: actions/checkout@v4

       - name: Set up Python
         uses: actions/setup-python@v5
         with:
-          python-version: "3.11"
+          python-version: "3.10"

       - name: Install dependencies
         run: |
           pip install -r requirements.txt
-          pip install modal hf_transfer
-          pip install torch==2.4.1 --index-url https://download.pytorch.org/whl/cpu
-          pip install "transformers>=4.46.0" "peft>=0.13.2" "accelerate>=1.0.1" "sentencepiece>=0.2.0"
-
-      - name: Fetch trained adapter from Modal volume
-        run: |
-          mkdir -p _smoke_artifacts/lora_solid_adapter
-          modal volume get train-once-artifacts /lora_solid_adapter/adapter_model.safetensors _smoke_artifacts/lora_solid_adapter/adapter_model.safetensors
-          modal volume get train-once-artifacts /lora_solid_adapter/adapter_config.json _smoke_artifacts/lora_solid_adapter/adapter_config.json

       - name: Run AI-only smoke test
-        run: python run_ai_trading_smoke_direct.py
+        run: python run_ai_trading_smoke.py

       - name: Upload AI smoke artifacts
         if: always()
         uses: actions/upload-artifact@v4
         with:
           name: ai-trading-smoke
-          path: results/ai_smoke_direct_*.json
+          path: results/ai_smoke_*.json
           retention-days: 7
```
.github/workflows/deploy_lightning_inference.yml (new file)

Lines changed: 40 additions & 0 deletions

```yaml
name: Deploy Lightning Inference

on:
  workflow_dispatch:

jobs:
  deploy-lightning-inference:
    runs-on: ubuntu-latest
    timeout-minutes: 30
    env:
      LIGHTNING_USERNAME: ${{ secrets.LIGHTNING_USERNAME }}
      LIGHTNING_API_KEY: ${{ secrets.LIGHTNING_API_KEY }}
      LIGHTNING_USER_ID: ${{ secrets.LIGHTNING_USER_ID }}
      TRAINED_MODEL_BASE_MODEL: "Qwen/Qwen2.5-7B-Instruct"
      TRAINED_MODEL_NAME: "quant-trained-trading-model"
      TRAINED_MODEL_CPU_THREADS: "8"
      LIGHTNING_INFERENCE_COMPUTE_NAME: "cpu"
      LIGHTNING_INFERENCE_DISK_GB: "80"
      TRAINED_MODEL_API_KEY: ${{ secrets.TRAINED_MODEL_API_KEY }}
      TRAINED_MODEL_ADAPTER_ARCHIVE_URL: ${{ secrets.TRAINED_MODEL_ADAPTER_ARCHIVE_URL }}
      TRAINED_MODEL_ADAPTER_ARCHIVE_TOKEN: ${{ secrets.TRAINED_MODEL_ADAPTER_ARCHIVE_TOKEN }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      - name: Install Lightning deploy dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install lightning-app==2.3.2 lightning-cloud==0.5.70

      - name: Lightning preflight
        run: python quant_platform/scripts/lightning_account_preflight.py

      - name: Deploy inference service
        run: python deploy_lightning_inference.py
```

README.md

Lines changed: 13 additions & 5 deletions

````diff
@@ -17,7 +17,9 @@ trading_bot/
 ├── main.py                  # Daily core bot + AI bot orchestration
 ├── llm_trader.py            # AI trading branch using the trained model
 ├── trained_model_client.py  # Remote HTTP client for trained-model inference
-├── modal_trained_model_service.py
+├── trained_model_service_runtime.py
+├── lightning_trained_model_app.py
+├── deploy_lightning_inference.py
 ├── backtesting/             # Existing research stack in the bot repo
 └── quant_platform/          # Merged train-once quant platform repo
 ```
@@ -26,27 +28,32 @@ trading_bot/

 - Core bot remains unchanged in principle: price ingestion, feature generation, OLS ranking, meta-learner, portfolio logic
 - AI trading bot is separate and now uses the trained quant model over HTTP
-- The AI path is batched and designed to call the Modal CPU endpoint, not a local model
+- The AI path is batched and designed to call a remote CPU inference endpoint, not a local model

 ## Secrets

 ### Still used
 - `NVIDIA_API_KEY`: news sentiment path
-- `TRAINED_MODEL_INFERENCE_URL`: deployed Modal CPU inference URL for the AI trading bot
+- `TRAINED_MODEL_INFERENCE_URL`: deployed inference URL for the AI trading bot
 - `TRAINED_MODEL_API_KEY`: optional auth for the trained-model endpoint
 - `TWELVEDATA_API_KEYS`, `ALPHAVANTAGE_API_KEYS`: optional price providers

 ### No longer used by the AI trading bot
 - `NVIDIA_REASONING_API_KEY`
+- `MODAL_TOKEN_ID`
+- `MODAL_TOKEN_SECRET`

 ## Main Workflows

 - `.github/workflows/daily_trading_bot.yml`
   - Daily root bot workflow
   - Core + AI orchestration
 - `.github/workflows/ai_trading_smoke.yml`
-  - AI-only smoke test against the trained model endpoint
+  - AI-only smoke test against the remote trained-model endpoint
   - Does not run the core strategy
+- `.github/workflows/deploy_lightning_inference.yml`
+  - Deploys the trained-model inference service to Lightning AI
+  - Leaves the core bot untouched
@@ -89,5 +96,6 @@ python run_ai_trading_smoke.py
 ## Notes

 - The AI bot is remote-only and expects the trained model to be served externally.
-- The current deployment target is Modal CPU.
+- The current deployment target is Lightning AI CPU.
+- The Lightning inference app can either mount a ready adapter directory or download a `tar.gz` / `.zip` archive via `TRAINED_MODEL_ADAPTER_ARCHIVE_URL`.
 - The core bot and AI bot remain logically separate even though they now live in one combined repo.
````

RUNBOOK.md

Lines changed: 8 additions & 4 deletions

````diff
@@ -4,7 +4,8 @@
 - **Python 3.9+**
 - **Dependencies**: `pandas`, `requests`, `pyyaml`, `yfinance`, `python-dotenv`
 - **Email Configuration**: Gmail App Password required in `.env`.
-- **LLM (Optional)**: NVIDIA API key(s) in `.env` for LLM sentiment scoring and AI trade selection.
+- **News Sentiment (Optional)**: NVIDIA API key(s) in `.env` for news sentiment scoring.
+- **AI Trading Bot**: Remote trained-model inference endpoint URL in `.env` or GitHub secrets.

 ## 2. Setup
 1. Move the `trading_bot` folder to your desired location (e.g., home folder).
@@ -16,8 +17,11 @@
 ```
 5. (Optional) NVIDIA keys:
    - `NVIDIA_API_KEY` enables News/LLM sentiment if `news.enabled: true` in `config.yaml`.
-   - `NVIDIA_REASONING_API_KEY` enables the AI strategy trade selection (`ai_trading.enabled: true`).
+   - `NVIDIA_REASONING_API_KEY` is no longer used by the AI trading bot.
    - Do not commit `.env` (it is gitignored).
+6. AI trading endpoint:
+   - `TRAINED_MODEL_INFERENCE_URL` points the AI trading bot at the hosted trained-model service.
+   - `TRAINED_MODEL_API_KEY` optionally protects that endpoint.

 ## 3. Daily Workflow
 The bot is fully automated:
@@ -45,5 +49,5 @@ Note: PineScript translation is a placeholder; set `strategy.type: pine` with a
 - **No Email**: Verify `SENDER_EMAIL` and `SENDER_PASSWORD` in `.env`.
 - **No Data**: Ensure internet connection is active (Wi-Fi check).
 - **AI Strategy Not Trading**:
-  - If the AI LLM call fails, the run continues but new AI entries are blocked (to avoid hallucinated trades).
-  - Check that `NVIDIA_REASONING_API_KEY` is set and the configured model is available.
+  - If the trained-model endpoint call fails, the run continues but new AI entries are blocked.
+  - Check that `TRAINED_MODEL_INFERENCE_URL` is set and the hosted service is healthy.
````
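For local runs, the two endpoint settings from the RUNBOOK's step 6 sit alongside the existing keys in `.env`. A placeholder example — every value below is illustrative, not a real endpoint or secret:

```
# .env (placeholders; copy real values from your deployed service / GitHub secrets)
TRAINED_MODEL_INFERENCE_URL=https://<your-lightning-app-host>
TRAINED_MODEL_API_KEY=<optional-shared-secret>
NVIDIA_API_KEY=<optional-news-sentiment-key>
```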

config.yaml

Lines changed: 1 addition & 0 deletions

```diff
@@ -117,6 +117,7 @@ ai_trading:
   inference_url: ""
   inference_url_env: "TRAINED_MODEL_INFERENCE_URL"
   api_key_env: "TRAINED_MODEL_API_KEY"
+  # The endpoint is expected to be a remote CPU inference service (for example Lightning AI).
   timeout_seconds: 600
   model_name: "quant-trained-trading-model"
```

deploy_lightning_inference.py (new file)

Lines changed: 93 additions & 0 deletions

```python
from __future__ import annotations

import argparse
import json
import os
from pathlib import Path
import sys


ROOT_DIR = Path(__file__).resolve().parent
QP_SRC_DIR = ROOT_DIR / "quant_platform" / "src"
if str(QP_SRC_DIR) not in sys.path:
    sys.path.insert(0, str(QP_SRC_DIR))

from lightning_cloud_utils import (  # noqa: E402
    ensure_auth_env,
    find_app_by_name,
    get_client_and_project,
    json_safe,
    phase_name,
    set_process_env,
)

from lightning_app.runners.runtime import dispatch  # noqa: E402
from lightning_app.runners.runtime_type import RuntimeType  # noqa: E402


ENV_KEYS = (
    "TRAINED_MODEL_BASE_MODEL",
    "TRAINED_MODEL_NAME",
    "TRAINED_MODEL_CPU_THREADS",
    "TRAINED_MODEL_CPU",
    "TRAINED_MODEL_API_KEY",
    "TRAINED_MODEL_ADAPTER_PATH",
    "TRAINED_MODEL_ADAPTER_ARCHIVE_URL",
    "TRAINED_MODEL_ADAPTER_ARCHIVE_TOKEN",
    "TRAINED_MODEL_CACHE_DIR",
    "LIGHTNING_INFERENCE_COMPUTE_NAME",
    "LIGHTNING_INFERENCE_DISK_GB",
    "LIGHTNING_INFERENCE_PORT",
    "TRAINED_MODEL_LOG_LEVEL",
)


def _collect_env() -> dict[str, str]:
    env_vars: dict[str, str] = {}
    for key in ENV_KEYS:
        value = os.getenv(key)
        if value:
            env_vars[key] = value
    return env_vars


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--app-name", default="trading-bot-lightning-inference")
    parser.add_argument("--blocking", action="store_true")
    parser.add_argument("--open-ui", action="store_true")
    args = parser.parse_args()

    auth_env = ensure_auth_env()
    set_process_env(auth_env)
    client, project = get_client_and_project()

    entrypoint = ROOT_DIR / "lightning_trained_model_app.py"
    env_vars = _collect_env()

    dispatch(
        entrypoint,
        RuntimeType.CLOUD,
        start_server=False,
        no_cache=False,
        blocking=args.blocking,
        open_ui=args.open_ui,
        name=args.app_name,
        env_vars=env_vars,
        secrets={},
    )

    latest = find_app_by_name(client, project.project_id, args.app_name)
    payload = {
        "project_id": project.project_id,
        "project_name": project.name,
        "app_name": args.app_name,
        "app_id": getattr(latest, "id", None) if latest else None,
        "phase": phase_name(latest) if latest else None,
        "note": "Copy the Lightning service URL from the app layout once the inference work is running.",
    }
    print(json.dumps(json_safe(payload), indent=2))


if __name__ == "__main__":
    main()
```

lightning_trained_model_app.py (new file)

Lines changed: 52 additions & 0 deletions

```python
from __future__ import annotations

import os
from pathlib import Path

from lightning_app import BuildConfig, CloudCompute, LightningApp, LightningFlow, LightningWork


ROOT_DIR = Path(__file__).resolve().parent
REQUIREMENTS_FILE = ROOT_DIR / "requirements-lightning-inference.txt"
DEFAULT_COMPUTE_NAME = os.getenv("LIGHTNING_INFERENCE_COMPUTE_NAME", "cpu")
DEFAULT_DISK_SIZE_GB = int(os.getenv("LIGHTNING_INFERENCE_DISK_GB", "80") or 80)
DEFAULT_PORT = int(os.getenv("LIGHTNING_INFERENCE_PORT", "8000") or 8000)


class TrainedModelInferenceWork(LightningWork):
    def __init__(self) -> None:
        build_config = BuildConfig(requirements=[str(REQUIREMENTS_FILE.resolve())])
        cloud_compute = CloudCompute(name=DEFAULT_COMPUTE_NAME, disk_size=DEFAULT_DISK_SIZE_GB)
        super().__init__(
            parallel=True,
            port=DEFAULT_PORT,
            raise_exception=False,
            cloud_build_config=build_config,
            cloud_compute=cloud_compute,
        )

    def run(self) -> None:
        import uvicorn

        os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
        uvicorn.run(
            "trained_model_service_runtime:app",
            host="0.0.0.0",
            port=self.port,
            log_level=os.getenv("TRAINED_MODEL_LOG_LEVEL", "info").lower(),
        )


class RootFlow(LightningFlow):
    def __init__(self) -> None:
        super().__init__()
        self.inference = TrainedModelInferenceWork()

    def run(self) -> None:
        self.inference.run()

    def configure_layout(self):
        return [{"name": "trained-model-inference", "content": self.inference.url}]


app = LightningApp(RootFlow())
```
