Skip to content

Commit 8582f52

Browse files
Darth-Hidious and claude committed
feat: bundle Firecrawl as managed Docker service for web search
Firecrawl (open source, ghcr.io/mendableai/firecrawl) is now a managed service like Neo4j/Qdrant — it starts automatically with `prism node up`. Web tools auto-detect the local instance at localhost:3002; no API key is needed.

- Add FirecrawlConfig to ServiceConfig (port 3002, enabled by default)
- Add start_firecrawl() to DockerOrchestrator
- web.py: auto-detect local Firecrawl → cloud API → DDG fallback chain
- pyproject.toml: firecrawl-py + duckduckgo-search + beautifulsoup4

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 7ecc4b6 commit 8582f52

4 files changed

Lines changed: 118 additions & 8 deletions

File tree

app/tools/web.py

Lines changed: 46 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -13,9 +13,42 @@
1313

1414
logger = logging.getLogger(__name__)
1515

16-
# Firecrawl config — works with local self-hosted instance OR the API
16+
# Firecrawl config — prefers local self-hosted instance (bundled with PRISM),
17+
# falls back to cloud API if key is set.
1718
FIRECRAWL_KEY = os.environ.get("FIRECRAWL_API_KEY", "")
18-
FIRECRAWL_URL = os.environ.get("FIRECRAWL_API_URL", "https://api.firecrawl.dev/v1")
19+
FIRECRAWL_LOCAL_URL = os.environ.get("FIRECRAWL_LOCAL_URL", "http://localhost:3002")
20+
FIRECRAWL_API_URL = os.environ.get("FIRECRAWL_API_URL", "https://api.firecrawl.dev/v1")
21+
22+
23+
def _firecrawl_available() -> bool:
24+
"""Check if local Firecrawl instance is running."""
25+
try:
26+
import httpx
27+
28+
r = httpx.get(f"{FIRECRAWL_LOCAL_URL}/", timeout=2)
29+
return r.status_code < 500
30+
except Exception:
31+
return False
32+
33+
34+
# Cache the check at import time so we don't hit it on every call
35+
_LOCAL_FIRECRAWL = _firecrawl_available()
36+
37+
# Resolve which Firecrawl URL and key to use
38+
if _LOCAL_FIRECRAWL:
39+
# Local instance — no API key needed
40+
FIRECRAWL_URL = FIRECRAWL_LOCAL_URL
41+
FIRECRAWL_ACTIVE = True
42+
logger.info(f"Firecrawl: using local instance at {FIRECRAWL_URL}")
43+
elif FIRECRAWL_KEY:
44+
# Cloud API with key
45+
FIRECRAWL_URL = FIRECRAWL_API_URL
46+
FIRECRAWL_ACTIVE = True
47+
logger.info("Firecrawl: using cloud API")
48+
else:
49+
FIRECRAWL_URL = ""
50+
FIRECRAWL_ACTIVE = False
51+
logger.info("Firecrawl: not available, using DuckDuckGo fallback")
1952

2053

2154
def _web_read(**kwargs) -> dict:
@@ -29,11 +62,14 @@ def _web_read(**kwargs) -> dict:
2962
return {"error": "url is required"}
3063

3164
# Try Firecrawl first (best quality — handles JS, returns markdown)
32-
if FIRECRAWL_KEY:
65+
if FIRECRAWL_ACTIVE:
3366
try:
3467
from firecrawl import FirecrawlApp
3568

36-
app = FirecrawlApp(api_key=FIRECRAWL_KEY, api_url=FIRECRAWL_URL)
69+
app = FirecrawlApp(
70+
api_key=FIRECRAWL_KEY or "local",
71+
api_url=FIRECRAWL_URL,
72+
)
3773
result = app.scrape_url(url, params={"formats": ["markdown"]})
3874
content = result.get("markdown", "") if isinstance(result, dict) else str(result)
3975
title = ""
@@ -106,12 +142,15 @@ def _web_search(**kwargs) -> dict:
106142
if not query:
107143
return {"error": "query is required"}
108144

109-
# Try Firecrawl search first (if key is set)
110-
if FIRECRAWL_KEY:
145+
# Try Firecrawl search first (local or cloud)
146+
if FIRECRAWL_ACTIVE:
111147
try:
112148
from firecrawl import FirecrawlApp
113149

114-
app = FirecrawlApp(api_key=FIRECRAWL_KEY, api_url=FIRECRAWL_URL)
150+
app = FirecrawlApp(
151+
api_key=FIRECRAWL_KEY or "local",
152+
api_url=FIRECRAWL_URL,
153+
)
115154
results = app.search(query, params={"limit": limit})
116155
items = results if isinstance(results, list) else results.get("data", [])
117156
return {

crates/cli/src/main.rs

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1156,7 +1156,8 @@ async fn main() -> Result<()> {
11561156
let wants_managed_services = svc_config.neo4j.is_some()
11571157
|| svc_config.vector_db.is_some()
11581158
|| svc_config.kafka.is_some()
1159-
|| svc_config.spark.is_some();
1159+
|| svc_config.spark.is_some()
1160+
|| svc_config.firecrawl.is_some();
11601161

11611162
if wants_managed_services {
11621163
println!("\n PRISM v{}", env!("CARGO_PKG_VERSION"));

crates/orch/src/docker.rs

Lines changed: 50 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -350,6 +350,49 @@ impl DockerOrchestrator {
350350
})
351351
}
352352

353+
/// Start Firecrawl (open-source web scraper) and return a ServiceHandle.
354+
async fn start_firecrawl(
355+
&self,
356+
config: &crate::services::FirecrawlConfig,
357+
) -> Result<ServiceHandle> {
358+
self.ensure_image(&config.image).await?;
359+
360+
let env = vec![
361+
// Firecrawl minimal config — no external dependencies needed
362+
format!("PORT={}", config.port),
363+
"HOST=0.0.0.0".to_string(),
364+
"NUM_WORKERS_PER_QUEUE=2".to_string(),
365+
];
366+
367+
let mut port_bindings = HashMap::new();
368+
port_bindings.insert(
369+
format!("{}/tcp", config.port),
370+
Some(vec![PortBinding {
371+
host_ip: Some("127.0.0.1".to_string()),
372+
host_port: Some(config.port.to_string()),
373+
}]),
374+
);
375+
376+
let container_id = self
377+
.run_container("firecrawl", &config.image, env, None, port_bindings, None)
378+
.await?;
379+
380+
Ok(ServiceHandle {
381+
name: "firecrawl".to_string(),
382+
container_id: Some(container_id),
383+
port: config.port,
384+
healthy: false,
385+
})
386+
}
387+
388+
/// Restart Firecrawl (public, used by health monitor).
389+
pub async fn start_firecrawl_public(
390+
&self,
391+
config: &crate::services::FirecrawlConfig,
392+
) -> Result<ServiceHandle> {
393+
self.start_firecrawl(config).await
394+
}
395+
353396
/// Restart Spark (public, used by health monitor).
354397
pub async fn start_spark_public(
355398
&self,
@@ -479,6 +522,13 @@ impl ServiceOrchestrator for DockerOrchestrator {
479522
services.push(spark);
480523
}
481524

525+
// Firecrawl — open-source web scraping (enabled by default)
526+
if let Some(ref firecrawl_cfg) = config.firecrawl {
527+
info!("Starting Firecrawl...");
528+
let firecrawl = self.start_firecrawl(firecrawl_cfg).await?;
529+
services.push(firecrawl);
530+
}
531+
482532
// Wait for services to become healthy
483533
let checker = HealthChecker::new();
484534
for handle in &mut services {

crates/orch/src/services.rs

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@ pub struct ServiceConfig {
66
pub vector_db: Option<VectorDbConfig>,
77
pub kafka: Option<KafkaConfig>,
88
pub spark: Option<SparkConfig>,
9+
pub firecrawl: Option<FirecrawlConfig>,
910
}
1011

1112
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -73,13 +74,32 @@ impl Default for SparkConfig {
7374
}
7475
}
7576

77+
/// Firecrawl — open-source web scraping & search engine.
78+
/// Docker image: `ghcr.io/mendableai/firecrawl`
79+
/// Default port: 3002 (API)
80+
#[derive(Debug, Clone, Serialize, Deserialize)]
81+
pub struct FirecrawlConfig {
82+
pub image: String,
83+
pub port: u16,
84+
}
85+
86+
impl Default for FirecrawlConfig {
87+
fn default() -> Self {
88+
Self {
89+
image: "ghcr.io/mendableai/firecrawl:latest".to_string(),
90+
port: 3002,
91+
}
92+
}
93+
}
94+
7695
impl Default for ServiceConfig {
7796
fn default() -> Self {
7897
Self {
7998
neo4j: Some(Neo4jConfig::default()),
8099
vector_db: Some(VectorDbConfig::default()),
81100
kafka: None,
82101
spark: None,
102+
firecrawl: Some(FirecrawlConfig::default()),
83103
}
84104
}
85105
}

0 commit comments

Comments
 (0)