Skip to content
This repository was archived by the owner on May 14, 2026. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ optional = true
[tool.poetry.group.community.dependencies]
wolframalpha = "^5.0.0"
torch = "^2.3.0"
logosdb = ">=0.7.3"

[tool.poetry.group.local-model]
optional = true
Expand Down
22 changes: 22 additions & 0 deletions src/backend/config/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,25 @@ class WolframAlphaSettings(BaseSettings, BaseModel):
)


class LogosDBSettings(BaseSettings, BaseModel):
    """Configuration for the LogosDB retriever tool.

    Each field accepts either its upper-case alias (for example ``LOGOSDB_PATH``)
    or the plain field name, as listed in its ``validation_alias``.
    """

    model_config = SETTINGS_CONFIG
    # Index root directory; unset (None) by default.
    path: Optional[str] = Field(
        default=None,
        validation_alias=AliasChoices("LOGOSDB_PATH", "path"),
        description="Root directory for the LogosDB index (e.g. /data/logosdb).",
    )
    # Name of the Cohere embedding model used for both indexing and querying.
    embed_model: str = Field(
        default="embed-english-v3.0",
        validation_alias=AliasChoices("LOGOSDB_EMBED_MODEL", "embed_model"),
        description="Cohere embedding model used for indexing and querying.",
    )
    # Collection name used when none is given explicitly.
    namespace: str = Field(
        default="default",
        validation_alias=AliasChoices("LOGOSDB_NAMESPACE", "namespace"),
        description="Default collection name within the LogosDB index.",
    )


class GDriveSettings(BaseSettings, BaseModel):
model_config = SETTINGS_CONFIG
client_id: Optional[str] = Field(
Expand Down Expand Up @@ -296,6 +315,9 @@ class ToolSettings(BaseSettings, BaseModel):
wolfram_alpha: Optional[WolframAlphaSettings] = Field(
default=WolframAlphaSettings()
)
logosdb: Optional[LogosDBSettings] = Field(
default=LogosDBSettings()
)
google_drive: Optional[GDriveSettings] = Field(default=GDriveSettings())
tavily_web_search: Optional[TavilyWebSearchSettings] = Field(
default=TavilyWebSearchSettings()
Expand Down
2 changes: 2 additions & 0 deletions src/community/config/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
ClinicalTrials,
ConnectorRetriever,
LlamaIndexUploadPDFRetriever,
LogosDBRetriever,
PubMedRetriever,
WolframAlpha,
)
Expand All @@ -18,6 +19,7 @@ class CommunityTool(Enum):
File_Upload_LlamaIndex = LlamaIndexUploadPDFRetriever
Wolfram_Alpha = WolframAlpha
ClinicalTrials = ClinicalTrials
LogosDB = LogosDBRetriever


def get_community_tools() -> dict[str, ToolDefinition]:
Expand Down
2 changes: 2 additions & 0 deletions src/community/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from community.tools.clinicaltrials import ClinicalTrials
from community.tools.connector import ConnectorRetriever
from community.tools.llama_index import LlamaIndexUploadPDFRetriever
from community.tools.logosdb import LogosDBRetriever
from community.tools.pub_med import PubMedRetriever
from community.tools.wolfram import WolframAlpha

Expand All @@ -12,4 +13,5 @@
"ConnectorRetriever",
"LlamaIndexUploadPDFRetriever",
"PubMedRetriever",
"LogosDBRetriever",
]
201 changes: 201 additions & 0 deletions src/community/tools/logosdb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
"""LogosDB semantic retriever for Cohere Toolkit.

Uses Cohere embed API to index and query a local LogosDB HNSW vector store.
No external infrastructure required — the index lives in a directory on disk.

Configuration (environment variables or configuration.yaml):

LOGOSDB_PATH Root directory for the index (must be set for the tool to be reported as available; a directly constructed retriever falls back to /tmp/logosdb)
COHERE_API_KEY Cohere API key (already required by the toolkit)
LOGOSDB_EMBED_MODEL Cohere embedding model (default: embed-english-v3.0)
LOGOSDB_NAMESPACE Default collection/namespace (default: default)

Install the extra dependency::

poetry add logosdb --group community
"""

from __future__ import annotations

import logging
import os
from typing import Any, Dict, List, Optional

import cohere

from backend.config.settings import Settings
from backend.schemas.tool import ToolCategory, ToolDefinition
from backend.tools.base import BaseTool

logger = logging.getLogger(__name__)

# numpy and logosdb are optional dependencies. Record whether they imported so
# the tool can report itself as unavailable instead of failing at import time.
try:
    import numpy as np
    from logosdb import DB, DIST_COSINE

    _LOGOSDB_AVAILABLE = True
except ImportError:
    _LOGOSDB_AVAILABLE = False


# Looked up once at import time. The retriever class treats a falsy value as
# "no LogosDB settings" and falls back to its built-in defaults.
_settings = Settings().get("tools.logosdb")


class LogosDBRetriever(BaseTool):
    """Semantic retriever backed by a local LogosDB HNSW vector index.

    Texts are embedded with the Cohere Embed API and stored in a directory on
    disk (see :meth:`index`). :meth:`call` embeds the query the same way and
    returns the nearest stored texts. The vector index itself is local, but
    every query and every indexing run still issues a Cohere API request.
    """

    ID = "logosdb"

    # Settings resolved once at class load time.
    _logosdb_path: Optional[str] = _settings.path if _settings else None
    _embed_model: str = (
        _settings.embed_model if _settings else "embed-english-v3.0"
    )
    _namespace: str = _settings.namespace if _settings else "default"
    _cohere_api_key: Optional[str] = Settings().get("cohere_platform.api_key")

    # Fallback embedding dimension (embed-english-v3.0 / embed-multilingual-v3.0),
    # used for any model not listed in _MODEL_DIMS.
    _DIM = 1024

    # Output dimensions of the Cohere v3 embedding models. The index dimension
    # must match the configured model, otherwise inserts and searches fail.
    _MODEL_DIMS: Dict[str, int] = {
        "embed-english-v3.0": 1024,
        "embed-multilingual-v3.0": 1024,
        "embed-english-light-v3.0": 384,
        "embed-multilingual-light-v3.0": 384,
    }

    # Index root used when the retriever is constructed without a configured path.
    _DEFAULT_ROOT = "/tmp/logosdb"

    # Number of hits returned when the caller gives no usable ``top_k``.
    _DEFAULT_TOP_K = 5

    def __init__(self) -> None:
        """Create the Cohere client and open (or create) the on-disk index.

        When the optional ``logosdb``/``numpy`` imports failed, no client or
        index is created; :meth:`call` and :meth:`index` report that condition.
        """
        # Always define the attributes so a half-initialised instance fails
        # with a clear message rather than an AttributeError.
        self._co = None
        self._db = None
        if not _LOGOSDB_AVAILABLE:
            return

        self._co = cohere.Client(api_key=self._cohere_api_key)
        root = self._logosdb_path or self._DEFAULT_ROOT
        col_path = os.path.join(root, self._namespace)
        os.makedirs(col_path, exist_ok=True)
        self._db = DB(
            path=col_path,
            dim=self._MODEL_DIMS.get(self._embed_model, self._DIM),
            max_elements=1_000_000,
            ef_construction=200,
            M=16,
            ef_search=50,
            distance=DIST_COSINE,
        )

    # ── Availability ──────────────────────────────────────────────────────

    @classmethod
    def is_available(cls) -> bool:
        """Return True when the dependency, API key and index path are all set.

        The index path must be configured explicitly; the ``/tmp/logosdb``
        fallback in ``__init__`` does not make the tool available.
        """
        return (
            _LOGOSDB_AVAILABLE
            and cls._cohere_api_key is not None
            and cls._logosdb_path is not None
        )

    # ── Tool definition ───────────────────────────────────────────────────

    @classmethod
    def get_tool_definition(cls) -> ToolDefinition:
        """Describe the tool, its parameters and its current availability."""
        return ToolDefinition(
            name=cls.ID,
            display_name="LogosDB Semantic Search",
            implementation=cls,
            parameter_definitions={
                "query": {
                    "description": (
                        "Natural-language query to search the local semantic index."
                    ),
                    "type": "str",
                    "required": True,
                },
                "top_k": {
                    "description": "Maximum number of results to return (default 5).",
                    "type": "int",
                    "required": False,
                },
            },
            is_visible=True,
            is_available=cls.is_available(),
            error_message=cls.generate_error_message(),
            category=ToolCategory.DataLoader,
            description=(
                "Semantic search over a local HNSW vector index powered by LogosDB. "
                "Documents are embedded with Cohere's Embed API and retrieved by "
                "cosine similarity — no external database required."
            ),
        )

    # ── Query ─────────────────────────────────────────────────────────────

    @classmethod
    def _coerce_top_k(cls, value: Any) -> int:
        """Return *value* as a positive int, or the default when it is unusable."""
        try:
            top_k = int(value)
        except (TypeError, ValueError, OverflowError):
            # Missing, null, non-numeric or infinite input.
            return cls._DEFAULT_TOP_K
        return top_k if top_k >= 1 else cls._DEFAULT_TOP_K

    async def call(self, parameters: dict, **kwargs: Any) -> List[Dict[str, Any]]:
        """Embed ``parameters["query"]`` and return its nearest indexed texts.

        Args:
            parameters: Tool arguments. ``query`` is the search text; ``top_k``
                optionally limits the number of hits (default 5, and anything
                that is not a positive integer falls back to the default).
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            A list of ``{"text", "score", "url"}`` dicts, or the tool's error
            payload when the dependency is missing, the query is empty, nothing
            matches, or embedding/search fails.
        """
        if not _LOGOSDB_AVAILABLE:
            return self.get_tool_error(
                details="logosdb is not installed. Run: poetry add logosdb --group community"
            )

        # A model may send an explicit null or a non-string value; normalise
        # it instead of crashing on ``.strip()``.
        query = str(parameters.get("query") or "").strip()
        if not query:
            return self.get_no_results_error()

        top_k = self._coerce_top_k(parameters.get("top_k"))

        # NOTE(review): the embed and search calls below are synchronous and
        # run on the event loop — confirm this is acceptable for the caller.
        try:
            resp = self._co.embed(
                texts=[query],
                model=self._embed_model,
                input_type="search_query",
                embedding_types=["float"],
            )
            qvec = np.asarray(resp.embeddings.float[0], dtype=np.float32)
        except Exception as exc:
            # Tool boundary: report the failure to the caller, keep the trace.
            logger.exception("LogosDB: Cohere embed request failed")
            return self.get_tool_error(details=f"Cohere embed error: {exc}")

        try:
            hits = self._db.search(qvec, top_k=top_k)
        except Exception as exc:
            logger.exception("LogosDB: index search failed")
            return self.get_tool_error(details=f"LogosDB search error: {exc}")

        if not hits:
            return self.get_no_results_error()

        return [
            {
                "text": h.text or f"[vector id={h.id}]",
                "score": float(h.score),
                # index() stores the source URL in the record's timestamp field.
                "url": h.timestamp or "",
            }
            for h in hits
        ]

    # ── Index helper (callable from outside, e.g. a setup script) ────────

    def index(
        self,
        texts: List[str],
        urls: Optional[List[str]] = None,
    ) -> int:
        """Embed and store *texts* in the local index.

        Args:
            texts: Documents to embed and store.
            urls: Optional source URLs, one per text, stored as the timestamp
                field. Omitted or empty means no URL for any text.

        Returns:
            Number of vectors inserted.

        Raises:
            RuntimeError: If logosdb is not installed.
            ValueError: If *urls* is given but its length differs from *texts*.
        """
        if not _LOGOSDB_AVAILABLE:
            raise RuntimeError("logosdb is not installed.")

        texts = list(texts)
        if not texts:
            # Nothing to embed; avoid a pointless API request.
            return 0

        if not urls:
            urls = [""] * len(texts)
        elif len(urls) != len(texts):
            # zip() would silently drop the unmatched tail and misreport the count.
            raise ValueError(
                f"urls has {len(urls)} entries but texts has {len(texts)}"
            )

        resp = self._co.embed(
            texts=texts,
            model=self._embed_model,
            input_type="search_document",
            embedding_types=["float"],
        )

        inserted = 0
        for text, url, embedding in zip(texts, urls, resp.embeddings.float):
            vec = np.asarray(embedding, dtype=np.float32)
            self._db.put(vec, text=text, timestamp=url)
            inserted += 1

        # Count what was actually written, not what was requested.
        return inserted