Skip to content

Commit 51706ec

Browse files
committed
Text-to-SQL: Help agents turn natural language into SQL queries
DataQuery is the little sister of Google's QueryData product.
1 parent 408ec74 commit 51706ec

File tree

10 files changed

+396
-22
lines changed

10 files changed

+396
-22
lines changed

CHANGES.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
- Kinesis: Added `ctk kinesis` CLI group with `list-checkpoints` and
66
`prune-checkpoints` commands for checkpoint table maintenance
77
- Dependencies: Permitted installation of click 8.3
8+
- QueryData: Help agents turn natural language into SQL queries
89

910
## 2026/03/16 v0.0.46
1011
- I/O: API improvements: `ctk {load,save} table` became `ctk {load,save}`

cratedb_toolkit/query/cli.py

Lines changed: 4 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,9 @@
"""CLI entrypoint assembling the query-related subcommands."""

from ..util.app import make_cli
from .convert.cli import convert_query
from .llm.cli import llm_cli
from .mcp.cli import cli as mcp_cli

# Build the "ctk query" command group via the shared CLI factory and
# register each subcommand under its public name.
cli = make_cli()
cli.add_command(convert_query, name="convert")
cli.add_command(llm_cli, name="llm")
cli.add_command(mcp_cli, name="mcp")

cratedb_toolkit/query/llm/__init__.py

Whitespace-only changes.

cratedb_toolkit/query/llm/api.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
"""
Use an LLM to query a database in human language via NLSQLTableQueryEngine.
Example code using LlamaIndex with vanilla Open AI, Azure Open AI, or Ollama.
"""

import dataclasses
import logging
import os
from typing import Optional

import sqlalchemy as sa
from llama_index.core.base.embeddings.base import BaseEmbedding
from llama_index.core.base.response.schema import RESPONSE_TYPE
from llama_index.core.llms import LLM
from llama_index.core.query_engine import NLSQLTableQueryEngine
from llama_index.core.utilities.sql_wrapper import SQLDatabase

from cratedb_toolkit.query.llm.model import DatabaseInfo, ModelInfo
from cratedb_toolkit.query.llm.util import configure_llm

logger = logging.getLogger(__name__)


@dataclasses.dataclass
class DataQuery:
    """
    DataQuery helps agents turn natural language into SQL queries.
    It's the little sister of Google's QueryData product. [1]

    We recommend evaluating the Text-to-SQL interface using the Gemma models if you are
    looking at non-frontier variants that need less resources for inference. However,
    depending on the complexity of your problem, you may also want to use cutting-edge
    models with your provider of choice at the cost of higher resource usage.

    Attention: Any natural language SQL table query engine and Text-to-SQL application
    should be aware that executing arbitrary SQL queries can be a security risk.
    It is recommended to take precautions as needed, such as using restricted roles,
    read-only databases, sandboxing, etc.

    [1] https://cloud.google.com/blog/products/databases/introducing-querydata-for-near-100-percent-accurate-data-agents
    [2] https://github.com/kupp0/multi-db-property-search-data-agents
    """

    # Database connectivity and table-selection settings.
    db: DatabaseInfo
    # LLM provider/model selection and credentials.
    model: ModelInfo
    # Populated by `setup()`; `None` until then.
    query_engine: Optional[NLSQLTableQueryEngine] = None

    def __post_init__(self):
        self.setup()

    def setup(self):
        """
        Establish database connectivity, configure the LLM, and build the query engine.

        Connectivity or model-configuration errors propagate to the caller.
        """
        # Prefer the SQLAlchemy engine handed in via `DatabaseInfo`. Previously a
        # second engine was always created from `CRATEDB_SQLALCHEMY_URL`, silently
        # ignoring the engine the caller supplied; the environment variable now
        # only serves as a fallback.
        logger.info("Connecting to CrateDB")
        engine = self.db.engine or sa.create_engine(os.getenv("CRATEDB_SQLALCHEMY_URL", "crate://"))
        # Probe connectivity eagerly, returning the connection to the pool
        # instead of leaking it as a bare `engine.connect()` would.
        with engine.connect():
            pass

        # Configure model.
        logger.info("Configuring LLM model")
        llm: LLM
        embed_model: BaseEmbedding
        llm, embed_model = configure_llm(self.model)

        # Configure query engine. Forward the schema as well, which was
        # previously collected in `DatabaseInfo` but never used.
        logger.info("Creating query engine")
        sql_database = SQLDatabase(
            engine,
            schema=self.db.schema,
            ignore_tables=self.db.ignore_tables,
            include_tables=self.db.include_tables,
        )
        self.query_engine = NLSQLTableQueryEngine(
            sql_database=sql_database,
            llm=llm,
            embed_model=embed_model,
        )

    def ask(self, question: str) -> RESPONSE_TYPE:
        """Invoke an inquiry to the LLM."""
        if not self.query_engine:
            raise ValueError("Query engine not configured")
        logger.debug("Running query: %s", question)
        return self.query_engine.query(question)

cratedb_toolkit/query/llm/cli.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
import logging
2+
import os
3+
from typing import Optional
4+
5+
import click
6+
from dotenv import load_dotenv
7+
8+
from cratedb_toolkit import DatabaseCluster
9+
from cratedb_toolkit.query.llm.api import DataQuery
10+
from cratedb_toolkit.query.llm.model import DatabaseInfo, ModelInfo, ModelProvider
11+
from cratedb_toolkit.util.common import setup_logging
12+
13+
logger = logging.getLogger(__name__)
14+
15+
16+
# NOTE(review): docstring-only helper; presumably supplies extended help text
# for the `ctk query llm` subcommand — confirm how it is wired into the CLI.
def help_llm():
    """
    Use an LLM to query the database in human language.

    Synopsis
    ========

    export CRATEDB_CLUSTER_URL=crate://localhost/
    ctk query llm "What is the average value for sensor 1?"

    """  # noqa: E501
28+
29+
@click.command()
@click.argument("question")
@click.option("--schema", envvar="CRATEDB_SCHEMA", type=str, required=False, help="Schema where to operate on")
@click.option("--llm-provider", envvar="LLM_PROVIDER", type=str, required=True, help="LLM provider name")
@click.option("--llm-name", envvar="LLM_NAME", type=str, required=False, help="LLM model name for completions")
@click.option(
    "--llm-embedding-name", envvar="LLM_EMBEDDING_NAME", type=str, required=False, help="LLM model name for embeddings"
)
@click.option("--llm-api-key", envvar="LLM_API_KEY", type=str, required=False, help="LLM API key")
@click.pass_context
def llm_cli(
    ctx: click.Context,
    question: str,
    schema: Optional[str],
    llm_provider: str,
    llm_name: Optional[str],
    llm_embedding_name: Optional[str],
    llm_api_key: Optional[str],
):
    """
    Use an LLM to query a database in human language.
    """
    setup_logging()
    load_dotenv()

    # Connect to database.
    dc = DatabaseCluster.from_options(ctx.meta["address"])
    engine = dc.adapter.engine
    # Respect the `--schema` option; click already resolves `CRATEDB_SCHEMA`
    # through `envvar=`. Previously the option value was unconditionally
    # clobbered by `os.getenv("CRATEDB_SCHEMA", "doc")`.
    schema = schema or "doc"

    provider = ModelProvider(llm_provider)

    # Parameter sanity checks and heuristics: pick sensible default models
    # per provider when none were given explicitly.
    if not llm_name:
        if provider in [ModelProvider.OPENAI, ModelProvider.AZURE]:
            llm_name = "gpt-4.1"
        elif provider in [ModelProvider.OLLAMA]:
            llm_name = "gemma3:1b"
        else:
            raise ValueError("LLM completion model not selected")
    if not llm_embedding_name:
        if provider in [ModelProvider.OPENAI, ModelProvider.AZURE]:
            llm_embedding_name = "text-embedding-3-large"
        elif provider in [ModelProvider.OLLAMA]:
            llm_embedding_name = "local"
        else:
            raise ValueError("LLM embedding model not selected")
    if not llm_api_key:
        if provider in [ModelProvider.OPENAI, ModelProvider.AZURE]:
            llm_api_key = os.getenv("OPENAI_API_KEY")

    logger.info("Selected LLM: completion=%s, embedding=%s", llm_name, llm_embedding_name)

    # Submit query.
    dq = DataQuery(
        db=DatabaseInfo(
            engine=engine,
            schema=schema,
        ),
        model=ModelInfo(
            provider=provider,
            completion=llm_name,
            embedding=llm_embedding_name,
            api_key=llm_api_key,
        ),
    )
    response = dq.ask(question)

    logger.info("Query was: %s", question)
    logger.info("Answer was: %s", response)
    logger.info("More (metadata, formatted sources):")
    logger.info(response.get_formatted_sources())
    logger.info(response.metadata)
    return response

cratedb_toolkit/query/llm/model.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import dataclasses
2+
from enum import Enum
3+
from typing import List, Optional
4+
5+
import sqlalchemy as sa
6+
7+
8+
class ModelProvider(Enum):
    """Enumeration of the supported LLM provider backends."""

    OPENAI = "openai"
    AZURE = "azure"
    OLLAMA = "ollama"
14+
15+
16+
@dataclasses.dataclass
class ModelInfo:
    """Information about the model."""

    # Which backend serves the model (OpenAI, Azure, Ollama).
    provider: ModelProvider
    # Model name used for text completions (e.g. "gpt-4.1").
    completion: str
    # Model name used for embeddings (e.g. "text-embedding-3-large").
    embedding: str
    # Base URL of the inference endpoint, when not using the provider default.
    endpoint: Optional[str] = None
    # Deployment/instance identifier; presumably the Azure `engine` — confirm.
    instance: Optional[str] = None
    # API key credential, when the provider requires one.
    api_key: Optional[str] = None
    # Provider API version string, when applicable.
    api_version: Optional[str] = None
27+
28+
29+
@dataclasses.dataclass
class DatabaseInfo:
    """Information about the database."""

    # SQLAlchemy engine providing database connectivity.
    engine: sa.engine.Engine
    # Database schema to operate on; `None` means the server default.
    schema: Optional[str] = None
    # Table names to exclude from the natural-language query engine.
    ignore_tables: Optional[List[str]] = None
    # Table names to restrict the query engine to; `None` means all tables.
    include_tables: Optional[List[str]] = None

cratedb_toolkit/query/llm/util.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
import os
2+
from typing import Tuple
3+
4+
import llama_index.core
5+
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings
6+
from llama_index.core.base.embeddings.base import BaseEmbedding
7+
from llama_index.core.llms import LLM
8+
from llama_index.embeddings.langchain import LangchainEmbedding
9+
from llama_index.llms.azure_openai import AzureOpenAI
10+
from llama_index.llms.ollama import Ollama
11+
from llama_index.llms.openai import OpenAI
12+
13+
from cratedb_toolkit.query.llm.model import ModelInfo, ModelProvider
14+
15+
16+
def configure_llm(info: ModelInfo, debug: bool = False) -> Tuple[LLM, BaseEmbedding]:
    """
    Configure LLM access and model types. Use either vanilla Open AI, Azure Open AI, or Ollama.

    TODO: What about Hugging Face, Runpod, vLLM, and others?

    Notes about text embedding models:

    > The new model, `text-embedding-ada-002`, replaces five separate models for text search,
    > text similarity, and code search, and outperforms our previous most capable model,
    > Davinci, at most tasks, while being priced 99.8% lower.

    - https://openai.com/index/new-and-improved-embedding-model/
    - https://community.openai.com/t/models-embedding-vs-similarity-vs-search-models/291265

    :param info: Provider, model names, endpoint, and credentials to use.
    :param debug: When true, enable LlamaIndex's "simple" global trace handler.
    :return: Tuple of (completion LLM, embedding model).
    :raises ValueError: When the provider or completion model is missing or unknown.
    """

    completion_model = info.completion
    embedding_model = info.embedding or "text-embedding-3-large"

    if not info.provider:
        raise ValueError("LLM model type not defined")
    if not completion_model:
        raise ValueError("LLM model name not defined")

    # https://docs.llamaindex.ai/en/stable/understanding/tracing_and_debugging/tracing_and_debugging/
    if debug:
        llama_index.core.set_global_handler("simple")

    if info.provider is ModelProvider.OPENAI:
        llm = OpenAI(
            model=completion_model,
            temperature=0.0,
            api_key=info.api_key,
            api_version=info.api_version,
        )
    elif info.provider is ModelProvider.AZURE:
        llm = AzureOpenAI(
            model=completion_model,
            temperature=0.0,
            engine=info.instance,
            azure_endpoint=info.endpoint,
            api_key=info.api_key,
            api_version=info.api_version,
        )
    elif info.provider is ModelProvider.OLLAMA:
        # https://docs.llamaindex.ai/en/stable/api_reference/llms/ollama/
        llm = Ollama(
            base_url=info.endpoint or "http://localhost:11434",
            model=completion_model,
            temperature=0.0,
            request_timeout=120.0,
            keep_alive=-1,
        )
    else:
        # Bug fix: the previous `ValueError("...: %s", info.provider)` passed the
        # provider as a second exception argument instead of formatting it into
        # the message.
        raise ValueError(f"LLM model type invalid: {info.provider}")

    if info.provider is ModelProvider.OPENAI:
        embed_model = LangchainEmbedding(OpenAIEmbeddings(model=embedding_model))
    elif info.provider is ModelProvider.AZURE:
        embed_model = LangchainEmbedding(
            AzureOpenAIEmbeddings(
                # Consistency fix: honor `info.endpoint` like the Azure completion
                # branch above, with the environment variable as a fallback.
                azure_endpoint=info.endpoint or os.getenv("OPENAI_AZURE_ENDPOINT"),
                model=embedding_model,
            )
        )
    else:
        # Ollama: "local" tells LlamaIndex to resolve a local embedding model.
        # It is a string marker, not a `BaseEmbedding`, hence the type ignore below.
        embed_model = "local"

    return llm, embed_model  # ty: ignore[invalid-return-type]

doc/query/index.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ expressions: Adapters, converters, migration support tasks, etc.
66
```{toctree}
77
:maxdepth: 2
88
9+
llm/index
910
mcp/index
1011
convert
1112
```

0 commit comments

Comments
 (0)