11from __future__ import annotations
22
33import re
4- from dataclasses import dataclass
4+ from dataclasses import dataclass , field
55from typing import Any
66
77from sqlalchemy import Index , event , text
1313
1414from .errors import (
1515 DuplicateTokenizerAliasError ,
16+ FieldNotIndexedError ,
1617 InvalidArgumentError ,
1718 InvalidBM25FieldError ,
1819 InvalidKeyFieldError ,
@@ -35,7 +36,7 @@ def render(self) -> str:
3536 raise InvalidArgumentError ("tokenizer name is required unless raw_sql is provided" )
3637
3738 if not self .options :
38- return f"pdb.{ self .name } () "
39+ return f"pdb.{ self .name } "
3940
4041 rendered_options = "," .join (f"{ key } ={ _format_option_value (value )} " for key , value in self .options )
4142 escaped = rendered_options .replace ("'" , "''" )
@@ -179,12 +180,15 @@ class IndexMeta:
179180 key_field : str | None
180181 fields : tuple [str , ...]
181182 aliases : dict [str , str ]
183+ tokenizers : dict [str , tuple [str , ...]] = field (default_factory = dict )
184+ """Maps field name to the tokenizer names used in this index, e.g. ``{"description": ("unicode_words",)}``."""
182185
183186
184187_KEY_FIELD_RE = re .compile (r"key_field\s*=\s*'?\"?([^'\",)\s]+)\"?'?" , re .IGNORECASE )
185188_ALIAS_RE = re .compile (r"alias\s*=\s*([A-Za-z_][A-Za-z0-9_]*)" , re .IGNORECASE )
186189_CAST_FIELD_RE = re .compile (r"^\(*\"?([A-Za-z_][A-Za-z0-9_]*)\"?\)*\s*::\s*pdb\." , re .IGNORECASE )
187190_PLAIN_FIELD_RE = re .compile (r'^\(*"?([A-Za-z_][A-Za-z0-9_]*)"?\)*$' )
191+ _TOKENIZER_NAME_RE = re .compile (r"::pdb\.([A-Za-z_][A-Za-z0-9_]*)" , re .IGNORECASE )
188192
189193
190194def _split_top_level_csv (expr : str ) -> list [str ]:
@@ -274,6 +278,13 @@ def _extract_alias(index_expr: str) -> str | None:
274278 return None
275279
276280
281+ def _extract_tokenizer_name (field_expr : str ) -> str | None :
282+ """Return the bare tokenizer name from a field expression, e.g. ``unicode_words`` from
283+ ``(description::pdb.unicode_words('lowercase=true'))``. Returns ``None`` for plain fields."""
284+ match = _TOKENIZER_NAME_RE .search (field_expr )
285+ return match .group (1 ) if match else None
286+
287+
277288def describe (engine : Engine , table ) -> list [IndexMeta ]:
278289 query = text (
279290 """
@@ -293,6 +304,7 @@ def describe(engine: Engine, table) -> list[IndexMeta]:
293304 key_field = _extract_key_field (indexdef )
294305 raw_fields = _extract_bm25_field_list (indexdef )
295306 aliases : dict [str , str ] = {}
307+ tokenizer_map : dict [str , list [str ]] = {}
296308 fields_ordered : list [str ] = []
297309 for raw in raw_fields :
298310 field_name = _extract_field_name (raw )
@@ -303,13 +315,93 @@ def describe(engine: Engine, table) -> list[IndexMeta]:
303315 alias = _extract_alias (raw )
304316 if alias is not None :
305317 aliases [alias ] = field_name
318+ tok = _extract_tokenizer_name (raw )
319+ if tok is not None :
320+ tokenizer_map .setdefault (field_name , []).append (tok )
306321
307322 output .append (
308323 IndexMeta (
309324 index_name = row .indexname ,
310325 key_field = key_field ,
311326 fields = tuple (fields_ordered ),
312327 aliases = aliases ,
328+ tokenizers = {k : tuple (v ) for k , v in tokenizer_map .items ()},
313329 )
314330 )
315331 return output
332+
333+
def assert_indexed(
    engine: Engine,
    column: Any,
    *,
    tokenizer: str | None = None,
) -> None:
    """Raise :exc:`FieldNotIndexedError` if *column* is not covered by any BM25 index.

    Args:
        engine: SQLAlchemy engine connected to the ParadeDB database.
        column: A table-bound column expression (e.g. ``Product.description``).
        tokenizer: Optional tokenizer name to verify, e.g. ``"literal"`` or
            ``"unicode_words"``. When given, raises if the column is not
            indexed with that specific tokenizer.

    Raises:
        InvalidArgumentError: If *column* is not a table-bound, named column.
        FieldNotIndexedError: If no matching BM25 index covers the column.

    Example::

        assert_indexed(engine, Product.category, tokenizer="literal")
    """
    # The column must carry both a table binding and a name; anything else
    # (a bare string, an unbound expression) is a caller error.
    bound_table = getattr(column, "table", None)
    if bound_table is None:
        raise InvalidArgumentError("column must be a table-bound column expression")
    name: str | None = getattr(column, "name", None)
    if name is None:
        raise InvalidArgumentError("column must have a name attribute")

    # Accept the first index that covers the field and, when requested,
    # uses the desired tokenizer for it.
    for meta in describe(engine, bound_table):
        if name in meta.fields:
            if tokenizer is None:
                return
            if tokenizer in meta.tokenizers.get(name, ()):
                return

    message = f"'{name}' is not indexed in any BM25 index on '{bound_table.name}'"
    if tokenizer:
        message += f" with tokenizer '{tokenizer}'"
    raise FieldNotIndexedError(message)
372+
373+
def validate_pushdown(stmt: Any) -> list[str]:
    """Inspect *stmt* for patterns that will not push down to ParadeDB.

    Performs **static AST analysis only** — no database connection is required.

    Args:
        stmt: A SQLAlchemy statement (typically a ``Select``).

    Returns:
        A (possibly empty) list of human-readable warning strings.

    Example::

        issues = validate_pushdown(stmt)
        for w in issues:
            print("Warning:", w)
    """
    # Imported lazily to avoid a circular import at module load time.
    from . import inspect as _inspect

    findings: list[str] = []

    predicate = getattr(stmt, "whereclause", None)
    if predicate is None:
        findings.append(
            "No WHERE clause found; query will perform a full table scan without ParadeDB"
        )
    else:
        if not _inspect.has_paradedb_predicate(predicate):
            findings.append(
                "No ParadeDB predicate found in WHERE clause; query will not use a BM25 index"
            )

    # Top-N pushdown needs ORDER BY and LIMIT together; ORDER BY alone
    # forces a full sort outside the index.
    has_order_by = bool(getattr(stmt, "_order_by_clauses", None) or ())
    if has_order_by and getattr(stmt, "_limit_clause", None) is None:
        findings.append(
            "ORDER BY is present without LIMIT; top-N pushdown to ParadeDB requires both"
        )

    return findings
0 commit comments