11from __future__ import annotations
22
33import re
4- from dataclasses import dataclass
4+ from dataclasses import dataclass , field
55from typing import Any
66
77from sqlalchemy import Index , event , text
1313
1414from .errors import (
1515 DuplicateTokenizerAliasError ,
16+ FieldNotIndexedError ,
1617 InvalidArgumentError ,
1718 InvalidBM25FieldError ,
1819 InvalidKeyFieldError ,
@@ -35,7 +36,7 @@ def render(self) -> str:
3536 raise InvalidArgumentError ("tokenizer name is required unless raw_sql is provided" )
3637
3738 if not self .options :
38- return f"pdb.{ self .name } () "
39+ return f"pdb.{ self .name } "
3940
4041 rendered_options = "," .join (f"{ key } ={ _format_option_value (value )} " for key , value in self .options )
4142 escaped = rendered_options .replace ("'" , "''" )
@@ -179,12 +180,15 @@ class IndexMeta:
179180 key_field : str | None
180181 fields : tuple [str , ...]
181182 aliases : dict [str , str ]
183+ tokenizers : dict [str , tuple [str , ...]] = field (default_factory = dict )
184+ """Maps field name to the tokenizer names used in this index, e.g. ``{"description": ("unicode_words",)}``."""
182185
183186
184187_KEY_FIELD_RE = re .compile (r"key_field\s*=\s*'?\"?([^'\",)\s]+)\"?'?" , re .IGNORECASE )
185188_ALIAS_RE = re .compile (r"alias\s*=\s*([A-Za-z_][A-Za-z0-9_]*)" , re .IGNORECASE )
186189_CAST_FIELD_RE = re .compile (r"^\(*\"?([A-Za-z_][A-Za-z0-9_]*)\"?\)*\s*::\s*pdb\." , re .IGNORECASE )
187190_PLAIN_FIELD_RE = re .compile (r'^\(*"?([A-Za-z_][A-Za-z0-9_]*)"?\)*$' )
191+ _TOKENIZER_NAME_RE = re .compile (r"::pdb\.([A-Za-z_][A-Za-z0-9_]*)" , re .IGNORECASE )
188192
189193
190194def _split_top_level_csv (expr : str ) -> list [str ]:
@@ -274,6 +278,13 @@ def _extract_alias(index_expr: str) -> str | None:
274278 return None
275279
276280
281+ def _extract_tokenizer_name (field_expr : str ) -> str | None :
282+ """Return the bare tokenizer name from a field expression, e.g. ``unicode_words`` from
283+ ``(description::pdb.unicode_words('lowercase=true'))``. Returns ``None`` for plain fields."""
284+ match = _TOKENIZER_NAME_RE .search (field_expr )
285+ return match .group (1 ) if match else None
286+
287+
277288def describe (engine : Engine , table ) -> list [IndexMeta ]:
278289 query = text (
279290 """
@@ -293,6 +304,7 @@ def describe(engine: Engine, table) -> list[IndexMeta]:
293304 key_field = _extract_key_field (indexdef )
294305 raw_fields = _extract_bm25_field_list (indexdef )
295306 aliases : dict [str , str ] = {}
307+ tokenizer_map : dict [str , list [str ]] = {}
296308 fields_ordered : list [str ] = []
297309 for raw in raw_fields :
298310 field_name = _extract_field_name (raw )
@@ -303,13 +315,93 @@ def describe(engine: Engine, table) -> list[IndexMeta]:
303315 alias = _extract_alias (raw )
304316 if alias is not None :
305317 aliases [alias ] = field_name
318+ tok = _extract_tokenizer_name (raw )
319+ if tok is not None :
320+ tokenizer_map .setdefault (field_name , []).append (tok )
306321
307322 output .append (
308323 IndexMeta (
309324 index_name = row .indexname ,
310325 key_field = key_field ,
311326 fields = tuple (fields_ordered ),
312327 aliases = aliases ,
328+ tokenizers = {k : tuple (v ) for k , v in tokenizer_map .items ()},
313329 )
314330 )
315331 return output
332+
333+
def assert_indexed(
    engine: Engine,
    column: Any,
    *,
    tokenizer: str | None = None,
) -> None:
    """Raise :exc:`FieldNotIndexedError` if *column* is not covered by any BM25 index.

    Args:
        engine: SQLAlchemy engine connected to the ParadeDB database.
        column: A table-bound column expression (e.g. ``Product.description``).
        tokenizer: Optional tokenizer name to verify, e.g. ``"literal"`` or
            ``"unicode_words"``. When given, raises if the column is not
            indexed with that specific tokenizer.

    Raises:
        InvalidArgumentError: If *column* is not a table-bound, named column.
        FieldNotIndexedError: If no matching BM25 index covers the column.

    Example::

        assert_indexed(engine, Product.category, tokenizer="literal")
    """
    # The column must carry both a table binding and a name; anything else
    # (a bare string, an unbound expression) is a caller error.
    bound_table = getattr(column, "table", None)
    if bound_table is None:
        raise InvalidArgumentError("column must be a table-bound column expression")
    name: str | None = getattr(column, "name", None)
    if name is None:
        raise InvalidArgumentError("column must have a name attribute")

    # Accept the first index that covers the field and, when requested,
    # uses the desired tokenizer for it.
    for meta in describe(engine, bound_table):
        if name in meta.fields:
            if tokenizer is None:
                return
            if tokenizer in meta.tokenizers.get(name, ()):
                return

    message = f"'{name}' is not indexed in any BM25 index on '{bound_table.name}'"
    if tokenizer:
        message += f" with tokenizer '{tokenizer}'"
    raise FieldNotIndexedError(message)
372+
373+
def validate_pushdown(stmt: Any) -> list[str]:
    """Inspect *stmt* for patterns that will not push down to ParadeDB.

    Performs **static AST analysis only** — no database connection is required.

    Args:
        stmt: A SQLAlchemy statement (typically a ``Select``).

    Returns:
        A (possibly empty) list of human-readable warning strings.

    Example::

        issues = validate_pushdown(stmt)
        for w in issues:
            print("Warning:", w)
    """
    # Imported lazily to avoid a circular import at module load time.
    from . import inspect as _inspect

    findings: list[str] = []

    predicate = getattr(stmt, "whereclause", None)
    if predicate is None:
        findings.append(
            "No WHERE clause found; query will perform a full table scan without ParadeDB"
        )
    else:
        if not _inspect.has_paradedb_predicate(predicate):
            findings.append(
                "No ParadeDB predicate found in WHERE clause; query will not use a BM25 index"
            )

    # Top-N pushdown needs ORDER BY and LIMIT together; ORDER BY alone
    # forces a full sort outside the index.
    has_order_by = bool(getattr(stmt, "_order_by_clauses", None) or ())
    if has_order_by and getattr(stmt, "_limit_clause", None) is None:
        findings.append(
            "ORDER BY is present without LIMIT; top-N pushdown to ParadeDB requires both"
        )

    return findings
0 commit comments