paradedb
diff --git a/paradedb/sqlalchemy/alembic.py b/paradedb/sqlalchemy/alembic.py
Lines changed: 124 additions & 37 deletions
diff --git a/paradedb/sqlalchemy/facets.py b/paradedb/sqlalchemy/facets.py
Lines changed: 3 additions & 3 deletions
@@ -1,10 +1,14 @@
 from __future__ import annotations
 
+import re
+
 from alembic.autogenerate import comparators, renderers
 from alembic.operations import Operations
 from alembic.operations.ops import MigrateOperation
 from alembic.util import DispatchPriority, PriorityDispatchResult
+from sqlalchemy.dialects import postgresql
 from sqlalchemy import text
+from sqlalchemy.sql.elements import ClauseElement
 
 
 def _quote_ident(name: str) -> str:
@@ -15,100 +19,150 @@ def _quote_literal(value: str) -> str:
     return "'" + value.replace("'", "''") + "'"
 
 
+def _quote_qualified(schema: str | None, name: str) -> str:
+    """Return *name* as a quoted identifier, schema-qualified when *schema* is truthy."""
+    quoted_name = _quote_ident(name)
+    if not schema:
+        # None and "" both mean "no qualifier".
+        return quoted_name
+    return f"{_quote_ident(schema)}.{quoted_name}"
+
+
 @Operations.register_operation("create_bm25_index")
 class CreateBM25IndexOp(MigrateOperation):
-    def __init__(self, index_name: str, table_name: str, fields: list[str], key_field: str) -> None:
+    def __init__(
+        self,
+        index_name: str,
+        table_name: str,
+        expressions: list[str],
+        key_field: str,
+        *,
+        table_schema: str | None = None,
+        index_schema: str | None = None,
+    ) -> None:
         self.index_name = index_name
         self.table_name = table_name
-        self.fields = fields
+        self.expressions = expressions
         self.key_field = key_field
+        self.table_schema = table_schema
+        self.index_schema = index_schema
 
     @classmethod
     def create_bm25_index(
         cls,
         operations: Operations,
         index_name: str,
         table_name: str,
-        fields: list[str],
+        expressions: list[str],
         *,
         key_field: str,
+        table_schema: str | None = None,
+        index_schema: str | None = None,
     ) -> MigrateOperation:
-        return operations.invoke(cls(index_name, table_name, fields, key_field))
+        return operations.invoke(
+            cls(
+                index_name,
+                table_name,
+                expressions,
+                key_field,
+                table_schema=table_schema,
+                index_schema=index_schema,
+            )
+        )
 
 
 @Operations.implementation_for(CreateBM25IndexOp)
 def _create_bm25_index_impl(operations: Operations, operation: CreateBM25IndexOp) -> None:
-    fields_sql = ", ".join(_quote_ident(field) for field in operation.fields)
+    expressions_sql = ", ".join(operation.expressions)
     sql = (
-        f"CREATE INDEX {_quote_ident(operation.index_name)} ON {_quote_ident(operation.table_name)} "
-        f"USING bm25 ({fields_sql}) WITH (key_field={_quote_literal(operation.key_field)})"
+        f"CREATE INDEX {_quote_ident(operation.index_name)} "
+        f"ON {_quote_qualified(operation.table_schema, operation.table_name)} "
+        f"USING bm25 ({expressions_sql}) WITH (key_field={_quote_literal(operation.key_field)})"
     )
     operations.execute(sql)
 
 
 @renderers.dispatch_for(CreateBM25IndexOp)
 def _render_create_bm25_index_op(autogen_context, op: CreateBM25IndexOp) -> str:
-    return (
-        f"op.create_bm25_index({op.index_name!r}, {op.table_name!r}, {op.fields!r}, "
-        f"key_field={op.key_field!r})"
-    )
+    parts = [
+        repr(op.index_name),
+        repr(op.table_name),
+        repr(op.expressions),
+        f"key_field={op.key_field!r}",
+    ]
+    if op.table_schema is not None:
+        parts.append(f"table_schema={op.table_schema!r}")
+    if op.index_schema is not None:
+        parts.append(f"index_schema={op.index_schema!r}")
+    return f"op.create_bm25_index({', '.join(parts)})"
 
 
 @Operations.register_operation("drop_bm25_index")
 class DropBM25IndexOp(MigrateOperation):
-    def __init__(self, index_name: str, if_exists: bool = True) -> None:
+    def __init__(self, index_name: str, if_exists: bool = True, schema: str | None = None) -> None:
         self.index_name = index_name
         self.if_exists = if_exists
+        self.schema = schema
 
     @classmethod
-    def drop_bm25_index(cls, operations: Operations, index_name: str, if_exists: bool = True) -> MigrateOperation:
-        return operations.invoke(cls(index_name=index_name, if_exists=if_exists))
+    def drop_bm25_index(
+        cls, operations: Operations, index_name: str, if_exists: bool = True, schema: str | None = None
+    ) -> MigrateOperation:
+        return operations.invoke(cls(index_name=index_name, if_exists=if_exists, schema=schema))
 
 
 @Operations.implementation_for(DropBM25IndexOp)
 def _drop_bm25_index_impl(operations: Operations, operation: DropBM25IndexOp) -> None:
     if_exists_sql = " IF EXISTS" if operation.if_exists else ""
-    operations.execute(f"DROP INDEX{if_exists_sql} {_quote_ident(operation.index_name)}")
+    operations.execute(f"DROP INDEX{if_exists_sql} {_quote_qualified(operation.schema, operation.index_name)}")
 
 
 @renderers.dispatch_for(DropBM25IndexOp)
 def _render_drop_bm25_index_op(autogen_context, op: DropBM25IndexOp) -> str:
-    return f"op.drop_bm25_index({op.index_name!r}, if_exists={op.if_exists!r})"
+    parts = [repr(op.index_name), f"if_exists={op.if_exists!r}"]
+    if op.schema is not None:
+        parts.append(f"schema={op.schema!r}")
+    return f"op.drop_bm25_index({', '.join(parts)})"
 
 
 @Operations.register_operation("reindex_bm25")
 class ReindexBM25Op(MigrateOperation):
-    def __init__(self, index_name: str, concurrently: bool = False) -> None:
+    def __init__(self, index_name: str, concurrently: bool = False, schema: str | None = None) -> None:
         self.index_name = index_name
         self.concurrently = concurrently
+        self.schema = schema
 
     @classmethod
-    def reindex_bm25(cls, operations: Operations, index_name: str, concurrently: bool = False) -> MigrateOperation:
-        return operations.invoke(cls(index_name=index_name, concurrently=concurrently))
+    def reindex_bm25(
+        cls, operations: Operations, index_name: str, concurrently: bool = False, schema: str | None = None
+    ) -> MigrateOperation:
+        return operations.invoke(cls(index_name=index_name, concurrently=concurrently, schema=schema))
 
 
 @Operations.implementation_for(ReindexBM25Op)
 def _reindex_bm25_impl(operations: Operations, operation: ReindexBM25Op) -> None:
     concurrently_sql = " CONCURRENTLY" if operation.concurrently else ""
-    operations.execute(f"REINDEX INDEX{concurrently_sql} {_quote_ident(operation.index_name)}")
+    operations.execute(f"REINDEX INDEX{concurrently_sql} {_quote_qualified(operation.schema, operation.index_name)}")
 
 
 @renderers.dispatch_for(ReindexBM25Op)
 def _render_reindex_bm25_op(autogen_context, op: ReindexBM25Op) -> str:
-    return f"op.reindex_bm25({op.index_name!r}, concurrently={op.concurrently!r})"
+    parts = [repr(op.index_name), f"concurrently={op.concurrently!r}"]
+    if op.schema is not None:
+        parts.append(f"schema={op.schema!r}")
+    return f"op.reindex_bm25({', '.join(parts)})"
 
 
 # ---------------------------------------------------------------------------
 # Autogenerate comparator
 # ---------------------------------------------------------------------------
 
-def _autogen_bm25_meta_indexes(metadata, effective_schemas: set[str]) -> dict[tuple[str, str], object]:
+def _autogen_bm25_meta_indexes(
+    metadata, effective_schemas: set[str], *, default_schema: str
+) -> dict[tuple[str, str], object]:
     """Return {(schema, index_name): Index} for all BM25 indexes in MetaData."""
     from .indexing import _is_bm25_index
 
     result: dict[tuple[str, str], object] = {}
     for table in metadata.tables.values():
-        schema = table.schema or next(iter(effective_schemas), "public")
+        schema = table.schema or default_schema
         if schema not in effective_schemas:
             continue
         for index in table.indexes:
@@ -118,8 +172,8 @@ def _autogen_bm25_meta_indexes(metadata, effective_schemas: set[str]) -> dict[tu
 
 
 def _autogen_bm25_db_indexes(conn, effective_schemas: set[str]) -> dict[tuple[str, str], dict]:
-    """Return {(schema, index_name): {table_name, fields, key_field}} from pg_indexes."""
-    from .indexing import _extract_bm25_field_list, _extract_field_name, _extract_key_field
+    """Return {(schema, index_name): {table_name, expressions, key_field}} from pg_indexes."""
+    from .indexing import _extract_bm25_field_list, _extract_key_field
 
     result: dict[tuple[str, str], dict] = {}
     for schema in effective_schemas:
@@ -137,15 +191,43 @@ def _autogen_bm25_db_indexes(conn, effective_schemas: set[str]) -> dict[tuple[st
         ).fetchall()
         for row in rows:
             raw_fields = _extract_bm25_field_list(row.indexdef)
-            fields = [f for f in (_extract_field_name(rf) for rf in raw_fields) if f is not None]
             result[(row.schemaname, row.indexname)] = {
                 "table_name": row.tablename,
-                "fields": fields,
+                "expressions": raw_fields,
                 "key_field": _extract_key_field(row.indexdef) or "",
             }
     return result
 
 
+def _render_bm25_expression(expr: ClauseElement) -> str:
+    """Compile *expr* with the PostgreSQL dialect and return the resulting SQL text.
+
+    ``literal_binds`` is passed to the compiler so bound values are rendered
+    inline rather than as placeholders.
+    """
+    return str(expr.compile(dialect=postgresql.dialect(), compile_kwargs={"literal_binds": True}))  # type: ignore[no-untyped-call]
+
+
+def _strip_relation_qualifiers(expr: str, table_name: str) -> str:
+    """Remove ``table.`` / ``schema.table.`` prefixes that refer to *table_name*.
+
+    SQLAlchemy may render column refs as ``table.col`` in metadata compilation;
+    CREATE INDEX field lists should be table-local expressions.
+
+    Only whole identifiers are matched, so a longer name that merely ends with
+    *table_name* (``order_items.`` for table ``items``) is left alone, and text
+    inside single-quoted SQL string literals is never rewritten.
+
+    Args:
+        expr: SQL expression text.
+        table_name: Unquoted name of the table the index belongs to.
+
+    Returns:
+        *expr* with qualifiers for *table_name* removed; returned unchanged
+        when *table_name* is empty.
+    """
+    if not table_name:
+        # An empty name would turn the pattern into "strip every dot".
+        return expr
+    bare = re.escape(table_name)
+    quoted = re.escape(table_name.replace('"', '""'))
+    qualifier = re.compile(
+        r'(?<![A-Za-z0-9_$".])'  # must start at an identifier boundary
+        r'(?:(?:"(?:[^"]|"")+"|[A-Za-z_][A-Za-z0-9_$]*)\.)?'  # optional schema
+        rf'(?:"{quoted}"|{bare})\.'
+    )
+    # Odd-indexed pieces are the captured '...' literals; leave them untouched.
+    pieces = re.split(r"('(?:[^']|'')*')", expr)
+    pieces[::2] = [qualifier.sub("", piece) for piece in pieces[::2]]
+    return "".join(pieces)
+
+
+def _normalize_bm25_expression(expr: str) -> str:
+    """Canonicalize BM25 expression text so cosmetic differences do not trigger autogen churn.
+
+    All whitespace, double quotes and ``::text`` casts are dropped, then every
+    ``identifier.`` qualifier is removed except the ``pdb.`` namespace
+    (tokenizer names such as ``pdb.simple`` must survive).
+    """
+    qualifier = re.compile(r"(?<![A-Za-z0-9_])(?!pdb\b)[A-Za-z_][A-Za-z0-9_]*\.")
+    current = "".join(expr.split()).replace('"', "").replace("::text", "")
+    # Re-apply the substitution until the text stops changing.
+    while True:
+        reduced = qualifier.sub("", current)
+        if reduced == current:
+            return reduced
+        current = reduced
+
+
+def _normalized_expression_list(expressions: list[str]) -> list[str]:
+    """Return each entry of *expressions* passed through ``_normalize_bm25_expression``, in order."""
+    return list(map(_normalize_bm25_expression, expressions))
+
+
 def _suppress_standard_bm25_ops(upgrade_ops, bm25_names: set[str]) -> None:
     """Remove any standard Alembic CreateIndexOp/DropIndexOp for BM25 indexes."""
     from alembic.operations.ops import CreateIndexOp, DropIndexOp, ModifyTableOps
@@ -182,7 +264,7 @@ def _compare_bm25_indexes(autogen_context, upgrade_ops, schemas) -> PriorityDisp
     effective_schemas = {s if s is not None else default_schema for s in schemas}
 
     db_bm25 = _autogen_bm25_db_indexes(conn, effective_schemas)
-    meta_bm25 = _autogen_bm25_meta_indexes(metadata, effective_schemas)
+    meta_bm25 = _autogen_bm25_meta_indexes(metadata, effective_schemas, default_schema=default_schema)
 
     all_bm25_names = {k[1] for k in db_bm25} | {k[1] for k in meta_bm25}
     if not all_bm25_names:
@@ -196,37 +278,42 @@ def _compare_bm25_indexes(autogen_context, upgrade_ops, schemas) -> PriorityDisp
     # Emit drop ops for indexes present in DB but absent from MetaData.
     for key in db_bm25:
         if key not in meta_bm25:
-            upgrade_ops.ops.append(DropBM25IndexOp(index_name=key[1], if_exists=True))
+            upgrade_ops.ops.append(DropBM25IndexOp(index_name=key[1], if_exists=True, schema=key[0]))
 
     # Emit create ops for indexes present in MetaData but absent from DB.
-    # Also re-create indexes whose field list or key_field differs from the DB.
+    # Also re-create indexes whose expression list or key_field differs from the DB.
     for key, index in meta_bm25.items():
-        from .indexing import _bm25_field_name
-
         with_opts = index.dialect_options["postgresql"].get("with") or {}
         key_field = with_opts.get("key_field", "")
-        fields = [f for f in (_bm25_field_name(expr) for expr in index.expressions) if f is not None]
+        expressions = [
+            _strip_relation_qualifiers(_render_bm25_expression(expr), index.table.name)
+            for expr in index.expressions
+        ]
 
         if key not in db_bm25:
             upgrade_ops.ops.append(
                 CreateBM25IndexOp(
                     index_name=index.name,
                     table_name=index.table.name,
-                    fields=fields,
+                    expressions=expressions,
                     key_field=key_field,
+                    table_schema=key[0],
+                    index_schema=key[0],
                 )
             )
         else:
             db = db_bm25[key]
-            if db["fields"] != fields or db["key_field"] != key_field:
+            if _normalized_expression_list(db["expressions"]) != _normalized_expression_list(expressions) or db["key_field"] != key_field:
                 # Index configuration changed: drop the old one, create the new one.
-                upgrade_ops.ops.append(DropBM25IndexOp(index_name=key[1], if_exists=True))
+                upgrade_ops.ops.append(DropBM25IndexOp(index_name=key[1], if_exists=True, schema=key[0]))
                 upgrade_ops.ops.append(
                     CreateBM25IndexOp(
                         index_name=index.name,
                         table_name=index.table.name,
-                        fields=fields,
+                        expressions=expressions,
                         key_field=key_field,
+                        table_schema=key[0],
+                        index_schema=key[0],
                     )
                 )
 
 
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
+from collections.abc import Sequence
 from typing import Any
 
 from sqlalchemy import Select
@@ -108,10 +109,9 @@ def extract(self, rows: list[object]) -> Any | None:
         mapping = getattr(first, "_mapping", None)
         if mapping is not None and self.label in mapping:
             return mapping[self.label]
-        try:
+        if isinstance(first, Sequence) and not isinstance(first, (str, bytes)):
             return first[-1]
-        except Exception:
-            return None
+        return None
 
 
 def with_rows(