paradedb · ankitml · Feb 26, 2026 · Feb 19, 2026 · Feb 20, 2026 · Feb 20, 2026
@@ -14,7 +14,7 @@ pip install -e .[test,dev]
 ruff check .
 mypy paradedb
 python -m pytest tests/unit
-PARADEDB_TEST_DSN=postgres://postgres:postgres@localhost:5432/postgres python -m pytest -m integration
+PARADEDB_TEST_DSN=postgres://postgres:postgres@localhost:5443/postgres python -m pytest -m integration
 ```
 
 ## Guidelines

@@ -26,7 +26,7 @@ pip install -e .[test,dev]
 - `paradedb.sqlalchemy.search`: ParadeDB predicates (`match_all`, `fuzzy`, `parse`, `more_like_this`, etc.).
 - `paradedb.sqlalchemy.pdb`: function wrappers (`score`, `snippet`, `snippets`, `agg`).
 - `paradedb.sqlalchemy.facets`: aggregate/facet JSON builders and rows+facets helper.
-- `paradedb.sqlalchemy.select_with`: select decorators for score/snippet columns.
+- `paradedb.sqlalchemy.select_with`: select decorators for score/snippet/snippet_positions columns.
 - `paradedb.sqlalchemy.alembic`: Alembic operations for BM25 index lifecycle.
 
 ## Quickstart
@@ -49,11 +49,65 @@ products_bm25_idx.create(engine)
 stmt = select(Product.id, Product.description).where(search.match_any(Product.description, "running", "shoes"))
 ```
 
+Index JSON keys using BM25Field expressions:
+
+```python
+from paradedb.sqlalchemy import expr
+
+products_bm25_idx = Index(
+    "products_bm25_idx",
+    indexing.BM25Field(Product.id),
+    indexing.BM25Field(
+        expr.json_text(Product.metadata, "color"),
+        tokenizer=indexing.tokenize.literal(alias="metadata_color"),
+    ),
+    indexing.BM25Field(
+        expr.json_text(Product.metadata, "location"),
+        tokenizer=indexing.tokenize.literal(alias="metadata_location"),
+    ),
+    postgresql_using="bm25",
+    postgresql_with={"key_field": "id"},
+)
+```
+
+Tokenizer configs can use a Django/Rails-style structured shape:
+
+```python
+products_bm25_idx = Index(
+    "products_bm25_idx",
+    indexing.BM25Field(Product.id),
+    indexing.BM25Field(
+        Product.description,
+        tokenizer=indexing.tokenize.from_config(
+            {
+                "tokenizer": "simple",
+                "filters": ["lowercase", "stemmer"],
+                "stemmer": "english",
+                "alias": "description_simple",
+            }
+        ),
+    ),
+    indexing.BM25Field(
+        Product.description,
+        tokenizer=indexing.tokenize.from_config(
+            {
+                "tokenizer": "ngram",
+                "args": [3, 8],
+                "named_args": {"prefix_only": True},
+                "alias": "description_ngram",
+            }
+        ),
+    ),
+    postgresql_using="bm25",
+    postgresql_with={"key_field": "id"},
+)
+```
+
 ## Query APIs
 
 - Basic predicates: `match_all`, `match_any`, `term`, `phrase`, `fuzzy`, `regex`, `all`
 - Advanced predicates: `parse`, `phrase_prefix`, `regex_phrase`, `near`, `proximity`, `more_like_this`
-- Scoring/snippets: `pdb.score`, `pdb.snippet`, `pdb.snippets`, `select_with.score`, `select_with.snippet`
+- Scoring/snippets: `pdb.score`, `pdb.snippet`, `pdb.snippets`, `pdb.snippet_positions`, `select_with.score`, `select_with.snippet`, `select_with.snippet_positions`
 - Aggregations/facets: `facets.*` builders + `pdb.agg(...)`
 - Rows + facets: `facets.with_rows(...)`
 
@@ -67,6 +121,8 @@ stmt = (
     select(
         pdb.agg(facets.value_count(field="id")).label("count"),
         pdb.agg(facets.avg(field="rating")).label("avg_rating"),
+        pdb.agg(facets.percentiles(field="rating", percents=[50, 95])).label("rating_percentiles"),
+        pdb.agg(facets.top_hits(size=2, sort=[{"rating": "desc"}], docvalue_fields=["id", "rating"])).label("top_hits"),
     )
     .select_from(Product)
     .where(search.match_all(Product.description, "running"))
@@ -85,8 +141,24 @@ Usage:
 
 ```python
 op.create_bm25_index("products_bm25_idx", "products", ["id", "description"], key_field="id")
+# Tokenizer-aware expressions are supported too:
+op.create_bm25_index(
+    "products_bm25_idx",
+    "products",
+    ["id", "((description)::pdb.simple('alias=description_simple,lowercase=true'))"],
+    key_field="id",
+)
+# Schema-aware operations:
+op.create_bm25_index(
+    "products_bm25_idx",
+    "products",
+    ["id", "description"],
+    key_field="id",
+    table_schema="search",
+)
 op.reindex_bm25("products_bm25_idx", concurrently=True)
-op.drop_bm25_index("products_bm25_idx", if_exists=True)
+op.reindex_bm25("products_bm25_idx", concurrently=True, schema="search")
+op.drop_bm25_index("products_bm25_idx", if_exists=True, schema="search")
 ```
 
 ## Validation and Guardrails
@@ -117,7 +189,7 @@ python -m pytest tests/unit
 Integration tests (requires running ParadeDB):
 
 ```bash
-PARADEDB_TEST_DSN=postgres://postgres:postgres@localhost:5432/postgres python -m pytest -m integration
+PARADEDB_TEST_DSN=postgres://postgres:postgres@localhost:5443/postgres python -m pytest -m integration
 ```
 
 ## CI

@@ -23,7 +23,7 @@
 
 
 def engine_from_env() -> Engine:
-    dsn = os.getenv("DATABASE_URL", "postgresql+psycopg://postgres:postgres@localhost:5432/postgres")
+    dsn = os.getenv("DATABASE_URL", "postgresql+psycopg://postgres:postgres@localhost:5443/postgres")
     return create_engine(dsn)
 
 

@@ -28,6 +28,14 @@ def main() -> None:
         select(
             pdb.agg(facets.value_count(field="id")).label("count"),
             pdb.agg(facets.avg(field="rating")).label("avg_rating"),
+            pdb.agg(facets.percentiles(field="rating", percents=[50, 95])).label("rating_percentiles"),
+            pdb.agg(
+                facets.top_hits(
+                    size=2,
+                    sort=[{"rating": "desc"}],
+                    docvalue_fields=["id", "rating"],
+                )
+            ).label("top_hits"),
         )
         .select_from(Product)
         .where(search.match_all(Product.description, "running"))