Skip to content

Commit 7731bbe

Browse files
committed
Harden BM25 field parsing and container startup
1 parent 08cb745 commit 7731bbe

3 files changed

Lines changed: 77 additions & 11 deletions

File tree

paradedb/sqlalchemy/indexing.py

Lines changed: 50 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -452,9 +452,8 @@ class IndexMeta:
452452

453453
# Captures the value assigned to `key_field` (case-insensitive). Optional
# single and/or double quotes around the value are tolerated; the captured
# value runs up to the next quote, comma, closing paren, or whitespace.
_KEY_FIELD_RE = re.compile(r"key_field\s*=\s*'?\"?([^'\",)\s]+)\"?'?", re.IGNORECASE)
454454
# Captures the bare identifier that follows `alias=` (case-insensitive,
# whitespace allowed on either side of the equals sign).
_ALIAS_RE = re.compile(r"alias\s*=\s*([A-Za-z_][A-Za-z0-9_]*)", re.IGNORECASE)
455-
_CAST_FIELD_RE = re.compile(r"^\(*\"?([A-Za-z_][A-Za-z0-9_]*)\"?\)*\s*::\s*pdb\.", re.IGNORECASE)
456-
_PLAIN_FIELD_RE = re.compile(r'^\(*"?([A-Za-z_][A-Za-z0-9_]*)"?\)*$')
457455
# Captures the identifier that immediately follows a `::pdb.` cast
# (case-insensitive). No whitespace is permitted inside `::pdb.` here.
_TOKENIZER_NAME_RE = re.compile(r"::pdb\.([A-Za-z_][A-Za-z0-9_]*)", re.IGNORECASE)
456+
# Matches a string that is, start to end, a plain ASCII identifier: a letter
# or underscore followed by any number of letters, digits, or underscores.
_IDENT_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")
458457

459458

460459
def _split_top_level_csv(expr: str) -> list[str]:
@@ -520,16 +519,58 @@ def _extract_bm25_field_list(indexdef: str) -> list[str]:
520519

521520

522521
def _extract_field_name(field_expr: str) -> str | None:
    """Return the column name referenced by one BM25 field expression.

    The expression is reduced step by step:

    1. Parentheses wrapping the whole expression are removed.
    2. Anything from a ``::pdb.`` tokenizer cast onwards is dropped.
    3. For a JSON access (``col -> 'k'`` / ``col ->> 'k'``) only the operand
       to the left of the first arrow is kept.
    4. Schema/table qualifiers are dropped, keeping the terminal identifier.

    Steps 3 and 4 only consider ``->`` and ``.`` that appear *outside* of
    single- or double-quoted sections, so a quoted identifier such as
    ``"my.col"`` is kept intact instead of being split in the middle.

    Args:
        field_expr: A single field expression taken from an index definition.

    Returns:
        The unquoted identifier (with doubled ``""`` collapsed to ``"``) when
        the remainder is a double-quoted identifier, the identifier itself
        when it is a plain ASCII identifier, or ``None`` when the remainder
        is anything else.
    """
    expr = _strip_outer_parens(field_expr.strip())
    cast_marker = re.search(r"::\s*pdb\.", expr, re.IGNORECASE)
    if cast_marker is not None:
        expr = _strip_outer_parens(expr[: cast_marker.start()].strip())

    # JSON access: keep only the base column left of the first unquoted arrow.
    arrows = _unquoted_positions(expr, "->")
    if arrows:
        expr = _strip_outer_parens(expr[: arrows[0]].strip())

    # Strip schema/table qualifiers and keep the terminal identifier. Dots
    # inside quotes belong to the identifier itself and must not split it.
    dots = _unquoted_positions(expr, ".")
    if dots:
        expr = _strip_outer_parens(expr[dots[-1] + 1 :].strip())

    if expr.startswith('"') and expr.endswith('"') and len(expr) >= 2:
        return expr[1:-1].replace('""', '"')
    if _IDENT_RE.match(expr):
        return expr
    return None


def _unquoted_positions(expr: str, token: str) -> list[int]:
    """Return the start index of each *token* occurrence outside SQL quotes."""
    positions: list[int] = []
    in_single = False
    in_double = False
    for i, ch in enumerate(expr):
        if ch == "'" and not in_double:
            in_single = not in_single
        elif ch == '"' and not in_single:
            in_double = not in_double
        elif not (in_single or in_double) and expr.startswith(token, i):
            positions.append(i)
    return positions
531539

532540

541+
def _strip_outer_parens(value: str) -> str:
    """Peel off every parenthesis pair that wraps the whole of *value*.

    A layer is removed only while the text starts with ``(``, ends with
    ``)``, and ``_has_balanced_outer_parens`` accepts it. Whitespace exposed
    by removing a layer is trimmed before the next check.

    Args:
        value: The expression text to unwrap.

    Returns:
        The text with all enclosing parenthesis layers removed.
    """
    stripped = value
    while True:
        wrapped = stripped.startswith("(") and stripped.endswith(")")
        if not (wrapped and _has_balanced_outer_parens(stripped)):
            return stripped
        stripped = stripped[1:-1].strip()
546+
547+
548+
def _has_balanced_outer_parens(value: str) -> bool:
549+
depth = 0
550+
in_single = False
551+
in_double = False
552+
553+
for i, ch in enumerate(value):
554+
if ch == "'" and not in_double:
555+
in_single = not in_single
556+
continue
557+
if ch == '"' and not in_single:
558+
in_double = not in_double
559+
continue
560+
if in_single or in_double:
561+
continue
562+
563+
if ch == "(":
564+
depth += 1
565+
elif ch == ")":
566+
depth -= 1
567+
if depth == 0 and i != len(value) - 1:
568+
return False
569+
if depth < 0:
570+
return False
571+
return depth == 0
572+
573+
533574
def _extract_key_field(indexdef: str) -> str | None:
534575
match = _KEY_FIELD_RE.search(indexdef)
535576
if match:

scripts/run_paradedb.sh

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ if docker ps -a --format '{{.Names}}' | grep -Eq "^${CONTAINER_NAME}$"; then
4141
container_exists=1
4242
fi
4343

44+
created_or_recreated=0
4445
if [[ "$container_exists" == "0" ]]; then
4546
echo "Starting ParadeDB container ${CONTAINER_NAME} from ${IMAGE}..."
4647
docker run -d \
@@ -50,6 +51,7 @@ if [[ "$container_exists" == "0" ]]; then
5051
-e "POSTGRES_DB=${DB}" \
5152
-p "${PORT}:5432" \
5253
"${IMAGE}" >/dev/null
54+
created_or_recreated=1
5355
else
5456
mapped_port="$(docker port "${CONTAINER_NAME}" 5432/tcp 2>/dev/null | head -n1 | awk -F: '{print $NF}')"
5557
if [[ -n "${mapped_port}" && "${mapped_port}" != "${PORT}" ]]; then
@@ -65,9 +67,17 @@ else
6567
-e "POSTGRES_DB=${DB}" \
6668
-p "${PORT}:5432" \
6769
"${IMAGE}" >/dev/null
70+
created_or_recreated=1
71+
fi
72+
if [[ "${created_or_recreated}" == "0" ]]; then
73+
is_running="$(docker inspect -f '{{.State.Running}}' "${CONTAINER_NAME}" 2>/dev/null || echo false)"
74+
if [[ "${is_running}" == "true" ]]; then
75+
echo "Container ${CONTAINER_NAME} is already running."
76+
else
77+
echo "Container ${CONTAINER_NAME} already exists; starting it..."
78+
docker start "${CONTAINER_NAME}" >/dev/null
79+
fi
6880
fi
69-
echo "Container ${CONTAINER_NAME} already exists; starting it..."
70-
docker start "${CONTAINER_NAME}" >/dev/null
7181
fi
7282

7383
export PARADEDB_PORT="${PORT}"

tests/unit/test_indexing_unit.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,21 @@ def test_extract_bm25_field_list_parses_tokenizer_casts():
294294
assert _extract_alias(parts[2]) == "category_exact"
295295

296296

297+
def test_extract_field_name_from_json_key_tokenizer_cast():
    """A JSON key lookup under a tokenizer cast resolves to the base column."""
    field_expr = "((metadata ->> 'color')::pdb.literal('alias=metadata_color'))"
    extracted = _extract_field_name(field_expr)
    assert extracted == "metadata"
300+
301+
302+
def test_extract_field_name_from_qualified_tokenizer_cast():
    """Schema and table qualifiers are dropped, leaving the column name."""
    field_expr = "((public.products.description)::pdb.unicode_words('lowercase=true'))"
    extracted = _extract_field_name(field_expr)
    assert extracted == "description"
305+
306+
307+
def test_extract_field_name_from_quoted_identifier():
    """A double-quoted identifier is returned without its surrounding quotes."""
    field_expr = '(("Display Name")::pdb.literal(\'alias=display_name\'))'
    extracted = _extract_field_name(field_expr)
    assert extracted == "Display Name"
310+
311+
297312
# ---------------------------------------------------------------------------
298313
# _extract_tokenizer_name
299314
# ---------------------------------------------------------------------------

0 commit comments

Comments
 (0)