Skip to content

Commit 29a69ce

Browse files
philippemnoel authored and ankitml committed
Harden BM25 field parsing and container startup
1 parent c946a8d commit 29a69ce

3 files changed

Lines changed: 80 additions & 13 deletions

File tree

paradedb/sqlalchemy/indexing.py

Lines changed: 50 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -452,9 +452,8 @@ class IndexMeta:
452452

453453
_KEY_FIELD_RE = re.compile(r"key_field\s*=\s*'?\"?([^'\",)\s]+)\"?'?", re.IGNORECASE)
454454
_ALIAS_RE = re.compile(r"alias\s*=\s*([A-Za-z_][A-Za-z0-9_]*)", re.IGNORECASE)
455-
_CAST_FIELD_RE = re.compile(r"^\(*\"?([A-Za-z_][A-Za-z0-9_]*)\"?\)*\s*::\s*pdb\.", re.IGNORECASE)
456-
_PLAIN_FIELD_RE = re.compile(r'^\(*"?([A-Za-z_][A-Za-z0-9_]*)"?\)*$')
457455
_TOKENIZER_NAME_RE = re.compile(r"::pdb\.([A-Za-z_][A-Za-z0-9_]*)", re.IGNORECASE)
456+
_IDENT_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")
458457

459458

460459
def _split_top_level_csv(expr: str) -> list[str]:
@@ -520,16 +519,58 @@ def _extract_bm25_field_list(indexdef: str) -> list[str]:
520519

521520

522521
def _find_outside_quotes(text: str, token: str, *, last: bool = False) -> int:
    """Return the index of ``token`` in ``text``, skipping quoted sections.

    Text between single quotes (string literals) or double quotes (quoted
    identifiers) is never searched. A doubled quote inside a quoted section
    closes and immediately reopens it, so it is skipped as well.

    Args:
        text: The expression to scan.
        token: The non-empty substring to look for.
        last: When true, return the last unquoted occurrence instead of the
            first one.

    Returns:
        The index of the chosen occurrence, or ``-1`` when ``token`` is empty
        or does not occur outside quotes.
    """
    if not token:
        return -1

    match_index = -1
    open_quote = ""
    pos = 0
    while pos < len(text):
        ch = text[pos]
        if open_quote:
            if ch == open_quote:
                open_quote = ""
        elif ch in ("'", '"'):
            open_quote = ch
        elif text.startswith(token, pos):
            if not last:
                return pos
            match_index = pos
            pos += len(token)
            continue
        pos += 1
    return match_index


def _extract_field_name(field_expr: str) -> str | None:
    """Return the column name referenced by one BM25 index field expression.

    The expression is reduced step by step:

    1. wrapping parentheses are removed;
    2. everything from a ``::pdb.`` tokenizer cast onwards is dropped;
    3. for a ``->`` / ``->>`` lookup only the left-hand operand is kept;
    4. schema/table qualifiers are dropped, keeping the terminal identifier.

    Steps 3 and 4 only consider operators and dots that sit outside quotes,
    so a quoted identifier such as ``"my.col"`` is kept in one piece.

    Args:
        field_expr: A single field expression, e.g.
            ``((metadata ->> 'color')::pdb.literal('alias=metadata_color'))``.

    Returns:
        The bare column name (quoted identifiers are unquoted and ``""`` is
        unescaped to ``"``), or ``None`` when the remainder is not a single
        identifier.
    """
    expr = _strip_outer_parens(field_expr.strip())
    cast_marker = re.search(r"::\s*pdb\.", expr, re.IGNORECASE)
    if cast_marker is not None:
        expr = _strip_outer_parens(expr[: cast_marker.start()].strip())

    # "->" also matches the start of "->>", so both JSON operators are covered.
    arrow_at = _find_outside_quotes(expr, "->")
    if arrow_at != -1:
        expr = _strip_outer_parens(expr[:arrow_at].strip())

    # Strip schema/table qualifiers and keep the terminal identifier.
    dot_at = _find_outside_quotes(expr, ".", last=True)
    if dot_at != -1:
        expr = _strip_outer_parens(expr[dot_at + 1 :].strip())

    if len(expr) >= 2 and expr.startswith('"') and expr.endswith('"'):
        inner = expr[1:-1]
        # A quote left over after removing escaped pairs means the text is not
        # one quoted identifier (e.g. '"a" || "b"').
        if '"' in inner.replace('""', ""):
            return None
        # An empty quoted identifier is not a usable column name.
        return inner.replace('""', '"') or None
    if _IDENT_RE.match(expr):
        return expr
    return None
531539

532540

541+
def _strip_outer_parens(value: str) -> str:
    """Peel off every pair of parentheses that wraps the whole of ``value``.

    A pair is removed only while the string starts with "(", ends with ")"
    and ``_has_balanced_outer_parens`` confirms those two belong together.
    After each removal the remainder is whitespace-stripped; the input itself
    is not stripped before the first check.
    """
    unwrapped = value
    while True:
        is_wrapped = unwrapped.startswith("(") and unwrapped.endswith(")")
        if not is_wrapped or not _has_balanced_outer_parens(unwrapped):
            return unwrapped
        unwrapped = unwrapped[1:-1].strip()
546+
547+
548+
def _has_balanced_outer_parens(value: str) -> bool:
549+
depth = 0
550+
in_single = False
551+
in_double = False
552+
553+
for i, ch in enumerate(value):
554+
if ch == "'" and not in_double:
555+
in_single = not in_single
556+
continue
557+
if ch == '"' and not in_single:
558+
in_double = not in_double
559+
continue
560+
if in_single or in_double:
561+
continue
562+
563+
if ch == "(":
564+
depth += 1
565+
elif ch == ")":
566+
depth -= 1
567+
if depth == 0 and i != len(value) - 1:
568+
return False
569+
if depth < 0:
570+
return False
571+
return depth == 0
572+
573+
533574
def _extract_key_field(indexdef: str) -> str | None:
534575
match = _KEY_FIELD_RE.search(indexdef)
535576
if match:

scripts/run_paradedb.sh

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,20 +51,22 @@ run_container() {
5151
"${IMAGE}" >/dev/null
5252
}
5353

54+
created_or_recreated=0
5455
if [[ "$container_exists" == "0" ]]; then
5556
echo "Starting ParadeDB container ${CONTAINER_NAME} from ${IMAGE}..."
5657
run_container
58+
created_or_recreated=1
5759
else
5860
current_image="$(docker inspect -f '{{.Config.Image}}' "${CONTAINER_NAME}" 2>/dev/null || true)"
5961
if [[ -n "${current_image}" && "${current_image}" != "${IMAGE}" ]]; then
6062
echo "Container ${CONTAINER_NAME} uses image ${current_image}; recreating with ${IMAGE}..."
6163
docker rm -f "${CONTAINER_NAME}" >/dev/null
6264
echo "Starting ParadeDB container ${CONTAINER_NAME} from ${IMAGE}..."
6365
run_container
64-
container_exists=0
66+
created_or_recreated=1
6567
fi
6668

67-
if [[ "$container_exists" == "1" ]]; then
69+
if [[ "${created_or_recreated}" == "0" ]]; then
6870
mapped_port="$(docker port "${CONTAINER_NAME}" 5432/tcp 2>/dev/null | head -n1 | awk -F: '{print $NF}')"
6971
if [[ -n "${mapped_port}" && "${mapped_port}" != "${PORT}" ]]; then
7072
echo "Container ${CONTAINER_NAME} is already mapped to host port ${mapped_port}; using that port."
@@ -73,9 +75,18 @@ else
7375
echo "Container ${CONTAINER_NAME} has no published 5432 port; recreating with ${PORT}:5432..."
7476
docker rm -f "${CONTAINER_NAME}" >/dev/null
7577
run_container
78+
created_or_recreated=1
79+
fi
80+
fi
81+
82+
if [[ "${created_or_recreated}" == "0" ]]; then
83+
is_running="$(docker inspect -f '{{.State.Running}}' "${CONTAINER_NAME}" 2>/dev/null || echo false)"
84+
if [[ "${is_running}" == "true" ]]; then
85+
echo "Container ${CONTAINER_NAME} is already running."
86+
else
87+
echo "Container ${CONTAINER_NAME} already exists; starting it..."
88+
docker start "${CONTAINER_NAME}" >/dev/null
7689
fi
77-
echo "Container ${CONTAINER_NAME} already exists; starting it..."
78-
docker start "${CONTAINER_NAME}" >/dev/null
7990
fi
8091
fi
8192

tests/unit/test_indexing_unit.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,21 @@ def test_extract_bm25_field_list_parses_tokenizer_casts():
294294
assert _extract_alias(parts[2]) == "category_exact"
295295

296296

297+
def test_extract_field_name_from_json_key_tokenizer_cast():
    """A JSON key lookup under a tokenizer cast resolves to the base column."""
    indexed_expr = "((metadata ->> 'color')::pdb.literal('alias=metadata_color'))"
    resolved = _extract_field_name(indexed_expr)
    assert resolved == "metadata"
300+
301+
302+
def test_extract_field_name_from_qualified_tokenizer_cast():
    """A schema/table-qualified column resolves to its terminal identifier."""
    indexed_expr = "((public.products.description)::pdb.unicode_words('lowercase=true'))"
    resolved = _extract_field_name(indexed_expr)
    assert resolved == "description"
305+
306+
307+
def test_extract_field_name_from_quoted_identifier():
    """A double-quoted identifier is returned without its surrounding quotes."""
    indexed_expr = '(("Display Name")::pdb.literal(\'alias=display_name\'))'
    resolved = _extract_field_name(indexed_expr)
    assert resolved == "Display Name"
310+
311+
297312
# ---------------------------------------------------------------------------
298313
# _extract_tokenizer_name
299314
# ---------------------------------------------------------------------------

0 commit comments

Comments
 (0)