Skip to content

Commit ec43d53

Browse files
committed
Merge branch 'feat/stopword-presets' into feat/ascii-fold
2 parents 38c7f44 + 4e0a0f2 commit ec43d53

8 files changed

Lines changed: 694 additions & 35 deletions

File tree

integration/test_collection_config.py

Lines changed: 404 additions & 1 deletion
Large diffs are not rendered by default.

test/collection/test_config.py

Lines changed: 102 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
Property,
1010
Reconfigure,
1111
ReferenceProperty,
12+
StopwordsPreset,
1213
Tokenization,
1314
Vectorizers,
1415
_AsyncReplicationConfig,
@@ -3034,7 +3035,7 @@ def test_property_with_ascii_fold_only(self) -> None:
30343035
prop = Property(
30353036
name="title",
30363037
data_type=DataType.TEXT,
3037-
text_analyzer=Configure.TextAnalyzer.ascii_fold(),
3038+
text_analyzer=Configure.TextAnalyzer(ascii_fold=True),
30383039
)
30393040
assert prop._to_dict()["textAnalyzer"] == {"asciiFold": True}
30403041

@@ -3043,7 +3044,7 @@ def test_property_with_ascii_fold_and_ignore(self) -> None:
30433044
name="title",
30443045
data_type=DataType.TEXT,
30453046
tokenization=Tokenization.WORD,
3046-
text_analyzer=Configure.TextAnalyzer.ascii_fold(ignore=["é", "ñ"]),
3047+
text_analyzer=Configure.TextAnalyzer(ascii_fold=True, ascii_fold_ignore=["é", "ñ"]),
30473048
)
30483049
out = prop._to_dict()
30493050
assert out["textAnalyzer"] == {
@@ -3064,7 +3065,7 @@ def test_nested_property_with_text_analyzer(self) -> None:
30643065
Property(
30653066
name="title",
30663067
data_type=DataType.TEXT,
3067-
text_analyzer=Configure.TextAnalyzer.ascii_fold(ignore=["ñ"]),
3068+
text_analyzer=Configure.TextAnalyzer(ascii_fold=True, ascii_fold_ignore=["ñ"]),
30683069
),
30693070
],
30703071
)
@@ -3078,4 +3079,101 @@ def test_text_analyzer_rejects_wrong_types(self) -> None:
30783079
with pytest.raises(ValidationError):
30793080
_TextAnalyzerConfigCreate(ascii_fold="yes") # type: ignore[arg-type]
30803081
with pytest.raises(ValidationError):
3081-
_TextAnalyzerConfigCreate(ascii_fold_ignore="é") # type: ignore[arg-type]
3082+
_TextAnalyzerConfigCreate(ascii_fold_ignore="é")
3083+
3084+
def test_text_analyzer_stopword_preset_builtin_enum(self) -> None:
3085+
prop = Property(
3086+
name="title",
3087+
data_type=DataType.TEXT,
3088+
tokenization=Tokenization.WORD,
3089+
text_analyzer=Configure.TextAnalyzer(stopword_preset=StopwordsPreset.EN),
3090+
)
3091+
assert prop._to_dict()["textAnalyzer"] == {"stopwordPreset": "en"}
3092+
3093+
def test_text_analyzer_stopword_preset_user_defined_string(self) -> None:
3094+
prop = Property(
3095+
name="title_fr",
3096+
data_type=DataType.TEXT,
3097+
tokenization=Tokenization.WORD,
3098+
text_analyzer=Configure.TextAnalyzer(stopword_preset="fr"),
3099+
)
3100+
assert prop._to_dict()["textAnalyzer"] == {"stopwordPreset": "fr"}
3101+
3102+
def test_text_analyzer_combined_ascii_fold_and_stopword_preset(self) -> None:
3103+
prop = Property(
3104+
name="title",
3105+
data_type=DataType.TEXT,
3106+
tokenization=Tokenization.WORD,
3107+
text_analyzer=Configure.TextAnalyzer(
3108+
ascii_fold=True, ascii_fold_ignore=["é"], stopword_preset="fr"
3109+
),
3110+
)
3111+
assert prop._to_dict()["textAnalyzer"] == {
3112+
"asciiFold": True,
3113+
"asciiFoldIgnore": ["é"],
3114+
"stopwordPreset": "fr",
3115+
}
3116+
3117+
def test_text_analyzer_stopword_preset_only_omits_other_keys(self) -> None:
3118+
prop = Property(
3119+
name="title",
3120+
data_type=DataType.TEXT,
3121+
tokenization=Tokenization.WORD,
3122+
text_analyzer=Configure.TextAnalyzer(stopword_preset="fr"),
3123+
)
3124+
out = prop._to_dict()
3125+
assert "asciiFold" not in out["textAnalyzer"]
3126+
assert "asciiFoldIgnore" not in out["textAnalyzer"]
3127+
3128+
3129+
class TestInvertedIndexStopwordPresets:
3130+
def test_configure_inverted_index_with_stopword_presets(self) -> None:
3131+
ic = Configure.inverted_index(
3132+
stopword_presets={
3133+
"fr": ["le", "la", "les"],
3134+
"es": ["el", "la", "los"],
3135+
},
3136+
)
3137+
out = ic._to_dict()
3138+
assert out["stopwordPresets"] == {
3139+
"fr": ["le", "la", "les"],
3140+
"es": ["el", "la", "los"],
3141+
}
3142+
3143+
def test_configure_inverted_index_without_stopword_presets_omits_key(self) -> None:
3144+
ic = Configure.inverted_index()
3145+
assert "stopwordPresets" not in ic._to_dict()
3146+
3147+
def test_reconfigure_inverted_index_merges_stopword_presets(self) -> None:
3148+
rc = Reconfigure.inverted_index(stopword_presets={"fr": ["le", "la"]})
3149+
existing = {
3150+
"stopwords": {"preset": "en", "additions": None, "removals": None},
3151+
"bm25": {"b": 0.75, "k1": 1.2},
3152+
"cleanupIntervalSeconds": 60,
3153+
}
3154+
merged = rc.merge_with_existing(existing)
3155+
assert merged["stopwordPresets"] == {"fr": ["le", "la"]}
3156+
# other fields untouched
3157+
assert merged["stopwords"]["preset"] == "en"
3158+
assert merged["bm25"]["b"] == 0.75
3159+
3160+
def test_reconfigure_inverted_index_replaces_existing_stopword_presets(self) -> None:
3161+
rc = Reconfigure.inverted_index(stopword_presets={"fr": ["le"]})
3162+
existing = {
3163+
"stopwords": {"preset": "en", "additions": None, "removals": None},
3164+
"stopwordPresets": {"fr": ["le", "la", "les"], "es": ["el"]},
3165+
}
3166+
merged = rc.merge_with_existing(existing)
3167+
# The new value fully replaces the prior dict (this matches the server-side
3168+
# PUT semantics — see test_tokenize.py::test_remove_unused_preset_is_allowed).
3169+
assert merged["stopwordPresets"] == {"fr": ["le"]}
3170+
3171+
def test_reconfigure_inverted_index_without_stopword_presets_leaves_existing(self) -> None:
3172+
rc = Reconfigure.inverted_index(bm25_b=0.7, bm25_k1=1.1)
3173+
existing = {
3174+
"stopwords": {"preset": "en", "additions": None, "removals": None},
3175+
"bm25": {"b": 0.75, "k1": 1.2},
3176+
"stopwordPresets": {"fr": ["le", "la"]},
3177+
}
3178+
merged = rc.merge_with_existing(existing)
3179+
assert merged["stopwordPresets"] == {"fr": ["le", "la"]}

test/collection/test_config_methods.py

Lines changed: 100 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
from weaviate.collections.classes.config_methods import (
2+
_collection_config_from_json,
23
_collection_configs_simple_from_json,
34
_nested_properties_from_config,
45
_properties_from_config,
@@ -145,3 +146,102 @@ def test_nested_properties_from_config_parses_text_analyzer() -> None:
145146
"asciiFold": True,
146147
"asciiFoldIgnore": ["ñ"],
147148
}
149+
150+
151+
def test_properties_from_config_parses_stopword_preset_only() -> None:
152+
"""A property with only stopwordPreset (no asciiFold) must still produce a text_analyzer."""
153+
schema = {
154+
"vectorizer": "none",
155+
"properties": [
156+
_make_text_prop("title", textAnalyzer={"stopwordPreset": "fr"}),
157+
],
158+
}
159+
title = _properties_from_config(schema)[0]
160+
assert title.text_analyzer is not None
161+
assert title.text_analyzer.ascii_fold is False
162+
assert title.text_analyzer.ascii_fold_ignore is None
163+
assert title.text_analyzer.stopword_preset == "fr"
164+
165+
166+
def test_properties_from_config_parses_combined_text_analyzer() -> None:
167+
schema = {
168+
"vectorizer": "none",
169+
"properties": [
170+
_make_text_prop(
171+
"title",
172+
textAnalyzer={
173+
"asciiFold": True,
174+
"asciiFoldIgnore": ["é"],
175+
"stopwordPreset": "fr",
176+
},
177+
),
178+
],
179+
}
180+
title = _properties_from_config(schema)[0]
181+
assert title.text_analyzer is not None
182+
assert title.text_analyzer.ascii_fold is True
183+
assert title.text_analyzer.ascii_fold_ignore == ["é"]
184+
assert title.text_analyzer.stopword_preset == "fr"
185+
186+
187+
def _full_schema(class_name: str, **inverted_overrides) -> dict:
188+
inverted = {
189+
"bm25": {"b": 0.75, "k1": 1.2},
190+
"cleanupIntervalSeconds": 60,
191+
"stopwords": {"preset": "en", "additions": None, "removals": None},
192+
}
193+
inverted.update(inverted_overrides)
194+
return {
195+
"class": class_name,
196+
"vectorizer": "none",
197+
"properties": [],
198+
"invertedIndexConfig": inverted,
199+
"replicationConfig": {"factor": 1, "deletionStrategy": "NoAutomatedResolution"},
200+
"shardingConfig": {
201+
"virtualPerPhysical": 128,
202+
"desiredCount": 1,
203+
"actualCount": 1,
204+
"desiredVirtualCount": 128,
205+
"actualVirtualCount": 128,
206+
"key": "_id",
207+
"strategy": "hash",
208+
"function": "murmur3",
209+
},
210+
"vectorIndexType": "hnsw",
211+
"vectorIndexConfig": {
212+
"skip": False,
213+
"cleanupIntervalSeconds": 300,
214+
"maxConnections": 64,
215+
"efConstruction": 128,
216+
"ef": -1,
217+
"dynamicEfMin": 100,
218+
"dynamicEfMax": 500,
219+
"dynamicEfFactor": 8,
220+
"vectorCacheMaxObjects": 1000000000000,
221+
"flatSearchCutoff": 40000,
222+
"distance": "cosine",
223+
},
224+
}
225+
226+
227+
def test_collection_config_parses_stopword_presets() -> None:
228+
"""The inverted index config exposes stopwordPresets when present in the schema."""
229+
schema = _full_schema(
230+
"TestStopwordPresets",
231+
stopwordPresets={
232+
"fr": ["le", "la", "les"],
233+
"es": ["el", "la", "los"],
234+
},
235+
)
236+
full = _collection_config_from_json(schema)
237+
assert full.inverted_index_config.stopword_presets == {
238+
"fr": ["le", "la", "les"],
239+
"es": ["el", "la", "los"],
240+
}
241+
242+
243+
def test_collection_config_stopword_presets_absent() -> None:
244+
"""If the server response omits stopwordPresets, the parsed value is None."""
245+
schema = _full_schema("TestNoStopwordPresets")
246+
full = _collection_config_from_json(schema)
247+
assert full.inverted_index_config.stopword_presets is None

0 commit comments

Comments (0)