99 Property ,
1010 Reconfigure ,
1111 ReferenceProperty ,
12+ StopwordsPreset ,
1213 Tokenization ,
1314 Vectorizers ,
1415 _AsyncReplicationConfig ,
@@ -3034,7 +3035,7 @@ def test_property_with_ascii_fold_only(self) -> None:
30343035 prop = Property (
30353036 name = "title" ,
30363037 data_type = DataType .TEXT ,
3037- text_analyzer = Configure .TextAnalyzer . ascii_fold ( ),
3038+ text_analyzer = Configure .TextAnalyzer ( ascii_fold = True ),
30383039 )
30393040 assert prop ._to_dict ()["textAnalyzer" ] == {"asciiFold" : True }
30403041
@@ -3043,7 +3044,7 @@ def test_property_with_ascii_fold_and_ignore(self) -> None:
30433044 name = "title" ,
30443045 data_type = DataType .TEXT ,
30453046 tokenization = Tokenization .WORD ,
3046- text_analyzer = Configure .TextAnalyzer . ascii_fold ( ignore = ["é" , "ñ" ]),
3047+ text_analyzer = Configure .TextAnalyzer ( ascii_fold = True , ascii_fold_ignore = ["é" , "ñ" ]),
30473048 )
30483049 out = prop ._to_dict ()
30493050 assert out ["textAnalyzer" ] == {
@@ -3064,7 +3065,7 @@ def test_nested_property_with_text_analyzer(self) -> None:
30643065 Property (
30653066 name = "title" ,
30663067 data_type = DataType .TEXT ,
3067- text_analyzer = Configure .TextAnalyzer . ascii_fold ( ignore = ["ñ" ]),
3068+ text_analyzer = Configure .TextAnalyzer ( ascii_fold = True , ascii_fold_ignore = ["ñ" ]),
30683069 ),
30693070 ],
30703071 )
@@ -3078,4 +3079,101 @@ def test_text_analyzer_rejects_wrong_types(self) -> None:
30783079 with pytest .raises (ValidationError ):
30793080 _TextAnalyzerConfigCreate (ascii_fold = "yes" ) # type: ignore[arg-type]
30803081 with pytest .raises (ValidationError ):
3081- _TextAnalyzerConfigCreate (ascii_fold_ignore = "é" ) # type: ignore[arg-type]
3082+ _TextAnalyzerConfigCreate (ascii_fold_ignore = "é" )
3083+
3084+ def test_text_analyzer_stopword_preset_builtin_enum (self ) -> None :
3085+ prop = Property (
3086+ name = "title" ,
3087+ data_type = DataType .TEXT ,
3088+ tokenization = Tokenization .WORD ,
3089+ text_analyzer = Configure .TextAnalyzer (stopword_preset = StopwordsPreset .EN ),
3090+ )
3091+ assert prop ._to_dict ()["textAnalyzer" ] == {"stopwordPreset" : "en" }
3092+
3093+ def test_text_analyzer_stopword_preset_user_defined_string (self ) -> None :
3094+ prop = Property (
3095+ name = "title_fr" ,
3096+ data_type = DataType .TEXT ,
3097+ tokenization = Tokenization .WORD ,
3098+ text_analyzer = Configure .TextAnalyzer (stopword_preset = "fr" ),
3099+ )
3100+ assert prop ._to_dict ()["textAnalyzer" ] == {"stopwordPreset" : "fr" }
3101+
3102+ def test_text_analyzer_combined_ascii_fold_and_stopword_preset (self ) -> None :
3103+ prop = Property (
3104+ name = "title" ,
3105+ data_type = DataType .TEXT ,
3106+ tokenization = Tokenization .WORD ,
3107+ text_analyzer = Configure .TextAnalyzer (
3108+ ascii_fold = True , ascii_fold_ignore = ["é" ], stopword_preset = "fr"
3109+ ),
3110+ )
3111+ assert prop ._to_dict ()["textAnalyzer" ] == {
3112+ "asciiFold" : True ,
3113+ "asciiFoldIgnore" : ["é" ],
3114+ "stopwordPreset" : "fr" ,
3115+ }
3116+
3117+ def test_text_analyzer_stopword_preset_only_omits_other_keys (self ) -> None :
3118+ prop = Property (
3119+ name = "title" ,
3120+ data_type = DataType .TEXT ,
3121+ tokenization = Tokenization .WORD ,
3122+ text_analyzer = Configure .TextAnalyzer (stopword_preset = "fr" ),
3123+ )
3124+ out = prop ._to_dict ()
3125+ assert "asciiFold" not in out ["textAnalyzer" ]
3126+ assert "asciiFoldIgnore" not in out ["textAnalyzer" ]
3127+
3128+
class TestInvertedIndexStopwordPresets:
    """Serialization and merge semantics of per-language stopword presets on the inverted index."""

    def test_configure_inverted_index_with_stopword_presets(self) -> None:
        """Presets passed to Configure.inverted_index round-trip into the wire dict."""
        presets = {
            "fr": ["le", "la", "les"],
            "es": ["el", "la", "los"],
        }
        config = Configure.inverted_index(stopword_presets=presets)
        assert config._to_dict()["stopwordPresets"] == presets

    def test_configure_inverted_index_without_stopword_presets_omits_key(self) -> None:
        """When no presets are given, the key is absent rather than null/empty."""
        config = Configure.inverted_index()
        assert "stopwordPresets" not in config._to_dict()

    def test_reconfigure_inverted_index_merges_stopword_presets(self) -> None:
        """Merging adds the new presets while leaving unrelated fields alone."""
        update = Reconfigure.inverted_index(stopword_presets={"fr": ["le", "la"]})
        current = {
            "stopwords": {"preset": "en", "additions": None, "removals": None},
            "bm25": {"b": 0.75, "k1": 1.2},
            "cleanupIntervalSeconds": 60,
        }
        result = update.merge_with_existing(current)
        assert result["stopwordPresets"] == {"fr": ["le", "la"]}
        # unrelated fields must survive the merge untouched
        assert result["stopwords"]["preset"] == "en"
        assert result["bm25"]["b"] == 0.75

    def test_reconfigure_inverted_index_replaces_existing_stopword_presets(self) -> None:
        """A new presets dict fully replaces the prior one (server-side PUT semantics —
        see test_tokenize.py::test_remove_unused_preset_is_allowed)."""
        update = Reconfigure.inverted_index(stopword_presets={"fr": ["le"]})
        current = {
            "stopwords": {"preset": "en", "additions": None, "removals": None},
            "stopwordPresets": {"fr": ["le", "la", "les"], "es": ["el"]},
        }
        assert update.merge_with_existing(current)["stopwordPresets"] == {"fr": ["le"]}

    def test_reconfigure_inverted_index_without_stopword_presets_leaves_existing(self) -> None:
        """Reconfiguring unrelated fields must not disturb existing presets."""
        update = Reconfigure.inverted_index(bm25_b=0.7, bm25_k1=1.1)
        current = {
            "stopwords": {"preset": "en", "additions": None, "removals": None},
            "bm25": {"b": 0.75, "k1": 1.2},
            "stopwordPresets": {"fr": ["le", "la"]},
        }
        result = update.merge_with_existing(current)
        assert result["stopwordPresets"] == {"fr": ["le", "la"]}
0 commit comments