Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
78ff5ec
feat: add TextAnalyzerConfig for ASCII folding in text properties
amourao Apr 9, 2026
6931a6f
refactor: ruff format
amourao Apr 9, 2026
bda3008
feat: add min version check
amourao Apr 9, 2026
77fc0ff
feat: update TextAnalyzerConfig docstring for ascii_fold attributes
amourao Apr 9, 2026
a8d6927
feat: add asciiFold check in _text_analyzer_from_config function
amourao Apr 9, 2026
e8919a3
test: fix ASCII folding tests
amourao Apr 9, 2026
3cc6306
feat: add support for stopword presets in inverted index configuratio…
amourao Apr 9, 2026
ef04dea
test: added live and config tests
amourao Apr 9, 2026
8f1b33b
refactor: improve docstrings for stopword presets and asciiFold tests
amourao Apr 9, 2026
03d6ff4
refactor: simplify _any_property_has_text_analyzer function using _pr…
amourao Apr 13, 2026
1342204
test: remove redundant insertion ascii fold tests from test_collectio…
amourao Apr 13, 2026
cb53d6a
test: add stopwords roundtrip test for collection configuration
amourao Apr 13, 2026
9de03f3
feat: add model validator to enforce asciiFoldIgnore constraints in T…
amourao Apr 13, 2026
7018927
feat: add factory class for text analyzer configurations with ASCII f…
amourao Apr 13, 2026
8e91984
refactor: update TextAnalyzerConfig usage to new Configure class methods
amourao Apr 13, 2026
30814fc
Merge branch 'feat/ascii-fold' into feat/stopword-presets
amourao Apr 13, 2026
db3009c
test: remove redundant line in stopword presets merge test
amourao Apr 13, 2026
50f7768
refactor: use factory pattern
amourao Apr 13, 2026
a0efe43
refactor: format text analyzer configuration for better readability
amourao Apr 14, 2026
fa92fc2
refactor: remove server side behavior tests
amourao Apr 14, 2026
27cd0a4
test: add stopword presets roundtrip tests for Weaviate collections
amourao Apr 14, 2026
83c2431
refactor: remove unnecessary stopword preset coercion from _TextAnaly…
amourao Apr 14, 2026
4e0a0f2
refactor: replace custom text analyzer method with a direct function …
amourao Apr 14, 2026
eaea155
Merge branch 'dev/1.37' into feat/ascii-fold
amourao Apr 14, 2026
38c7f44
chore: remove unused deprecated import from config.py
amourao Apr 14, 2026
ec43d53
Merge branch 'feat/stopword-presets' into feat/ascii-fold
amourao Apr 14, 2026
b3eb0ac
chore: update WEAVIATE_137 version to 1.37.0-rc.1-578c4eb in workflow
amourao Apr 14, 2026
ceef271
refactor: update text analyzer method to use new static method in Con…
amourao Apr 14, 2026
5e751bf
test: add stopwords roundtrip test with ASCII folding configuration
amourao Apr 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 50 additions & 1 deletion integration/test_collection_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,11 @@
IndexName,
)
from weaviate.collections.classes.tenants import Tenant
from weaviate.exceptions import UnexpectedStatusCodeError, WeaviateInvalidInputError
from weaviate.exceptions import (
UnexpectedStatusCodeError,
WeaviateInvalidInputError,
WeaviateUnsupportedFeatureError,
)
from integration.conftest import retry_on_http_error


Expand Down Expand Up @@ -2196,3 +2200,48 @@ def test_delete_property_index(
assert config.properties[0].index_range_filters is False
assert config.properties[0].index_searchable is _index_searchable
assert config.properties[0].index_filterable is _index_filterable


def test_property_text_analyzer_ascii_fold_version_gate(
    collection_factory: CollectionFactory,
) -> None:
    """Creating an ASCII-folding property must be rejected on Weaviate older than 1.37.0."""
    # A throwaway collection is created only to read the connected server's version.
    probe = collection_factory("dummy")
    server_version = probe._connection._weaviate_version
    if server_version.is_at_least(1, 37, 0):
        pytest.skip("Version gate only applies to Weaviate < 1.37.0")

    with pytest.raises(WeaviateUnsupportedFeatureError):
        collection_factory(
            vectorizer_config=Configure.Vectorizer.none(),
            properties=[
                Property(
                    name="title",
                    data_type=DataType.TEXT,
                    tokenization=Tokenization.WORD,
                    text_analyzer=Configure.TextAnalyzer.ascii_fold(),
                ),
            ],
        )
Comment thread
amourao marked this conversation as resolved.


def test_stopwords_roundtrip_from_dict(collection_factory: CollectionFactory) -> None:
    """Round-trip a stopwords configuration through ``create_from_dict``.

    A collection is created with a stopwords preset plus additions and removals,
    its config is read back, renamed, and used to create a second collection from
    the dict form. Both the config objects and their dict forms must match.
    """
    collection = collection_factory(
        inverted_index_config=Configure.inverted_index(
            stopwords_additions=["a"],
            stopwords_preset=StopwordsPreset.EN,
            stopwords_removals=["the"],
        ),
    )
    config = collection.config.get()
    assert config.inverted_index_config.stopwords.preset == StopwordsPreset.EN
    assert config.inverted_index_config.stopwords.removals == ["the"]

    name = f"TestStopwordsRoundtrip{collection.name}"
    config.name = name
    with weaviate.connect_to_local() as client:
        # Presumably clears a leftover copy from an earlier, interrupted run.
        client.collections.delete(name)
        try:
            client.collections.create_from_dict(config.to_dict())
            new = client.collections.use(name).config.get()
            assert config == new
            assert config.to_dict() == new.to_dict()
        finally:
            # Remove the copied collection even when creation or an assertion fails,
            # so a failing run does not leave it behind on the server.
            client.collections.delete(name)
64 changes: 61 additions & 3 deletions test/collection/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,20 @@
from pydantic import ValidationError

from weaviate.collections.classes.config import (
_AsyncReplicationConfig,
_ReplicationConfig,
_ReplicationConfigUpdate,
Configure,
DataType,
Property,
Reconfigure,
ReferenceProperty,
Tokenization,
Vectorizers,
_AsyncReplicationConfig,
_CollectionConfigCreate,
_GenerativeProvider,
_ReplicationConfig,
_ReplicationConfigUpdate,
_RerankerProvider,
_TextAnalyzerConfigCreate,
_VectorizerConfigCreate,
_ReplicationConfigCreate,
ReplicationDeletionStrategy,
Expand Down Expand Up @@ -3021,3 +3023,59 @@ def test_nested_property_with_id_name_is_allowed() -> None:
],
)
assert prop.nestedProperties[0].name == "id"


class Test_TextAnalyzerConfigCreate:
    """Serialization and validation checks for the text analyzer option on ``Property``."""

    def test_property_without_text_analyzer_omits_key(self) -> None:
        """A property built without a text analyzer serializes without ``textAnalyzer``."""
        serialized = Property(name="title", data_type=DataType.TEXT)._to_dict()
        assert "textAnalyzer" not in serialized

    def test_property_with_ascii_fold_only(self) -> None:
        """``ascii_fold()`` with no ignore list serializes to just ``asciiFold``."""
        analyzer = Configure.TextAnalyzer.ascii_fold()
        title = Property(name="title", data_type=DataType.TEXT, text_analyzer=analyzer)
        assert title._to_dict()["textAnalyzer"] == {"asciiFold": True}

    def test_property_with_ascii_fold_and_ignore(self) -> None:
        """The ignore list is serialized next to ``asciiFold``; tokenization is kept."""
        title = Property(
            name="title",
            data_type=DataType.TEXT,
            tokenization=Tokenization.WORD,
            text_analyzer=Configure.TextAnalyzer.ascii_fold(ignore=["é", "ñ"]),
        )
        payload = title._to_dict()
        expected_analyzer = {
            "asciiFold": True,
            "asciiFoldIgnore": ["é", "ñ"],
        }
        assert payload["textAnalyzer"] == expected_analyzer
        assert payload["tokenization"] == "word"

    def test_text_analyzer_rejects_ignore_without_ascii_fold(self) -> None:
        """An ignore list without ``ascii_fold`` fails validation."""
        with pytest.raises(ValidationError):
            _TextAnalyzerConfigCreate(ascii_fold_ignore=["é"])

    def test_nested_property_with_text_analyzer(self) -> None:
        """A text analyzer on a nested property appears in the nested serialization."""
        inner = Property(
            name="title",
            data_type=DataType.TEXT,
            text_analyzer=Configure.TextAnalyzer.ascii_fold(ignore=["ñ"]),
        )
        outer = Property(
            name="meta",
            data_type=DataType.OBJECT,
            nested_properties=[inner],
        )
        nested_payload = outer._to_dict()["nestedProperties"][0]
        assert nested_payload["textAnalyzer"] == {
            "asciiFold": True,
            "asciiFoldIgnore": ["ñ"],
        }

    def test_text_analyzer_rejects_wrong_types(self) -> None:
        """A string where a bool or a list of strings is expected fails validation."""
        with pytest.raises(ValidationError):
            _TextAnalyzerConfigCreate(ascii_fold="yes")  # type: ignore[arg-type]
        with pytest.raises(ValidationError):
            _TextAnalyzerConfigCreate(ascii_fold_ignore="é")  # type: ignore[arg-type]
79 changes: 78 additions & 1 deletion test/collection/test_config_methods.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
from weaviate.collections.classes.config_methods import _collection_configs_simple_from_json
from weaviate.collections.classes.config_methods import (
_collection_configs_simple_from_json,
_nested_properties_from_config,
_properties_from_config,
)


def test_collection_config_simple_from_json_with_none_vectorizer_config() -> None:
Expand Down Expand Up @@ -68,3 +72,76 @@ def test_collection_config_simple_from_json_with_none_vectorizer_config() -> Non
assert "default" in vec_config
assert vec_config["default"].vectorizer.model == {}
assert vec_config["default"].vectorizer.source_properties is None


def _make_text_prop(name: str, **extra) -> dict:
base = {
"name": name,
"dataType": ["text"],
"indexFilterable": True,
"indexSearchable": True,
"indexRangeFilters": False,
"tokenization": "word",
}
base.update(extra)
return base


def test_properties_from_config_parses_text_analyzer() -> None:
    """``textAnalyzer`` in a property schema is parsed and serializes back unchanged."""
    wire_analyzer = {"asciiFold": True, "asciiFoldIgnore": ["é"]}
    schema = {
        "vectorizer": "none",
        "properties": [
            _make_text_prop("title", textAnalyzer=wire_analyzer),
            _make_text_prop("body"),
        ],
    }
    parsed = _properties_from_config(schema)
    title = next(prop for prop in parsed if prop.name == "title")
    body = next(prop for prop in parsed if prop.name == "body")

    # Property with a textAnalyzer entry: both fields are populated.
    assert title.text_analyzer is not None
    assert title.text_analyzer.ascii_fold is True
    assert title.text_analyzer.ascii_fold_ignore == ["é"]

    # Property without a textAnalyzer entry: nothing is set.
    assert body.text_analyzer is None

    # Serializing the parsed objects yields the wire format again.
    assert title.to_dict()["textAnalyzer"] == {
        "asciiFold": True,
        "asciiFoldIgnore": ["é"],
    }
    assert "textAnalyzer" not in body.to_dict()


def test_properties_from_config_text_analyzer_omitted_when_no_ascii_fold() -> None:
    """A ``textAnalyzer`` entry lacking ``asciiFold`` parses to ``text_analyzer is None``."""
    # The schema carries a textAnalyzer object, but without an asciiFold key.
    analyzer_without_flag = {"asciiFoldIgnore": ["é"]}
    schema = {
        "vectorizer": "none",
        "properties": [_make_text_prop("title", textAnalyzer=analyzer_without_flag)],
    }
    parsed = _properties_from_config(schema)
    title = parsed[0]
    assert title.text_analyzer is None


def test_nested_properties_from_config_parses_text_analyzer() -> None:
    """``textAnalyzer`` on a nested property schema is parsed and serializes back."""
    nested_schema = [
        _make_text_prop(
            "title",
            textAnalyzer={"asciiFold": True, "asciiFoldIgnore": ["ñ"]},
        ),
    ]
    first = _nested_properties_from_config(nested_schema)[0]

    assert first.text_analyzer is not None
    assert first.text_analyzer.ascii_fold is True
    assert first.text_analyzer.ascii_fold_ignore == ["ñ"]

    expected_wire = {
        "asciiFold": True,
        "asciiFoldIgnore": ["ñ"],
    }
    assert first.to_dict()["textAnalyzer"] == expected_wire
2 changes: 2 additions & 0 deletions weaviate/classes/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
ReplicationDeletionStrategy,
Rerankers,
StopwordsPreset,
TextAnalyzerConfig,
Tokenization,
VectorDistances,
)
Expand Down Expand Up @@ -39,6 +40,7 @@
"ReferenceProperty",
"Rerankers",
"StopwordsPreset",
"TextAnalyzerConfig",
"Tokenization",
"Vectorizers",
"VectorDistances",
Expand Down
75 changes: 74 additions & 1 deletion weaviate/collections/classes/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,14 @@
)

from deprecation import deprecated as docstring_deprecated
from pydantic import AnyHttpUrl, Field, TypeAdapter, ValidationInfo, field_validator
from pydantic import (
AnyHttpUrl,
Field,
TypeAdapter,
ValidationInfo,
field_validator,
model_validator,
)
from typing_extensions import TypeAlias
from typing_extensions import deprecated as typing_deprecated

Expand Down Expand Up @@ -1671,6 +1678,15 @@ class _PropertyVectorizerConfig:
PropertyVectorizerConfig = _PropertyVectorizerConfig


@dataclass
class _TextAnalyzerConfig(_ConfigBase):
    """Text analyzer settings attached to a property's config.

    Attributes:
        ascii_fold: Whether ASCII folding is enabled for the property.
        ascii_fold_ignore: Characters excluded from ASCII folding, or ``None``
            when no exclusions are set.
    """

    ascii_fold: bool
    ascii_fold_ignore: Optional[List[str]]


# Public alias for the private dataclass.
TextAnalyzerConfig = _TextAnalyzerConfig


@dataclass
class _NestedProperty(_ConfigBase):
data_type: DataType
Expand All @@ -1679,6 +1695,7 @@ class _NestedProperty(_ConfigBase):
index_searchable: bool
name: str
nested_properties: Optional[List["NestedProperty"]]
text_analyzer: Optional[_TextAnalyzerConfig]
tokenization: Optional[Tokenization]

def to_dict(self) -> Dict[str, Any]:
Expand Down Expand Up @@ -1712,6 +1729,7 @@ class _Property(_PropertyBase):
index_range_filters: bool
index_searchable: bool
nested_properties: Optional[List[NestedProperty]]
text_analyzer: Optional[_TextAnalyzerConfig]
tokenization: Optional[Tokenization]
vectorizer_config: Optional[PropertyVectorizerConfig]
vectorizer: Optional[str]
Expand All @@ -1724,6 +1742,8 @@ def to_dict(self) -> Dict[str, Any]:
out["indexSearchable"] = self.index_searchable
out["indexRangeFilters"] = self.index_range_filters
out["tokenization"] = self.tokenization.value if self.tokenization else None
if self.text_analyzer is not None:
out["textAnalyzer"] = self.text_analyzer.to_dict()
if self.nested_properties is not None and len(self.nested_properties) > 0:
out["nestedProperties"] = [np.to_dict() for np in self.nested_properties]
module_config: Dict[str, Any] = {}
Expand Down Expand Up @@ -2161,6 +2181,54 @@ class _ShardStatus:
ShardStatus = _ShardStatus


class _TextAnalyzerConfigCreate(_ConfigCreateModel):
    """Text analysis options for a property.

    Configures ASCII folding behavior for `text` and `text[]` properties that use an
    inverted index (searchable or filterable). When enabled, accent/diacritic marks are
    folded to their base characters during indexing and search (e.g. 'école' matches
    'ecole').

    Attributes:
        ascii_fold: If True, accent/diacritic marks are folded to their base characters
            during indexing and search. If omitted, the field is not sent to the server
            and the server default (False) applies.
        ascii_fold_ignore: Optional list of characters that should be excluded from
            ASCII folding (e.g. ['é'] keeps 'é' from being folded to 'e'). If omitted,
            the field is not sent to the server.

    Both settings are immutable after the property is created.

    Raises:
        ValueError: If `ascii_fold_ignore` is set while `ascii_fold` is not `True`.
    """

    # Attribute names use the camelCase wire keys; the aliases are the snake_case
    # keyword arguments accepted by the constructor.
    asciiFold: Optional[bool] = Field(default=None, alias="ascii_fold")
    asciiFoldIgnore: Optional[List[str]] = Field(default=None, alias="ascii_fold_ignore")

    @model_validator(mode="after")
    def _validate_ascii_fold_ignore(self) -> "_TextAnalyzerConfigCreate":
        """Reject an ignore list unless ASCII folding is explicitly enabled."""
        # `is not True` also rejects the unset (None) case, not just an explicit False.
        if self.asciiFold is not True and self.asciiFoldIgnore is not None:
            raise ValueError("asciiFoldIgnore cannot be set when asciiFold is not enabled")
        return self


class _TextAnalyzer:
    """Factory class for creating text analyzer configurations.

    Use ``Configure.TextAnalyzer`` to access these methods.
    """

    @staticmethod
    def ascii_fold(
        ignore: Optional[List[str]] = None,
    ) -> _TextAnalyzerConfigCreate:
        """Create a text analyzer config with ASCII folding enabled.

        Args:
            ignore: Optional list of characters that should be excluded from
                ASCII folding (e.g. ``['é']`` keeps 'é' from being folded to 'e').

        Returns:
            A ``_TextAnalyzerConfigCreate`` built with ``ascii_fold=True`` and
            ``ascii_fold_ignore`` set to ``ignore``.
        """
        return _TextAnalyzerConfigCreate(ascii_fold=True, ascii_fold_ignore=ignore)


class Property(_ConfigCreateModel):
"""This class defines the structure of a data property that a collection can have within Weaviate.

Expand All @@ -2173,6 +2241,9 @@ class Property(_ConfigCreateModel):
index_searchable: Whether the property should be searchable in the inverted index.
nested_properties: nested properties for data type OBJECT and OBJECT_ARRAY`.
skip_vectorization: Whether to skip vectorization of the property. Defaults to `False`.
text_analyzer: Text analysis options for the property. Configures ASCII folding
behavior for text and text[] properties using an inverted index. Immutable
after the property is created.
tokenization: The tokenization method to use for the inverted index. Defaults to `None`.
vectorize_property_name: Whether to vectorize the property name. Defaults to `True`.
"""
Expand All @@ -2187,6 +2258,7 @@ class Property(_ConfigCreateModel):
default=None, alias="nested_properties"
)
skip_vectorization: bool = Field(default=False)
textAnalyzer: Optional[_TextAnalyzerConfigCreate] = Field(default=None, alias="text_analyzer")
tokenization: Optional[Tokenization] = Field(default=None)
vectorize_property_name: bool = Field(default=True)

Expand Down Expand Up @@ -2566,6 +2638,7 @@ class Configure:
MultiVectors = _MultiVectors
ObjectTTL = _ObjectTTL
Replication = _Replication
TextAnalyzer = _TextAnalyzer

@staticmethod
def inverted_index(
Expand Down
Loading