@@ -23,7 +23,8 @@ class SupportingTextValidator:
2323 appears in the referenced publication using deterministic substring matching.
2424
2525 Supports:
26- - Editorial notes in [square brackets] that are ignored
26+ - Editorial notes in [square brackets] that are ignored by default
27+ - Configurable literal bracket patterns for bracketed source text
2728 - Multi-part quotes with "..." separators indicating omitted text
2829
2930 Examples:
@@ -46,6 +47,9 @@ def __init__(self, config: ReferenceValidationConfig):
4647 """
4748 self .config = config
4849 self .fetcher = ReferenceFetcher (config )
50+ self ._literal_bracket_regexes = [
51+ re .compile (pattern ) for pattern in config .literal_bracket_patterns
52+ ]
4953
5054 def validate_title (
5155 self ,
@@ -287,7 +291,7 @@ def find_text_in_reference(
287291 return self ._substring_match (query_parts , reference .content , supporting_text )
288292
289293 def _split_query (self , text : str ) -> list [str ]:
290- """Split query into parts separated by ... removing [...] editorial notes.
294+ """Split query into parts while stripping editorial brackets by default.
291295
292296 Args:
293297 text: Query text
@@ -302,12 +306,28 @@ def _split_query(self, text: str) -> list[str]:
302306 ['protein functions', 'in cells']
303307 >>> validator._split_query("protein [important] functions")
304308 ['protein functions']
309+ >>> config = ReferenceValidationConfig(literal_bracket_patterns=[r"\\d"])
310+ >>> validator = SupportingTextValidator(config)
311+ >>> validator._split_query("protein [important] binds [2Fe-2S] cluster")
312+ ['protein binds [2Fe-2S] cluster']
305313 >>> validator._split_query("[editorial note]")
306314 []
307315 """
308- text_without_brackets = re .sub (r"\[.*?\]" , " " , text )
316+ if not self ._literal_bracket_regexes :
317+ text_without_brackets = re .sub (r"\[.*?\]" , " " , text )
318+ else :
319+
320+ def replace_bracket (match : re .Match [str ]) -> str :
321+ """Preserve configured literal bracket content, strip editorial notes."""
322+ content = match .group (1 )
323+ if any (regex .search (content ) for regex in self ._literal_bracket_regexes ):
324+ return match .group (0 )
325+ return " "
326+
327+ text_without_brackets = re .sub (r"\[(.*?)\]" , replace_bracket , text )
328+
309329 parts = re .split (r"\s*\.{2,}\s*" , text_without_brackets )
310- parts = [p .strip () for p in parts if p .strip ()]
330+ parts = [re . sub ( r"\s+" , " " , p ) .strip () for p in parts if p .strip ()]
311331 return parts
312332
313333 def _substring_match (
0 commit comments