Add literal bracket patterns for supporting text validation

cmungall · cmungall · commit e0f2163ae90d · 2026-04-03T14:58:33.000-07:00
diff --git a/README.md b/README.md
@@ -496,6 +496,19 @@ supporting_text: "protein [X] functions in cell cycle regulation"
 - `[gene name]` - Clarifications
 - `[...]` - Omitted content markers
 
+If your corpus uses square brackets as literal source text, configure patterns
+identifying the bracketed text that should be preserved instead of stripped:
+
+```yaml
+validation:
+  literal_bracket_patterns:
+    - "\\d"       # keep [2Fe-2S], [+21], [30S], [Ca2+]
+    - "^\\S"      # keep any bracket whose content starts with a non-space, e.g. [poly(A)+] (also [gene name]); strips only padded [ editorial ]
+```
+
+Each pattern is searched for anywhere in the content inside `[...]` (unanchored; use `^`/`$` to anchor).
+If any pattern matches, that bracketed text is kept during validation; otherwise it is stripped.
+
 ### Omitted Text `...`
 
 Use ellipsis for gaps in quoted text:
diff --git a/docs/concepts/editorial-conventions.md b/docs/concepts/editorial-conventions.md
@@ -70,6 +70,30 @@ Nested brackets are not recommended and may not work as expected:
 ✅ "MUC1 [mucin 1, a membrane protein] blocks targeting"
 ```
 
+### Literal Brackets in Source Text
+
+Some corpora use square brackets as part of the quoted source text itself:
+
+```
+Reference: "polyadenylated [poly(A)+] RNA export"
+Quote:     "polyadenylated [poly(A)+] RNA export"
+```
+
+By default, linkml-reference-validator strips all bracketed text for backward
+compatibility. To preserve literal brackets, configure
+`literal_bracket_patterns` in your validation config:
+
+```yaml
+validation:
+  literal_bracket_patterns:
+    - "\\d"       # keep [2Fe-2S], [+21], [30S], [Ca2+]
+    - "^\\S"      # keep any bracket whose content starts with a non-space, e.g. [poly(A)+] (also [gene name]); strips only padded [ editorial ]
+```
+
+Each pattern is searched for anywhere in the content inside `[...]` (unanchored, as with
+Python's `re.search`). If any pattern matches, that bracketed text is kept during validation.
+If none match, the content is treated as an editorial insertion and stripped.
+
 ## Ellipsis `...`
 
 Use ellipsis (three dots) to indicate **omitted text** between parts of a quote.
diff --git a/src/linkml_reference_validator/models.py b/src/linkml_reference_validator/models.py
@@ -352,6 +352,11 @@ class ReferenceValidationConfig(BaseModel):
         ['SRA', 'MGNIFY']
         >>> config.unknown_prefix_severity
         <ValidationSeverity.WARNING: 'WARNING'>
+        >>> config = ReferenceValidationConfig(
+        ...     literal_bracket_patterns=[r"\d", r"^[A-Z]$"]
+        ... )
+        >>> config.literal_bracket_patterns
+        ['\\d', '^[A-Z]$']
     """
 
     cache_dir: Path = Field(
@@ -385,6 +390,15 @@ class ReferenceValidationConfig(BaseModel):
         ge=1,
         description="Regex capture group number containing the reference ID",
     )
+    literal_bracket_patterns: list[str] = Field(
+        default_factory=list,
+        description=(
+            "Regular expressions matched against the content inside square brackets. "
+            "If any pattern matches, the bracketed text is treated as literal source "
+            "text and preserved during supporting text validation. "
+            "If no patterns are configured, all bracketed text is stripped."
+        ),
+    )
     reference_prefix_map: dict[str, str] = Field(
         default_factory=dict,
         description=(
diff --git a/src/linkml_reference_validator/validation/supporting_text_validator.py b/src/linkml_reference_validator/validation/supporting_text_validator.py
@@ -23,7 +23,8 @@ class SupportingTextValidator:
     appears in the referenced publication using deterministic substring matching.
 
     Supports:
-    - Editorial notes in [square brackets] that are ignored
+    - Editorial notes in [square brackets] that are ignored by default
+    - Configurable literal bracket patterns for bracketed source text
     - Multi-part quotes with "..." separators indicating omitted text
 
     Examples:
@@ -46,6 +47,9 @@ def __init__(self, config: ReferenceValidationConfig):
         """
         self.config = config
         self.fetcher = ReferenceFetcher(config)
+        self._literal_bracket_regexes = [
+            re.compile(pattern) for pattern in config.literal_bracket_patterns
+        ]
 
     def validate_title(
         self,
@@ -287,7 +291,7 @@ def find_text_in_reference(
         return self._substring_match(query_parts, reference.content, supporting_text)
 
     def _split_query(self, text: str) -> list[str]:
-        """Split query into parts separated by ... removing [...] editorial notes.
+        """Split query into parts while stripping editorial brackets by default.
 
         Args:
             text: Query text
@@ -302,12 +306,28 @@ def _split_query(self, text: str) -> list[str]:
             ['protein functions', 'in cells']
             >>> validator._split_query("protein [important] functions")
             ['protein functions']
+            >>> config = ReferenceValidationConfig(literal_bracket_patterns=[r"\\d"])
+            >>> validator = SupportingTextValidator(config)
+            >>> validator._split_query("protein [important] binds [2Fe-2S] cluster")
+            ['protein binds [2Fe-2S] cluster']
             >>> validator._split_query("[editorial note]")
             []
         """
-        text_without_brackets = re.sub(r"\[.*?\]", " ", text)
+        if not self._literal_bracket_regexes:
+            text_without_brackets = re.sub(r"\[.*?\]", " ", text)
+        else:
+
+            def replace_bracket(match: re.Match[str]) -> str:
+                """Preserve configured literal bracket content, strip editorial notes."""
+                content = match.group(1)
+                if any(regex.search(content) for regex in self._literal_bracket_regexes):
+                    return match.group(0)
+                return " "
+
+            text_without_brackets = re.sub(r"\[(.*?)\]", replace_bracket, text)
+
         parts = re.split(r"\s*\.{2,}\s*", text_without_brackets)
-        parts = [p.strip() for p in parts if p.strip()]
+        parts = [re.sub(r"\s+", " ", p).strip() for p in parts if p.strip()]
         return parts
 
     def _substring_match(
diff --git a/tests/test_models.py b/tests/test_models.py
@@ -16,16 +16,19 @@ def test_config_defaults():
     config = ReferenceValidationConfig()
     assert config.cache_dir == Path("references_cache")
     assert config.rate_limit_delay == 0.5
+    assert config.literal_bracket_patterns == []
 
 
 def test_config_custom_values():
     """Test configuration with custom values."""
     config = ReferenceValidationConfig(
         cache_dir=Path("/tmp/cache"),
         rate_limit_delay=1.0,
+        literal_bracket_patterns=[r"\d"],
     )
     assert config.cache_dir == Path("/tmp/cache")
     assert config.rate_limit_delay == 1.0
+    assert config.literal_bracket_patterns == [r"\d"]
 
 
 def test_config_get_cache_dir(tmp_path):
diff --git a/tests/test_supporting_text_validator.py b/tests/test_supporting_text_validator.py
@@ -96,6 +96,20 @@ def test_split_query_with_brackets(validator):
     assert "important" not in parts[0]
 
 
+def test_split_query_keeps_literal_brackets_when_pattern_matches(tmp_path):
+    """Test splitting query keeps configured literal bracket content."""
+    config = ReferenceValidationConfig(
+        cache_dir=tmp_path / "cache",
+        rate_limit_delay=0.0,
+        literal_bracket_patterns=[r"\d"],
+    )
+    validator = SupportingTextValidator(config)
+
+    parts = validator._split_query("protein [important] binds [2Fe-2S] cluster")
+
+    assert parts == ["protein binds [2Fe-2S] cluster"]
+
+
 def test_substring_match_found(validator):
     """Test substring matching when text is found."""
     match = validator._substring_match(
@@ -163,6 +177,37 @@ def test_find_text_empty_query_after_brackets(validator):
     assert "empty" in match.error_message.lower()
 
 
+def test_find_text_in_reference_literal_brackets_require_config(validator):
+    """Test literal bracket content is still stripped by default."""
+    ref = ReferenceContent(
+        reference_id="PMID:123",
+        content="The [2Fe-2S] cluster is required for activity.",
+    )
+
+    match = validator.find_text_in_reference("The [2Fe-2S] cluster", ref)
+
+    assert match.found is False
+
+
+def test_find_text_in_reference_keeps_literal_brackets_with_config(tmp_path):
+    """Test literal bracket content can be preserved through configuration."""
+    config = ReferenceValidationConfig(
+        cache_dir=tmp_path / "cache",
+        rate_limit_delay=0.0,
+        literal_bracket_patterns=[r"\d"],
+    )
+    validator = SupportingTextValidator(config)
+    ref = ReferenceContent(
+        reference_id="PMID:123",
+        content="The [2Fe-2S] cluster is required for activity.",
+    )
+
+    match = validator.find_text_in_reference("The [2Fe-2S] cluster", ref)
+
+    assert match.found is True
+    assert match.similarity_score == 1.0
+
+
 def test_validate_success(validator, mocker):
     """Test successful validation."""
     mock_fetch = mocker.patch.object(validator.fetcher, "fetch")
diff --git a/tests/test_validation_config.py b/tests/test_validation_config.py
@@ -36,3 +36,20 @@ def test_load_validation_config_ignores_repair_only(tmp_path):
     config = load_validation_config(config_file)
 
     assert config.reference_prefix_map == {}
+
+
+def test_load_validation_config_literal_bracket_patterns(tmp_path):
+    """Should load literal bracket patterns from validation config."""
+    config_file = tmp_path / ".linkml-reference-validator.yaml"
+    config_file.write_text(
+        """
+validation:
+  literal_bracket_patterns:
+    - "\\\\d"
+    - "^\\\\S"
+"""
+    )
+
+    config = load_validation_config(config_file)
+
+    assert config.literal_bracket_patterns == [r"\d", r"^\S"]