Skip to content

Commit e0f2163

Browse files
committed
Add literal bracket patterns for supporting text validation
1 parent 1ca0c8b commit e0f2163

File tree

7 files changed

+140
-4
lines changed

7 files changed

+140
-4
lines changed

README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -496,6 +496,19 @@ supporting_text: "protein [X] functions in cell cycle regulation"
496496
- `[gene name]` - Clarifications
497497
- `[...]` - Omitted content markers
498498

499+
If your corpus uses square brackets as literal source text, configure patterns
500+
that should be preserved instead of stripped:
501+
502+
```yaml
503+
validation:
504+
literal_bracket_patterns:
505+
- "\\d" # keep [2Fe-2S], [+21], [30S], [Ca2+]
506+
- "^\\S" # keep any bracket whose content starts with a non-space, e.g. [poly(A)+]; strips only space-padded notes like [ editorial ]
507+
```
508+
509+
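Each pattern is searched for (`re.search`) within the content inside `[...]`, so an unanchored pattern may match anywhere in it. If any pattern matches,
510+
that bracketed text is kept verbatim during validation. Because `^\S` also matches ordinary unpadded notes such as `[gene name]`, editorial insertions must be written with padding (`[ gene name ]`) to be stripped when that pattern is configured.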
511+
499512
### Omitted Text `...`
500513

501514
Use ellipsis for gaps in quoted text:

docs/concepts/editorial-conventions.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,30 @@ Nested brackets are not recommended and may not work as expected:
7070
✅ "MUC1 [mucin 1, a membrane protein] blocks targeting"
7171
```
7272

73+
### Literal Brackets in Source Text
74+
75+
Some corpora use square brackets as part of the quoted source text itself:
76+
77+
```
78+
Reference: "polyadenylated [poly(A)+] RNA export"
79+
Quote: "polyadenylated [poly(A)+] RNA export"
80+
```
81+
82+
By default, linkml-reference-validator strips all bracketed text for backward
83+
compatibility. To preserve literal brackets, configure
84+
`literal_bracket_patterns` in your validation config:
85+
86+
```yaml
87+
validation:
88+
literal_bracket_patterns:
89+
- "\\d" # keep [2Fe-2S], [+21], [30S], [Ca2+]
90+
- "^\\S" # keep any bracket whose content starts with a non-space, e.g. [poly(A)+]; strips only space-padded notes like [ editorial ]
91+
```
92+
93+
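Each pattern is searched for (`re.search`) within the content inside `[...]`, so an unanchored pattern may match anywhere in it. If any pattern
94+
matches, that bracketed text is kept during validation. If none match, the
95+
content is treated as an editorial insertion and stripped. Note that `^\S` also matches unpadded editorial notes such as `[gene name]`; when using it, write editorial insertions with padding (`[ gene name ]`) so they are still stripped.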
96+
7397
## Ellipsis `...`
7498

7599
Use ellipsis (three dots) to indicate **omitted text** between parts of a quote.

src/linkml_reference_validator/models.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,11 @@ class ReferenceValidationConfig(BaseModel):
352352
['SRA', 'MGNIFY']
353353
>>> config.unknown_prefix_severity
354354
<ValidationSeverity.WARNING: 'WARNING'>
355+
>>> config = ReferenceValidationConfig(
356+
... literal_bracket_patterns=[r"\d", r"^[A-Z]$"]
357+
... )
358+
>>> config.literal_bracket_patterns
359+
['\\d', '^[A-Z]$']
355360
"""
356361

357362
cache_dir: Path = Field(
@@ -385,6 +390,15 @@ class ReferenceValidationConfig(BaseModel):
385390
ge=1,
386391
description="Regex capture group number containing the reference ID",
387392
)
393+
literal_bracket_patterns: list[str] = Field(
394+
default_factory=list,
395+
description=(
396+
"Regular expressions matched against the content inside square brackets. "
397+
"If any pattern matches, the bracketed text is treated as literal source "
398+
"text and preserved during supporting text validation. "
399+
"If no patterns are configured, all bracketed text is stripped."
400+
),
401+
)
388402
reference_prefix_map: dict[str, str] = Field(
389403
default_factory=dict,
390404
description=(

src/linkml_reference_validator/validation/supporting_text_validator.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ class SupportingTextValidator:
2323
appears in the referenced publication using deterministic substring matching.
2424
2525
Supports:
26-
- Editorial notes in [square brackets] that are ignored
26+
- Editorial notes in [square brackets] that are ignored by default
27+
- Configurable literal bracket patterns for bracketed source text
2728
- Multi-part quotes with "..." separators indicating omitted text
2829
2930
Examples:
@@ -46,6 +47,9 @@ def __init__(self, config: ReferenceValidationConfig):
4647
"""
4748
self.config = config
4849
self.fetcher = ReferenceFetcher(config)
50+
self._literal_bracket_regexes = [
51+
re.compile(pattern) for pattern in config.literal_bracket_patterns
52+
]
4953

5054
def validate_title(
5155
self,
@@ -287,7 +291,7 @@ def find_text_in_reference(
287291
return self._substring_match(query_parts, reference.content, supporting_text)
288292

289293
def _split_query(self, text: str) -> list[str]:
290-
"""Split query into parts separated by ... removing [...] editorial notes.
294+
"""Split query into parts while stripping editorial brackets by default.
291295
292296
Args:
293297
text: Query text
@@ -302,12 +306,28 @@ def _split_query(self, text: str) -> list[str]:
302306
['protein functions', 'in cells']
303307
>>> validator._split_query("protein [important] functions")
304308
['protein functions']
309+
>>> config = ReferenceValidationConfig(literal_bracket_patterns=[r"\\d"])
310+
>>> validator = SupportingTextValidator(config)
311+
>>> validator._split_query("protein [important] binds [2Fe-2S] cluster")
312+
['protein binds [2Fe-2S] cluster']
305313
>>> validator._split_query("[editorial note]")
306314
[]
307315
"""
308-
text_without_brackets = re.sub(r"\[.*?\]", " ", text)
316+
if not self._literal_bracket_regexes:
317+
text_without_brackets = re.sub(r"\[.*?\]", " ", text)
318+
else:
319+
320+
def replace_bracket(match: re.Match[str]) -> str:
321+
"""Preserve configured literal bracket content, strip editorial notes."""
322+
content = match.group(1)
323+
if any(regex.search(content) for regex in self._literal_bracket_regexes):
324+
return match.group(0)
325+
return " "
326+
327+
text_without_brackets = re.sub(r"\[(.*?)\]", replace_bracket, text)
328+
309329
parts = re.split(r"\s*\.{2,}\s*", text_without_brackets)
310-
parts = [p.strip() for p in parts if p.strip()]
330+
parts = [re.sub(r"\s+", " ", p).strip() for p in parts if p.strip()]
311331
return parts
312332

313333
def _substring_match(

tests/test_models.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,19 @@ def test_config_defaults():
1616
config = ReferenceValidationConfig()
1717
assert config.cache_dir == Path("references_cache")
1818
assert config.rate_limit_delay == 0.5
19+
assert config.literal_bracket_patterns == []
1920

2021

2122
def test_config_custom_values():
2223
"""Test configuration with custom values."""
2324
config = ReferenceValidationConfig(
2425
cache_dir=Path("/tmp/cache"),
2526
rate_limit_delay=1.0,
27+
literal_bracket_patterns=[r"\d"],
2628
)
2729
assert config.cache_dir == Path("/tmp/cache")
2830
assert config.rate_limit_delay == 1.0
31+
assert config.literal_bracket_patterns == [r"\d"]
2932

3033

3134
def test_config_get_cache_dir(tmp_path):

tests/test_supporting_text_validator.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,20 @@ def test_split_query_with_brackets(validator):
9696
assert "important" not in parts[0]
9797

9898

99+
def test_split_query_keeps_literal_brackets_when_pattern_matches(tmp_path):
100+
"""Test splitting query keeps configured literal bracket content."""
101+
config = ReferenceValidationConfig(
102+
cache_dir=tmp_path / "cache",
103+
rate_limit_delay=0.0,
104+
literal_bracket_patterns=[r"\d"],
105+
)
106+
validator = SupportingTextValidator(config)
107+
108+
parts = validator._split_query("protein [important] binds [2Fe-2S] cluster")
109+
110+
assert parts == ["protein binds [2Fe-2S] cluster"]
111+
112+
99113
def test_substring_match_found(validator):
100114
"""Test substring matching when text is found."""
101115
match = validator._substring_match(
@@ -163,6 +177,37 @@ def test_find_text_empty_query_after_brackets(validator):
163177
assert "empty" in match.error_message.lower()
164178

165179

180+
def test_find_text_in_reference_literal_brackets_require_config(validator):
181+
"""Test literal bracket content is still stripped by default."""
182+
ref = ReferenceContent(
183+
reference_id="PMID:123",
184+
content="The [2Fe-2S] cluster is required for activity.",
185+
)
186+
187+
match = validator.find_text_in_reference("The [2Fe-2S] cluster", ref)
188+
189+
assert match.found is False
190+
191+
192+
def test_find_text_in_reference_keeps_literal_brackets_with_config(tmp_path):
193+
"""Test literal bracket content can be preserved through configuration."""
194+
config = ReferenceValidationConfig(
195+
cache_dir=tmp_path / "cache",
196+
rate_limit_delay=0.0,
197+
literal_bracket_patterns=[r"\d"],
198+
)
199+
validator = SupportingTextValidator(config)
200+
ref = ReferenceContent(
201+
reference_id="PMID:123",
202+
content="The [2Fe-2S] cluster is required for activity.",
203+
)
204+
205+
match = validator.find_text_in_reference("The [2Fe-2S] cluster", ref)
206+
207+
assert match.found is True
208+
assert match.similarity_score == 1.0
209+
210+
166211
def test_validate_success(validator, mocker):
167212
"""Test successful validation."""
168213
mock_fetch = mocker.patch.object(validator.fetcher, "fetch")

tests/test_validation_config.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,20 @@ def test_load_validation_config_ignores_repair_only(tmp_path):
3636
config = load_validation_config(config_file)
3737

3838
assert config.reference_prefix_map == {}
39+
40+
41+
def test_load_validation_config_literal_bracket_patterns(tmp_path):
42+
"""Should load literal bracket patterns from validation config."""
43+
config_file = tmp_path / ".linkml-reference-validator.yaml"
44+
config_file.write_text(
45+
"""
46+
validation:
47+
literal_bracket_patterns:
48+
- "\\\\d"
49+
- "^\\\\S"
50+
"""
51+
)
52+
53+
config = load_validation_config(config_file)
54+
55+
assert config.literal_bracket_patterns == [r"\d", r"^\S"]

0 commit comments

Comments
 (0)