Skip to content

Commit 38c1513

Browse files
authored
Merge pull request #42 from Reasat/feature/source-extra-fields
Feature/source extra fields
2 parents 57d9077 + 72cf904 commit 38c1513

File tree

12 files changed

+576
-26
lines changed

12 files changed

+576
-26
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,3 +141,4 @@ dmypy.json
141141
.idea
142142
# Local vscode editor config
143143
.vscode
144+
.cursor

README.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -871,6 +871,32 @@ linkml-reference-validator validate text "text" UNKNOWN:12345
871871
# ✗ Valid: False (WARNING) - Could not fetch reference
872872
```
873873

874+
#### `source_extra_fields` (per-source JSONPath map)
875+
876+
Capture additional fields from reference API responses and append them to cached content so they are included in validation. Keys are source prefixes (e.g. `clinicaltrials`, `PMID`, `DOI`, `GEO`); values map a field name to a JSONPath expression into the raw API response. Prefer paths to a single value (string/number). If the path selects a list, its elements are converted to strings and joined with spaces. If it selects an object or other type, its string representation is used.
877+
878+
**Example:**
879+
880+
Save as `my-config.yaml`:
881+
882+
```yaml
883+
validation:
884+
source_extra_fields:
885+
clinicaltrials:
886+
eligibility: "$.protocolSection.eligibilityModule.eligibilityCriteria"
887+
outcomes: "$.protocolSection.outcomesModule.primaryOutcomes"
888+
```
889+
890+
Pass this config when fetching so the cache includes these sections: use `--config my-config.yaml` with `cache reference` or `validate`. Captured field names are stored in `extra_fields_captured` in the cache frontmatter.
891+
892+
```bash
893+
# Fetch and cache a trial with extra fields (eligibility, outcomes)
894+
linkml-reference-validator cache reference clinicaltrials:NCT00000001 --config my-config.yaml
895+
896+
# Validate text against the cached content (including extra sections)
897+
linkml-reference-validator validate text "Inclusion: age >= 18" clinicaltrials:NCT00001372 --config my-config.yaml
898+
```
899+
874900
### Cache Directory
875901

876902
Default: `references_cache/` in current directory

src/linkml_reference_validator/etl/reference_fetcher.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,19 @@ def _save_to_disk(self, reference: ReferenceContent) -> None:
302302
for keyword in reference.keywords:
303303
lines.append(f"- {self._quote_yaml_value(keyword)}")
304304
lines.append(f"content_type: {reference.content_type}")
305+
if reference.metadata and "extra_fields_captured" in reference.metadata:
306+
extra_fields = reference.metadata.get("extra_fields_captured")
307+
if isinstance(extra_fields, list):
308+
lines.append("extra_fields_captured:")
309+
for field_name in extra_fields:
310+
if isinstance(field_name, str):
311+
lines.append(f"- {self._quote_yaml_value(field_name)}")
312+
else:
313+
logger.warning(
314+
"Skipping non-string item in extra_fields_captured: %r (type %s)",
315+
field_name,
316+
type(field_name).__name__,
317+
)
305318
if reference.supplementary_files:
306319
lines.append("supplementary_files:")
307320
for sf in reference.supplementary_files:
@@ -415,6 +428,10 @@ def _load_markdown_format(
415428
frontmatter.get("supplementary_files")
416429
)
417430

431+
metadata: dict = {}
432+
if "extra_fields_captured" in frontmatter:
433+
metadata["extra_fields_captured"] = frontmatter["extra_fields_captured"]
434+
418435
return ReferenceContent(
419436
reference_id=frontmatter.get("reference_id", reference_id),
420437
title=frontmatter.get("title"),
@@ -426,6 +443,7 @@ def _load_markdown_format(
426443
doi=frontmatter.get("doi"),
427444
keywords=keywords,
428445
supplementary_files=supplementary_files,
446+
metadata=metadata,
429447
)
430448

431449
def _extract_content_from_markdown(self, body: str) -> str:

src/linkml_reference_validator/etl/sources/clinicaltrials.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@
2424

2525
from linkml_reference_validator.models import ReferenceContent, ReferenceValidationConfig
2626
from linkml_reference_validator.etl.sources.base import ReferenceSource, ReferenceSourceRegistry
27+
from linkml_reference_validator.etl.sources.utils import (
28+
extract_extra_fields,
29+
format_extra_fields_for_content,
30+
)
2731

2832
logger = logging.getLogger(__name__)
2933

@@ -133,9 +137,11 @@ def fetch(
133137
logger.warning(f"Failed to parse JSON response for {nct_id}: {exc}")
134138
return None
135139

136-
return self._parse_response(nct_id, data)
140+
return self._parse_response(nct_id, data, config)
137141

138-
def _parse_response(self, nct_id: str, data: dict) -> Optional[ReferenceContent]:
142+
def _parse_response(
143+
self, nct_id: str, data: dict, config: ReferenceValidationConfig
144+
) -> Optional[ReferenceContent]:
139145
"""Parse the ClinicalTrials.gov API response into ReferenceContent.
140146
141147
Args:
@@ -169,6 +175,13 @@ def _parse_response(self, nct_id: str, data: dict) -> Optional[ReferenceContent]
169175
if sponsor_name:
170176
metadata["sponsor"] = sponsor_name
171177

178+
extra = extract_extra_fields(
179+
data, config.source_extra_fields.get("clinicaltrials", {})
180+
)
181+
if extra:
182+
content = (content or "") + "\n\n" + format_extra_fields_for_content(extra)
183+
metadata["extra_fields_captured"] = list(extra.keys())
184+
172185
content_type = "summary" if content else "unavailable"
173186

174187
return ReferenceContent(

src/linkml_reference_validator/etl/sources/doi.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@
2424
SupplementaryFile,
2525
)
2626
from linkml_reference_validator.etl.sources.base import ReferenceSource, ReferenceSourceRegistry
27+
from linkml_reference_validator.etl.sources.utils import (
28+
extract_extra_fields,
29+
format_extra_fields_for_content,
30+
)
2731

2832
logger = logging.getLogger(__name__)
2933

@@ -128,6 +132,14 @@ def _fetch_from_crossref(
128132
# Extract keywords/subjects from Crossref
129133
keywords = self._parse_crossref_subjects(message.get("subject", []))
130134

135+
metadata: dict = {}
136+
extra = extract_extra_fields(
137+
message, config.source_extra_fields.get("DOI", {})
138+
)
139+
if extra:
140+
abstract = (abstract + "\n\n" + format_extra_fields_for_content(extra)) if abstract else format_extra_fields_for_content(extra)
141+
metadata["extra_fields_captured"] = list(extra.keys())
142+
131143
return ReferenceContent(
132144
reference_id=f"DOI:{doi}",
133145
title=title,
@@ -138,6 +150,7 @@ def _fetch_from_crossref(
138150
year=year,
139151
doi=doi,
140152
keywords=keywords,
153+
metadata=metadata,
141154
)
142155

143156
def _fetch_from_datacite(
@@ -195,6 +208,14 @@ def _fetch_from_datacite(
195208
# Fetch supplementary files from repository-specific APIs
196209
supplementary_files = self._fetch_repository_files(doi, config)
197210

211+
metadata: dict = {}
212+
extra = extract_extra_fields(
213+
attributes, config.source_extra_fields.get("DOI", {})
214+
)
215+
if extra:
216+
abstract = (abstract + "\n\n" + format_extra_fields_for_content(extra)) if abstract else format_extra_fields_for_content(extra)
217+
metadata["extra_fields_captured"] = list(extra.keys())
218+
198219
return ReferenceContent(
199220
reference_id=f"DOI:{doi}",
200221
title=title,
@@ -206,6 +227,7 @@ def _fetch_from_datacite(
206227
doi=doi,
207228
keywords=keywords,
208229
supplementary_files=supplementary_files,
230+
metadata=metadata,
209231
)
210232

211233
def _detect_repository(self, doi: str) -> Optional[str]:

src/linkml_reference_validator/etl/sources/entrez.py

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@
1919

2020
from linkml_reference_validator.models import ReferenceContent, ReferenceValidationConfig
2121
from linkml_reference_validator.etl.sources.base import ReferenceSource, ReferenceSourceRegistry
22+
from linkml_reference_validator.etl.sources.utils import (
23+
extract_extra_fields,
24+
format_extra_fields_for_content,
25+
)
2226

2327
logger = logging.getLogger(__name__)
2428

@@ -124,14 +128,23 @@ def fetch(
124128

125129
title = self._get_first_field_value(record, self.TITLE_FIELDS)
126130
content = self._get_first_field_value(record, self.CONTENT_FIELDS)
127-
content_type = "summary" if content else "unavailable"
131+
132+
metadata: dict[str, Any] = {"entrez_db": self.ENTREZ_DB}
133+
extra = extract_extra_fields(
134+
record, config.source_extra_fields.get(self.prefix(), {})
135+
)
136+
if extra:
137+
content = (content or "") + "\n\n" + format_extra_fields_for_content(extra)
138+
metadata["extra_fields_captured"] = list(extra.keys())
139+
140+
content_type = "summary" if (content or "").strip() else "unavailable"
128141

129142
return ReferenceContent(
130143
reference_id=f"{self.prefix()}:{identifier}",
131144
title=title,
132145
content=content,
133146
content_type=content_type,
134-
metadata={"entrez_db": self.ENTREZ_DB},
147+
metadata=metadata,
135148
)
136149

137150
def _extract_record(self, records: Any) -> Optional[dict[str, Any]]:
@@ -247,14 +260,26 @@ def fetch(
247260

248261
title = self._get_first_field_value(record, self.TITLE_FIELDS)
249262
content = self._get_first_field_value(record, self.CONTENT_FIELDS)
250-
content_type = "summary" if content else "unavailable"
263+
264+
metadata: dict[str, Any] = {
265+
"entrez_db": self.ENTREZ_DB,
266+
"entrez_uid": uid,
267+
}
268+
extra = extract_extra_fields(
269+
record, config.source_extra_fields.get(self.prefix(), {})
270+
)
271+
if extra:
272+
content = (content or "") + "\n\n" + format_extra_fields_for_content(extra)
273+
metadata["extra_fields_captured"] = list(extra.keys())
274+
275+
content_type = "summary" if (content or "").strip() else "unavailable"
251276

252277
return ReferenceContent(
253278
reference_id=f"{self.prefix()}:{identifier}",
254279
title=title,
255280
content=content,
256281
content_type=content_type,
257-
metadata={"entrez_db": self.ENTREZ_DB, "entrez_uid": uid},
282+
metadata=metadata,
258283
)
259284

260285
def _accession_to_uid(

src/linkml_reference_validator/etl/sources/pmid.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@
2121

2222
from linkml_reference_validator.models import ReferenceContent, ReferenceValidationConfig
2323
from linkml_reference_validator.etl.sources.base import ReferenceSource, ReferenceSourceRegistry
24+
from linkml_reference_validator.etl.sources.utils import (
25+
extract_extra_fields,
26+
format_extra_fields_for_content,
27+
)
2428

2529
logger = logging.getLogger(__name__)
2630

@@ -143,6 +147,17 @@ def fetch(
143147
content = abstract
144148
content_type = "abstract_only" if abstract else "unavailable"
145149

150+
metadata: dict = {}
151+
extra = extract_extra_fields(
152+
record_dict, config.source_extra_fields.get("PMID", {})
153+
)
154+
if extra:
155+
content = (content or "") + "\n\n" + format_extra_fields_for_content(extra)
156+
metadata["extra_fields_captured"] = list(extra.keys())
157+
158+
if (content or "").strip() and content_type == "unavailable":
159+
content_type = "summary"
160+
146161
return ReferenceContent(
147162
reference_id=f"PMID:{pmid}",
148163
title=title,
@@ -153,6 +168,7 @@ def fetch(
153168
year=year,
154169
doi=doi,
155170
keywords=keywords,
171+
metadata=metadata,
156172
)
157173

158174
def _parse_authors(self, author_list: list) -> list[str]:
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
"""Shared utilities for reference source ETL modules.
2+
3+
Provides helpers used across multiple built-in sources, such as extracting
4+
user-configured extra fields from raw API responses.
5+
"""
6+
7+
import logging
8+
9+
from jsonpath_ng import parse as jsonpath_parse
10+
from jsonpath_ng.exceptions import JsonPathParserError
11+
12+
logger = logging.getLogger(__name__)
13+
14+
15+
def extract_extra_fields(data: dict, field_map: dict[str, str]) -> dict[str, str]:
    """Extract extra fields from a raw API response using JSONPath expressions.

    For each entry in *field_map*, the corresponding JSONPath expression is
    evaluated against *data*. Fields that produce no match, an empty value,
    or an invalid JSONPath expression are omitted from the result. When an
    expression matches multiple nodes (e.g. ``$.items[*].name``), all match
    values are joined with spaces instead of silently keeping only the first.

    Args:
        data: Raw API response dictionary to extract from.
        field_map: Mapping of ``field_name`` → ``JSONPath expression``.
            Example: ``{"eligibility": "$.protocolSection.eligibilityModule.eligibilityCriteria"}``.

    Returns:
        A dict mapping ``field_name`` → extracted text string for each field
        that had a non-empty value. Use :func:`format_extra_fields_for_content`
        to turn this into text to append to reference content, and
        ``list(result.keys())`` for ``extra_fields_captured`` metadata.

    Examples:
        >>> extract_extra_fields({}, {})
        {}
        >>> extract_extra_fields({"title": "My Paper"}, {})
        {}
        >>> extract_extra_fields({}, {"eligibility": "$.eligibility"})
        {}
        >>> extract_extra_fields({"foo": "bar"}, {"foo": "$.foo"})
        {'foo': 'bar'}
        >>> result = extract_extra_fields(
        ...     {"a": "alpha", "b": "beta"},
        ...     {"a": "$.a", "b": "$.b"},
        ... )
        >>> result == {"a": "alpha", "b": "beta"}
        True
        >>> extract_extra_fields({"other": "x"}, {"missing": "$.missing"})
        {}
        >>> extract_extra_fields({"foo": "bar"}, {"bad": "not a valid $[[[jsonpath"})
        {}
        >>> extract_extra_fields(
        ...     {"items": [{"n": "x"}, {"n": "y"}]}, {"names": "$.items[*].n"}
        ... )
        {'names': 'x y'}
    """
    if not field_map or not data:
        return {}

    result: dict[str, str] = {}

    for field_name, jsonpath_expr in field_map.items():
        try:
            parsed = jsonpath_parse(jsonpath_expr)
        except JsonPathParserError as exc:
            logger.warning(
                "Invalid JSONPath expression '%s' for field '%s': %s",
                jsonpath_expr,
                field_name,
                exc,
            )
            continue

        # Collect every matched value. (Using only matches[0] would drop
        # all but the first node for multi-match expressions like $.a[*].b.)
        pieces: list[str] = []
        for match in parsed.find(data):
            raw_value = match.value
            if raw_value is None:
                continue
            if isinstance(raw_value, list):
                # List values: stringify elements and keep the non-blank ones.
                pieces.extend(str(item) for item in raw_value if str(item).strip())
            else:
                # Scalars/objects: use their string representation as-is.
                pieces.append(str(raw_value))

        text = " ".join(piece for piece in pieces if piece.strip())
        if not text.strip():
            continue

        result[field_name] = text

    return result
84+
85+
86+
def format_extra_fields_for_content(extra: dict[str, str]) -> str:
    """Render an extra-fields dict as markdown sections for reference content.

    Args:
        extra: Mapping of field name to extracted text, as produced by
            :func:`extract_extra_fields`.

    Returns:
        Sections of the form ``### field_name\\n\\ntext`` separated by blank
        lines, or the empty string when *extra* is empty.

    Examples:
        >>> format_extra_fields_for_content({})
        ''
        >>> format_extra_fields_for_content({"foo": "bar content"})
        '### foo\\n\\nbar content'
        >>> format_extra_fields_for_content({"a": "alpha", "b": "beta"})
        '### a\\n\\nalpha\\n\\n### b\\n\\nbeta'
    """
    # Joining an empty list yields "", so no special case is needed.
    sections = [f"### {name}\n\n{text}" for name, text in extra.items()]
    return "\n\n".join(sections)

0 commit comments

Comments
 (0)