Skip to content

Commit 38c1513

Browse files
authored
Merge pull request #42 from Reasat/feature/source-extra-fields
Feature/source extra fields
2 parents 57d9077 + 72cf904 commit 38c1513

File tree

12 files changed

+576
-26
lines changed

12 files changed

+576
-26
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,3 +141,4 @@ dmypy.json
141141
.idea
142142
# Local vscode editor config
143143
.vscode
144+
.cursor

README.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -871,6 +871,32 @@ linkml-reference-validator validate text "text" UNKNOWN:12345
871871
# ✗ Valid: False (WARNING) - Could not fetch reference
872872
```
873873

874+
#### `source_extra_fields` (per-source JSONPath map)
875+
876+
Capture additional fields from reference API responses and append them to cached content so they are included in validation. Keys are source prefixes (e.g. `clinicaltrials`, `PMID`, `DOI`, `GEO`); values map a field name to a JSONPath expression into the raw API response. Prefer paths to a single value (string/number). If the path selects a list, its elements are converted to strings and joined with spaces. If it selects an object or other type, its string representation is used.
877+
878+
**Example:**
879+
880+
Save as `my-config.yaml`:
881+
882+
```yaml
883+
validation:
884+
source_extra_fields:
885+
clinicaltrials:
886+
eligibility: "$.protocolSection.eligibilityModule.eligibilityCriteria"
887+
outcomes: "$.protocolSection.outcomesModule.primaryOutcomes"
888+
```
889+
890+
Pass this config when fetching so the cache includes these sections: use `--config my-config.yaml` with `cache reference` or `validate`. Captured field names are stored in `extra_fields_captured` in the cache frontmatter.
891+
892+
```bash
893+
# Fetch and cache a trial with extra fields (eligibility, outcomes)
894+
linkml-reference-validator cache reference clinicaltrials:NCT00000001 --config my-config.yaml
895+
896+
# Validate text against the cached content (including extra sections)
897+
linkml-reference-validator validate text "Inclusion: age >= 18" clinicaltrials:NCT00001372 --config my-config.yaml
898+
```
899+
874900
### Cache Directory
875901

876902
Default: `references_cache/` in current directory

src/linkml_reference_validator/etl/reference_fetcher.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,19 @@ def _save_to_disk(self, reference: ReferenceContent) -> None:
302302
for keyword in reference.keywords:
303303
lines.append(f"- {self._quote_yaml_value(keyword)}")
304304
lines.append(f"content_type: {reference.content_type}")
305+
if reference.metadata and "extra_fields_captured" in reference.metadata:
306+
extra_fields = reference.metadata.get("extra_fields_captured")
307+
if isinstance(extra_fields, list):
308+
lines.append("extra_fields_captured:")
309+
for field_name in extra_fields:
310+
if isinstance(field_name, str):
311+
lines.append(f"- {self._quote_yaml_value(field_name)}")
312+
else:
313+
logger.warning(
314+
"Skipping non-string item in extra_fields_captured: %r (type %s)",
315+
field_name,
316+
type(field_name).__name__,
317+
)
305318
if reference.supplementary_files:
306319
lines.append("supplementary_files:")
307320
for sf in reference.supplementary_files:
@@ -415,6 +428,10 @@ def _load_markdown_format(
415428
frontmatter.get("supplementary_files")
416429
)
417430

431+
metadata: dict = {}
432+
if "extra_fields_captured" in frontmatter:
433+
metadata["extra_fields_captured"] = frontmatter["extra_fields_captured"]
434+
418435
return ReferenceContent(
419436
reference_id=frontmatter.get("reference_id", reference_id),
420437
title=frontmatter.get("title"),
@@ -426,6 +443,7 @@ def _load_markdown_format(
426443
doi=frontmatter.get("doi"),
427444
keywords=keywords,
428445
supplementary_files=supplementary_files,
446+
metadata=metadata,
429447
)
430448

431449
def _extract_content_from_markdown(self, body: str) -> str:

src/linkml_reference_validator/etl/sources/clinicaltrials.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@
2424

2525
from linkml_reference_validator.models import ReferenceContent, ReferenceValidationConfig
2626
from linkml_reference_validator.etl.sources.base import ReferenceSource, ReferenceSourceRegistry
27+
from linkml_reference_validator.etl.sources.utils import (
28+
extract_extra_fields,
29+
format_extra_fields_for_content,
30+
)
2731

2832
logger = logging.getLogger(__name__)
2933

@@ -133,9 +137,11 @@ def fetch(
133137
logger.warning(f"Failed to parse JSON response for {nct_id}: {exc}")
134138
return None
135139

136-
return self._parse_response(nct_id, data)
140+
return self._parse_response(nct_id, data, config)
137141

138-
def _parse_response(self, nct_id: str, data: dict) -> Optional[ReferenceContent]:
142+
def _parse_response(
143+
self, nct_id: str, data: dict, config: ReferenceValidationConfig
144+
) -> Optional[ReferenceContent]:
139145
"""Parse the ClinicalTrials.gov API response into ReferenceContent.
140146
141147
Args:
@@ -169,6 +175,13 @@ def _parse_response(self, nct_id: str, data: dict) -> Optional[ReferenceContent]
169175
if sponsor_name:
170176
metadata["sponsor"] = sponsor_name
171177

178+
extra = extract_extra_fields(
179+
data, config.source_extra_fields.get("clinicaltrials", {})
180+
)
181+
if extra:
182+
content = (content or "") + "\n\n" + format_extra_fields_for_content(extra)
183+
metadata["extra_fields_captured"] = list(extra.keys())
184+
172185
content_type = "summary" if content else "unavailable"
173186

174187
return ReferenceContent(

src/linkml_reference_validator/etl/sources/doi.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@
2424
SupplementaryFile,
2525
)
2626
from linkml_reference_validator.etl.sources.base import ReferenceSource, ReferenceSourceRegistry
27+
from linkml_reference_validator.etl.sources.utils import (
28+
extract_extra_fields,
29+
format_extra_fields_for_content,
30+
)
2731

2832
logger = logging.getLogger(__name__)
2933

@@ -128,6 +132,14 @@ def _fetch_from_crossref(
128132
# Extract keywords/subjects from Crossref
129133
keywords = self._parse_crossref_subjects(message.get("subject", []))
130134

135+
metadata: dict = {}
136+
extra = extract_extra_fields(
137+
message, config.source_extra_fields.get("DOI", {})
138+
)
139+
if extra:
140+
abstract = (abstract + "\n\n" + format_extra_fields_for_content(extra)) if abstract else format_extra_fields_for_content(extra)
141+
metadata["extra_fields_captured"] = list(extra.keys())
142+
131143
return ReferenceContent(
132144
reference_id=f"DOI:{doi}",
133145
title=title,
@@ -138,6 +150,7 @@ def _fetch_from_crossref(
138150
year=year,
139151
doi=doi,
140152
keywords=keywords,
153+
metadata=metadata,
141154
)
142155

143156
def _fetch_from_datacite(
@@ -195,6 +208,14 @@ def _fetch_from_datacite(
195208
# Fetch supplementary files from repository-specific APIs
196209
supplementary_files = self._fetch_repository_files(doi, config)
197210

211+
metadata: dict = {}
212+
extra = extract_extra_fields(
213+
attributes, config.source_extra_fields.get("DOI", {})
214+
)
215+
if extra:
216+
abstract = (abstract + "\n\n" + format_extra_fields_for_content(extra)) if abstract else format_extra_fields_for_content(extra)
217+
metadata["extra_fields_captured"] = list(extra.keys())
218+
198219
return ReferenceContent(
199220
reference_id=f"DOI:{doi}",
200221
title=title,
@@ -206,6 +227,7 @@ def _fetch_from_datacite(
206227
doi=doi,
207228
keywords=keywords,
208229
supplementary_files=supplementary_files,
230+
metadata=metadata,
209231
)
210232

211233
def _detect_repository(self, doi: str) -> Optional[str]:

src/linkml_reference_validator/etl/sources/entrez.py

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@
1919

2020
from linkml_reference_validator.models import ReferenceContent, ReferenceValidationConfig
2121
from linkml_reference_validator.etl.sources.base import ReferenceSource, ReferenceSourceRegistry
22+
from linkml_reference_validator.etl.sources.utils import (
23+
extract_extra_fields,
24+
format_extra_fields_for_content,
25+
)
2226

2327
logger = logging.getLogger(__name__)
2428

@@ -124,14 +128,23 @@ def fetch(
124128

125129
title = self._get_first_field_value(record, self.TITLE_FIELDS)
126130
content = self._get_first_field_value(record, self.CONTENT_FIELDS)
127-
content_type = "summary" if content else "unavailable"
131+
132+
metadata: dict[str, Any] = {"entrez_db": self.ENTREZ_DB}
133+
extra = extract_extra_fields(
134+
record, config.source_extra_fields.get(self.prefix(), {})
135+
)
136+
if extra:
137+
content = (content or "") + "\n\n" + format_extra_fields_for_content(extra)
138+
metadata["extra_fields_captured"] = list(extra.keys())
139+
140+
content_type = "summary" if (content or "").strip() else "unavailable"
128141

129142
return ReferenceContent(
130143
reference_id=f"{self.prefix()}:{identifier}",
131144
title=title,
132145
content=content,
133146
content_type=content_type,
134-
metadata={"entrez_db": self.ENTREZ_DB},
147+
metadata=metadata,
135148
)
136149

137150
def _extract_record(self, records: Any) -> Optional[dict[str, Any]]:
@@ -247,14 +260,26 @@ def fetch(
247260

248261
title = self._get_first_field_value(record, self.TITLE_FIELDS)
249262
content = self._get_first_field_value(record, self.CONTENT_FIELDS)
250-
content_type = "summary" if content else "unavailable"
263+
264+
metadata: dict[str, Any] = {
265+
"entrez_db": self.ENTREZ_DB,
266+
"entrez_uid": uid,
267+
}
268+
extra = extract_extra_fields(
269+
record, config.source_extra_fields.get(self.prefix(), {})
270+
)
271+
if extra:
272+
content = (content or "") + "\n\n" + format_extra_fields_for_content(extra)
273+
metadata["extra_fields_captured"] = list(extra.keys())
274+
275+
content_type = "summary" if (content or "").strip() else "unavailable"
251276

252277
return ReferenceContent(
253278
reference_id=f"{self.prefix()}:{identifier}",
254279
title=title,
255280
content=content,
256281
content_type=content_type,
257-
metadata={"entrez_db": self.ENTREZ_DB, "entrez_uid": uid},
282+
metadata=metadata,
258283
)
259284

260285
def _accession_to_uid(

src/linkml_reference_validator/etl/sources/pmid.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@
2121

2222
from linkml_reference_validator.models import ReferenceContent, ReferenceValidationConfig
2323
from linkml_reference_validator.etl.sources.base import ReferenceSource, ReferenceSourceRegistry
24+
from linkml_reference_validator.etl.sources.utils import (
25+
extract_extra_fields,
26+
format_extra_fields_for_content,
27+
)
2428

2529
logger = logging.getLogger(__name__)
2630

@@ -143,6 +147,17 @@ def fetch(
143147
content = abstract
144148
content_type = "abstract_only" if abstract else "unavailable"
145149

150+
metadata: dict = {}
151+
extra = extract_extra_fields(
152+
record_dict, config.source_extra_fields.get("PMID", {})
153+
)
154+
if extra:
155+
content = (content or "") + "\n\n" + format_extra_fields_for_content(extra)
156+
metadata["extra_fields_captured"] = list(extra.keys())
157+
158+
if (content or "").strip() and content_type == "unavailable":
159+
content_type = "summary"
160+
146161
return ReferenceContent(
147162
reference_id=f"PMID:{pmid}",
148163
title=title,
@@ -153,6 +168,7 @@ def fetch(
153168
year=year,
154169
doi=doi,
155170
keywords=keywords,
171+
metadata=metadata,
156172
)
157173

158174
def _parse_authors(self, author_list: list) -> list[str]:
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
"""Shared utilities for reference source ETL modules.
2+
3+
Provides helpers used across multiple built-in sources, such as extracting
4+
user-configured extra fields from raw API responses.
5+
"""
6+
7+
import logging
8+
9+
from jsonpath_ng import parse as jsonpath_parse
10+
from jsonpath_ng.exceptions import JsonPathParserError
11+
12+
logger = logging.getLogger(__name__)
13+
14+
15+
def extract_extra_fields(data: dict, field_map: dict[str, str]) -> dict[str, str]:
    """Extract extra fields from a raw API response using JSONPath expressions.

    For each entry in *field_map*, the corresponding JSONPath expression is
    evaluated against *data*. Fields that produce no match, an empty value,
    or an invalid JSONPath expression are omitted from the result. When an
    expression matches multiple nodes (e.g. ``$.items[*].name``), all match
    values are joined with spaces instead of silently keeping only the first.

    Args:
        data: Raw API response dictionary to extract from.
        field_map: Mapping of ``field_name`` → ``JSONPath expression``.
            Example: ``{"eligibility": "$.protocolSection.eligibilityModule.eligibilityCriteria"}``.

    Returns:
        A dict mapping ``field_name`` → extracted text string for each field
        that had a non-empty value. Use :func:`format_extra_fields_for_content`
        to turn this into text to append to reference content, and
        ``list(result.keys())`` for ``extra_fields_captured`` metadata.

    Examples:
        >>> extract_extra_fields({}, {})
        {}
        >>> extract_extra_fields({"title": "My Paper"}, {})
        {}
        >>> extract_extra_fields({}, {"eligibility": "$.eligibility"})
        {}
        >>> extract_extra_fields({"foo": "bar"}, {"foo": "$.foo"})
        {'foo': 'bar'}
        >>> result = extract_extra_fields(
        ...     {"a": "alpha", "b": "beta"},
        ...     {"a": "$.a", "b": "$.b"},
        ... )
        >>> result == {"a": "alpha", "b": "beta"}
        True
        >>> extract_extra_fields({"other": "x"}, {"missing": "$.missing"})
        {}
        >>> extract_extra_fields({"foo": "bar"}, {"bad": "not a valid $[[[jsonpath"})
        {}
        >>> extract_extra_fields(
        ...     {"items": [{"n": "x"}, {"n": "y"}]}, {"names": "$.items[*].n"}
        ... )
        {'names': 'x y'}
    """
    if not field_map or not data:
        return {}

    result: dict[str, str] = {}

    for field_name, jsonpath_expr in field_map.items():
        try:
            parsed = jsonpath_parse(jsonpath_expr)
        except JsonPathParserError as exc:
            logger.warning(
                "Invalid JSONPath expression '%s' for field '%s': %s",
                jsonpath_expr,
                field_name,
                exc,
            )
            continue

        # Collect every matched value. (Using only matches[0] would drop
        # all but the first node for multi-match expressions like $.a[*].b.)
        pieces: list[str] = []
        for match in parsed.find(data):
            raw_value = match.value
            if raw_value is None:
                continue
            if isinstance(raw_value, list):
                # List values: stringify elements and keep the non-blank ones.
                pieces.extend(str(item) for item in raw_value if str(item).strip())
            else:
                # Scalars/objects: use their string representation as-is.
                pieces.append(str(raw_value))

        text = " ".join(piece for piece in pieces if piece.strip())
        if not text.strip():
            continue

        result[field_name] = text

    return result
84+
85+
86+
def format_extra_fields_for_content(extra: dict[str, str]) -> str:
    """Render an extra-fields dict as markdown sections for reference content.

    Args:
        extra: Mapping of field name to extracted text, as produced by
            :func:`extract_extra_fields`.

    Returns:
        Sections of the form ``### field_name\\n\\ntext`` separated by blank
        lines, or the empty string when *extra* is empty.

    Examples:
        >>> format_extra_fields_for_content({})
        ''
        >>> format_extra_fields_for_content({"foo": "bar content"})
        '### foo\\n\\nbar content'
        >>> format_extra_fields_for_content({"a": "alpha", "b": "beta"})
        '### a\\n\\nalpha\\n\\n### b\\n\\nbeta'
    """
    # Joining an empty list yields "", so no special case is needed.
    sections = [f"### {name}\n\n{text}" for name, text in extra.items()]
    return "\n\n".join(sections)

0 commit comments

Comments
 (0)