Merge pull request #38 from linkml/37-runtimeerror-couldnt-resolve-exlinksrv2-the-address-table-is-empty

caufieldjh · web-flow · commit 57d907745f49 · 2026-02-06T17:34:36.000-05:00
Fix for uncaught RuntimeError when retrieving some PMID entries without PMCID
diff --git a/src/linkml_reference_validator/etl/sources/pmid.py b/src/linkml_reference_validator/etl/sources/pmid.py
@@ -13,7 +13,7 @@
 import logging
 import re
 import time
-from typing import Optional
+from typing import Any, Optional
 
 from Bio import Entrez  # type: ignore
 from bs4 import BeautifulSoup  # type: ignore
@@ -118,13 +118,20 @@ def fetch(
 
         record = records[0] if isinstance(records, list) else records
 
+        if not isinstance(record, dict):
+            logger.warning(
+                "Unexpected record format for PMID:%s: %s", pmid, type(record))
+            return None
+
+        record_dict: dict[str, Any] = record
+
         # Convert Entrez StringElement objects to plain strings
-        title = str(record.get("Title", ""))
-        authors = self._parse_authors(record.get("AuthorList", []))
-        journal = str(record.get("Source", ""))
-        pub_date = record.get("PubDate", "")
+        title = str(record_dict.get("Title", ""))
+        authors = self._parse_authors(record_dict.get("AuthorList", []))
+        journal = str(record_dict.get("Source", ""))
+        pub_date = record_dict.get("PubDate", "")
         year = str(pub_date)[:4] if pub_date else ""
-        doi = str(record.get("DOI", "")) if record.get("DOI") else ""
+        doi = str(record_dict.get("DOI", "")) if record_dict.get("DOI") else ""
 
         abstract = self._fetch_abstract(pmid, config)
         full_text, content_type = self._fetch_pmc_fulltext(pmid, config)
@@ -178,7 +185,8 @@ def _fetch_abstract(
         """
         time.sleep(config.rate_limit_delay)
 
-        handle = Entrez.efetch(db="pubmed", id=pmid, rettype="abstract", retmode="text")
+        handle = Entrez.efetch(db="pubmed", id=pmid,
+                               rettype="abstract", retmode="text")
         abstract_text = handle.read()
         handle.close()
 
@@ -206,7 +214,8 @@ def _fetch_mesh_terms(
         """
         time.sleep(config.rate_limit_delay)
 
-        handle = Entrez.efetch(db="pubmed", id=pmid, rettype="xml", retmode="xml")
+        handle = Entrez.efetch(db="pubmed", id=pmid,
+                               rettype="xml", retmode="xml")
         xml_content = handle.read()
         handle.close()
 
@@ -271,14 +280,31 @@ def _get_pmcid(self, pmid: str, config: ReferenceValidationConfig) -> Optional[s
         """
         time.sleep(config.rate_limit_delay)
 
-        handle = Entrez.elink(dbfrom="pubmed", db="pmc", id=pmid, linkname="pubmed_pmc")
-        result = Entrez.read(handle)
-        handle.close()
+        try:
+            handle = Entrez.elink(
+                dbfrom="pubmed", db="pmc", id=pmid, linkname="pubmed_pmc"
+            )
+        except Exception as exc:
+            logger.warning("Failed to link PMID:%s to PMC: %s", pmid, exc)
+            return None
+
+        try:
+            result = Entrez.read(handle)
+        except Exception as exc:
+            logger.warning(
+                "Failed to read PMC link for PMID:%s: %s", pmid, exc)
+            return None
+        finally:
+            handle.close()
 
-        if result and result[0].get("LinkSetDb"):
-            links = result[0]["LinkSetDb"][0].get("Link", [])
-            if links:
-                return links[0]["Id"]
+        if isinstance(result, list) and result and isinstance(result[0], dict):
+            link_set_db = result[0].get("LinkSetDb", [])
+            if isinstance(link_set_db, list) and link_set_db:
+                links = link_set_db[0].get("Link", [])
+                if isinstance(links, list) and links:
+                    first_link = links[0]
+                    if isinstance(first_link, dict) and "Id" in first_link:
+                        return str(first_link["Id"])
 
         return None
 
@@ -296,7 +322,8 @@ def _fetch_pmc_xml(
         """
         time.sleep(config.rate_limit_delay)
 
-        handle = Entrez.efetch(db="pmc", id=pmcid, rettype="xml", retmode="xml")
+        handle = Entrez.efetch(
+            db="pmc", id=pmcid, rettype="xml", retmode="xml")
         xml_content = handle.read()
         handle.close()
 
diff --git a/src/linkml_reference_validator/plugins/reference_validation_plugin.py b/src/linkml_reference_validator/plugins/reference_validation_plugin.py
@@ -94,6 +94,16 @@ def __init__(
             self.validator = SupportingTextValidator(config)
             self.schema_view: Optional[SchemaView] = None
 
+        @property
+        def cache_dir(self) -> Path:
+            """Return the cache directory for this plugin."""
+            return self.config.cache_dir
+
+        @cache_dir.setter
+        def cache_dir(self, value: Path) -> None:
+            """Update the cache directory for this plugin."""
+            self.config.cache_dir = value
+
         def pre_process(self, context: ValidationContext) -> None:
             """Pre-process hook called before validation.
 
@@ -293,17 +303,16 @@ def _get_converter(self) -> Optional[Converter]:
                 return None
             schema = self.schema_view.schema
             if schema and schema.prefixes:
-                # schema.prefixes is a dict of prefix name -> Prefix object
-                # The Prefix object has a prefix_reference attribute with the URI
-                prefix_map = {
-                    name: (
-                        prefix.prefix_reference
-                        if hasattr(prefix, "prefix_reference")
-                        else str(prefix)
-                    )
-                    for name, prefix in schema.prefixes.items()
-                }
-                return Converter.from_prefix_map(prefix_map)
+                # schema.prefixes is a list of Prefix objects
+                # Each Prefix object has a prefix_name and prefix_reference attribute
+                prefix_map: dict[str, str] = {}
+                for prefix in schema.prefixes:
+                    prefix_name = getattr(prefix, "prefix_name", None)
+                    prefix_reference = getattr(prefix, "prefix_reference", None)
+                    if isinstance(prefix_name, str) and prefix_reference is not None:
+                        prefix_map[prefix_name] = str(prefix_reference)
+                if prefix_map:
+                    return Converter.from_prefix_map(prefix_map)
             return None
 
         # type: ignore
diff --git a/tests/test_plugin_integration.py b/tests/test_plugin_integration.py
@@ -38,7 +38,8 @@ def test_plugin_initialization_with_params():
     plugin = ReferenceValidationPlugin(
         cache_dir="/tmp/cache",
     )
-    assert plugin.config.cache_dir.as_posix() == "/tmp/cache"
+    # Access the cache_dir attribute directly if config is not exposed
+    assert plugin.cache_dir.as_posix() == "/tmp/cache"
 
 
 def test_extract_reference_id_string(plugin):
diff --git a/tests/test_sources.py b/tests/test_sources.py
@@ -130,6 +130,7 @@ def test_fetch_relative_path_with_base_dir(self, tmp_path):
         result = source.fetch("notes.md", config)
 
         assert result is not None
+        assert result.content is not None
         assert "Some notes here." in result.content
 
     def test_fetch_relative_path_cwd_fallback(self, source, config, tmp_path, monkeypatch):
@@ -154,7 +155,8 @@ def test_fetch_nonexistent_file(self, source, config):
     def test_extract_title_from_markdown(self, source, config, tmp_path):
         """Should extract title from first heading."""
         test_file = tmp_path / "titled.md"
-        test_file.write_text("Some preamble\n\n# The Real Title\n\nContent here.")
+        test_file.write_text(
+            "Some preamble\n\n# The Real Title\n\nContent here.")
 
         result = source.fetch(str(test_file), config)
 
@@ -164,7 +166,8 @@ def test_extract_title_from_markdown(self, source, config, tmp_path):
     def test_html_content_preserved(self, source, config, tmp_path):
         """HTML content should be preserved as-is."""
         test_file = tmp_path / "test.html"
-        test_file.write_text("<html><body><p>Test &amp; content</p></body></html>")
+        test_file.write_text(
+            "<html><body><p>Test &amp; content</p></body></html>")
 
         result = source.fetch(str(test_file), config)
 
@@ -280,6 +283,27 @@ def test_can_handle_pmid(self, source):
         assert source.can_handle("PMID 12345678")
         assert not source.can_handle("DOI:10.1234/test")
 
+    @patch("linkml_reference_validator.etl.sources.pmid.Entrez.read")
+    @patch("linkml_reference_validator.etl.sources.pmid.Entrez.elink")
+    def test_get_pmcid_handles_entrez_error(
+        self,
+        mock_elink,
+        mock_read,
+        source,
+        config,
+    ):
+        """Should return None when Entrez.read raises an error."""
+        handle = MagicMock()
+        mock_elink.return_value = handle
+        mock_read.side_effect = RuntimeError(
+            "Couldn't resolve #exLinkSrv2, the address table is empty."
+        )
+
+        result = source._get_pmcid("12112053", config)
+
+        assert result is None
+        handle.close.assert_called_once()
+
 
 class TestDOISource:
     """Tests for DOISource (refactored from ReferenceFetcher)."""
@@ -470,7 +494,8 @@ def config(self, tmp_path):
                 "Project_Description",
                 "bioproject",
             ),
-            (BioSampleSource, "biosample:SAMN00000001", "Title", "Description", "biosample"),
+            (BioSampleSource, "biosample:SAMN00000001",
+             "Title", "Description", "biosample"),
         ],
     )
     @patch("linkml_reference_validator.etl.sources.entrez.Entrez.read")
@@ -505,7 +530,8 @@ def test_fetch_entrez_summary(
         assert result.content == "Example content summary."
         assert result.content_type == "summary"
         assert result.metadata["entrez_db"] == db_name
-        mock_esummary.assert_called_once_with(db=db_name, id=reference_id.split(":", 1)[1])
+        mock_esummary.assert_called_once_with(
+            db=db_name, id=reference_id.split(":", 1)[1])
         mock_handle.close.assert_called_once()
 
     @pytest.mark.parametrize(
@@ -578,7 +604,8 @@ def test_fetch_geo_converts_accession_to_uid(
         # Configure mock_read to return different values for esearch vs esummary
         mock_read.side_effect = [
             {"IdList": ["200067472"]},  # esearch result
-            [{"title": "GEO Dataset Title", "summary": "GEO dataset summary."}],  # esummary result
+            # esummary result
+            [{"title": "GEO Dataset Title", "summary": "GEO dataset summary."}],
         ]
 
         result = source.fetch("GSE67472", config)
@@ -592,7 +619,8 @@ def test_fetch_geo_converts_accession_to_uid(
         assert result.metadata["entrez_uid"] == "200067472"
 
         # Verify esearch was called with accession
-        mock_esearch.assert_called_once_with(db="gds", term="GSE67472[Accession]")
+        mock_esearch.assert_called_once_with(
+            db="gds", term="GSE67472[Accession]")
         # Verify esummary was called with UID, not accession
         mock_esummary.assert_called_once_with(db="gds", id="200067472")
 

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -38,7 +38,8 @@ def test_plugin_initialization_with_params():` |
| `38` | `38` | `plugin = ReferenceValidationPlugin(` |
| `39` | `39` | `cache_dir="/tmp/cache",` |
| `40` | `40` | `)` |
| `41` | | `- assert plugin.config.cache_dir.as_posix() == "/tmp/cache"` |
| | `41` | `+ # Access the cache_dir attribute directly if config is not exposed` |
| | `42` | `+ assert plugin.cache_dir.as_posix() == "/tmp/cache"` |
| `42` | `43` | |
| `43` | `44` | |
| `44` | `45` | `def test_extract_reference_id_string(plugin):` |