Skip to content

Commit 57d9077

Browse files
authored
Merge pull request #38 from linkml/37-runtimeerror-couldnt-resolve-exlinksrv2-the-address-table-is-empty
Fix for uncaught RuntimeError when retrieving some PMID entries without PMCID
2 parents ac6a65d + f21c833 commit 57d9077

File tree

4 files changed

+99 additions, -34 deletions (lines changed)

src/linkml_reference_validator/etl/sources/pmid.py

Lines changed: 43 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@
1313
import logging
1414
import re
1515
import time
16-
from typing import Optional
16+
from typing import Any, Optional
1717

1818
from Bio import Entrez # type: ignore
1919
from bs4 import BeautifulSoup # type: ignore
@@ -118,13 +118,20 @@ def fetch(
118118

119119
record = records[0] if isinstance(records, list) else records
120120

121+
if not isinstance(record, dict):
122+
logger.warning(
123+
"Unexpected record format for PMID:%s: %s", pmid, type(record))
124+
return None
125+
126+
record_dict: dict[str, Any] = record
127+
121128
# Convert Entrez StringElement objects to plain strings
122-
title = str(record.get("Title", ""))
123-
authors = self._parse_authors(record.get("AuthorList", []))
124-
journal = str(record.get("Source", ""))
125-
pub_date = record.get("PubDate", "")
129+
title = str(record_dict.get("Title", ""))
130+
authors = self._parse_authors(record_dict.get("AuthorList", []))
131+
journal = str(record_dict.get("Source", ""))
132+
pub_date = record_dict.get("PubDate", "")
126133
year = str(pub_date)[:4] if pub_date else ""
127-
doi = str(record.get("DOI", "")) if record.get("DOI") else ""
134+
doi = str(record_dict.get("DOI", "")) if record_dict.get("DOI") else ""
128135

129136
abstract = self._fetch_abstract(pmid, config)
130137
full_text, content_type = self._fetch_pmc_fulltext(pmid, config)
@@ -178,7 +185,8 @@ def _fetch_abstract(
178185
"""
179186
time.sleep(config.rate_limit_delay)
180187

181-
handle = Entrez.efetch(db="pubmed", id=pmid, rettype="abstract", retmode="text")
188+
handle = Entrez.efetch(db="pubmed", id=pmid,
189+
rettype="abstract", retmode="text")
182190
abstract_text = handle.read()
183191
handle.close()
184192

@@ -206,7 +214,8 @@ def _fetch_mesh_terms(
206214
"""
207215
time.sleep(config.rate_limit_delay)
208216

209-
handle = Entrez.efetch(db="pubmed", id=pmid, rettype="xml", retmode="xml")
217+
handle = Entrez.efetch(db="pubmed", id=pmid,
218+
rettype="xml", retmode="xml")
210219
xml_content = handle.read()
211220
handle.close()
212221

@@ -271,14 +280,31 @@ def _get_pmcid(self, pmid: str, config: ReferenceValidationConfig) -> Optional[s
271280
"""
272281
time.sleep(config.rate_limit_delay)
273282

274-
handle = Entrez.elink(dbfrom="pubmed", db="pmc", id=pmid, linkname="pubmed_pmc")
275-
result = Entrez.read(handle)
276-
handle.close()
283+
try:
284+
handle = Entrez.elink(
285+
dbfrom="pubmed", db="pmc", id=pmid, linkname="pubmed_pmc"
286+
)
287+
except Exception as exc:
288+
logger.warning("Failed to link PMID:%s to PMC: %s", pmid, exc)
289+
return None
290+
291+
try:
292+
result = Entrez.read(handle)
293+
except Exception as exc:
294+
logger.warning(
295+
"Failed to read PMC link for PMID:%s: %s", pmid, exc)
296+
return None
297+
finally:
298+
handle.close()
277299

278-
if result and result[0].get("LinkSetDb"):
279-
links = result[0]["LinkSetDb"][0].get("Link", [])
280-
if links:
281-
return links[0]["Id"]
300+
if isinstance(result, list) and result and isinstance(result[0], dict):
301+
link_set_db = result[0].get("LinkSetDb", [])
302+
if isinstance(link_set_db, list) and link_set_db:
303+
links = link_set_db[0].get("Link", [])
304+
if isinstance(links, list) and links:
305+
first_link = links[0]
306+
if isinstance(first_link, dict) and "Id" in first_link:
307+
return str(first_link["Id"])
282308

283309
return None
284310

@@ -296,7 +322,8 @@ def _fetch_pmc_xml(
296322
"""
297323
time.sleep(config.rate_limit_delay)
298324

299-
handle = Entrez.efetch(db="pmc", id=pmcid, rettype="xml", retmode="xml")
325+
handle = Entrez.efetch(
326+
db="pmc", id=pmcid, rettype="xml", retmode="xml")
300327
xml_content = handle.read()
301328
handle.close()
302329

src/linkml_reference_validator/plugins/reference_validation_plugin.py

Lines changed: 20 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -94,6 +94,16 @@ def __init__(
9494
self.validator = SupportingTextValidator(config)
9595
self.schema_view: Optional[SchemaView] = None
9696

97+
@property
98+
def cache_dir(self) -> Path:
99+
"""Return the cache directory for this plugin."""
100+
return self.config.cache_dir
101+
102+
@cache_dir.setter
103+
def cache_dir(self, value: Path) -> None:
104+
"""Update the cache directory for this plugin."""
105+
self.config.cache_dir = value
106+
97107
def pre_process(self, context: ValidationContext) -> None:
98108
"""Pre-process hook called before validation.
99109
@@ -293,17 +303,16 @@ def _get_converter(self) -> Optional[Converter]:
293303
return None
294304
schema = self.schema_view.schema
295305
if schema and schema.prefixes:
296-
# schema.prefixes is a dict of prefix name -> Prefix object
297-
# The Prefix object has a prefix_reference attribute with the URI
298-
prefix_map = {
299-
name: (
300-
prefix.prefix_reference
301-
if hasattr(prefix, "prefix_reference")
302-
else str(prefix)
303-
)
304-
for name, prefix in schema.prefixes.items()
305-
}
306-
return Converter.from_prefix_map(prefix_map)
306+
# schema.prefixes is a list of Prefix objects
307+
# Each Prefix object has a prefix_name and prefix_reference attribute
308+
prefix_map: dict[str, str] = {}
309+
for prefix in schema.prefixes:
310+
prefix_name = getattr(prefix, "prefix_name", None)
311+
prefix_reference = getattr(prefix, "prefix_reference", None)
312+
if isinstance(prefix_name, str) and prefix_reference is not None:
313+
prefix_map[prefix_name] = str(prefix_reference)
314+
if prefix_map:
315+
return Converter.from_prefix_map(prefix_map)
307316
return None
308317

309318
# type: ignore

tests/test_plugin_integration.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -38,7 +38,8 @@ def test_plugin_initialization_with_params():
3838
plugin = ReferenceValidationPlugin(
3939
cache_dir="/tmp/cache",
4040
)
41-
assert plugin.config.cache_dir.as_posix() == "/tmp/cache"
41+
# Access the cache_dir attribute directly if config is not exposed
42+
assert plugin.cache_dir.as_posix() == "/tmp/cache"
4243

4344

4445
def test_extract_reference_id_string(plugin):

tests/test_sources.py

Lines changed: 34 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -130,6 +130,7 @@ def test_fetch_relative_path_with_base_dir(self, tmp_path):
130130
result = source.fetch("notes.md", config)
131131

132132
assert result is not None
133+
assert result.content is not None
133134
assert "Some notes here." in result.content
134135

135136
def test_fetch_relative_path_cwd_fallback(self, source, config, tmp_path, monkeypatch):
@@ -154,7 +155,8 @@ def test_fetch_nonexistent_file(self, source, config):
154155
def test_extract_title_from_markdown(self, source, config, tmp_path):
155156
"""Should extract title from first heading."""
156157
test_file = tmp_path / "titled.md"
157-
test_file.write_text("Some preamble\n\n# The Real Title\n\nContent here.")
158+
test_file.write_text(
159+
"Some preamble\n\n# The Real Title\n\nContent here.")
158160

159161
result = source.fetch(str(test_file), config)
160162

@@ -164,7 +166,8 @@ def test_extract_title_from_markdown(self, source, config, tmp_path):
164166
def test_html_content_preserved(self, source, config, tmp_path):
165167
"""HTML content should be preserved as-is."""
166168
test_file = tmp_path / "test.html"
167-
test_file.write_text("<html><body><p>Test &amp; content</p></body></html>")
169+
test_file.write_text(
170+
"<html><body><p>Test &amp; content</p></body></html>")
168171

169172
result = source.fetch(str(test_file), config)
170173

@@ -280,6 +283,27 @@ def test_can_handle_pmid(self, source):
280283
assert source.can_handle("PMID 12345678")
281284
assert not source.can_handle("DOI:10.1234/test")
282285

286+
@patch("linkml_reference_validator.etl.sources.pmid.Entrez.read")
287+
@patch("linkml_reference_validator.etl.sources.pmid.Entrez.elink")
288+
def test_get_pmcid_handles_entrez_error(
289+
self,
290+
mock_elink,
291+
mock_read,
292+
source,
293+
config,
294+
):
295+
"""Should return None when Entrez.read raises an error."""
296+
handle = MagicMock()
297+
mock_elink.return_value = handle
298+
mock_read.side_effect = RuntimeError(
299+
"Couldn't resolve #exLinkSrv2, the address table is empty."
300+
)
301+
302+
result = source._get_pmcid("12112053", config)
303+
304+
assert result is None
305+
handle.close.assert_called_once()
306+
283307

284308
class TestDOISource:
285309
"""Tests for DOISource (refactored from ReferenceFetcher)."""
@@ -470,7 +494,8 @@ def config(self, tmp_path):
470494
"Project_Description",
471495
"bioproject",
472496
),
473-
(BioSampleSource, "biosample:SAMN00000001", "Title", "Description", "biosample"),
497+
(BioSampleSource, "biosample:SAMN00000001",
498+
"Title", "Description", "biosample"),
474499
],
475500
)
476501
@patch("linkml_reference_validator.etl.sources.entrez.Entrez.read")
@@ -505,7 +530,8 @@ def test_fetch_entrez_summary(
505530
assert result.content == "Example content summary."
506531
assert result.content_type == "summary"
507532
assert result.metadata["entrez_db"] == db_name
508-
mock_esummary.assert_called_once_with(db=db_name, id=reference_id.split(":", 1)[1])
533+
mock_esummary.assert_called_once_with(
534+
db=db_name, id=reference_id.split(":", 1)[1])
509535
mock_handle.close.assert_called_once()
510536

511537
@pytest.mark.parametrize(
@@ -578,7 +604,8 @@ def test_fetch_geo_converts_accession_to_uid(
578604
# Configure mock_read to return different values for esearch vs esummary
579605
mock_read.side_effect = [
580606
{"IdList": ["200067472"]}, # esearch result
581-
[{"title": "GEO Dataset Title", "summary": "GEO dataset summary."}], # esummary result
607+
# esummary result
608+
[{"title": "GEO Dataset Title", "summary": "GEO dataset summary."}],
582609
]
583610

584611
result = source.fetch("GSE67472", config)
@@ -592,7 +619,8 @@ def test_fetch_geo_converts_accession_to_uid(
592619
assert result.metadata["entrez_uid"] == "200067472"
593620

594621
# Verify esearch was called with accession
595-
mock_esearch.assert_called_once_with(db="gds", term="GSE67472[Accession]")
622+
mock_esearch.assert_called_once_with(
623+
db="gds", term="GSE67472[Accession]")
596624
# Verify esummary was called with UID, not accession
597625
mock_esummary.assert_called_once_with(db="gds", id="200067472")
598626

0 commit comments

Comments
 (0)