Skip to content

Commit 1d00d17

Browse files
committed
feat: add DOI DataCite fallback, supplementary files, and URL normalization
1 parent 1ca0c8b commit 1d00d17

26 files changed

+2074
-75
lines changed

.claude/settings.json

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,2 @@
11
{
2-
"permissions": {
3-
"allow": [
4-
"Bash(*)",
5-
"Edit",
6-
"MultiEdit",
7-
"NotebookEdit",
8-
"FileEdit",
9-
"WebFetch",
10-
"WebSearch",
11-
"Write"
12-
]
13-
}
142
}

.devcontainer/devcontainer.json

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
{
2+
"name": "odkfull-codespace",
3+
// consider also: Codespaces Prebuilds
4+
"image": "obolibrary/odkfull:latest",
5+
6+
// Mount your repo where ODK expects it
7+
"workspaceMount": "source=${localWorkspaceFolder},target=/work,type=bind,consistency=cached",
8+
"workspaceFolder": "/work",
9+
10+
// Keep VS Code in control of the container command (safer than using image ENTRYPOINT)
11+
"overrideCommand": true,
12+
13+
// Be root so you can add small tools if you need them
14+
"remoteUser": "root",
15+
//"features": {
16+
// "ghcr.io/devcontainers/features/node:1": { "version": "lts" }
17+
//},
18+
"extensions": [
19+
"anthropic.claude-code",
20+
"ms-python.python",
21+
"ms-azuretools.vscode-docker"
22+
],
23+
"remoteEnv": {
24+
"TEST_ENV": "123"
25+
},
26+
"postCreateCommand": "bash .devcontainer/post_create.sh"
27+
}

.devcontainer/post_create.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#!/usr/bin/env bash
2+
set -e
3+
odk --help || true
4+
#curl -LsSf https://astral.sh/uv/install.sh | sh
5+
#uv pip install oaklib
6+
npm install -g @anthropic-ai/claude-code

.goosehints

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
AGENTS.md

AGENTS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
CLAUDE.md

BUG_GEO_ACCESSION_TO_UID.md

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
# Bug: GEOSource fails to fetch GEO datasets
2+
3+
## Summary
4+
5+
The `GEOSource` class in `linkml_reference_validator/etl/sources/entrez.py` cannot fetch GEO dataset metadata: it passes GEO accessions (e.g. `GSE67472`) directly to `esummary`, but the Entrez `gds` database only accepts numeric UIDs.
6+
7+
## Error Observed
8+
9+
```
10+
WARNING:linkml_reference_validator.etl.sources.entrez:Failed to fetch Entrez summary for GEO:GSE67472: Invalid uid GSE67472 at position= 0
11+
```
12+
13+
## Root Cause
14+
15+
The `EntrezSummarySource.fetch()` method calls:
16+
17+
```python
18+
handle = Entrez.esummary(db=self.ENTREZ_DB, id=identifier)
19+
```
20+
21+
For GEO, this becomes `esummary(db='gds', id='GSE67472')`, but the `gds` database does not accept accession numbers as IDs; it requires a numeric UID such as `200067472`.
22+
23+
## Proof of Concept
24+
25+
```python
26+
from Bio import Entrez
27+
Entrez.email = 'test@example.com'
28+
29+
# This FAILS - accession not accepted as UID
30+
handle = Entrez.esummary(db='gds', id='GSE67472')
31+
# Error: Invalid uid GSE67472 at position= 0
32+
33+
# This WORKS - use esearch first to get UID
34+
handle = Entrez.esearch(db='gds', term='GSE67472[Accession]')
35+
result = Entrez.read(handle)
36+
handle.close()
37+
# result['IdList'] = ['200067472', ...]
38+
39+
uid = result['IdList'][0] # '200067472'
40+
41+
handle = Entrez.esummary(db='gds', id=uid)
42+
summary = Entrez.read(handle)
43+
handle.close()
44+
print(summary[0].get('title'))
45+
# Output: "Airway epithelial gene expression in asthma versus healthy controls"
46+
```
47+
48+
## Proposed Fix
49+
50+
Override `fetch()` in `GEOSource` to add an `esearch` step that converts the accession to its numeric UID before calling `esummary`:
51+
52+
```python
53+
@ReferenceSourceRegistry.register
54+
class GEOSource(EntrezSummarySource):
55+
"""Fetch GEO series and dataset summaries from Entrez."""
56+
57+
PREFIX = "GEO"
58+
ENTREZ_DB = "gds"
59+
TITLE_FIELDS = ("title", "description", "summary")
60+
CONTENT_FIELDS = ("summary", "description", "title")
61+
ID_PATTERNS = (r"^GSE\d+$", r"^GDS\d+$")
62+
63+
def fetch(
64+
self, identifier: str, config: ReferenceValidationConfig
65+
) -> Optional[ReferenceContent]:
66+
"""Fetch GEO dataset metadata, converting accession to UID first."""
67+
Entrez.email = config.email
68+
time.sleep(config.rate_limit_delay)
69+
70+
# Convert accession to UID via esearch
71+
uid = self._accession_to_uid(identifier)
72+
if not uid:
73+
logger.warning(f"Could not find GDS UID for {identifier}")
74+
return None
75+
76+
# Now fetch summary with numeric UID
77+
handle = None
78+
try:
79+
handle = Entrez.esummary(db=self.ENTREZ_DB, id=uid)
80+
records = Entrez.read(handle)
81+
except Exception as exc:
82+
logger.warning(f"Failed to fetch Entrez summary for {self.prefix()}:{identifier}: {exc}")
83+
return None
84+
finally:
85+
if handle is not None:
86+
handle.close()
87+
88+
record = self._extract_record(records)
89+
if not record:
90+
logger.warning(f"No Entrez summary found for {self.prefix()}:{identifier}")
91+
return None
92+
93+
title = self._get_first_field_value(record, self.TITLE_FIELDS)
94+
content = self._get_first_field_value(record, self.CONTENT_FIELDS)
95+
content_type = "summary" if content else "unavailable"
96+
97+
return ReferenceContent(
98+
reference_id=f"{self.prefix()}:{identifier}",
99+
title=title,
100+
content=content,
101+
content_type=content_type,
102+
metadata={"entrez_db": self.ENTREZ_DB, "entrez_uid": uid},
103+
)
104+
105+
def _accession_to_uid(self, accession: str) -> Optional[str]:
106+
"""Convert a GEO accession (GSE/GDS) to its Entrez UID."""
107+
handle = None
108+
try:
109+
handle = Entrez.esearch(db=self.ENTREZ_DB, term=f"{accession}[Accession]")
110+
result = Entrez.read(handle)
111+
if result.get("IdList"):
112+
return result["IdList"][0]
113+
except Exception as exc:
114+
logger.warning(f"esearch failed for {accession}: {exc}")
115+
finally:
116+
if handle is not None:
117+
handle.close()
118+
return None
119+
```
120+
121+
## Testing
122+
123+
After the fix, validation should catch title mismatches such as the following, where the title has a deliberate `xxx` prefix:
124+
125+
```yaml
126+
# In kb/disorders/Asthma.yaml
127+
datasets:
128+
- accession: geo:GSE67472
129+
title: xxxAirway epithelial gene expression in asthma versus healthy controls # Wrong!
130+
```
131+
132+
Expected validation error:
133+
```
134+
[ERROR] Title mismatch for geo:GSE67472
135+
Expected: "Airway epithelial gene expression in asthma versus healthy controls"
136+
Found: "xxxAirway epithelial gene expression in asthma versus healthy controls"
137+
```

0 commit comments

Comments
 (0)