|
| 1 | +# Bug: GEOSource fails to fetch GEO datasets |
| 2 | + |
| 3 | +## Summary |
| 4 | + |
| 5 | +The `GEOSource` class in `linkml_reference_validator/etl/sources/entrez.py` cannot fetch GEO dataset metadata: it passes accessions such as `GSE67472` directly to `esummary`, but the `gds` Entrez database only accepts numeric UIDs. |
| 6 | + |
| 7 | +## Error Observed |
| 8 | + |
| 9 | +``` |
| 10 | +WARNING:linkml_reference_validator.etl.sources.entrez:Failed to fetch Entrez summary for GEO:GSE67472: Invalid uid GSE67472 at position= 0 |
| 11 | +``` |
| 12 | + |
| 13 | +## Root Cause |
| 14 | + |
| 15 | +The `EntrezSummarySource.fetch()` method calls: |
| 16 | + |
| 17 | +```python |
| 18 | +handle = Entrez.esummary(db=self.ENTREZ_DB, id=identifier) |
| 19 | +``` |
| 20 | + |
| 21 | +For GEO, this becomes `esummary(db='gds', id='GSE67472')`, but the GDS database does not accept accession numbers as IDs; it requires a numeric UID such as `200067472`. |
| 22 | + |
| 23 | +## Proof of Concept |
| 24 | + |
| 25 | +```python |
| 26 | +from Bio import Entrez |
| 27 | +Entrez.email = 'test@example.com' |
| 28 | + |
| 29 | +# This FAILS - accession not accepted as UID |
| 30 | +handle = Entrez.esummary(db='gds', id='GSE67472') |
| 31 | +# Error: Invalid uid GSE67472 at position= 0 |
| 32 | + |
| 33 | +# This WORKS - use esearch first to get UID |
| 34 | +handle = Entrez.esearch(db='gds', term='GSE67472[Accession]') |
| 35 | +result = Entrez.read(handle) |
| 36 | +handle.close() |
| 37 | +# result['IdList'] = ['200067472', ...] |
| 38 | + |
| 39 | +uid = result['IdList'][0] # '200067472' |
| 40 | + |
| 41 | +handle = Entrez.esummary(db='gds', id=uid) |
| 42 | +summary = Entrez.read(handle) |
| 43 | +handle.close() |
| 44 | +print(summary[0].get('title')) |
| 45 | +# Output: "Airway epithelial gene expression in asthma versus healthy controls" |
| 46 | +``` |
| 47 | + |
| 48 | +## Proposed Fix |
| 49 | + |
| 50 | +Override `fetch()` in `GEOSource` to add an `esearch` step that resolves the accession to its numeric UID (the first entry of the returned `IdList`) before calling `esummary`: |
| 51 | + |
| 52 | +```python |
| 53 | +@ReferenceSourceRegistry.register |
| 54 | +class GEOSource(EntrezSummarySource): |
| 55 | + """Fetch GEO series and dataset summaries from Entrez.""" |
| 56 | + |
| 57 | + PREFIX = "GEO" |
| 58 | + ENTREZ_DB = "gds" |
| 59 | + TITLE_FIELDS = ("title", "description", "summary") |
| 60 | + CONTENT_FIELDS = ("summary", "description", "title") |
| 61 | + ID_PATTERNS = (r"^GSE\d+$", r"^GDS\d+$") |
| 62 | + |
| 63 | + def fetch( |
| 64 | + self, identifier: str, config: ReferenceValidationConfig |
| 65 | + ) -> Optional[ReferenceContent]: |
| 66 | + """Fetch GEO dataset metadata, converting accession to UID first.""" |
| 67 | + Entrez.email = config.email |
| 68 | + time.sleep(config.rate_limit_delay) |
| 69 | + |
| 70 | + # Convert accession to UID via esearch |
| 71 | + uid = self._accession_to_uid(identifier) |
| 72 | + if not uid: |
| 73 | + logger.warning(f"Could not find GDS UID for {identifier}") |
| 74 | + return None |
| 75 | + |
| 76 | + # Now fetch summary with numeric UID |
| 77 | + handle = None |
| 78 | + try: |
| 79 | + handle = Entrez.esummary(db=self.ENTREZ_DB, id=uid) |
| 80 | + records = Entrez.read(handle) |
| 81 | + except Exception as exc: |
| 82 | + logger.warning(f"Failed to fetch Entrez summary for {self.prefix()}:{identifier}: {exc}") |
| 83 | + return None |
| 84 | + finally: |
| 85 | + if handle is not None: |
| 86 | + handle.close() |
| 87 | + |
| 88 | + record = self._extract_record(records) |
| 89 | + if not record: |
| 90 | + logger.warning(f"No Entrez summary found for {self.prefix()}:{identifier}") |
| 91 | + return None |
| 92 | + |
| 93 | + title = self._get_first_field_value(record, self.TITLE_FIELDS) |
| 94 | + content = self._get_first_field_value(record, self.CONTENT_FIELDS) |
| 95 | + content_type = "summary" if content else "unavailable" |
| 96 | + |
| 97 | + return ReferenceContent( |
| 98 | + reference_id=f"{self.prefix()}:{identifier}", |
| 99 | + title=title, |
| 100 | + content=content, |
| 101 | + content_type=content_type, |
| 102 | + metadata={"entrez_db": self.ENTREZ_DB, "entrez_uid": uid}, |
| 103 | + ) |
| 104 | + |
| 105 | + def _accession_to_uid(self, accession: str) -> Optional[str]: |
| 106 | + """Convert a GEO accession (GSE/GDS) to its Entrez UID.""" |
| 107 | + handle = None |
| 108 | + try: |
| 109 | + handle = Entrez.esearch(db=self.ENTREZ_DB, term=f"{accession}[Accession]") |
| 110 | + result = Entrez.read(handle) |
| 111 | + if result.get("IdList"): |
| 112 | + return result["IdList"][0] |
| 113 | + except Exception as exc: |
| 114 | + logger.warning(f"esearch failed for {accession}: {exc}") |
| 115 | + finally: |
| 116 | + if handle is not None: |
| 117 | + handle.close() |
| 118 | + return None |
| 119 | +``` |
| 120 | + |
| 121 | +## Testing |
| 122 | + |
| 123 | +After the fix, validation should catch title mismatches such as the following, where the title carries a stray `xxx` prefix: |
| 124 | + |
| 125 | +```yaml |
| 126 | +# In kb/disorders/Asthma.yaml |
| 127 | +datasets: |
| 128 | + - accession: geo:GSE67472 |
| 129 | + title: xxxAirway epithelial gene expression in asthma versus healthy controls # Wrong! |
| 130 | +``` |
| 131 | +
|
| 132 | +Expected validation error: |
| 133 | +``` |
| 134 | +[ERROR] Title mismatch for geo:GSE67472 |
| 135 | + Expected: "Airway epithelial gene expression in asthma versus healthy controls" |
| 136 | + Found: "xxxAirway epithelial gene expression in asthma versus healthy controls" |
| 137 | +``` |
0 commit comments