-
Notifications
You must be signed in to change notification settings - Fork 1
feat: searchable entry links — name-enriched links metadata + raw links.json (#143 #389) #390
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 1 commit
Commits
Show all changes
11 commits
Select commit
Hold shift + click to select a range
1ce7f24
feat: add entry-reference extractor (shared discovery for #143 + #68/…
drernie ad0c7a0
feat: full EntryLink type map + classify_links; fix entity set (#389)
drernie 26bbfb1
feat: write references.json into each package; address review
drernie 9136f51
test: regression test for empty-string entity-link field values
drernie c6e5c3d
refactor: replace overloaded `packageable` with fetchable + eventable…
drernie f2af5e3
feat: searchable links metadata + raw links.json (entity name enrichm…
drernie 62fd7d2
chore: bump version to 0.17.3
drernie a927ce3
docs: changelog for 0.17.3 (searchable links metadata + links.json)
drernie bc876d1
chore: bump version to 0.18.0
drernie bbb2292
docs: align changelog heading to 0.18.0
drernie 8d2ccdc
docs: clarify note-link shape — id and webURL are optional
drernie File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,190 @@ | ||
| """Extract typed references out of a Benchling entry's structured data. | ||
|
|
||
| A Benchling entry can point at other Benchling objects in three places: | ||
|
|
||
| 1. Note links -- ``days[].notes[].links[]``, each shaped ``{id, type, webURL}`` | ||
| where ``id`` and ``webURL`` are optional. Entity mentions appear as | ||
| ``custom_entity`` / ``dna_sequence`` / ``aa_sequence`` / ``dna_oligo`` / | ||
| ``rna_oligo``; the same list also carries non-entity types (e.g. | ||
| ``sql_dashboard``), so callers must filter by ``type``. | ||
| 2. Entity-link fields -- ``fields[name]`` whose ``type`` mentions ``entity``, | ||
| carrying one or more entity IDs directly in ``value``. | ||
| 3. Results tables -- ``results_table`` notes carrying an ``assayResultSchemaId`` | ||
| (the discovery site for assay results, issues #68/#69). | ||
|
|
||
| This module is pure: it operates on the entry dict already fetched by | ||
| ``EntryPackager`` and makes no Benchling API calls. Resolving each reference to a | ||
| full record (``get_by_id`` / ``bulk_get``) is the caller's job. | ||
|
|
||
| Shared discovery layer for entity packaging (#143) and assay results (#68/#69). | ||
| """ | ||
|
|
||
| from dataclasses import dataclass | ||
| from typing import Any, Dict, Iterator, List, Optional, Tuple | ||
|
drernie marked this conversation as resolved.
Outdated
|
||
|
|
||
# Link / field ``type`` strings that identify a registry entity. A note's
# ``links[]`` array also holds non-entity links (dashboards, etc.); membership
# in this set is what separates entity references from those.
ENTITY_LINK_TYPES = frozenset(
    (
        "custom_entity",
        "dna_sequence",
        "aa_sequence",
        "dna_oligo",
        "rna_oligo",
    )
)
|
|
||
# Note ``type`` strings that are treated as candidates for tabular assay
# results when scanning an entry's notes.
RESULTS_TABLE_NOTE_TYPES = frozenset(
    (
        "results_table",
        "registration_table",
        "table",
    )
)
|
|
||
|
|
||
@dataclass(frozen=True)
class EntityReference:
    """One Benchling entity mention found while scanning an entry.

    Instances are immutable and compare (and hash) by field value.

    Attributes:
        id: Identifier of the referenced entity.
        type: The type string exactly as it appeared at the discovery site
            (e.g. ``custom_entity`` on a note link, ``entity_link`` on a
            field). This is not necessarily the entity's own schema type.
        web_url: URL carried by the link, or ``None`` when none was supplied.
        source: Where the reference was found.
    """

    id: str
    type: str
    web_url: Optional[str] = None
    source: str = "note_link"  # either "note_link" or "entity_field"
|
|
||
|
|
||
@dataclass(frozen=True)
class ResultsTableReference:
    """Pointer to an assay-results table that is embedded in an entry note.

    Instances are immutable and compare (and hash) by field value.

    Attributes:
        assay_result_schema_id: Schema ID of the assay results held by the table.
        api_id: The note's API identifier, or ``None`` when it has none.
        name: The note's name, or ``None`` when it has none.
    """

    assay_result_schema_id: str
    api_id: Optional[str] = None
    name: Optional[str] = None
|
|
||
|
|
||
| def _iter_notes(entry_data: Dict[str, Any]) -> Iterator[Dict[str, Any]]: | ||
| """Yield every note across all days, defensively skipping malformed shapes.""" | ||
| for day in entry_data.get("days") or []: | ||
| if not isinstance(day, dict): | ||
| continue | ||
| for note in day.get("notes") or []: | ||
| if isinstance(note, dict): | ||
| yield note | ||
|
|
||
|
|
||
| def _iter_fields(entry_data: Dict[str, Any]) -> Iterator[Tuple[Optional[str], Dict[str, Any]]]: | ||
| """Yield ``(name, field)`` pairs. | ||
|
|
||
| Benchling renders entry ``fields`` as a name-keyed dict; some payloads use a | ||
| list of field objects instead, so both are accepted. | ||
| """ | ||
| fields = entry_data.get("fields") | ||
| if isinstance(fields, dict): | ||
| for name, fval in fields.items(): | ||
| if isinstance(fval, dict): | ||
| yield name, fval | ||
| elif isinstance(fields, list): | ||
| for fval in fields: | ||
| if isinstance(fval, dict): | ||
| yield fval.get("name"), fval | ||
|
|
||
|
|
||
| def _field_value_ids(fval: Dict[str, Any]) -> List[str]: | ||
| """Pull entity ID(s) out of a field value (single value or ``isMulti`` list).""" | ||
| val = fval.get("value") | ||
| if isinstance(val, str): | ||
| return [val] | ||
| if isinstance(val, list): | ||
| return [v for v in val if isinstance(v, str)] | ||
|
drernie marked this conversation as resolved.
Outdated
|
||
| return [] | ||
|
Copilot marked this conversation as resolved.
Outdated
|
||
|
|
||
|
|
||
def extract_note_links(entry_data: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Collect every link dict from every note body, with no type filtering.

    This is the low-level primitive; :func:`extract_entity_references` is the
    usual entry point. Notes with a missing or empty ``links`` value contribute
    nothing, and non-dict items inside ``links`` are dropped.
    """
    return [
        link
        for note in _iter_notes(entry_data)
        for link in note.get("links") or []
        if isinstance(link, dict)
    ]
|
|
||
|
|
||
def extract_entity_references(
    entry_data: Dict[str, Any],
    *,
    types: "frozenset[str] | set[str]" = ENTITY_LINK_TYPES,
) -> List[EntityReference]:
    """Return deduped entity references from note links and entity-link fields.

    Note links are kept only when their ``type`` is in ``types`` (default: all
    known entity types) and they carry a non-empty ``id``. Entity-link fields
    are detected by a case-insensitive ``entity`` substring in the field
    ``type`` and are included regardless of ``types``; empty field values are
    skipped.

    References are deduped by their string ID, preserving first-seen order
    (note links before fields), so a field that repeats a note link's ID does
    not produce a second reference.
    """
    seen: set[str] = set()
    refs: List[EntityReference] = []

    for link in extract_note_links(entry_data):
        raw_id = link.get("id")
        link_type = link.get("type")
        if not raw_id or link_type not in types:
            continue
        # Dedupe on the same string form that is stored on the reference, so a
        # non-string id cannot slip past the field-value check further down.
        link_id = str(raw_id)
        if link_id in seen:
            continue
        seen.add(link_id)
        refs.append(
            EntityReference(
                id=link_id,
                type=str(link_type),
                web_url=link.get("webURL") or link.get("web_url"),
                source="note_link",
            )
        )

    for _name, fval in _iter_fields(entry_data):
        ftype = fval.get("type")
        if not ftype or "entity" not in str(ftype).lower():
            continue
        for value_id in _field_value_ids(fval):
            # An unset entity-link field can carry "" as its value; that is not
            # an ID and must not become a reference.
            if not value_id or value_id in seen:
                continue
            seen.add(value_id)
            refs.append(EntityReference(id=value_id, type=str(ftype), source="entity_field"))

    return refs
|
|
||
|
|
||
def extract_results_tables(entry_data: Dict[str, Any]) -> List[ResultsTableReference]:
    """Collect unique assay-results-table references from an entry's notes.

    A note qualifies only if its ``type`` is one of
    ``RESULTS_TABLE_NOTE_TYPES`` *and* it has a truthy ``assayResultSchemaId``.
    Uniqueness is decided on the ``(apiId, assayResultSchemaId)`` pair; the
    first note seen for a pair wins and first-seen order is preserved.
    """
    # Insertion-ordered dict doubles as the "seen" set and the ordered result.
    unique: Dict[Tuple[Optional[str], str], ResultsTableReference] = {}
    for note in _iter_notes(entry_data):
        if note.get("type") not in RESULTS_TABLE_NOTE_TYPES:
            continue
        schema_id = note.get("assayResultSchemaId")
        if not schema_id:
            continue
        api_id = note.get("apiId")
        dedupe_key = (api_id, schema_id)
        if dedupe_key not in unique:
            unique[dedupe_key] = ResultsTableReference(
                assay_result_schema_id=schema_id,
                api_id=api_id,
                name=note.get("name"),
            )
    return list(unique.values())
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,128 @@ | ||
| """Tests for entry_references extractor (shared discovery for #143 + #68/#69).""" | ||
|
|
||
| from src.entry_references import ( | ||
| ENTITY_LINK_TYPES, | ||
| EntityReference, | ||
| ResultsTableReference, | ||
| extract_entity_references, | ||
| extract_note_links, | ||
| extract_results_tables, | ||
| ) | ||
|
|
||
|
|
||
| def _entry(*notes, fields=None): | ||
| """Build a minimal entry dict with one day holding the given notes.""" | ||
| return { | ||
| "id": "etr_test", | ||
| "days": [{"date": "2026-06-15", "title": "Day 1", "notes": list(notes)}], | ||
| "fields": fields or {}, | ||
| } | ||
|
|
||
|
|
||
| def _link_note(*links): | ||
| return {"type": "text", "text": "", "links": list(links)} | ||
|
|
||
|
|
||
class TestExtractNoteLinks:
    """``extract_note_links`` gathers raw link dicts and tolerates sparse entries."""

    def test_collects_links_across_notes(self):
        first = {"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}
        second = {"id": "seq_1", "type": "dna_sequence", "webURL": "u2"}
        found = extract_note_links(_entry(_link_note(first), _link_note(second)))
        assert [link["id"] for link in found] == ["bfi_1", "seq_1"]

    def test_empty_when_no_links(self):
        no_links_entry = _entry(_link_note())
        assert extract_note_links(no_links_entry) == []

    def test_tolerates_missing_days_notes_links(self):
        # Progressively deeper shapes, each missing the next level down.
        for sparse in ({}, {"days": [{}]}, {"days": [{"notes": [{}]}]}):
            assert extract_note_links(sparse) == []
|
|
||
|
|
||
class TestExtractEntityReferences:
    """Filtering, deduplication and field handling of ``extract_entity_references``."""

    def test_returns_entity_links_only(self):
        entity = {"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}
        dashboard = {"id": "axdash_1", "type": "sql_dashboard", "webURL": "u2"}
        expected = EntityReference(id="bfi_1", type="custom_entity", web_url="u1", source="note_link")
        assert extract_entity_references(_entry(_link_note(entity, dashboard))) == [expected]

    def test_dedupes_repeated_mentions(self):
        mention = {"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}
        entry = _entry(_link_note(dict(mention)), _link_note(dict(mention)))
        found_ids = [ref.id for ref in extract_entity_references(entry)]
        assert found_ids == ["bfi_1"]

    def test_all_known_entity_types_pass_filter(self):
        notes = [
            _link_note({"id": f"id_{link_type}", "type": link_type, "webURL": "u"})
            for link_type in sorted(ENTITY_LINK_TYPES)
        ]
        found_types = {ref.type for ref in extract_entity_references(_entry(*notes))}
        assert found_types == set(ENTITY_LINK_TYPES)

    def test_pulls_entity_link_fields_single_and_multi(self):
        fields = {
            "Cell Line": {"type": "entity_link", "value": "bfi_field"},
            "Plasmids": {"type": "entity_link", "isMulti": True, "value": ["seq_a", "seq_b"]},
            "Project": {"type": "text", "value": "ignored"},
        }
        found = [(ref.id, ref.source) for ref in extract_entity_references(_entry(fields=fields))]
        assert found == [
            ("bfi_field", "entity_field"),
            ("seq_a", "entity_field"),
            ("seq_b", "entity_field"),
        ]

    def test_field_ids_deduped_against_note_links(self):
        note = _link_note({"id": "bfi_1", "type": "custom_entity", "webURL": "u1"})
        fields = {"Cell Line": {"type": "entity_link", "value": "bfi_1"}}
        refs = extract_entity_references(_entry(note, fields=fields))
        assert [ref.id for ref in refs] == ["bfi_1"]
        assert refs[0].source == "note_link"  # the first sighting is the one kept

    def test_custom_type_filter(self):
        note = _link_note(
            {"id": "bfi_1", "type": "custom_entity", "webURL": "u1"},
            {"id": "seq_1", "type": "dna_sequence", "webURL": "u2"},
        )
        refs = extract_entity_references(_entry(note), types={"dna_sequence"})
        assert [ref.id for ref in refs] == ["seq_1"]

    def test_fields_as_list_shape(self):
        list_shaped_fields = [{"name": "Cell Line", "type": "entity_link", "value": "bfi_x"}]
        refs = extract_entity_references({"days": [], "fields": list_shaped_fields})
        assert [ref.id for ref in refs] == ["bfi_x"]
|
|
||
|
|
||
class TestExtractResultsTables:
    """Selection and deduplication behavior of ``extract_results_tables``."""

    def test_returns_tables_with_schema_id(self):
        table_note = {
            "type": "results_table",
            "apiId": "tbl_1",
            "assayResultSchemaId": "assaysch_1",
            "name": "T1",
        }
        text_note = {"type": "text", "links": []}
        expected = ResultsTableReference(assay_result_schema_id="assaysch_1", api_id="tbl_1", name="T1")
        assert extract_results_tables(_entry(table_note, text_note)) == [expected]

    def test_skips_tables_without_schema_id(self):
        schemaless = {"type": "results_table", "apiId": "tbl_1"}
        assert extract_results_tables(_entry(schemaless)) == []

    def test_dedupes_by_api_and_schema(self):
        template = {"type": "results_table", "apiId": "tbl_1", "assayResultSchemaId": "assaysch_1"}
        # Two distinct dict objects with identical content must collapse to one.
        tables = extract_results_tables(_entry(dict(template), dict(template)))
        assert len(tables) == 1
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.