Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
190 changes: 190 additions & 0 deletions docker/src/entry_references.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
"""Extract typed references out of a Benchling entry's structured data.

A Benchling entry can point at other Benchling objects in three places:

1. Note links -- ``days[].notes[].links[]``, each ``{id, type, webURL}``. Entity
   mentions appear as ``custom_entity`` / ``dna_sequence`` / ``aa_sequence`` /
   ``dna_oligo`` / ``rna_oligo``; the same list also carries non-entity types
   (e.g. ``sql_dashboard``), so callers must filter by ``type``.
2. Entity-link fields -- ``fields[name]`` whose ``type`` mentions ``entity``,
   carrying one or more entity IDs directly in ``value``.
3. Results tables -- table-type notes (``results_table`` and the other
   ``RESULTS_TABLE_NOTE_TYPES``) carrying an ``assayResultSchemaId``
   (the discovery site for assay results, issues #68/#69).

This module is pure: it operates on the entry dict already fetched by
``EntryPackager`` and makes no Benchling API calls. Resolving each reference to a
full record (``get_by_id`` / ``bulk_get``) is the caller's job.

Shared discovery layer for entity packaging (#143) and assay results (#68/#69).
"""

from dataclasses import dataclass
from typing import Any, Dict, Iterator, List, Optional, Tuple
Comment thread
drernie marked this conversation as resolved.
Outdated

# Note-link ``type`` values that denote a registry entity. The ``links[]``
# array of a note also carries non-entity links (dashboards, etc.); this set
# is the default filter used to keep only the entity ones.
ENTITY_LINK_TYPES = frozenset(
    (
        "custom_entity",
        "dna_sequence",
        "aa_sequence",
        "dna_oligo",
        "rna_oligo",
    )
)

# Note ``type`` values treated as table notes when looking for assay results.
# A matching note is only reported if it also has an ``assayResultSchemaId``.
RESULTS_TABLE_NOTE_TYPES = frozenset(
    (
        "results_table",
        "registration_table",
        "table",
    )
)


@dataclass(frozen=True)
class EntityReference:
    """A reference to a Benchling entity discovered inside an entry.

    ``type`` is the discovery type as seen in the entry (e.g. ``custom_entity``
    from a note link, or ``entity_link`` from a field) -- not necessarily the
    entity's own schema type. ``source`` records where the reference was found.

    Instances are immutable (``frozen=True``) and compare/hash by value, so
    they can be collected in sets or used as dict keys.
    """

    # ID of the referenced entity, as it appears in the entry.
    id: str
    # Discovery type: the note link's ``type`` or the field's ``type``.
    type: str
    # URL taken from the note link when one was present; ``None`` otherwise.
    web_url: Optional[str] = None
    # Where the reference was found: "note_link" | "entity_field".
    source: str = "note_link"


@dataclass(frozen=True)
class ResultsTableReference:
    """A reference to an assay-results table embedded in an entry note.

    Immutable (``frozen=True``); only ``assay_result_schema_id`` is required.
    """

    # Value of the note's ``assayResultSchemaId``.
    assay_result_schema_id: str
    # Value of the note's ``apiId``, when the note carries one.
    api_id: Optional[str] = None
    # Value of the note's ``name``, when the note carries one.
    name: Optional[str] = None


def _iter_notes(entry_data: Dict[str, Any]) -> Iterator[Dict[str, Any]]:
"""Yield every note across all days, defensively skipping malformed shapes."""
for day in entry_data.get("days") or []:
if not isinstance(day, dict):
continue
for note in day.get("notes") or []:
if isinstance(note, dict):
yield note


def _iter_fields(entry_data: Dict[str, Any]) -> Iterator[Tuple[Optional[str], Dict[str, Any]]]:
"""Yield ``(name, field)`` pairs.

Benchling renders entry ``fields`` as a name-keyed dict; some payloads use a
list of field objects instead, so both are accepted.
"""
fields = entry_data.get("fields")
if isinstance(fields, dict):
for name, fval in fields.items():
if isinstance(fval, dict):
yield name, fval
elif isinstance(fields, list):
for fval in fields:
if isinstance(fval, dict):
yield fval.get("name"), fval


def _field_value_ids(fval: Dict[str, Any]) -> List[str]:
"""Pull entity ID(s) out of a field value (single value or ``isMulti`` list)."""
val = fval.get("value")
if isinstance(val, str):
return [val]
if isinstance(val, list):
return [v for v in val if isinstance(v, str)]
Comment thread
drernie marked this conversation as resolved.
Outdated
return []
Comment thread
Copilot marked this conversation as resolved.
Outdated


def extract_note_links(entry_data: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Collect the dict-shaped items of every note's ``links`` array.

    No filtering by link ``type`` is applied, and notes with a missing or empty
    ``links`` value contribute nothing. This is the low-level primitive; most
    callers want :func:`extract_entity_references`.
    """
    return [
        candidate
        for note in _iter_notes(entry_data)
        for candidate in note.get("links") or []
        if isinstance(candidate, dict)
    ]


def extract_entity_references(
    entry_data: Dict[str, Any],
    *,
    types: "frozenset[str] | set[str]" = ENTITY_LINK_TYPES,
) -> List[EntityReference]:
    """Return deduped entity references from note links and entity-link fields.

    Note links are filtered to ``types`` (default: all known entity types).
    Entity-link fields are detected by an ``entity`` substring in the field
    ``type`` and are included regardless of ``types``. References are deduped by
    ID, preserving first-seen order (note links before fields).

    Malformed links are skipped rather than raising: a link with a missing or
    empty ``id``, an ``id`` that is a dict or list, or a ``type`` that is not a
    string never produces a reference. Empty field IDs are skipped as well.

    Args:
        entry_data: The entry dict to scan.
        types: Note-link ``type`` values to accept.

    Returns:
        One :class:`EntityReference` per distinct ID, in first-seen order.
    """
    seen: set[str] = set()
    refs: List[EntityReference] = []

    for link in extract_note_links(entry_data):
        raw_id = link.get("id")
        link_type = link.get("type")
        # dict/list IDs are unhashable and would raise on the dedupe lookup;
        # treat them as malformed, like a missing ID.
        if not raw_id or isinstance(raw_id, (dict, list)):
            continue
        # A non-string type cannot match a set of type names, and an unhashable
        # one would raise on the membership test.
        if not isinstance(link_type, str) or link_type not in types:
            continue
        # Normalize before deduping so the key in ``seen`` is exactly the ID
        # that is emitted (and matches string IDs coming from fields).
        link_id = str(raw_id)
        if link_id in seen:
            continue
        seen.add(link_id)
        refs.append(
            EntityReference(
                id=link_id,
                type=link_type,
                web_url=link.get("webURL") or link.get("web_url"),
                source="note_link",
            )
        )

    for _name, fval in _iter_fields(entry_data):
        ftype = fval.get("type")
        if not ftype or "entity" not in str(ftype).lower():
            continue
        for value_id in _field_value_ids(fval):
            # Same rule as for note links: an empty ID is not a reference.
            if not value_id or value_id in seen:
                continue
            seen.add(value_id)
            refs.append(EntityReference(id=value_id, type=str(ftype), source="entity_field"))

    return refs


def extract_results_tables(entry_data: Dict[str, Any]) -> List[ResultsTableReference]:
    """Collect one reference per distinct assay-results table found in the notes.

    A note qualifies when its ``type`` is one of ``RESULTS_TABLE_NOTE_TYPES``
    and it has a truthy ``assayResultSchemaId``. Notes sharing the same
    ``(apiId, assayResultSchemaId)`` pair are reported once, keeping the first
    occurrence and the order in which tables were encountered.
    """
    # Insertion-ordered mapping doubles as the "seen" set and the result list.
    unique: Dict[Tuple[Optional[str], str], ResultsTableReference] = {}
    for note in _iter_notes(entry_data):
        if note.get("type") in RESULTS_TABLE_NOTE_TYPES:
            schema_id = note.get("assayResultSchemaId")
            if schema_id:
                api_id = note.get("apiId")
                dedupe_key = (api_id, schema_id)
                if dedupe_key not in unique:
                    unique[dedupe_key] = ResultsTableReference(
                        assay_result_schema_id=schema_id,
                        api_id=api_id,
                        name=note.get("name"),
                    )
    return list(unique.values())
128 changes: 128 additions & 0 deletions docker/tests/test_entry_references.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
"""Tests for entry_references extractor (shared discovery for #143 + #68/#69)."""

from src.entry_references import (
ENTITY_LINK_TYPES,
EntityReference,
ResultsTableReference,
extract_entity_references,
extract_note_links,
extract_results_tables,
)


def _entry(*notes, fields=None):
"""Build a minimal entry dict with one day holding the given notes."""
return {
"id": "etr_test",
"days": [{"date": "2026-06-15", "title": "Day 1", "notes": list(notes)}],
"fields": fields or {},
}


def _link_note(*links):
return {"type": "text", "text": "", "links": list(links)}


class TestExtractNoteLinks:
    """Behaviour of the unfiltered ``extract_note_links`` primitive."""

    def test_collects_links_across_notes(self):
        first = _link_note({"id": "bfi_1", "type": "custom_entity", "webURL": "u1"})
        second = _link_note({"id": "seq_1", "type": "dna_sequence", "webURL": "u2"})
        found = extract_note_links(_entry(first, second))
        assert [link["id"] for link in found] == ["bfi_1", "seq_1"]

    def test_empty_when_no_links(self):
        entry = _entry(_link_note())
        assert extract_note_links(entry) == []

    def test_tolerates_missing_days_notes_links(self):
        # Each shape drops one more level of the days -> notes -> links nesting.
        for sparse_entry in ({}, {"days": [{}]}, {"days": [{"notes": [{}]}]}):
            assert extract_note_links(sparse_entry) == []


class TestExtractEntityReferences:
    """Behaviour of ``extract_entity_references`` for note links and fields."""

    def test_returns_entity_links_only(self):
        """Non-entity link types sharing the ``links`` array are filtered out."""
        entry = _entry(
            _link_note(
                {"id": "bfi_1", "type": "custom_entity", "webURL": "u1"},
                {"id": "axdash_1", "type": "sql_dashboard", "webURL": "u2"},
            )
        )
        refs = extract_entity_references(entry)
        assert refs == [EntityReference(id="bfi_1", type="custom_entity", web_url="u1", source="note_link")]

    def test_dedupes_repeated_mentions(self):
        """The same entity mentioned in two notes is reported once."""
        entry = _entry(
            _link_note({"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}),
            _link_note({"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}),
        )
        refs = extract_entity_references(entry)
        assert [r.id for r in refs] == ["bfi_1"]

    def test_skips_links_without_id(self):
        """A link with a missing or empty ``id`` never produces a reference."""
        entry = _entry(
            _link_note(
                {"type": "custom_entity", "webURL": "u1"},
                {"id": "", "type": "custom_entity", "webURL": "u2"},
                {"id": "bfi_1", "type": "custom_entity", "webURL": "u3"},
            )
        )
        refs = extract_entity_references(entry)
        assert [r.id for r in refs] == ["bfi_1"]

    def test_all_known_entity_types_pass_filter(self):
        """Every member of ``ENTITY_LINK_TYPES`` is accepted by default."""
        notes = [_link_note({"id": f"id_{t}", "type": t, "webURL": "u"}) for t in sorted(ENTITY_LINK_TYPES)]
        refs = extract_entity_references(_entry(*notes))
        assert {r.type for r in refs} == set(ENTITY_LINK_TYPES)

    def test_pulls_entity_link_fields_single_and_multi(self):
        """Single-value and multi-value entity-link fields both contribute IDs."""
        entry = _entry(
            fields={
                "Cell Line": {"type": "entity_link", "value": "bfi_field"},
                "Plasmids": {"type": "entity_link", "isMulti": True, "value": ["seq_a", "seq_b"]},
                "Project": {"type": "text", "value": "ignored"},
            }
        )
        refs = extract_entity_references(entry)
        assert [(r.id, r.source) for r in refs] == [
            ("bfi_field", "entity_field"),
            ("seq_a", "entity_field"),
            ("seq_b", "entity_field"),
        ]

    def test_field_ids_deduped_against_note_links(self):
        """An ID seen in a note link is not repeated when a field carries it too."""
        entry = _entry(
            _link_note({"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}),
            fields={"Cell Line": {"type": "entity_link", "value": "bfi_1"}},
        )
        refs = extract_entity_references(entry)
        assert [r.id for r in refs] == ["bfi_1"]
        assert refs[0].source == "note_link"  # first-seen wins

    def test_custom_type_filter(self):
        """A caller-supplied ``types`` set narrows which note links are kept."""
        entry = _entry(
            _link_note(
                {"id": "bfi_1", "type": "custom_entity", "webURL": "u1"},
                {"id": "seq_1", "type": "dna_sequence", "webURL": "u2"},
            )
        )
        refs = extract_entity_references(entry, types={"dna_sequence"})
        assert [r.id for r in refs] == ["seq_1"]

    def test_fields_as_list_shape(self):
        """``fields`` given as a list of field objects is accepted."""
        entry = {
            "days": [],
            "fields": [{"name": "Cell Line", "type": "entity_link", "value": "bfi_x"}],
        }
        refs = extract_entity_references(entry)
        assert [r.id for r in refs] == ["bfi_x"]


class TestExtractResultsTables:
    """Behaviour of ``extract_results_tables`` over entry notes."""

    def test_returns_tables_with_schema_id(self):
        """A results-table note with a schema ID is reported; other notes are not."""
        entry = _entry(
            {"type": "results_table", "apiId": "tbl_1", "assayResultSchemaId": "assaysch_1", "name": "T1"},
            {"type": "text", "links": []},
        )
        assert extract_results_tables(entry) == [
            ResultsTableReference(assay_result_schema_id="assaysch_1", api_id="tbl_1", name="T1")
        ]

    def test_skips_tables_without_schema_id(self):
        """A table note lacking ``assayResultSchemaId`` is ignored."""
        entry = _entry({"type": "results_table", "apiId": "tbl_1"})
        assert extract_results_tables(entry) == []

    def test_skips_non_table_notes_with_schema_id(self):
        """A schema ID on a note of a non-table type is not enough to qualify."""
        entry = _entry({"type": "text", "apiId": "tbl_1", "assayResultSchemaId": "assaysch_1"})
        assert extract_results_tables(entry) == []

    def test_dedupes_by_api_and_schema(self):
        """Two notes with the same ``apiId`` and schema ID yield one reference."""
        note = {"type": "results_table", "apiId": "tbl_1", "assayResultSchemaId": "assaysch_1"}
        entry = _entry(dict(note), dict(note))
        assert len(extract_results_tables(entry)) == 1