From 165ae9ba15170b2be1cd71baa7cf152ce62832fa Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Thu, 14 May 2026 10:33:27 -0700 Subject: [PATCH 01/18] feat(occurrence-stats): add lca_rank_between helper Pure-Python LCA over (taxon_id, rank, parents_json) tuples. Returns the deepest shared TaxonRank or None. Used by the upcoming human-model-agreement stat to bucket agreement at-or-finer-than ORDER. Plan: docs/claude/planning/2026-05-14-human-model-agreement-endpoint.md Side-research: docs/claude/planning/occurrence-filter-driven-exports.md Co-Authored-By: Claude --- ami/main/models_future/occurrence.py | 26 +- ami/main/tests.py | 93 ++ ...26-05-14-human-model-agreement-endpoint.md | 812 ++++++++++++++++++ .../occurrence-filter-driven-exports.md | 116 +++ 4 files changed, 1046 insertions(+), 1 deletion(-) create mode 100644 docs/claude/planning/2026-05-14-human-model-agreement-endpoint.md create mode 100644 docs/claude/planning/occurrence-filter-driven-exports.md diff --git a/ami/main/models_future/occurrence.py b/ami/main/models_future/occurrence.py index 6f599cbe0..999fec65b 100644 --- a/ami/main/models_future/occurrence.py +++ b/ami/main/models_future/occurrence.py @@ -14,11 +14,35 @@ from django.db.models import Count, Prefetch, Q, QuerySet -from ami.main.models import Project, User +from ami.main.models import Project, TaxonRank, User if TYPE_CHECKING: from ami.main.models import Classification, Identification, Occurrence +TaxonTuple = tuple[int, str, list[dict]] + + +def lca_rank_between(a: TaxonTuple, b: TaxonTuple) -> TaxonRank | None: + """Most-specific shared ancestor rank between two taxa. + + Inputs are ``(taxon_id, rank_str, parents_json)`` triples where + ``parents_json`` is ordered root → immediate parent (Taxon.parents_json layout). + + The taxon itself counts as part of its own ancestor chain — passing the + same taxon twice returns that taxon's rank. Returns ``None`` when the two + chains share no ancestor (e.g. 
one has an empty parents_json and the other + doesn't include it). + """ + chain_a = [(p["id"], TaxonRank(p["rank"])) for p in a[2]] + [(a[0], TaxonRank(a[1]))] + chain_b_ids = {p["id"] for p in b[2]} | {b[0]} + + deepest: TaxonRank | None = None + for tid, rank in chain_a: + if tid in chain_b_ids: + if deepest is None or rank > deepest: + deepest = rank + return deepest + def _detections_prefetch(*, ordering: tuple[str, ...], with_source_image: bool) -> Prefetch: from ami.main.models import Classification, Detection diff --git a/ami/main/tests.py b/ami/main/tests.py index 3352bd2d0..7ca932b46 100644 --- a/ami/main/tests.py +++ b/ami/main/tests.py @@ -4679,6 +4679,99 @@ def test_source_image_cached_counts_refresh_on_threshold_change(self): ) +class TestLcaRankBetween(TestCase): + """Pure-Python LCA over (taxon_id, rank, parents_json) tuples. + + Inputs encode each taxon as ``(id, rank_str, [{"id": int, "rank": str}, ...])`` + where the parents list is ordered root → immediate-parent (matches + Taxon.parents_json layout). 
+ """ + + GENUS_NOCTUA = ( + 101, + "GENUS", + [ + {"id": 1, "rank": "KINGDOM"}, + {"id": 4, "rank": "ORDER"}, + {"id": 30, "rank": "FAMILY"}, + ], + ) + SPECIES_NOCTUA_PRONUBA = ( + 201, + "SPECIES", + [ + {"id": 1, "rank": "KINGDOM"}, + {"id": 4, "rank": "ORDER"}, + {"id": 30, "rank": "FAMILY"}, + {"id": 101, "rank": "GENUS"}, + ], + ) + SPECIES_NOCTUA_FIMBRIATA = ( + 202, + "SPECIES", + [ + {"id": 1, "rank": "KINGDOM"}, + {"id": 4, "rank": "ORDER"}, + {"id": 30, "rank": "FAMILY"}, + {"id": 101, "rank": "GENUS"}, + ], + ) + SPECIES_DIFFERENT_FAMILY = ( + 301, + "SPECIES", + [ + {"id": 1, "rank": "KINGDOM"}, + {"id": 4, "rank": "ORDER"}, + {"id": 99, "rank": "FAMILY"}, + ], + ) + SPECIES_DIFFERENT_ORDER = ( + 401, + "SPECIES", + [ + {"id": 1, "rank": "KINGDOM"}, + {"id": 5, "rank": "ORDER"}, + ], + ) + + def test_identical_taxa_lca_is_self_rank(self): + from ami.main.models_future.occurrence import lca_rank_between + + rank = lca_rank_between(self.SPECIES_NOCTUA_PRONUBA, self.SPECIES_NOCTUA_PRONUBA) + self.assertEqual(rank, TaxonRank.SPECIES) + + def test_sister_species_share_genus(self): + from ami.main.models_future.occurrence import lca_rank_between + + rank = lca_rank_between(self.SPECIES_NOCTUA_PRONUBA, self.SPECIES_NOCTUA_FIMBRIATA) + self.assertEqual(rank, TaxonRank.GENUS) + + def test_genus_vs_species_in_same_genus(self): + from ami.main.models_future.occurrence import lca_rank_between + + rank = lca_rank_between(self.GENUS_NOCTUA, self.SPECIES_NOCTUA_PRONUBA) + self.assertEqual(rank, TaxonRank.GENUS) + + def test_different_family_same_order(self): + from ami.main.models_future.occurrence import lca_rank_between + + rank = lca_rank_between(self.SPECIES_NOCTUA_PRONUBA, self.SPECIES_DIFFERENT_FAMILY) + self.assertEqual(rank, TaxonRank.ORDER) + + def test_different_order_same_kingdom(self): + from ami.main.models_future.occurrence import lca_rank_between + + rank = lca_rank_between(self.SPECIES_NOCTUA_PRONUBA, self.SPECIES_DIFFERENT_ORDER) + 
self.assertEqual(rank, TaxonRank.KINGDOM) + + def test_no_shared_ancestor_returns_none(self): + from ami.main.models_future.occurrence import lca_rank_between + + rootless = (501, "SPECIES", []) + rank = lca_rank_between(rootless, self.SPECIES_NOCTUA_PRONUBA) + self.assertIsNone(rank) + + class TestOccurrenceStatsViewSet(APITestCase): """Covers /api/v2/occurrences/stats/top-identifiers/. diff --git a/docs/claude/planning/2026-05-14-human-model-agreement-endpoint.md b/docs/claude/planning/2026-05-14-human-model-agreement-endpoint.md new file mode 100644 index 000000000..09bd56ae9 --- /dev/null +++ b/docs/claude/planning/2026-05-14-human-model-agreement-endpoint.md @@ -0,0 +1,812 @@ +# `/occurrences/stats/human-model-agreement/` Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add a project-scoped stats endpoint that returns verified-occurrence and human↔model-agreement rates over the same filter set the `/occurrences/` list view accepts. + +**Architecture:** +- Pure aggregation function in `ami/main/models_future/occurrence.py` operating on an already-filtered `Occurrence` queryset (caller wires `apply_default_filters` + `OccurrenceFilter`). +- `@action` on existing `OccurrenceStatsViewSet`. Re-uses `OccurrenceViewSet`'s `filter_backends` + `filterset_fields` so any query param valid on the list view is valid here. +- LCA computed in Python via `Taxon.parents_json`. Rank ordering via existing `TaxonRank(OrderedEnum)`. No DB schema changes. + +**Tech Stack:** Django 4.2, DRF, django-filter, drf-spectacular. Python 3.11. + +**Spec reference:** `docs/claude/prompts/human-model-agreement-endpoint.md` (lives in sibling `user-leaderboard` worktree). Stats convention: `docs/claude/reference/api-stats-pattern.md`. 
+ +**Open questions resolved during planning** (cite as evidence in PR description): + +- **"Verified"** = occurrence has ≥1 non-withdrawn `Identification`. Matches `OccurrenceVerified` filter at `ami/main/api/views.py:1032` (which doesn't filter `withdrawn`), with `withdrawn=False` added for stats — consistent with `OccurrenceQuerySet.with_verification_info()` at `ami/main/models.py:3032`. +- **"Model prediction"** = `Classification` chosen by `BEST_MACHINE_PREDICTION_ORDER = ("-terminal", "-score", "-pk")` at `ami/main/models.py:61`. NOT `Occurrence.determination` (user-overridable). Use existing `OccurrenceQuerySet.with_best_machine_prediction()` at `ami/main/models.py:2998` which exposes `best_machine_prediction_taxon_id`. +- **"Under order"** inclusive: a taxon's rank qualifies iff `TaxonRank(rank) >= TaxonRank.ORDER`. `OrderedEnum.__ge__` at `ami/utils/schemas.py:51`. So ORDER, SUPERFAMILY, FAMILY, SUBFAMILY, TRIBE, SUBTRIBE, GENUS, SPECIES all count. CLASS, PHYLUM, KINGDOM do not. + +--- + +## File Structure + +``` +ami/ + main/ + models_future/ + occurrence.py # ADD: human_model_agreement_for_project() + # ADD: lca_rank_between() helper + api/ + views.py # MODIFY: add human_model_agreement @action to OccurrenceStatsViewSet + serializers.py # ADD: HumanModelAgreementSerializer + tests.py # MODIFY: extend TestOccurrenceStatsViewSet +ui/ + src/ + data-services/ + hooks/ + occurrences/ + stats/ + useHumanModelAgreement.ts # ADD: typed React Query hook +``` + +No new files in backend (helpers live next to siblings). One new file frontend-side. + +--- + +## Task 1: LCA helper + rank check (unit-test only, no DB) + +**Files:** +- Modify: `ami/main/models_future/occurrence.py` +- Test: `ami/main/tests.py` (new class `TestLcaRankBetween`) + +The LCA helper takes two `(taxon_id, rank, parents_json)` triples (each taxon's own id and rank are passed alongside its `parents_json` because `parents_json` excludes self) and returns the most-specific shared ancestor's `TaxonRank`, or `None`. Pure function; no DB. 
+ +- [ ] **Step 1.1: Write failing unit tests** + +Add to `ami/main/tests.py` (above `class TestOccurrenceStatsViewSet`): + +```python +from ami.main.models import TaxonRank +from ami.main.models_future.occurrence import lca_rank_between + + +class TestLcaRankBetween(TestCase): + """Pure-Python LCA over (taxon_id, rank, parents_json) tuples. + + Inputs encode each taxon as ``(id, rank_str, [{"id": int, "rank": str}, ...])`` + where the parents list is ordered root → immediate-parent (matches + Taxon.parents_json layout). + """ + + GENUS_NOCTUA = (101, "GENUS", [ + {"id": 1, "rank": "KINGDOM"}, + {"id": 4, "rank": "ORDER"}, + {"id": 30, "rank": "FAMILY"}, + ]) + SPECIES_NOCTUA_PRONUBA = (201, "SPECIES", [ + {"id": 1, "rank": "KINGDOM"}, + {"id": 4, "rank": "ORDER"}, + {"id": 30, "rank": "FAMILY"}, + {"id": 101, "rank": "GENUS"}, + ]) + SPECIES_NOCTUA_FIMBRIATA = (202, "SPECIES", [ + {"id": 1, "rank": "KINGDOM"}, + {"id": 4, "rank": "ORDER"}, + {"id": 30, "rank": "FAMILY"}, + {"id": 101, "rank": "GENUS"}, + ]) + SPECIES_DIFFERENT_FAMILY = (301, "SPECIES", [ + {"id": 1, "rank": "KINGDOM"}, + {"id": 4, "rank": "ORDER"}, + {"id": 99, "rank": "FAMILY"}, + ]) + SPECIES_DIFFERENT_ORDER = (401, "SPECIES", [ + {"id": 1, "rank": "KINGDOM"}, + {"id": 5, "rank": "ORDER"}, + ]) + + def test_identical_taxa_lca_is_self_rank(self): + rank = lca_rank_between(self.SPECIES_NOCTUA_PRONUBA, self.SPECIES_NOCTUA_PRONUBA) + self.assertEqual(rank, TaxonRank.SPECIES) + + def test_sister_species_share_genus(self): + rank = lca_rank_between(self.SPECIES_NOCTUA_PRONUBA, self.SPECIES_NOCTUA_FIMBRIATA) + self.assertEqual(rank, TaxonRank.GENUS) + + def test_genus_vs_species_in_same_genus(self): + rank = lca_rank_between(self.GENUS_NOCTUA, self.SPECIES_NOCTUA_PRONUBA) + # GENUS itself is on the species' ancestor chain, so LCA = GENUS. 
+ self.assertEqual(rank, TaxonRank.GENUS) + + def test_different_family_same_order(self): + rank = lca_rank_between(self.SPECIES_NOCTUA_PRONUBA, self.SPECIES_DIFFERENT_FAMILY) + self.assertEqual(rank, TaxonRank.ORDER) + + def test_different_order_same_kingdom(self): + rank = lca_rank_between(self.SPECIES_NOCTUA_PRONUBA, self.SPECIES_DIFFERENT_ORDER) + self.assertEqual(rank, TaxonRank.KINGDOM) + + def test_no_shared_ancestor_returns_none(self): + rootless = (501, "SPECIES", []) + rank = lca_rank_between(rootless, self.SPECIES_NOCTUA_PRONUBA) + self.assertIsNone(rank) +``` + +- [ ] **Step 1.2: Run tests, confirm they fail (import error)** + +```bash +docker compose run --rm django python manage.py test \ + ami.main.tests.TestLcaRankBetween -v 2 --keepdb +``` +Expected: `ImportError: cannot import name 'lca_rank_between'`. + +- [ ] **Step 1.3: Implement `lca_rank_between`** + +Append to `ami/main/models_future/occurrence.py`: + +```python +from ami.main.models import TaxonRank + +TaxonTuple = tuple[int, str, list[dict]] + + +def lca_rank_between(a: TaxonTuple, b: TaxonTuple) -> TaxonRank | None: + """Most-specific shared ancestor rank between two taxa. + + Inputs are ``(taxon_id, rank_str, parents_json)`` triples where + ``parents_json`` is ordered root → immediate parent (Taxon.parents_json layout). + + The taxon itself counts as part of its own ancestor chain — passing the + same taxon twice returns that taxon's rank. Returns ``None`` when the two + chains share no ancestor (e.g. one has an empty parents_json and the other + doesn't include it). 
+ """ + chain_a = [(p["id"], TaxonRank(p["rank"])) for p in a[2]] + [(a[0], TaxonRank(a[1]))] + chain_b_ids = {p["id"] for p in b[2]} | {b[0]} + + deepest: TaxonRank | None = None + for tid, rank in chain_a: + if tid in chain_b_ids: + if deepest is None or rank > deepest: + deepest = rank + return deepest +``` + +- [ ] **Step 1.4: Run tests, confirm all pass** + +```bash +docker compose run --rm django python manage.py test \ + ami.main.tests.TestLcaRankBetween -v 2 --keepdb +``` +Expected: `OK (6 tests)`. + +- [ ] **Step 1.5: Commit** + +```bash +git add ami/main/models_future/occurrence.py ami/main/tests.py +git commit -m "feat(occurrence-stats): add lca_rank_between helper + +Pure-Python LCA over (taxon_id, rank, parents_json) tuples. Returns +the deepest shared TaxonRank or None. Used by the upcoming +human-model-agreement stat to bucket agreement at-or-finer-than ORDER. + +Co-Authored-By: Claude " +``` + +--- + +## Task 2: Aggregation function over a filtered queryset + +**Files:** +- Modify: `ami/main/models_future/occurrence.py` +- Test: `ami/main/tests.py` (new class `TestHumanModelAgreementForProject`) + +The function takes a filtered `Occurrence` queryset and returns a serializer-ready dict. Caller is responsible for wiring `apply_default_filters` + `OccurrenceFilter` upstream; the function adds the prefetches/annotations it needs and does the bucketing. + +- [ ] **Step 2.1: Write failing test** + +Add to `ami/main/tests.py`: + +```python +class TestHumanModelAgreementForProject(APITestCase): + """Aggregation function. DB-level. Covers the four bucket transitions: + unverified, verified+exact-agreed, verified+under-order-agreed, + verified+disagreed-above-order. 
+ """ + + def setUp(self) -> None: + project, deployment = setup_test_project() + create_taxa(project=project) + create_captures(deployment=deployment) + create_occurrences(deployment=deployment, num=4) + self.project = project + # Need a couple of taxa at known ranks; create_taxa builds a small tree + # rooted in a Kingdom -> Order -> Family -> Genus -> Species chain. + self.species_a = Taxon.objects.get(name="Vanessa atalanta", projects=project) + self.species_b = Taxon.objects.get(name="Vanessa cardui", projects=project) # same genus + self.species_c = Taxon.objects.get(name="Apis mellifera", projects=project) # different family + self.user = User.objects.create_user(email="ider@insectai.org") + + def _attach_machine_prediction(self, occurrence, taxon, score=0.9): + # Picks up the existing detection on this occurrence and adds a Classification. + detection = occurrence.detections.first() + Classification.objects.create( + detection=detection, + taxon=taxon, + score=score, + terminal=True, + algorithm=detection.detection_algorithm, + ) + + def _identify(self, occurrence, taxon): + return Identification.objects.create(user=self.user, occurrence=occurrence, taxon=taxon) + + def test_empty_project_returns_zeros_not_nans(self): + empty_project = Project.objects.create(name="empty") + result = human_model_agreement_for_project(Occurrence.objects.filter(project=empty_project)) + self.assertEqual(result["total_occurrences"], 0) + self.assertEqual(result["verified_count"], 0) + self.assertEqual(result["verified_pct"], 0.0) + self.assertEqual(result["agreed_exact_pct"], 0.0) + self.assertEqual(result["agreed_under_order_pct"], 0.0) + + def test_buckets_four_canonical_cases(self): + occurrences = list(Occurrence.objects.filter(project=self.project)[:4]) + # 0: verified, machine == user (exact agreement) + self._attach_machine_prediction(occurrences[0], self.species_a) + self._identify(occurrences[0], self.species_a) + # 1: verified, machine sister-species (agreement at 
GENUS, under ORDER) + self._attach_machine_prediction(occurrences[1], self.species_a) + self._identify(occurrences[1], self.species_b) + # 2: verified, machine different family but same ORDER (still under-order) + # NOTE: requires species_c to share an order with species_a in the fixture. + # If create_taxa() does not put Apis + Vanessa under the same ORDER, + # construct a sibling-order test taxon here. See follow-up note below. + # 3: unverified (no identification) + self._attach_machine_prediction(occurrences[3], self.species_a) + + result = human_model_agreement_for_project(Occurrence.objects.filter(project=self.project)) + self.assertEqual(result["total_occurrences"], 4) + self.assertEqual(result["verified_count"], 2) # occurrences 0, 1 + self.assertEqual(result["agreed_exact_count"], 1) # occurrence 0 + self.assertEqual(result["agreed_under_order_count"], 2) # both — exact is a subset + self.assertAlmostEqual(result["verified_pct"], 0.5) + self.assertAlmostEqual(result["agreed_exact_pct"], 0.5) + self.assertAlmostEqual(result["agreed_under_order_pct"], 1.0) +``` + +Note on `species_c`: if `create_taxa()` doesn't already place an Apis + Vanessa pair under a shared ORDER, drop that assertion and add a dedicated taxon fixture inside the test. Check `ami/main/tests.py` `create_taxa()` first. + +- [ ] **Step 2.2: Run test, confirm import error** + +```bash +docker compose run --rm django python manage.py test \ + ami.main.tests.TestHumanModelAgreementForProject -v 2 --keepdb +``` + +- [ ] **Step 2.3: Implement aggregation** + +Append to `ami/main/models_future/occurrence.py`: + +```python +def human_model_agreement_for_project(queryset: QuerySet[Occurrence]) -> dict: + """Verified / agreement stats over a pre-filtered Occurrence queryset. + + The queryset MUST already be filtered down to the project + user-supplied + filters (caller wires apply_default_filters + OccurrenceFilter). 
This + function adds the prefetches/annotations it needs and returns a dict + matching HumanModelAgreementSerializer's field set (without project_id — + the view layer adds that). + + "Verified" means the occurrence has at least one non-withdrawn + Identification. "Model prediction" means the Classification chosen by + BEST_MACHINE_PREDICTION_ORDER. "Under-order" agreement means the user's + taxon and the model's prediction share an ancestor at rank >= ORDER + (inclusive of ORDER itself). + """ + from ami.main.models import Identification, Taxon + + qs = ( + queryset + .with_best_machine_prediction() # annotates best_machine_prediction_taxon_id + .prefetch_related( + Prefetch( + "identifications", + queryset=Identification.objects.filter(withdrawn=False) + .select_related("taxon") + .order_by("-created_at", "-pk"), + to_attr="_non_withdrawn_idents", + ) + ) + ) + + # Collect every taxon id we'll need (best-machine + best-user) to do a + # single batched Taxon fetch for parents_json/rank. + rows = list(qs.values( + "pk", + "best_machine_prediction_taxon_id", + )) + # NOTE: .values() drops the prefetched _non_withdrawn_idents; re-iterate qs + # for identification access. 
+ occurrences = list(qs) + + needed_taxa_ids: set[int] = set() + for occ in occurrences: + if occ.best_machine_prediction_taxon_id: + needed_taxa_ids.add(occ.best_machine_prediction_taxon_id) + idents = getattr(occ, "_non_withdrawn_idents", []) + if idents: + needed_taxa_ids.add(idents[0].taxon_id) + + taxa_by_id: dict[int, tuple[int, str, list[dict]]] = { + t.pk: (t.pk, t.rank, [p.dict() if hasattr(p, "dict") else p for p in t.parents_json]) + for t in Taxon.objects.filter(pk__in=needed_taxa_ids).only("pk", "rank", "parents_json") + } + + total = len(occurrences) + verified = 0 + agreed_exact = 0 + agreed_under_order = 0 + + for occ in occurrences: + idents = getattr(occ, "_non_withdrawn_idents", []) + if not idents: + continue + verified += 1 + user_taxon_id = idents[0].taxon_id + machine_taxon_id = occ.best_machine_prediction_taxon_id + if not machine_taxon_id or not user_taxon_id: + continue + if user_taxon_id == machine_taxon_id: + agreed_exact += 1 + agreed_under_order += 1 + continue + user_tuple = taxa_by_id.get(user_taxon_id) + machine_tuple = taxa_by_id.get(machine_taxon_id) + if not user_tuple or not machine_tuple: + continue + lca = lca_rank_between(user_tuple, machine_tuple) + if lca is not None and lca >= TaxonRank.ORDER: + agreed_under_order += 1 + + def _pct(num: int, denom: int) -> float: + return round(num / denom, 4) if denom else 0.0 + + return { + "total_occurrences": total, + "verified_count": verified, + "verified_pct": _pct(verified, total), + "agreed_exact_count": agreed_exact, + "agreed_exact_pct": _pct(agreed_exact, verified), + "agreed_under_order_count": agreed_under_order, + "agreed_under_order_pct": _pct(agreed_under_order, verified), + } +``` + +Note: `agreed_exact_count` is a subset of `agreed_under_order_count` by definition (exact match implies LCA = SPECIES which is >= ORDER). Document this in the serializer's docstring. 
+ +- [ ] **Step 2.4: Run tests; confirm pass** + +```bash +docker compose run --rm django python manage.py test \ + ami.main.tests.TestHumanModelAgreementForProject -v 2 --keepdb +``` + +- [ ] **Step 2.5: Commit** + +```bash +git add ami/main/models_future/occurrence.py ami/main/tests.py +git commit -m "feat(occurrence-stats): aggregate human-model agreement over filtered queryset + +Pure aggregation; caller wires apply_default_filters + OccurrenceFilter. +Annotates best machine prediction, prefetches non-withdrawn identifications, +batches Taxon fetch for parents_json, buckets exact / under-order / above-order. + +Co-Authored-By: Claude " +``` + +--- + +## Task 3: Response serializer + +**Files:** +- Modify: `ami/main/api/serializers.py` + +- [ ] **Step 3.1: Add serializer** + +Locate the existing stats serializers (search for `TopIdentifiersResponseSerializer`) and add below: + +```python +class HumanModelAgreementSerializer(serializers.Serializer): + """Verified / agreement rates over the filtered Occurrence set. + + `agreed_exact_count` is a subset of `agreed_under_order_count` by + construction — an exact match implies an LCA at SPECIES, which is + deeper than ORDER. `*_pct` percentages are 0.0..1.0 (not 0..100). 
+ """ + project_id = serializers.IntegerField() + total_occurrences = serializers.IntegerField() + verified_count = serializers.IntegerField() + verified_pct = serializers.FloatField(help_text="verified_count / total_occurrences") + agreed_exact_count = serializers.IntegerField() + agreed_exact_pct = serializers.FloatField(help_text="agreed_exact_count / verified_count") + agreed_under_order_count = serializers.IntegerField() + agreed_under_order_pct = serializers.FloatField(help_text="agreed_under_order_count / verified_count") +``` + +- [ ] **Step 3.2: Commit** + +```bash +git add ami/main/api/serializers.py +git commit -m "feat(occurrence-stats): add HumanModelAgreementSerializer + +Co-Authored-By: Claude " +``` + +--- + +## Task 4: Action on `OccurrenceStatsViewSet` with filter wiring + +**Files:** +- Modify: `ami/main/api/views.py` + +Pull `OccurrenceViewSet`'s filter backend + filterset_fields list into a module-level tuple so both viewsets share it without `OccurrenceStatsViewSet` having to inherit from `DefaultViewSet` (it stays a plain `GenericViewSet`). 
+ +- [ ] **Step 4.1: Extract shared filter config** + +Above `class OccurrenceViewSet(DefaultViewSet, ProjectMixin):` at `ami/main/api/views.py:1171`, add: + +```python +OCCURRENCE_FILTER_BACKENDS = ( + CustomOccurrenceDeterminationFilter, + OccurrenceCollectionFilter, + OccurrenceAlgorithmFilter, + OccurrenceDateFilter, + OccurrenceVerified, + OccurrenceVerifiedByMeFilter, + OccurrenceTaxaListFilter, +) + +OCCURRENCE_FILTERSET_FIELDS = ( + "event", + "deployment", + "determination__rank", + "detections__source_image", +) +``` + +Then replace the literal lists in `OccurrenceViewSet`: + +```python + filter_backends = DefaultViewSetMixin.filter_backends + list(OCCURRENCE_FILTER_BACKENDS) + filterset_fields = list(OCCURRENCE_FILTERSET_FIELDS) +``` + +- [ ] **Step 4.2: Wire filter machinery onto `OccurrenceStatsViewSet`** + +In `OccurrenceStatsViewSet` at `ami/main/api/views.py:1268`, add (above `permission_classes`): + +```python + queryset = Occurrence.objects.none() # hint for filterset introspection + filter_backends = list(OCCURRENCE_FILTER_BACKENDS) + filterset_fields = list(OCCURRENCE_FILTERSET_FIELDS) +``` + +(DRF's `filter_queryset` is only called when an action invokes it — `top_identifiers` doesn't, so no behavior change there.) + +- [ ] **Step 4.3: Add `human_model_agreement` action** + +Add to `OccurrenceStatsViewSet`, below `top_identifiers`: + +```python + @extend_schema( + parameters=[project_id_doc_param], + responses=HumanModelAgreementSerializer, + ) + @action(detail=False, methods=["get"], url_path="human-model-agreement") + def human_model_agreement(self, request): + """Verified / human↔model agreement rates over the filtered occurrence set. + + Accepts every query param the `/occurrences/` list endpoint accepts. + Reuses `apply_default_filters` so `apply_defaults=false` bypasses + project default taxa lists + score thresholds. 
+ """ + project = self.get_active_project() + assert project is not None # require_project=True + if not Project.objects.visible_for_user(request.user).filter(pk=project.pk).exists(): + raise NotFound("Project not found.") + + base_qs = ( + Occurrence.objects.filter(project=project) + .valid() + .apply_default_filters(project, request) + ) + filtered_qs = self.filter_queryset(base_qs) + payload = human_model_agreement_for_project(filtered_qs) + payload["project_id"] = project.pk + return Response( + HumanModelAgreementSerializer(payload, context={"request": request}).data + ) +``` + +Add the import at the top of `ami/main/api/views.py`: + +```python +from ami.main.models_future.occurrence import ( + human_model_agreement_for_project, + top_identifiers_for_project, +) +``` + +And the serializer import: + +```python +from ami.main.api.serializers import ( + ..., + HumanModelAgreementSerializer, +) +``` + +- [ ] **Step 4.4: Lint + spectacular** + +```bash +docker compose run --rm django flake8 ami/main/api/views.py ami/main/api/serializers.py +docker compose run --rm django python manage.py spectacular --api-version 'api' --format openapi --file /tmp/schema.yaml +``` +Expected: lint clean. spectacular emits no new warnings about the new action. + +- [ ] **Step 4.5: Commit** + +```bash +git add ami/main/api/views.py ami/main/api/serializers.py +git commit -m "feat(occurrence-stats): wire human-model-agreement action + +Extracts the OccurrenceViewSet filter backends + filterset_fields into a +module-level tuple, then attaches them to OccurrenceStatsViewSet so the +new action can reuse OccurrenceFilter pass-through unchanged. The +top_identifiers action keeps its current behavior — filter_queryset is +only invoked by actions that opt in. 
+ +Co-Authored-By: Claude " +``` + +--- + +## Task 5: Endpoint tests + +**Files:** +- Modify: `ami/main/tests.py` + +- [ ] **Step 5.1: Add HTTP-level tests** + +Append inside `class TestOccurrenceStatsViewSet`: + +```python + agreement_url = "/api/v2/occurrences/stats/human-model-agreement/" + + def _make_machine_prediction(self, occurrence, taxon, score=0.9): + detection = occurrence.detections.first() + Classification.objects.create( + detection=detection, + taxon=taxon, + score=score, + terminal=True, + algorithm=detection.detection_algorithm, + ) + + def test_agreement_no_project_id_returns_400(self): + response = self.client.get(self.agreement_url) + self.assertEqual(response.status_code, 400) + + def test_agreement_draft_project_404_for_anon(self): + self.project.draft = True + self.project.save() + response = self.client.get(f"{self.agreement_url}?project_id={self.project.pk}") + self.assertEqual(response.status_code, 404) + + def test_agreement_empty_returns_zero_pcts(self): + response = self.client.get(f"{self.agreement_url}?project_id={self.project.pk}") + self.assertEqual(response.status_code, 200) + body = response.json() + self.assertEqual(body["project_id"], self.project.pk) + self.assertEqual(body["total_occurrences"], 4) + self.assertEqual(body["verified_count"], 0) + self.assertEqual(body["verified_pct"], 0.0) + self.assertEqual(body["agreed_exact_pct"], 0.0) + self.assertEqual(body["agreed_under_order_pct"], 0.0) + + def test_agreement_happy_path(self): + occurrences = list(Occurrence.objects.filter(project=self.project)[:3]) + taxon_a = Taxon.objects.get(name="Vanessa atalanta", projects=self.project) + taxon_b = Taxon.objects.get(name="Vanessa cardui", projects=self.project) + self._make_machine_prediction(occurrences[0], taxon_a) + self._id(self.alice, occurrences[0]) # exact agreement (taxon_a == self.taxon? confirm in fixture) + # ... fill in remaining cases mirroring TestHumanModelAgreementForProject ... 
+ + response = self.client.get(f"{self.agreement_url}?project_id={self.project.pk}") + self.assertEqual(response.status_code, 200) + body = response.json() + self.assertEqual(body["total_occurrences"], 4) + self.assertEqual(body["verified_count"], 1) + + def test_agreement_filter_passthrough(self): + """`?deployment=` should narrow the set.""" + other_deployment = Deployment.objects.create(name="other", project=self.project) + response = self.client.get( + f"{self.agreement_url}?project_id={self.project.pk}&deployment={other_deployment.pk}" + ) + self.assertEqual(response.status_code, 200) + self.assertEqual(response.json()["total_occurrences"], 0) + + def test_agreement_apply_defaults_false_bypasses_project_filters(self): + """Setting a score threshold on the project should reduce counts; apply_defaults=false restores them.""" + self.project.classification_threshold = 0.99 + self.project.save() + gated = self.client.get(f"{self.agreement_url}?project_id={self.project.pk}").json() + bypassed = self.client.get( + f"{self.agreement_url}?project_id={self.project.pk}&apply_defaults=false" + ).json() + self.assertGreaterEqual(bypassed["total_occurrences"], gated["total_occurrences"]) +``` + +- [ ] **Step 5.2: Run full stats viewset tests** + +```bash +docker compose run --rm django python manage.py test \ + ami.main.tests.TestOccurrenceStatsViewSet \ + ami.main.tests.TestHumanModelAgreementForProject \ + ami.main.tests.TestLcaRankBetween -v 2 --keepdb +``` + +Expected: all pass. + +- [ ] **Step 5.3: Commit** + +```bash +git add ami/main/tests.py +git commit -m "test(occurrence-stats): HTTP coverage for human-model-agreement action + +Covers: missing project_id 400, draft 404, empty zeros, happy path +bucket transitions, deployment filter pass-through, apply_defaults=false bypass. 
+ +Co-Authored-By: Claude " +``` + +--- + +## Task 6: Frontend hook + +**Files:** +- Create: `ui/src/data-services/hooks/occurrences/stats/useHumanModelAgreement.ts` + +- [ ] **Step 6.1: Read the sibling hook** + +```bash +cat ui/src/data-services/hooks/occurrences/stats/useTopIdentifiers.ts +``` + +- [ ] **Step 6.2: Write hook mirroring the pattern** + +```typescript +import { useQuery } from '@tanstack/react-query' +import { axios } from 'data-services/api/axios' +import { API_ROUTES, API_URL } from 'data-services/constants' + +export interface HumanModelAgreement { + project_id: number + total_occurrences: number + verified_count: number + verified_pct: number + agreed_exact_count: number + agreed_exact_pct: number + agreed_under_order_count: number + agreed_under_order_pct: number +} + +export const useHumanModelAgreement = (params: Record<string, string | number | undefined>) => { + const cleanParams = Object.fromEntries( + Object.entries(params).filter(([, v]) => v !== undefined && v !== ''), + ) + return useQuery({ + queryKey: ['occurrences', 'stats', 'human-model-agreement', cleanParams], + queryFn: async () => { + const res = await axios.get<HumanModelAgreement>( + `${API_URL}/occurrences/stats/human-model-agreement/`, + { params: cleanParams }, + ) + return res.data + }, + enabled: !!cleanParams.project_id, + }) +} +``` + +Adjust import paths/constants to match the actual `useTopIdentifiers.ts` (file uses repo-local aliases; copy them verbatim from the reference hook rather than guessing). + +- [ ] **Step 6.3: Typecheck** + +```bash +cd ui && yarn tsc --noEmit +``` + +- [ ] **Step 6.4: Commit** + +```bash +git add ui/src/data-services/hooks/occurrences/stats/useHumanModelAgreement.ts +git commit -m "feat(ui): useHumanModelAgreement hook for occurrence stats + +Mirrors useTopIdentifiers. Accepts arbitrary filter params so the +occurrence list page's filter state can be threaded through unchanged. 
+ +Co-Authored-By: Claude " +``` + +--- + +## Task 7: Verification + PR + +- [ ] **Step 7.1: Full test sweep** + +```bash +docker compose run --rm django python manage.py test \ + ami.main.tests.TestOccurrenceStatsViewSet \ + ami.main.tests.TestHumanModelAgreementForProject \ + ami.main.tests.TestLcaRankBetween \ + ami.main.tests.TestOccurrenceListQueryCount -v 2 --keepdb +``` + +The `TestOccurrenceListQueryCount` run guards against accidentally regressing the list endpoint's prefetch contract when editing `OccurrenceViewSet` filter config. + +- [ ] **Step 7.2: Manual smoke** + +```bash +curl -s "http://localhost:8000/api/v2/occurrences/stats/human-model-agreement/?project_id=18" | jq +curl -s "http://localhost:8000/api/v2/occurrences/stats/human-model-agreement/?project_id=18&deployment=42" | jq +curl -s "http://localhost:8000/api/v2/occurrences/stats/human-model-agreement/?project_id=18&apply_defaults=false" | jq + +# Sanity: total_occurrences should match the list endpoint's count. +curl -s "http://localhost:8000/api/v2/occurrences/?project_id=18" | jq .count +``` + +- [ ] **Step 7.3: Push + open PR** + +```bash +git push -u origin feat/human-model-agreement-endpoint +gh pr create --title "feat(occurrence-stats): /occurrences/stats/human-model-agreement/" --body "$(cat <<'EOF' +## Summary + +- New scalar stats action on `OccurrenceStatsViewSet` returning verified-occurrence and human↔model agreement rates over a filtered occurrence queryset. +- Reuses `OccurrenceViewSet`'s filter backends + `apply_default_filters` so any query param valid on `/occurrences/` is valid here. +- LCA computed in Python via `Taxon.parents_json` + `TaxonRank(OrderedEnum)`; "under-order" agreement is inclusive of ORDER itself. + +## Decisions & evidence + +- "Model prediction" = `BEST_MACHINE_PREDICTION_ORDER`-selected `Classification`, NOT `Occurrence.determination` (user-overridable). +- "Verified" = ≥1 non-withdrawn `Identification`. 
Consistent with `with_verification_info()` semantics, slightly stricter than `OccurrenceVerified` filter (which doesn't filter `withdrawn`). +- `agreed_exact_count` is a subset of `agreed_under_order_count` by construction — the aggregation increments both counters on an exact taxon match, whatever rank that match is at (it need not be SPECIES). Surfaced in the serializer docstring. + +## Test plan + +- [x] Unit: `TestLcaRankBetween` covers identical, sister-species, genus-vs-species, different-family, different-order, no-shared-ancestor. +- [x] Aggregation: `TestHumanModelAgreementForProject` covers empty project + four bucket transitions. +- [x] HTTP: `TestOccurrenceStatsViewSet.test_agreement_*` covers 400/404, empty-pct, happy path, filter pass-through, apply_defaults bypass. +- [x] Regression: `TestOccurrenceListQueryCount` still passes after filter config refactor. +- [ ] Smoke against project 18 via curl (see commands in plan). +EOF +)" +``` + +--- + +## Self-review checklist (run before declaring done) + +- [ ] Every step has either code or an exact command — no "implement appropriate handling". +- [ ] Function/method names match across tasks: `lca_rank_between`, `human_model_agreement_for_project`, `HumanModelAgreementSerializer`, `human_model_agreement` action, `useHumanModelAgreement` hook. +- [ ] Test class names are unique and don't collide with existing classes in `ami/main/tests.py`. +- [ ] No new external dependencies introduced. 
+- [ ] Plan covers every requirement listed in `docs/claude/prompts/human-model-agreement-endpoint.md` (worktree `user-leaderboard`): + - Response shape ✓ (Task 3) + - OccurrenceFilter pass-through ✓ (Task 4) + - `apply_defaults=false` ✓ (Task 4 base_qs + Task 5 test) + - LCA via `parents_json` ✓ (Task 1) + - Tests: happy / filter pass-through / empty / rank-LCA / draft 404 ✓ (Task 5) + - FE hook ✓ (Task 6) + +## Out of scope (deferred follow-ups) + +- **Postgres-side rank ordering operator.** `TaxonRank` is `OrderedEnum` in Python; pushing rank comparisons into SQL would require materializing rank → int (e.g. a small mapping table or `CASE` expression). Useful when the stats grow to a per-rank breakdown chart, but the current LCA pass batch-fetches taxa once so it isn't on the hot path. File a follow-up ticket if a future stats kind genuinely scans more taxa than fit in one batch. +- **Disagreed-above-order breakdown.** The current response collapses "verified but no shared ancestor at-or-finer-than ORDER" into the residual `verified_count - agreed_under_order_count`. If the dashboard wants to chart that residual explicitly, expose `disagreed_above_order_count` derived in the serializer's `to_representation` (no extra compute). +- **OccurrenceFilter-driven export.** Tracked separately in `docs/claude/planning/occurrence-filter-driven-exports.md` (TBD — subagent stub). diff --git a/docs/claude/planning/occurrence-filter-driven-exports.md b/docs/claude/planning/occurrence-filter-driven-exports.md new file mode 100644 index 000000000..adf2cee96 --- /dev/null +++ b/docs/claude/planning/occurrence-filter-driven-exports.md @@ -0,0 +1,116 @@ +# Filter-driven occurrence exports — scoping stub + +**Status:** scoping only. No API design, no task breakdown, no migrations. 
+**Goal:** let a user on `/occurrences/` apply filters in the UI, click "Export", +and get a job whose output matches exactly that filtered set — without first +having to materialize a `SourceImageCollection`. + +## 1. Current export architecture + +- Entry point: `ami/exports/views.py:30-87` `ExportViewSet.create()` — validates + format + filters, optionally looks up a `SourceImageCollection` from + `filters["collection_id"]`, creates `DataExport`, wires it to a `Job`, calls + `job.enqueue()`. +- Persistence: `ami/exports/models.py:23-35` — `DataExport` stores + `format`, `filters` (JSONB), `project`, `user`, `file_url`. +- Worker side: `ami/exports/base.py:17-28` — `BaseExporter.__init__` calls + `apply_filters(queryset, filters, filter_backends)` using + `get_filter_backends()` which today returns just + `[OccurrenceCollectionFilter]` (`base.py:42-45`). +- Filter replay: `ami/exports/utils.py:13-72` — `generate_fake_request()` + builds a DRF `Request` from a path + query-param dict, then + `apply_filters()` runs the backends against the synthetic request. +- Format-specific querysets: `ami/exports/format_types.py:46-63` (JSON) and + `212-234` (CSV) — both call `Occurrence.objects.valid().filter(project=...)` + and layer custom queryset annotations on top. + +So the export infra **already** has a "filters JSON → re-run backends in +worker" pattern. The catch: it's hard-wired to `OccurrenceCollectionFilter` +and never sees the rest of the `/occurrences/` filter stack. + +## 2. 
`/occurrences/` list filter stack + +`ami/main/api/views.py:1171-1209` registers: + +- `DefaultViewSetMixin.filter_backends` (DjangoFilter, ordering, search) +- `CustomOccurrenceDeterminationFilter` (`views.py:968-987`) — taxon + descendants +- `OccurrenceCollectionFilter` (`views.py:988-1006`) +- `OccurrenceAlgorithmFilter` (`views.py:1008-1030`) +- `OccurrenceDateFilter` (`views.py:1084-1102`) +- `OccurrenceVerified` (`views.py:1032-1049`) +- `OccurrenceVerifiedByMeFilter` (`views.py:1051-1066`) — **reads `request.user`** +- `OccurrenceTaxaListFilter` (`views.py:1105-1152`) + +Plus `filterset_fields = ["event", "deployment", "determination__rank", +"detections__source_image"]` (DjangoFilter), and the project-level default +filter chain via `qs.apply_default_filters(project, self.request)` +(`views.py:1232`) which layers score thresholds + include/exclude taxa +from `ami/main/models_future/filters.py`. + +## 3. The gap + +What an async export needs that a raw filter dict doesn't supply on its own: + +- **Pickleability.** Celery serializes args; the snapshot must be plain JSON + (already true for `DataExport.filters`). +- **User identity for user-scoped filters.** `verified_by_me` and + `apply_default_filters` both read `request.user` — `generate_fake_request` + currently builds an anonymous request, so these silently no-op or behave + differently than the user expected. +- **Drift between submit and run.** If a project's default-filter config, + taxa lists, or score thresholds change between job enqueue and worker + execution, the export may not match what the user previewed. +- **Pagination semantics don't transfer.** The user filtered to 12k rows; we + need to export all 12k, not a single page. Trivial today (no `limit`/ + `offset` in the JSON) but worth stating. +- **Ordering preservation.** `ordering=` may or may not matter for an export + consumer; needs a call. 
+- **Large result streaming.** Already partially handled by + `get_data_in_batches` (`utils.py:75-105`), but only after the filtered + queryset materializes — needs verification at the scale users will hit. + +## 4. Proposed approaches + +**A. Persist filter params as JSON, re-run pipeline in the worker.** Extend +`BaseExporter.get_filter_backends()` to return the full `/occurrences/` stack +and feed the JSON through `apply_filters()` as today. Also stash `user_id` +on `DataExport` (already present) and stitch it into the synthetic request +so user-scoped filters work. Lowest infra change; highest drift risk +(re-resolves against live project config at run time). + +**B. Materialize a transient `SourceImageCollection` from the filtered set.** +At submit time, resolve the filter to a list of `SourceImage` ids, create a +hidden collection, point the existing export job at it. Reuses every +existing code path. Heaviest write at submit (could be slow for 100k+ rows); +collection-as-snapshot semantics are misleading because collections are +SourceImage-rooted, not Occurrence-rooted. + +**C. New `ExportFilter` model snapshotting params + resolved querystring + +user + project-default-filter version.** Adds explicit provenance ("this +export reflects filters X under project config version Y"). Most fidelity, +most surface area; only worth it if (A) drift bites in practice. + +Rough ordering by effort: **A < B < C**. Rough ordering by drift safety: +**C > B > A**. + +## 5. Open questions + +- How should `apply_default_filters` be re-evaluated at worker time vs. + frozen at submit? (Today's behaviour is implicitly "re-evaluate.") +- For `verified_by_me`, do we trust `DataExport.user` as the identity, or + require the submit-time `request.user` to match? +- Should `ordering` be preserved, or is unordered export acceptable? +- What's the realistic upper bound on exported occurrences, and does + `get_data_in_batches` hold up there? 
+- Does the UI need a preview count before the job is enqueued? (Today + `update_record_count()` runs synchronously in the view — fine for small + filtered sets, awkward for huge ones.) +- Should the export job snapshot the project's default-filter config so + re-runs are reproducible? + +## 6. Out of scope for this doc + +- Concrete API design (request/response shapes, field names). +- Task breakdown / sequencing. +- Schema migrations. +- UI changes on `/occurrences/`. From 7b1660c3532fc0e02af5203e14a981ba3ece4fc0 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Thu, 14 May 2026 10:37:40 -0700 Subject: [PATCH 02/18] feat(occurrence-stats): aggregate human-model agreement over filtered queryset Pure aggregation; caller wires apply_default_filters + OccurrenceFilter. Annotates best machine prediction, prefetches non-withdrawn identifications, batches Taxon fetch for parents_json, buckets exact / under-order / above-order. Co-Authored-By: Claude --- ami/main/models_future/occurrence.py | 83 ++++++++++++++++++++++++++++ ami/main/tests.py | 67 ++++++++++++++++++++++ 2 files changed, 150 insertions(+) diff --git a/ami/main/models_future/occurrence.py b/ami/main/models_future/occurrence.py index 999fec65b..2b706e040 100644 --- a/ami/main/models_future/occurrence.py +++ b/ami/main/models_future/occurrence.py @@ -157,6 +157,89 @@ def detection_image_urls_from_prefetch(occurrence: Occurrence, limit: int | None return [get_media_url(det.path) for det in detections] +def human_model_agreement_for_project(queryset: QuerySet[Occurrence]) -> dict: + """Verified / agreement stats over a pre-filtered Occurrence queryset. + + The queryset MUST already be filtered to the project + user-supplied + filters (caller wires apply_default_filters + OccurrenceFilter). This + function adds the prefetches/annotations it needs and returns a dict + matching HumanModelAgreementSerializer's field set (without project_id — + the view layer adds that). 
+ + "Verified" means the occurrence has at least one non-withdrawn + Identification. "Model prediction" means the Classification chosen by + BEST_MACHINE_PREDICTION_ORDER. "Under-order" agreement means the user's + taxon and the model's prediction share an ancestor at rank >= ORDER + (inclusive of ORDER itself). + """ + from ami.main.models import Identification, Taxon + + qs = queryset.with_best_machine_prediction().prefetch_related( # type: ignore[attr-defined] + Prefetch( + "identifications", + queryset=Identification.objects.filter(withdrawn=False) + .select_related("taxon") + .order_by("-created_at", "-pk"), + to_attr="_non_withdrawn_idents", + ) + ) + + occurrences = list(qs) + + needed_taxa_ids: set[int] = set() + for occ in occurrences: + machine_id = getattr(occ, "best_machine_prediction_taxon_id", None) + if machine_id: + needed_taxa_ids.add(machine_id) + idents = getattr(occ, "_non_withdrawn_idents", []) + if idents: + needed_taxa_ids.add(idents[0].taxon_id) + + taxa_by_id: dict[int, TaxonTuple] = {} + for t in Taxon.objects.filter(pk__in=needed_taxa_ids): + parents = [{"id": p.id, "rank": p.rank.name if hasattr(p.rank, "name") else p.rank} for p in t.parents_json] + taxa_by_id[t.pk] = (t.pk, t.rank, parents) + + total = len(occurrences) + verified = 0 + agreed_exact = 0 + agreed_under_order = 0 + + for occ in occurrences: + idents = getattr(occ, "_non_withdrawn_idents", []) + if not idents: + continue + verified += 1 + user_taxon_id = idents[0].taxon_id + machine_taxon_id = getattr(occ, "best_machine_prediction_taxon_id", None) + if not machine_taxon_id or not user_taxon_id: + continue + if user_taxon_id == machine_taxon_id: + agreed_exact += 1 + agreed_under_order += 1 + continue + user_tuple = taxa_by_id.get(user_taxon_id) + machine_tuple = taxa_by_id.get(machine_taxon_id) + if not user_tuple or not machine_tuple: + continue + lca = lca_rank_between(user_tuple, machine_tuple) + if lca is not None and lca >= TaxonRank.ORDER: + agreed_under_order += 1 + 
+ def _pct(num: int, denom: int) -> float: + return round(num / denom, 4) if denom else 0.0 + + return { + "total_occurrences": total, + "verified_count": verified, + "verified_pct": _pct(verified, total), + "agreed_exact_count": agreed_exact, + "agreed_exact_pct": _pct(agreed_exact, verified), + "agreed_under_order_count": agreed_under_order, + "agreed_under_order_pct": _pct(agreed_under_order, verified), + } + + def top_identifiers_for_project(project: Project) -> QuerySet[User]: """Project users ranked by distinct occurrences they identified. diff --git a/ami/main/tests.py b/ami/main/tests.py index 7ca932b46..cf459fc8d 100644 --- a/ami/main/tests.py +++ b/ami/main/tests.py @@ -4772,6 +4772,73 @@ def test_no_shared_ancestor_returns_none(self): self.assertIsNone(rank) +class TestHumanModelAgreementForProject(APITestCase): + """Aggregation function over a filtered Occurrence queryset. + + Covers the four bucket transitions: unverified, verified+exact-agreed, + verified+under-order-agreed, verified+disagreed-above-order. + """ + + def setUp(self) -> None: + project, deployment = setup_test_project() + create_taxa(project=project) + create_captures(deployment=deployment) + # Add a sibling family + species under the same ORDER so we can exercise + # the "different family, same order → under-order" bucket. + lepidoptera = Taxon.objects.get(name="Lepidoptera", projects=project) + pieridae = Taxon.objects.create(name="Pieridae", parent=lepidoptera, rank=TaxonRank.FAMILY.name) + pieridae.projects.add(project) + pieris = Taxon.objects.create(name="Pieris brassicae", parent=pieridae, rank=TaxonRank.SPECIES.name) + pieris.projects.add(project) + # Use Vanessa atalanta as the baseline machine prediction so all + # occurrences start with a known classification. 
+ self.vanessa_atalanta = Taxon.objects.get(name="Vanessa atalanta", projects=project) + create_occurrences(deployment=deployment, num=4, taxon=self.vanessa_atalanta) + # Populate parents_json on every taxon — fixtures don't do this. + Taxon.objects.update_all_parents() + self.project = project + self.deployment = deployment + self.vanessa_cardui = Taxon.objects.get(name="Vanessa cardui", projects=project) + self.pieris_brassicae = pieris + self.user = User.objects.create_user(email="ider@insectai.org") # type: ignore + + def _identify(self, occurrence: Occurrence, taxon: Taxon) -> Identification: + return Identification.objects.create(user=self.user, occurrence=occurrence, taxon=taxon) + + def test_empty_project_returns_zeros_not_nans(self): + from ami.main.models_future.occurrence import human_model_agreement_for_project + + empty_project = Project.objects.create(name="empty") + result = human_model_agreement_for_project(Occurrence.objects.filter(project=empty_project)) + self.assertEqual(result["total_occurrences"], 0) + self.assertEqual(result["verified_count"], 0) + self.assertEqual(result["verified_pct"], 0.0) + self.assertEqual(result["agreed_exact_pct"], 0.0) + self.assertEqual(result["agreed_under_order_pct"], 0.0) + + def test_buckets_canonical_cases(self): + from ami.main.models_future.occurrence import human_model_agreement_for_project + + occurrences = list(Occurrence.objects.filter(project=self.project).order_by("pk")) + self.assertEqual(len(occurrences), 4) + # 0: verified, machine == user (exact agreement at SPECIES) + self._identify(occurrences[0], self.vanessa_atalanta) + # 1: verified, sister species (under-order at GENUS) + self._identify(occurrences[1], self.vanessa_cardui) + # 2: verified, different family same order (under-order at ORDER) + self._identify(occurrences[2], self.pieris_brassicae) + # 3: unverified + + result = human_model_agreement_for_project(Occurrence.objects.filter(project=self.project)) + 
self.assertEqual(result["total_occurrences"], 4) + self.assertEqual(result["verified_count"], 3) + self.assertEqual(result["agreed_exact_count"], 1) + self.assertEqual(result["agreed_under_order_count"], 3) + self.assertAlmostEqual(result["verified_pct"], 0.75) + self.assertAlmostEqual(result["agreed_exact_pct"], 1 / 3, places=3) + self.assertAlmostEqual(result["agreed_under_order_pct"], 1.0) + + class TestOccurrenceStatsViewSet(APITestCase): """Covers /api/v2/occurrences/stats/top-identifiers/. From 3110418a93a3b3c90c4a57c204623ab4bdc76a0b Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Thu, 14 May 2026 10:39:29 -0700 Subject: [PATCH 03/18] feat(occurrence-stats): wire human-model-agreement action MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds HumanModelAgreementSerializer and the human_model_agreement action on OccurrenceStatsViewSet. Extracts OccurrenceViewSet's filter backends + filterset_fields into a module-level tuple so OccurrenceStatsViewSet can reuse the same OccurrenceFilter pass-through (deployment, event, taxa lists, verified, score thresholds, apply_defaults=false, etc). The top_identifiers action keeps its current behavior — filter_queryset is only invoked by actions that opt in. Co-Authored-By: Claude --- ami/main/api/serializers.py | 18 ++++++++++ ami/main/api/views.py | 67 +++++++++++++++++++++++++++---------- 2 files changed, 68 insertions(+), 17 deletions(-) diff --git a/ami/main/api/serializers.py b/ami/main/api/serializers.py index 6d70906e1..8b7474db8 100644 --- a/ami/main/api/serializers.py +++ b/ami/main/api/serializers.py @@ -1751,3 +1751,21 @@ class TopIdentifiersResponseSerializer(serializers.Serializer): project_id = serializers.IntegerField() top_identifiers = UserIdentificationCountSerializer(many=True) + + +class HumanModelAgreementSerializer(serializers.Serializer): + """Verified / agreement rates over the filtered Occurrence set. 
+ + `agreed_exact_count` is a subset of `agreed_under_order_count` by + construction — an exact match implies an LCA at SPECIES, which is + deeper than ORDER. `*_pct` percentages are 0.0..1.0 (not 0..100). + """ + + project_id = serializers.IntegerField() + total_occurrences = serializers.IntegerField() + verified_count = serializers.IntegerField() + verified_pct = serializers.FloatField(help_text="verified_count / total_occurrences") + agreed_exact_count = serializers.IntegerField() + agreed_exact_pct = serializers.FloatField(help_text="agreed_exact_count / verified_count") + agreed_under_order_count = serializers.IntegerField() + agreed_under_order_pct = serializers.FloatField(help_text="agreed_under_order_count / verified_count") diff --git a/ami/main/api/views.py b/ami/main/api/views.py index 5d21b9b20..e003b8cc4 100644 --- a/ami/main/api/views.py +++ b/ami/main/api/views.py @@ -31,7 +31,7 @@ from ami.base.views import ProjectMixin from ami.main.api.schemas import limit_doc_param, project_id_doc_param from ami.main.api.serializers import TagSerializer -from ami.main.models_future.occurrence import top_identifiers_for_project +from ami.main.models_future.occurrence import human_model_agreement_for_project, top_identifiers_for_project from ami.utils.requests import get_default_classification_threshold from ami.utils.storages import ConnectionTestResult @@ -70,6 +70,7 @@ EventListSerializer, EventSerializer, EventTimelineSerializer, + HumanModelAgreementSerializer, IdentificationSerializer, OccurrenceListSerializer, OccurrenceSerializer, @@ -1202,6 +1203,24 @@ def filter_queryset(self, request, queryset, view): return queryset +OCCURRENCE_FILTER_BACKENDS = ( + CustomOccurrenceDeterminationFilter, + OccurrenceCollectionFilter, + OccurrenceAlgorithmFilter, + OccurrenceDateFilter, + OccurrenceVerified, + OccurrenceVerifiedByMeFilter, + OccurrenceTaxaListFilter, +) + +OCCURRENCE_FILTERSET_FIELDS = ( + "event", + "deployment", + "determination__rank", + 
"detections__source_image", +) + + class OccurrenceViewSet(DefaultViewSet, ProjectMixin): """ API endpoint that allows occurrences to be viewed or edited. @@ -1211,22 +1230,8 @@ class OccurrenceViewSet(DefaultViewSet, ProjectMixin): queryset = Occurrence.objects.all() serializer_class = OccurrenceSerializer - # filter_backends = [CustomDeterminationFilter, DjangoFilterBackend, NullsLastOrderingFilter, SearchFilter] - filter_backends = DefaultViewSetMixin.filter_backends + [ - CustomOccurrenceDeterminationFilter, - OccurrenceCollectionFilter, - OccurrenceAlgorithmFilter, - OccurrenceDateFilter, - OccurrenceVerified, - OccurrenceVerifiedByMeFilter, - OccurrenceTaxaListFilter, - ] - filterset_fields = [ - "event", - "deployment", - "determination__rank", - "detections__source_image", - ] + filter_backends = DefaultViewSetMixin.filter_backends + list(OCCURRENCE_FILTER_BACKENDS) + filterset_fields = list(OCCURRENCE_FILTERSET_FIELDS) ordering_fields = [ "created_at", "updated_at", @@ -1324,6 +1329,11 @@ class OccurrenceStatsViewSet(viewsets.GenericViewSet, ProjectMixin): permission_classes = [IsActiveStaffOrReadOnly] require_project = True + # Filter machinery for actions that opt into `self.filter_queryset(...)`. + # `top_identifiers` doesn't call it, so its behavior is unchanged. + queryset = Occurrence.objects.none() + filter_backends = list(OCCURRENCE_FILTER_BACKENDS) + filterset_fields = list(OCCURRENCE_FILTERSET_FIELDS) @extend_schema( parameters=[project_id_doc_param, limit_doc_param], @@ -1354,6 +1364,29 @@ def top_identifiers(self, request): ) return Response(serializer.data) + @extend_schema( + parameters=[project_id_doc_param], + responses=HumanModelAgreementSerializer, + ) + @action(detail=False, methods=["get"], url_path="human-model-agreement") + def human_model_agreement(self, request): + """Verified / human↔model agreement rates over the filtered occurrence set. + + Accepts every query param the `/occurrences/` list endpoint accepts. 
+ Reuses `apply_default_filters` so `apply_defaults=false` bypasses + project default taxa lists + score thresholds. + """ + project = self.get_active_project() + assert project is not None # require_project=True guarantees this + if not Project.objects.visible_for_user(request.user).filter(pk=project.pk).exists(): + raise NotFound("Project not found.") + + base_qs = Occurrence.objects.filter(project=project).valid().apply_default_filters(project, request) + filtered_qs = self.filter_queryset(base_qs) + payload = human_model_agreement_for_project(filtered_qs) + payload["project_id"] = project.pk + return Response(HumanModelAgreementSerializer(payload, context={"request": request}).data) + class TaxonTaxaListFilter(filters.BaseFilterBackend): """ From ba9c901c24c0c12e14d3224269442f3669f0a404 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Thu, 14 May 2026 10:41:48 -0700 Subject: [PATCH 04/18] test(occurrence-stats): HTTP coverage for human-model-agreement action Adds 6 HTTP-level tests: missing project_id 400, draft 404, empty zeros, happy-path exact match, deployment filter pass-through, apply_defaults=false score-threshold bypass. Also adds DjangoFilterBackend to OccurrenceStatsViewSet.filter_backends so filterset_fields (event, deployment, determination__rank, ...) actually take effect. Without DjangoFilterBackend, filterset_fields are silently ignored and ?deployment=N returns the unfiltered set. Co-Authored-By: Claude --- ami/main/api/views.py | 2 +- ami/main/tests.py | 66 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/ami/main/api/views.py b/ami/main/api/views.py index e003b8cc4..14735d909 100644 --- a/ami/main/api/views.py +++ b/ami/main/api/views.py @@ -1332,7 +1332,7 @@ class OccurrenceStatsViewSet(viewsets.GenericViewSet, ProjectMixin): # Filter machinery for actions that opt into `self.filter_queryset(...)`. # `top_identifiers` doesn't call it, so its behavior is unchanged. 
queryset = Occurrence.objects.none() - filter_backends = list(OCCURRENCE_FILTER_BACKENDS) + filter_backends = [DjangoFilterBackend, *OCCURRENCE_FILTER_BACKENDS] filterset_fields = list(OCCURRENCE_FILTERSET_FIELDS) @extend_schema( diff --git a/ami/main/tests.py b/ami/main/tests.py index cf459fc8d..18efd88c6 100644 --- a/ami/main/tests.py +++ b/ami/main/tests.py @@ -4922,6 +4922,72 @@ def test_registration_order_preserves_occurrence_retrieve(self): self.assertEqual(stats_response.status_code, 200, "stats URL must resolve") self.assertEqual(retrieve_response.status_code, 200, "occurrence retrieve must still work") + # ----- /occurrences/stats/human-model-agreement/ ----- + + agreement_url = "/api/v2/occurrences/stats/human-model-agreement/" + + def test_agreement_no_project_id_returns_400(self): + response = self.client.get(self.agreement_url) + self.assertEqual(response.status_code, 400) + + def test_agreement_draft_project_404_for_anon(self): + self.project.draft = True + self.project.save() + response = self.client.get(f"{self.agreement_url}?project_id={self.project.pk}") + self.assertEqual(response.status_code, 404) + + def test_agreement_empty_returns_zero_pcts(self): + response = self.client.get(f"{self.agreement_url}?project_id={self.project.pk}") + self.assertEqual(response.status_code, 200) + body = response.json() + self.assertEqual(body["project_id"], self.project.pk) + self.assertEqual(body["total_occurrences"], 4) + self.assertEqual(body["verified_count"], 0) + self.assertEqual(body["verified_pct"], 0.0) + self.assertEqual(body["agreed_exact_pct"], 0.0) + self.assertEqual(body["agreed_under_order_pct"], 0.0) + + def test_agreement_happy_path(self): + """One verified occurrence; user agrees with the machine prediction → exact match. + + The fixture creates a single classification per occurrence via + `create_occurrences()`, which uses a random taxon. We identify the + first occurrence with that same taxon to force an exact match. 
+ """ + occurrence = Occurrence.objects.filter(project=self.project).order_by("pk").first() + # The machine prediction is whatever `create_occurrences()` picked — match it. + machine_taxon = occurrence.detections.first().classifications.first().taxon + Taxon.objects.update_all_parents() + Identification.objects.create(user=self.alice, occurrence=occurrence, taxon=machine_taxon) + + response = self.client.get(f"{self.agreement_url}?project_id={self.project.pk}") + self.assertEqual(response.status_code, 200) + body = response.json() + self.assertEqual(body["total_occurrences"], 4) + self.assertEqual(body["verified_count"], 1) + self.assertEqual(body["agreed_exact_count"], 1) + self.assertEqual(body["agreed_under_order_count"], 1) + + def test_agreement_filter_passthrough(self): + """`?deployment=` should narrow the set.""" + other_deployment = Deployment.objects.create(name="other", project=self.project) + response = self.client.get( + f"{self.agreement_url}?project_id={self.project.pk}&deployment={other_deployment.pk}" + ) + self.assertEqual(response.status_code, 200) + self.assertEqual(response.json()["total_occurrences"], 0) + + def test_agreement_apply_defaults_false_bypasses_score_threshold(self): + """A score threshold filters out occurrences; apply_defaults=false restores them.""" + self.project.default_filters_score_threshold = 0.99 + self.project.save() + gated = self.client.get(f"{self.agreement_url}?project_id={self.project.pk}").json() + bypassed = self.client.get(f"{self.agreement_url}?project_id={self.project.pk}&apply_defaults=false").json() + self.assertGreaterEqual(bypassed["total_occurrences"], gated["total_occurrences"]) + # Sanity: with threshold=0.99 and fixture's score=0.9, gated should be 0. 
+ self.assertEqual(gated["total_occurrences"], 0) + self.assertEqual(bypassed["total_occurrences"], 4) + class TestTaxaVerification(APITestCase): """Per-taxon verification + human/model agreement annotations and the verified filter (#1316).""" From 5b1bde71cf2118b746aa58d9a7731efa33dcb67b Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Thu, 14 May 2026 10:43:06 -0700 Subject: [PATCH 05/18] feat(ui): useHumanModelAgreement hook for occurrence stats Mirrors useTopIdentifiers's useAuthorizedQuery pattern. Accepts an arbitrary filter map so the occurrence list page can thread its filter state through unchanged (deployment, event, taxon, score thresholds, apply_defaults). Co-Authored-By: Claude --- .../stats/useHumanModelAgreement.ts | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 ui/src/data-services/hooks/occurrences/stats/useHumanModelAgreement.ts diff --git a/ui/src/data-services/hooks/occurrences/stats/useHumanModelAgreement.ts b/ui/src/data-services/hooks/occurrences/stats/useHumanModelAgreement.ts new file mode 100644 index 000000000..aee2da546 --- /dev/null +++ b/ui/src/data-services/hooks/occurrences/stats/useHumanModelAgreement.ts @@ -0,0 +1,51 @@ +import { API_ROUTES, API_URL } from 'data-services/constants' +import { useAuthorizedQuery } from '../../auth/useAuthorizedQuery' + +interface Response { + project_id: number + total_occurrences: number + verified_count: number + verified_pct: number + agreed_exact_count: number + agreed_exact_pct: number + agreed_under_order_count: number + agreed_under_order_pct: number +} + +// Accepts an arbitrary filter map so the occurrence list page's filter state +// can be threaded through unchanged (deployment, event, taxon, score +// thresholds, apply_defaults, etc). 
+export const useHumanModelAgreement = ( + projectId?: string, + filters?: Record +) => { + const url = `${API_URL}/${API_ROUTES.OCCURRENCES}/stats/human-model-agreement/` + + const params = new URLSearchParams() + if (projectId) params.set('project_id', projectId) + if (filters) { + Object.entries(filters).forEach(([key, value]) => { + if (value !== undefined && value !== '' && value !== null) { + params.set(key, String(value)) + } + }) + } + + const { data, isLoading, isFetching, error } = useAuthorizedQuery({ + queryKey: [ + API_ROUTES.OCCURRENCES, + 'stats', + 'human-model-agreement', + projectId, + filters, + ], + url: `${url}?${params.toString()}`, + }) + + return { + data, + isLoading, + isFetching, + error, + } +} From e050a1f401abd0efc254bc8d2a916d0d71802dae Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Thu, 14 May 2026 14:22:59 -0700 Subject: [PATCH 06/18] =?UTF-8?q?docs(prompts):=20handoff=20for=20PR=20#13?= =?UTF-8?q?07=20rework=20=E2=80=94=20rename=20+=20SQL=20push-down=20+=20re?= =?UTF-8?q?view=20fixes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Captures: review findings from Copilot + CodeRabbit, perf bench evidence (43k rows → 159s timeout on apply_defaults=false), and the planned changes for the next session (rename to model-agreement, push aggregation into SQL/ORM, fix UNKNOWN rank LCA + denominator + verified_by_me anon gap + test gaps). 
Co-Authored-By: Claude --- docs/claude/prompts/NEXT_SESSION_PROMPT.md | 86 ++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 docs/claude/prompts/NEXT_SESSION_PROMPT.md diff --git a/docs/claude/prompts/NEXT_SESSION_PROMPT.md b/docs/claude/prompts/NEXT_SESSION_PROMPT.md new file mode 100644 index 000000000..9f3a54217 --- /dev/null +++ b/docs/claude/prompts/NEXT_SESSION_PROMPT.md @@ -0,0 +1,86 @@ +# Next session — PR #1307 rework + +**Branch:** `feat/human-model-agreement-endpoint` (worktree `occurrence-stats`) +**PR:** https://github.com/RolnickLab/antenna/pull/1307 +**Main stack override:** `/home/michael/Projects/AMI/antenna/docker-compose.override.yml` mounts this worktree's `ami/` + `config/` over the main `antenna` stack — `docker compose ps` already shows django/celeryworker recreated. Stack live; smoke against `http://localhost:8000/api/v2/occurrences/stats/human-model-agreement/?project_id=18` returns 200. + +## Tasks for this session + +### 1. Rename: drop "human" + +User wants `human-model-agreement` → `model-agreement`, `HumanModelAgreement*` → `ModelAgreement*`, `human_model_agreement_for_project` → `model_agreement_for_project`. Files to touch: + +- `ami/main/models_future/occurrence.py:160` — fn name + docstring +- `ami/main/api/serializers.py` — `HumanModelAgreementSerializer` class +- `ami/main/api/views.py:35-38` — import; line 94 import; viewset action name + url_path; serializer references at the action site +- `ami/main/tests.py` — `TestHumanModelAgreementForProject` class; `agreement_url`; all imports +- `ui/src/data-services/hooks/occurrences/stats/useHumanModelAgreement.ts` — rename file to `useModelAgreement.ts`; rename hook + `Response` interface to `ModelAgreementResponse` (Copilot review caught the DOM `Response` shadow) +- `docs/claude/planning/2026-05-14-human-model-agreement-endpoint.md` — leave the old plan doc as historical record; cite the new endpoint name where relevant +- PR title + body + +### 2. 
Push aggregation into SQL (Copilot + CodeRabbit both flagged) + +**Evidence:** Vermont (project 18) has 43,149 occurrences; `?apply_defaults=false` curl hit 159s and timed out at the curl layer. Current Python iteration over the full filtered queryset doesn't scale. + +**Proposed approach** (validate before coding): + +1. Annotate the queryset with subqueries to expose `best_machine_taxon_id` (already there via `with_best_machine_prediction()`) and `best_user_taxon_id` (new — subquery over `Identification` ordered by `BEST_IDENTIFICATION_ORDER`). +2. Compute totals with `aggregate()` using `Count('pk', filter=Q(...), distinct=True)`: + - `total_occurrences = Count('pk')` + - `verified_count = Count('pk', filter=Q(best_user_taxon_id__isnull=False))` (drop the verified-without-prediction trap from Copilot finding #3 — see fix below) + - `agreed_exact_count = Count('pk', filter=Q(best_user_taxon_id=F('best_machine_taxon_id')))` +3. For `agreed_under_order_count` — the hard part — try one of: + - **(a)** Annotate `best_user_taxon_order_id` and `best_machine_taxon_order_id` via Postgres `jsonb_path_query_first(parents_json, '$[*] ? (@.rank == "ORDER").id')` raw expressions. Two taxa agree under-order iff their order ids match AND neither is null. Add a row-level Python check only if the user's own rank is at-or-below ORDER (since user might ID at FAMILY directly with no ORDER ancestor in parents_json — but the taxon's own rank should be checked too). + - **(b)** Denormalize: add `order_taxon_id` column on Taxon, populate in `update_parents()`. Cleaner queries, needs migration + backfill. + - **(c)** Hybrid: keep Python LCA but batch via single annotated `values_list('pk', 'best_user_taxon_id', 'best_machine_taxon_id')` query plus one batched `Taxon` lookup. Avoids the `list(qs)` materialization but still does Python LCA. Faster than current; not as clean as (a) or (b). +4. Bench against project 18 unfiltered AND with `apply_defaults=false` before merging. 
Target: subsecond. + +Read Copilot's comment at `ami/main/models_future/occurrence.py:227` and CodeRabbit's at `:187` for their exact wording. + +### 3. Fix correctness bugs flagged in review + +**3a. `TaxonRank.UNKNOWN` bug** (Copilot, `:227`) +`UNKNOWN` is defined AFTER `SPECIES` in `ami/utils/schemas.py`, so `TaxonRank.UNKNOWN >= TaxonRank.ORDER` is `True` by definition order. If either chain contains an `UNKNOWN` ancestor that happens to be the deepest shared one, LCA wrongly counts as under-order. Filter `UNKNOWN` out of `lca_rank_between`'s candidate ranks. Add a unit test. + +**3b. Denominator bug** (Copilot, `:240`) +`agreed_exact_pct` / `agreed_under_order_pct` divide by `verified` but `verified` includes occurrences with **no** machine prediction — those can never agree, so they drag the pct down. Two options: +- Change the denominator to `verified AND has_machine_prediction` and call the field `verified_with_prediction_count` (clearer semantics). +- Keep `verified` as the denominator but add a separate `no_prediction_count` so the consumer can adjust. + +User probably prefers option 1 + surface the `no_prediction_count` as a sibling field. Check with them before coding. + +**3c. Drop wasted `select_related("taxon")` on idents prefetch** (Copilot, `:182`) — only `taxon_id` is read; the related Taxon row is re-fetched in the batch. + +**3d. `verified_by_me` anon access** (Copilot, `ami/main/api/views.py:1303`) +`OccurrenceVerifiedByMeFilter` is now wired into `OccurrenceStatsViewSet` via the shared `OCCURRENCE_FILTER_BACKENDS` tuple. With `IsActiveStaffOrReadOnly` allowing anon reads, an anon `?verified_by_me=true` reads `request.user` (AnonymousUser) — the filter currently guards on `is_authenticated` so it short-circuits, but consider gating the action explicitly or filtering the backend list for anon. Decide before merging. + +### 4. Test gaps to fill + +**4a. 
Under-order-but-not-exact HTTP coverage** (Copilot, `tests.py:4969`) +`test_agreement_happy_path` only hits the exact-match shortcut. Add a test that wires a sister-species identification (matches the T2 aggregation test's "bucket 1") and asserts `agreed_exact_count=0, agreed_under_order_count=1`. + +**4b. `UNKNOWN` rank regression test** — covered above in 3a. + +**4c. `no_prediction_count` test** — if you add that field per 3b, test it. + +### 5. Markdown lint nit (CodeRabbit, plan doc:43) + +Add `text` lang specifier to the fenced "File Structure" block. + +## After the rework + +1. Run full sweep: `docker compose -f docker-compose.ci.yml run --rm django python manage.py test ami.main.tests.TestOccurrenceStatsViewSet ami.main.tests.TestModelAgreementForProject ami.main.tests.TestLcaRankBetween ami.main.tests.TestOccurrenceListQueryCount -v 1 --keepdb` +2. Bench against project 18 — log curl `time_total` for the unfiltered + `apply_defaults=false` cases. Memory budget: should not materialize 43k rows. +3. Reply to each Copilot/CodeRabbit thread with `**Claude says:** Fixed in ...` per CLAUDE.md PR comment workflow. +4. Resolve threads via GraphQL once replied. +5. Push, let CI run, then ping user. + +## Files to grep first + +- Existing SQL-side patterns: `OccurrenceQuerySet.with_best_machine_prediction()` at `ami/main/models.py:2998`, `with_verification_info()` at `:3022`, `unique_taxa()` at `:3051`. These all use `Subquery(...)` annotations — same pattern to follow. +- `parents_json` jsonb queries: `Taxon.objects.filter(parents_json__contains=[{"id": ...}])` at `ami/main/models.py:3661, 3776, 3787` — that's the standard ORM idiom. For `jsonb_path_query_first` you'll need `RawSQL` or a custom `Func` subclass. +- Override file (already mounted): `/home/michael/Projects/AMI/antenna/docker-compose.override.yml` — leave as-is. 
+ +## Compaction note + +Current session committed 5 PR commits + the plan doc + side-research export stub at `docs/claude/planning/occurrence-filter-driven-exports.md`. PR #1307 open with CodeRabbit + Copilot reviews already on it. Memory file `MEMORY.md` should be updated to add a `project_pr_1307_human_model_agreement.md` entry summarizing state (TODO this session start). From f49c9ca08166f7cb41c1fc6aeda8fa8121897dd6 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Thu, 14 May 2026 23:41:17 -0700 Subject: [PATCH 07/18] refactor(occurrence-stats): rename to model-agreement + push aggregation to SQL Addresses review feedback on PR #1307: Rename (drop "human"): - URL: /occurrences/stats/human-model-agreement/ -> /model-agreement/ - Function: human_model_agreement_for_project -> model_agreement_for_project - Serializer: HumanModelAgreementSerializer -> ModelAgreementSerializer - Viewset action + url_path: human_model_agreement -> model_agreement - FE hook: useHumanModelAgreement -> useModelAgreement (file + symbol) - FE type: Response -> ModelAgreementResponse (fixes DOM Response shadow) - Test class: TestHumanModelAgreementForProject -> TestModelAgreementForProject SQL push-down (Copilot+CodeRabbit perf flag): - Replace list(qs) full-row materialization with annotated aggregate(). - Annotate best_user_taxon_id via Subquery over Identification (BEST_IDENTIFICATION_ORDER). Drop the prefetch + select_related("taxon") on identifications since only taxon_id is read. - aggregate() Count(filter=Q(...)) for total/verified/exact/no-prediction. - For under-order disagreement: group disagreement set by distinct (user_taxon, machine_taxon) pair before LCA. Each pair's LCA runs once. - Bench against project 18 (43,149 occurrences): pre-rework apply_defaults=false curl timed out at 159s; post-rework 1.96s unfiltered / 3.4s with bypass (93,019 occurrences post-filter). 
Denominator fix (Copilot): - agreed_*_pct now divides by verified_with_prediction_count instead of verified_count. A verified occurrence with no machine prediction can't agree or disagree; including it in the denominator drags the rate down without representing actual model disagreement. - Surface no_prediction_count + verified_with_prediction_count as sibling fields so consumers can see how many such occurrences exist. UNKNOWN rank bug (Copilot): - TaxonRank.UNKNOWN sorts after SPECIES in OrderedEnum definition order, so without explicit exclusion UNKNOWN >= ORDER is True and a shared UNKNOWN ancestor would wrongly count as under-order agreement. Filter UNKNOWN out of lca_rank_between's candidate ranks. Add regression test. Tests: - New: test_unknown_rank_excluded_from_lca (LCA regression) - New: test_agreement_under_order_bucket (HTTP coverage for sister-species case, previously only exact-match shortcut was exercised) - Updated: happy-path asserts verified_with_prediction_count and no_prediction_count. 
22/22 backend tests green: docker compose exec django python manage.py test ami.main.tests.TestLcaRankBetween ami.main.tests.TestModelAgreementForProject ami.main.tests.TestOccurrenceStatsViewSet Co-Authored-By: Claude --- ami/main/api/serializers.py | 23 +++- ami/main/api/views.py | 14 +- ami/main/models_future/occurrence.py | 125 ++++++++++-------- ami/main/tests.py | 75 ++++++++++- docs/claude/reference/api-stats-pattern.md | 2 +- ...ModelAgreement.ts => useModelAgreement.ts} | 29 ++-- 6 files changed, 182 insertions(+), 86 deletions(-) rename ui/src/data-services/hooks/occurrences/stats/{useHumanModelAgreement.ts => useModelAgreement.ts} (66%) diff --git a/ami/main/api/serializers.py b/ami/main/api/serializers.py index 8b7474db8..ab4a83d83 100644 --- a/ami/main/api/serializers.py +++ b/ami/main/api/serializers.py @@ -1753,19 +1753,34 @@ class TopIdentifiersResponseSerializer(serializers.Serializer): top_identifiers = UserIdentificationCountSerializer(many=True) -class HumanModelAgreementSerializer(serializers.Serializer): +class ModelAgreementSerializer(serializers.Serializer): """Verified / agreement rates over the filtered Occurrence set. `agreed_exact_count` is a subset of `agreed_under_order_count` by construction — an exact match implies an LCA at SPECIES, which is deeper than ORDER. `*_pct` percentages are 0.0..1.0 (not 0..100). + + Denominator note: `agreed_*_pct` divide by `verified_with_prediction_count` + (verified occurrences that *also* have a machine prediction), NOT by + `verified_count`. A verified occurrence with no machine prediction can't + agree or disagree — including it in the denominator would drag the rate + down without representing actual model disagreement. `no_prediction_count` + is surfaced so the consumer can see how many such occurrences exist. 
""" project_id = serializers.IntegerField() total_occurrences = serializers.IntegerField() - verified_count = serializers.IntegerField() + verified_count = serializers.IntegerField(help_text="Occurrences with at least one non-withdrawn identification.") verified_pct = serializers.FloatField(help_text="verified_count / total_occurrences") + verified_with_prediction_count = serializers.IntegerField( + help_text="Verified occurrences that also have a machine prediction (denominator for agreed_*_pct)." + ) + no_prediction_count = serializers.IntegerField( + help_text="Verified occurrences with no machine prediction (excluded from agreement denominator)." + ) agreed_exact_count = serializers.IntegerField() - agreed_exact_pct = serializers.FloatField(help_text="agreed_exact_count / verified_count") + agreed_exact_pct = serializers.FloatField(help_text="agreed_exact_count / verified_with_prediction_count") agreed_under_order_count = serializers.IntegerField() - agreed_under_order_pct = serializers.FloatField(help_text="agreed_under_order_count / verified_count") + agreed_under_order_pct = serializers.FloatField( + help_text="agreed_under_order_count / verified_with_prediction_count" + ) diff --git a/ami/main/api/views.py b/ami/main/api/views.py index 14735d909..e76c59578 100644 --- a/ami/main/api/views.py +++ b/ami/main/api/views.py @@ -31,7 +31,7 @@ from ami.base.views import ProjectMixin from ami.main.api.schemas import limit_doc_param, project_id_doc_param from ami.main.api.serializers import TagSerializer -from ami.main.models_future.occurrence import human_model_agreement_for_project, top_identifiers_for_project +from ami.main.models_future.occurrence import model_agreement_for_project, top_identifiers_for_project from ami.utils.requests import get_default_classification_threshold from ami.utils.storages import ConnectionTestResult @@ -70,8 +70,8 @@ EventListSerializer, EventSerializer, EventTimelineSerializer, - HumanModelAgreementSerializer, 
IdentificationSerializer, + ModelAgreementSerializer, OccurrenceListSerializer, OccurrenceSerializer, PageListSerializer, @@ -1366,10 +1366,10 @@ def top_identifiers(self, request): @extend_schema( parameters=[project_id_doc_param], - responses=HumanModelAgreementSerializer, + responses=ModelAgreementSerializer, ) - @action(detail=False, methods=["get"], url_path="human-model-agreement") - def human_model_agreement(self, request): + @action(detail=False, methods=["get"], url_path="model-agreement") + def model_agreement(self, request): """Verified / human↔model agreement rates over the filtered occurrence set. Accepts every query param the `/occurrences/` list endpoint accepts. @@ -1383,9 +1383,9 @@ def human_model_agreement(self, request): base_qs = Occurrence.objects.filter(project=project).valid().apply_default_filters(project, request) filtered_qs = self.filter_queryset(base_qs) - payload = human_model_agreement_for_project(filtered_qs) + payload = model_agreement_for_project(filtered_qs) payload["project_id"] = project.pk - return Response(HumanModelAgreementSerializer(payload, context={"request": request}).data) + return Response(ModelAgreementSerializer(payload, context={"request": request}).data) class TaxonTaxaListFilter(filters.BaseFilterBackend): diff --git a/ami/main/models_future/occurrence.py b/ami/main/models_future/occurrence.py index 2b706e040..d203daa82 100644 --- a/ami/main/models_future/occurrence.py +++ b/ami/main/models_future/occurrence.py @@ -12,7 +12,7 @@ from typing import TYPE_CHECKING -from django.db.models import Count, Prefetch, Q, QuerySet +from django.db.models import Count, F, OuterRef, Prefetch, Q, QuerySet, Subquery from ami.main.models import Project, TaxonRank, User @@ -30,14 +30,20 @@ def lca_rank_between(a: TaxonTuple, b: TaxonTuple) -> TaxonRank | None: The taxon itself counts as part of its own ancestor chain — passing the same taxon twice returns that taxon's rank. 
Returns ``None`` when the two - chains share no ancestor (e.g. one has an empty parents_json and the other - doesn't include it). + chains share no ancestor at a real taxonomic rank. + + ``TaxonRank.UNKNOWN`` is excluded from the candidate set even though it + sorts after SPECIES in OrderedEnum definition order — it isn't a real + taxonomic rank and treating it as deeper-than-ORDER produces false + under-order agreements when an UNKNOWN ancestor happens to be shared. """ chain_a = [(p["id"], TaxonRank(p["rank"])) for p in a[2]] + [(a[0], TaxonRank(a[1]))] chain_b_ids = {p["id"] for p in b[2]} | {b[0]} deepest: TaxonRank | None = None for tid, rank in chain_a: + if rank == TaxonRank.UNKNOWN: + continue if tid in chain_b_ids: if deepest is None or rank > deepest: deepest = rank @@ -157,74 +163,83 @@ def detection_image_urls_from_prefetch(occurrence: Occurrence, limit: int | None return [get_media_url(det.path) for det in detections] -def human_model_agreement_for_project(queryset: QuerySet[Occurrence]) -> dict: +def model_agreement_for_project(queryset: QuerySet[Occurrence]) -> dict: """Verified / agreement stats over a pre-filtered Occurrence queryset. The queryset MUST already be filtered to the project + user-supplied filters (caller wires apply_default_filters + OccurrenceFilter). This - function adds the prefetches/annotations it needs and returns a dict - matching HumanModelAgreementSerializer's field set (without project_id — - the view layer adds that). + function adds the annotations it needs and returns a dict matching + ModelAgreementSerializer's field set (without project_id — the view + layer adds that). "Verified" means the occurrence has at least one non-withdrawn Identification. "Model prediction" means the Classification chosen by BEST_MACHINE_PREDICTION_ORDER. "Under-order" agreement means the user's taxon and the model's prediction share an ancestor at rank >= ORDER (inclusive of ORDER itself). + + Aggregation is SQL-side. 
Only the disagreement set (occurrences where + user and machine disagree at SPECIES) is materialized in Python, and + even then it's deduplicated to distinct (user_taxon, machine_taxon) + pairs so LCA runs once per pair, not once per occurrence. """ - from ami.main.models import Identification, Taxon + from ami.main.models import BEST_IDENTIFICATION_ORDER, Identification, Taxon - qs = queryset.with_best_machine_prediction().prefetch_related( # type: ignore[attr-defined] - Prefetch( - "identifications", - queryset=Identification.objects.filter(withdrawn=False) - .select_related("taxon") - .order_by("-created_at", "-pk"), - to_attr="_non_withdrawn_idents", - ) + best_user_ident = Identification.objects.filter(occurrence=OuterRef("pk"), withdrawn=False).order_by( + *BEST_IDENTIFICATION_ORDER + ) + + qs = queryset.with_best_machine_prediction().annotate( # type: ignore[attr-defined] + best_user_taxon_id=Subquery(best_user_ident.values("taxon_id")[:1]), ) - occurrences = list(qs) + verified_q = Q(best_user_taxon_id__isnull=False) + has_pred_q = Q(best_machine_prediction_taxon_id__isnull=False) + exact_q = verified_q & has_pred_q & Q(best_user_taxon_id=F("best_machine_prediction_taxon_id")) - needed_taxa_ids: set[int] = set() - for occ in occurrences: - machine_id = getattr(occ, "best_machine_prediction_taxon_id", None) - if machine_id: - needed_taxa_ids.add(machine_id) - idents = getattr(occ, "_non_withdrawn_idents", []) - if idents: - needed_taxa_ids.add(idents[0].taxon_id) + aggregates = qs.aggregate( + total_occurrences=Count("pk"), + verified_count=Count("pk", filter=verified_q), + verified_with_prediction_count=Count("pk", filter=verified_q & has_pred_q), + no_prediction_count=Count("pk", filter=verified_q & ~has_pred_q), + agreed_exact_count=Count("pk", filter=exact_q), + ) + + # Under-order: only the disagreement set hits Python, grouped by distinct + # (user_taxon, machine_taxon) pair so each pair's LCA is computed once. 
+ disagreement_pairs = ( + qs.filter(verified_q & has_pred_q) + .exclude(best_user_taxon_id=F("best_machine_prediction_taxon_id")) + .values("best_user_taxon_id", "best_machine_prediction_taxon_id") + .annotate(occurrence_count=Count("pk")) + ) + + pairs = list(disagreement_pairs) + needed_taxa_ids = {p["best_user_taxon_id"] for p in pairs} | {p["best_machine_prediction_taxon_id"] for p in pairs} taxa_by_id: dict[int, TaxonTuple] = {} - for t in Taxon.objects.filter(pk__in=needed_taxa_ids): - parents = [{"id": p.id, "rank": p.rank.name if hasattr(p.rank, "name") else p.rank} for p in t.parents_json] - taxa_by_id[t.pk] = (t.pk, t.rank, parents) - - total = len(occurrences) - verified = 0 - agreed_exact = 0 - agreed_under_order = 0 - - for occ in occurrences: - idents = getattr(occ, "_non_withdrawn_idents", []) - if not idents: - continue - verified += 1 - user_taxon_id = idents[0].taxon_id - machine_taxon_id = getattr(occ, "best_machine_prediction_taxon_id", None) - if not machine_taxon_id or not user_taxon_id: + if needed_taxa_ids: + for t in Taxon.objects.filter(pk__in=needed_taxa_ids): + parents = [ + {"id": p.id, "rank": p.rank.name if hasattr(p.rank, "name") else p.rank} for p in t.parents_json + ] + taxa_by_id[t.pk] = (t.pk, t.rank, parents) + + under_order_disagreement_count = 0 + for pair in pairs: + u = taxa_by_id.get(pair["best_user_taxon_id"]) + m = taxa_by_id.get(pair["best_machine_prediction_taxon_id"]) + if not u or not m: continue - if user_taxon_id == machine_taxon_id: - agreed_exact += 1 - agreed_under_order += 1 - continue - user_tuple = taxa_by_id.get(user_taxon_id) - machine_tuple = taxa_by_id.get(machine_taxon_id) - if not user_tuple or not machine_tuple: - continue - lca = lca_rank_between(user_tuple, machine_tuple) + lca = lca_rank_between(u, m) if lca is not None and lca >= TaxonRank.ORDER: - agreed_under_order += 1 + under_order_disagreement_count += pair["occurrence_count"] + + agreed_exact = aggregates["agreed_exact_count"] + 
agreed_under_order = agreed_exact + under_order_disagreement_count + total = aggregates["total_occurrences"] + verified = aggregates["verified_count"] + verified_with_pred = aggregates["verified_with_prediction_count"] def _pct(num: int, denom: int) -> float: return round(num / denom, 4) if denom else 0.0 @@ -233,10 +248,12 @@ def _pct(num: int, denom: int) -> float: "total_occurrences": total, "verified_count": verified, "verified_pct": _pct(verified, total), + "verified_with_prediction_count": verified_with_pred, + "no_prediction_count": aggregates["no_prediction_count"], "agreed_exact_count": agreed_exact, - "agreed_exact_pct": _pct(agreed_exact, verified), + "agreed_exact_pct": _pct(agreed_exact, verified_with_pred), "agreed_under_order_count": agreed_under_order, - "agreed_under_order_pct": _pct(agreed_under_order, verified), + "agreed_under_order_pct": _pct(agreed_under_order, verified_with_pred), } diff --git a/ami/main/tests.py b/ami/main/tests.py index 18efd88c6..e1a38b61f 100644 --- a/ami/main/tests.py +++ b/ami/main/tests.py @@ -4771,8 +4771,36 @@ def test_no_shared_ancestor_returns_none(self): rank = lca_rank_between(rootless, self.SPECIES_NOCTUA_PRONUBA) self.assertIsNone(rank) + def test_unknown_rank_excluded_from_lca(self): + """TaxonRank.UNKNOWN sorts after SPECIES in OrderedEnum definition order, + so without explicit exclusion `UNKNOWN >= ORDER` would be True and a + shared UNKNOWN ancestor would wrongly count as under-order agreement. + """ + from ami.main.models_future.occurrence import lca_rank_between + + # Both chains share a KINGDOM ancestor and an UNKNOWN ancestor; the LCA + # at a real taxonomic rank is KINGDOM, not UNKNOWN. 
+ unknown_a = ( + 701, + "SPECIES", + [ + {"id": 1, "rank": "KINGDOM"}, + {"id": 999, "rank": "UNKNOWN"}, + ], + ) + unknown_b = ( + 702, + "SPECIES", + [ + {"id": 1, "rank": "KINGDOM"}, + {"id": 999, "rank": "UNKNOWN"}, + ], + ) + rank = lca_rank_between(unknown_a, unknown_b) + self.assertEqual(rank, TaxonRank.KINGDOM) -class TestHumanModelAgreementForProject(APITestCase): + +class TestModelAgreementForProject(APITestCase): """Aggregation function over a filtered Occurrence queryset. Covers the four bucket transitions: unverified, verified+exact-agreed, @@ -4806,10 +4834,10 @@ def _identify(self, occurrence: Occurrence, taxon: Taxon) -> Identification: return Identification.objects.create(user=self.user, occurrence=occurrence, taxon=taxon) def test_empty_project_returns_zeros_not_nans(self): - from ami.main.models_future.occurrence import human_model_agreement_for_project + from ami.main.models_future.occurrence import model_agreement_for_project empty_project = Project.objects.create(name="empty") - result = human_model_agreement_for_project(Occurrence.objects.filter(project=empty_project)) + result = model_agreement_for_project(Occurrence.objects.filter(project=empty_project)) self.assertEqual(result["total_occurrences"], 0) self.assertEqual(result["verified_count"], 0) self.assertEqual(result["verified_pct"], 0.0) @@ -4817,7 +4845,7 @@ def test_empty_project_returns_zeros_not_nans(self): self.assertEqual(result["agreed_under_order_pct"], 0.0) def test_buckets_canonical_cases(self): - from ami.main.models_future.occurrence import human_model_agreement_for_project + from ami.main.models_future.occurrence import model_agreement_for_project occurrences = list(Occurrence.objects.filter(project=self.project).order_by("pk")) self.assertEqual(len(occurrences), 4) @@ -4829,7 +4857,7 @@ def test_buckets_canonical_cases(self): self._identify(occurrences[2], self.pieris_brassicae) # 3: unverified - result = 
human_model_agreement_for_project(Occurrence.objects.filter(project=self.project)) + result = model_agreement_for_project(Occurrence.objects.filter(project=self.project)) self.assertEqual(result["total_occurrences"], 4) self.assertEqual(result["verified_count"], 3) self.assertEqual(result["agreed_exact_count"], 1) @@ -4922,9 +4950,9 @@ def test_registration_order_preserves_occurrence_retrieve(self): self.assertEqual(stats_response.status_code, 200, "stats URL must resolve") self.assertEqual(retrieve_response.status_code, 200, "occurrence retrieve must still work") - # ----- /occurrences/stats/human-model-agreement/ ----- + # ----- /occurrences/stats/model-agreement/ ----- - agreement_url = "/api/v2/occurrences/stats/human-model-agreement/" + agreement_url = "/api/v2/occurrences/stats/model-agreement/" def test_agreement_no_project_id_returns_400(self): response = self.client.get(self.agreement_url) @@ -4965,9 +4993,42 @@ def test_agreement_happy_path(self): body = response.json() self.assertEqual(body["total_occurrences"], 4) self.assertEqual(body["verified_count"], 1) + self.assertEqual(body["verified_with_prediction_count"], 1) + self.assertEqual(body["no_prediction_count"], 0) self.assertEqual(body["agreed_exact_count"], 1) self.assertEqual(body["agreed_under_order_count"], 1) + def test_agreement_under_order_bucket(self): + """Disagreement at species but same genus → counted under-order, not exact. + + Pick the machine prediction's sister species (same parent genus) for the + identification. LCA between the two species is GENUS, which is >= ORDER, + so the occurrence falls into the under-order bucket without contributing + to agreed_exact_count. + """ + occurrence = Occurrence.objects.filter(project=self.project).order_by("pk").first() + machine_taxon = occurrence.detections.first().classifications.first().taxon + # Sister species: same parent (genus Vanessa), different SPECIES. 
+ sister = ( + Taxon.objects.filter(parent=machine_taxon.parent, rank=TaxonRank.SPECIES.name) + .exclude(pk=machine_taxon.pk) + .first() + ) + self.assertIsNotNone(sister, "Test fixture must have a sister species under the same genus") + Taxon.objects.update_all_parents() + Identification.objects.create(user=self.alice, occurrence=occurrence, taxon=sister) + + response = self.client.get(f"{self.agreement_url}?project_id={self.project.pk}") + self.assertEqual(response.status_code, 200) + body = response.json() + self.assertEqual(body["verified_count"], 1) + self.assertEqual(body["verified_with_prediction_count"], 1) + self.assertEqual(body["agreed_exact_count"], 0) + self.assertEqual(body["agreed_under_order_count"], 1) + # 0/1 exact, 1/1 under-order + self.assertEqual(body["agreed_exact_pct"], 0.0) + self.assertEqual(body["agreed_under_order_pct"], 1.0) + def test_agreement_filter_passthrough(self): """`?deployment=` should narrow the set.""" other_deployment = Deployment.objects.create(name="other", project=self.project) diff --git a/docs/claude/reference/api-stats-pattern.md b/docs/claude/reference/api-stats-pattern.md index 0a2cd20d2..f802d6ad8 100644 --- a/docs/claude/reference/api-stats-pattern.md +++ b/docs/claude/reference/api-stats-pattern.md @@ -232,7 +232,7 @@ into pagination only if the kind genuinely needs it): - `GET /occurrences/stats/top-identifiers/` — done (this PR) - `GET /occurrences/stats/identifications-summary/` — total / distinct / verified counts -- `GET /occurrences/stats/human-model-agreement/` — model agreement rate +- `GET /occurrences/stats/model-agreement/` — model agreement rate - `GET /occurrences/stats/identifications-by-species/` — per-taxon ID counts - `GET /occurrences/stats/timeline/` — Plotly-shaped time series - `GET /deployments/stats/processed-images/` — processed images per station diff --git a/ui/src/data-services/hooks/occurrences/stats/useHumanModelAgreement.ts 
b/ui/src/data-services/hooks/occurrences/stats/useModelAgreement.ts similarity index 66% rename from ui/src/data-services/hooks/occurrences/stats/useHumanModelAgreement.ts rename to ui/src/data-services/hooks/occurrences/stats/useModelAgreement.ts index aee2da546..bdc3b23e5 100644 --- a/ui/src/data-services/hooks/occurrences/stats/useHumanModelAgreement.ts +++ b/ui/src/data-services/hooks/occurrences/stats/useModelAgreement.ts @@ -1,11 +1,13 @@ import { API_ROUTES, API_URL } from 'data-services/constants' import { useAuthorizedQuery } from '../../auth/useAuthorizedQuery' -interface Response { +interface ModelAgreementResponse { project_id: number total_occurrences: number verified_count: number verified_pct: number + verified_with_prediction_count: number + no_prediction_count: number agreed_exact_count: number agreed_exact_pct: number agreed_under_order_count: number @@ -15,11 +17,11 @@ interface Response { // Accepts an arbitrary filter map so the occurrence list page's filter state // can be threaded through unchanged (deployment, event, taxon, score // thresholds, apply_defaults, etc). 
-export const useHumanModelAgreement = ( +export const useModelAgreement = ( projectId?: string, filters?: Record ) => { - const url = `${API_URL}/${API_ROUTES.OCCURRENCES}/stats/human-model-agreement/` + const url = `${API_URL}/${API_ROUTES.OCCURRENCES}/stats/model-agreement/` const params = new URLSearchParams() if (projectId) params.set('project_id', projectId) @@ -31,16 +33,17 @@ export const useHumanModelAgreement = ( }) } - const { data, isLoading, isFetching, error } = useAuthorizedQuery({ - queryKey: [ - API_ROUTES.OCCURRENCES, - 'stats', - 'human-model-agreement', - projectId, - filters, - ], - url: `${url}?${params.toString()}`, - }) + const { data, isLoading, isFetching, error } = + useAuthorizedQuery({ + queryKey: [ + API_ROUTES.OCCURRENCES, + 'stats', + 'model-agreement', + projectId, + filters, + ], + url: `${url}?${params.toString()}`, + }) return { data, From da2a2328e6a7c80b0747c99433ed4c9b748d44a9 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Thu, 14 May 2026 23:41:59 -0700 Subject: [PATCH 08/18] docs(plan): add text lang to fenced block (markdownlint MD040) Co-Authored-By: Claude --- .../planning/2026-05-14-human-model-agreement-endpoint.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/claude/planning/2026-05-14-human-model-agreement-endpoint.md b/docs/claude/planning/2026-05-14-human-model-agreement-endpoint.md index 09bd56ae9..5ad323ba2 100644 --- a/docs/claude/planning/2026-05-14-human-model-agreement-endpoint.md +++ b/docs/claude/planning/2026-05-14-human-model-agreement-endpoint.md @@ -23,7 +23,7 @@ ## File Structure -``` +```text ami/ main/ models_future/ From 7ba8689b3b46f108bb305e2f873ad5f486a369ff Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Fri, 15 May 2026 00:08:01 -0700 Subject: [PATCH 09/18] perf(occurrence-stats): scope agreement subqueries to verified set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the .aggregate() over the full filtered 
queryset with a two-step approach: 1. SQL Count('pk') for total_occurrences (no joins, no subqueries). 2. Fetch the verified set (occurrences with at least one non-withdrawn ident) with both best_user_taxon_id and best_machine_prediction_taxon_id annotated, then bucket counts + LCA in Python. Why: the previous version evaluated two correlated subqueries (best user identification + best machine prediction) on every row of the filtered queryset. For typical projects, >95% of occurrences have no identification — those rows ran the user-ident subquery only to discover NULL, then ran the (much more expensive) machine-prediction subquery on detections that won't contribute to any agreement bucket. Scoping the subqueries to the verified set avoids that waste. Bench (cold, cache invalidated): Project Total Verified Pre Post P#85 SEC-SEQ 36,253 13,140 — 1.18s P#20 BCI 40,958 1,351 — 0.92s P#84 Pennsylvania 18,407 251 — 0.56s P#24 Atlantic Forestry 2,797 274 — 0.50s P#18 Vermont 43,149 45 ~928ms 0.35s P#23 Insectarium Montreal 20,393 74 — 0.43s Warm via django-cachalot: 122–343ms across all projects. For P#85 (highest absolute identification count in the system), the cost is dominated by apply_default_filters' score-threshold join, not the subqueries. apply_defaults=false actually runs faster (0.69s cold, 179,466 total / 13,140 verified) because the classification join is skipped. 
Co-Authored-By: Claude --- ami/main/models_future/occurrence.py | 87 ++++++++++++++++------------ 1 file changed, 51 insertions(+), 36 deletions(-) diff --git a/ami/main/models_future/occurrence.py b/ami/main/models_future/occurrence.py index d203daa82..edb951325 100644 --- a/ami/main/models_future/occurrence.py +++ b/ami/main/models_future/occurrence.py @@ -12,7 +12,7 @@ from typing import TYPE_CHECKING -from django.db.models import Count, F, OuterRef, Prefetch, Q, QuerySet, Subquery +from django.db.models import Count, OuterRef, Prefetch, Q, QuerySet, Subquery from ami.main.models import Project, TaxonRank, User @@ -178,44 +178,63 @@ def model_agreement_for_project(queryset: QuerySet[Occurrence]) -> dict: taxon and the model's prediction share an ancestor at rank >= ORDER (inclusive of ORDER itself). - Aggregation is SQL-side. Only the disagreement set (occurrences where - user and machine disagree at SPECIES) is materialized in Python, and - even then it's deduplicated to distinct (user_taxon, machine_taxon) - pairs so LCA runs once per pair, not once per occurrence. + Performance: the heavy work — correlated subqueries over Identification + and Classification — is scoped to the verified set, which is typically + a tiny fraction of total occurrences. Computing those subqueries over + the full filtered queryset would do 99% wasted work picking the "best + user identification" for occurrences that have none. + + Step 1: total_occurrences = SQL Count(*). + Step 2: Fetch the verified set with (pk, best_user_taxon_id, + best_machine_prediction_taxon_id). Both correlated subqueries + evaluate only on verified rows. + Step 3: Bucket counts in Python (set is small). + Step 4: Dedupe disagreement to distinct (user, machine) pairs and run + one LCA per pair. + + Bench against project 18 (43,149 occurrences, 45 verified): ~80ms cold. 
""" + import collections + from ami.main.models import BEST_IDENTIFICATION_ORDER, Identification, Taxon + total = queryset.count() + best_user_ident = Identification.objects.filter(occurrence=OuterRef("pk"), withdrawn=False).order_by( *BEST_IDENTIFICATION_ORDER ) - qs = queryset.with_best_machine_prediction().annotate( # type: ignore[attr-defined] - best_user_taxon_id=Subquery(best_user_ident.values("taxon_id")[:1]), + verified_rows = list( + queryset.filter(identifications__withdrawn=False) + .distinct() + .with_best_machine_prediction() # type: ignore[attr-defined] + .annotate(best_user_taxon_id=Subquery(best_user_ident.values("taxon_id")[:1])) + .values("pk", "best_machine_prediction_taxon_id", "best_user_taxon_id") ) - verified_q = Q(best_user_taxon_id__isnull=False) - has_pred_q = Q(best_machine_prediction_taxon_id__isnull=False) - exact_q = verified_q & has_pred_q & Q(best_user_taxon_id=F("best_machine_prediction_taxon_id")) - - aggregates = qs.aggregate( - total_occurrences=Count("pk"), - verified_count=Count("pk", filter=verified_q), - verified_with_prediction_count=Count("pk", filter=verified_q & has_pred_q), - no_prediction_count=Count("pk", filter=verified_q & ~has_pred_q), - agreed_exact_count=Count("pk", filter=exact_q), + verified = len(verified_rows) + no_prediction = sum(1 for r in verified_rows if r["best_machine_prediction_taxon_id"] is None) + verified_with_pred = verified - no_prediction + agreed_exact = sum( + 1 + for r in verified_rows + if r["best_machine_prediction_taxon_id"] is not None + and r["best_user_taxon_id"] == r["best_machine_prediction_taxon_id"] ) - # Under-order: only the disagreement set hits Python, grouped by distinct - # (user_taxon, machine_taxon) pair so each pair's LCA is computed once. 
- disagreement_pairs = ( - qs.filter(verified_q & has_pred_q) - .exclude(best_user_taxon_id=F("best_machine_prediction_taxon_id")) - .values("best_user_taxon_id", "best_machine_prediction_taxon_id") - .annotate(occurrence_count=Count("pk")) - ) + # Dedupe disagreement pairs so each (user_taxon, machine_taxon) LCA runs once. + pair_counts: collections.Counter = collections.Counter() + for r in verified_rows: + m_id = r["best_machine_prediction_taxon_id"] + u_id = r["best_user_taxon_id"] + if m_id is None or u_id is None or u_id == m_id: + continue + pair_counts[(u_id, m_id)] += 1 - pairs = list(disagreement_pairs) - needed_taxa_ids = {p["best_user_taxon_id"] for p in pairs} | {p["best_machine_prediction_taxon_id"] for p in pairs} + needed_taxa_ids: set[int] = set() + for u_id, m_id in pair_counts: + needed_taxa_ids.add(u_id) + needed_taxa_ids.add(m_id) taxa_by_id: dict[int, TaxonTuple] = {} if needed_taxa_ids: @@ -226,20 +245,16 @@ def model_agreement_for_project(queryset: QuerySet[Occurrence]) -> dict: taxa_by_id[t.pk] = (t.pk, t.rank, parents) under_order_disagreement_count = 0 - for pair in pairs: - u = taxa_by_id.get(pair["best_user_taxon_id"]) - m = taxa_by_id.get(pair["best_machine_prediction_taxon_id"]) + for (u_id, m_id), count in pair_counts.items(): + u = taxa_by_id.get(u_id) + m = taxa_by_id.get(m_id) if not u or not m: continue lca = lca_rank_between(u, m) if lca is not None and lca >= TaxonRank.ORDER: - under_order_disagreement_count += pair["occurrence_count"] + under_order_disagreement_count += count - agreed_exact = aggregates["agreed_exact_count"] agreed_under_order = agreed_exact + under_order_disagreement_count - total = aggregates["total_occurrences"] - verified = aggregates["verified_count"] - verified_with_pred = aggregates["verified_with_prediction_count"] def _pct(num: int, denom: int) -> float: return round(num / denom, 4) if denom else 0.0 @@ -249,7 +264,7 @@ def _pct(num: int, denom: int) -> float: "verified_count": verified, 
"verified_pct": _pct(verified, total), "verified_with_prediction_count": verified_with_pred, - "no_prediction_count": aggregates["no_prediction_count"], + "no_prediction_count": no_prediction, "agreed_exact_count": agreed_exact, "agreed_exact_pct": _pct(agreed_exact, verified_with_pred), "agreed_under_order_count": agreed_under_order, From 6ad1885efe91da7fb9dcfd42d385beb3b9d980cf Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Wed, 20 May 2026 17:51:56 -0700 Subject: [PATCH 10/18] feat(occurrence-stats): drop ORDER threshold; add coarsest_rank query param MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces hardcoded `lca >= TaxonRank.ORDER` agreement gate with two layers: - Always returned: `agreed_any_rank_*` — exact matches plus any non-null LCA at a real rank (UNKNOWN excluded). The upstream filter (e.g. a Lepidoptera include list) is what bounds the meaningful scope, not a hardcoded threshold in this function. - Optional `?agreement_coarsest_rank=FAMILY`: when supplied, response also includes `agreed_coarser_rank_*` (exact + LCAs at or below the threshold). The applied rank is echoed in `agreement_coarsest_rank`; null when absent. Also addresses CodeRabbit feedback on the existing branch: - Dedupe base queryset before counting (joins from default-filter chain can inflate Occurrence rows). - Bound `*_pct` FloatFields to [0.0, 1.0] in the serializer. Param validation: invalid rank → 400; UNKNOWN rejected as not meaningful. Tests cover any-rank fallback, threshold filtering, invalid + UNKNOWN rejection, and threshold echo. 
Co-Authored-By: Claude --- ami/main/api/serializers.py | 55 ++++++++++++++--- ami/main/api/views.py | 22 ++++++- ami/main/models_future/occurrence.py | 47 +++++++++++---- ami/main/tests.py | 89 ++++++++++++++++++++++------ 4 files changed, 176 insertions(+), 37 deletions(-) diff --git a/ami/main/api/serializers.py b/ami/main/api/serializers.py index ab4a83d83..ee9a5121d 100644 --- a/ami/main/api/serializers.py +++ b/ami/main/api/serializers.py @@ -1756,9 +1756,9 @@ class TopIdentifiersResponseSerializer(serializers.Serializer): class ModelAgreementSerializer(serializers.Serializer): """Verified / agreement rates over the filtered Occurrence set. - `agreed_exact_count` is a subset of `agreed_under_order_count` by - construction — an exact match implies an LCA at SPECIES, which is - deeper than ORDER. `*_pct` percentages are 0.0..1.0 (not 0..100). + `agreed_exact_count` is a subset of `agreed_any_rank_count` by + construction — an exact match implies the LCA is the taxon itself. + `*_pct` percentages are 0.0..1.0 (not 0..100). Denominator note: `agreed_*_pct` divide by `verified_with_prediction_count` (verified occurrences that *also* have a machine prediction), NOT by @@ -1766,12 +1766,23 @@ class ModelAgreementSerializer(serializers.Serializer): agree or disagree — including it in the denominator would drag the rate down without representing actual model disagreement. `no_prediction_count` is surfaced so the consumer can see how many such occurrences exist. + + Optional rank threshold: when the caller passes + `?agreement_coarsest_rank=FAMILY`, the response also includes + `agreed_coarser_rank_*` counting only LCAs at that rank or deeper. The + threshold rank is echoed in `agreement_coarsest_rank`. When the param is + absent, the coarser-rank fields are null and `agreement_coarsest_rank` + is null. 
""" project_id = serializers.IntegerField() total_occurrences = serializers.IntegerField() verified_count = serializers.IntegerField(help_text="Occurrences with at least one non-withdrawn identification.") - verified_pct = serializers.FloatField(help_text="verified_count / total_occurrences") + verified_pct = serializers.FloatField( + min_value=0.0, + max_value=1.0, + help_text="verified_count / total_occurrences", + ) verified_with_prediction_count = serializers.IntegerField( help_text="Verified occurrences that also have a machine prediction (denominator for agreed_*_pct)." ) @@ -1779,8 +1790,36 @@ class ModelAgreementSerializer(serializers.Serializer): help_text="Verified occurrences with no machine prediction (excluded from agreement denominator)." ) agreed_exact_count = serializers.IntegerField() - agreed_exact_pct = serializers.FloatField(help_text="agreed_exact_count / verified_with_prediction_count") - agreed_under_order_count = serializers.IntegerField() - agreed_under_order_pct = serializers.FloatField( - help_text="agreed_under_order_count / verified_with_prediction_count" + agreed_exact_pct = serializers.FloatField( + min_value=0.0, + max_value=1.0, + help_text="agreed_exact_count / verified_with_prediction_count", + ) + agreed_any_rank_count = serializers.IntegerField( + help_text="Exact matches plus disagreements whose LCA is at any real rank (UNKNOWN excluded)." + ) + agreed_any_rank_pct = serializers.FloatField( + min_value=0.0, + max_value=1.0, + help_text="agreed_any_rank_count / verified_with_prediction_count", + ) + agreement_coarsest_rank = serializers.CharField( + allow_null=True, + required=False, + help_text="Threshold rank from ?agreement_coarsest_rank query param. Null when the param is absent.", + ) + agreed_coarser_rank_count = serializers.IntegerField( + allow_null=True, + required=False, + help_text=( + "Exact matches plus disagreements whose LCA is at `agreement_coarsest_rank` or deeper. " + "Null when no threshold was supplied." 
+ ), + ) + agreed_coarser_rank_pct = serializers.FloatField( + min_value=0.0, + max_value=1.0, + allow_null=True, + required=False, + help_text="agreed_coarser_rank_count / verified_with_prediction_count. Null when no threshold supplied.", ) diff --git a/ami/main/api/views.py b/ami/main/api/views.py index e76c59578..b27cf48c1 100644 --- a/ami/main/api/views.py +++ b/ami/main/api/views.py @@ -55,6 +55,7 @@ Tag, TaxaList, Taxon, + TaxonRank, User, update_detection_counts, ) @@ -1375,15 +1376,34 @@ def model_agreement(self, request): Accepts every query param the `/occurrences/` list endpoint accepts. Reuses `apply_default_filters` so `apply_defaults=false` bypasses project default taxa lists + score thresholds. + + Optional ?agreement_coarsest_rank= adds `agreed_coarser_rank_*` + counts — LCAs at the given rank or deeper. Valid values: any + TaxonRank name (FAMILY, GENUS, etc.); invalid → 400. """ project = self.get_active_project() assert project is not None # require_project=True guarantees this if not Project.objects.visible_for_user(request.user).filter(pk=project.pk).exists(): raise NotFound("Project not found.") + coarsest_rank_param = request.query_params.get("agreement_coarsest_rank") + coarsest_rank = None + if coarsest_rank_param: + try: + coarsest_rank = TaxonRank[coarsest_rank_param.upper()] + except KeyError: + valid = ", ".join(r.name for r in TaxonRank if r.name != "UNKNOWN") + raise api_exceptions.ValidationError( + {"agreement_coarsest_rank": f"Invalid rank '{coarsest_rank_param}'. 
Must be one of: {valid}."} + ) + if coarsest_rank == TaxonRank.UNKNOWN: + raise api_exceptions.ValidationError( + {"agreement_coarsest_rank": "UNKNOWN is not a valid threshold rank."} + ) + base_qs = Occurrence.objects.filter(project=project).valid().apply_default_filters(project, request) filtered_qs = self.filter_queryset(base_qs) - payload = model_agreement_for_project(filtered_qs) + payload = model_agreement_for_project(filtered_qs, coarsest_rank=coarsest_rank) payload["project_id"] = project.pk return Response(ModelAgreementSerializer(payload, context={"request": request}).data) diff --git a/ami/main/models_future/occurrence.py b/ami/main/models_future/occurrence.py index edb951325..cdce45cb8 100644 --- a/ami/main/models_future/occurrence.py +++ b/ami/main/models_future/occurrence.py @@ -163,7 +163,10 @@ def detection_image_urls_from_prefetch(occurrence: Occurrence, limit: int | None return [get_media_url(det.path) for det in detections] -def model_agreement_for_project(queryset: QuerySet[Occurrence]) -> dict: +def model_agreement_for_project( + queryset: QuerySet[Occurrence], + coarsest_rank: TaxonRank | None = None, +) -> dict: """Verified / agreement stats over a pre-filtered Occurrence queryset. The queryset MUST already be filtered to the project + user-supplied @@ -174,9 +177,16 @@ def model_agreement_for_project(queryset: QuerySet[Occurrence]) -> dict: "Verified" means the occurrence has at least one non-withdrawn Identification. "Model prediction" means the Classification chosen by - BEST_MACHINE_PREDICTION_ORDER. "Under-order" agreement means the user's - taxon and the model's prediction share an ancestor at rank >= ORDER - (inclusive of ORDER itself). + BEST_MACHINE_PREDICTION_ORDER. "Any-rank" agreement means the user's + taxon and the model's prediction share an ancestor at any real rank + (UNKNOWN excluded) — exact matches included. The upstream filter (e.g. 
+ a Lepidoptera include list) is what bounds the meaningful scope, not + a hardcoded rank threshold in this function. + + When ``coarsest_rank`` is supplied, additionally compute "coarser-rank" + agreement: the LCA must be at ``coarsest_rank`` or deeper (e.g. passing + FAMILY only counts LCAs at FAMILY, GENUS, or SPECIES). Exact matches + always count regardless of rank. Performance: the heavy work — correlated subqueries over Identification and Classification — is scoped to the verified set, which is typically @@ -198,6 +208,10 @@ def model_agreement_for_project(queryset: QuerySet[Occurrence]) -> dict: from ami.main.models import BEST_IDENTIFICATION_ORDER, Identification, Taxon + # Default filters can join Identification (verified_by_me) and Taxon + # parents_json (taxa_list_id) which inflates row count if not deduped. + # Dedupe up front so total + verified counts share one canonical set. + queryset = queryset.distinct() total = queryset.count() best_user_ident = Identification.objects.filter(occurrence=OuterRef("pk"), withdrawn=False).order_by( @@ -244,22 +258,27 @@ def model_agreement_for_project(queryset: QuerySet[Occurrence]) -> dict: ] taxa_by_id[t.pk] = (t.pk, t.rank, parents) - under_order_disagreement_count = 0 + any_rank_disagreement_count = 0 + coarser_rank_disagreement_count = 0 for (u_id, m_id), count in pair_counts.items(): u = taxa_by_id.get(u_id) m = taxa_by_id.get(m_id) if not u or not m: continue lca = lca_rank_between(u, m) - if lca is not None and lca >= TaxonRank.ORDER: - under_order_disagreement_count += count + if lca is None: + continue + any_rank_disagreement_count += count + if coarsest_rank is not None and lca >= coarsest_rank: + coarser_rank_disagreement_count += count - agreed_under_order = agreed_exact + under_order_disagreement_count + agreed_any_rank = agreed_exact + any_rank_disagreement_count + agreed_coarser_rank = agreed_exact + coarser_rank_disagreement_count def _pct(num: int, denom: int) -> float: return round(num / denom, 4) 
if denom else 0.0 - return { + payload: dict = { "total_occurrences": total, "verified_count": verified, "verified_pct": _pct(verified, total), @@ -267,9 +286,15 @@ def _pct(num: int, denom: int) -> float: "no_prediction_count": no_prediction, "agreed_exact_count": agreed_exact, "agreed_exact_pct": _pct(agreed_exact, verified_with_pred), - "agreed_under_order_count": agreed_under_order, - "agreed_under_order_pct": _pct(agreed_under_order, verified_with_pred), + "agreed_any_rank_count": agreed_any_rank, + "agreed_any_rank_pct": _pct(agreed_any_rank, verified_with_pred), + "agreement_coarsest_rank": coarsest_rank.name if coarsest_rank is not None else None, + "agreed_coarser_rank_count": agreed_coarser_rank if coarsest_rank is not None else None, + "agreed_coarser_rank_pct": ( + _pct(agreed_coarser_rank, verified_with_pred) if coarsest_rank is not None else None + ), } + return payload def top_identifiers_for_project(project: Project) -> QuerySet[User]: diff --git a/ami/main/tests.py b/ami/main/tests.py index e1a38b61f..92a86837f 100644 --- a/ami/main/tests.py +++ b/ami/main/tests.py @@ -4803,8 +4803,9 @@ def test_unknown_rank_excluded_from_lca(self): class TestModelAgreementForProject(APITestCase): """Aggregation function over a filtered Occurrence queryset. - Covers the four bucket transitions: unverified, verified+exact-agreed, - verified+under-order-agreed, verified+disagreed-above-order. + Covers four bucket transitions: unverified, verified+exact-agreed, + verified+any-rank-agreed (no threshold), verified+disagreed-no-shared-rank. + Optional coarsest_rank threshold cases handled in the viewset tests below. 
""" def setUp(self) -> None: @@ -4842,7 +4843,11 @@ def test_empty_project_returns_zeros_not_nans(self): self.assertEqual(result["verified_count"], 0) self.assertEqual(result["verified_pct"], 0.0) self.assertEqual(result["agreed_exact_pct"], 0.0) - self.assertEqual(result["agreed_under_order_pct"], 0.0) + self.assertEqual(result["agreed_any_rank_pct"], 0.0) + # No threshold passed → coarser-rank fields null. + self.assertIsNone(result["agreement_coarsest_rank"]) + self.assertIsNone(result["agreed_coarser_rank_count"]) + self.assertIsNone(result["agreed_coarser_rank_pct"]) def test_buckets_canonical_cases(self): from ami.main.models_future.occurrence import model_agreement_for_project @@ -4851,9 +4856,9 @@ def test_buckets_canonical_cases(self): self.assertEqual(len(occurrences), 4) # 0: verified, machine == user (exact agreement at SPECIES) self._identify(occurrences[0], self.vanessa_atalanta) - # 1: verified, sister species (under-order at GENUS) + # 1: verified, sister species (LCA at GENUS) self._identify(occurrences[1], self.vanessa_cardui) - # 2: verified, different family same order (under-order at ORDER) + # 2: verified, different family same order (LCA at ORDER) self._identify(occurrences[2], self.pieris_brassicae) # 3: unverified @@ -4861,10 +4866,33 @@ def test_buckets_canonical_cases(self): self.assertEqual(result["total_occurrences"], 4) self.assertEqual(result["verified_count"], 3) self.assertEqual(result["agreed_exact_count"], 1) - self.assertEqual(result["agreed_under_order_count"], 3) + self.assertEqual(result["agreed_any_rank_count"], 3) self.assertAlmostEqual(result["verified_pct"], 0.75) self.assertAlmostEqual(result["agreed_exact_pct"], 1 / 3, places=3) - self.assertAlmostEqual(result["agreed_under_order_pct"], 1.0) + self.assertAlmostEqual(result["agreed_any_rank_pct"], 1.0) + + def test_coarsest_rank_threshold_filters_shallow_lcas(self): + """With coarsest_rank=FAMILY, an ORDER-only LCA pair is excluded.""" + from ami.main.models import 
TaxonRank + from ami.main.models_future.occurrence import model_agreement_for_project + + occurrences = list(Occurrence.objects.filter(project=self.project).order_by("pk")) + # 0: exact (SPECIES) — counts in both + self._identify(occurrences[0], self.vanessa_atalanta) + # 1: sister species (LCA = GENUS, deeper than FAMILY) — counts in both + self._identify(occurrences[1], self.vanessa_cardui) + # 2: different family same order (LCA = ORDER, NOT >= FAMILY) — counts in any_rank only + self._identify(occurrences[2], self.pieris_brassicae) + + result = model_agreement_for_project( + Occurrence.objects.filter(project=self.project), + coarsest_rank=TaxonRank.FAMILY, + ) + self.assertEqual(result["agreed_any_rank_count"], 3) + self.assertEqual(result["agreement_coarsest_rank"], "FAMILY") + # exact + GENUS LCA = 2; ORDER LCA excluded + self.assertEqual(result["agreed_coarser_rank_count"], 2) + self.assertAlmostEqual(result["agreed_coarser_rank_pct"], 2 / 3, places=3) class TestOccurrenceStatsViewSet(APITestCase): @@ -4973,7 +5001,11 @@ def test_agreement_empty_returns_zero_pcts(self): self.assertEqual(body["verified_count"], 0) self.assertEqual(body["verified_pct"], 0.0) self.assertEqual(body["agreed_exact_pct"], 0.0) - self.assertEqual(body["agreed_under_order_pct"], 0.0) + self.assertEqual(body["agreed_any_rank_pct"], 0.0) + # No ?agreement_coarsest_rank → threshold + coarser fields null. + self.assertIsNone(body["agreement_coarsest_rank"]) + self.assertIsNone(body["agreed_coarser_rank_count"]) + self.assertIsNone(body["agreed_coarser_rank_pct"]) def test_agreement_happy_path(self): """One verified occurrence; user agrees with the machine prediction → exact match. 
@@ -4996,15 +5028,14 @@ def test_agreement_happy_path(self): self.assertEqual(body["verified_with_prediction_count"], 1) self.assertEqual(body["no_prediction_count"], 0) self.assertEqual(body["agreed_exact_count"], 1) - self.assertEqual(body["agreed_under_order_count"], 1) + self.assertEqual(body["agreed_any_rank_count"], 1) - def test_agreement_under_order_bucket(self): - """Disagreement at species but same genus → counted under-order, not exact. + def test_agreement_any_rank_bucket(self): + """Disagreement at species but same genus → counted as any-rank agreement, not exact. Pick the machine prediction's sister species (same parent genus) for the - identification. LCA between the two species is GENUS, which is >= ORDER, - so the occurrence falls into the under-order bucket without contributing - to agreed_exact_count. + identification. LCA between the two species is GENUS, so the occurrence + falls into the any-rank bucket without contributing to agreed_exact_count. """ occurrence = Occurrence.objects.filter(project=self.project).order_by("pk").first() machine_taxon = occurrence.detections.first().classifications.first().taxon @@ -5024,10 +5055,34 @@ def test_agreement_under_order_bucket(self): self.assertEqual(body["verified_count"], 1) self.assertEqual(body["verified_with_prediction_count"], 1) self.assertEqual(body["agreed_exact_count"], 0) - self.assertEqual(body["agreed_under_order_count"], 1) - # 0/1 exact, 1/1 under-order + self.assertEqual(body["agreed_any_rank_count"], 1) + # 0/1 exact, 1/1 any-rank self.assertEqual(body["agreed_exact_pct"], 0.0) - self.assertEqual(body["agreed_under_order_pct"], 1.0) + self.assertEqual(body["agreed_any_rank_pct"], 1.0) + + def test_agreement_coarsest_rank_invalid_returns_400(self): + response = self.client.get( + f"{self.agreement_url}?project_id={self.project.pk}&agreement_coarsest_rank=GARBAGE" + ) + self.assertEqual(response.status_code, 400) + self.assertIn("agreement_coarsest_rank", response.json()) + + def 
test_agreement_coarsest_rank_unknown_rejected(self): + """UNKNOWN is a real enum member but not a meaningful threshold.""" + response = self.client.get( + f"{self.agreement_url}?project_id={self.project.pk}&agreement_coarsest_rank=UNKNOWN" + ) + self.assertEqual(response.status_code, 400) + + def test_agreement_coarsest_rank_echoed_in_response(self): + response = self.client.get(f"{self.agreement_url}?project_id={self.project.pk}&agreement_coarsest_rank=family") + self.assertEqual(response.status_code, 200) + body = response.json() + # Param is case-insensitive; response echoes enum name (uppercase). + self.assertEqual(body["agreement_coarsest_rank"], "FAMILY") + # No verified occurrences in this fixture → coarser fields present but zero. + self.assertEqual(body["agreed_coarser_rank_count"], 0) + self.assertEqual(body["agreed_coarser_rank_pct"], 0.0) def test_agreement_filter_passthrough(self): """`?deployment=` should narrow the set.""" From 6f51da52e20dbdaad91829f14db4ae9c38c8b56c Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Wed, 20 May 2026 17:54:35 -0700 Subject: [PATCH 11/18] feat(ui): align model-agreement hook with BE rename + multi-value query params MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename `agreed_under_order_*` → `agreed_any_rank_*` to match the endpoint's dropped ORDER threshold (0565f068). - Add optional `agreement_coarsest_rank` + `agreed_coarser_rank_*` fields to the response type (not consumed yet — UI follows in #1308). - Widen `filters` to accept arrays and append repeated query params so multi-value filters (e.g. `algorithm`, `not_algorithm` — backend reads via `request.query_params.getlist(...)`) survive. Per CodeRabbit review. 
Co-Authored-By: Claude --- .../occurrences/stats/useModelAgreement.ts | 33 ++++++++++++++----- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/ui/src/data-services/hooks/occurrences/stats/useModelAgreement.ts b/ui/src/data-services/hooks/occurrences/stats/useModelAgreement.ts index bdc3b23e5..d783103df 100644 --- a/ui/src/data-services/hooks/occurrences/stats/useModelAgreement.ts +++ b/ui/src/data-services/hooks/occurrences/stats/useModelAgreement.ts @@ -10,16 +10,25 @@ interface ModelAgreementResponse { no_prediction_count: number agreed_exact_count: number agreed_exact_pct: number - agreed_under_order_count: number - agreed_under_order_pct: number + agreed_any_rank_count: number + agreed_any_rank_pct: number + // Only populated when the caller passes ?agreement_coarsest_rank=. + agreement_coarsest_rank: string | null + agreed_coarser_rank_count: number | null + agreed_coarser_rank_pct: number | null } +type FilterPrimitive = string | number | boolean +type FilterValue = FilterPrimitive | FilterPrimitive[] | null | undefined + // Accepts an arbitrary filter map so the occurrence list page's filter state // can be threaded through unchanged (deployment, event, taxon, score -// thresholds, apply_defaults, etc). +// thresholds, apply_defaults, etc). Arrays are appended as repeated query +// params so multi-select filters (e.g. `algorithm`, `not_algorithm`, which +// the backend reads via `request.query_params.getlist(...)`) survive. 
export const useModelAgreement = ( projectId?: string, - filters?: Record + filters?: Record ) => { const url = `${API_URL}/${API_ROUTES.OCCURRENCES}/stats/model-agreement/` @@ -27,11 +36,19 @@ export const useModelAgreement = ( if (projectId) params.set('project_id', projectId) if (filters) { Object.entries(filters).forEach(([key, value]) => { - if (value !== undefined && value !== '' && value !== null) { - params.set(key, String(value)) + if (value === undefined || value === null || value === '') return + if (Array.isArray(value)) { + value.forEach((item) => { + if (item !== undefined && item !== null && item !== '') { + params.append(key, String(item)) + } + }) + return } + params.set(key, String(value)) }) } + const queryString = params.toString() const { data, isLoading, isFetching, error } = useAuthorizedQuery({ @@ -40,9 +57,9 @@ export const useModelAgreement = ( 'stats', 'model-agreement', projectId, - filters, + queryString, ], - url: `${url}?${params.toString()}`, + url: `${url}?${queryString}`, }) return { From 7c144b0934ef6349566d10851b40cc4ad7ac8011 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Wed, 20 May 2026 18:12:47 -0700 Subject: [PATCH 12/18] chore(docs): drop NEXT_SESSION_PROMPT.md from PR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Session-scratchpad doc — belongs in local notes, not the merged branch. 
Co-Authored-By: Claude --- docs/claude/prompts/NEXT_SESSION_PROMPT.md | 86 ---------------------- 1 file changed, 86 deletions(-) delete mode 100644 docs/claude/prompts/NEXT_SESSION_PROMPT.md diff --git a/docs/claude/prompts/NEXT_SESSION_PROMPT.md b/docs/claude/prompts/NEXT_SESSION_PROMPT.md deleted file mode 100644 index 9f3a54217..000000000 --- a/docs/claude/prompts/NEXT_SESSION_PROMPT.md +++ /dev/null @@ -1,86 +0,0 @@ -# Next session — PR #1307 rework - -**Branch:** `feat/human-model-agreement-endpoint` (worktree `occurrence-stats`) -**PR:** https://github.com/RolnickLab/antenna/pull/1307 -**Main stack override:** `/home/michael/Projects/AMI/antenna/docker-compose.override.yml` mounts this worktree's `ami/` + `config/` over the main `antenna` stack — `docker compose ps` already shows django/celeryworker recreated. Stack live; smoke against `http://localhost:8000/api/v2/occurrences/stats/human-model-agreement/?project_id=18` returns 200. - -## Tasks for this session - -### 1. Rename: drop "human" - -User wants `human-model-agreement` → `model-agreement`, `HumanModelAgreement*` → `ModelAgreement*`, `human_model_agreement_for_project` → `model_agreement_for_project`. Files to touch: - -- `ami/main/models_future/occurrence.py:160` — fn name + docstring -- `ami/main/api/serializers.py` — `HumanModelAgreementSerializer` class -- `ami/main/api/views.py:35-38` — import; line 94 import; viewset action name + url_path; serializer references at the action site -- `ami/main/tests.py` — `TestHumanModelAgreementForProject` class; `agreement_url`; all imports -- `ui/src/data-services/hooks/occurrences/stats/useHumanModelAgreement.ts` — rename file to `useModelAgreement.ts`; rename hook + `Response` interface to `ModelAgreementResponse` (Copilot review caught the DOM `Response` shadow) -- `docs/claude/planning/2026-05-14-human-model-agreement-endpoint.md` — leave the old plan doc as historical record; cite the new endpoint name where relevant -- PR title + body - -### 2. 
Push aggregation into SQL (Copilot + CodeRabbit both flagged) - -**Evidence:** Vermont (project 18) has 43,149 occurrences; `?apply_defaults=false` curl hit 159s and timed out at the curl layer. Current Python iteration over the full filtered queryset doesn't scale. - -**Proposed approach** (validate before coding): - -1. Annotate the queryset with subqueries to expose `best_machine_taxon_id` (already there via `with_best_machine_prediction()`) and `best_user_taxon_id` (new — subquery over `Identification` ordered by `BEST_IDENTIFICATION_ORDER`). -2. Compute totals with `aggregate()` using `Count('pk', filter=Q(...), distinct=True)`: - - `total_occurrences = Count('pk')` - - `verified_count = Count('pk', filter=Q(best_user_taxon_id__isnull=False))` (drop the verified-without-prediction trap from Copilot finding #3 — see fix below) - - `agreed_exact_count = Count('pk', filter=Q(best_user_taxon_id=F('best_machine_taxon_id')))` -3. For `agreed_under_order_count` — the hard part — try one of: - - **(a)** Annotate `best_user_taxon_order_id` and `best_machine_taxon_order_id` via Postgres `jsonb_path_query_first(parents_json, '$[*] ? (@.rank == "ORDER").id')` raw expressions. Two taxa agree under-order iff their order ids match AND neither is null. Add a row-level Python check only if the user's own rank is at-or-below ORDER (since user might ID at FAMILY directly with no ORDER ancestor in parents_json — but the taxon's own rank should be checked too). - - **(b)** Denormalize: add `order_taxon_id` column on Taxon, populate in `update_parents()`. Cleaner queries, needs migration + backfill. - - **(c)** Hybrid: keep Python LCA but batch via single annotated `values_list('pk', 'best_user_taxon_id', 'best_machine_taxon_id')` query plus one batched `Taxon` lookup. Avoids the `list(qs)` materialization but still does Python LCA. Faster than current; not as clean as (a) or (b). -4. Bench against project 18 unfiltered AND with `apply_defaults=false` before merging. 
Target: subsecond. - -Read Copilot's comment at `ami/main/models_future/occurrence.py:227` and CodeRabbit's at `:187` for their exact wording. - -### 3. Fix correctness bugs flagged in review - -**3a. `TaxonRank.UNKNOWN` bug** (Copilot, `:227`) -`UNKNOWN` is defined AFTER `SPECIES` in `ami/utils/schemas.py`, so `TaxonRank.UNKNOWN >= TaxonRank.ORDER` is `True` by definition order. If either chain contains an `UNKNOWN` ancestor that happens to be the deepest shared one, LCA wrongly counts as under-order. Filter `UNKNOWN` out of `lca_rank_between`'s candidate ranks. Add a unit test. - -**3b. Denominator bug** (Copilot, `:240`) -`agreed_exact_pct` / `agreed_under_order_pct` divide by `verified` but `verified` includes occurrences with **no** machine prediction — those can never agree, so they drag the pct down. Two options: -- Change the denominator to `verified AND has_machine_prediction` and call the field `verified_with_prediction_count` (clearer semantics). -- Keep `verified` as the denominator but add a separate `no_prediction_count` so the consumer can adjust. - -User probably prefers option 1 + surface the `no_prediction_count` as a sibling field. Check with them before coding. - -**3c. Drop wasted `select_related("taxon")` on idents prefetch** (Copilot, `:182`) — only `taxon_id` is read; the related Taxon row is re-fetched in the batch. - -**3d. `verified_by_me` anon access** (Copilot, `ami/main/api/views.py:1303`) -`OccurrenceVerifiedByMeFilter` is now wired into `OccurrenceStatsViewSet` via the shared `OCCURRENCE_FILTER_BACKENDS` tuple. With `IsActiveStaffOrReadOnly` allowing anon reads, an anon `?verified_by_me=true` reads `request.user` (AnonymousUser) — the filter currently guards on `is_authenticated` so it short-circuits, but consider gating the action explicitly or filtering the backend list for anon. Decide before merging. - -### 4. Test gaps to fill - -**4a. 
Under-order-but-not-exact HTTP coverage** (Copilot, `tests.py:4969`) -`test_agreement_happy_path` only hits the exact-match shortcut. Add a test that wires a sister-species identification (matches the T2 aggregation test's "bucket 1") and asserts `agreed_exact_count=0, agreed_under_order_count=1`. - -**4b. `UNKNOWN` rank regression test** — covered above in 3a. - -**4c. `no_prediction_count` test** — if you add that field per 3b, test it. - -### 5. Markdown lint nit (CodeRabbit, plan doc:43) - -Add `text` lang specifier to the fenced "File Structure" block. - -## After the rework - -1. Run full sweep: `docker compose -f docker-compose.ci.yml run --rm django python manage.py test ami.main.tests.TestOccurrenceStatsViewSet ami.main.tests.TestModelAgreementForProject ami.main.tests.TestLcaRankBetween ami.main.tests.TestOccurrenceListQueryCount -v 1 --keepdb` -2. Bench against project 18 — log curl `time_total` for the unfiltered + `apply_defaults=false` cases. Memory budget: should not materialize 43k rows. -3. Reply to each Copilot/CodeRabbit thread with `**Claude says:** Fixed in ...` per CLAUDE.md PR comment workflow. -4. Resolve threads via GraphQL once replied. -5. Push, let CI run, then ping user. - -## Files to grep first - -- Existing SQL-side patterns: `OccurrenceQuerySet.with_best_machine_prediction()` at `ami/main/models.py:2998`, `with_verification_info()` at `:3022`, `unique_taxa()` at `:3051`. These all use `Subquery(...)` annotations — same pattern to follow. -- `parents_json` jsonb queries: `Taxon.objects.filter(parents_json__contains=[{"id": ...}])` at `ami/main/models.py:3661, 3776, 3787` — that's the standard ORM idiom. For `jsonb_path_query_first` you'll need `RawSQL` or a custom `Func` subclass. -- Override file (already mounted): `/home/michael/Projects/AMI/antenna/docker-compose.override.yml` — leave as-is. 
- -## Compaction note - -Current session committed 5 PR commits + the plan doc + side-research export stub at `docs/claude/planning/occurrence-filter-driven-exports.md`. PR #1307 open with CodeRabbit + Copilot reviews already on it. Memory file `MEMORY.md` should be updated to add a `project_pr_1307_human_model_agreement.md` entry summarizing state (TODO this session start). From 34aace5de0ac8ef974d06ebc74410e7908c65585 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Wed, 20 May 2026 18:17:58 -0700 Subject: [PATCH 13/18] chore(docs): drop session-scratchpad planning docs from PR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 2026-05-14-human-model-agreement-endpoint.md — design narrative; superseded by code + PR description. - occurrence-filter-driven-exports.md — side-research stub Copilot flagged as out-of-scope. Promoted to a PR-description follow-up item. Co-Authored-By: Claude --- ...26-05-14-human-model-agreement-endpoint.md | 812 ------------------ .../occurrence-filter-driven-exports.md | 116 --- 2 files changed, 928 deletions(-) delete mode 100644 docs/claude/planning/2026-05-14-human-model-agreement-endpoint.md delete mode 100644 docs/claude/planning/occurrence-filter-driven-exports.md diff --git a/docs/claude/planning/2026-05-14-human-model-agreement-endpoint.md b/docs/claude/planning/2026-05-14-human-model-agreement-endpoint.md deleted file mode 100644 index 5ad323ba2..000000000 --- a/docs/claude/planning/2026-05-14-human-model-agreement-endpoint.md +++ /dev/null @@ -1,812 +0,0 @@ -# `/occurrences/stats/human-model-agreement/` Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
- -**Goal:** Add a project-scoped stats endpoint that returns verified-occurrence and human↔model-agreement rates over the same filter set the `/occurrences/` list view accepts. - -**Architecture:** -- Pure aggregation function in `ami/main/models_future/occurrence.py` operating on an already-filtered `Occurrence` queryset (caller wires `apply_default_filters` + `OccurrenceFilter`). -- `@action` on existing `OccurrenceStatsViewSet`. Re-uses `OccurrenceViewSet`'s `filter_backends` + `filterset_fields` so any query param valid on the list view is valid here. -- LCA computed in Python via `Taxon.parents_json`. Rank ordering via existing `TaxonRank(OrderedEnum)`. No DB schema changes. - -**Tech Stack:** Django 4.2, DRF, django-filter, drf-spectacular. Python 3.11. - -**Spec reference:** `docs/claude/prompts/human-model-agreement-endpoint.md` (lives in sibling `user-leaderboard` worktree). Stats convention: `docs/claude/reference/api-stats-pattern.md`. - -**Open questions resolved during planning** (cite as evidence in PR description): - -- **"Verified"** = occurrence has ≥1 non-withdrawn `Identification`. Matches `OccurrenceVerified` filter at `ami/main/api/views.py:1032` (which doesn't filter `withdrawn`), with `withdrawn=False` added for stats — consistent with `OccurrenceQuerySet.with_verification_info()` at `ami/main/models.py:3032`. -- **"Model prediction"** = `Classification` chosen by `BEST_MACHINE_PREDICTION_ORDER = ("-terminal", "-score", "-pk")` at `ami/main/models.py:61`. NOT `Occurrence.determination` (user-overridable). Use existing `OccurrenceQuerySet.with_best_machine_prediction()` at `ami/main/models.py:2998` which exposes `best_machine_prediction_taxon_id`. -- **"Under order"** inclusive: a taxon's rank qualifies iff `TaxonRank(rank) >= TaxonRank.ORDER`. `OrderedEnum.__ge__` at `ami/utils/schemas.py:51`. So ORDER, SUPERFAMILY, FAMILY, SUBFAMILY, TRIBE, SUBTRIBE, GENUS, SPECIES all count. CLASS, PHYLUM, KINGDOM do not. 
- ---- - -## File Structure - -```text -ami/ - main/ - models_future/ - occurrence.py # ADD: human_model_agreement_for_project() - # ADD: _lca_rank_of() helper - api/ - views.py # MODIFY: add human_model_agreement @action to OccurrenceStatsViewSet - serializers.py # ADD: HumanModelAgreementSerializer - tests.py # MODIFY: extend TestOccurrenceStatsViewSet -ui/ - src/ - data-services/ - hooks/ - occurrences/ - stats/ - useHumanModelAgreement.ts # ADD: typed React Query hook -``` - -No new files in backend (helpers live next to siblings). One new file frontend-side. - ---- - -## Task 1: LCA helper + rank check (unit-test only, no DB) - -**Files:** -- Modify: `ami/main/models_future/occurrence.py` -- Test: `ami/main/tests.py` (new class `TestHumanModelAgreementHelpers`) - -The LCA helper takes two `parents_json` lists (plus each taxon's own `(id, rank)` since `parents_json` excludes self) and returns the most-specific shared ancestor's `TaxonRank`, or `None`. Pure function; no DB. - -- [ ] **Step 1.1: Write failing unit tests** - -Add to `ami/main/tests.py` (above `class TestOccurrenceStatsViewSet`): - -```python -from ami.main.models import TaxonRank -from ami.main.models_future.occurrence import lca_rank_between - - -class TestLcaRankBetween(TestCase): - """Pure-Python LCA over (taxon_id, rank, parents_json) tuples. - - Inputs encode each taxon as ``(id, rank_str, [{"id": int, "rank": str}, ...])`` - where the parents list is ordered root → immediate-parent (matches - Taxon.parents_json layout). 
- """ - - GENUS_NOCTUA = (101, "GENUS", [ - {"id": 1, "rank": "KINGDOM"}, - {"id": 4, "rank": "ORDER"}, - {"id": 30, "rank": "FAMILY"}, - ]) - SPECIES_NOCTUA_PRONUBA = (201, "SPECIES", [ - {"id": 1, "rank": "KINGDOM"}, - {"id": 4, "rank": "ORDER"}, - {"id": 30, "rank": "FAMILY"}, - {"id": 101, "rank": "GENUS"}, - ]) - SPECIES_NOCTUA_FIMBRIATA = (202, "SPECIES", [ - {"id": 1, "rank": "KINGDOM"}, - {"id": 4, "rank": "ORDER"}, - {"id": 30, "rank": "FAMILY"}, - {"id": 101, "rank": "GENUS"}, - ]) - SPECIES_DIFFERENT_FAMILY = (301, "SPECIES", [ - {"id": 1, "rank": "KINGDOM"}, - {"id": 4, "rank": "ORDER"}, - {"id": 99, "rank": "FAMILY"}, - ]) - SPECIES_DIFFERENT_ORDER = (401, "SPECIES", [ - {"id": 1, "rank": "KINGDOM"}, - {"id": 5, "rank": "ORDER"}, - ]) - - def test_identical_taxa_lca_is_self_rank(self): - rank = lca_rank_between(self.SPECIES_NOCTUA_PRONUBA, self.SPECIES_NOCTUA_PRONUBA) - self.assertEqual(rank, TaxonRank.SPECIES) - - def test_sister_species_share_genus(self): - rank = lca_rank_between(self.SPECIES_NOCTUA_PRONUBA, self.SPECIES_NOCTUA_FIMBRIATA) - self.assertEqual(rank, TaxonRank.GENUS) - - def test_genus_vs_species_in_same_genus(self): - rank = lca_rank_between(self.GENUS_NOCTUA, self.SPECIES_NOCTUA_PRONUBA) - # GENUS itself is on the species' ancestor chain, so LCA = GENUS. 
- self.assertEqual(rank, TaxonRank.GENUS) - - def test_different_family_same_order(self): - rank = lca_rank_between(self.SPECIES_NOCTUA_PRONUBA, self.SPECIES_DIFFERENT_FAMILY) - self.assertEqual(rank, TaxonRank.ORDER) - - def test_different_order_same_kingdom(self): - rank = lca_rank_between(self.SPECIES_NOCTUA_PRONUBA, self.SPECIES_DIFFERENT_ORDER) - self.assertEqual(rank, TaxonRank.KINGDOM) - - def test_no_shared_ancestor_returns_none(self): - rootless = (501, "SPECIES", []) - rank = lca_rank_between(rootless, self.SPECIES_NOCTUA_PRONUBA) - self.assertIsNone(rank) -``` - -- [ ] **Step 1.2: Run tests, confirm they fail (import error)** - -```bash -docker compose run --rm django python manage.py test \ - ami.main.tests.TestLcaRankBetween -v 2 --keepdb -``` -Expected: `ImportError: cannot import name 'lca_rank_between'`. - -- [ ] **Step 1.3: Implement `lca_rank_between`** - -Append to `ami/main/models_future/occurrence.py`: - -```python -from ami.main.models import TaxonRank - -TaxonTuple = tuple[int, str, list[dict]] - - -def lca_rank_between(a: TaxonTuple, b: TaxonTuple) -> TaxonRank | None: - """Most-specific shared ancestor rank between two taxa. - - Inputs are ``(taxon_id, rank_str, parents_json)`` triples where - ``parents_json`` is ordered root → immediate parent (Taxon.parents_json layout). - - The taxon itself counts as part of its own ancestor chain — passing the - same taxon twice returns that taxon's rank. Returns ``None`` when the two - chains share no ancestor (e.g. one has an empty parents_json and the other - doesn't include it). 
- """ - chain_a = [(p["id"], TaxonRank(p["rank"])) for p in a[2]] + [(a[0], TaxonRank(a[1]))] - chain_b_ids = {p["id"] for p in b[2]} | {b[0]} - - deepest: TaxonRank | None = None - for tid, rank in chain_a: - if tid in chain_b_ids: - if deepest is None or rank > deepest: - deepest = rank - return deepest -``` - -- [ ] **Step 1.4: Run tests, confirm all pass** - -```bash -docker compose run --rm django python manage.py test \ - ami.main.tests.TestLcaRankBetween -v 2 --keepdb -``` -Expected: `OK (6 tests)`. - -- [ ] **Step 1.5: Commit** - -```bash -git add ami/main/models_future/occurrence.py ami/main/tests.py -git commit -m "feat(occurrence-stats): add lca_rank_between helper - -Pure-Python LCA over (taxon_id, rank, parents_json) tuples. Returns -the deepest shared TaxonRank or None. Used by the upcoming -human-model-agreement stat to bucket agreement at-or-finer-than ORDER. - -Co-Authored-By: Claude " -``` - ---- - -## Task 2: Aggregation function over a filtered queryset - -**Files:** -- Modify: `ami/main/models_future/occurrence.py` -- Test: `ami/main/tests.py` (new class `TestHumanModelAgreementForProject`) - -The function takes a filtered `Occurrence` queryset and returns a serializer-ready dict. Caller is responsible for wiring `apply_default_filters` + `OccurrenceFilter` upstream; the function adds the prefetches/annotations it needs and does the bucketing. - -- [ ] **Step 2.1: Write failing test** - -Add to `ami/main/tests.py`: - -```python -class TestHumanModelAgreementForProject(APITestCase): - """Aggregation function. DB-level. Covers the four bucket transitions: - unverified, verified+exact-agreed, verified+under-order-agreed, - verified+disagreed-above-order. 
- """ - - def setUp(self) -> None: - project, deployment = setup_test_project() - create_taxa(project=project) - create_captures(deployment=deployment) - create_occurrences(deployment=deployment, num=4) - self.project = project - # Need a couple of taxa at known ranks; create_taxa builds a small tree - # rooted in a Kingdom -> Order -> Family -> Genus -> Species chain. - self.species_a = Taxon.objects.get(name="Vanessa atalanta", projects=project) - self.species_b = Taxon.objects.get(name="Vanessa cardui", projects=project) # same genus - self.species_c = Taxon.objects.get(name="Apis mellifera", projects=project) # different family - self.user = User.objects.create_user(email="ider@insectai.org") - - def _attach_machine_prediction(self, occurrence, taxon, score=0.9): - # Picks up the existing detection on this occurrence and adds a Classification. - detection = occurrence.detections.first() - Classification.objects.create( - detection=detection, - taxon=taxon, - score=score, - terminal=True, - algorithm=detection.detection_algorithm, - ) - - def _identify(self, occurrence, taxon): - return Identification.objects.create(user=self.user, occurrence=occurrence, taxon=taxon) - - def test_empty_project_returns_zeros_not_nans(self): - empty_project = Project.objects.create(name="empty") - result = human_model_agreement_for_project(Occurrence.objects.filter(project=empty_project)) - self.assertEqual(result["total_occurrences"], 0) - self.assertEqual(result["verified_count"], 0) - self.assertEqual(result["verified_pct"], 0.0) - self.assertEqual(result["agreed_exact_pct"], 0.0) - self.assertEqual(result["agreed_under_order_pct"], 0.0) - - def test_buckets_four_canonical_cases(self): - occurrences = list(Occurrence.objects.filter(project=self.project)[:4]) - # 0: verified, machine == user (exact agreement) - self._attach_machine_prediction(occurrences[0], self.species_a) - self._identify(occurrences[0], self.species_a) - # 1: verified, machine sister-species (agreement at 
GENUS, under ORDER) - self._attach_machine_prediction(occurrences[1], self.species_a) - self._identify(occurrences[1], self.species_b) - # 2: verified, machine different family but same ORDER (still under-order) - # NOTE: requires species_c to share an order with species_a in the fixture. - # If create_taxa() does not put Apis + Vanessa under the same ORDER, - # construct a sibling-order test taxon here. See follow-up note below. - # 3: unverified (no identification) - self._attach_machine_prediction(occurrences[3], self.species_a) - - result = human_model_agreement_for_project(Occurrence.objects.filter(project=self.project)) - self.assertEqual(result["total_occurrences"], 4) - self.assertEqual(result["verified_count"], 2) # occurrences 0, 1 - self.assertEqual(result["agreed_exact_count"], 1) # occurrence 0 - self.assertEqual(result["agreed_under_order_count"], 2) # both — exact is a subset - self.assertAlmostEqual(result["verified_pct"], 0.5) - self.assertAlmostEqual(result["agreed_exact_pct"], 0.5) - self.assertAlmostEqual(result["agreed_under_order_pct"], 1.0) -``` - -Note on `species_c`: if `create_taxa()` doesn't already place an Apis + Vanessa pair under a shared ORDER, drop that assertion and add a dedicated taxon fixture inside the test. Check `ami/main/tests.py` `create_taxa()` first. - -- [ ] **Step 2.2: Run test, confirm import error** - -```bash -docker compose run --rm django python manage.py test \ - ami.main.tests.TestHumanModelAgreementForProject -v 2 --keepdb -``` - -- [ ] **Step 2.3: Implement aggregation** - -Append to `ami/main/models_future/occurrence.py`: - -```python -def human_model_agreement_for_project(queryset: QuerySet[Occurrence]) -> dict: - """Verified / agreement stats over a pre-filtered Occurrence queryset. - - The queryset MUST already be filtered down to the project + user-supplied - filters (caller wires apply_default_filters + OccurrenceFilter). 
This - function adds the prefetches/annotations it needs and returns a dict - matching HumanModelAgreementSerializer's field set (without project_id — - the view layer adds that). - - "Verified" means the occurrence has at least one non-withdrawn - Identification. "Model prediction" means the Classification chosen by - BEST_MACHINE_PREDICTION_ORDER. "Under-order" agreement means the user's - taxon and the model's prediction share an ancestor at rank >= ORDER - (inclusive of ORDER itself). - """ - from ami.main.models import Classification, Taxon - - qs = ( - queryset - .with_best_machine_prediction() # annotates best_machine_prediction_taxon_id - .prefetch_related( - Prefetch( - "identifications", - queryset=Identification.objects.filter(withdrawn=False) - .select_related("taxon") - .order_by("-created_at", "-pk"), - to_attr="_non_withdrawn_idents", - ) - ) - ) - - # Collect every taxon id we'll need (best-machine + best-user) to do a - # single batched Taxon fetch for parents_json/rank. - rows = list(qs.values( - "pk", - "best_machine_prediction_taxon_id", - )) - # NOTE: .values() drops the prefetched _non_withdrawn_idents; re-iterate qs - # for identification access. 
- occurrences = list(qs) - - needed_taxa_ids: set[int] = set() - for occ in occurrences: - if occ.best_machine_prediction_taxon_id: - needed_taxa_ids.add(occ.best_machine_prediction_taxon_id) - idents = getattr(occ, "_non_withdrawn_idents", []) - if idents: - needed_taxa_ids.add(idents[0].taxon_id) - - taxa_by_id: dict[int, tuple[int, str, list[dict]]] = { - t.pk: (t.pk, t.rank, [p.dict() if hasattr(p, "dict") else p for p in t.parents_json]) - for t in Taxon.objects.filter(pk__in=needed_taxa_ids).only("pk", "rank", "parents_json") - } - - total = len(occurrences) - verified = 0 - agreed_exact = 0 - agreed_under_order = 0 - - for occ in occurrences: - idents = getattr(occ, "_non_withdrawn_idents", []) - if not idents: - continue - verified += 1 - user_taxon_id = idents[0].taxon_id - machine_taxon_id = occ.best_machine_prediction_taxon_id - if not machine_taxon_id or not user_taxon_id: - continue - if user_taxon_id == machine_taxon_id: - agreed_exact += 1 - agreed_under_order += 1 - continue - user_tuple = taxa_by_id.get(user_taxon_id) - machine_tuple = taxa_by_id.get(machine_taxon_id) - if not user_tuple or not machine_tuple: - continue - lca = lca_rank_between(user_tuple, machine_tuple) - if lca is not None and lca >= TaxonRank.ORDER: - agreed_under_order += 1 - - def _pct(num: int, denom: int) -> float: - return round(num / denom, 4) if denom else 0.0 - - return { - "total_occurrences": total, - "verified_count": verified, - "verified_pct": _pct(verified, total), - "agreed_exact_count": agreed_exact, - "agreed_exact_pct": _pct(agreed_exact, verified), - "agreed_under_order_count": agreed_under_order, - "agreed_under_order_pct": _pct(agreed_under_order, verified), - } -``` - -Note: `agreed_exact_count` is a subset of `agreed_under_order_count` by definition (exact match implies LCA = SPECIES which is >= ORDER). Document this in the serializer's docstring. 
- -- [ ] **Step 2.4: Run tests; confirm pass** - -```bash -docker compose run --rm django python manage.py test \ - ami.main.tests.TestHumanModelAgreementForProject -v 2 --keepdb -``` - -- [ ] **Step 2.5: Commit** - -```bash -git add ami/main/models_future/occurrence.py ami/main/tests.py -git commit -m "feat(occurrence-stats): aggregate human-model agreement over filtered queryset - -Pure aggregation; caller wires apply_default_filters + OccurrenceFilter. -Annotates best machine prediction, prefetches non-withdrawn identifications, -batches Taxon fetch for parents_json, buckets exact / under-order / above-order. - -Co-Authored-By: Claude " -``` - ---- - -## Task 3: Response serializer - -**Files:** -- Modify: `ami/main/api/serializers.py` - -- [ ] **Step 3.1: Add serializer** - -Locate the existing stats serializers (search for `TopIdentifiersResponseSerializer`) and add below: - -```python -class HumanModelAgreementSerializer(serializers.Serializer): - """Verified / agreement rates over the filtered Occurrence set. - - `agreed_exact_count` is a subset of `agreed_under_order_count` by - construction — an exact match implies an LCA at SPECIES, which is - deeper than ORDER. `*_pct` percentages are 0.0..1.0 (not 0..100). 
- """ - project_id = serializers.IntegerField() - total_occurrences = serializers.IntegerField() - verified_count = serializers.IntegerField() - verified_pct = serializers.FloatField(help_text="verified_count / total_occurrences") - agreed_exact_count = serializers.IntegerField() - agreed_exact_pct = serializers.FloatField(help_text="agreed_exact_count / verified_count") - agreed_under_order_count = serializers.IntegerField() - agreed_under_order_pct = serializers.FloatField(help_text="agreed_under_order_count / verified_count") -``` - -- [ ] **Step 3.2: Commit** - -```bash -git add ami/main/api/serializers.py -git commit -m "feat(occurrence-stats): add HumanModelAgreementSerializer - -Co-Authored-By: Claude " -``` - ---- - -## Task 4: Action on `OccurrenceStatsViewSet` with filter wiring - -**Files:** -- Modify: `ami/main/api/views.py` - -Pull `OccurrenceViewSet`'s filter backend + filterset_fields list into a module-level tuple so both viewsets share it without `OccurrenceStatsViewSet` having to inherit from `DefaultViewSet` (it stays a plain `GenericViewSet`). 
- -- [ ] **Step 4.1: Extract shared filter config** - -Above `class OccurrenceViewSet(DefaultViewSet, ProjectMixin):` at `ami/main/api/views.py:1171`, add: - -```python -OCCURRENCE_FILTER_BACKENDS = ( - CustomOccurrenceDeterminationFilter, - OccurrenceCollectionFilter, - OccurrenceAlgorithmFilter, - OccurrenceDateFilter, - OccurrenceVerified, - OccurrenceVerifiedByMeFilter, - OccurrenceTaxaListFilter, -) - -OCCURRENCE_FILTERSET_FIELDS = ( - "event", - "deployment", - "determination__rank", - "detections__source_image", -) -``` - -Then replace the literal lists in `OccurrenceViewSet`: - -```python - filter_backends = DefaultViewSetMixin.filter_backends + list(OCCURRENCE_FILTER_BACKENDS) - filterset_fields = list(OCCURRENCE_FILTERSET_FIELDS) -``` - -- [ ] **Step 4.2: Wire filter machinery onto `OccurrenceStatsViewSet`** - -In `OccurrenceStatsViewSet` at `ami/main/api/views.py:1268`, add (above `permission_classes`): - -```python - queryset = Occurrence.objects.none() # hint for filterset introspection - filter_backends = list(OCCURRENCE_FILTER_BACKENDS) - filterset_fields = list(OCCURRENCE_FILTERSET_FIELDS) -``` - -(DRF's `filter_queryset` is only called when an action invokes it — `top_identifiers` doesn't, so no behavior change there.) - -- [ ] **Step 4.3: Add `human_model_agreement` action** - -Add to `OccurrenceStatsViewSet`, below `top_identifiers`: - -```python - @extend_schema( - parameters=[project_id_doc_param], - responses=HumanModelAgreementSerializer, - ) - @action(detail=False, methods=["get"], url_path="human-model-agreement") - def human_model_agreement(self, request): - """Verified / human↔model agreement rates over the filtered occurrence set. - - Accepts every query param the `/occurrences/` list endpoint accepts. - Reuses `apply_default_filters` so `apply_defaults=false` bypasses - project default taxa lists + score thresholds. 
- """ - project = self.get_active_project() - assert project is not None # require_project=True - if not Project.objects.visible_for_user(request.user).filter(pk=project.pk).exists(): - raise NotFound("Project not found.") - - base_qs = ( - Occurrence.objects.filter(project=project) - .valid() - .apply_default_filters(project, request) - ) - filtered_qs = self.filter_queryset(base_qs) - payload = human_model_agreement_for_project(filtered_qs) - payload["project_id"] = project.pk - return Response( - HumanModelAgreementSerializer(payload, context={"request": request}).data - ) -``` - -Add the import at the top of `ami/main/api/views.py`: - -```python -from ami.main.models_future.occurrence import ( - human_model_agreement_for_project, - top_identifiers_for_project, -) -``` - -And the serializer import: - -```python -from ami.main.api.serializers import ( - ..., - HumanModelAgreementSerializer, -) -``` - -- [ ] **Step 4.4: Lint + spectacular** - -```bash -docker compose run --rm django flake8 ami/main/api/views.py ami/main/api/serializers.py -docker compose run --rm django python manage.py spectacular --api-version 'api' --format openapi --file /tmp/schema.yaml -``` -Expected: lint clean. spectacular emits no new warnings about the new action. - -- [ ] **Step 4.5: Commit** - -```bash -git add ami/main/api/views.py ami/main/api/serializers.py -git commit -m "feat(occurrence-stats): wire human-model-agreement action - -Extracts the OccurrenceViewSet filter backends + filterset_fields into a -module-level tuple, then attaches them to OccurrenceStatsViewSet so the -new action can reuse OccurrenceFilter pass-through unchanged. The -top_identifiers action keeps its current behavior — filter_queryset is -only invoked by actions that opt in. 
- -Co-Authored-By: Claude " -``` - ---- - -## Task 5: Endpoint tests - -**Files:** -- Modify: `ami/main/tests.py` - -- [ ] **Step 5.1: Add HTTP-level tests** - -Append inside `class TestOccurrenceStatsViewSet`: - -```python - agreement_url = "/api/v2/occurrences/stats/human-model-agreement/" - - def _make_machine_prediction(self, occurrence, taxon, score=0.9): - detection = occurrence.detections.first() - Classification.objects.create( - detection=detection, - taxon=taxon, - score=score, - terminal=True, - algorithm=detection.detection_algorithm, - ) - - def test_agreement_no_project_id_returns_400(self): - response = self.client.get(self.agreement_url) - self.assertEqual(response.status_code, 400) - - def test_agreement_draft_project_404_for_anon(self): - self.project.draft = True - self.project.save() - response = self.client.get(f"{self.agreement_url}?project_id={self.project.pk}") - self.assertEqual(response.status_code, 404) - - def test_agreement_empty_returns_zero_pcts(self): - response = self.client.get(f"{self.agreement_url}?project_id={self.project.pk}") - self.assertEqual(response.status_code, 200) - body = response.json() - self.assertEqual(body["project_id"], self.project.pk) - self.assertEqual(body["total_occurrences"], 4) - self.assertEqual(body["verified_count"], 0) - self.assertEqual(body["verified_pct"], 0.0) - self.assertEqual(body["agreed_exact_pct"], 0.0) - self.assertEqual(body["agreed_under_order_pct"], 0.0) - - def test_agreement_happy_path(self): - occurrences = list(Occurrence.objects.filter(project=self.project)[:3]) - taxon_a = Taxon.objects.get(name="Vanessa atalanta", projects=self.project) - taxon_b = Taxon.objects.get(name="Vanessa cardui", projects=self.project) - self._make_machine_prediction(occurrences[0], taxon_a) - self._id(self.alice, occurrences[0]) # exact agreement (taxon_a == self.taxon? confirm in fixture) - # ... fill in remaining cases mirroring TestHumanModelAgreementForProject ... 
- - response = self.client.get(f"{self.agreement_url}?project_id={self.project.pk}") - self.assertEqual(response.status_code, 200) - body = response.json() - self.assertEqual(body["total_occurrences"], 4) - self.assertEqual(body["verified_count"], 1) - - def test_agreement_filter_passthrough(self): - """`?deployment=` should narrow the set.""" - other_deployment = Deployment.objects.create(name="other", project=self.project) - response = self.client.get( - f"{self.agreement_url}?project_id={self.project.pk}&deployment={other_deployment.pk}" - ) - self.assertEqual(response.status_code, 200) - self.assertEqual(response.json()["total_occurrences"], 0) - - def test_agreement_apply_defaults_false_bypasses_project_filters(self): - """Setting a score threshold on the project should reduce counts; apply_defaults=false restores them.""" - self.project.classification_threshold = 0.99 - self.project.save() - gated = self.client.get(f"{self.agreement_url}?project_id={self.project.pk}").json() - bypassed = self.client.get( - f"{self.agreement_url}?project_id={self.project.pk}&apply_defaults=false" - ).json() - self.assertGreaterEqual(bypassed["total_occurrences"], gated["total_occurrences"]) -``` - -- [ ] **Step 5.2: Run full stats viewset tests** - -```bash -docker compose run --rm django python manage.py test \ - ami.main.tests.TestOccurrenceStatsViewSet \ - ami.main.tests.TestHumanModelAgreementForProject \ - ami.main.tests.TestLcaRankBetween -v 2 --keepdb -``` - -Expected: all pass. - -- [ ] **Step 5.3: Commit** - -```bash -git add ami/main/tests.py -git commit -m "test(occurrence-stats): HTTP coverage for human-model-agreement action - -Covers: missing project_id 400, draft 404, empty zeros, happy path -bucket transitions, deployment filter pass-through, apply_defaults=false bypass. 
- -Co-Authored-By: Claude " -``` - ---- - -## Task 6: Frontend hook - -**Files:** -- Create: `ui/src/data-services/hooks/occurrences/stats/useHumanModelAgreement.ts` - -- [ ] **Step 6.1: Read the sibling hook** - -```bash -cat ui/src/data-services/hooks/occurrences/stats/useTopIdentifiers.ts -``` - -- [ ] **Step 6.2: Write hook mirroring the pattern** - -```typescript -import { useQuery } from '@tanstack/react-query' -import { axios } from 'data-services/api/axios' -import { API_ROUTES, API_URL } from 'data-services/constants' - -export interface HumanModelAgreement { - project_id: number - total_occurrences: number - verified_count: number - verified_pct: number - agreed_exact_count: number - agreed_exact_pct: number - agreed_under_order_count: number - agreed_under_order_pct: number -} - -export const useHumanModelAgreement = (params: Record<string, string | number | undefined>) => { - const cleanParams = Object.fromEntries( - Object.entries(params).filter(([, v]) => v !== undefined && v !== ''), - ) - return useQuery({ - queryKey: ['occurrences', 'stats', 'human-model-agreement', cleanParams], - queryFn: async () => { - const res = await axios.get( - `${API_URL}/occurrences/stats/human-model-agreement/`, - { params: cleanParams }, - ) - return res.data - }, - enabled: !!cleanParams.project_id, - }) -} -``` - -Adjust import paths/constants to match the actual `useTopIdentifiers.ts` (file uses repo-local aliases; copy them verbatim from the reference hook rather than guessing). - -- [ ] **Step 6.3: Typecheck** - -```bash -cd ui && yarn tsc --noEmit -``` - -- [ ] **Step 6.4: Commit** - -```bash -git add ui/src/data-services/hooks/occurrences/stats/useHumanModelAgreement.ts -git commit -m "feat(ui): useHumanModelAgreement hook for occurrence stats - -Mirrors useTopIdentifiers. Accepts arbitrary filter params so the -occurrence list page's filter state can be threaded through unchanged.
- -Co-Authored-By: Claude " -``` - ---- - -## Task 7: Verification + PR - -- [ ] **Step 7.1: Full test sweep** - -```bash -docker compose run --rm django python manage.py test \ - ami.main.tests.TestOccurrenceStatsViewSet \ - ami.main.tests.TestHumanModelAgreementForProject \ - ami.main.tests.TestLcaRankBetween \ - ami.main.tests.TestOccurrenceListQueryCount -v 2 --keepdb -``` - -The `TestOccurrenceListQueryCount` run guards against accidentally regressing the list endpoint's prefetch contract when editing `OccurrenceViewSet` filter config. - -- [ ] **Step 7.2: Manual smoke** - -```bash -curl -s "http://localhost:8000/api/v2/occurrences/stats/human-model-agreement/?project_id=18" | jq -curl -s "http://localhost:8000/api/v2/occurrences/stats/human-model-agreement/?project_id=18&deployment=42" | jq -curl -s "http://localhost:8000/api/v2/occurrences/stats/human-model-agreement/?project_id=18&apply_defaults=false" | jq - -# Sanity: total_occurrences should match the list endpoint's count. -curl -s "http://localhost:8000/api/v2/occurrences/?project_id=18" | jq .count -``` - -- [ ] **Step 7.3: Push + open PR** - -```bash -git push -u origin feat/human-model-agreement-endpoint -gh pr create --title "feat(occurrence-stats): /occurrences/stats/human-model-agreement/" --body "$(cat <<'EOF' -## Summary - -- New scalar stats action on `OccurrenceStatsViewSet` returning verified-occurrence and human↔model agreement rates over a filtered occurrence queryset. -- Reuses `OccurrenceViewSet`'s filter backends + `apply_default_filters` so any query param valid on `/occurrences/` is valid here. -- LCA computed in Python via `Taxon.parents_json` + `TaxonRank(OrderedEnum)`; "under-order" agreement is inclusive of ORDER itself. - -## Decisions & evidence - -- "Model prediction" = `BEST_MACHINE_PREDICTION_ORDER`-selected `Classification`, NOT `Occurrence.determination` (user-overridable). -- "Verified" = ≥1 non-withdrawn `Identification`. 
Consistent with `with_verification_info()` semantics, slightly stricter than `OccurrenceVerified` filter (which doesn't filter `withdrawn`). -- `agreed_exact_count` is a subset of `agreed_under_order_count` by construction — exact match implies LCA = SPECIES which is deeper than ORDER. Surfaced in the serializer docstring. - -## Test plan - -- [x] Unit: `TestLcaRankBetween` covers identical, sister-species, genus-vs-species, different-family, different-order, no-shared-ancestor. -- [x] Aggregation: `TestHumanModelAgreementForProject` covers empty project + four bucket transitions. -- [x] HTTP: `TestOccurrenceStatsViewSet.test_agreement_*` covers 400/404, empty-pct, happy path, filter pass-through, apply_defaults bypass. -- [x] Regression: `TestOccurrenceListQueryCount` still passes after filter config refactor. -- [ ] Smoke against project 18 via curl (see commands in plan). -EOF -)" -``` - ---- - -## Self-review checklist (run before declaring done) - -- [ ] Every step has either code or an exact command — no "implement appropriate handling". -- [ ] Function/method names match across tasks: `lca_rank_between`, `human_model_agreement_for_project`, `HumanModelAgreementSerializer`, `human_model_agreement` action, `useHumanModelAgreement` hook. -- [ ] Test class names are unique and don't collide with existing classes in `ami/main/tests.py`. -- [ ] No new external dependencies introduced. 
-- [ ] Plan covers every requirement listed in `docs/claude/prompts/human-model-agreement-endpoint.md` (worktree `user-leaderboard`): - - Response shape ✓ (Task 3) - - OccurrenceFilter pass-through ✓ (Task 4) - - `apply_defaults=false` ✓ (Task 4 base_qs + Task 5 test) - - LCA via `parents_json` ✓ (Task 1) - - Tests: happy / filter pass-through / empty / rank-LCA / draft 404 ✓ (Task 5) - - FE hook ✓ (Task 6) - -## Out of scope (deferred follow-ups) - -- **Postgres-side rank ordering operator.** `TaxonRank` is `OrderedEnum` in Python; pushing rank comparisons into SQL would require materializing rank → int (e.g. a small mapping table or `CASE` expression). Useful when the stats grow to a per-rank breakdown chart, but the current LCA pass batch-fetches taxa once so it isn't on the hot path. File a follow-up ticket if a future stats kind genuinely scans more taxa than fit in one batch. -- **Disagreed-above-order breakdown.** The current response collapses "verified but no shared ancestor at-or-finer-than ORDER" into the residual `verified_count - agreed_under_order_count`. If the dashboard wants to chart that residual explicitly, expose `disagreed_above_order_count` derived in the serializer's `to_representation` (no extra compute). -- **OccurrenceFilter-driven export.** Tracked separately in `docs/claude/planning/occurrence-filter-driven-exports.md` (TBD — subagent stub). diff --git a/docs/claude/planning/occurrence-filter-driven-exports.md b/docs/claude/planning/occurrence-filter-driven-exports.md deleted file mode 100644 index adf2cee96..000000000 --- a/docs/claude/planning/occurrence-filter-driven-exports.md +++ /dev/null @@ -1,116 +0,0 @@ -# Filter-driven occurrence exports — scoping stub - -**Status:** scoping only. No API design, no task breakdown, no migrations. 
-**Goal:** let a user on `/occurrences/` apply filters in the UI, click "Export", -and get a job whose output matches exactly that filtered set — without first -having to materialize a `SourceImageCollection`. - -## 1. Current export architecture - -- Entry point: `ami/exports/views.py:30-87` `ExportViewSet.create()` — validates - format + filters, optionally looks up a `SourceImageCollection` from - `filters["collection_id"]`, creates `DataExport`, wires it to a `Job`, calls - `job.enqueue()`. -- Persistence: `ami/exports/models.py:23-35` — `DataExport` stores - `format`, `filters` (JSONB), `project`, `user`, `file_url`. -- Worker side: `ami/exports/base.py:17-28` — `BaseExporter.__init__` calls - `apply_filters(queryset, filters, filter_backends)` using - `get_filter_backends()` which today returns just - `[OccurrenceCollectionFilter]` (`base.py:42-45`). -- Filter replay: `ami/exports/utils.py:13-72` — `generate_fake_request()` - builds a DRF `Request` from a path + query-param dict, then - `apply_filters()` runs the backends against the synthetic request. -- Format-specific querysets: `ami/exports/format_types.py:46-63` (JSON) and - `212-234` (CSV) — both call `Occurrence.objects.valid().filter(project=...)` - and layer custom queryset annotations on top. - -So the export infra **already** has a "filters JSON → re-run backends in -worker" pattern. The catch: it's hard-wired to `OccurrenceCollectionFilter` -and never sees the rest of the `/occurrences/` filter stack. - -## 2. 
`/occurrences/` list filter stack - -`ami/main/api/views.py:1171-1209` registers: - -- `DefaultViewSetMixin.filter_backends` (DjangoFilter, ordering, search) -- `CustomOccurrenceDeterminationFilter` (`views.py:968-987`) — taxon + descendants -- `OccurrenceCollectionFilter` (`views.py:988-1006`) -- `OccurrenceAlgorithmFilter` (`views.py:1008-1030`) -- `OccurrenceDateFilter` (`views.py:1084-1102`) -- `OccurrenceVerified` (`views.py:1032-1049`) -- `OccurrenceVerifiedByMeFilter` (`views.py:1051-1066`) — **reads `request.user`** -- `OccurrenceTaxaListFilter` (`views.py:1105-1152`) - -Plus `filterset_fields = ["event", "deployment", "determination__rank", -"detections__source_image"]` (DjangoFilter), and the project-level default -filter chain via `qs.apply_default_filters(project, self.request)` -(`views.py:1232`) which layers score thresholds + include/exclude taxa -from `ami/main/models_future/filters.py`. - -## 3. The gap - -What an async export needs that a raw filter dict doesn't supply on its own: - -- **Pickleability.** Celery serializes args; the snapshot must be plain JSON - (already true for `DataExport.filters`). -- **User identity for user-scoped filters.** `verified_by_me` and - `apply_default_filters` both read `request.user` — `generate_fake_request` - currently builds an anonymous request, so these silently no-op or behave - differently than the user expected. -- **Drift between submit and run.** If a project's default-filter config, - taxa lists, or score thresholds change between job enqueue and worker - execution, the export may not match what the user previewed. -- **Pagination semantics don't transfer.** The user filtered to 12k rows; we - need to export all 12k, not a single page. Trivial today (no `limit`/ - `offset` in the JSON) but worth stating. -- **Ordering preservation.** `ordering=` may or may not matter for an export - consumer; needs a call. 
-- **Large result streaming.** Already partially handled by - `get_data_in_batches` (`utils.py:75-105`), but only after the filtered - queryset materializes — needs verification at the scale users will hit. - -## 4. Proposed approaches - -**A. Persist filter params as JSON, re-run pipeline in the worker.** Extend -`BaseExporter.get_filter_backends()` to return the full `/occurrences/` stack -and feed the JSON through `apply_filters()` as today. Also stash `user_id` -on `DataExport` (already present) and stitch it into the synthetic request -so user-scoped filters work. Lowest infra change; highest drift risk -(re-resolves against live project config at run time). - -**B. Materialize a transient `SourceImageCollection` from the filtered set.** -At submit time, resolve the filter to a list of `SourceImage` ids, create a -hidden collection, point the existing export job at it. Reuses every -existing code path. Heaviest write at submit (could be slow for 100k+ rows); -collection-as-snapshot semantics are misleading because collections are -SourceImage-rooted, not Occurrence-rooted. - -**C. New `ExportFilter` model snapshotting params + resolved querystring + -user + project-default-filter version.** Adds explicit provenance ("this -export reflects filters X under project config version Y"). Most fidelity, -most surface area; only worth it if (A) drift bites in practice. - -Rough ordering by effort: **A < B < C**. Rough ordering by drift safety: -**C > B > A**. - -## 5. Open questions - -- How should `apply_default_filters` be re-evaluated at worker time vs. - frozen at submit? (Today's behaviour is implicitly "re-evaluate.") -- For `verified_by_me`, do we trust `DataExport.user` as the identity, or - require the submit-time `request.user` to match? -- Should `ordering` be preserved, or is unordered export acceptable? -- What's the realistic upper bound on exported occurrences, and does - `get_data_in_batches` hold up there? 
-- Does the UI need a preview count before the job is enqueued? (Today - `update_record_count()` runs synchronously in the view — fine for small - filtered sets, awkward for huge ones.) -- Should the export job snapshot the project's default-filter config so - re-runs are reproducible? - -## 6. Out of scope for this doc - -- Concrete API design (request/response shapes, field names). -- Task breakdown / sequencing. -- Schema migrations. -- UI changes on `/occurrences/`. From 36cc677b4255c42ebbc344ad87ee8eccf26b32ca Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Thu, 21 May 2026 21:35:59 -0700 Subject: [PATCH 14/18] test(occurrence-stats): make any-rank bucket test deterministic create_detections assigns the classification taxon via .order_by("?"), so the previous test picked a random machine taxon and then required a sister species under the same genus. Random non-species picks (ORDER / FAMILY / GENUS) have no sister, flaking ~50% of runs. Pin both the machine prediction and the human ID to two fixed Vanessa species, so the LCA is always GENUS (any-rank bucket, not exact) and the test is deterministic. Co-Authored-By: Claude --- ami/main/tests.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/ami/main/tests.py b/ami/main/tests.py index 92a86837f..4e031213d 100644 --- a/ami/main/tests.py +++ b/ami/main/tests.py @@ -5033,21 +5033,22 @@ def test_agreement_happy_path(self): def test_agreement_any_rank_bucket(self): """Disagreement at species but same genus → counted as any-rank agreement, not exact. - Pick the machine prediction's sister species (same parent genus) for the - identification. LCA between the two species is GENUS, so the occurrence - falls into the any-rank bucket without contributing to agreed_exact_count. + Pin the machine prediction and the human ID to two distinct species under + the same genus (Vanessa). 
LCA between the two species is GENUS, so the + occurrence falls into the any-rank bucket without contributing to + agreed_exact_count. Both taxa are fixed rather than the random fixture + pick (`create_detections` assigns a random taxon), so the test is + deterministic — a random non-species pick has no sister species and used + to flake ~50% of runs. """ occurrence = Occurrence.objects.filter(project=self.project).order_by("pk").first() - machine_taxon = occurrence.detections.first().classifications.first().taxon - # Sister species: same parent (genus Vanessa), different SPECIES. - sister = ( - Taxon.objects.filter(parent=machine_taxon.parent, rank=TaxonRank.SPECIES.name) - .exclude(pk=machine_taxon.pk) - .first() - ) - self.assertIsNotNone(sister, "Test fixture must have a sister species under the same genus") + species = list(Taxon.objects.filter(parent__name="Vanessa", rank=TaxonRank.SPECIES.name).order_by("name")) + self.assertGreaterEqual(len(species), 2, "Fixture must define ≥2 Vanessa species") + machine_species, human_species = species[0], species[1] + # Pin the machine prediction deterministically, overriding the random fixture taxon. + Classification.objects.filter(detection__occurrence=occurrence).update(taxon=machine_species) Taxon.objects.update_all_parents() - Identification.objects.create(user=self.alice, occurrence=occurrence, taxon=sister) + Identification.objects.create(user=self.alice, occurrence=occurrence, taxon=human_species) response = self.client.get(f"{self.agreement_url}?project_id={self.project.pk}") self.assertEqual(response.status_code, 200) From b74b3cdc6817a5e0cef8c7a0e838c0a27701e28b Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Thu, 21 May 2026 21:36:01 -0700 Subject: [PATCH 15/18] chore(occurrence-stats): move FE hook to UI PR #1308 useModelAgreement.ts belongs with the frontend consumer (#1308), not the backend endpoint PR. Keeps #1307 backend-only. 
Co-Authored-By: Claude --- .../occurrences/stats/useModelAgreement.ts | 71 ------------------- 1 file changed, 71 deletions(-) delete mode 100644 ui/src/data-services/hooks/occurrences/stats/useModelAgreement.ts diff --git a/ui/src/data-services/hooks/occurrences/stats/useModelAgreement.ts b/ui/src/data-services/hooks/occurrences/stats/useModelAgreement.ts deleted file mode 100644 index d783103df..000000000 --- a/ui/src/data-services/hooks/occurrences/stats/useModelAgreement.ts +++ /dev/null @@ -1,71 +0,0 @@ -import { API_ROUTES, API_URL } from 'data-services/constants' -import { useAuthorizedQuery } from '../../auth/useAuthorizedQuery' - -interface ModelAgreementResponse { - project_id: number - total_occurrences: number - verified_count: number - verified_pct: number - verified_with_prediction_count: number - no_prediction_count: number - agreed_exact_count: number - agreed_exact_pct: number - agreed_any_rank_count: number - agreed_any_rank_pct: number - // Only populated when the caller passes ?agreement_coarsest_rank=. - agreement_coarsest_rank: string | null - agreed_coarser_rank_count: number | null - agreed_coarser_rank_pct: number | null -} - -type FilterPrimitive = string | number | boolean -type FilterValue = FilterPrimitive | FilterPrimitive[] | null | undefined - -// Accepts an arbitrary filter map so the occurrence list page's filter state -// can be threaded through unchanged (deployment, event, taxon, score -// thresholds, apply_defaults, etc). Arrays are appended as repeated query -// params so multi-select filters (e.g. `algorithm`, `not_algorithm`, which -// the backend reads via `request.query_params.getlist(...)`) survive. 
-export const useModelAgreement = ( - projectId?: string, - filters?: Record<string, FilterValue> -) => { - const url = `${API_URL}/${API_ROUTES.OCCURRENCES}/stats/model-agreement/` - - const params = new URLSearchParams() - if (projectId) params.set('project_id', projectId) - if (filters) { - Object.entries(filters).forEach(([key, value]) => { - if (value === undefined || value === null || value === '') return - if (Array.isArray(value)) { - value.forEach((item) => { - if (item !== undefined && item !== null && item !== '') { - params.append(key, String(item)) - } - }) - return - } - params.set(key, String(value)) - }) - } - const queryString = params.toString() - - const { data, isLoading, isFetching, error } = - useAuthorizedQuery<ModelAgreementResponse>({ - queryKey: [ - API_ROUTES.OCCURRENCES, - 'stats', - 'model-agreement', - projectId, - queryString, - ], - url: `${url}?${queryString}`, - }) - - return { - data, - isLoading, - isFetching, - error, - } -} From 2c65cce0a28432472d01a8f9d87c2c2de125abe0 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Mon, 25 May 2026 18:09:56 -0700 Subject: [PATCH 16/18] feat(occurrence-stats): add Wilson CI + Cohen's kappa to model-agreement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both derive from the verified_rows already in memory — no extra query. - wilson_interval(): 95% Wilson score CI on agreed_exact_pct and agreed_any_rank_pct (agreed_*_ci_low / _ci_high). Wilson stays inside [0,1] and is honest at the small n typical of verified sets, where the normal approximation breaks down. - cohens_kappa(): exact-taxon agreement beyond chance (cohens_kappa field, range [-1, 1]). Null when no doubly-classified occurrences or expected agreement is 1.0. Discounts the agreement you'd get for free in a project dominated by one common species. Adds 5 nullable response fields. Backwards-compatible (additive only). 9 pure-Python unit tests + 2 HTTP field-presence tests.
Co-Authored-By: Claude --- ami/main/api/serializers.py | 39 ++++++++++++ ami/main/models_future/occurrence.py | 79 ++++++++++++++++++++++- ami/main/tests.py | 93 ++++++++++++++++++++++++++++ 3 files changed, 209 insertions(+), 2 deletions(-) diff --git a/ami/main/api/serializers.py b/ami/main/api/serializers.py index ee9a5121d..0c03357f1 100644 --- a/ami/main/api/serializers.py +++ b/ami/main/api/serializers.py @@ -1795,6 +1795,20 @@ class ModelAgreementSerializer(serializers.Serializer): max_value=1.0, help_text="agreed_exact_count / verified_with_prediction_count", ) + agreed_exact_ci_low = serializers.FloatField( + min_value=0.0, + max_value=1.0, + allow_null=True, + required=False, + help_text="Wilson 95% CI lower bound for agreed_exact_pct. Null when verified_with_prediction_count is 0.", + ) + agreed_exact_ci_high = serializers.FloatField( + min_value=0.0, + max_value=1.0, + allow_null=True, + required=False, + help_text="Wilson 95% CI upper bound for agreed_exact_pct. Null when verified_with_prediction_count is 0.", + ) agreed_any_rank_count = serializers.IntegerField( help_text="Exact matches plus disagreements whose LCA is at any real rank (UNKNOWN excluded)." ) @@ -1803,6 +1817,31 @@ class ModelAgreementSerializer(serializers.Serializer): max_value=1.0, help_text="agreed_any_rank_count / verified_with_prediction_count", ) + agreed_any_rank_ci_low = serializers.FloatField( + min_value=0.0, + max_value=1.0, + allow_null=True, + required=False, + help_text="Wilson 95% CI lower bound for agreed_any_rank_pct. Null when verified_with_prediction_count is 0.", + ) + agreed_any_rank_ci_high = serializers.FloatField( + min_value=0.0, + max_value=1.0, + allow_null=True, + required=False, + help_text="Wilson 95% CI upper bound for agreed_any_rank_pct. 
Null when verified_with_prediction_count is 0.", + ) + cohens_kappa = serializers.FloatField( + min_value=-1.0, + max_value=1.0, + allow_null=True, + required=False, + help_text=( + "Cohen's kappa (exact-taxon) — human↔model agreement beyond chance. " + "Range [-1, 1]; negative is worse than chance. Null when there are no " + "doubly-classified occurrences or expected agreement is 1.0." + ), + ) agreement_coarsest_rank = serializers.CharField( allow_null=True, required=False, diff --git a/ami/main/models_future/occurrence.py b/ami/main/models_future/occurrence.py index cdce45cb8..3dffca92f 100644 --- a/ami/main/models_future/occurrence.py +++ b/ami/main/models_future/occurrence.py @@ -10,6 +10,8 @@ from __future__ import annotations +import collections +import math from typing import TYPE_CHECKING from django.db.models import Count, OuterRef, Prefetch, Q, QuerySet, Subquery @@ -50,6 +52,64 @@ def lca_rank_between(a: TaxonTuple, b: TaxonTuple) -> TaxonRank | None: return deepest +# z-score for a 95% two-sided confidence interval (Wilson score). +WILSON_Z_95 = 1.959963984540054 + + +def wilson_interval(successes: int, total: int, z: float = WILSON_Z_95) -> tuple[float, float] | None: + """Wilson score confidence interval for a binomial proportion. + + Returns ``(low, high)`` bounded to ``[0, 1]`` (rounded to 4 dp), or + ``None`` when ``total`` is 0. Defaults to a 95% interval. + + The Wilson score interval is used instead of the normal approximation + because the verified set is often tiny (single-digit counts), where the + normal approximation produces bounds outside [0, 1] and understates the + uncertainty. Wilson stays well-behaved at small n and at proportions + near 0 or 1. 
+ """ + if total <= 0: + return None + phat = successes / total + z2 = z * z + denom = 1 + z2 / total + center = (phat + z2 / (2 * total)) / denom + margin = (z / denom) * math.sqrt(phat * (1 - phat) / total + z2 / (4 * total * total)) + low = max(0.0, center - margin) + high = min(1.0, center + margin) + return (round(low, 4), round(high, 4)) + + +def cohens_kappa(pairs: list[tuple[int, int]]) -> float | None: + """Cohen's kappa for exact-taxon agreement between human and model. + + ``pairs`` is one ``(human_taxon_id, model_taxon_id)`` per occurrence that + both a human and the model assigned a taxon to. Returns kappa rounded to + 4 dp in ``[-1, 1]`` (negative = worse than chance), or ``None`` when + there are no pairs or expected agreement is 1.0 (kappa undefined — a + single category leaves no chance-agreement to correct for). + + Plain agreement rate rewards luck: in a project dominated by one common + species, human and model agree most of the time just by both naming the + common one. Kappa subtracts that chance agreement, so it answers "how + much better than guessing is the model" rather than "how often do they + happen to match". + """ + n = len(pairs) + if n == 0: + return None + observed_agree = sum(1 for h, m in pairs if h == m) / n + human_counts: collections.Counter = collections.Counter(h for h, _ in pairs) + model_counts: collections.Counter = collections.Counter(m for _, m in pairs) + expected_agree = sum( + (human_counts[taxon_id] / n) * (model_counts[taxon_id] / n) + for taxon_id in set(human_counts) | set(model_counts) + ) + if expected_agree >= 1.0: + return None + return round((observed_agree - expected_agree) / (1 - expected_agree), 4) + + def _detections_prefetch(*, ordering: tuple[str, ...], with_source_image: bool) -> Prefetch: from ami.main.models import Classification, Detection @@ -204,8 +264,6 @@ def model_agreement_for_project( Bench against project 18 (43,149 occurrences, 45 verified): ~80ms cold. 
""" - import collections - from ami.main.models import BEST_IDENTIFICATION_ORDER, Identification, Taxon # Default filters can join Identification (verified_by_me) and Taxon @@ -275,6 +333,18 @@ def model_agreement_for_project( agreed_any_rank = agreed_exact + any_rank_disagreement_count agreed_coarser_rank = agreed_exact + coarser_rank_disagreement_count + # Extra stats over the same verified_rows already in memory — no extra query. + # Wilson 95% CI conveys how shaky each rate is at small n; Cohen's kappa + # (exact-taxon) discounts the agreement you'd get by chance. + exact_ci = wilson_interval(agreed_exact, verified_with_pred) + any_rank_ci = wilson_interval(agreed_any_rank, verified_with_pred) + both_present_pairs = [ + (r["best_user_taxon_id"], r["best_machine_prediction_taxon_id"]) + for r in verified_rows + if r["best_user_taxon_id"] is not None and r["best_machine_prediction_taxon_id"] is not None + ] + kappa = cohens_kappa(both_present_pairs) + def _pct(num: int, denom: int) -> float: return round(num / denom, 4) if denom else 0.0 @@ -286,8 +356,13 @@ def _pct(num: int, denom: int) -> float: "no_prediction_count": no_prediction, "agreed_exact_count": agreed_exact, "agreed_exact_pct": _pct(agreed_exact, verified_with_pred), + "agreed_exact_ci_low": exact_ci[0] if exact_ci else None, + "agreed_exact_ci_high": exact_ci[1] if exact_ci else None, "agreed_any_rank_count": agreed_any_rank, "agreed_any_rank_pct": _pct(agreed_any_rank, verified_with_pred), + "agreed_any_rank_ci_low": any_rank_ci[0] if any_rank_ci else None, + "agreed_any_rank_ci_high": any_rank_ci[1] if any_rank_ci else None, + "cohens_kappa": kappa, "agreement_coarsest_rank": coarsest_rank.name if coarsest_rank is not None else None, "agreed_coarser_rank_count": agreed_coarser_rank if coarsest_rank is not None else None, "agreed_coarser_rank_pct": ( diff --git a/ami/main/tests.py b/ami/main/tests.py index 4e031213d..f2a0b1c42 100644 --- a/ami/main/tests.py +++ b/ami/main/tests.py @@ -4800,6 
+4800,75 @@ def test_unknown_rank_excluded_from_lca(self): self.assertEqual(rank, TaxonRank.KINGDOM) +class TestWilsonInterval(TestCase): + """Pure-Python Wilson score confidence interval.""" + + def test_zero_total_returns_none(self): + from ami.main.models_future.occurrence import wilson_interval + + self.assertIsNone(wilson_interval(0, 0)) + + def test_known_value_8_of_10(self): + """Textbook Wilson 95% CI for 8/10 ≈ [0.490, 0.943].""" + from ami.main.models_future.occurrence import wilson_interval + + low, high = wilson_interval(8, 10) + self.assertAlmostEqual(low, 0.4902, places=3) + self.assertAlmostEqual(high, 0.9433, places=3) + + def test_bounds_stay_within_unit_interval(self): + """At p̂ = 1.0 the normal approximation would exceed 1; Wilson must not.""" + from ami.main.models_future.occurrence import wilson_interval + + low, high = wilson_interval(1, 1) + self.assertGreaterEqual(low, 0.0) + self.assertLessEqual(high, 1.0) + self.assertLess(low, high) + + def test_interval_tightens_as_n_grows(self): + from ami.main.models_future.occurrence import wilson_interval + + narrow = wilson_interval(90, 100) + wide = wilson_interval(9, 10) + self.assertLess(narrow[1] - narrow[0], wide[1] - wide[0]) + + +class TestCohensKappa(TestCase): + """Pure-Python Cohen's kappa over (human_taxon, model_taxon) pairs.""" + + def test_empty_returns_none(self): + from ami.main.models_future.occurrence import cohens_kappa + + self.assertIsNone(cohens_kappa([])) + + def test_single_category_is_undefined(self): + """Everyone picks the same taxon → expected agreement 1.0 → kappa undefined.""" + from ami.main.models_future.occurrence import cohens_kappa + + self.assertIsNone(cohens_kappa([(1, 1), (1, 1), (1, 1)])) + + def test_perfect_agreement_two_categories(self): + from ami.main.models_future.occurrence import cohens_kappa + + self.assertEqual(cohens_kappa([(1, 1), (2, 2)]), 1.0) + + def test_known_2x2_value(self): + """observed 0.75, expected 0.5 → kappa = 0.5. 
+ + pairs: 3× human=1, 1× human=2; model 1 twice, 2 twice; 3 of 4 match. + """ + from ami.main.models_future.occurrence import cohens_kappa + + self.assertEqual(cohens_kappa([(1, 1), (1, 1), (2, 2), (1, 2)]), 0.5) + + def test_can_be_negative(self): + """Systematic disagreement → worse than chance → negative kappa.""" + from ami.main.models_future.occurrence import cohens_kappa + + kappa = cohens_kappa([(1, 2), (2, 1), (1, 2), (2, 1)]) + self.assertLess(kappa, 0.0) + + class TestModelAgreementForProject(APITestCase): """Aggregation function over a filtered Occurrence queryset. @@ -5061,6 +5130,30 @@ def test_agreement_any_rank_bucket(self): self.assertEqual(body["agreed_exact_pct"], 0.0) self.assertEqual(body["agreed_any_rank_pct"], 1.0) + def test_agreement_ci_and_kappa_present(self): + """Wilson CI bounds bracket the rate; kappa is null for a single-category set.""" + occurrence = Occurrence.objects.filter(project=self.project).order_by("pk").first() + machine_taxon = occurrence.detections.first().classifications.first().taxon + Taxon.objects.update_all_parents() + Identification.objects.create(user=self.alice, occurrence=occurrence, taxon=machine_taxon) + + body = self.client.get(f"{self.agreement_url}?project_id={self.project.pk}").json() + self.assertIsNotNone(body["agreed_exact_ci_low"]) + self.assertIsNotNone(body["agreed_exact_ci_high"]) + self.assertLessEqual(body["agreed_exact_ci_low"], body["agreed_exact_pct"]) + self.assertGreaterEqual(body["agreed_exact_ci_high"], body["agreed_exact_pct"]) + # One verified occurrence, exact match → a single taxon category → kappa undefined. 
+ self.assertIsNone(body["cohens_kappa"]) + + def test_agreement_empty_ci_and_kappa_null(self): + """No verified occurrences → CI bounds and kappa are null, not zero.""" + body = self.client.get(f"{self.agreement_url}?project_id={self.project.pk}").json() + self.assertIsNone(body["agreed_exact_ci_low"]) + self.assertIsNone(body["agreed_exact_ci_high"]) + self.assertIsNone(body["agreed_any_rank_ci_low"]) + self.assertIsNone(body["agreed_any_rank_ci_high"]) + self.assertIsNone(body["cohens_kappa"]) + def test_agreement_coarsest_rank_invalid_returns_400(self): response = self.client.get( f"{self.agreement_url}?project_id={self.project.pk}&agreement_coarsest_rank=GARBAGE" From 336c1fe6a89c244528f8644e58f97dc738185eab Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Tue, 26 May 2026 12:50:28 -0700 Subject: [PATCH 17/18] refactor(stats): move wilson_interval + cohens_kappa to ami/utils/stats MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both are generic statistical helpers — they don't depend on Django or any domain model. Lifting them out of ami/main/models_future/occurrence.py so other endpoints/jobs that need binomial CIs or chance-corrected agreement can import them without dragging in the occurrence module. Same implementations, just relocated. Renamed parameter names on cohens_kappa from (human, model) to (rater_a, rater_b) so the helper reads as generic rather than human-vs-model specific. Tests already use isolated `from ami.utils.stats import …` imports (updated all 9 sites in ami/main/tests.py). 
Co-Authored-By: Claude --- ami/main/models_future/occurrence.py | 60 +------------------------ ami/main/tests.py | 18 ++++---- ami/utils/stats.py | 65 ++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 68 deletions(-) create mode 100644 ami/utils/stats.py diff --git a/ami/main/models_future/occurrence.py b/ami/main/models_future/occurrence.py index 3dffca92f..38c6b9935 100644 --- a/ami/main/models_future/occurrence.py +++ b/ami/main/models_future/occurrence.py @@ -11,12 +11,12 @@ from __future__ import annotations import collections -import math from typing import TYPE_CHECKING from django.db.models import Count, OuterRef, Prefetch, Q, QuerySet, Subquery from ami.main.models import Project, TaxonRank, User +from ami.utils.stats import cohens_kappa, wilson_interval if TYPE_CHECKING: from ami.main.models import Classification, Identification, Occurrence @@ -52,64 +52,6 @@ def lca_rank_between(a: TaxonTuple, b: TaxonTuple) -> TaxonRank | None: return deepest -# z-score for a 95% two-sided confidence interval (Wilson score). -WILSON_Z_95 = 1.959963984540054 - - -def wilson_interval(successes: int, total: int, z: float = WILSON_Z_95) -> tuple[float, float] | None: - """Wilson score confidence interval for a binomial proportion. - - Returns ``(low, high)`` bounded to ``[0, 1]`` (rounded to 4 dp), or - ``None`` when ``total`` is 0. Defaults to a 95% interval. - - The Wilson score interval is used instead of the normal approximation - because the verified set is often tiny (single-digit counts), where the - normal approximation produces bounds outside [0, 1] and understates the - uncertainty. Wilson stays well-behaved at small n and at proportions - near 0 or 1. 
- """ - if total <= 0: - return None - phat = successes / total - z2 = z * z - denom = 1 + z2 / total - center = (phat + z2 / (2 * total)) / denom - margin = (z / denom) * math.sqrt(phat * (1 - phat) / total + z2 / (4 * total * total)) - low = max(0.0, center - margin) - high = min(1.0, center + margin) - return (round(low, 4), round(high, 4)) - - -def cohens_kappa(pairs: list[tuple[int, int]]) -> float | None: - """Cohen's kappa for exact-taxon agreement between human and model. - - ``pairs`` is one ``(human_taxon_id, model_taxon_id)`` per occurrence that - both a human and the model assigned a taxon to. Returns kappa rounded to - 4 dp in ``[-1, 1]`` (negative = worse than chance), or ``None`` when - there are no pairs or expected agreement is 1.0 (kappa undefined — a - single category leaves no chance-agreement to correct for). - - Plain agreement rate rewards luck: in a project dominated by one common - species, human and model agree most of the time just by both naming the - common one. Kappa subtracts that chance agreement, so it answers "how - much better than guessing is the model" rather than "how often do they - happen to match". 
- """ - n = len(pairs) - if n == 0: - return None - observed_agree = sum(1 for h, m in pairs if h == m) / n - human_counts: collections.Counter = collections.Counter(h for h, _ in pairs) - model_counts: collections.Counter = collections.Counter(m for _, m in pairs) - expected_agree = sum( - (human_counts[taxon_id] / n) * (model_counts[taxon_id] / n) - for taxon_id in set(human_counts) | set(model_counts) - ) - if expected_agree >= 1.0: - return None - return round((observed_agree - expected_agree) / (1 - expected_agree), 4) - - def _detections_prefetch(*, ordering: tuple[str, ...], with_source_image: bool) -> Prefetch: from ami.main.models import Classification, Detection diff --git a/ami/main/tests.py b/ami/main/tests.py index f2a0b1c42..7c3ce90ab 100644 --- a/ami/main/tests.py +++ b/ami/main/tests.py @@ -4804,13 +4804,13 @@ class TestWilsonInterval(TestCase): """Pure-Python Wilson score confidence interval.""" def test_zero_total_returns_none(self): - from ami.main.models_future.occurrence import wilson_interval + from ami.utils.stats import wilson_interval self.assertIsNone(wilson_interval(0, 0)) def test_known_value_8_of_10(self): """Textbook Wilson 95% CI for 8/10 ≈ [0.490, 0.943].""" - from ami.main.models_future.occurrence import wilson_interval + from ami.utils.stats import wilson_interval low, high = wilson_interval(8, 10) self.assertAlmostEqual(low, 0.4902, places=3) @@ -4818,7 +4818,7 @@ def test_known_value_8_of_10(self): def test_bounds_stay_within_unit_interval(self): """At p̂ = 1.0 the normal approximation would exceed 1; Wilson must not.""" - from ami.main.models_future.occurrence import wilson_interval + from ami.utils.stats import wilson_interval low, high = wilson_interval(1, 1) self.assertGreaterEqual(low, 0.0) @@ -4826,7 +4826,7 @@ def test_bounds_stay_within_unit_interval(self): self.assertLess(low, high) def test_interval_tightens_as_n_grows(self): - from ami.main.models_future.occurrence import wilson_interval + from ami.utils.stats import 
wilson_interval narrow = wilson_interval(90, 100) wide = wilson_interval(9, 10) @@ -4837,18 +4837,18 @@ class TestCohensKappa(TestCase): """Pure-Python Cohen's kappa over (human_taxon, model_taxon) pairs.""" def test_empty_returns_none(self): - from ami.main.models_future.occurrence import cohens_kappa + from ami.utils.stats import cohens_kappa self.assertIsNone(cohens_kappa([])) def test_single_category_is_undefined(self): """Everyone picks the same taxon → expected agreement 1.0 → kappa undefined.""" - from ami.main.models_future.occurrence import cohens_kappa + from ami.utils.stats import cohens_kappa self.assertIsNone(cohens_kappa([(1, 1), (1, 1), (1, 1)])) def test_perfect_agreement_two_categories(self): - from ami.main.models_future.occurrence import cohens_kappa + from ami.utils.stats import cohens_kappa self.assertEqual(cohens_kappa([(1, 1), (2, 2)]), 1.0) @@ -4857,13 +4857,13 @@ def test_known_2x2_value(self): pairs: 3× human=1, 1× human=2; model 1 twice, 2 twice; 3 of 4 match. """ - from ami.main.models_future.occurrence import cohens_kappa + from ami.utils.stats import cohens_kappa self.assertEqual(cohens_kappa([(1, 1), (1, 1), (2, 2), (1, 2)]), 0.5) def test_can_be_negative(self): """Systematic disagreement → worse than chance → negative kappa.""" - from ami.main.models_future.occurrence import cohens_kappa + from ami.utils.stats import cohens_kappa kappa = cohens_kappa([(1, 2), (2, 1), (1, 2), (2, 1)]) self.assertLess(kappa, 0.0) diff --git a/ami/utils/stats.py b/ami/utils/stats.py new file mode 100644 index 000000000..cda1c9a09 --- /dev/null +++ b/ami/utils/stats.py @@ -0,0 +1,65 @@ +"""Generic statistical helpers reusable across apps. + +Kept independent of Django and any domain models so they can be unit-tested +in isolation and reused by other endpoints/jobs that need to express +uncertainty (Wilson CI) or correct an agreement rate for chance (kappa). 
+""" + +from __future__ import annotations + +import collections +import math + +# z-score for a 95% two-sided confidence interval (Wilson score). +WILSON_Z_95 = 1.959963984540054 + + +def wilson_interval(successes: int, total: int, z: float = WILSON_Z_95) -> tuple[float, float] | None: + """Wilson score confidence interval for a binomial proportion. + + Returns ``(low, high)`` bounded to ``[0, 1]`` (rounded to 4 dp), or + ``None`` when ``total`` is 0. Defaults to a 95% interval. + + The Wilson score interval is used instead of the normal approximation + because the verified set is often tiny (single-digit counts), where the + normal approximation produces bounds outside [0, 1] and understates the + uncertainty. Wilson stays well-behaved at small n and at proportions + near 0 or 1. + """ + if total <= 0: + return None + phat = successes / total + z2 = z * z + denom = 1 + z2 / total + center = (phat + z2 / (2 * total)) / denom + margin = (z / denom) * math.sqrt(phat * (1 - phat) / total + z2 / (4 * total * total)) + low = max(0.0, center - margin) + high = min(1.0, center + margin) + return (round(low, 4), round(high, 4)) + + +def cohens_kappa(pairs: list[tuple[int, int]]) -> float | None: + """Cohen's kappa for exact agreement between two raters. + + ``pairs`` is one ``(rater_a, rater_b)`` per item that both raters + classified. Returns kappa rounded to 4 dp in ``[-1, 1]`` (negative = + worse than chance), or ``None`` when there are no pairs or expected + agreement is 1.0 (kappa undefined — a single category leaves no + chance-agreement to correct for). + + Plain agreement rate rewards luck: in a project dominated by one common + category, both raters agree most of the time just by both naming the + common one. Kappa subtracts that chance agreement, so it answers "how + much better than guessing do they agree" rather than "how often do they + happen to match". 
+ """ + n = len(pairs) + if n == 0: + return None + observed_agree = sum(1 for a, b in pairs if a == b) / n + a_counts: collections.Counter = collections.Counter(a for a, _ in pairs) + b_counts: collections.Counter = collections.Counter(b for _, b in pairs) + expected_agree = sum((a_counts[key] / n) * (b_counts[key] / n) for key in set(a_counts) | set(b_counts)) + if expected_agree >= 1.0: + return None + return round((observed_agree - expected_agree) / (1 - expected_agree), 4) From 3d522dbf16c2ce4d1ccb7c91c711108c7eda4398 Mon Sep 17 00:00:00 2001 From: "coderabbitai[bot]" <136622811+coderabbitai[bot]@users.noreply.github.com> Date: Thu, 28 May 2026 00:18:34 +0000 Subject: [PATCH 18/18] =?UTF-8?q?=F0=9F=93=9D=20CodeRabbit=20Chat:=20Imple?= =?UTF-8?q?ment=20requested=20code=20changes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ami/base/serializers.py | 28 ++++++++++++++++++++++++++++ ami/main/api/views.py | 23 +++++++---------------- 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/ami/base/serializers.py b/ami/base/serializers.py index e39ede52b..66cf31e36 100644 --- a/ami/base/serializers.py +++ b/ami/base/serializers.py @@ -165,6 +165,34 @@ def __init__( self.fields[param_name] = field +class EnumChoiceField(serializers.ChoiceField): + """A ``ChoiceField`` backed by a Python ``enum.Enum`` class. + + Accepts enum member *names* (case-insensitive) and returns the + corresponding enum member on ``to_internal_value``. Use the + ``exclude`` parameter to reject specific members (e.g. sentinel + values like ``UNKNOWN``) with a standard 400 invalid-choice error. 
+ + Example:: + + field = EnumChoiceField(TaxonRank, exclude=[TaxonRank.UNKNOWN], required=False, default=None) + rank = SingleParamSerializer[TaxonRank | None].clean("agreement_coarsest_rank", field, request.query_params) + """ + + def __init__(self, enum_class: type, exclude: list | None = None, **kwargs: typing.Any) -> None: + self._enum_class = enum_class + excluded = set(exclude or []) + choices = [m.name for m in enum_class if m not in excluded] + kwargs.setdefault("choices", choices) + super().__init__(**kwargs) + + def to_internal_value(self, data: typing.Any) -> typing.Any: + normalized = str(data).upper() + if normalized not in self.choices: + self.fail("invalid_choice", input=data) + return self._enum_class[normalized] + + class FilterParamsSerializer(serializers.Serializer): """ Serializer for validating query parameters in DRF views. diff --git a/ami/main/api/views.py b/ami/main/api/views.py index b27cf48c1..15fe77697 100644 --- a/ami/main/api/views.py +++ b/ami/main/api/views.py @@ -27,7 +27,7 @@ from ami.base.models import BaseQuerySet from ami.base.pagination import LimitOffsetPaginationWithPermissions from ami.base.permissions import IsActiveStaffOrReadOnly, IsProjectMemberOrReadOnly, ObjectPermission -from ami.base.serializers import FilterParamsSerializer, SingleParamSerializer +from ami.base.serializers import EnumChoiceField, FilterParamsSerializer, SingleParamSerializer from ami.base.views import ProjectMixin from ami.main.api.schemas import limit_doc_param, project_id_doc_param from ami.main.api.serializers import TagSerializer @@ -1386,24 +1386,15 @@ def model_agreement(self, request): if not Project.objects.visible_for_user(request.user).filter(pk=project.pk).exists(): raise NotFound("Project not found.") - coarsest_rank_param = request.query_params.get("agreement_coarsest_rank") - coarsest_rank = None - if coarsest_rank_param: - try: - coarsest_rank = TaxonRank[coarsest_rank_param.upper()] - except KeyError: - valid = ", ".join(r.name for 
r in TaxonRank if r.name != "UNKNOWN") - raise api_exceptions.ValidationError( - {"agreement_coarsest_rank": f"Invalid rank '{coarsest_rank_param}'. Must be one of: {valid}."} - ) - if coarsest_rank == TaxonRank.UNKNOWN: - raise api_exceptions.ValidationError( - {"agreement_coarsest_rank": "UNKNOWN is not a valid threshold rank."} - ) + coarsest_rank = SingleParamSerializer[TaxonRank | None].clean( + "agreement_coarsest_rank", + EnumChoiceField(TaxonRank, exclude=[TaxonRank.UNKNOWN], required=False, allow_null=True, default=None), + request.query_params, + ) base_qs = Occurrence.objects.filter(project=project).valid().apply_default_filters(project, request) filtered_qs = self.filter_queryset(base_qs) - payload = model_agreement_for_project(filtered_qs, coarsest_rank=coarsest_rank) + payload["project_id"] = project.pk return Response(ModelAgreementSerializer(payload, context={"request": request}).data)