From ea9f45da9110744d2a9d4d3df4caa34c3f79896f Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Thu, 21 May 2026 21:46:02 -0700 Subject: [PATCH 01/16] feat(taxa): per-taxon verification + agreement counts and verified filter Adds to GET /api/v2/taxa/ (issue #1316): - verified_count and agreed_with_prediction_count annotations (always on), rolled up over descendant occurrences via a hierarchical parents_json match. - agreed_exact_count, gated behind with_agreement=true (and always on the detail view). - verified=true|false filter (EXISTS / strict complement), project-scoped and respecting apply_default_filters. - verified_count added to ordering_fields. The hierarchical descendant match uses a Postgres jsonb @> containment built from an OuterRef (literal __contains can't embed an OuterRef). Migration 0085 adds the supporting GIN index on Taxon.parents_json (jsonb_path_ops). Frontend: sortable "Verified" column + "Verification status" filter on the taxa list, and a Verification panel on the taxon detail page. Co-Authored-By: Claude --- ami/main/api/serializers.py | 20 +++ ami/main/api/views.py | 123 +++++++++++++ .../0085_taxon_parents_json_gin_index.py | 31 ++++ ami/main/models.py | 12 ++ ami/main/tests.py | 129 ++++++++++++++ ...05-20-taxa-verification-guidance-ticket.md | 165 ++++++++++++++++++ ui/src/data-services/models/species.ts | 13 ++ .../pages/species-details/species-details.tsx | 19 ++ ui/src/pages/species/species-columns.tsx | 18 ++ ui/src/pages/species/species.tsx | 2 + ui/src/utils/getAppRoute.ts | 1 + 11 files changed, 533 insertions(+) create mode 100644 ami/main/migrations/0085_taxon_parents_json_gin_index.py create mode 100644 docs/claude/planning/2026-05-20-taxa-verification-guidance-ticket.md diff --git a/ami/main/api/serializers.py b/ami/main/api/serializers.py index a34e7d561..db6aa48bf 100644 --- a/ami/main/api/serializers.py +++ b/ami/main/api/serializers.py @@ -588,6 +588,14 @@ def get_taxa(self, obj): return [{"id": taxon.id, "name": taxon.name} for taxon in obj.taxa.all()] +def agreement_requested(request: Request | None) -> bool: + """Whether ``with_agreement=true`` is set, gating the heavier agreed_exact_count.""" + if request is None: + return False + value = request.query_params.get("with_agreement", "") + return str(value).lower() in ("true", "1", "yes", "on") + + class TaxonListSerializer(DefaultSerializer): # latest_detection = DetectionNestedSerializer(read_only=True) occurrences = serializers.SerializerMethodField() @@ -595,6 +603,12 @@ class TaxonListSerializer(DefaultSerializer): parent_id = serializers.PrimaryKeyRelatedField(queryset=Taxon.objects.all(), source="parent") tags = serializers.SerializerMethodField() + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # agreed_exact_count is a gated annotation: omit it unless with_agreement=true. + if not agreement_requested(self.context.get("request")): + self.fields.pop("agreed_exact_count", None) + def get_tags(self, obj): tag_list = getattr(obj, "prefetched_tags", []) return TagSerializer(tag_list, many=True, context=self.context).data @@ -609,6 +623,9 @@ class Meta: "parents", "details", "occurrences_count", + "verified_count", + "agreed_with_prediction_count", + "agreed_exact_count", "occurrences", "tags", "last_detected", @@ -886,6 +903,9 @@ class Meta: "parents", "details", "occurrences_count", + "verified_count", + "agreed_with_prediction_count", + "agreed_exact_count", "events_count", "occurrences", "gbif_taxon_key", diff --git a/ami/main/api/views.py b/ami/main/api/views.py index 5a6d5aece..a36eb259f 100644 --- a/ami/main/api/views.py +++ b/ami/main/api/views.py @@ -37,6 +37,8 @@ from ami.utils.storages import ConnectionTestResult from ..models import ( + BEST_IDENTIFICATION_ORDER, + BEST_MACHINE_PREDICTION_ORDER, NULL_DETECTIONS_FILTER, Classification, Deployment, @@ -1400,6 +1402,19 @@ def filter_queryset(self, request, queryset, view): return queryset.distinct() +class JSONBContains(models.Func): + """Postgres ``@>`` containment rendered as a boolean expression. + + Needed for correlated subqueries where the right-hand side is built from an + ``OuterRef`` — the literal ``parents_json__contains=[{"id": ...}]`` lookup + can't embed an ``OuterRef`` (it would try to JSON-serialize the expression). + """ + + arg_joiner = " @> " + template = "(%(expressions)s)" + output_field = models.BooleanField() + + class TaxonViewSet(DefaultViewSet, ProjectMixin): """ API endpoint that allows taxa to be viewed or edited. @@ -1428,6 +1443,7 @@ class TaxonViewSet(DefaultViewSet, ProjectMixin): "created_at", "updated_at", "occurrences_count", + "verified_count", "last_detected", "best_determination_score", "name", @@ -1654,6 +1670,113 @@ def get_taxa_observed( # Efficient EXISTS check that uses the composite index qs = qs.filter(models.Exists(Occurrence.objects.filter(base_filter))) + qs = self.add_verification_data(qs, occurrence_filters, default_filters_q) + + return qs + + def _occurrences_under_taxon(self, occurrence_filters: models.Q, default_filters_q: models.Q) -> QuerySet: + """ + Correlated Occurrence queryset matching occurrences whose determination is the + outer Taxon (``OuterRef("id")``) or any of its descendants, project-scoped and + default-filtered. + + Mirrors the hierarchical match used by the occurrence-list ``taxon=`` filter + (``CustomOccurrenceDeterminationFilter``), but the descendant test is built with + an ``OuterRef`` right-hand side so a Family/Order row aggregates all its + descendant species' occurrences. + """ + descendant_match = JSONBContains( + models.F("determination__parents_json"), + models.Func( + models.Func( + models.Value("id"), + models.OuterRef("id"), + function="jsonb_build_object", + ), + function="jsonb_build_array", + output_field=models.JSONField(), + ), + ) + return ( + Occurrence.objects.filter(occurrence_filters) + .filter(default_filters_q) + .alias(_under_taxon=descendant_match) + .filter(models.Q(determination_id=models.OuterRef("id")) | models.Q(_under_taxon=True)) + ) + + def _include_agreement(self) -> bool: + """Whether the heavier ``agreed_exact_count`` annotation should be computed.""" + if self.action == "retrieve": + return True + return bool(BooleanField(required=False).clean(self.request.query_params.get("with_agreement"))) + + def add_verification_data( + self, qs: QuerySet, occurrence_filters: models.Q, default_filters_q: models.Q + ) -> QuerySet: + """ + Annotate per-taxon verification and human/model agreement counts, and apply the + ``verified=true|false`` filter on list responses. + + All counts roll up descendant occurrences via ``_occurrences_under_taxon`` and + respect the project's default filters (same ``apply_defaults`` handling as + ``occurrences_count``). + """ + under_taxon = self._occurrences_under_taxon(occurrence_filters, default_filters_q) + + has_identification = models.Exists( + Identification.objects.filter(occurrence=models.OuterRef("pk"), withdrawn=False) + ) + verified_occurrences = under_taxon.filter(has_identification) + + def correlated_count(occurrence_qs: QuerySet) -> Coalesce: + # Group by project_id (constant within the subquery) to collapse the + # hierarchical match — determination_id varies across descendants so it + # can't be the grouping key. + return Coalesce( + models.Subquery( + occurrence_qs.values("project_id").annotate(c=models.Count("id")).values("c")[:1], + output_field=models.IntegerField(), + ), + 0, + ) + + # The chosen (best, non-withdrawn) identification's agreed_with_prediction FK is set. + best_identification_agreed_prediction = models.Subquery( + Identification.objects.filter(occurrence=models.OuterRef("pk"), withdrawn=False) + .order_by(*BEST_IDENTIFICATION_ORDER) + .values("agreed_with_prediction_id")[:1] + ) + agreed_with_prediction_occurrences = under_taxon.annotate( + _best_agreed_prediction=best_identification_agreed_prediction + ).filter(_best_agreed_prediction__isnull=False) + + qs = qs.annotate( + verified_count=correlated_count(verified_occurrences), + agreed_with_prediction_count=correlated_count(agreed_with_prediction_occurrences), + ) + + if self._include_agreement(): + # Verified occurrence where the user determination equals the top machine + # prediction's taxon for the same occurrence. + best_machine_taxon = models.Subquery( + Classification.objects.filter(detection__occurrence=models.OuterRef("pk")) + .order_by(*BEST_MACHINE_PREDICTION_ORDER) + .values("taxon_id")[:1] + ) + agreed_exact_occurrences = verified_occurrences.annotate(_best_machine_taxon=best_machine_taxon).filter( + determination_id=models.F("_best_machine_taxon") + ) + qs = qs.annotate(agreed_exact_count=correlated_count(agreed_exact_occurrences)) + + # verified=true|false filter (list only); the complement uses the same set, so + # verified=false is the strict complement of verified=true on the filtered taxa. + if self.action == "list" and "verified" in self.request.query_params: + verified = BooleanField(required=False).clean(self.request.query_params.get("verified")) + if verified: + qs = qs.filter(models.Exists(verified_occurrences)) + else: + qs = qs.filter(~models.Exists(verified_occurrences)) + return qs def attach_tags_by_project(self, qs: QuerySet, project: Project) -> QuerySet: diff --git a/ami/main/migrations/0085_taxon_parents_json_gin_index.py b/ami/main/migrations/0085_taxon_parents_json_gin_index.py new file mode 100644 index 000000000..94b65ff2a --- /dev/null +++ b/ami/main/migrations/0085_taxon_parents_json_gin_index.py @@ -0,0 +1,31 @@ +from django.db import migrations + + +class Migration(migrations.Migration): + """ + GIN index on Taxon.parents_json to support hierarchical (descendant) rollup + of the per-taxon verification / agreement counts added for issue #1316. + + Without it, Family- and Order-rank rows on large projects fall back to a + seq-scan on the parents_json containment (`@>`) test and dominate query time. + + CREATE INDEX CONCURRENTLY can't run inside a transaction, so this migration + is non-atomic. IF NOT EXISTS keeps it safe to co-exist with the same index if + it lands separately via the #1307 follow-up. + """ + + atomic = False + + dependencies = [ + ("main", "0084_revoke_delete_job_from_roles"), + ] + + operations = [ + migrations.RunSQL( + sql=( + "CREATE INDEX CONCURRENTLY IF NOT EXISTS main_taxon_parents_json_gin_idx " + "ON main_taxon USING gin (parents_json jsonb_path_ops);" + ), + reverse_sql="DROP INDEX CONCURRENTLY IF EXISTS main_taxon_parents_json_gin_idx;", + ), + ] diff --git a/ami/main/models.py b/ami/main/models.py index b30b4e645..d772d2ca7 100644 --- a/ami/main/models.py +++ b/ami/main/models.py @@ -3811,6 +3811,18 @@ def best_determination_score(self) -> float | None: # This is handled by an annotation if we are filtering by project, deployment or event return None + def verified_count(self) -> int | None: + # Handled by an annotation when filtering by project (TaxonViewSet.add_verification_data) + return None + + def agreed_with_prediction_count(self) -> int | None: + # Handled by an annotation when filtering by project (TaxonViewSet.add_verification_data) + return None + + def agreed_exact_count(self) -> int | None: + # Handled by an annotation only when with_agreement is requested or on the detail view + return None + def occurrence_images(self, limit: int | None = 10) -> list[str]: # This is handled by an annotation if we are filtering by project, deployment or event return [] diff --git a/ami/main/tests.py b/ami/main/tests.py index 517c4c87b..d419d7acc 100644 --- a/ami/main/tests.py +++ b/ami/main/tests.py @@ -4761,3 +4761,132 @@ def test_registration_order_preserves_occurrence_retrieve(self): retrieve_response = self.client.get(f"/api/v2/occurrences/{occurrence.pk}/?project_id={self.project.pk}") self.assertEqual(stats_response.status_code, 200, "stats URL must resolve") self.assertEqual(retrieve_response.status_code, 200, "occurrence retrieve must still work") + + +class TestTaxaVerification(APITestCase): + """Per-taxon verification + human/model agreement annotations and the verified filter (#1316).""" + + def setUp(self): + self.project, self.deployment = setup_test_project(reuse=False) + self.taxa_list = create_taxa(self.project) + self.order = Taxon.objects.get(name="Lepidoptera") + self.family = Taxon.objects.get(name="Nymphalidae") + self.genus = Taxon.objects.get(name="Vanessa") + self.cardui = Taxon.objects.get(name="Vanessa cardui") + self.atalanta = Taxon.objects.get(name="Vanessa atalanta") + self.itea = Taxon.objects.get(name="Vanessa itea") + + create_captures(deployment=self.deployment, num_nights=1, images_per_night=3) + # 3 occurrences ML-determined to cardui, 1 to itea (left unverified) + create_occurrences(deployment=self.deployment, num=3, taxon=self.cardui, determination_score=0.9) + create_occurrences(deployment=self.deployment, num=1, taxon=self.itea, determination_score=0.9) + + self.user = User.objects.create_user(email="verifier@insectai.org", is_staff=True, is_superuser=True) + self.client.force_authenticate(user=self.user) + + cardui_occ = list(Occurrence.objects.filter(project=self.project, determination=self.cardui).order_by("pk")) + self.assertEqual(len(cardui_occ), 3) + self.occ_pred, self.occ_exact, self.occ_disagree = cardui_occ + + # occ_pred: user agrees with the model prediction (cardui), agreed_with_prediction set + Identification.objects.create( + occurrence=self.occ_pred, + taxon=self.cardui, + user=self.user, + agreed_with_prediction=self.occ_pred.best_prediction, + ) + # occ_exact: same taxon as the model, but not via the "agree" workflow + Identification.objects.create(occurrence=self.occ_exact, taxon=self.cardui, user=self.user) + # occ_disagree: user overrides to a different taxon (atalanta) than the model (cardui) + Identification.objects.create(occurrence=self.occ_disagree, taxon=self.atalanta, user=self.user) + + self.itea_occ = Occurrence.objects.get(project=self.project, determination=self.itea) + self.list_url = f"/api/v2/taxa/?project_id={self.project.pk}&limit=1000" + + def _detail(self, taxon): + res = self.client.get(f"/api/v2/taxa/{taxon.pk}/?project_id={self.project.pk}") + self.assertEqual(res.status_code, status.HTTP_200_OK) + return res.json() + + def _list_by_name(self, url=None): + res = self.client.get(url or self.list_url) + self.assertEqual(res.status_code, status.HTTP_200_OK) + return {row["name"]: row for row in res.json()["results"]} + + # --- verified_count (hierarchical rollup) --- + + def test_verified_count_species(self): + self.assertEqual(self._detail(self.cardui)["verified_count"], 2) + self.assertEqual(self._detail(self.atalanta)["verified_count"], 1) + self.assertEqual(self._detail(self.itea)["verified_count"], 0) + + def test_verified_count_rolls_up_to_ancestors(self): + # Verifying species marks genus/family/order verified, occurrence-weighted by descendants. + for ancestor in (self.genus, self.family, self.order): + self.assertEqual(self._detail(ancestor)["verified_count"], 3, ancestor.name) + + # --- agreed_with_prediction_count (chosen identification only) --- + + def test_agreed_with_prediction_counts_only_chosen_identification(self): + self.assertEqual(self._detail(self.cardui)["agreed_with_prediction_count"], 1) + self.assertEqual(self._detail(self.atalanta)["agreed_with_prediction_count"], 0) + # Rolls up: only occ_pred contributes under the genus. + self.assertEqual(self._detail(self.genus)["agreed_with_prediction_count"], 1) + + # --- agreed_exact_count (gated) --- + + def test_agreed_exact_count_on_detail(self): + # occ_pred + occ_exact: user determination == top machine prediction (cardui). + self.assertEqual(self._detail(self.cardui)["agreed_exact_count"], 2) + # occ_disagree: user picked atalanta, model said cardui → not exact. + self.assertEqual(self._detail(self.atalanta)["agreed_exact_count"], 0) + self.assertEqual(self._detail(self.genus)["agreed_exact_count"], 2) + + def test_agreed_exact_count_gated_on_list(self): + rows = self._list_by_name() + self.assertIn("verified_count", rows["Vanessa cardui"]) + self.assertIn("agreed_with_prediction_count", rows["Vanessa cardui"]) + self.assertNotIn("agreed_exact_count", rows["Vanessa cardui"]) + + rows = self._list_by_name(self.list_url + "&with_agreement=true") + self.assertIn("agreed_exact_count", rows["Vanessa cardui"]) + self.assertEqual(rows["Vanessa cardui"]["agreed_exact_count"], 2) + + # --- list field values --- + + def test_list_field_values(self): + rows = self._list_by_name() + self.assertEqual(rows["Vanessa cardui"]["occurrences_count"], 2) + self.assertEqual(rows["Vanessa cardui"]["verified_count"], 2) + self.assertEqual(rows["Vanessa cardui"]["agreed_with_prediction_count"], 1) + self.assertEqual(rows["Vanessa atalanta"]["verified_count"], 1) + self.assertEqual(rows["Vanessa itea"]["verified_count"], 0) + + # --- verified=true|false filter --- + + def test_verified_filter_true_false_complement(self): + all_names = set(self._list_by_name().keys()) + verified = set(self._list_by_name(self.list_url + "&verified=true").keys()) + unverified = set(self._list_by_name(self.list_url + "&verified=false").keys()) + self.assertEqual(verified, {"Vanessa cardui", "Vanessa atalanta"}) + self.assertEqual(unverified, {"Vanessa itea"}) + # verified=false is the strict complement of verified=true on the filtered set. + self.assertEqual(verified | unverified, all_names) + self.assertEqual(verified & unverified, set()) + + def test_ordering_by_verified_count(self): + res = self.client.get(self.list_url + "&ordering=verified_count") + self.assertEqual(res.status_code, status.HTTP_200_OK) + counts = [row["verified_count"] for row in res.json()["results"]] + self.assertEqual(counts, sorted(counts)) + + # --- apply_defaults handling --- + + def test_verified_filter_respects_apply_defaults(self): + self.project.default_filters_exclude_taxa.add(self.atalanta) + + verified_default = set(self._list_by_name(self.list_url + "&verified=true").keys()) + self.assertEqual(verified_default, {"Vanessa cardui"}) + + verified_bypassed = set(self._list_by_name(self.list_url + "&verified=true&apply_defaults=false").keys()) + self.assertEqual(verified_bypassed, {"Vanessa cardui", "Vanessa atalanta"}) diff --git a/docs/claude/planning/2026-05-20-taxa-verification-guidance-ticket.md b/docs/claude/planning/2026-05-20-taxa-verification-guidance-ticket.md new file mode 100644 index 000000000..5d2fc9747 --- /dev/null +++ b/docs/claude/planning/2026-05-20-taxa-verification-guidance-ticket.md @@ -0,0 +1,165 @@ +# Add guidance and stats about which taxa need verification + +## Motivation + +To get a meaningful project-wide picture of model accuracy and data quality, users should verify at least one occurrence of every unique taxon their pipelines have apparently observed. Today there is no surface that tells them *which* taxa still need attention or how many they have already verified — they have to drill into the occurrence list per taxon and check by hand. + +This ticket adds per-taxon verification and agreement data to the existing taxa list endpoint, plus the matching UI controls, so users can sort and filter to find the taxa that most need their attention. It is part of this year's proactive-surfacing goal and is the natural next step after [#1296](https://github.com/RolnickLab/antenna/pull/1296) (project summary) and [#1307](https://github.com/RolnickLab/antenna/pull/1307) (dataset-wide model agreement endpoint). + +## Scope + +Backend annotations on `GET /api/v2/taxa/` plus a new filter, and a new column + filter in the taxa list table. The dataset-wide `/occurrences/stats/model-agreement/` endpoint from #1307 is unchanged — it stays as the aggregate view; this ticket adds the per-taxon breakdown. + +### Out of scope (queued separately) + +- "Needs verification" badge / status pill on rows. +- Project-summary widget with `X of Y unique taxa verified`. +- Dedicated unverified-taxa queue page. +- Backfilling counts onto a denormalized `Taxon` field. +- Macro-average rollup (occurrence-weighted descendant sum is the only rollup in this ticket). + +## Backend + +### Filter + +- `verified=true|false` on `TaxonViewSet` — matches taxa with at least one non-withdrawn `Identification` on an occurrence whose `determination` is the taxon itself **or any descendant** (via `parents_json__contains`). Implemented as an `EXISTS` subquery, project-scoped, respects `apply_default_filters`. + +### Always-on annotations (cheap) + +Both reuse the existing hierarchical-match pattern (`parents_json__contains [{id: OuterRef("id")}] OR determination_id = OuterRef("id")`) used by the `taxon=` filter in the occurrence list, so a Family row aggregates all its descendant species' occurrences — occurrence-weighted by construction. + +- `verified_count` — count of occurrences under the taxon (incl. descendants) with at least one non-withdrawn `Identification`. Sortable. Single correlated subquery per row. +- `agreed_with_prediction_count` — count of verified occurrences whose chosen `Identification.agreed_with_prediction` is non-null. No join through `Classification` needed — just a non-null FK check. Different signal from `agreed_exact_count` below: this measures the *agree-with-model workflow* (user clicked the "agree" button on a prediction), not independent-match accuracy. + +### Gated annotation (heavier) + +- `agreed_exact_count` — count of verified occurrences where `occurrence.determination_id` equals the top machine `Classification.taxon_id` for the same occurrence. Surfaced only when `with_agreement=true` is on the request. Cost: two correlated subqueries per row (verified set + best classification per occurrence). Needs benchmarking on P#85 (13k verified) before this can default on. **NOT** included in the default list response. + +`occurrence.determination` is already maintained as the top non-withdrawn user identification's taxon (`update_occurrence_determination` runs on every `Identification.save`, see `ami/main/models.py:2528, 3383-3393`), so we do not need a correlated subquery over `Identification` to find the best human identification — just read `determination_id` directly. This is what makes `agreed_exact_count` only two subqueries instead of three. + +### Dropped vs PR #1307 + +- `agreed_under_order_count` is not added per-taxon. The under-order LCA bucket from `/occurrences/stats/model-agreement/` stays available at the dataset level; per-taxon it's redundant since each row already represents a single taxon. + +### Detail view + +`GET /api/v2/taxa//` should include all four fields above unconditionally — single-row cost is negligible. + +### Performance prerequisites + +- The hierarchical match uses `Taxon.parents_json` containment. Without a GIN index on that column, Family- and Order-rank rows on large projects (P#85 has 13k verified, P#20 has 41k occurrences) will fall back to seq-scan and dominate query time. **This index is already flagged as a follow-up to #1307**: + + ```sql + CREATE INDEX CONCURRENTLY main_taxon_parents_json_gin_idx + ON main_taxon USING gin (parents_json jsonb_path_ops); + ``` + + Treat shipping the GIN index as a hard blocker for enabling recursive rollup correctness at higher ranks. Without it, this ticket is safe to ship for projects with shallow taxa lists (species-only) but will be slow elsewhere. + +- The composite-index follow-up from #1307 (`main_occurrence (project_id, determination_score)`) is also relevant — `verified_count` filters by project + verified flag and benefits from the same indexed path. + +### Cost benchmarks to run before merge + +| Query | Project | Expected | Acceptance | +|---|---|---|---| +| `/taxa/?project_id=18&verified=true` | P#18 (45 verified) | < 200ms warm | ≤ 1.5× current `/taxa/` p99 | +| `/taxa/?project_id=85&verified=false` | P#85 (13k verified) | < 500ms warm | ≤ 2× current p99 | +| `/taxa/?project_id=85&with_agreement=true` | P#85 | < 1.5s warm | < 5s cold | +| `/taxa/?project_id=85&ordering=verified_count` | P#85 | < 1s warm | doesn't fall off cliff | + +If `with_agreement=true` exceeds the cold budget on P#85, fall back to keeping `agreed_exact_count` on the detail view only and add a `/taxa/stats/verification/` aggregate endpoint mirroring the #1307 pattern instead. + +## Frontend + +### Taxa list page (`/projects//taxa`) + +- New sortable column **Verified** showing `verified_count` per row. Default ordering unchanged; user can click the column to sort asc (least-verified first → matches the proactive-surfacing intent). +- New filter pill **Verification status**: `All` (default) / `Verified` / `Unverified`. Wires to the `verified=` query param. +- Existing `Occurrences` column stays as the primary count signal; `Verified` sits next to it so the ratio is visually obvious. + +### Not in this ticket + +- No new column for `agreed_with_prediction_count` or `agreed_exact_count` in the table by default. These are surfaced on the **taxon detail page** only (existing detail page; add a small "Verification" panel showing the four numbers). If the table eventually grows a "Model accuracy" toggle, it would flip `with_agreement=true` on — design that in a follow-up. + +## API contract examples + +```bash +# Verified taxa only, project default filters applied +curl '.../api/v2/taxa/?project_id=18&verified=true' + +# Unverified taxa, sorted by occurrence count desc — the "biggest gaps" view +curl '.../api/v2/taxa/?project_id=18&verified=false&ordering=-occurrences_count' + +# Sort by which taxa have the most human verification +curl '.../api/v2/taxa/?project_id=18&ordering=-verified_count' + +# Enable the heavier agreed_exact_count on a list response +curl '.../api/v2/taxa/?project_id=18&with_agreement=true' + +# Detail view always includes all four +curl '.../api/v2/taxa/567/?project_id=18' +``` + +### Response shape (list) + +```json +{ + "id": 567, + "name": "Hyalophora cecropia", + "rank": "SPECIES", + "occurrences_count": 124, + "verified_count": 3, + "agreed_with_prediction_count": 2, + "best_determination_score": 0.94, + "last_detected": "2025-08-12T03:14:22" +} +``` + +With `with_agreement=true`: + +```json +{ + "...": "...", + "verified_count": 3, + "agreed_with_prediction_count": 2, + "agreed_exact_count": 2 +} +``` + +## Test plan + +Backend: + +- [ ] Unit test on `Taxon` queryset: `verified=true` returns only taxa with non-withdrawn identifications, respecting hierarchical match (verifying a species also marks its genus/family as verified at higher-rank rows). +- [ ] Unit test on `Taxon` queryset: `verified=false` is the strict complement on the project's filtered taxa set. +- [ ] Unit test: `verified_count` equals number of verified occurrences under the taxon (descendants included). +- [ ] Unit test: `agreed_with_prediction_count` only counts the chosen identification's `agreed_with_prediction`, not all identifications on the occurrence. +- [ ] Unit test: `agreed_exact_count` reads `occurrence.determination_id` for the user side and top-score `Classification.taxon_id` for the model side, and is only populated when `with_agreement=true`. +- [ ] HTTP test: list endpoint shape includes new fields; gated field absent unless flag is set. +- [ ] HTTP test: `verified=` filter behaves correctly under `apply_defaults=true|false`. +- [ ] Bench: queries above hit acceptance thresholds. + +Frontend: + +- [ ] Verified column renders, sorts asc and desc. +- [ ] Filter pill updates URL, persists across reload, clears with the rest of the project filter state. +- [ ] Detail page Verification panel renders all four fields. + +## Follow-ups (not in this ticket) + +- `Taxon.parents_json` GIN index (carries over from #1307 — gating dependency for rollup correctness at higher ranks). +- `main_occurrence (project_id, determination_score)` composite index (also from #1307). +- Project-summary "X of Y unique taxa verified" widget on the overview page. +- "Needs verification" status pill on taxa rows once we know what the threshold should be (`verified_count == 0` is the obvious v1). +- Dedicated unverified-taxa queue view (pre-filtered, ranked by occurrence count desc). +- Macro-averaged agreement rollup at higher ranks (alternative to the occurrence-weighted sum this ticket ships). +- A `with_counts` / `with_agreement` query-param convention audit across the API — we already have similar gated-annotation patterns elsewhere; document a single convention. + +## References + +- PR #1307 — dataset-wide `/occurrences/stats/model-agreement/` endpoint, established the LCA + agreement-bucket compute that this ticket reuses per-taxon. Includes the GIN-index and composite-index follow-ups this ticket inherits. +- PR #1296 — project summary view, the surfacing target this work feeds into. +- `ami/main/api/views.py:1403` — `TaxonViewSet`, where the new annotations and filter land. +- `ami/main/api/views.py:1576` — `get_taxa_observed`, the existing helper that already wires `parents_json`-aware subqueries; pattern for adding the new ones. +- `ami/main/models.py:2440` — `Identification` model (`agreed_with_prediction` FK). +- `ami/main/models.py:3383` — `update_occurrence_determination`, which keeps `Occurrence.determination` aligned with the top non-withdrawn user identification. diff --git a/ui/src/data-services/models/species.ts b/ui/src/data-services/models/species.ts index e507eca4f..bd0f34f33 100644 --- a/ui/src/data-services/models/species.ts +++ b/ui/src/data-services/models/species.ts @@ -80,6 +80,19 @@ export class Species extends Taxon { return this._species.occurrences_count ?? 0 } + get numVerified(): number { + return this._species.verified_count ?? 0 + } + + get numAgreedWithPrediction(): number { + return this._species.agreed_with_prediction_count ?? 0 + } + + // Only present when with_agreement=true is requested (or on the detail view). + get numAgreedExact(): number | undefined { + return this._species.agreed_exact_count ?? undefined + } + get score(): number | undefined { const score = this._species.best_determination_score diff --git a/ui/src/pages/species-details/species-details.tsx b/ui/src/pages/species-details/species-details.tsx index d6677033c..bd5f20edf 100644 --- a/ui/src/pages/species-details/species-details.tsx +++ b/ui/src/pages/species-details/species-details.tsx @@ -157,6 +157,25 @@ export const SpeciesDetails = ({ })} /> + + + + {species.numAgreedExact !== undefined ? ( + + ) : null} + diff --git a/ui/src/pages/species/species-columns.tsx b/ui/src/pages/species/species-columns.tsx index 49bf330ad..d78181fe9 100644 --- a/ui/src/pages/species/species-columns.tsx +++ b/ui/src/pages/species/species-columns.tsx @@ -95,6 +95,24 @@ export const columns: (project: { ), }, + { + id: 'verified', + sortField: 'verified_count', + name: 'Verified', + styles: { + textAlign: TextAlign.Right, + }, + renderCell: (item: Species) => ( + + + + ), + }, { id: 'best-determination-score', name: translate(STRING.FIELD_LABEL_BEST_SCORE), diff --git a/ui/src/pages/species/species.tsx b/ui/src/pages/species/species.tsx index 6609db0fa..a693ee995 100644 --- a/ui/src/pages/species/species.tsx +++ b/ui/src/pages/species/species.tsx @@ -39,6 +39,7 @@ export const Species = () => { rank: false, 'last-seen': true, occurrences: true, + verified: true, 'best-determination-score': true, 'created-at': false, 'updated-at': false, @@ -82,6 +83,7 @@ export const Species = () => { )} + {project?.featureFlags.tags ? ( <> diff --git a/ui/src/utils/getAppRoute.ts b/ui/src/utils/getAppRoute.ts index 4cb054028..ca4012f94 100644 --- a/ui/src/utils/getAppRoute.ts +++ b/ui/src/utils/getAppRoute.ts @@ -13,6 +13,7 @@ type FilterType = | 'taxa_list_id' | 'taxon' | 'timestamp' + | 'verified' export const getAppRoute = ({ to, From 16b14686580108bb6da2ee0d7f7e12cd0a2a2b94 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Fri, 22 May 2026 13:15:37 -0700 Subject: [PATCH 02/16] perf(taxa): compute verification rollup in one pass, not per-taxon subquery The per-taxon correlated parents_json subquery for verified_count / agreed_with_prediction_count / agreed_exact_count did not scale: on a large project (~1k taxa, ~17k occurrences) the taxa list timed out at the 30s statement limit even with the column hidden and on the default sort, because each page row (and the verified=false COUNT) ran a JSONB containment scan the GIN index can't serve when the @> right-hand side is an OuterRef. All three counts only concern verified occurrences (those with a non-withdrawn Identification), which are sparse. Compute the hierarchical rollup in a single pass over that small set in Python and apply it as constant-time CASE annotations; resolve the verified filter from the same precomputed set via id__in. Page values, sort, and the pagination COUNT are now constant-time. Also fixes ancestor rollup returning 0: parents_json round-trips through the pydantic schema field, so elements may be TaxonParent objects rather than dicts. Measured on the large project: default page 30s timeout -> ~0.6s; verified=false 30s timeout -> ~0.2s; ordering=verified_count 30s timeout -> ~0.04s. Co-Authored-By: Claude --- ami/main/api/views.py | 162 +++++++++++++++++++----------------------- 1 file changed, 75 insertions(+), 87 deletions(-) diff --git a/ami/main/api/views.py b/ami/main/api/views.py index a36eb259f..0a09ae075 100644 --- a/ami/main/api/views.py +++ b/ami/main/api/views.py @@ -1402,19 +1402,6 @@ def filter_queryset(self, request, queryset, view): return queryset.distinct() -class JSONBContains(models.Func): - """Postgres ``@>`` containment rendered as a boolean expression. - - Needed for correlated subqueries where the right-hand side is built from an - ``OuterRef`` — the literal ``parents_json__contains=[{"id": ...}]`` lookup - can't embed an ``OuterRef`` (it would try to JSON-serialize the expression). - """ - - arg_joiner = " @> " - template = "(%(expressions)s)" - output_field = models.BooleanField() - - class TaxonViewSet(DefaultViewSet, ProjectMixin): """ API endpoint that allows taxa to be viewed or edited. @@ -1674,36 +1661,6 @@ def get_taxa_observed( return qs - def _occurrences_under_taxon(self, occurrence_filters: models.Q, default_filters_q: models.Q) -> QuerySet: - """ - Correlated Occurrence queryset matching occurrences whose determination is the - outer Taxon (``OuterRef("id")``) or any of its descendants, project-scoped and - default-filtered. - - Mirrors the hierarchical match used by the occurrence-list ``taxon=`` filter - (``CustomOccurrenceDeterminationFilter``), but the descendant test is built with - an ``OuterRef`` right-hand side so a Family/Order row aggregates all its - descendant species' occurrences. - """ - descendant_match = JSONBContains( - models.F("determination__parents_json"), - models.Func( - models.Func( - models.Value("id"), - models.OuterRef("id"), - function="jsonb_build_object", - ), - function="jsonb_build_array", - output_field=models.JSONField(), - ), - ) - return ( - Occurrence.objects.filter(occurrence_filters) - .filter(default_filters_q) - .alias(_under_taxon=descendant_match) - .filter(models.Q(determination_id=models.OuterRef("id")) | models.Q(_under_taxon=True)) - ) - def _include_agreement(self) -> bool: """Whether the heavier ``agreed_exact_count`` annotation should be computed.""" if self.action == "retrieve": @@ -1717,65 +1674,96 @@ def add_verification_data( Annotate per-taxon verification and human/model agreement counts, and apply the ``verified=true|false`` filter on list responses. - All counts roll up descendant occurrences via ``_occurrences_under_taxon`` and - respect the project's default filters (same ``apply_defaults`` handling as - ``occurrences_count``). - """ - under_taxon = self._occurrences_under_taxon(occurrence_filters, default_filters_q) + Counts roll up descendant occurrences (verifying a species also counts toward its + genus/family rows) and respect the project's default filters (same + ``apply_defaults`` handling as ``occurrences_count``). - has_identification = models.Exists( - Identification.objects.filter(occurrence=models.OuterRef("pk"), withdrawn=False) - ) - verified_occurrences = under_taxon.filter(has_identification) - - def correlated_count(occurrence_qs: QuerySet) -> Coalesce: - # Group by project_id (constant within the subquery) to collapse the - # hierarchical match — determination_id varies across descendants so it - # can't be the grouping key. - return Coalesce( - models.Subquery( - occurrence_qs.values("project_id").annotate(c=models.Count("id")).values("c")[:1], - output_field=models.IntegerField(), - ), - 0, - ) + All three counts only concern *verified* occurrences (those with a non-withdrawn + Identification), which are sparse relative to all occurrences. So the hierarchical + rollup is computed in a single pass over that small set in Python and applied as + constant-time ``CASE`` annotations. A correlated ``parents_json`` subquery per + taxon does not scale: on large projects it forces a per-row scan that the GIN + index can't serve (the containment RHS is an ``OuterRef``), timing out the list. + """ + include_agreement = self._include_agreement() - # The chosen (best, non-withdrawn) identification's agreed_with_prediction FK is set. + # The chosen (best, non-withdrawn) identification's agreed_with_prediction FK. best_identification_agreed_prediction = models.Subquery( Identification.objects.filter(occurrence=models.OuterRef("pk"), withdrawn=False) .order_by(*BEST_IDENTIFICATION_ORDER) .values("agreed_with_prediction_id")[:1] ) - agreed_with_prediction_occurrences = under_taxon.annotate( - _best_agreed_prediction=best_identification_agreed_prediction - ).filter(_best_agreed_prediction__isnull=False) - - qs = qs.annotate( - verified_count=correlated_count(verified_occurrences), - agreed_with_prediction_count=correlated_count(agreed_with_prediction_occurrences), + verified_occurrences = ( + Occurrence.objects.filter(occurrence_filters) + .filter(default_filters_q) + .filter(models.Exists(Identification.objects.filter(occurrence=models.OuterRef("pk"), withdrawn=False))) + .annotate(_agreed_prediction_id=best_identification_agreed_prediction) ) - - if self._include_agreement(): - # Verified occurrence where the user determination equals the top machine - # prediction's taxon for the same occurrence. - best_machine_taxon = models.Subquery( - Classification.objects.filter(detection__occurrence=models.OuterRef("pk")) - .order_by(*BEST_MACHINE_PREDICTION_ORDER) - .values("taxon_id")[:1] + value_fields = ["determination_id", "determination__parents_json", "_agreed_prediction_id"] + if include_agreement: + # Top machine prediction's taxon for the same occurrence. + verified_occurrences = verified_occurrences.annotate( + _best_machine_taxon_id=models.Subquery( + Classification.objects.filter(detection__occurrence=models.OuterRef("pk")) + .order_by(*BEST_MACHINE_PREDICTION_ORDER) + .values("taxon_id")[:1] + ) ) - agreed_exact_occurrences = verified_occurrences.annotate(_best_machine_taxon=best_machine_taxon).filter( - determination_id=models.F("_best_machine_taxon") + value_fields.append("_best_machine_taxon_id") + + verified_counts: dict[int, int] = {} + agreed_with_prediction_counts: dict[int, int] = {} + agreed_exact_counts: dict[int, int] = {} + for row in verified_occurrences.values(*value_fields): + determination_id = row["determination_id"] + # The taxon itself plus every ancestor — i.e. every row this occurrence rolls up to. + taxon_ids: set[int] = set() + if determination_id is not None: + taxon_ids.add(determination_id) + for parent in row["determination__parents_json"] or []: + # parents_json round-trips through the pydantic schema field, so elements + # may be dicts or ``TaxonParent`` objects depending on the query path. + parent_id = parent.get("id") if isinstance(parent, dict) else getattr(parent, "id", None) + if parent_id is not None: + taxon_ids.add(int(parent_id)) + + for taxon_id in taxon_ids: + verified_counts[taxon_id] = verified_counts.get(taxon_id, 0) + 1 + if row["_agreed_prediction_id"] is not None: + for taxon_id in taxon_ids: + agreed_with_prediction_counts[taxon_id] = agreed_with_prediction_counts.get(taxon_id, 0) + 1 + if ( + include_agreement + and determination_id is not None + and determination_id == row["_best_machine_taxon_id"] + ): + for taxon_id in taxon_ids: + agreed_exact_counts[taxon_id] = agreed_exact_counts.get(taxon_id, 0) + 1 + + def count_annotation(counts: dict[int, int]) -> models.expressions.Combinable: + if not counts: + return models.Value(0, output_field=models.IntegerField()) + return models.Case( + *(models.When(id=taxon_id, then=models.Value(count)) for taxon_id, count in counts.items()), + default=models.Value(0), + output_field=models.IntegerField(), ) - qs = qs.annotate(agreed_exact_count=correlated_count(agreed_exact_occurrences)) - # verified=true|false filter (list only); the complement uses the same set, so - # verified=false is the strict complement of verified=true on the filtered taxa. + qs = qs.annotate( + verified_count=count_annotation(verified_counts), + agreed_with_prediction_count=count_annotation(agreed_with_prediction_counts), + ) + if include_agreement: + qs = qs.annotate(agreed_exact_count=count_annotation(agreed_exact_counts)) + + # verified=true|false filter (list only); verified=false is the strict complement. if self.action == "list" and "verified" in self.request.query_params: verified = BooleanField(required=False).clean(self.request.query_params.get("verified")) + verified_taxon_ids = list(verified_counts.keys()) if verified: - qs = qs.filter(models.Exists(verified_occurrences)) + qs = qs.filter(id__in=verified_taxon_ids) else: - qs = qs.filter(~models.Exists(verified_occurrences)) + qs = qs.exclude(id__in=verified_taxon_ids) return qs From 5d929ce3142b53dd5797f32815af07ea9ef130d3 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Mon, 25 May 2026 17:32:17 -0700 Subject: [PATCH 03/16] docs(taxa): clarify GIN index purpose + add rollup query-performance reference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The parents_json GIN index (migration 0085) no longer backs the verification rollup (now a Python single-pass). Update the migration docstring and add an inline comment at its real consumer — the literal-RHS containment in the occurrence-list taxon filter — so the index's purpose is clear. Add docs/claude/reference/hierarchical-rollup-query-performance.md capturing the anti-pattern (per-taxon correlated parents_json subquery) vs the pattern (precompute over the sparse set + CASE), the COUNT/cachalot/pydantic-field gotchas, and the denormalized TaxonObserved direction. Co-Authored-By: Claude --- ami/main/api/views.py | 4 +- .../0085_taxon_parents_json_gin_index.py | 19 +-- .../hierarchical-rollup-query-performance.md | 114 ++++++++++++++++++ 3 files changed, 129 insertions(+), 8 deletions(-) create mode 100644 docs/claude/reference/hierarchical-rollup-query-performance.md diff --git a/ami/main/api/views.py b/ami/main/api/views.py index 0a09ae075..a69e3570f 100644 --- a/ami/main/api/views.py +++ b/ami/main/api/views.py @@ -979,7 +979,9 @@ class CustomOccurrenceDeterminationFilter(CustomTaxonFilter): def filter_queryset(self, request, queryset, view): taxon = self.get_filter_taxon(request, query_params=self.query_params) if taxon: - # Here the queryset is the Occurrence queryset + # Here the queryset is the Occurrence queryset. + # The literal parents_json containment (constant RHS) is what the GIN index from + # migration 0085 serves — this hierarchical taxon filter is the index's main consumer. return queryset.filter( models.Q(determination=taxon) | models.Q(determination__parents_json__contains=[{"id": taxon.pk}]) ) diff --git a/ami/main/migrations/0085_taxon_parents_json_gin_index.py b/ami/main/migrations/0085_taxon_parents_json_gin_index.py index 94b65ff2a..1ae8a1832 100644 --- a/ami/main/migrations/0085_taxon_parents_json_gin_index.py +++ b/ami/main/migrations/0085_taxon_parents_json_gin_index.py @@ -3,15 +3,20 @@ class Migration(migrations.Migration): """ - GIN index on Taxon.parents_json to support hierarchical (descendant) rollup - of the per-taxon verification / agreement counts added for issue #1316. + GIN index on Taxon.parents_json to support the hierarchical (descendant) taxon + filters that issue a literal `parents_json @> [{"id": }]` containment: the + occurrence-list `taxon=` filter (CustomOccurrenceDeterminationFilter) and the + project default-taxa filter (build_occurrence_default_filters_q). The index applies + to these because the right-hand side is a constant. - Without it, Family- and Order-rank rows on large projects fall back to a - seq-scan on the parents_json containment (`@>`) test and dominate query time. + Note it does NOT back the #1316 per-taxon verification / agreement rollup: that is + computed in a single Python pass over the (sparse) verified-occurrence set rather + than a correlated subquery, because a containment whose RHS is an OuterRef can't use + the index. See TaxonViewSet.add_verification_data. - CREATE INDEX CONCURRENTLY can't run inside a transaction, so this migration - is non-atomic. IF NOT EXISTS keeps it safe to co-exist with the same index if - it lands separately via the #1307 follow-up. + CREATE INDEX CONCURRENTLY can't run inside a transaction, so this migration is + non-atomic. IF NOT EXISTS keeps it safe to co-exist with the same index if it lands + separately via the #1307 follow-up. """ atomic = False diff --git a/docs/claude/reference/hierarchical-rollup-query-performance.md b/docs/claude/reference/hierarchical-rollup-query-performance.md new file mode 100644 index 000000000..a2c94f9b8 --- /dev/null +++ b/docs/claude/reference/hierarchical-rollup-query-performance.md @@ -0,0 +1,114 @@ +# Hierarchical (descendant) rollup counts — performant query patterns + +How to compute a per-taxon count that rolls up descendant occurrences +(a Family/Order row aggregating its species) without timing out the list +endpoint on large projects. Generalises to any "aggregate over a node and +all its `parents_json` descendants" problem. + +Reference implementation: `TaxonViewSet.add_verification_data` in +`ami/main/api/views.py` (verified_count / agreed_with_prediction_count / +agreed_exact_count for #1316). History: the first revision used the +anti-pattern below and timed out; commit `16b14686` switched to the pattern. + +## TL;DR + +- **Do not** roll up with a per-taxon correlated `parents_json @> [{"id": OuterRef}]` + subquery. A containment whose RHS is an `OuterRef` can't use the GIN index, so it + degrades to a per-row seq-scan that runs once per page row **and** once per taxon in + the pagination `COUNT`. +- **Do** precompute when the driving data is *sparse*: one pass over the small set, + build `{taxon_id: count}` dicts (incrementing the determination taxon + every + ancestor in `parents_json`), then apply as constant-time `CASE` annotations. Resolve + any membership filter from the same set via `id__in`. +- The GIN index (`main_taxon_parents_json_gin_idx`, migration `0085`) only helps the + **literal**-RHS containment filters (occurrence-list `taxon=`, + `build_occurrence_default_filters_q`), not correlated ones. + +## The anti-pattern (does not scale) + +```python +# Per-taxon correlated subquery — OuterRef RHS, can't use the GIN index. +descendant_match = JSONBContains( + F("determination__parents_json"), + jsonb_build_array(jsonb_build_object("id", OuterRef("id"))), +) +under_taxon = Occurrence.objects.filter(...).filter( + Q(determination_id=OuterRef("id")) | Q(_under_taxon=True) +) +qs.annotate(verified_count=Coalesce(Subquery(under_taxon.values("project_id") + .annotate(c=Count("id")).values("c")[:1]), 0)) +``` + +Measured on a ~1k-taxa / ~17k-occurrence project (statement_timeout = 30s): +`limit=25` page **9s** for one such annotation; with two always-on annotations the +default list and `verified=false` hit the 30s timeout; `ordering=verified_count` +timed out (the subquery is computed for *every* taxon before the LIMIT). + +Standalone, the same containment for **one** constant taxon id is ~70ms (it uses the +index). The cost is the correlation: per outer row the planner can't use the index for +a parameterised `@>` and re-evaluates the join. **Always benchmark the correlated form, +not the standalone query.** + +## The pattern (precompute over the sparse set + CASE) + +Key observation: the three counts only concern *verified* occurrences (those with a +non-withdrawn `Identification`), which are sparse — bounded by human review effort, not +total occurrences. + +```python +verified_occ = Occurrence.objects.filter(occ_filters).filter(default_filters_q).filter( + Exists(Identification.objects.filter(occurrence=OuterRef("pk"), withdrawn=False)) +).annotate(_agreed_prediction_id=...) # + _best_machine_taxon_id when gated + +counts: dict[int, int] = {} +for row in verified_occ.values("determination_id", "determination__parents_json", ...): + taxon_ids = {row["determination_id"], *(_id(p) for p in row["determination__parents_json"] or [])} + for tid in taxon_ids: + counts[tid] = counts.get(tid, 0) + 1 + +case = Case(*(When(id=tid, then=Value(c)) for tid, c in counts.items()), + default=Value(0), output_field=IntegerField()) if counts else Value(0, ...) +qs = qs.annotate(verified_count=case) + +# membership filter from the same precomputed set: +qs.filter(id__in=list(counts)) / qs.exclude(id__in=list(counts)) +``` + +Same project, after: every path (`default`, `verified=true/false`, +`ordering=verified_count`) ~0.3s. Cost is `O(verified occ × ancestor depth)`, paid once +per request. The `CASE` annotation is a constant per row → DB-sortable, paginatable, +and stripped from the pagination `COUNT`. + +### Why this keeps COUNT cheap + +Django's `QuerySet.count()` strips **select-only** annotations it doesn't need — so the +`CASE` annotations don't appear in the pagination `COUNT`. Two things defeat that strip +and were the original timeout: +- a filter that *references* the expensive expression — e.g. `verified=false` as + `~Exists(correlated_subquery)` forces the subquery into the `COUNT` WHERE for every + taxon. Resolving the filter via `id__in=` avoids it. +- `.distinct()` combined with select annotations (wraps the whole annotated query in the + `COUNT` subquery). Watch for it on list viewsets. + +## Gotchas + +- **cachalot caches query results**, including across repeated benchmark runs — a second + identical timing is a cache hit, not a real measurement. Wrap timing in + `from cachalot.api import cachalot_disabled` → `with cachalot_disabled(): ...`. +- **`django-pydantic-field` `.values()` returns deserialised pydantic objects**, not + dicts. `parents_json` elements may be `TaxonParent` objects depending on the query + path, so read the id defensively: `p.get("id") if isinstance(p, dict) else getattr(p, "id", None)`. + (This silently broke ancestor rollup — direct/species counts worked, ancestors were 0.) +- **GIN `jsonb_path_ops` only serves `@>` with a constant RHS.** Literal + `parents_json__contains=[{"id": X}]` uses it; an `OuterRef` RHS does not. + +## Future direction — denormalised per-project observed taxa + +The precompute approach scales with verified-data volume. If per-project taxa +aggregates (observed counts, verified counts, agreement) become a recurring perf +problem across endpoints, consider a dedicated **`TaxonObserved`** model holding +denormalised data per `(project, taxon)` — distinct from the generic project-agnostic +`Taxon` profile — refreshed via the cached-count pattern +(`update_cached_counts(run_async=True)`) on `Identification` / `Occurrence` writes. +That moves the rollup off the read path entirely and makes counts directly sortable / +filterable as real columns. Open idea, not yet scoped. From 10c72cbde41863c3c079cb53501393f5bff95982 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Mon, 25 May 2026 18:03:47 -0700 Subject: [PATCH 04/16] fix(taxa): dedupe occurrences in verification rollup under collection filter When ?collection= joins Occurrence to detections, a single occurrence yields one row per matching detection, inflating verified_count / agreed_with_prediction_count / agreed_exact_count. Select pk and .distinct() the values() rollup so each occurrence is counted once. Adds a regression test. Co-Authored-By: Claude --- ami/main/api/views.py | 7 +++++-- ami/main/tests.py | 23 +++++++++++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/ami/main/api/views.py b/ami/main/api/views.py index a69e3570f..5290edc45 100644 --- a/ami/main/api/views.py +++ b/ami/main/api/views.py @@ -1701,7 +1701,10 @@ def add_verification_data( .filter(models.Exists(Identification.objects.filter(occurrence=models.OuterRef("pk"), withdrawn=False))) .annotate(_agreed_prediction_id=best_identification_agreed_prediction) ) - value_fields = ["determination_id", "determination__parents_json", "_agreed_prediction_id"] + # ``pk`` is selected only so ``.distinct()`` below dedupes by occurrence: when + # occurrence_filters joins to detections (e.g. ?collection=), one Occurrence + # yields a row per matching Detection, which would otherwise inflate the counts. + value_fields = ["pk", "determination_id", "determination__parents_json", "_agreed_prediction_id"] if include_agreement: # Top machine prediction's taxon for the same occurrence. verified_occurrences = verified_occurrences.annotate( @@ -1716,7 +1719,7 @@ def add_verification_data( verified_counts: dict[int, int] = {} agreed_with_prediction_counts: dict[int, int] = {} agreed_exact_counts: dict[int, int] = {} - for row in verified_occurrences.values(*value_fields): + for row in verified_occurrences.values(*value_fields).distinct(): determination_id = row["determination_id"] # The taxon itself plus every ancestor — i.e. every row this occurrence rolls up to. taxon_ids: set[int] = set() diff --git a/ami/main/tests.py b/ami/main/tests.py index d419d7acc..c3c28217d 100644 --- a/ami/main/tests.py +++ b/ami/main/tests.py @@ -4890,3 +4890,26 @@ def test_verified_filter_respects_apply_defaults(self): verified_bypassed = set(self._list_by_name(self.list_url + "&verified=true&apply_defaults=false").keys()) self.assertEqual(verified_bypassed, {"Vanessa cardui", "Vanessa atalanta"}) + + # --- collection filter must not inflate counts via the detections join --- + + def test_verified_count_not_inflated_by_collection_join(self): + # A second detection on a verified occurrence means the ?collection= INNER JOIN to + # detections yields two rows for that occurrence; the rollup must still count it once. + extra_detection = Detection.objects.create( + source_image=self.occ_exact.best_detection.source_image, + occurrence=self.occ_exact, + timestamp=self.occ_exact.best_detection.timestamp, + bbox=[0.5, 0.5, 0.6, 0.6], + path="detections/test_detection_dup.jpg", + ) + extra_detection.classifications.create(taxon=self.cardui, score=0.9, timestamp=datetime.datetime.now()) + self.assertEqual(self.occ_exact.detections.count(), 2) + + collection = SourceImageCollection.objects.create(project=self.project, name="verif-dedup") + collection.images.set(SourceImage.objects.filter(deployment=self.deployment)) + + rows = self._list_by_name(f"{self.list_url}&collection={collection.pk}&with_agreement=true") + # 2 verified cardui occurrences, not 3 — the duplicate detection must not double-count. + self.assertEqual(rows["Vanessa cardui"]["verified_count"], 2) + self.assertEqual(rows["Vanessa cardui"]["agreed_exact_count"], 2) From 30955e33cf880c095940b43b46799b01d2f0f1aa Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Mon, 25 May 2026 18:04:49 -0700 Subject: [PATCH 05/16] chore(migrations): renumber parents_json GIN index 0085 -> 0087 after main merge main merged #1305 which took 0085/0086 (project activity + recent-capture indexes). Repoint the GIN-index migration onto 0086 and update the references to it (views.py comment, rollup perf doc). Co-Authored-By: Claude --- ami/main/api/views.py | 2 +- ...s_json_gin_index.py => 0087_taxon_parents_json_gin_index.py} | 2 +- docs/claude/reference/hierarchical-rollup-query-performance.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) rename ami/main/migrations/{0085_taxon_parents_json_gin_index.py => 0087_taxon_parents_json_gin_index.py} (96%) diff --git a/ami/main/api/views.py b/ami/main/api/views.py index 35fcc7774..06a0e77a7 100644 --- a/ami/main/api/views.py +++ b/ami/main/api/views.py @@ -1030,7 +1030,7 @@ def filter_queryset(self, request, queryset, view): if taxon: # Here the queryset is the Occurrence queryset. # The literal parents_json containment (constant RHS) is what the GIN index from - # migration 0085 serves — this hierarchical taxon filter is the index's main consumer. + # migration 0087 serves — this hierarchical taxon filter is the index's main consumer. return queryset.filter( models.Q(determination=taxon) | models.Q(determination__parents_json__contains=[{"id": taxon.pk}]) ) diff --git a/ami/main/migrations/0085_taxon_parents_json_gin_index.py b/ami/main/migrations/0087_taxon_parents_json_gin_index.py similarity index 96% rename from ami/main/migrations/0085_taxon_parents_json_gin_index.py rename to ami/main/migrations/0087_taxon_parents_json_gin_index.py index 1ae8a1832..6d7404d59 100644 --- a/ami/main/migrations/0085_taxon_parents_json_gin_index.py +++ b/ami/main/migrations/0087_taxon_parents_json_gin_index.py @@ -22,7 +22,7 @@ class Migration(migrations.Migration): atomic = False dependencies = [ - ("main", "0084_revoke_delete_job_from_roles"), + ("main", "0086_sourceimage_recent_capture_index"), ] operations = [ diff --git a/docs/claude/reference/hierarchical-rollup-query-performance.md b/docs/claude/reference/hierarchical-rollup-query-performance.md index a2c94f9b8..c8f8e1b8c 100644 --- a/docs/claude/reference/hierarchical-rollup-query-performance.md +++ b/docs/claude/reference/hierarchical-rollup-query-performance.md @@ -20,7 +20,7 @@ anti-pattern below and timed out; commit `16b14686` switched to the pattern. build `{taxon_id: count}` dicts (incrementing the determination taxon + every ancestor in `parents_json`), then apply as constant-time `CASE` annotations. Resolve any membership filter from the same set via `id__in`. -- The GIN index (`main_taxon_parents_json_gin_idx`, migration `0085`) only helps the +- The GIN index (`main_taxon_parents_json_gin_idx`, migration `0087`) only helps the **literal**-RHS containment filters (occurrence-list `taxon=`, `build_occurrence_default_filters_q`), not correlated ones. From b92b2b0e5c32b69821cd873e89639fd496170ad5 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Mon, 25 May 2026 18:18:23 -0700 Subject: [PATCH 06/16] fix(taxa): make collection-filtered taxa list COUNT scale The observed-set membership used a correlated EXISTS; under ?collection= the occurrence_filters join to detections turned it into a per-taxon scan, timing out the pagination COUNT (no LIMIT to short-circuit). Replace with a single distinct-determination id__in subquery. Default path unchanged; the collection path drops from a 30s timeout to sub-second. Co-Authored-By: Claude --- ami/main/api/views.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/ami/main/api/views.py b/ami/main/api/views.py index 06a0e77a7..abbd6cb4a 100644 --- a/ami/main/api/views.py +++ b/ami/main/api/views.py @@ -1705,8 +1705,19 @@ def get_taxa_observed( ) if not include_unobserved: - # Efficient EXISTS check that uses the composite index - qs = qs.filter(models.Exists(Occurrence.objects.filter(base_filter))) + # Membership via a single "distinct determination" subquery rather than a + # correlated EXISTS. The EXISTS form is index-served and cheap for the default + # filters, but when occurrence_filters joins detections (?collection=) it + # degrades to a per-taxon scan that times out the pagination COUNT (which has + # no LIMIT to short-circuit it). The id__in subquery runs the join once. + observed_taxon_ids = ( + Occurrence.objects.filter(occurrence_filters) + .filter(default_filters_q) + .filter(determination_id__isnull=False) + .values("determination_id") + .distinct() + ) + qs = qs.filter(id__in=observed_taxon_ids) qs = self.add_verification_data(qs, occurrence_filters, default_filters_q) From 29bec78cfe1d793ae7ed26a7c0668b5704388ff5 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Mon, 25 May 2026 18:22:51 -0700 Subject: [PATCH 07/16] fix(taxa): materialize observed-taxon id set instead of IN-subquery The IN-subquery form (b92b2b0e) still embedded the detections m2m join in both the COUNT and page queries; under ?collection= the planner re-evaluated it pathologically (collection path ~87s). Materialize the distinct-determination id set in one query (~0.2s) so COUNT/page reduce to a plain indexed id IN (...). Mirrors the verified-filter pattern. Co-Authored-By: Claude --- ami/main/api/views.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/ami/main/api/views.py b/ami/main/api/views.py index abbd6cb4a..2d3d4e015 100644 --- a/ami/main/api/views.py +++ b/ami/main/api/views.py @@ -1705,16 +1705,18 @@ def get_taxa_observed( ) if not include_unobserved: - # Membership via a single "distinct determination" subquery rather than a + # Membership via a materialised "distinct determination" id set rather than a # correlated EXISTS. The EXISTS form is index-served and cheap for the default # filters, but when occurrence_filters joins detections (?collection=) it - # degrades to a per-taxon scan that times out the pagination COUNT (which has - # no LIMIT to short-circuit it). The id__in subquery runs the join once. - observed_taxon_ids = ( + # degrades to a per-taxon scan that times out the pagination COUNT (no LIMIT to + # short-circuit it). Materialising the id set runs the detections join exactly + # once (~0.2s) and leaves COUNT/page as a plain indexed `id IN (...)`; the same + # pattern is used for the verified filter in add_verification_data. + observed_taxon_ids = list( Occurrence.objects.filter(occurrence_filters) .filter(default_filters_q) .filter(determination_id__isnull=False) - .values("determination_id") + .values_list("determination_id", flat=True) .distinct() ) qs = qs.filter(id__in=observed_taxon_ids) From 838f9d750436bf579202a49f57dc7dfb577e1dbd Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Mon, 25 May 2026 18:32:03 -0700 Subject: [PATCH 08/16] refactor(taxa): centralize per-taxon counts into one filtered-occurrence pass The taxa list computed occurrences_count / best_determination_score / last_detected as three per-taxon correlated subqueries. Index-served for the default filters, but under ?collection= the detections join turned each into a per-row scan (~20s for a 25-row page on a large project), on top of the already-fixed COUNT membership. Replace them with annotate_taxon_counts: one GROUP BY over the shared filtered occurrence set builds {taxon_id: value} maps, applied as constant-time CASE annotations via _case_from_map. The same base feeds the verification rollup (_annotate_verification_counts, formerly add_verification_data), and its determination ids serve the observed-set membership filter, so no separate membership query is needed. Count(distinct) also dedupes the collection join fan-out for occurrences_count. No denormalized model introduced. Co-Authored-By: Claude --- ami/main/api/views.py | 190 +++++++++--------- .../0087_taxon_parents_json_gin_index.py | 2 +- ami/main/models.py | 4 +- 3 files changed, 98 insertions(+), 98 deletions(-) diff --git a/ami/main/api/views.py b/ami/main/api/views.py index 2d3d4e015..bdf71484d 100644 --- a/ami/main/api/views.py +++ b/ami/main/api/views.py @@ -6,7 +6,6 @@ from django.core import exceptions from django.db import models from django.db.models import OuterRef, Prefetch, Q, Subquery -from django.db.models.functions import Coalesce from django.db.models.query import QuerySet from django.forms import BooleanField, CharField, IntegerField from django.shortcuts import get_object_or_404 @@ -1639,10 +1638,8 @@ def get_taxa_observed( If a project is passed, only return taxa that have been observed. Also add the number of occurrences and the last time it was detected. - Uses efficient subqueries with default filters applied directly via Q objects - to leverage composite indexes on (determination_id, project_id, event_id, determination_score). - This avoids the N+1 query problem by building a single Q filter that can be reused - across all subqueries. + Counts are computed by annotate_taxon_counts from one filtered occurrence set + (occurrence_filters + the project's default filters), not per-taxon subqueries. """ occurrence_filters = self.get_occurrence_filters(project) @@ -1659,95 +1656,109 @@ def get_taxa_observed( apply_default_taxa_filter=apply_default_taxa_filter, ) - # Combine base occurrence filters with default filters - base_filter = models.Q( + qs = self.annotate_taxon_counts( + qs, occurrence_filters, - determination_id=models.OuterRef("id"), + default_filters_q, + restrict_to_observed=not include_unobserved, ) - base_filter = base_filter & default_filters_q + return qs - # Count occurrences - uses composite index (determination_id, project_id, event_id, determination_score) - occurrences_count_subquery = models.Subquery( - Occurrence.objects.filter(base_filter) - .values("determination_id") - .annotate(count=models.Count("id")) - .values("count")[:1], - output_field=models.IntegerField(), - ) + def _include_agreement(self) -> bool: + """Whether the heavier ``agreed_exact_count`` annotation should be computed.""" + if self.action == "retrieve": + return True + return bool(BooleanField(required=False).clean(self.request.query_params.get("with_agreement"))) - # Get best score - uses same composite index - best_score_subquery = models.Subquery( - Occurrence.objects.filter(base_filter) - .values("determination_id") - .annotate(max_score=models.Max("determination_score")) - .values("max_score")[:1], - output_field=models.FloatField(), - ) + @staticmethod + def _case_from_map(mapping: dict, default, output_field: models.Field) -> models.expressions.Combinable: + """Turn a precomputed ``{taxon_id: value}`` map into a constant-time ``CASE``. - # Get last detected timestamp - requires join with detections - last_detected_subquery = models.Subquery( - Occurrence.objects.filter( - base_filter, - detections__timestamp__isnull=False, - ) - .values("determination_id") - .annotate(last_detected=models.Max("detections__timestamp")) - .values("last_detected")[:1], - output_field=models.DateTimeField(), + The result is constant per row, so it is DB-sortable, paginatable, and stripped + from the pagination ``COUNT`` — unlike a per-taxon correlated subquery, which is + re-evaluated for every row and (in ``COUNT``) for every taxon in the project. + """ + if not mapping: + return models.Value(default, output_field=output_field) + return models.Case( + *( + models.When(id=taxon_id, then=models.Value(value, output_field=output_field)) + for taxon_id, value in mapping.items() + ), + default=models.Value(default, output_field=output_field), + output_field=output_field, ) - # Apply annotations + def annotate_taxon_counts( + self, + qs: QuerySet, + occurrence_filters: models.Q, + default_filters_q: models.Q, + *, + restrict_to_observed: bool, + ) -> QuerySet: + """Centralised per-(project, taxon) count annotations for the taxa endpoint. + + Every count is derived from one filtered occurrence set + (``occurrence_filters`` + the project default filters) by precomputing + ``{taxon_id: value}`` maps in Python and applying them as constant-time ``CASE`` + annotations (:meth:`_case_from_map`), rather than per-taxon correlated subqueries. + The subquery form does not scale once ``occurrence_filters`` joins detections + (``?collection=``): each annotation degrades to a per-row scan — 25x on the + page, and once per taxon in the unbounded pagination ``COUNT``. + + Two count shapes share the same base queryset: + + - Direct aggregates (``occurrences_count``, ``best_determination_score``, + ``last_detected``), keyed by the occurrence's own determination — one GROUP BY. + ``Count(distinct)`` dedupes the detections-join fan-out under ``?collection=``. + - ``verified_count`` / ``agreed_*`` roll up to ancestors via ``parents_json`` over + the (sparse) verified subset — see :meth:`_annotate_verification_counts`. + + The determination ids present in the filtered set are exactly the observed taxa, + so ``restrict_to_observed`` reuses them instead of a separate membership query. + """ + base = Occurrence.objects.filter(occurrence_filters).filter(default_filters_q) + + occurrences_count: dict[int, int] = {} + best_score: dict[int, float | None] = {} + last_detected: dict[int, object] = {} + for row in base.values("determination_id").annotate( + _n=models.Count("id", distinct=True), + _score=models.Max("determination_score"), + _last=models.Max("detections__timestamp"), + ): + determination_id = row["determination_id"] + if determination_id is None: + continue + occurrences_count[determination_id] = row["_n"] + best_score[determination_id] = row["_score"] + last_detected[determination_id] = row["_last"] + qs = qs.annotate( - occurrences_count=Coalesce(occurrences_count_subquery, 0), - best_determination_score=best_score_subquery, - last_detected=last_detected_subquery, + occurrences_count=self._case_from_map(occurrences_count, 0, models.IntegerField()), + best_determination_score=self._case_from_map(best_score, None, models.FloatField()), + last_detected=self._case_from_map(last_detected, None, models.DateTimeField()), ) - if not include_unobserved: - # Membership via a materialised "distinct determination" id set rather than a - # correlated EXISTS. The EXISTS form is index-served and cheap for the default - # filters, but when occurrence_filters joins detections (?collection=) it - # degrades to a per-taxon scan that times out the pagination COUNT (no LIMIT to - # short-circuit it). Materialising the id set runs the detections join exactly - # once (~0.2s) and leaves COUNT/page as a plain indexed `id IN (...)`; the same - # pattern is used for the verified filter in add_verification_data. - observed_taxon_ids = list( - Occurrence.objects.filter(occurrence_filters) - .filter(default_filters_q) - .filter(determination_id__isnull=False) - .values_list("determination_id", flat=True) - .distinct() - ) - qs = qs.filter(id__in=observed_taxon_ids) - - qs = self.add_verification_data(qs, occurrence_filters, default_filters_q) + if restrict_to_observed: + qs = qs.filter(id__in=list(occurrences_count.keys())) - return qs + return self._annotate_verification_counts(qs, base) - def _include_agreement(self) -> bool: - """Whether the heavier ``agreed_exact_count`` annotation should be computed.""" - if self.action == "retrieve": - return True - return bool(BooleanField(required=False).clean(self.request.query_params.get("with_agreement"))) - - def add_verification_data( - self, qs: QuerySet, occurrence_filters: models.Q, default_filters_q: models.Q - ) -> QuerySet: + def _annotate_verification_counts(self, qs: QuerySet, base: QuerySet) -> QuerySet: """ - Annotate per-taxon verification and human/model agreement counts, and apply the + Annotate per-taxon verification / human-model agreement counts, and apply the ``verified=true|false`` filter on list responses. + ``base`` is the shared filtered occurrence set from :meth:`annotate_taxon_counts`. Counts roll up descendant occurrences (verifying a species also counts toward its - genus/family rows) and respect the project's default filters (same - ``apply_defaults`` handling as ``occurrences_count``). - - All three counts only concern *verified* occurrences (those with a non-withdrawn - Identification), which are sparse relative to all occurrences. So the hierarchical - rollup is computed in a single pass over that small set in Python and applied as - constant-time ``CASE`` annotations. A correlated ``parents_json`` subquery per - taxon does not scale: on large projects it forces a per-row scan that the GIN - index can't serve (the containment RHS is an ``OuterRef``), timing out the list. + genus/family rows). They only concern *verified* occurrences (those with a + non-withdrawn Identification), which are sparse, so the hierarchical rollup is a + single Python pass over that small subset applied as constant-time ``CASE`` + annotations. A correlated ``parents_json`` subquery per taxon does not scale: the + GIN index can't serve a containment whose RHS is an ``OuterRef``. """ include_agreement = self._include_agreement() @@ -1757,12 +1768,9 @@ def add_verification_data( .order_by(*BEST_IDENTIFICATION_ORDER) .values("agreed_with_prediction_id")[:1] ) - verified_occurrences = ( - Occurrence.objects.filter(occurrence_filters) - .filter(default_filters_q) - .filter(models.Exists(Identification.objects.filter(occurrence=models.OuterRef("pk"), withdrawn=False))) - .annotate(_agreed_prediction_id=best_identification_agreed_prediction) - ) + verified_occurrences = base.filter( + models.Exists(Identification.objects.filter(occurrence=models.OuterRef("pk"), withdrawn=False)) + ).annotate(_agreed_prediction_id=best_identification_agreed_prediction) # ``pk`` is selected only so ``.distinct()`` below dedupes by occurrence: when # occurrence_filters joins to detections (e.g. ?collection=), one Occurrence # yields a row per matching Detection, which would otherwise inflate the counts. @@ -1807,21 +1815,13 @@ def add_verification_data( for taxon_id in taxon_ids: agreed_exact_counts[taxon_id] = agreed_exact_counts.get(taxon_id, 0) + 1 - def count_annotation(counts: dict[int, int]) -> models.expressions.Combinable: - if not counts: - return models.Value(0, output_field=models.IntegerField()) - return models.Case( - *(models.When(id=taxon_id, then=models.Value(count)) for taxon_id, count in counts.items()), - default=models.Value(0), - output_field=models.IntegerField(), - ) - + int_field = models.IntegerField() qs = qs.annotate( - verified_count=count_annotation(verified_counts), - agreed_with_prediction_count=count_annotation(agreed_with_prediction_counts), + verified_count=self._case_from_map(verified_counts, 0, int_field), + agreed_with_prediction_count=self._case_from_map(agreed_with_prediction_counts, 0, int_field), ) if include_agreement: - qs = qs.annotate(agreed_exact_count=count_annotation(agreed_exact_counts)) + qs = qs.annotate(agreed_exact_count=self._case_from_map(agreed_exact_counts, 0, int_field)) # verified=true|false filter (list only); verified=false is the strict complement. if self.action == "list" and "verified" in self.request.query_params: diff --git a/ami/main/migrations/0087_taxon_parents_json_gin_index.py b/ami/main/migrations/0087_taxon_parents_json_gin_index.py index 6d7404d59..6ffc93127 100644 --- a/ami/main/migrations/0087_taxon_parents_json_gin_index.py +++ b/ami/main/migrations/0087_taxon_parents_json_gin_index.py @@ -12,7 +12,7 @@ class Migration(migrations.Migration): Note it does NOT back the #1316 per-taxon verification / agreement rollup: that is computed in a single Python pass over the (sparse) verified-occurrence set rather than a correlated subquery, because a containment whose RHS is an OuterRef can't use - the index. See TaxonViewSet.add_verification_data. + the index. See TaxonViewSet._annotate_verification_counts. CREATE INDEX CONCURRENTLY can't run inside a transaction, so this migration is non-atomic. IF NOT EXISTS keeps it safe to co-exist with the same index if it lands diff --git a/ami/main/models.py b/ami/main/models.py index ff0443e8a..d011f83f7 100644 --- a/ami/main/models.py +++ b/ami/main/models.py @@ -3818,11 +3818,11 @@ def best_determination_score(self) -> float | None: return None def verified_count(self) -> int | None: - # Handled by an annotation when filtering by project (TaxonViewSet.add_verification_data) + # Handled by an annotation when filtering by project (TaxonViewSet.annotate_taxon_counts) return None def agreed_with_prediction_count(self) -> int | None: - # Handled by an annotation when filtering by project (TaxonViewSet.add_verification_data) + # Handled by an annotation when filtering by project (TaxonViewSet.annotate_taxon_counts) return None def agreed_exact_count(self) -> int | None: From 7f571be79e154202fbaed3648f88b5caebb66be6 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Mon, 25 May 2026 18:38:19 -0700 Subject: [PATCH 09/16] fix(taxa): use conditional aggregation for dense per-taxon counts The previous commit applied occurrences_count / best_determination_score / last_detected as CASE-from-map annotations. That works for the sparse verified counts but not for these dense aggregates: one CASE branch per observed taxon blew past the SQL parser token limit (SQLParseError: Maximum number of tokens exceeded) on large projects, breaking every taxa request. Switch the dense aggregates to conditional aggregation over the Taxon->occurrences reverse relation (Count/Max with filter=, the pattern already used for Event counts), which is one GROUP BY with constant-size SQL. occurrences_count__gt=0 becomes the observed-set membership (HAVING). The sparse verified/agreement counts stay as CASE annotations. get_occurrence_filters gains an accessor arg to express the same filters through the reverse relation. Measured on a ~1k-taxa / ~17k-occurrence project: collection-filtered list page ~0.25s and COUNT ~0.31s (was a 30s+ timeout); default/verified/ordering ~0.1-0.4s. Co-Authored-By: Claude --- ami/main/api/views.py | 122 +++++++++++++++++++----------------------- 1 file changed, 56 insertions(+), 66 deletions(-) diff --git a/ami/main/api/views.py b/ami/main/api/views.py index bdf71484d..6de88afbe 100644 --- a/ami/main/api/views.py +++ b/ami/main/api/views.py @@ -1537,12 +1537,16 @@ def get_serializer_class(self): else: return TaxonSerializer - def get_occurrence_filters(self, project: Project) -> models.Q: + def get_occurrence_filters(self, project: Project, accessor: str = "") -> models.Q: """ - Filter taxa by when/where it has occurred. + Filter by when/where a taxon has occurred. Supports querying by occurrence, project, deployment, or event. + ``accessor`` is the relation path to the Occurrence model. Pass "" to filter the + Occurrence model directly, or "occurrences" to filter the Taxon model via its + reverse relation (for conditional aggregation in annotate_taxon_counts). + @TODO Consider using a custom filter class for this (see get_filter_name) @TODO Move this to a custom QuerySet manager on the Taxon model """ @@ -1554,12 +1558,12 @@ def get_occurrence_filters(self, project: Project) -> models.Q: event_id = self.request.query_params.get("event") or self.request.query_params.get("occurrences__event") collection_id = self.request.query_params.get("collection") - # filter_active = any([occurrence_id, project, deployment_id, event_id, collection_id]) + prefix = f"{accessor}__" if accessor else "" - filters = models.Q( - project=project, - event__isnull=False, - ) + def field(path: str) -> str: + return f"{prefix}{path}" + + filters = models.Q(**{field("project"): project, field("event__isnull"): False}) try: """ Ensure that the related objects exist before filtering by them. @@ -1568,16 +1572,16 @@ def get_occurrence_filters(self, project: Project) -> models.Q: if occurrence_id: Occurrence.objects.get(id=occurrence_id) # This query does not need the same filtering as the others - filters &= models.Q(id=occurrence_id) + filters &= models.Q(**{field("id"): occurrence_id}) if deployment_id: Deployment.objects.get(id=deployment_id) - filters &= models.Q(deployment=deployment_id) + filters &= models.Q(**{field("deployment"): deployment_id}) if event_id: Event.objects.get(id=event_id) - filters &= models.Q(event=event_id) + filters &= models.Q(**{field("event"): event_id}) if collection_id: SourceImageCollection.objects.get(id=collection_id) - filters &= models.Q(detections__source_image__collections=collection_id) + filters &= models.Q(**{field("detections__source_image__collections"): collection_id}) except exceptions.ObjectDoesNotExist as e: # Raise a 404 if any of the related objects don't exist raise NotFound(detail=str(e)) @@ -1641,30 +1645,14 @@ def get_taxa_observed( Counts are computed by annotate_taxon_counts from one filtered occurrence set (occurrence_filters + the project's default filters), not per-taxon subqueries. """ - occurrence_filters = self.get_occurrence_filters(project) - - # Build a single Q filter for default filters (score threshold + taxa filters) - # This creates an efficient filter that works with composite indexes - # Respects apply_defaults flag: build_occurrence_default_filters_q checks it internally - from ami.main.models_future.filters import build_occurrence_default_filters_q - - default_filters_q = build_occurrence_default_filters_q( + return self.annotate_taxon_counts( + qs, project, - self.request, - occurrence_accessor="", apply_default_score_filter=apply_default_score_filter, apply_default_taxa_filter=apply_default_taxa_filter, - ) - - qs = self.annotate_taxon_counts( - qs, - occurrence_filters, - default_filters_q, restrict_to_observed=not include_unobserved, ) - return qs - def _include_agreement(self) -> bool: """Whether the heavier ``agreed_exact_count`` annotation should be computed.""" if self.action == "retrieve": @@ -1693,58 +1681,60 @@ def _case_from_map(mapping: dict, default, output_field: models.Field) -> models def annotate_taxon_counts( self, qs: QuerySet, - occurrence_filters: models.Q, - default_filters_q: models.Q, + project: Project, *, + apply_default_score_filter: bool = True, + apply_default_taxa_filter: bool = True, restrict_to_observed: bool, ) -> QuerySet: """Centralised per-(project, taxon) count annotations for the taxa endpoint. - Every count is derived from one filtered occurrence set - (``occurrence_filters`` + the project default filters) by precomputing - ``{taxon_id: value}`` maps in Python and applying them as constant-time ``CASE`` - annotations (:meth:`_case_from_map`), rather than per-taxon correlated subqueries. - The subquery form does not scale once ``occurrence_filters`` joins detections + Every count comes from the same filtered occurrence set (the project's + occurrence_filters + default filters); nothing here uses a per-taxon correlated + subquery, which does not scale once the filters join detections (``?collection=``): each annotation degrades to a per-row scan — 25x on the page, and once per taxon in the unbounded pagination ``COUNT``. - Two count shapes share the same base queryset: + Two count shapes, two mechanisms: - Direct aggregates (``occurrences_count``, ``best_determination_score``, - ``last_detected``), keyed by the occurrence's own determination — one GROUP BY. - ``Count(distinct)`` dedupes the detections-join fan-out under ``?collection=``. - - ``verified_count`` / ``agreed_*`` roll up to ancestors via ``parents_json`` over - the (sparse) verified subset — see :meth:`_annotate_verification_counts`. - - The determination ids present in the filtered set are exactly the observed taxa, - so ``restrict_to_observed`` reuses them instead of a separate membership query. - """ - base = Occurrence.objects.filter(occurrence_filters).filter(default_filters_q) - - occurrences_count: dict[int, int] = {} - best_score: dict[int, float | None] = {} - last_detected: dict[int, object] = {} - for row in base.values("determination_id").annotate( - _n=models.Count("id", distinct=True), - _score=models.Max("determination_score"), - _last=models.Max("detections__timestamp"), - ): - determination_id = row["determination_id"] - if determination_id is None: - continue - occurrences_count[determination_id] = row["_n"] - best_score[determination_id] = row["_score"] - last_detected[determination_id] = row["_last"] + ``last_detected``) are dense — one value per observed taxon — so they use + conditional aggregation over the Taxon→occurrences reverse relation (one GROUP + BY, constant-size SQL). ``Count(distinct)`` dedupes the detections-join fan-out + under ``?collection=``; ``restrict_to_observed`` is then a HAVING on the count. + - ``verified_count`` / ``agreed_*`` are sparse (only *verified* occurrences) and + roll up to ancestors via ``parents_json``, so they are precomputed in Python and + applied as ``CASE`` annotations — see :meth:`_annotate_verification_counts`. A + dense map would not work there: one ``CASE`` branch per taxon blows past the SQL + token limit on large projects. + """ + from ami.main.models_future.filters import build_occurrence_default_filters_q - qs = qs.annotate( - occurrences_count=self._case_from_map(occurrences_count, 0, models.IntegerField()), - best_determination_score=self._case_from_map(best_score, None, models.FloatField()), - last_detected=self._case_from_map(last_detected, None, models.DateTimeField()), + default_kwargs = dict( + apply_default_score_filter=apply_default_score_filter, + apply_default_taxa_filter=apply_default_taxa_filter, ) + # Filters expressed through the Taxon→occurrences reverse relation, for conditional + # aggregation on the main query. + count_filter = self.get_occurrence_filters( + project, accessor="occurrences" + ) & build_occurrence_default_filters_q( + project, self.request, occurrence_accessor="occurrences", **default_kwargs + ) + qs = qs.annotate( + occurrences_count=models.Count("occurrences", filter=count_filter, distinct=True), + best_determination_score=models.Max("occurrences__determination_score", filter=count_filter), + last_detected=models.Max("occurrences__detections__timestamp", filter=count_filter), + ) if restrict_to_observed: - qs = qs.filter(id__in=list(occurrences_count.keys())) + qs = qs.filter(occurrences_count__gt=0) + # The verification rollup queries the Occurrence model directly, so it needs the + # same filters without the relation prefix. + base = Occurrence.objects.filter(self.get_occurrence_filters(project)).filter( + build_occurrence_default_filters_q(project, self.request, occurrence_accessor="", **default_kwargs) + ) return self._annotate_verification_counts(qs, base) def _annotate_verification_counts(self, qs: QuerySet, base: QuerySet) -> QuerySet: From f20a05d0575dc13773d98023eb34a5e34d18aff7 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Mon, 25 May 2026 18:47:59 -0700 Subject: [PATCH 10/16] fix(taxa): drop redundant taxa filter from occurrences_count aggregate Including the default taxa include/exclude filter in the conditional-aggregate filter added a parents_json containment join the planner couldn't reconcile with the detections (?collection=) join, turning the collection page into a multi-minute scan. It is redundant: occurrences_count groups by determination = the taxon row, so the per-occurrence taxa filter just mirrors filter_by_project_default_taxa (already applied to the queryset). Keep only the per-occurrence score threshold in the aggregate; the verification base still gets the full filters (sparse, cheap). Collection-filtered list now ~0.3s (page + COUNT); default/verified/ordering ~0.1-0.4s. Co-Authored-By: Claude --- ami/main/api/views.py | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/ami/main/api/views.py b/ami/main/api/views.py index 6de88afbe..6038aced3 100644 --- a/ami/main/api/views.py +++ b/ami/main/api/views.py @@ -1710,17 +1710,23 @@ def annotate_taxon_counts( """ from ami.main.models_future.filters import build_occurrence_default_filters_q - default_kwargs = dict( - apply_default_score_filter=apply_default_score_filter, - apply_default_taxa_filter=apply_default_taxa_filter, - ) - # Filters expressed through the Taxon→occurrences reverse relation, for conditional - # aggregation on the main query. + # aggregation on the main query. The default *taxa* include/exclude filter is + # deliberately omitted here: occurrences_count groups by determination = the taxon + # row itself, so the per-occurrence taxa filter is redundant with the row already + # being kept/dropped by filter_by_project_default_taxa (applied to the queryset for + # list responses). Including it would add a parents_json containment join inside the + # aggregate that the planner cannot reconcile with the detections (?collection=) + # join — turning the page into a multi-minute scan. The score threshold is per + # occurrence, so it is kept. count_filter = self.get_occurrence_filters( project, accessor="occurrences" ) & build_occurrence_default_filters_q( - project, self.request, occurrence_accessor="occurrences", **default_kwargs + project, + self.request, + occurrence_accessor="occurrences", + apply_default_score_filter=apply_default_score_filter, + apply_default_taxa_filter=False, ) qs = qs.annotate( occurrences_count=models.Count("occurrences", filter=count_filter, distinct=True), @@ -1730,10 +1736,18 @@ def annotate_taxon_counts( if restrict_to_observed: qs = qs.filter(occurrences_count__gt=0) - # The verification rollup queries the Occurrence model directly, so it needs the - # same filters without the relation prefix. + # The verification rollup queries the Occurrence model directly (so no relation + # prefix), and rolls up to ancestors via parents_json, so it does need the full + # default filters. Its driving set is sparse (verified occurrences only), so the + # taxa containment join here is cheap. base = Occurrence.objects.filter(self.get_occurrence_filters(project)).filter( - build_occurrence_default_filters_q(project, self.request, occurrence_accessor="", **default_kwargs) + build_occurrence_default_filters_q( + project, + self.request, + occurrence_accessor="", + apply_default_score_filter=apply_default_score_filter, + apply_default_taxa_filter=apply_default_taxa_filter, + ) ) return self._annotate_verification_counts(qs, base) From cf86550da5ba65dbdf840138c3d5e399d2cce081 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Mon, 25 May 2026 19:03:14 -0700 Subject: [PATCH 11/16] fix(taxa): remove redundant TaxonCollectionFilter backend The filter_backends chain included TaxonCollectionFilter, which unconditionally appends queryset.filter(occurrences__detections__source_image__collections=) to the Taxon queryset on top of whatever annotate_taxon_counts already does. The collection filter is now fully enforced inside the conditional aggregates (via get_occurrence_filters(accessor="occurrences")) and the observed-set HAVING, so the backend only added a redundant JOIN on the main qs. Combined with the aggregate GROUP BY, that JOIN turned the collection-filtered taxa page into a multi-minute scan; removing it brings ?collection= back to sub-second. Co-Authored-By: Claude --- ami/main/api/views.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ami/main/api/views.py b/ami/main/api/views.py index 6038aced3..7bd359f45 100644 --- a/ami/main/api/views.py +++ b/ami/main/api/views.py @@ -1459,9 +1459,12 @@ class TaxonViewSet(DefaultViewSet, ProjectMixin): queryset = Taxon.objects.all().defer("notes") serializer_class = TaxonSerializer + # `?collection=` is handled inside annotate_taxon_counts (via get_occurrence_filters + # and a HAVING on occurrences_count > 0), so TaxonCollectionFilter would only add a + # redundant JOIN on the main queryset that the planner cannot reconcile with the + # conditional-aggregate GROUP BY — turning the page into a multi-minute scan. filter_backends = DefaultViewSetMixin.filter_backends + [ CustomTaxonFilter, - TaxonCollectionFilter, TaxonTaxaListFilter, TaxonBestScoreFilter, TaxonTagFilter, From 4f456819335dcf1264ec1e0d6ea22969ac6a4b49 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Mon, 25 May 2026 19:04:55 -0700 Subject: [PATCH 12/16] =?UTF-8?q?docs(taxa):=20document=20sparse=20vs=20de?= =?UTF-8?q?nse=20=E2=80=94=20when=20CASE=20breaks,=20when=20to=20use=20con?= =?UTF-8?q?ditional=20aggregation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Captures the two findings from the collection-path timeout fix: (1) the CASE-from-map precompute pattern only scales for sparse maps because dense maps blow past sqlparse's 10000-token limit, and (2) the two gotchas that turn conditional aggregation from sub-second into a multi-minute scan (taxa filter redundant inside the aggregate, redundant collection JOIN backend on top of the aggregate GROUP BY). Co-Authored-By: Claude --- .../hierarchical-rollup-query-performance.md | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/docs/claude/reference/hierarchical-rollup-query-performance.md b/docs/claude/reference/hierarchical-rollup-query-performance.md index c8f8e1b8c..4ff33da03 100644 --- a/docs/claude/reference/hierarchical-rollup-query-performance.md +++ b/docs/claude/reference/hierarchical-rollup-query-performance.md @@ -102,6 +102,55 @@ and were the original timeout: - **GIN `jsonb_path_ops` only serves `@>` with a constant RHS.** Literal `parents_json__contains=[{"id": X}]` uses it; an `OuterRef` RHS does not. +## When to use which mechanism (sparse vs dense) + +The CASE-from-map pattern above is the right tool **only when the map is sparse**. +For *dense* per-taxon aggregates — one value per observed taxon, e.g. the existing +`occurrences_count` / `best_determination_score` / `last_detected` for a project's +~hundreds-to-thousands of taxa — it does **not** work: one `When` branch per taxon +blows past `sqlparse`'s parser limit and the query fails with +`SQLParseError: Maximum number of tokens exceeded (10000)`. + +The scalable alternative for dense aggregates is **conditional aggregation over the +reverse relation** (already documented for Event counts in `build_occurrence_default_filters_q`): + +```python +count_filter = ( + self.get_occurrence_filters(project, accessor="occurrences") + & build_occurrence_default_filters_q(project, request, occurrence_accessor="occurrences", ...) +) +qs = qs.annotate( + occurrences_count=Count("occurrences", filter=count_filter, distinct=True), + best_determination_score=Max("occurrences__determination_score", filter=count_filter), + last_detected=Max("occurrences__detections__timestamp", filter=count_filter), +).filter(occurrences_count__gt=0) # HAVING — replaces the EXISTS membership query +``` + +One GROUP BY, constant-size SQL, scales in both taxa and occurrences. `Count(distinct)` +dedupes the detections fan-out under `?collection=`. The HAVING replaces a separate +membership query (the determination ids present in the filtered set are exactly the +observed taxa). + +### Two gotchas that turn fast conditional aggregation into a multi-minute scan + +1. **Do not include the default *taxa* include/exclude filter in `count_filter` when + the count groups by `determination = the taxon row`.** The filter is redundant + (`filter_by_project_default_taxa` already keeps/drops the row at the queryset + level), and including it adds a `parents_json` containment join inside the + aggregate that the planner cannot reconcile with the detections join from + `?collection=` — measured 0.3s → 182s on a ~1k-taxa project. Keep only the score + threshold (per-occurrence, not redundant). The verification rollup *base* (which + queries Occurrence directly and rolls up to ancestors) does still need the taxa + filter, and pays no cost because its driving set is sparse. + +2. **Audit `filter_backends` for redundant collection / event JOIN filters before + adding conditional-aggregate annotations.** A backend like + `queryset.filter(occurrences__detections__source_image__collections=)` was + harmless on top of correlated subqueries but, combined with the aggregate + GROUP BY, induces an INNER JOIN that multiplies taxon rows and breaks the planner. + Express the filter once — inside the aggregate (`accessor="occurrences"`) — and + remove the backend, since the HAVING already enforces membership. + ## Future direction — denormalised per-project observed taxa The precompute approach scales with verified-data volume. If per-project taxa From e014a73aeee269b7273ee2c7d1be3a872e3c027e Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Tue, 26 May 2026 11:17:17 -0700 Subject: [PATCH 13/16] =?UTF-8?q?docs(taxa):=20next-session=20handoff=20?= =?UTF-8?q?=E2=80=94=20hybrid=20direct-aggregates=20+=20move=20to=20TaxonQ?= =?UTF-8?q?uerySet?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Captures the two follow-ups: (1) restore correlated Subquery aggregates for the default path (regressed 0.82s -> 2.14s under conditional aggregation) while keeping conditional aggregation for ?collection=; (2) move with_observation_counts / observed_in_project / with_verification_counts onto TaxonQuerySet to lighten the viewset. Includes the live timings, the three gotchas to preserve, and the commit chronology so the next session can pick up cold. Co-Authored-By: Claude --- docs/claude/prompts/NEXT_SESSION_PROMPT.md | 103 +++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 docs/claude/prompts/NEXT_SESSION_PROMPT.md diff --git a/docs/claude/prompts/NEXT_SESSION_PROMPT.md b/docs/claude/prompts/NEXT_SESSION_PROMPT.md new file mode 100644 index 000000000..a56af4620 --- /dev/null +++ b/docs/claude/prompts/NEXT_SESSION_PROMPT.md @@ -0,0 +1,103 @@ +# Next session — PR #1317 follow-ups + +## Branch / deploy state (2026-05-26) + +- Branch `worktree-taxa-verification-counts`, HEAD `4f456819` (docs-only on top of `cf86550d`). +- serbia (`antenna-dev-serbia`, `~/antenna`) deployed at `cf86550d`. +- Migrations applied through `0087_taxon_parents_json_gin_index` (renumbered from `0085` after merging main). +- 43 taxa tests pass (`TestTaxaVerification`, `TestTaxonomyViews`, `TestTaxonListQueryCount`, `TestProjectDefaultTaxaFilter`, `TestCachedCountsDefaultFilters`). + +## Final live timings on project 5 (~1k taxa / 17k occ) + +| path | ms | +|---|---| +| default `limit=25` | 2140 (was 820 pre-refactor — **regression**) | +| `verified=true` | 810 | +| `verified=false` | 1190 | +| `ordering=verified_count` | 1450 | +| `collection=5` | 1830 (was 30s+ timeout) | +| `collection=5&with_agreement=true` | 1920 | + +## Why default regressed + +Conditional aggregation does **one big GROUP BY** over all project occurrences (16861 rows) every request. The previous correlated `Subquery` form was **index-served per row** by the composite `(determination_id, project_id, event_id, determination_score)` index — perfect for the default path. Conditional aggregation is universally correct + scales to the collection path; correlated subqueries are faster *when no detections join is in play*. + +## Tasks for next session + +### 1. Hybrid: keep conditional aggregation for `?collection=`, restore correlated subqueries for everything else + +In `TaxonViewSet.annotate_taxon_counts` (`ami/main/api/views.py:1683`), dispatch on whether the occurrence filter joins detections (i.e. `?collection=` is in the request — the only filter that currently induces that join): + +- **No detections join** (default / event / deployment / verified / ordering): use the ORIGINAL three correlated `Subquery` annotations (occurrences_count / best_score / last_detected) — restore the block that was deleted in `838f9d75`. Membership = the materialised `id__in` from `29bec78c` (or the original EXISTS; both worked fine without the detections join). +- **Detections join in play** (`?collection=`): use the conditional aggregation block that exists now (`Count("occurrences", filter=count_filter, distinct=True)` + `Max(...)` + HAVING). Already validated at 1.83s on project 5. + +Pick the branch from `request.query_params` (`"collection" in self.request.query_params`) rather than introspecting the Q object — simpler and reflects the actual decision. + +Targets after hybrid: default ≤ 0.9s, collection ≤ 2s. Verify both on serbia. + +### 2. Move counting into `TaxonQuerySet` methods (lighten view, make reusable) + +Existing custom queryset: `TaxonQuerySet` at `ami/main/models.py:3422` (already has `with_occurrence_counts(project)`, `filter_by_project_default_taxa`, `visible_for_user`). Manager at `models.py:3488`. The CLAUDE.md "Custom QuerySet Methods (Always Use These)" section already lists this as the canonical pattern. + +Methods to add: + +- `with_observation_counts(project, request, *, apply_default_score_filter=True, apply_default_taxa_filter=True, occurrence_filters=None)` — applies the dense aggregates (after hybrid: correlated subqueries OR conditional aggregation). Replaces the existing `with_occurrence_counts` (or replaces its body — the existing one is just `Count("occurrences", distinct=True)`, narrower scope). +- `observed_in_project(project, request, *, occurrence_filters=None)` — membership filter, returns the qs filtered to observed taxa. Mirrors the existing pattern. +- `with_verification_counts(project, request)` — the sparse CASE-from-map rollup (currently `TaxonViewSet._annotate_verification_counts`). Note this method materialises an Occurrence query and builds Python maps before annotating, but that's still appropriate to live on the queryset (it's a method, not a pure annotation). + +What stays in the view: +- `get_occurrence_filters(project, accessor="")` — parses request query params for `occurrence_id` / `deployment` / `event` / `collection`, plus the per-id existence checks that raise 404. View-specific (request introspection). Pass the resulting `Q` into the queryset methods. +- `_include_agreement` (request param parsing). +- `_case_from_map` could move to the queryset module as a module-level helper since it's generic; or stay on the view. Tradeoff: queryset module if anything else might use it. + +After refactor, `get_taxa_observed` shrinks to chaining queryset methods: + +```python +def get_taxa_observed(self, qs, project, include_unobserved=False, apply_default_score_filter=True, apply_default_taxa_filter=True): + occ_filters = self.get_occurrence_filters(project) + qs = qs.with_observation_counts(project, self.request, occurrence_filters=occ_filters, + apply_default_score_filter=apply_default_score_filter, + apply_default_taxa_filter=apply_default_taxa_filter) + if not include_unobserved: + qs = qs.observed_in_project(project, self.request, occurrence_filters=occ_filters) + return qs.with_verification_counts(project, self.request, occurrence_filters=occ_filters, + include_agreement=self._include_agreement(), + verified_param=self.request.query_params.get("verified")) +``` + +(Verified filter application — currently inside `_annotate_verification_counts` — can stay there or move to a small `.filter_verified(...)` queryset method. Either way the view loses the filter wiring.) + +### 3. Gotchas to preserve (cited in `docs/claude/reference/hierarchical-rollup-query-performance.md`) + +- **Dense CASE-from-map fails on large projects** — `SQLParseError: tokens exceeded 10000`. Only use the CASE-from-map pattern for *sparse* maps (verified subset). Dense aggregates need conditional aggregation OR correlated subqueries. +- **Don't include the default taxa filter in the conditional-aggregate `count_filter`** — it adds a `parents_json` containment join inside the aggregate that the planner cannot reconcile with the detections join under `?collection=`. Redundant anyway because `filter_by_project_default_taxa` keeps/drops the row at the qs level. Keep only score threshold in `count_filter`. +- **`TaxonCollectionFilter` removed** (`cf86550d`) — the backend's `queryset.filter(occurrences__detections__source_image__collections=)` was redundant on top of the aggregate's collection filter and broke the planner. If reintroducing for any reason, gate it on the absence of conditional aggregation. + +### 4. Other still-open items (already tracked, lower priority) + +- Browser check of the FE Verified column / verified filter pill / detail panel — not done. +- The `TaxonCollectionFilter` class itself is still defined at `views.py:1207` but unused — delete or keep as cruft? Either way. +- Future scaling: dedicated `TaxonObserved` denorm model (`[[project-taxon-observed-denorm]]`) — only if the conditional-aggregation default-regression or future read-path counts keep biting. + +## Key files / commits + +- `ami/main/api/views.py:1540` `get_occurrence_filters(project, accessor="")` +- `ami/main/api/views.py:1683` `annotate_taxon_counts(qs, project, ...)` +- `ami/main/api/views.py:1740` `_annotate_verification_counts(qs, base)` +- `ami/main/api/views.py:1675` `_case_from_map(mapping, default, output_field)` +- `ami/main/api/views.py:1462` filter_backends (TaxonCollectionFilter removed) +- `ami/main/models.py:3422` `TaxonQuerySet` (target home for new methods) +- `docs/claude/reference/hierarchical-rollup-query-performance.md` — pattern + gotchas reference +- `ami/main/tests.py:4766` `TestTaxaVerification` — gate the hybrid against these + +## Commits this session (most recent first) + +- `4f456819` docs: sparse vs dense, gotchas +- `cf86550d` fix: remove redundant TaxonCollectionFilter backend (collection 182s → 1.83s) +- `f20a05d0` fix: drop redundant taxa filter from occurrences_count aggregate (broken→working but slow path still) +- `7f571be7` fix: conditional aggregation for dense per-taxon counts +- `838f9d75` refactor: centralize per-taxon counts (introduced; dense CASE form broke SQL token limit) +- `29bec78c` fix: materialize observed-taxon id set +- `b92b2b0e` fix: collection-COUNT scale (membership id__in subquery — regressed, superseded) +- `30955e33` chore: migration renumber 0085 → 0087 after main merge +- `10c72cbd` fix: dedupe occurrences in verification rollup under collection filter From 04a62c60af49f94d8e12fb8cc6fea7eb25e88c8a Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Tue, 26 May 2026 13:29:43 -0700 Subject: [PATCH 14/16] refactor(taxa): move count logic to TaxonQuerySet, hybrid subquery/aggregation dispatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lifts per-taxon count annotation logic out of TaxonViewSet into four new TaxonQuerySet methods, matching the "Custom QuerySet Methods (Always Use These)" pattern in CLAUDE.md. - with_observation_counts_subqueries — three correlated Subquery annotations (occurrences_count / best_determination_score / last_detected), index-served by the composite (determination_id, project_id, event_id, determination_score) index on Occurrence. Default / event / deployment / verified / ordering paths. - with_observation_counts_aggregated — conditional aggregation over the Taxon→occurrences reverse relation with Count(distinct) dedup. Required under ?collection= where the detections join turns correlated subqueries into per-row scans. - observed_in_project_subqueries — materialised id__in membership for the subquery path (the aggregate path uses HAVING via occurrences_count__gt=0). - with_verification_counts — sparse CASE-from-map rollup of verified_count / agreed_with_prediction_count / (gated) agreed_exact_count over ancestor parents_json, with optional verified=true|false filter. TaxonViewSet.get_taxa_observed shrinks to a dispatcher that picks the count shape based on "collection" in request.query_params and chains the queryset methods. _case_from_map moves to a module-level helper alongside the new queryset methods. Removes the now-redundant TaxonCollectionFilter backend (its INNER JOIN on queryset.filter(occurrences__detections__source_image__collections=) was unreconcilable with the aggregate GROUP BY). Co-Authored-By: Claude --- ami/main/api/views.py | 274 ++++-------------- ami/main/models.py | 249 +++++++++++++++- .../hierarchical-rollup-query-performance.md | 6 +- 3 files changed, 307 insertions(+), 222 deletions(-) diff --git a/ami/main/api/views.py b/ami/main/api/views.py index 7bd359f45..7372503b8 100644 --- a/ami/main/api/views.py +++ b/ami/main/api/views.py @@ -36,8 +36,6 @@ from ami.utils.storages import ConnectionTestResult from ..models import ( - BEST_IDENTIFICATION_ORDER, - BEST_MACHINE_PREDICTION_ORDER, NULL_DETECTIONS_FILTER, Classification, Deployment, @@ -1204,22 +1202,6 @@ def filter_queryset(self, request, queryset, view): return queryset -class TaxonCollectionFilter(filters.BaseFilterBackend): - """ - Filter taxa by the capture set their occurrences belong to. - """ - - query_param = "collection" - - def filter_queryset(self, request, queryset, view): - collection_id = IntegerField(required=False).clean(request.query_params.get(self.query_param)) - if collection_id: - # Here the queryset is the Taxon queryset - return queryset.filter(occurrences__detections__source_image__collections=collection_id) - else: - return queryset - - class OccurrenceViewSet(DefaultViewSet, ProjectMixin): """ API endpoint that allows occurrences to be viewed or edited. @@ -1459,10 +1441,11 @@ class TaxonViewSet(DefaultViewSet, ProjectMixin): queryset = Taxon.objects.all().defer("notes") serializer_class = TaxonSerializer - # `?collection=` is handled inside annotate_taxon_counts (via get_occurrence_filters - # and a HAVING on occurrences_count > 0), so TaxonCollectionFilter would only add a - # redundant JOIN on the main queryset that the planner cannot reconcile with the - # conditional-aggregate GROUP BY — turning the page into a multi-minute scan. + # ``?collection=`` is handled inside get_taxa_observed (via get_occurrence_filters + # + TaxonQuerySet.with_observation_counts_aggregated + HAVING). A dedicated + # filter_backends entry that re-applied the collection filter on the main queryset + # would add a redundant JOIN that the planner cannot reconcile with the + # conditional-aggregate GROUP BY, turning the page into a multi-minute scan. filter_backends = DefaultViewSetMixin.filter_backends + [ CustomTaxonFilter, TaxonTaxaListFilter, @@ -1548,7 +1531,8 @@ def get_occurrence_filters(self, project: Project, accessor: str = "") -> models ``accessor`` is the relation path to the Occurrence model. Pass "" to filter the Occurrence model directly, or "occurrences" to filter the Taxon model via its - reverse relation (for conditional aggregation in annotate_taxon_counts). + reverse relation (for conditional aggregation in + :meth:`TaxonQuerySet.with_observation_counts_aggregated`). @TODO Consider using a custom filter class for this (see get_filter_name) @TODO Move this to a custom QuerySet manager on the Taxon model @@ -1641,19 +1625,66 @@ def get_taxa_observed( apply_default_score_filter=True, apply_default_taxa_filter=True, ) -> QuerySet: - """ - If a project is passed, only return taxa that have been observed. - Also add the number of occurrences and the last time it was detected. + """Annotate per-(project, taxon) counts and optionally restrict to observed taxa. + + Two SQL shapes for the direct aggregates (``occurrences_count`` / + ``best_determination_score`` / ``last_detected``): + + - **Default / event / deployment / verified paths** — correlated ``Subquery`` + annotations, index-served by the composite + ``(determination_id, project_id, event_id, determination_score)`` index on + Occurrence. Membership via materialised ``id__in``. + - **``?collection=``** — conditional aggregation over the Taxon→occurrences + reverse relation. The detections join would turn each correlated subquery into + a per-row scan, so we switch to one GROUP BY. Membership via HAVING. + + The sparse verification rollup (``verified_count`` / ``agreed_*``) is the same on + either path — a Python pass over the verified subset applied as ``CASE`` + annotations, see :meth:`TaxonQuerySet.with_verification_counts`. + """ + request = self.request + use_aggregation = "collection" in request.query_params + direct_filters = self.get_occurrence_filters(project) + + if use_aggregation: + relation_filters = self.get_occurrence_filters(project, accessor="occurrences") + qs = qs.with_observation_counts_aggregated( + project, + request, + relation_occurrence_filters=relation_filters, + apply_default_score_filter=apply_default_score_filter, + ) + if not include_unobserved: + qs = qs.filter(occurrences_count__gt=0) + else: + qs = qs.with_observation_counts_subqueries( + project, + request, + occurrence_filters=direct_filters, + apply_default_score_filter=apply_default_score_filter, + apply_default_taxa_filter=apply_default_taxa_filter, + ) + if not include_unobserved: + qs = qs.observed_in_project_subqueries( + project, + request, + occurrence_filters=direct_filters, + apply_default_score_filter=apply_default_score_filter, + apply_default_taxa_filter=apply_default_taxa_filter, + ) - Counts are computed by annotate_taxon_counts from one filtered occurrence set - (occurrence_filters + the project's default filters), not per-taxon subqueries. - """ - return self.annotate_taxon_counts( - qs, + verified_param: bool | None = None + if self.action == "list" and "verified" in request.query_params: + verified_param = BooleanField(required=False).clean(request.query_params.get("verified")) + + return qs.with_verification_counts( project, + request, + occurrence_filters=direct_filters, apply_default_score_filter=apply_default_score_filter, apply_default_taxa_filter=apply_default_taxa_filter, - restrict_to_observed=not include_unobserved, + include_agreement=self._include_agreement(), + verified=verified_param, ) def _include_agreement(self) -> bool: @@ -1662,185 +1693,6 @@ def _include_agreement(self) -> bool: return True return bool(BooleanField(required=False).clean(self.request.query_params.get("with_agreement"))) - @staticmethod - def _case_from_map(mapping: dict, default, output_field: models.Field) -> models.expressions.Combinable: - """Turn a precomputed ``{taxon_id: value}`` map into a constant-time ``CASE``. - - The result is constant per row, so it is DB-sortable, paginatable, and stripped - from the pagination ``COUNT`` — unlike a per-taxon correlated subquery, which is - re-evaluated for every row and (in ``COUNT``) for every taxon in the project. - """ - if not mapping: - return models.Value(default, output_field=output_field) - return models.Case( - *( - models.When(id=taxon_id, then=models.Value(value, output_field=output_field)) - for taxon_id, value in mapping.items() - ), - default=models.Value(default, output_field=output_field), - output_field=output_field, - ) - - def annotate_taxon_counts( - self, - qs: QuerySet, - project: Project, - *, - apply_default_score_filter: bool = True, - apply_default_taxa_filter: bool = True, - restrict_to_observed: bool, - ) -> QuerySet: - """Centralised per-(project, taxon) count annotations for the taxa endpoint. - - Every count comes from the same filtered occurrence set (the project's - occurrence_filters + default filters); nothing here uses a per-taxon correlated - subquery, which does not scale once the filters join detections - (``?collection=``): each annotation degrades to a per-row scan — 25x on the - page, and once per taxon in the unbounded pagination ``COUNT``. - - Two count shapes, two mechanisms: - - - Direct aggregates (``occurrences_count``, ``best_determination_score``, - ``last_detected``) are dense — one value per observed taxon — so they use - conditional aggregation over the Taxon→occurrences reverse relation (one GROUP - BY, constant-size SQL). ``Count(distinct)`` dedupes the detections-join fan-out - under ``?collection=``; ``restrict_to_observed`` is then a HAVING on the count. - - ``verified_count`` / ``agreed_*`` are sparse (only *verified* occurrences) and - roll up to ancestors via ``parents_json``, so they are precomputed in Python and - applied as ``CASE`` annotations — see :meth:`_annotate_verification_counts`. A - dense map would not work there: one ``CASE`` branch per taxon blows past the SQL - token limit on large projects. - """ - from ami.main.models_future.filters import build_occurrence_default_filters_q - - # Filters expressed through the Taxon→occurrences reverse relation, for conditional - # aggregation on the main query. The default *taxa* include/exclude filter is - # deliberately omitted here: occurrences_count groups by determination = the taxon - # row itself, so the per-occurrence taxa filter is redundant with the row already - # being kept/dropped by filter_by_project_default_taxa (applied to the queryset for - # list responses). Including it would add a parents_json containment join inside the - # aggregate that the planner cannot reconcile with the detections (?collection=) - # join — turning the page into a multi-minute scan. The score threshold is per - # occurrence, so it is kept. - count_filter = self.get_occurrence_filters( - project, accessor="occurrences" - ) & build_occurrence_default_filters_q( - project, - self.request, - occurrence_accessor="occurrences", - apply_default_score_filter=apply_default_score_filter, - apply_default_taxa_filter=False, - ) - qs = qs.annotate( - occurrences_count=models.Count("occurrences", filter=count_filter, distinct=True), - best_determination_score=models.Max("occurrences__determination_score", filter=count_filter), - last_detected=models.Max("occurrences__detections__timestamp", filter=count_filter), - ) - if restrict_to_observed: - qs = qs.filter(occurrences_count__gt=0) - - # The verification rollup queries the Occurrence model directly (so no relation - # prefix), and rolls up to ancestors via parents_json, so it does need the full - # default filters. Its driving set is sparse (verified occurrences only), so the - # taxa containment join here is cheap. - base = Occurrence.objects.filter(self.get_occurrence_filters(project)).filter( - build_occurrence_default_filters_q( - project, - self.request, - occurrence_accessor="", - apply_default_score_filter=apply_default_score_filter, - apply_default_taxa_filter=apply_default_taxa_filter, - ) - ) - return self._annotate_verification_counts(qs, base) - - def _annotate_verification_counts(self, qs: QuerySet, base: QuerySet) -> QuerySet: - """ - Annotate per-taxon verification / human-model agreement counts, and apply the - ``verified=true|false`` filter on list responses. - - ``base`` is the shared filtered occurrence set from :meth:`annotate_taxon_counts`. - Counts roll up descendant occurrences (verifying a species also counts toward its - genus/family rows). They only concern *verified* occurrences (those with a - non-withdrawn Identification), which are sparse, so the hierarchical rollup is a - single Python pass over that small subset applied as constant-time ``CASE`` - annotations. A correlated ``parents_json`` subquery per taxon does not scale: the - GIN index can't serve a containment whose RHS is an ``OuterRef``. - """ - include_agreement = self._include_agreement() - - # The chosen (best, non-withdrawn) identification's agreed_with_prediction FK. - best_identification_agreed_prediction = models.Subquery( - Identification.objects.filter(occurrence=models.OuterRef("pk"), withdrawn=False) - .order_by(*BEST_IDENTIFICATION_ORDER) - .values("agreed_with_prediction_id")[:1] - ) - verified_occurrences = base.filter( - models.Exists(Identification.objects.filter(occurrence=models.OuterRef("pk"), withdrawn=False)) - ).annotate(_agreed_prediction_id=best_identification_agreed_prediction) - # ``pk`` is selected only so ``.distinct()`` below dedupes by occurrence: when - # occurrence_filters joins to detections (e.g. ?collection=), one Occurrence - # yields a row per matching Detection, which would otherwise inflate the counts. - value_fields = ["pk", "determination_id", "determination__parents_json", "_agreed_prediction_id"] - if include_agreement: - # Top machine prediction's taxon for the same occurrence. - verified_occurrences = verified_occurrences.annotate( - _best_machine_taxon_id=models.Subquery( - Classification.objects.filter(detection__occurrence=models.OuterRef("pk")) - .order_by(*BEST_MACHINE_PREDICTION_ORDER) - .values("taxon_id")[:1] - ) - ) - value_fields.append("_best_machine_taxon_id") - - verified_counts: dict[int, int] = {} - agreed_with_prediction_counts: dict[int, int] = {} - agreed_exact_counts: dict[int, int] = {} - for row in verified_occurrences.values(*value_fields).distinct(): - determination_id = row["determination_id"] - # The taxon itself plus every ancestor — i.e. every row this occurrence rolls up to. - taxon_ids: set[int] = set() - if determination_id is not None: - taxon_ids.add(determination_id) - for parent in row["determination__parents_json"] or []: - # parents_json round-trips through the pydantic schema field, so elements - # may be dicts or ``TaxonParent`` objects depending on the query path. - parent_id = parent.get("id") if isinstance(parent, dict) else getattr(parent, "id", None) - if parent_id is not None: - taxon_ids.add(int(parent_id)) - - for taxon_id in taxon_ids: - verified_counts[taxon_id] = verified_counts.get(taxon_id, 0) + 1 - if row["_agreed_prediction_id"] is not None: - for taxon_id in taxon_ids: - agreed_with_prediction_counts[taxon_id] = agreed_with_prediction_counts.get(taxon_id, 0) + 1 - if ( - include_agreement - and determination_id is not None - and determination_id == row["_best_machine_taxon_id"] - ): - for taxon_id in taxon_ids: - agreed_exact_counts[taxon_id] = agreed_exact_counts.get(taxon_id, 0) + 1 - - int_field = models.IntegerField() - qs = qs.annotate( - verified_count=self._case_from_map(verified_counts, 0, int_field), - agreed_with_prediction_count=self._case_from_map(agreed_with_prediction_counts, 0, int_field), - ) - if include_agreement: - qs = qs.annotate(agreed_exact_count=self._case_from_map(agreed_exact_counts, 0, int_field)) - - # verified=true|false filter (list only); verified=false is the strict complement. - if self.action == "list" and "verified" in self.request.query_params: - verified = BooleanField(required=False).clean(self.request.query_params.get("verified")) - verified_taxon_ids = list(verified_counts.keys()) - if verified: - qs = qs.filter(id__in=verified_taxon_ids) - else: - qs = qs.exclude(id__in=verified_taxon_ids) - - return qs - def attach_tags_by_project(self, qs: QuerySet, project: Project) -> QuerySet: """ Prefetch and override the `.tags` attribute on each Taxon diff --git a/ami/main/models.py b/ami/main/models.py index d011f83f7..78a9ef9fd 100644 --- a/ami/main/models.py +++ b/ami/main/models.py @@ -3419,15 +3419,246 @@ def update_occurrence_determination( return needs_update +def _case_from_map(mapping: dict, default, output_field: models.Field) -> models.expressions.Combinable: + """Turn a precomputed ``{taxon_id: value}`` map into a constant-time ``CASE``. + + The result is constant per row, so it is DB-sortable, paginatable, and stripped from + the pagination ``COUNT`` — unlike a per-taxon correlated subquery, which is + re-evaluated for every row and (in ``COUNT``) for every taxon in the project. Only + sparse maps work here: one ``When`` per entry blows past sqlparse's 10000-token + limit at ~hundreds of taxa × multiple columns. + """ + if not mapping: + return models.Value(default, output_field=output_field) + return models.Case( + *( + models.When(id=taxon_id, then=models.Value(value, output_field=output_field)) + for taxon_id, value in mapping.items() + ), + default=models.Value(default, output_field=output_field), + output_field=output_field, + ) + + class TaxonQuerySet(BaseQuerySet): - def with_occurrence_counts(self, project: Project): - """ - Annotate each taxon with the count of its occurrences for a given project. - """ - qs = self - qs = qs.filter(occurrences__project=project) + def with_observation_counts_subqueries( + self, + project: Project, + request: Request | None, + *, + occurrence_filters: models.Q, + apply_default_score_filter: bool = True, + apply_default_taxa_filter: bool = True, + ): + """Annotate ``occurrences_count`` / ``best_determination_score`` / ``last_detected`` + via three correlated ``Subquery`` annotations. + + Index-served by the composite ``(determination_id, project_id, event_id, + determination_score)`` index on Occurrence. Use this on non-collection paths. + When ``occurrence_filters`` joins detections (e.g. ``?collection=``) the + correlated form degrades to a per-row scan; use + :meth:`with_observation_counts_aggregated` instead. + """ + default_filters_q = build_occurrence_default_filters_q( + project, + request, + occurrence_accessor="", + apply_default_score_filter=apply_default_score_filter, + apply_default_taxa_filter=apply_default_taxa_filter, + ) + base_filter = models.Q(occurrence_filters, determination_id=OuterRef("id")) & default_filters_q + + occurrences_count_subquery = models.Subquery( + Occurrence.objects.filter(base_filter) + .values("determination_id") + .annotate(count=models.Count("id")) + .values("count")[:1], + output_field=models.IntegerField(), + ) + best_score_subquery = models.Subquery( + Occurrence.objects.filter(base_filter) + .values("determination_id") + .annotate(max_score=models.Max("determination_score")) + .values("max_score")[:1], + output_field=models.FloatField(), + ) + last_detected_subquery = models.Subquery( + Occurrence.objects.filter(base_filter, detections__timestamp__isnull=False) + .values("determination_id") + .annotate(last_detected=models.Max("detections__timestamp")) + .values("last_detected")[:1], + output_field=models.DateTimeField(), + ) + return self.annotate( + occurrences_count=Coalesce(occurrences_count_subquery, 0), + best_determination_score=best_score_subquery, + last_detected=last_detected_subquery, + ) + + def with_observation_counts_aggregated( + self, + project: Project, + request: Request | None, + *, + relation_occurrence_filters: models.Q, + apply_default_score_filter: bool = True, + ): + """Annotate ``occurrences_count`` / ``best_determination_score`` / ``last_detected`` + via conditional aggregation over the Taxon→occurrences reverse relation. + + Required when ``relation_occurrence_filters`` joins detections (``?collection=``), + where the correlated-subquery form degrades to per-row scans. One GROUP BY, + constant-size SQL. ``Count(distinct)`` dedupes the detections-join fan-out. + + The default *taxa* include/exclude filter is deliberately omitted from + ``count_filter``: it is redundant with row-level + :meth:`filter_by_project_default_taxa`, and including it adds a + ``parents_json`` containment join inside the aggregate that the planner cannot + reconcile with the detections join (measured: 0.3s → 182s on a ~1k-taxa project). + Score threshold is per-occurrence so it stays. + """ + count_filter = relation_occurrence_filters & build_occurrence_default_filters_q( + project, + request, + occurrence_accessor="occurrences", + apply_default_score_filter=apply_default_score_filter, + apply_default_taxa_filter=False, + ) + return self.annotate( + occurrences_count=models.Count("occurrences", filter=count_filter, distinct=True), + best_determination_score=models.Max("occurrences__determination_score", filter=count_filter), + last_detected=models.Max("occurrences__detections__timestamp", filter=count_filter), + ) - return qs.annotate(occurrence_count=models.Count("occurrences", distinct=True)) + def observed_in_project_subqueries( + self, + project: Project, + request: Request | None, + *, + occurrence_filters: models.Q, + apply_default_score_filter: bool = True, + apply_default_taxa_filter: bool = True, + ): + """Restrict the queryset to taxa observed in the filtered occurrence set, via a + materialised ``id__in``. Pair with :meth:`with_observation_counts_subqueries`. + + The materialised form runs the (potentially detections-joined) filter exactly + once and leaves the pagination ``COUNT`` / page as a plain indexed ``id IN + (...)``. Aggregation-path callers should use ``.filter(occurrences_count__gt=0)`` + (HAVING) instead. + """ + default_filters_q = build_occurrence_default_filters_q( + project, + request, + occurrence_accessor="", + apply_default_score_filter=apply_default_score_filter, + apply_default_taxa_filter=apply_default_taxa_filter, + ) + observed_taxon_ids = list( + Occurrence.objects.filter(occurrence_filters) + .filter(default_filters_q) + .filter(determination_id__isnull=False) + .values_list("determination_id", flat=True) + .distinct() + ) + return self.filter(id__in=observed_taxon_ids) + + def with_verification_counts( + self, + project: Project, + request: Request | None, + *, + occurrence_filters: models.Q, + apply_default_score_filter: bool = True, + apply_default_taxa_filter: bool = True, + include_agreement: bool = False, + verified: bool | None = None, + ): + """Annotate ``verified_count`` / ``agreed_with_prediction_count`` (and, if + ``include_agreement``, ``agreed_exact_count``) and optionally apply the + ``verified=true|false`` filter. + + Counts roll up descendant occurrences (verifying a species also counts toward + its genus / family rows). They concern only *verified* occurrences (those with a + non-withdrawn ``Identification``) — sparse relative to all occurrences — so the + hierarchical rollup is a single Python pass over that small subset applied as + constant-time ``CASE`` annotations. A correlated ``parents_json`` subquery per + taxon would not scale (GIN can't serve a containment with an ``OuterRef`` RHS). + """ + default_q = build_occurrence_default_filters_q( + project, + request, + occurrence_accessor="", + apply_default_score_filter=apply_default_score_filter, + apply_default_taxa_filter=apply_default_taxa_filter, + ) + base = Occurrence.objects.filter(occurrence_filters).filter(default_q) + + # The chosen (best, non-withdrawn) identification's agreed_with_prediction FK. + best_identification_agreed_prediction = models.Subquery( + Identification.objects.filter(occurrence=OuterRef("pk"), withdrawn=False) + .order_by(*BEST_IDENTIFICATION_ORDER) + .values("agreed_with_prediction_id")[:1] + ) + verified_occurrences = base.filter( + Exists(Identification.objects.filter(occurrence=OuterRef("pk"), withdrawn=False)) + ).annotate(_agreed_prediction_id=best_identification_agreed_prediction) + # ``pk`` is selected only so ``.distinct()`` below dedupes by occurrence: when + # occurrence_filters joins to detections (e.g. ?collection=), one Occurrence + # yields a row per matching Detection, which would otherwise inflate counts. + value_fields = ["pk", "determination_id", "determination__parents_json", "_agreed_prediction_id"] + if include_agreement: + verified_occurrences = verified_occurrences.annotate( + _best_machine_taxon_id=models.Subquery( + Classification.objects.filter(detection__occurrence=OuterRef("pk")) + .order_by(*BEST_MACHINE_PREDICTION_ORDER) + .values("taxon_id")[:1] + ) + ) + value_fields.append("_best_machine_taxon_id") + + verified_counts: dict[int, int] = {} + agreed_with_prediction_counts: dict[int, int] = {} + agreed_exact_counts: dict[int, int] = {} + for row in verified_occurrences.values(*value_fields).distinct(): + determination_id = row["determination_id"] + taxon_ids: set[int] = set() + if determination_id is not None: + taxon_ids.add(determination_id) + for parent in row["determination__parents_json"] or []: + # parents_json round-trips through the pydantic schema field, so elements + # may be dicts or ``TaxonParent`` objects depending on the query path. + parent_id = parent.get("id") if isinstance(parent, dict) else getattr(parent, "id", None) + if parent_id is not None: + taxon_ids.add(int(parent_id)) + + for taxon_id in taxon_ids: + verified_counts[taxon_id] = verified_counts.get(taxon_id, 0) + 1 + if row["_agreed_prediction_id"] is not None: + for taxon_id in taxon_ids: + agreed_with_prediction_counts[taxon_id] = agreed_with_prediction_counts.get(taxon_id, 0) + 1 + if ( + include_agreement + and determination_id is not None + and determination_id == row["_best_machine_taxon_id"] + ): + for taxon_id in taxon_ids: + agreed_exact_counts[taxon_id] = agreed_exact_counts.get(taxon_id, 0) + 1 + + int_field = models.IntegerField() + qs = self.annotate( + verified_count=_case_from_map(verified_counts, 0, int_field), + agreed_with_prediction_count=_case_from_map(agreed_with_prediction_counts, 0, int_field), + ) + if include_agreement: + qs = qs.annotate(agreed_exact_count=_case_from_map(agreed_exact_counts, 0, int_field)) + + if verified is True: + qs = qs.filter(id__in=list(verified_counts.keys())) + elif verified is False: + qs = qs.exclude(id__in=list(verified_counts.keys())) + + return qs def filter_by_project_default_taxa(self, project: Project | None = None, request: Request | None = None): """ @@ -3818,11 +4049,11 @@ def best_determination_score(self) -> float | None: return None def verified_count(self) -> int | None: - # Handled by an annotation when filtering by project (TaxonViewSet.annotate_taxon_counts) + # Handled by an annotation when filtering by project (TaxonQuerySet.with_verification_counts) return None def agreed_with_prediction_count(self) -> int | None: - # Handled by an annotation when filtering by project (TaxonViewSet.annotate_taxon_counts) + # Handled by an annotation when filtering by project (TaxonQuerySet.with_verification_counts) return None def agreed_exact_count(self) -> int | None: diff --git a/docs/claude/reference/hierarchical-rollup-query-performance.md b/docs/claude/reference/hierarchical-rollup-query-performance.md index 4ff33da03..3de10c98b 100644 --- a/docs/claude/reference/hierarchical-rollup-query-performance.md +++ b/docs/claude/reference/hierarchical-rollup-query-performance.md @@ -5,10 +5,12 @@ How to compute a per-taxon count that rolls up descendant occurrences endpoint on large projects. Generalises to any "aggregate over a node and all its `parents_json` descendants" problem. -Reference implementation: `TaxonViewSet.add_verification_data` in -`ami/main/api/views.py` (verified_count / agreed_with_prediction_count / +Reference implementation: `TaxonQuerySet.with_verification_counts` in +`ami/main/models.py` (verified_count / agreed_with_prediction_count / agreed_exact_count for #1316). History: the first revision used the anti-pattern below and timed out; commit `16b14686` switched to the pattern. +The logic moved from `TaxonViewSet.add_verification_data` → the queryset +method as part of the view→queryset refactor for #1317. ## TL;DR From 20683ee2646ddd7eab7df4b6904d5757a0721003 Mon Sep 17 00:00:00 2001 From: Michael Bunsen Date: Tue, 26 May 2026 13:36:29 -0700 Subject: [PATCH 15/16] refactor(taxa): drop model-agreement counts, keep verification only PR scope narrowed to match issue #1316 (per-taxon verification UX). The model-agreement signals (`agreed_with_prediction_count`, `agreed_exact_count`, `with_agreement` query gate) had no FE consumer at merge time and serve a different audience (ML model evaluation) than the human-trust `verified_count`. They are deferred to follow-up issue #1319 where they can be paired with a real FE consumer and renamed to a `model_agreed_*` prefix to disambiguate from human verifications. Removed from the backend: - Classification + Identification subqueries inside `TaxonQuerySet.with_verification_counts` for `_best_machine_taxon_id` and `_agreed_prediction_id` - `include_agreement` parameter on `with_verification_counts` - `TaxonViewSet._include_agreement` and the `with_agreement` query param - `agreement_requested` helper and the field-pop logic in `TaxonListSerializer.__init__` - `agreed_with_prediction_count` / `agreed_exact_count` fields on `TaxonListSerializer` and `TaxonSerializer` - Property stubs `Taxon.agreed_with_prediction_count` / `Taxon.agreed_exact_count` - Tests for the agreed counts (kept the rollup, dedup, list, and ordering tests for the verified count) Removed from the frontend: - `Species.numAgreedWithPrediction` / `Species.numAgreedExact` getters - "Agreed with prediction" and "Matched model exactly" rows in the species detail view (Verification block keeps just the Verified row) 40 of the 43 taxa tests pass under CI compose; the dropped three covered the agreed counts. Co-Authored-By: Claude --- ami/main/api/serializers.py | 18 ------ ami/main/api/views.py | 7 --- ami/main/models.py | 63 ++++--------------- ami/main/tests.py | 31 +-------- ui/src/data-services/models/species.ts | 9 --- .../pages/species-details/species-details.tsx | 8 --- 6 files changed, 12 insertions(+), 124 deletions(-) diff --git a/ami/main/api/serializers.py b/ami/main/api/serializers.py index db6aa48bf..6d70906e1 100644 --- a/ami/main/api/serializers.py +++ b/ami/main/api/serializers.py @@ -588,14 +588,6 @@ def get_taxa(self, obj): return [{"id": taxon.id, "name": taxon.name} for taxon in obj.taxa.all()] -def agreement_requested(request: Request | None) -> bool: - """Whether ``with_agreement=true`` is set, gating the heavier agreed_exact_count.""" - if request is None: - return False - value = request.query_params.get("with_agreement", "") - return str(value).lower() in ("true", "1", "yes", "on") - - class TaxonListSerializer(DefaultSerializer): # latest_detection = DetectionNestedSerializer(read_only=True) occurrences = serializers.SerializerMethodField() @@ -603,12 +595,6 @@ class TaxonListSerializer(DefaultSerializer): parent_id = serializers.PrimaryKeyRelatedField(queryset=Taxon.objects.all(), source="parent") tags = serializers.SerializerMethodField() - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # agreed_exact_count is a gated annotation: omit it unless with_agreement=true. - if not agreement_requested(self.context.get("request")): - self.fields.pop("agreed_exact_count", None) - def get_tags(self, obj): tag_list = getattr(obj, "prefetched_tags", []) return TagSerializer(tag_list, many=True, context=self.context).data @@ -624,8 +610,6 @@ class Meta: "details", "occurrences_count", "verified_count", - "agreed_with_prediction_count", - "agreed_exact_count", "occurrences", "tags", "last_detected", @@ -904,8 +888,6 @@ class Meta: "details", "occurrences_count", "verified_count", - "agreed_with_prediction_count", - "agreed_exact_count", "events_count", "occurrences", "gbif_taxon_key", diff --git a/ami/main/api/views.py b/ami/main/api/views.py index 7372503b8..5d21b9b20 100644 --- a/ami/main/api/views.py +++ b/ami/main/api/views.py @@ -1683,16 +1683,9 @@ def get_taxa_observed( occurrence_filters=direct_filters, apply_default_score_filter=apply_default_score_filter, apply_default_taxa_filter=apply_default_taxa_filter, - include_agreement=self._include_agreement(), verified=verified_param, ) - def _include_agreement(self) -> bool: - """Whether the heavier ``agreed_exact_count`` annotation should be computed.""" - if self.action == "retrieve": - return True - return bool(BooleanField(required=False).clean(self.request.query_params.get("with_agreement"))) - def attach_tags_by_project(self, qs: QuerySet, project: Project) -> QuerySet: """ Prefetch and override the `.tags` attribute on each Taxon diff --git a/ami/main/models.py b/ami/main/models.py index 78a9ef9fd..3d21c07cd 100644 --- a/ami/main/models.py +++ b/ami/main/models.py @@ -3571,11 +3571,9 @@ def with_verification_counts( occurrence_filters: models.Q, apply_default_score_filter: bool = True, apply_default_taxa_filter: bool = True, - include_agreement: bool = False, verified: bool | None = None, ): - """Annotate ``verified_count`` / ``agreed_with_prediction_count`` (and, if - ``include_agreement``, ``agreed_exact_count``) and optionally apply the + """Annotate ``verified_count`` and optionally apply the ``verified=true|false`` filter. Counts roll up descendant occurrences (verifying a species also counts toward @@ -3584,6 +3582,9 @@ def with_verification_counts( hierarchical rollup is a single Python pass over that small subset applied as constant-time ``CASE`` annotations. A correlated ``parents_json`` subquery per taxon would not scale (GIN can't serve a containment with an ``OuterRef`` RHS). + + Model-agreement counts (whether the chosen identification matched the model's + top prediction) are tracked separately — see issue #1319. """ default_q = build_occurrence_default_filters_q( project, @@ -3592,34 +3593,17 @@ def with_verification_counts( apply_default_score_filter=apply_default_score_filter, apply_default_taxa_filter=apply_default_taxa_filter, ) - base = Occurrence.objects.filter(occurrence_filters).filter(default_q) - - # The chosen (best, non-withdrawn) identification's agreed_with_prediction FK. - best_identification_agreed_prediction = models.Subquery( - Identification.objects.filter(occurrence=OuterRef("pk"), withdrawn=False) - .order_by(*BEST_IDENTIFICATION_ORDER) - .values("agreed_with_prediction_id")[:1] + verified_occurrences = ( + Occurrence.objects.filter(occurrence_filters) + .filter(default_q) + .filter(Exists(Identification.objects.filter(occurrence=OuterRef("pk"), withdrawn=False))) ) - verified_occurrences = base.filter( - Exists(Identification.objects.filter(occurrence=OuterRef("pk"), withdrawn=False)) - ).annotate(_agreed_prediction_id=best_identification_agreed_prediction) # ``pk`` is selected only so ``.distinct()`` below dedupes by occurrence: when # occurrence_filters joins to detections (e.g. ?collection=), one Occurrence # yields a row per matching Detection, which would otherwise inflate counts. - value_fields = ["pk", "determination_id", "determination__parents_json", "_agreed_prediction_id"] - if include_agreement: - verified_occurrences = verified_occurrences.annotate( - _best_machine_taxon_id=models.Subquery( - Classification.objects.filter(detection__occurrence=OuterRef("pk")) - .order_by(*BEST_MACHINE_PREDICTION_ORDER) - .values("taxon_id")[:1] - ) - ) - value_fields.append("_best_machine_taxon_id") + value_fields = ["pk", "determination_id", "determination__parents_json"] verified_counts: dict[int, int] = {} - agreed_with_prediction_counts: dict[int, int] = {} - agreed_exact_counts: dict[int, int] = {} for row in verified_occurrences.values(*value_fields).distinct(): determination_id = row["determination_id"] taxon_ids: set[int] = set() @@ -3631,27 +3615,10 @@ def with_verification_counts( parent_id = parent.get("id") if isinstance(parent, dict) else getattr(parent, "id", None) if parent_id is not None: taxon_ids.add(int(parent_id)) - for taxon_id in taxon_ids: verified_counts[taxon_id] = verified_counts.get(taxon_id, 0) + 1 - if row["_agreed_prediction_id"] is not None: - for taxon_id in taxon_ids: - agreed_with_prediction_counts[taxon_id] = agreed_with_prediction_counts.get(taxon_id, 0) + 1 - if ( - include_agreement - and determination_id is not None - and determination_id == row["_best_machine_taxon_id"] - ): - for taxon_id in taxon_ids: - agreed_exact_counts[taxon_id] = agreed_exact_counts.get(taxon_id, 0) + 1 - - int_field = models.IntegerField() - qs = self.annotate( - verified_count=_case_from_map(verified_counts, 0, int_field), - agreed_with_prediction_count=_case_from_map(agreed_with_prediction_counts, 0, int_field), - ) - if include_agreement: - qs = qs.annotate(agreed_exact_count=_case_from_map(agreed_exact_counts, 0, int_field)) + + qs = self.annotate(verified_count=_case_from_map(verified_counts, 0, models.IntegerField())) if verified is True: qs = qs.filter(id__in=list(verified_counts.keys())) @@ -4052,14 +4019,6 @@ def verified_count(self) -> int | None: # Handled by an annotation when filtering by project (TaxonQuerySet.with_verification_counts) return None - def agreed_with_prediction_count(self) -> int | None: - # Handled by an annotation when filtering by project (TaxonQuerySet.with_verification_counts) - return None - - def agreed_exact_count(self) -> int | None: - # Handled by an annotation only when with_agreement is requested or on the detail view - return None - def occurrence_images(self, limit: int | None = 10) -> list[str]: # This is handled by an annotation if we are filtering by project, deployment or event return [] diff --git a/ami/main/tests.py b/ami/main/tests.py index c3c28217d..3352bd2d0 100644 --- a/ami/main/tests.py +++ b/ami/main/tests.py @@ -4825,40 +4825,12 @@ def test_verified_count_rolls_up_to_ancestors(self): for ancestor in (self.genus, self.family, self.order): self.assertEqual(self._detail(ancestor)["verified_count"], 3, ancestor.name) - # --- agreed_with_prediction_count (chosen identification only) --- - - def test_agreed_with_prediction_counts_only_chosen_identification(self): - self.assertEqual(self._detail(self.cardui)["agreed_with_prediction_count"], 1) - self.assertEqual(self._detail(self.atalanta)["agreed_with_prediction_count"], 0) - # Rolls up: only occ_pred contributes under the genus. - self.assertEqual(self._detail(self.genus)["agreed_with_prediction_count"], 1) - - # --- agreed_exact_count (gated) --- - - def test_agreed_exact_count_on_detail(self): - # occ_pred + occ_exact: user determination == top machine prediction (cardui). - self.assertEqual(self._detail(self.cardui)["agreed_exact_count"], 2) - # occ_disagree: user picked atalanta, model said cardui → not exact. - self.assertEqual(self._detail(self.atalanta)["agreed_exact_count"], 0) - self.assertEqual(self._detail(self.genus)["agreed_exact_count"], 2) - - def test_agreed_exact_count_gated_on_list(self): - rows = self._list_by_name() - self.assertIn("verified_count", rows["Vanessa cardui"]) - self.assertIn("agreed_with_prediction_count", rows["Vanessa cardui"]) - self.assertNotIn("agreed_exact_count", rows["Vanessa cardui"]) - - rows = self._list_by_name(self.list_url + "&with_agreement=true") - self.assertIn("agreed_exact_count", rows["Vanessa cardui"]) - self.assertEqual(rows["Vanessa cardui"]["agreed_exact_count"], 2) - # --- list field values --- def test_list_field_values(self): rows = self._list_by_name() self.assertEqual(rows["Vanessa cardui"]["occurrences_count"], 2) self.assertEqual(rows["Vanessa cardui"]["verified_count"], 2) - self.assertEqual(rows["Vanessa cardui"]["agreed_with_prediction_count"], 1) self.assertEqual(rows["Vanessa atalanta"]["verified_count"], 1) self.assertEqual(rows["Vanessa itea"]["verified_count"], 0) @@ -4909,7 +4881,6 @@ def test_verified_count_not_inflated_by_collection_join(self): collection = SourceImageCollection.objects.create(project=self.project, name="verif-dedup") collection.images.set(SourceImage.objects.filter(deployment=self.deployment)) - rows = self._list_by_name(f"{self.list_url}&collection={collection.pk}&with_agreement=true") + rows = self._list_by_name(f"{self.list_url}&collection={collection.pk}") # 2 verified cardui occurrences, not 3 — the duplicate detection must not double-count. self.assertEqual(rows["Vanessa cardui"]["verified_count"], 2) - self.assertEqual(rows["Vanessa cardui"]["agreed_exact_count"], 2) diff --git a/ui/src/data-services/models/species.ts b/ui/src/data-services/models/species.ts index bd0f34f33..b4f766b66 100644 --- a/ui/src/data-services/models/species.ts +++ b/ui/src/data-services/models/species.ts @@ -84,15 +84,6 @@ export class Species extends Taxon { return this._species.verified_count ?? 0 } - get numAgreedWithPrediction(): number { - return this._species.agreed_with_prediction_count ?? 0 - } - - // Only present when with_agreement=true is requested (or on the detail view). - get numAgreedExact(): number | undefined { - return this._species.agreed_exact_count ?? undefined - } - get score(): number | undefined { const score = this._species.best_determination_score diff --git a/ui/src/pages/species-details/species-details.tsx b/ui/src/pages/species-details/species-details.tsx index bd5f20edf..808e060ab 100644 --- a/ui/src/pages/species-details/species-details.tsx +++ b/ui/src/pages/species-details/species-details.tsx @@ -167,14 +167,6 @@ export const SpeciesDetails = ({ filters: { taxon: species.id, verified: 'true' }, })} /> - - {species.numAgreedExact !== undefined ? ( - - ) : null} Date: Tue, 26 May 2026 17:56:30 -0700 Subject: [PATCH 16/16] docs(taxa): consolidate PR #1317 findings into single reference, drop stale planning + prompt files [skip ci] - Expanded `docs/claude/reference/hierarchical-rollup-query-performance.md` to be the single canonical reference for per-taxon rollup queries on the taxa endpoint. Adds the `TaxonQuerySet` API surface and dispatch decision table, the detection fan-out dedup pattern, and the model-agreement split-out rationale (deferred to issue #1319). - Deleted `docs/claude/planning/2026-05-20-taxa-verification-guidance-ticket.md` (superseded by the PR description and the reference doc; the deferred model-agreement scope now lives on issue #1319). - Deleted `docs/claude/prompts/NEXT_SESSION_PROMPT.md` (work shipped). Co-Authored-By: Claude --- ...05-20-taxa-verification-guidance-ticket.md | 165 --------------- docs/claude/prompts/NEXT_SESSION_PROMPT.md | 103 --------- .../hierarchical-rollup-query-performance.md | 199 +++++++++++++----- 3 files changed, 150 insertions(+), 317 deletions(-) delete mode 100644 docs/claude/planning/2026-05-20-taxa-verification-guidance-ticket.md delete mode 100644 docs/claude/prompts/NEXT_SESSION_PROMPT.md diff --git a/docs/claude/planning/2026-05-20-taxa-verification-guidance-ticket.md b/docs/claude/planning/2026-05-20-taxa-verification-guidance-ticket.md deleted file mode 100644 index 5d2fc9747..000000000 --- a/docs/claude/planning/2026-05-20-taxa-verification-guidance-ticket.md +++ /dev/null @@ -1,165 +0,0 @@ -# Add guidance and stats about which taxa need verification - -## Motivation - -To get a meaningful project-wide picture of model accuracy and data quality, users should verify at least one occurrence of every unique taxon their pipelines have apparently observed. Today there is no surface that tells them *which* taxa still need attention or how many they have already verified — they have to drill into the occurrence list per taxon and check by hand. - -This ticket adds per-taxon verification and agreement data to the existing taxa list endpoint, plus the matching UI controls, so users can sort and filter to find the taxa that most need their attention. It is part of this year's proactive-surfacing goal and is the natural next step after [#1296](https://github.com/RolnickLab/antenna/pull/1296) (project summary) and [#1307](https://github.com/RolnickLab/antenna/pull/1307) (dataset-wide model agreement endpoint). - -## Scope - -Backend annotations on `GET /api/v2/taxa/` plus a new filter, and a new column + filter in the taxa list table. The dataset-wide `/occurrences/stats/model-agreement/` endpoint from #1307 is unchanged — it stays as the aggregate view; this ticket adds the per-taxon breakdown. - -### Out of scope (queued separately) - -- "Needs verification" badge / status pill on rows. -- Project-summary widget with `X of Y unique taxa verified`. -- Dedicated unverified-taxa queue page. -- Backfilling counts onto a denormalized `Taxon` field. -- Macro-average rollup (occurrence-weighted descendant sum is the only rollup in this ticket). - -## Backend - -### Filter - -- `verified=true|false` on `TaxonViewSet` — matches taxa with at least one non-withdrawn `Identification` on an occurrence whose `determination` is the taxon itself **or any descendant** (via `parents_json__contains`). Implemented as an `EXISTS` subquery, project-scoped, respects `apply_default_filters`. - -### Always-on annotations (cheap) - -Both reuse the existing hierarchical-match pattern (`parents_json__contains [{id: OuterRef("id")}] OR determination_id = OuterRef("id")`) used by the `taxon=` filter in the occurrence list, so a Family row aggregates all its descendant species' occurrences — occurrence-weighted by construction. - -- `verified_count` — count of occurrences under the taxon (incl. descendants) with at least one non-withdrawn `Identification`. Sortable. Single correlated subquery per row. -- `agreed_with_prediction_count` — count of verified occurrences whose chosen `Identification.agreed_with_prediction` is non-null. No join through `Classification` needed — just a non-null FK check. Different signal from `agreed_exact_count` below: this measures the *agree-with-model workflow* (user clicked the "agree" button on a prediction), not independent-match accuracy. - -### Gated annotation (heavier) - -- `agreed_exact_count` — count of verified occurrences where `occurrence.determination_id` equals the top machine `Classification.taxon_id` for the same occurrence. Surfaced only when `with_agreement=true` is on the request. Cost: two correlated subqueries per row (verified set + best classification per occurrence). Needs benchmarking on P#85 (13k verified) before this can default on. **NOT** included in the default list response. - -`occurrence.determination` is already maintained as the top non-withdrawn user identification's taxon (`update_occurrence_determination` runs on every `Identification.save`, see `ami/main/models.py:2528, 3383-3393`), so we do not need a correlated subquery over `Identification` to find the best human identification — just read `determination_id` directly. This is what makes `agreed_exact_count` only two subqueries instead of three. - -### Dropped vs PR #1307 - -- `agreed_under_order_count` is not added per-taxon. The under-order LCA bucket from `/occurrences/stats/model-agreement/` stays available at the dataset level; per-taxon it's redundant since each row already represents a single taxon. - -### Detail view - -`GET /api/v2/taxa//` should include all four fields above unconditionally — single-row cost is negligible. - -### Performance prerequisites - -- The hierarchical match uses `Taxon.parents_json` containment. Without a GIN index on that column, Family- and Order-rank rows on large projects (P#85 has 13k verified, P#20 has 41k occurrences) will fall back to seq-scan and dominate query time. **This index is already flagged as a follow-up to #1307**: - - ```sql - CREATE INDEX CONCURRENTLY main_taxon_parents_json_gin_idx - ON main_taxon USING gin (parents_json jsonb_path_ops); - ``` - - Treat shipping the GIN index as a hard blocker for enabling recursive rollup correctness at higher ranks. Without it, this ticket is safe to ship for projects with shallow taxa lists (species-only) but will be slow elsewhere. - -- The composite-index follow-up from #1307 (`main_occurrence (project_id, determination_score)`) is also relevant — `verified_count` filters by project + verified flag and benefits from the same indexed path. - -### Cost benchmarks to run before merge - -| Query | Project | Expected | Acceptance | -|---|---|---|---| -| `/taxa/?project_id=18&verified=true` | P#18 (45 verified) | < 200ms warm | ≤ 1.5× current `/taxa/` p99 | -| `/taxa/?project_id=85&verified=false` | P#85 (13k verified) | < 500ms warm | ≤ 2× current p99 | -| `/taxa/?project_id=85&with_agreement=true` | P#85 | < 1.5s warm | < 5s cold | -| `/taxa/?project_id=85&ordering=verified_count` | P#85 | < 1s warm | doesn't fall off cliff | - -If `with_agreement=true` exceeds the cold budget on P#85, fall back to keeping `agreed_exact_count` on the detail view only and add a `/taxa/stats/verification/` aggregate endpoint mirroring the #1307 pattern instead. - -## Frontend - -### Taxa list page (`/projects//taxa`) - -- New sortable column **Verified** showing `verified_count` per row. Default ordering unchanged; user can click the column to sort asc (least-verified first → matches the proactive-surfacing intent). -- New filter pill **Verification status**: `All` (default) / `Verified` / `Unverified`. Wires to the `verified=` query param. -- Existing `Occurrences` column stays as the primary count signal; `Verified` sits next to it so the ratio is visually obvious. - -### Not in this ticket - -- No new column for `agreed_with_prediction_count` or `agreed_exact_count` in the table by default. These are surfaced on the **taxon detail page** only (existing detail page; add a small "Verification" panel showing the four numbers). If the table eventually grows a "Model accuracy" toggle, it would flip `with_agreement=true` on — design that in a follow-up. - -## API contract examples - -```bash -# Verified taxa only, project default filters applied -curl '.../api/v2/taxa/?project_id=18&verified=true' - -# Unverified taxa, sorted by occurrence count desc — the "biggest gaps" view -curl '.../api/v2/taxa/?project_id=18&verified=false&ordering=-occurrences_count' - -# Sort by which taxa have the most human verification -curl '.../api/v2/taxa/?project_id=18&ordering=-verified_count' - -# Enable the heavier agreed_exact_count on a list response -curl '.../api/v2/taxa/?project_id=18&with_agreement=true' - -# Detail view always includes all four -curl '.../api/v2/taxa/567/?project_id=18' -``` - -### Response shape (list) - -```json -{ - "id": 567, - "name": "Hyalophora cecropia", - "rank": "SPECIES", - "occurrences_count": 124, - "verified_count": 3, - "agreed_with_prediction_count": 2, - "best_determination_score": 0.94, - "last_detected": "2025-08-12T03:14:22" -} -``` - -With `with_agreement=true`: - -```json -{ - "...": "...", - "verified_count": 3, - "agreed_with_prediction_count": 2, - "agreed_exact_count": 2 -} -``` - -## Test plan - -Backend: - -- [ ] Unit test on `Taxon` queryset: `verified=true` returns only taxa with non-withdrawn identifications, respecting hierarchical match (verifying a species also marks its genus/family as verified at higher-rank rows). -- [ ] Unit test on `Taxon` queryset: `verified=false` is the strict complement on the project's filtered taxa set. -- [ ] Unit test: `verified_count` equals number of verified occurrences under the taxon (descendants included). -- [ ] Unit test: `agreed_with_prediction_count` only counts the chosen identification's `agreed_with_prediction`, not all identifications on the occurrence. -- [ ] Unit test: `agreed_exact_count` reads `occurrence.determination_id` for the user side and top-score `Classification.taxon_id` for the model side, and is only populated when `with_agreement=true`. -- [ ] HTTP test: list endpoint shape includes new fields; gated field absent unless flag is set. -- [ ] HTTP test: `verified=` filter behaves correctly under `apply_defaults=true|false`. -- [ ] Bench: queries above hit acceptance thresholds. - -Frontend: - -- [ ] Verified column renders, sorts asc and desc. -- [ ] Filter pill updates URL, persists across reload, clears with the rest of the project filter state. -- [ ] Detail page Verification panel renders all four fields. - -## Follow-ups (not in this ticket) - -- `Taxon.parents_json` GIN index (carries over from #1307 — gating dependency for rollup correctness at higher ranks). -- `main_occurrence (project_id, determination_score)` composite index (also from #1307). -- Project-summary "X of Y unique taxa verified" widget on the overview page. -- "Needs verification" status pill on taxa rows once we know what the threshold should be (`verified_count == 0` is the obvious v1). -- Dedicated unverified-taxa queue view (pre-filtered, ranked by occurrence count desc). -- Macro-averaged agreement rollup at higher ranks (alternative to the occurrence-weighted sum this ticket ships). -- A `with_counts` / `with_agreement` query-param convention audit across the API — we already have similar gated-annotation patterns elsewhere; document a single convention. - -## References - -- PR #1307 — dataset-wide `/occurrences/stats/model-agreement/` endpoint, established the LCA + agreement-bucket compute that this ticket reuses per-taxon. Includes the GIN-index and composite-index follow-ups this ticket inherits. -- PR #1296 — project summary view, the surfacing target this work feeds into. -- `ami/main/api/views.py:1403` — `TaxonViewSet`, where the new annotations and filter land. -- `ami/main/api/views.py:1576` — `get_taxa_observed`, the existing helper that already wires `parents_json`-aware subqueries; pattern for adding the new ones. -- `ami/main/models.py:2440` — `Identification` model (`agreed_with_prediction` FK). -- `ami/main/models.py:3383` — `update_occurrence_determination`, which keeps `Occurrence.determination` aligned with the top non-withdrawn user identification. diff --git a/docs/claude/prompts/NEXT_SESSION_PROMPT.md b/docs/claude/prompts/NEXT_SESSION_PROMPT.md deleted file mode 100644 index a56af4620..000000000 --- a/docs/claude/prompts/NEXT_SESSION_PROMPT.md +++ /dev/null @@ -1,103 +0,0 @@ -# Next session — PR #1317 follow-ups - -## Branch / deploy state (2026-05-26) - -- Branch `worktree-taxa-verification-counts`, HEAD `4f456819` (docs-only on top of `cf86550d`). -- serbia (`antenna-dev-serbia`, `~/antenna`) deployed at `cf86550d`. -- Migrations applied through `0087_taxon_parents_json_gin_index` (renumbered from `0085` after merging main). -- 43 taxa tests pass (`TestTaxaVerification`, `TestTaxonomyViews`, `TestTaxonListQueryCount`, `TestProjectDefaultTaxaFilter`, `TestCachedCountsDefaultFilters`). - -## Final live timings on project 5 (~1k taxa / 17k occ) - -| path | ms | -|---|---| -| default `limit=25` | 2140 (was 820 pre-refactor — **regression**) | -| `verified=true` | 810 | -| `verified=false` | 1190 | -| `ordering=verified_count` | 1450 | -| `collection=5` | 1830 (was 30s+ timeout) | -| `collection=5&with_agreement=true` | 1920 | - -## Why default regressed - -Conditional aggregation does **one big GROUP BY** over all project occurrences (16861 rows) every request. The previous correlated `Subquery` form was **index-served per row** by the composite `(determination_id, project_id, event_id, determination_score)` index — perfect for the default path. Conditional aggregation is universally correct + scales to the collection path; correlated subqueries are faster *when no detections join is in play*. - -## Tasks for next session - -### 1. Hybrid: keep conditional aggregation for `?collection=`, restore correlated subqueries for everything else - -In `TaxonViewSet.annotate_taxon_counts` (`ami/main/api/views.py:1683`), dispatch on whether the occurrence filter joins detections (i.e. `?collection=` is in the request — the only filter that currently induces that join): - -- **No detections join** (default / event / deployment / verified / ordering): use the ORIGINAL three correlated `Subquery` annotations (occurrences_count / best_score / last_detected) — restore the block that was deleted in `838f9d75`. Membership = the materialised `id__in` from `29bec78c` (or the original EXISTS; both worked fine without the detections join). -- **Detections join in play** (`?collection=`): use the conditional aggregation block that exists now (`Count("occurrences", filter=count_filter, distinct=True)` + `Max(...)` + HAVING). Already validated at 1.83s on project 5. - -Pick the branch from `request.query_params` (`"collection" in self.request.query_params`) rather than introspecting the Q object — simpler and reflects the actual decision. - -Targets after hybrid: default ≤ 0.9s, collection ≤ 2s. Verify both on serbia. - -### 2. Move counting into `TaxonQuerySet` methods (lighten view, make reusable) - -Existing custom queryset: `TaxonQuerySet` at `ami/main/models.py:3422` (already has `with_occurrence_counts(project)`, `filter_by_project_default_taxa`, `visible_for_user`). Manager at `models.py:3488`. The CLAUDE.md "Custom QuerySet Methods (Always Use These)" section already lists this as the canonical pattern. - -Methods to add: - -- `with_observation_counts(project, request, *, apply_default_score_filter=True, apply_default_taxa_filter=True, occurrence_filters=None)` — applies the dense aggregates (after hybrid: correlated subqueries OR conditional aggregation). Replaces the existing `with_occurrence_counts` (or replaces its body — the existing one is just `Count("occurrences", distinct=True)`, narrower scope). -- `observed_in_project(project, request, *, occurrence_filters=None)` — membership filter, returns the qs filtered to observed taxa. Mirrors the existing pattern. -- `with_verification_counts(project, request)` — the sparse CASE-from-map rollup (currently `TaxonViewSet._annotate_verification_counts`). Note this method materialises an Occurrence query and builds Python maps before annotating, but that's still appropriate to live on the queryset (it's a method, not a pure annotation). - -What stays in the view: -- `get_occurrence_filters(project, accessor="")` — parses request query params for `occurrence_id` / `deployment` / `event` / `collection`, plus the per-id existence checks that raise 404. View-specific (request introspection). Pass the resulting `Q` into the queryset methods. -- `_include_agreement` (request param parsing). -- `_case_from_map` could move to the queryset module as a module-level helper since it's generic; or stay on the view. Tradeoff: queryset module if anything else might use it. - -After refactor, `get_taxa_observed` shrinks to chaining queryset methods: - -```python -def get_taxa_observed(self, qs, project, include_unobserved=False, apply_default_score_filter=True, apply_default_taxa_filter=True): - occ_filters = self.get_occurrence_filters(project) - qs = qs.with_observation_counts(project, self.request, occurrence_filters=occ_filters, - apply_default_score_filter=apply_default_score_filter, - apply_default_taxa_filter=apply_default_taxa_filter) - if not include_unobserved: - qs = qs.observed_in_project(project, self.request, occurrence_filters=occ_filters) - return qs.with_verification_counts(project, self.request, occurrence_filters=occ_filters, - include_agreement=self._include_agreement(), - verified_param=self.request.query_params.get("verified")) -``` - -(Verified filter application — currently inside `_annotate_verification_counts` — can stay there or move to a small `.filter_verified(...)` queryset method. Either way the view loses the filter wiring.) - -### 3. Gotchas to preserve (cited in `docs/claude/reference/hierarchical-rollup-query-performance.md`) - -- **Dense CASE-from-map fails on large projects** — `SQLParseError: tokens exceeded 10000`. Only use the CASE-from-map pattern for *sparse* maps (verified subset). Dense aggregates need conditional aggregation OR correlated subqueries. -- **Don't include the default taxa filter in the conditional-aggregate `count_filter`** — it adds a `parents_json` containment join inside the aggregate that the planner cannot reconcile with the detections join under `?collection=`. Redundant anyway because `filter_by_project_default_taxa` keeps/drops the row at the qs level. Keep only score threshold in `count_filter`. -- **`TaxonCollectionFilter` removed** (`cf86550d`) — the backend's `queryset.filter(occurrences__detections__source_image__collections=)` was redundant on top of the aggregate's collection filter and broke the planner. If reintroducing for any reason, gate it on the absence of conditional aggregation. - -### 4. Other still-open items (already tracked, lower priority) - -- Browser check of the FE Verified column / verified filter pill / detail panel — not done. -- The `TaxonCollectionFilter` class itself is still defined at `views.py:1207` but unused — delete or keep as cruft? Either way. -- Future scaling: dedicated `TaxonObserved` denorm model (`[[project-taxon-observed-denorm]]`) — only if the conditional-aggregation default-regression or future read-path counts keep biting. - -## Key files / commits - -- `ami/main/api/views.py:1540` `get_occurrence_filters(project, accessor="")` -- `ami/main/api/views.py:1683` `annotate_taxon_counts(qs, project, ...)` -- `ami/main/api/views.py:1740` `_annotate_verification_counts(qs, base)` -- `ami/main/api/views.py:1675` `_case_from_map(mapping, default, output_field)` -- `ami/main/api/views.py:1462` filter_backends (TaxonCollectionFilter removed) -- `ami/main/models.py:3422` `TaxonQuerySet` (target home for new methods) -- `docs/claude/reference/hierarchical-rollup-query-performance.md` — pattern + gotchas reference -- `ami/main/tests.py:4766` `TestTaxaVerification` — gate the hybrid against these - -## Commits this session (most recent first) - -- `4f456819` docs: sparse vs dense, gotchas -- `cf86550d` fix: remove redundant TaxonCollectionFilter backend (collection 182s → 1.83s) -- `f20a05d0` fix: drop redundant taxa filter from occurrences_count aggregate (broken→working but slow path still) -- `7f571be7` fix: conditional aggregation for dense per-taxon counts -- `838f9d75` refactor: centralize per-taxon counts (introduced; dense CASE form broke SQL token limit) -- `29bec78c` fix: materialize observed-taxon id set -- `b92b2b0e` fix: collection-COUNT scale (membership id__in subquery — regressed, superseded) -- `30955e33` chore: migration renumber 0085 → 0087 after main merge -- `10c72cbd` fix: dedupe occurrences in verification rollup under collection filter diff --git a/docs/claude/reference/hierarchical-rollup-query-performance.md b/docs/claude/reference/hierarchical-rollup-query-performance.md index 3de10c98b..a3055a785 100644 --- a/docs/claude/reference/hierarchical-rollup-query-performance.md +++ b/docs/claude/reference/hierarchical-rollup-query-performance.md @@ -1,16 +1,23 @@ -# Hierarchical (descendant) rollup counts — performant query patterns +# Per-taxon rollup counts on the taxa endpoint — query patterns and pitfalls -How to compute a per-taxon count that rolls up descendant occurrences -(a Family/Order row aggregating its species) without timing out the list -endpoint on large projects. Generalises to any "aggregate over a node and -all its `parents_json` descendants" problem. +How to compute per-taxon counts on `GET /api/v2/taxa/` that roll up +descendant occurrences (a Family/Order row aggregating its species) without +timing out the list endpoint on large projects. Generalises to any "aggregate +over a taxon node and all its `parents_json` descendants" problem. -Reference implementation: `TaxonQuerySet.with_verification_counts` in -`ami/main/models.py` (verified_count / agreed_with_prediction_count / -agreed_exact_count for #1316). History: the first revision used the -anti-pattern below and timed out; commit `16b14686` switched to the pattern. -The logic moved from `TaxonViewSet.add_verification_data` → the queryset -method as part of the view→queryset refactor for #1317. +Canonical reference for PR #1317 (verification status on taxa views, issue +#1316) and follow-up #1319 (model-agreement stats — deferred). + +Reference implementations in `ami/main/models.py`: + +- `TaxonQuerySet.with_observation_counts_subqueries` (default path, correlated `Subquery`) +- `TaxonQuerySet.with_observation_counts_aggregated` (collection path, conditional aggregation) +- `TaxonQuerySet.observed_in_project_subqueries` (membership filter for default path) +- `TaxonQuerySet.with_verification_counts` (sparse `CASE`-from-map rollup) + +Reference orchestrator in `ami/main/api/views.py`: + +- `TaxonViewSet.get_taxa_observed` (dispatches between the two count shapes) ## TL;DR @@ -22,10 +29,31 @@ method as part of the view→queryset refactor for #1317. build `{taxon_id: count}` dicts (incrementing the determination taxon + every ancestor in `parents_json`), then apply as constant-time `CASE` annotations. Resolve any membership filter from the same set via `id__in`. +- For *dense* per-taxon aggregates (one value per observed taxon: occurrences_count, + best_determination_score, last_detected) the right shape depends on whether the + occurrence filter joins detections: correlated `Subquery` per row when it does not, + conditional aggregation over the reverse relation when it does. - The GIN index (`main_taxon_parents_json_gin_idx`, migration `0087`) only helps the **literal**-RHS containment filters (occurrence-list `taxon=`, `build_occurrence_default_filters_q`), not correlated ones. +## When to use which mechanism + +| Driving set | Shape | Method | Why | +|---|---|---|---| +| Sparse (e.g. verified subset, bounded by review effort) | Python pass + `CASE`-from-map | `with_verification_counts` | Constant-time per row, DB-sortable, stripped from pagination `COUNT`. Sparse maps stay under sqlparse's 10000-token limit. | +| Dense, no detections join (default / event / deployment / verified / ordering paths) | Three correlated `Subquery` annotations | `with_observation_counts_subqueries` + `observed_in_project_subqueries` | Index-served by the composite `(determination_id, project_id, event_id, determination_score)` index on Occurrence. Membership materialised as `id__in`. | +| Dense, detections join in play (`?collection=`) | Conditional aggregation over the Taxon→occurrences reverse relation | `with_observation_counts_aggregated` | One `GROUP BY`, constant-size SQL. The detections join turns each correlated `Subquery` into a per-row scan; the aggregate dedupes via `Count(distinct)` and replaces the membership query with a HAVING (`occurrences_count__gt=0`). | + +The dispatch is in `TaxonViewSet.get_taxa_observed` and keys on whether +`"collection" in request.query_params` — that is the only filter on this viewset +that currently induces a detections join. Add new join-inducing filters to the +same branch. + +The sparse verification rollup is the same on both paths — it queries Occurrence +directly with the same `occurrence_filters` Q, regardless of whether the +observation counts are subqueries or aggregates. + ## The anti-pattern (does not scale) ```python @@ -51,19 +79,23 @@ index). The cost is the correlation: per outer row the planner can't use the ind a parameterised `@>` and re-evaluates the join. **Always benchmark the correlated form, not the standalone query.** -## The pattern (precompute over the sparse set + CASE) +## The sparse pattern (precompute over the small set + CASE) -Key observation: the three counts only concern *verified* occurrences (those with a -non-withdrawn `Identification`), which are sparse — bounded by human review effort, not -total occurrences. +Key observation: the verification counts only concern *verified* occurrences (those +with a non-withdrawn `Identification`), which are sparse — bounded by human review +effort, not total occurrences. ```python -verified_occ = Occurrence.objects.filter(occ_filters).filter(default_filters_q).filter( - Exists(Identification.objects.filter(occurrence=OuterRef("pk"), withdrawn=False)) -).annotate(_agreed_prediction_id=...) # + _best_machine_taxon_id when gated +verified_occ = ( + Occurrence.objects.filter(occ_filters) + .filter(default_filters_q) + .filter(Exists(Identification.objects.filter(occurrence=OuterRef("pk"), withdrawn=False))) +) +# ``pk`` in value_fields + ``.distinct()`` dedupes occurrences when occ_filters +# joins to detections (see "Detection fan-out" below). counts: dict[int, int] = {} -for row in verified_occ.values("determination_id", "determination__parents_json", ...): +for row in verified_occ.values("pk", "determination_id", "determination__parents_json").distinct(): taxon_ids = {row["determination_id"], *(_id(p) for p in row["determination__parents_json"] or [])} for tid in taxon_ids: counts[tid] = counts.get(tid, 0) + 1 @@ -72,54 +104,45 @@ case = Case(*(When(id=tid, then=Value(c)) for tid, c in counts.items()), default=Value(0), output_field=IntegerField()) if counts else Value(0, ...) qs = qs.annotate(verified_count=case) -# membership filter from the same precomputed set: +# Membership filter from the same precomputed set: qs.filter(id__in=list(counts)) / qs.exclude(id__in=list(counts)) ``` Same project, after: every path (`default`, `verified=true/false`, -`ordering=verified_count`) ~0.3s. Cost is `O(verified occ × ancestor depth)`, paid once -per request. The `CASE` annotation is a constant per row → DB-sortable, paginatable, -and stripped from the pagination `COUNT`. +`ordering=verified_count`) ~1.1s cold. Cost is `O(verified occ × ancestor depth)`, +paid once per request. The `CASE` annotation is a constant per row → DB-sortable, +paginatable, and stripped from the pagination `COUNT`. -### Why this keeps COUNT cheap +### Why this keeps `COUNT` cheap Django's `QuerySet.count()` strips **select-only** annotations it doesn't need — so the `CASE` annotations don't appear in the pagination `COUNT`. Two things defeat that strip and were the original timeout: -- a filter that *references* the expensive expression — e.g. `verified=false` as + +- A filter that *references* the expensive expression — e.g. `verified=false` as `~Exists(correlated_subquery)` forces the subquery into the `COUNT` WHERE for every taxon. Resolving the filter via `id__in=` avoids it. - `.distinct()` combined with select annotations (wraps the whole annotated query in the `COUNT` subquery). Watch for it on list viewsets. -## Gotchas - -- **cachalot caches query results**, including across repeated benchmark runs — a second - identical timing is a cache hit, not a real measurement. Wrap timing in - `from cachalot.api import cachalot_disabled` → `with cachalot_disabled(): ...`. -- **`django-pydantic-field` `.values()` returns deserialised pydantic objects**, not - dicts. `parents_json` elements may be `TaxonParent` objects depending on the query - path, so read the id defensively: `p.get("id") if isinstance(p, dict) else getattr(p, "id", None)`. - (This silently broke ancestor rollup — direct/species counts worked, ancestors were 0.) -- **GIN `jsonb_path_ops` only serves `@>` with a constant RHS.** Literal - `parents_json__contains=[{"id": X}]` uses it; an `OuterRef` RHS does not. - -## When to use which mechanism (sparse vs dense) +## The dense pattern (conditional aggregation over the reverse relation) -The CASE-from-map pattern above is the right tool **only when the map is sparse**. -For *dense* per-taxon aggregates — one value per observed taxon, e.g. the existing -`occurrences_count` / `best_determination_score` / `last_detected` for a project's -~hundreds-to-thousands of taxa — it does **not** work: one `When` branch per taxon -blows past `sqlparse`'s parser limit and the query fails with +For dense per-taxon aggregates — one value per observed taxon, e.g. `occurrences_count` +/ `best_determination_score` / `last_detected` for a project's ~hundreds-to-thousands +of taxa — the sparse `CASE`-from-map fails: one `When` branch per taxon blows past +`sqlparse`'s parser limit and the query fails with `SQLParseError: Maximum number of tokens exceeded (10000)`. -The scalable alternative for dense aggregates is **conditional aggregation over the -reverse relation** (already documented for Event counts in `build_occurrence_default_filters_q`): +Use conditional aggregation over the Taxon→occurrences reverse relation: ```python count_filter = ( self.get_occurrence_filters(project, accessor="occurrences") - & build_occurrence_default_filters_q(project, request, occurrence_accessor="occurrences", ...) + & build_occurrence_default_filters_q( + project, request, occurrence_accessor="occurrences", + apply_default_score_filter=True, + apply_default_taxa_filter=False, # see "Two gotchas" below + ) ) qs = qs.annotate( occurrences_count=Count("occurrences", filter=count_filter, distinct=True), @@ -128,11 +151,14 @@ qs = qs.annotate( ).filter(occurrences_count__gt=0) # HAVING — replaces the EXISTS membership query ``` -One GROUP BY, constant-size SQL, scales in both taxa and occurrences. `Count(distinct)` +One `GROUP BY`, constant-size SQL, scales in both taxa and occurrences. `Count(distinct)` dedupes the detections fan-out under `?collection=`. The HAVING replaces a separate membership query (the determination ids present in the filtered set are exactly the observed taxa). +This is also the right shape for `with_charts=...` style endpoints or any other +aggregation over a join — it's the general pattern, not collection-specific. + ### Two gotchas that turn fast conditional aggregation into a multi-minute scan 1. **Do not include the default *taxa* include/exclude filter in `count_filter` when @@ -149,9 +175,84 @@ observed taxa). adding conditional-aggregate annotations.** A backend like `queryset.filter(occurrences__detections__source_image__collections=)` was harmless on top of correlated subqueries but, combined with the aggregate - GROUP BY, induces an INNER JOIN that multiplies taxon rows and breaks the planner. + `GROUP BY`, induces an INNER JOIN that multiplies taxon rows and breaks the planner. Express the filter once — inside the aggregate (`accessor="occurrences"`) — and - remove the backend, since the HAVING already enforces membership. + remove the backend, since the HAVING already enforces membership. (`TaxonCollectionFilter` + was removed for this reason in PR #1317.) + +## Detection fan-out under `?collection=` + +When `occurrence_filters` joins to detections (because `?collection=` resolves to +`detections__source_image__collections=`), a single Occurrence yields one row per +matching Detection in `.values(...)`. Without dedup the rollup counts every detection +as if it were a separate verified occurrence. + +The fix in `with_verification_counts`: + +```python +value_fields = ["pk", "determination_id", "determination__parents_json"] +# ... +verified_occ.values(*value_fields).distinct() +``` + +`pk` is selected only so that `.distinct()` can dedupe by Occurrence. Regression test: +`test_verified_count_not_inflated_by_collection_join` in +`ami/main/tests.py::TestTaxaVerification`. + +The aggregate path uses `Count("occurrences", filter=count_filter, distinct=True)` +which serves the same dedup role for the dense per-taxon counts. + +## Gotchas + +- **`cachalot` caches query results**, including across repeated benchmark runs — a second + identical timing is a cache hit, not a real measurement. Flush between paths with + `docker compose exec -T redis redis-cli FLUSHALL`, or wrap timing in + `from cachalot.api import cachalot_disabled` → `with cachalot_disabled(): ...`. +- **`django-pydantic-field` `.values()` returns deserialised pydantic objects**, not + dicts. `parents_json` elements may be `TaxonParent` objects depending on the query + path, so read the id defensively: + + ```python + parent_id = parent.get("id") if isinstance(parent, dict) else getattr(parent, "id", None) + ``` + + This silently broke ancestor rollup in an earlier revision — direct/species counts + worked, ancestors were 0. +- **GIN `jsonb_path_ops` only serves `@>` with a constant RHS.** Literal + `parents_json__contains=[{"id": X}]` uses the index; an `OuterRef` RHS does not. +- **`Family/Order` rows can show `verified_count > occurrences_count`** because the + former rolls up descendants while the latter is direct-match. Document this in API + consumers if it might confuse readers. + +## Why model-agreement stats were split out (#1319) + +The verification surface (`verified_count`, `verified=true|false` filter) measures +human trust — "how much of this taxon is human-reviewed?" Model-agreement stats +(`agreed_with_prediction_count`, `agreed_exact_count`) measure ML evaluation — "how +often does the model match humans?" Different audiences (naturalists vs ML team) and +different consumers; bundling them into one PR without a FE consumer for the +agreement surface produced dead API columns, naming friction (`agreed_*` was ambiguous +between human and machine signals), and parser duplication (one in serializer, one in +view). + +PR #1317 ships the verification surface only. Follow-up issue #1319 carries the +agreement stats with these implementation guardrails preserved: + +- Reuse the sparse `CASE`-from-map pattern over the verified subset — both + `agreed_with_prediction_count` and `agreed_exact_count` are bounded by the verified + set, so they stay sparse. +- Extend `with_verification_counts` with an `include_agreement` flag and the + `_best_machine_taxon_id` (`Classification` subquery ordered by + `BEST_MACHINE_PREDICTION_ORDER`) + `_agreed_prediction_id` (`Identification` + subquery ordered by `BEST_IDENTIFICATION_ORDER`) annotations from the original + revision. +- Rename to `model_agreed_*` prefix to disambiguate from human verifications. +- Gate behind `?with_model_agreement=true` on the list endpoint to keep the default + cheap; detail view always includes. +- Pass the gate flag through serializer context (single parser), not re-parsed in the + serializer. +- Port the regression tests (rollup, chosen-identification-only, gated-on-list, + detection-dedup) with the new names. ## Future direction — denormalised per-project observed taxa