Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
ea9f45d
feat(taxa): per-taxon verification + agreement counts and verified fi…
mihow May 22, 2026
16b1468
perf(taxa): compute verification rollup in one pass, not per-taxon su…
mihow May 22, 2026
5d929ce
docs(taxa): clarify GIN index purpose + add rollup query-performance …
mihow May 26, 2026
10c72cb
fix(taxa): dedupe occurrences in verification rollup under collection…
mihow May 26, 2026
7dcf325
Merge remote-tracking branch 'origin/main' into worktree-taxa-verific…
mihow May 26, 2026
30955e3
chore(migrations): renumber parents_json GIN index 0085 -> 0087 after…
mihow May 26, 2026
b92b2b0
fix(taxa): make collection-filtered taxa list COUNT scale
mihow May 26, 2026
29bec78
fix(taxa): materialize observed-taxon id set instead of IN-subquery
mihow May 26, 2026
838f9d7
refactor(taxa): centralize per-taxon counts into one filtered-occurre…
mihow May 26, 2026
7f571be
fix(taxa): use conditional aggregation for dense per-taxon counts
mihow May 26, 2026
f20a05d
fix(taxa): drop redundant taxa filter from occurrences_count aggregate
mihow May 26, 2026
cf86550
fix(taxa): remove redundant TaxonCollectionFilter backend
mihow May 26, 2026
4f45681
docs(taxa): document sparse vs dense — when CASE breaks, when to use …
mihow May 26, 2026
e014a73
docs(taxa): next-session handoff — hybrid direct-aggregates + move to…
mihow May 26, 2026
04a62c6
refactor(taxa): move count logic to TaxonQuerySet, hybrid subquery/ag…
mihow May 26, 2026
20683ee
refactor(taxa): drop model-agreement counts, keep verification only
mihow May 26, 2026
02f9dc3
docs(taxa): consolidate PR #1317 findings into single reference, drop…
mihow May 27, 2026
c4132e9
Merge branch 'main' into worktree-taxa-verification-counts [skip ci]
mihow May 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions ami/main/api/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,13 +588,27 @@ def get_taxa(self, obj):
return [{"id": taxon.id, "name": taxon.name} for taxon in obj.taxa.all()]


def agreement_requested(request: Request | None) -> bool:
"""Whether ``with_agreement=true`` is set, gating the heavier agreed_exact_count."""
if request is None:
return False
value = request.query_params.get("with_agreement", "")
Comment thread
mihow marked this conversation as resolved.
Outdated
return str(value).lower() in ("true", "1", "yes", "on")


class TaxonListSerializer(DefaultSerializer):
# latest_detection = DetectionNestedSerializer(read_only=True)
occurrences = serializers.SerializerMethodField()
parents = TaxonParentSerializer(many=True, read_only=True, source="parents_json")
parent_id = serializers.PrimaryKeyRelatedField(queryset=Taxon.objects.all(), source="parent")
tags = serializers.SerializerMethodField()

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# agreed_exact_count is a gated annotation: omit it unless with_agreement=true.
if not agreement_requested(self.context.get("request")):
self.fields.pop("agreed_exact_count", None)

def get_tags(self, obj):
tag_list = getattr(obj, "prefetched_tags", [])
return TagSerializer(tag_list, many=True, context=self.context).data
Expand All @@ -609,6 +623,9 @@ class Meta:
"parents",
"details",
"occurrences_count",
"verified_count",
"agreed_with_prediction_count",
"agreed_exact_count",
"occurrences",
"tags",
"last_detected",
Expand Down Expand Up @@ -886,6 +903,9 @@ class Meta:
"parents",
"details",
"occurrences_count",
"verified_count",
"agreed_with_prediction_count",
Comment thread
mihow marked this conversation as resolved.
Outdated
"agreed_exact_count",
"events_count",
"occurrences",
"gbif_taxon_key",
Expand Down
213 changes: 171 additions & 42 deletions ami/main/api/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from django.core import exceptions
from django.db import models
from django.db.models import OuterRef, Prefetch, Q, Subquery
from django.db.models.functions import Coalesce
from django.db.models.query import QuerySet
from django.forms import BooleanField, CharField, IntegerField
from django.shortcuts import get_object_or_404
Expand Down Expand Up @@ -37,6 +36,8 @@
from ami.utils.storages import ConnectionTestResult

from ..models import (
BEST_IDENTIFICATION_ORDER,
BEST_MACHINE_PREDICTION_ORDER,
NULL_DETECTIONS_FILTER,
Classification,
Deployment,
Expand Down Expand Up @@ -1026,7 +1027,9 @@ class CustomOccurrenceDeterminationFilter(CustomTaxonFilter):
def filter_queryset(self, request, queryset, view):
taxon = self.get_filter_taxon(request, query_params=self.query_params)
if taxon:
# Here the queryset is the Occurrence queryset
# Here the queryset is the Occurrence queryset.
# The literal parents_json containment (constant RHS) is what the GIN index from
# migration 0087 serves — this hierarchical taxon filter is the index's main consumer.
return queryset.filter(
models.Q(determination=taxon) | models.Q(determination__parents_json__contains=[{"id": taxon.pk}])
)
Expand Down Expand Up @@ -1477,6 +1480,7 @@ class TaxonViewSet(DefaultViewSet, ProjectMixin):
"created_at",
"updated_at",
"occurrences_count",
"verified_count",
Comment thread
mihow marked this conversation as resolved.
"last_detected",
"best_determination_score",
"name",
Expand Down Expand Up @@ -1634,10 +1638,8 @@ def get_taxa_observed(
If a project is passed, only return taxa that have been observed.
Also add the number of occurrences and the last time it was detected.

Uses efficient subqueries with default filters applied directly via Q objects
to leverage composite indexes on (determination_id, project_id, event_id, determination_score).
This avoids the N+1 query problem by building a single Q filter that can be reused
across all subqueries.
Counts are computed by annotate_taxon_counts from one filtered occurrence set
(occurrence_filters + the project's default filters), not per-taxon subqueries.
"""
occurrence_filters = self.get_occurrence_filters(project)

Expand All @@ -1654,54 +1656,181 @@ def get_taxa_observed(
apply_default_taxa_filter=apply_default_taxa_filter,
)

# Combine base occurrence filters with default filters
base_filter = models.Q(
qs = self.annotate_taxon_counts(
qs,
occurrence_filters,
determination_id=models.OuterRef("id"),
default_filters_q,
restrict_to_observed=not include_unobserved,
)

base_filter = base_filter & default_filters_q
return qs

# Count occurrences - uses composite index (determination_id, project_id, event_id, determination_score)
occurrences_count_subquery = models.Subquery(
Occurrence.objects.filter(base_filter)
.values("determination_id")
.annotate(count=models.Count("id"))
.values("count")[:1],
output_field=models.IntegerField(),
def _include_agreement(self) -> bool:
"""Whether the heavier ``agreed_exact_count`` annotation should be computed."""
if self.action == "retrieve":
return True
return bool(BooleanField(required=False).clean(self.request.query_params.get("with_agreement")))
Comment thread
mihow marked this conversation as resolved.
Outdated

@staticmethod
def _case_from_map(mapping: dict, default, output_field: models.Field) -> models.expressions.Combinable:
"""Turn a precomputed ``{taxon_id: value}`` map into a constant-time ``CASE``.

The result is constant per row, so it is DB-sortable, paginatable, and stripped
from the pagination ``COUNT`` — unlike a per-taxon correlated subquery, which is
re-evaluated for every row and (in ``COUNT``) for every taxon in the project.
"""
if not mapping:
return models.Value(default, output_field=output_field)
return models.Case(
*(
models.When(id=taxon_id, then=models.Value(value, output_field=output_field))
for taxon_id, value in mapping.items()
),
default=models.Value(default, output_field=output_field),
output_field=output_field,
)

# Get best score - uses same composite index
best_score_subquery = models.Subquery(
Occurrence.objects.filter(base_filter)
.values("determination_id")
.annotate(max_score=models.Max("determination_score"))
.values("max_score")[:1],
output_field=models.FloatField(),
)
def annotate_taxon_counts(
self,
qs: QuerySet,
occurrence_filters: models.Q,
default_filters_q: models.Q,
*,
restrict_to_observed: bool,
) -> QuerySet:
"""Centralised per-(project, taxon) count annotations for the taxa endpoint.

Every count is derived from one filtered occurrence set
(``occurrence_filters`` + the project default filters) by precomputing
``{taxon_id: value}`` maps in Python and applying them as constant-time ``CASE``
annotations (:meth:`_case_from_map`), rather than per-taxon correlated subqueries.
The subquery form does not scale once ``occurrence_filters`` joins detections
(``?collection=<id>``): each annotation degrades to a per-row scan — 25x on the
page, and once per taxon in the unbounded pagination ``COUNT``.

Two count shapes share the same base queryset:

- Direct aggregates (``occurrences_count``, ``best_determination_score``,
``last_detected``), keyed by the occurrence's own determination — one GROUP BY.
``Count(distinct)`` dedupes the detections-join fan-out under ``?collection=``.
- ``verified_count`` / ``agreed_*`` roll up to ancestors via ``parents_json`` over
the (sparse) verified subset — see :meth:`_annotate_verification_counts`.

The determination ids present in the filtered set are exactly the observed taxa,
so ``restrict_to_observed`` reuses them instead of a separate membership query.
"""
base = Occurrence.objects.filter(occurrence_filters).filter(default_filters_q)

occurrences_count: dict[int, int] = {}
best_score: dict[int, float | None] = {}
last_detected: dict[int, object] = {}
for row in base.values("determination_id").annotate(
_n=models.Count("id", distinct=True),
_score=models.Max("determination_score"),
_last=models.Max("detections__timestamp"),
):
determination_id = row["determination_id"]
if determination_id is None:
continue
occurrences_count[determination_id] = row["_n"]
best_score[determination_id] = row["_score"]
last_detected[determination_id] = row["_last"]

# Get last detected timestamp - requires join with detections
last_detected_subquery = models.Subquery(
Occurrence.objects.filter(
base_filter,
detections__timestamp__isnull=False,
)
.values("determination_id")
.annotate(last_detected=models.Max("detections__timestamp"))
.values("last_detected")[:1],
output_field=models.DateTimeField(),
qs = qs.annotate(
occurrences_count=self._case_from_map(occurrences_count, 0, models.IntegerField()),
best_determination_score=self._case_from_map(best_score, None, models.FloatField()),
last_detected=self._case_from_map(last_detected, None, models.DateTimeField()),
)

# Apply annotations
if restrict_to_observed:
qs = qs.filter(id__in=list(occurrences_count.keys()))

return self._annotate_verification_counts(qs, base)

def _annotate_verification_counts(self, qs: QuerySet, base: QuerySet) -> QuerySet:
"""
Annotate per-taxon verification / human-model agreement counts, and apply the
``verified=true|false`` filter on list responses.

``base`` is the shared filtered occurrence set from :meth:`annotate_taxon_counts`.
Counts roll up descendant occurrences (verifying a species also counts toward its
genus/family rows). They only concern *verified* occurrences (those with a
non-withdrawn Identification), which are sparse, so the hierarchical rollup is a
single Python pass over that small subset applied as constant-time ``CASE``
annotations. A correlated ``parents_json`` subquery per taxon does not scale: the
GIN index can't serve a containment whose RHS is an ``OuterRef``.
"""
include_agreement = self._include_agreement()

# The chosen (best, non-withdrawn) identification's agreed_with_prediction FK.
best_identification_agreed_prediction = models.Subquery(
Identification.objects.filter(occurrence=models.OuterRef("pk"), withdrawn=False)
.order_by(*BEST_IDENTIFICATION_ORDER)
.values("agreed_with_prediction_id")[:1]
)
verified_occurrences = base.filter(
models.Exists(Identification.objects.filter(occurrence=models.OuterRef("pk"), withdrawn=False))
).annotate(_agreed_prediction_id=best_identification_agreed_prediction)
# ``pk`` is selected only so ``.distinct()`` below dedupes by occurrence: when
# occurrence_filters joins to detections (e.g. ?collection=<id>), one Occurrence
# yields a row per matching Detection, which would otherwise inflate the counts.
value_fields = ["pk", "determination_id", "determination__parents_json", "_agreed_prediction_id"]
if include_agreement:
# Top machine prediction's taxon for the same occurrence.
verified_occurrences = verified_occurrences.annotate(
_best_machine_taxon_id=models.Subquery(
Classification.objects.filter(detection__occurrence=models.OuterRef("pk"))
.order_by(*BEST_MACHINE_PREDICTION_ORDER)
.values("taxon_id")[:1]
)
)
value_fields.append("_best_machine_taxon_id")

verified_counts: dict[int, int] = {}
agreed_with_prediction_counts: dict[int, int] = {}
agreed_exact_counts: dict[int, int] = {}
for row in verified_occurrences.values(*value_fields).distinct():
determination_id = row["determination_id"]
# The taxon itself plus every ancestor — i.e. every row this occurrence rolls up to.
taxon_ids: set[int] = set()
if determination_id is not None:
taxon_ids.add(determination_id)
for parent in row["determination__parents_json"] or []:
# parents_json round-trips through the pydantic schema field, so elements
# may be dicts or ``TaxonParent`` objects depending on the query path.
parent_id = parent.get("id") if isinstance(parent, dict) else getattr(parent, "id", None)
if parent_id is not None:
taxon_ids.add(int(parent_id))

for taxon_id in taxon_ids:
verified_counts[taxon_id] = verified_counts.get(taxon_id, 0) + 1
if row["_agreed_prediction_id"] is not None:
for taxon_id in taxon_ids:
agreed_with_prediction_counts[taxon_id] = agreed_with_prediction_counts.get(taxon_id, 0) + 1
if (
include_agreement
and determination_id is not None
and determination_id == row["_best_machine_taxon_id"]
):
for taxon_id in taxon_ids:
agreed_exact_counts[taxon_id] = agreed_exact_counts.get(taxon_id, 0) + 1

int_field = models.IntegerField()
qs = qs.annotate(
occurrences_count=Coalesce(occurrences_count_subquery, 0),
best_determination_score=best_score_subquery,
last_detected=last_detected_subquery,
verified_count=self._case_from_map(verified_counts, 0, int_field),
agreed_with_prediction_count=self._case_from_map(agreed_with_prediction_counts, 0, int_field),
)
if include_agreement:
qs = qs.annotate(agreed_exact_count=self._case_from_map(agreed_exact_counts, 0, int_field))

if not include_unobserved:
# Efficient EXISTS check that uses the composite index
qs = qs.filter(models.Exists(Occurrence.objects.filter(base_filter)))
# verified=true|false filter (list only); verified=false is the strict complement.
if self.action == "list" and "verified" in self.request.query_params:
verified = BooleanField(required=False).clean(self.request.query_params.get("verified"))
verified_taxon_ids = list(verified_counts.keys())
if verified:
qs = qs.filter(id__in=verified_taxon_ids)
else:
qs = qs.exclude(id__in=verified_taxon_ids)

return qs

Expand Down
36 changes: 36 additions & 0 deletions ami/main/migrations/0087_taxon_parents_json_gin_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from django.db import migrations


class Migration(migrations.Migration):
"""
GIN index on Taxon.parents_json to support the hierarchical (descendant) taxon
filters that issue a literal `parents_json @> [{"id": <id>}]` containment: the
occurrence-list `taxon=<id>` filter (CustomOccurrenceDeterminationFilter) and the
project default-taxa filter (build_occurrence_default_filters_q). The index applies
to these because the right-hand side is a constant.

Note it does NOT back the #1316 per-taxon verification / agreement rollup: that is
computed in a single Python pass over the (sparse) verified-occurrence set rather
than a correlated subquery, because a containment whose RHS is an OuterRef can't use
the index. See TaxonViewSet._annotate_verification_counts.

CREATE INDEX CONCURRENTLY can't run inside a transaction, so this migration is
non-atomic. IF NOT EXISTS keeps it safe to co-exist with the same index if it lands
separately via the #1307 follow-up.
"""

atomic = False

dependencies = [
("main", "0086_sourceimage_recent_capture_index"),
]

operations = [
migrations.RunSQL(
sql=(
"CREATE INDEX CONCURRENTLY IF NOT EXISTS main_taxon_parents_json_gin_idx "
"ON main_taxon USING gin (parents_json jsonb_path_ops);"
),
reverse_sql="DROP INDEX CONCURRENTLY IF EXISTS main_taxon_parents_json_gin_idx;",
),
]
12 changes: 12 additions & 0 deletions ami/main/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3817,6 +3817,18 @@ def best_determination_score(self) -> float | None:
# This is handled by an annotation if we are filtering by project, deployment or event
return None

def verified_count(self) -> int | None:
# Handled by an annotation when filtering by project (TaxonViewSet.annotate_taxon_counts)
return None

def agreed_with_prediction_count(self) -> int | None:
# Handled by an annotation when filtering by project (TaxonViewSet.annotate_taxon_counts)
return None

def agreed_exact_count(self) -> int | None:
# Handled by an annotation only when with_agreement is requested or on the detail view
return None

def occurrence_images(self, limit: int | None = 10) -> list[str]:
# This is handled by an annotation if we are filtering by project, deployment or event
return []
Expand Down
Loading
Loading