diff --git a/ami/main/api/serializers.py b/ami/main/api/serializers.py index 5d73b4823..64c721cbd 100644 --- a/ami/main/api/serializers.py +++ b/ami/main/api/serializers.py @@ -1098,6 +1098,8 @@ class SourceImageListSerializer(DefaultSerializer): deployment = DeploymentNestedSerializer(read_only=True) event = EventNestedSerializer(read_only=True) project = serializers.PrimaryKeyRelatedField(queryset=Project.objects.all(), required=False) + # Annotated in SourceImageViewSet.get_queryset (latest detection created_at). + last_processed = serializers.DateTimeField(read_only=True) # file = serializers.ImageField(allow_empty_file=False, use_url=True) class Meta: @@ -1118,6 +1120,7 @@ class Meta: "detections_count", "occurrences_count", "taxa_count", + "last_processed", "detections", "project", ] diff --git a/ami/main/api/views.py b/ami/main/api/views.py index 488a31dad..1bfa55d74 100644 --- a/ami/main/api/views.py +++ b/ami/main/api/views.py @@ -315,6 +315,7 @@ def get_queryset(self) -> QuerySet: project = self.get_active_project() if project: qs = qs.filter(project=project) + num_example_captures = 10 if self.action == "retrieve": qs = qs.prefetch_related( @@ -561,6 +562,7 @@ class SourceImageViewSet(DefaultViewSet, ProjectMixin): "deployment__name", "event__start", "path", + "last_processed", ] permission_classes = [ObjectPermission] @@ -597,13 +599,16 @@ def get_queryset(self) -> QuerySet: if self.action == "list": # It's cumbersome to override the default list view, so customize the queryset here + queryset = self.filter_by_processed(queryset) queryset = self.filter_by_has_detections(queryset) + queryset = self.annotate_last_processed(queryset) elif self.action == "retrieve": # For detail view, include storage info and additional prefetches with_counts_default = True queryset = queryset.prefetch_related("jobs", "collections") queryset = self.add_adjacent_captures(queryset) + queryset = self.annotate_last_processed(queryset) with_detections_default = True 
with_detections = self.request.query_params.get("with_detections", with_detections_default) @@ -627,15 +632,62 @@ def get_queryset(self) -> QuerySet: return queryset + def filter_by_processed(self, queryset: QuerySet) -> QuerySet: + """ + Filter by whether a capture has been processed by a detection pipeline. + + "Processed" means the capture has *any* Detection row, including the null + markers (``NULL_DETECTIONS_FILTER``) that record a "processed, found nothing" + result. This mirrors how the capture set list separates the processed count + from the (real) detections count. Use ``has_detections`` to filter on real + detections only. + + Reuses the ``with_was_processed`` queryset annotation so the "processed" + definition stays in one place. + """ + processed = self.request.query_params.get("processed") + if processed is not None: + processed = BooleanField(required=False).clean(processed) + queryset = queryset.with_was_processed().filter(was_processed=processed) + return queryset + def filter_by_has_detections(self, queryset: QuerySet) -> QuerySet: + """ + Filter by whether a capture has any *real* detections (a detection with a + bounding box). Null detection markers are excluded, so a capture that was + processed but yielded nothing returns ``has_detections=false``. Use the + ``processed`` param to filter on processing status regardless of findings. 
+ """ has_detections = self.request.query_params.get("has_detections") if has_detections is not None: has_detections = BooleanField(required=False).clean(has_detections) queryset = queryset.annotate( - has_detections=models.Exists(Detection.objects.filter(source_image=models.OuterRef("pk"))), + has_detections=models.Exists( + Detection.objects.filter(source_image=models.OuterRef("pk")).exclude(NULL_DETECTIONS_FILTER) + ), ).filter(has_detections=has_detections) return queryset + def annotate_last_processed(self, queryset: QuerySet) -> QuerySet: + """ + Annotate each capture with ``last_processed`` — the most recent detection + ``created_at`` for that capture, i.e. when it was last run through a + detection pipeline. Null when the capture has never been processed; + NullsLastOrderingFilter sorts those last. + + A correlated subquery (rather than a join + Max) keeps the row count stable + for pagination. The supporting index on Detection(source_image, -created_at) + makes the per-row lookup an index scan, so this stays cheap without + denormalizing a timestamp onto SourceImage. 
+ """ + return queryset.annotate( + last_processed=models.Subquery( + Detection.objects.filter(source_image=models.OuterRef("pk")) + .order_by("-created_at") + .values("created_at")[:1] + ) + ) + def prefetch_detections(self, queryset: QuerySet, project: Project | None = None) -> QuerySet: """ Return all detections for source images, but only include occurrence data diff --git a/ami/main/migrations/0088_detection_det_srcimg_created_idx.py b/ami/main/migrations/0088_detection_det_srcimg_created_idx.py new file mode 100644 index 000000000..185d5621a --- /dev/null +++ b/ami/main/migrations/0088_detection_det_srcimg_created_idx.py @@ -0,0 +1,16 @@ +# Generated by Django 4.2.10 on 2026-05-29 12:14 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("main", "0087_taxon_parents_json_gin_index"), + ] + + operations = [ + migrations.AddIndex( + model_name="detection", + index=models.Index(fields=["source_image", "-created_at"], name="det_srcimg_created_idx"), + ), + ] diff --git a/ami/main/models.py b/ami/main/models.py index 1de9a01ee..04210cbe6 100644 --- a/ami/main/models.py +++ b/ami/main/models.py @@ -2852,6 +2852,11 @@ class Meta: "frame_num", "timestamp", ] + indexes = [ + # Supports the "last processed" subquery on the captures list: the + # latest detection created_at per source image (index scan, top 1). + models.Index(fields=["source_image", "-created_at"], name="det_srcimg_created_idx"), + ] def best_classification(self): # @TODO where is this used? 
diff --git a/ami/main/tests.py b/ami/main/tests.py index 58277d186..5292a5f07 100644 --- a/ami/main/tests.py +++ b/ami/main/tests.py @@ -38,7 +38,13 @@ from ami.ml.models.pipeline import Pipeline from ami.ml.models.processing_service import ProcessingService from ami.ml.models.project_pipeline_config import ProjectPipelineConfig -from ami.tests.fixtures.main import create_captures, create_occurrences, create_taxa, setup_test_project +from ami.tests.fixtures.main import ( + create_captures, + create_detections, + create_occurrences, + create_taxa, + setup_test_project, +) from ami.tests.fixtures.storage import populate_bucket from ami.users.models import User from ami.users.roles import BasicMember, Identifier, MLDataManager, ProjectManager, create_roles_for_project @@ -1390,6 +1396,89 @@ def test_unrelated_list_endpoints_still_work_without_project_id(self): self.assertEqual(response.status_code, status.HTTP_200_OK, path) +class TestCapturesProcessedFilter(APITestCase): + """ + The captures list distinguishes two related filters: + + - ``?processed=true|false`` (the UI "Processing status" filter): a capture is + "processed" when it has *any* Detection row, including the null markers that + record a "processed, found nothing" result. + - ``?has_detections=true|false``: a capture has *real* detections (a detection + with a bounding box). Null markers are excluded. + + Fixture: 4 captures — 2 with a real detection, 1 with only a null marker + (processed but found nothing), 1 untouched. So: + processed=true -> 3 has_detections=true -> 2 + processed=false -> 1 has_detections=false -> 2 + """ + + def setUp(self) -> None: + self.project, self.deployment = setup_test_project(reuse=False) + self.captures = create_captures(self.deployment, num_nights=1, images_per_night=4) + # Two captures get a real detection (bounding box present). 
+ for capture in self.captures[:2]: + create_detections(capture, bboxes=[(0.1, 0.1, 0.2, 0.2)]) + # One capture gets only a null marker: processed, but nothing found. + Detection.objects.create( + source_image=self.captures[2], + bbox=None, + timestamp=self.captures[2].timestamp, + ) + # self.captures[3] is left untouched (never processed). + self.user = User.objects.create_user(email="proc-filter@insectai.org", is_staff=True) # type: ignore + self.client.force_authenticate(user=self.user) + self.list_url = f"/api/v2/captures/?project_id={self.project.pk}" + return super().setUp() + + def _count(self, query: str = "") -> int: + response = self.client.get(f"{self.list_url}{query}") + self.assertEqual(response.status_code, status.HTTP_200_OK) + return response.json()["count"] + + def test_processed_counts_null_markers(self): + # The null-marker capture counts as processed (2 real + 1 marker); its + # complement is the single untouched capture. + self.assertEqual(self._count("&processed=true"), 3) + self.assertEqual(self._count("&processed=false"), 1) + + def test_has_detections_excludes_null_markers(self): + # Only the 2 real-detection captures; the processed-but-empty capture + # falls on the has_detections=false side. + self.assertEqual(self._count("&has_detections=true"), 2) + self.assertEqual(self._count("&has_detections=false"), 2) + + +class TestCapturesLastProcessed(APITestCase): + """ + The captures list annotates and can order by ``last_processed`` — the most + recent detection created_at for each capture. Captures that were never + processed expose ``last_processed = None``. + """ + + def setUp(self) -> None: + self.project, self.deployment = setup_test_project(reuse=False) + self.captures = create_captures(self.deployment, num_nights=1, images_per_night=2) + # First capture is processed (has a detection); the second is left untouched. 
+ create_detections(self.captures[0], bboxes=[(0.1, 0.1, 0.2, 0.2)]) + self.user = User.objects.create_user(email="cap-lastproc@insectai.org", is_staff=True) # type: ignore + self.client.force_authenticate(user=self.user) + self.url = f"/api/v2/captures/?project_id={self.project.pk}" + return super().setUp() + + def _row(self, data: dict, capture_id: int) -> dict: + return next(c for c in data["results"] if c["id"] == capture_id) + + def test_last_processed_annotated_and_orderable(self): + # One request exercises the annotation, the serializer field, and the + # ordering registration together. + response = self.client.get(f"{self.url}&ordering=-last_processed") + self.assertEqual(response.status_code, status.HTTP_200_OK) + data = response.json() + # Processed capture has a timestamp; the untouched one is null. + self.assertIsNotNone(self._row(data, self.captures[0].pk)["last_processed"]) + self.assertIsNone(self._row(data, self.captures[1].pk)["last_processed"]) + + class TestProjectOwnerAutoAssignment(APITestCase): def setUp(self) -> None: self.user_1 = User.objects.create_user(email="testuser@insectai.org", is_staff=True, is_superuser=True) diff --git a/docs/claude/planning/2026-05-28-captures-processed-filter-design.md b/docs/claude/planning/2026-05-28-captures-processed-filter-design.md new file mode 100644 index 000000000..d6fbc5b82 --- /dev/null +++ b/docs/claude/planning/2026-05-28-captures-processed-filter-design.md @@ -0,0 +1,80 @@ +# Captures list — "Processed / Not processed" filter + +Date: 2026-05-28 +Status: design approved, pending spec review +Scope: first of several planned captures-list filters; this PR ships the processed filter only. + +## Goal + +Add a "Processing status" filter to the Captures (SourceImage) list view, letting users +narrow to captures that have been processed, not processed, or all (no filter). Lay the +groundwork (a planned filter set) for additional filters in later PRs. 
+ +"Processed" = the image has been run through detection. Because PR #1093 writes a null +Detection marker for the "processed, found nothing" case, the presence of *any* Detection +row is an accurate signal of "was processed." + +## Backend — no change required + +The filter already exists and is exercised by the list endpoint: + +- `ami/main/api/views.py:630-636` — `SourceImageViewSet.filter_by_has_detections` + handles `?has_detections=true|false` by annotating + `Exists(Detection.objects.filter(source_image=OuterRef("pk")))` and filtering on it. + (`SourceImageViewSet` at `views.py:528`.) +- Called from `get_queryset` only for the `list` action (`views.py:600`), which is what + the captures list uses. + +Decision: reuse the existing `has_detections` query param. Zero backend change, already +tested behavior. The param name (`has_detections`) means "was processed" because of the +null-marker convention; we surface it to users with the label "Processing status" and keep +`has_detections` as the internal query key. This name/meaning gap is the one known wart and +is documented here rather than fixed (a `was_processed` alias was considered and rejected to +avoid extra surface area). + +## Frontend — four wiring changes + +1. **New component** `ui/src/components/filtering/filters/processing-status-filter.tsx`. + Model on `verification-status-filter.tsx`. Two options: "Processed" (true) / + "Not processed" (false). Wire `onValueChange={onAdd}` directly so both true and false + are settable. (The generic `BooleanFilter` is unusable here: its "No" branch calls + `onClear()` instead of filtering to false — see `boolean-filter.tsx:21-27`.) + Use a translated label string for the two options (add to `utils/language` if needed). + +2. **Register the component** in `ui/src/components/filtering/filter-control.tsx` + `ComponentMap`: `has_detections: ProcessingStatusFilter`. + +3. 
**Register the filter** in `ui/src/utils/useFilters.ts` `AVAILABLE_FILTERS`: + `{ label: 'Processing status', field: 'has_detections', tooltip: { text: ... } }`. + +4. **Render it** on the captures page `ui/src/pages/captures/captures.tsx` (inside the + existing `FilterSection`, alongside `deployment` and `collections`): + `<FilterControl field="has_detections" />`. + +State, URL params, page reset, and the clear-X ("All") behavior all come from the existing +`useFilters` machinery — no changes there. + +## Data flow + +UI select -> `addFilter('has_detections', 'true'|'false')` -> URL search param -> +`useFilters` -> `useCaptures` builds `?has_detections=...` via `getFetchUrl` +(`ui/src/data-services/utils.ts`) -> DRF `filter_by_has_detections` -> filtered queryset. +Clear-X removes the param -> "All". + +## Testing + +- Backend: verify existing coverage for `?has_detections=true|false` on the captures list + endpoint; add a test if missing (both branches + absent param). +- Frontend: manual verification against the running stack — select Processed, Not processed, + and clear; confirm result counts change and the URL param round-trips. + +## Out of scope (planned follow-up PRs) + +To live in a collapsible "Advanced" `FilterSection` on the captures page later: + +- **Date range** — `date_start`/`date_end` already in the FE registry with a `DateFilter` + component, but the SourceImage viewset needs backend support mapping them to a `timestamp` + range (new work). +- **Station** — already available via the existing `deployment` filter. +- **Site** — add `deployment__research_site` to `filterset_fields` + a Site filter component. +- **Device** — add `deployment__device` to `filterset_fields` + a Device filter component. 
diff --git a/docs/claude/planning/2026-05-28-captures-processed-filter-plan.md b/docs/claude/planning/2026-05-28-captures-processed-filter-plan.md new file mode 100644 index 000000000..1ca53c38a --- /dev/null +++ b/docs/claude/planning/2026-05-28-captures-processed-filter-plan.md @@ -0,0 +1,299 @@ +# Captures "Processed / Not processed" Filter — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add a "Processing status" filter (Processed / Not processed / All) to the Captures list view, reusing the existing `has_detections` backend query param. + +**Architecture:** Backend already filters captures by `?has_detections=true|false` via `Exists(Detection)` (null Detection markers make "has any detection" == "was processed"). No backend logic change; add a regression test only. Frontend adds a 3-state select component (modeled on the verified filter) and wires it through the existing filter registry/URL-param machinery. + +**Tech Stack:** Django 4.2 + DRF (backend), React 18 + TypeScript + nova-ui-kit Select (frontend). + +Design spec: `docs/claude/planning/2026-05-28-captures-processed-filter-design.md` + +--- + +## Task 1: Backend regression test for `has_detections` filter + +No existing test covers `?has_detections=` on the captures list. Add one to lock in the behavior we depend on. This is the only backend change. + +**Files:** +- Test: `ami/main/tests.py` (add a new `APITestCase` class near the other captures/list tests, e.g. after `TestProjectRequiredOnListEndpoints` ~line 1392) + +- [ ] **Step 1: Write the failing test** + +Add this class to `ami/main/tests.py`. 
The fixtures `setup_test_project`, `create_captures`, and `create_detections` are already imported / available (`create_detections` lives in `ami.tests.fixtures.main` — add it to the existing import on line 41 if not present). + +Update the import line 41 to include `create_detections`: + +```python +from ami.tests.fixtures.main import ( + create_captures, + create_detections, + create_occurrences, + create_taxa, + setup_test_project, +) +``` + +Then add the test class: + +```python +class TestCapturesProcessedFilter(APITestCase): + """ + The captures list supports ?has_detections=true|false, which the UI surfaces + as the "Processing status" filter. A capture is "processed" when it has any + Detection row (including null markers for "processed, found nothing"). + """ + + def setUp(self) -> None: + self.project, self.deployment = setup_test_project(reuse=False) + self.captures = create_captures(self.deployment, num_nights=1, images_per_night=4) + # Mark the first two captures as processed by giving them a detection. 
+ for capture in self.captures[:2]: + create_detections(capture, bboxes=[(0.1, 0.1, 0.2, 0.2)]) + self.user = User.objects.create_user(email="proc-filter@insectai.org", is_staff=True) # type: ignore + self.client.force_authenticate(user=self.user) + self.list_url = f"/api/v2/captures/?project_id={self.project.pk}" + return super().setUp() + + def test_no_filter_returns_all_captures(self): + response = self.client.get(self.list_url) + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.json()["count"], 4) + + def test_has_detections_true_returns_only_processed(self): + response = self.client.get(f"{self.list_url}&has_detections=true") + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.json()["count"], 2) + + def test_has_detections_false_returns_only_unprocessed(self): + response = self.client.get(f"{self.list_url}&has_detections=false") + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.json()["count"], 2) +``` + +- [ ] **Step 2: Run the test to verify it passes (it should — behavior already exists)** + +Run: +```bash +docker compose run --rm django python manage.py test ami.main.tests.TestCapturesProcessedFilter --keepdb -v 2 +``` +Expected: 3 tests PASS. (This is a characterization test for existing behavior; if `test_has_detections_false` fails returning 4 instead of 2, that means null-marker handling differs — stop and investigate before touching the UI.) 
+ +- [ ] **Step 3: Commit** + +```bash +git add ami/main/tests.py +git commit -m "test: cover has_detections filter on captures list endpoint + +Co-Authored-By: Claude " +``` + +--- + +## Task 2: Add label strings for the processing-status filter + +**Files:** +- Modify: `ui/src/utils/language.ts` (the `STRING` enum and the string map) + +- [ ] **Step 1: Add enum keys** + +In the `STRING` enum in `ui/src/utils/language.ts`, add two keys (place them alphabetically near `PROCESSING`/`NOT_VERIFIED` neighbors — exact position is cosmetic): + +```typescript + NOT_PROCESSED, + PROCESSED, +``` + +- [ ] **Step 2: Add the string values** + +In the string-map object (where entries like `[STRING.NOT_VERIFIED]: 'Not verified',` live), add: + +```typescript + [STRING.NOT_PROCESSED]: 'Not processed', + [STRING.PROCESSED]: 'Processed', +``` + +- [ ] **Step 3: Commit** + +```bash +git add ui/src/utils/language.ts +git commit -m "feat(ui): add Processed / Not processed label strings + +Co-Authored-By: Claude " +``` + +--- + +## Task 3: Create the `ProcessingStatusFilter` component + +A 3-state select: empty (= All, cleared via the FilterControl X button), true (Processed), false (Not processed). Modeled on `verification-status-filter.tsx`. Do NOT reuse `BooleanFilter` — its "No" branch calls `onClear()` and cannot filter to false (`boolean-filter.tsx:21-27`). 
+ +**Files:** +- Create: `ui/src/components/filtering/filters/processing-status-filter.tsx` + +- [ ] **Step 1: Write the component** + +```tsx +import { Select } from 'nova-ui-kit' +import { STRING, translate } from 'utils/language' +import { booleanToString, stringToBoolean } from '../utils' +import { FilterProps } from './types' + +export const ProcessingStatusFilter = ({ value: string, onAdd }: FilterProps) => { + const value = stringToBoolean(string) + const options = [ + { value: true, label: translate(STRING.PROCESSED) }, + { value: false, label: translate(STRING.NOT_PROCESSED) }, + ] + + return ( + + + + + + {options.map((option) => ( + + {option.label} + + ))} + + + ) +} +``` + +- [ ] **Step 2: Commit** + +```bash +git add ui/src/components/filtering/filters/processing-status-filter.tsx +git commit -m "feat(ui): add ProcessingStatusFilter select component + +Co-Authored-By: Claude " +``` + +--- + +## Task 4: Register the component and the filter field + +**Files:** +- Modify: `ui/src/components/filtering/filter-control.tsx` (import + `ComponentMap`) +- Modify: `ui/src/utils/useFilters.ts` (`AVAILABLE_FILTERS`) + +- [ ] **Step 1: Import the component in filter-control.tsx** + +Add near the other filter imports (after the `PipelineFilter` import, ~line 10): + +```typescript +import { ProcessingStatusFilter } from './filters/processing-status-filter' +``` + +- [ ] **Step 2: Register in `ComponentMap`** + +In `ui/src/components/filtering/filter-control.tsx`, add to the `ComponentMap` object (keep keys alphabetical-ish; place after `pipeline:`): + +```typescript + has_detections: ProcessingStatusFilter, +``` + +- [ ] **Step 3: Register the filter field in `AVAILABLE_FILTERS`** + +In `ui/src/utils/useFilters.ts`, add an entry to the array returned by `AVAILABLE_FILTERS` (e.g. 
after the `event` entry, ~line 138): + +```typescript + { + label: 'Processing status', + field: 'has_detections', + tooltip: { + text: 'Filter captures by whether they have been processed by a detection pipeline.', + }, + }, +``` + +- [ ] **Step 4: Commit** + +```bash +git add ui/src/components/filtering/filter-control.tsx ui/src/utils/useFilters.ts +git commit -m "feat(ui): register processing-status filter (has_detections) + +Co-Authored-By: Claude " +``` + +--- + +## Task 5: Render the filter on the captures page + +**Files:** +- Modify: `ui/src/pages/captures/captures.tsx` (the `FilterSection`, ~lines 65-68) + +- [ ] **Step 1: Add the FilterControl** + +In `ui/src/pages/captures/captures.tsx`, inside the existing `<FilterSection>`, add the new control below `collections`: + +```tsx +<FilterSection> + <FilterControl field="deployment" /> + <FilterControl field="collections" /> + <FilterControl field="has_detections" /> +</FilterSection> +``` + +- [ ] **Step 2: Commit** + +```bash +git add ui/src/pages/captures/captures.tsx +git commit -m "feat(ui): show processing-status filter on captures list + +Co-Authored-By: Claude " +``` + +--- + +## Task 6: Type-check, lint, and manual verification + +**Files:** none (verification only) + +- [ ] **Step 1: TypeScript type check** + +Run: +```bash +cd ui && yarn tsc --noEmit +``` +Expected: no errors. (If `yarn tsc` is not a script, use `npx tsc --noEmit`.) + +- [ ] **Step 2: Lint + format the touched files** + +Run: +```bash +cd ui && yarn lint && yarn format +``` +Expected: clean (or auto-fixed). Re-commit if format changed anything. + +- [ ] **Step 3: Manual verification against the running stack** + +Start the stack (`docker compose up -d` from repo root; for worktree code-only changes use the bind-mount Option A in CLAUDE.md if testing against the main stack). Then in the UI at `http://localhost:4000`, open a project's Captures list and: + - Select **Processed** → URL gains `?has_detections=true`, result count drops to processed captures only. + - Select **Not processed** → `has_detections=false`, count shows unprocessed only. 
+ - Click the **X** clear button → param removed, all captures return. + - Confirm the page resets to page 1 when the filter changes (handled by `useFilters.addFilter`). + +- [ ] **Step 4: Final commit (only if lint/format changed files)** + +```bash +git add -A +git commit -m "chore(ui): lint/format for processing-status filter + +Co-Authored-By: Claude " +``` + +--- + +## Self-Review notes + +- **Spec coverage:** backend reuse (Task 1 verifies), new component (Task 3), registry wiring (Task 4), page render (Task 5), label strings (Task 2), testing (Tasks 1 + 6). All spec sections covered. +- **Type consistency:** component named `ProcessingStatusFilter` in Tasks 3 and 4; query field `has_detections` in Tasks 1, 4, 5; STRING keys `PROCESSED` / `NOT_PROCESSED` in Tasks 2 and 3. +- **Out of scope (later PRs):** date range, site, device filters — see design doc. diff --git a/docs/claude/reference/captures-processed-count-strategies.md b/docs/claude/reference/captures-processed-count-strategies.md new file mode 100644 index 000000000..85cfe4e35 --- /dev/null +++ b/docs/claude/reference/captures-processed-count-strategies.md @@ -0,0 +1,132 @@ +# Captures list: `processed` / `has_detections` COUNT strategies + +**Created:** 2026-05-29 (PR #1326). **Status:** reference — records a strategy that +was prototyped, benchmarked, and deliberately *not* shipped. + +## Context + +The captures list (`SourceImageViewSet`, `ami/main/api/views.py`) supports two +existence filters: + +- `?processed=true|false` — capture has *any* `Detection` row, including the null + markers (`NULL_DETECTIONS_FILTER = Q(bbox__isnull=True) | Q(bbox=[])`) that record + a "processed, found nothing" result. +- `?has_detections=true|false` — capture has a *real* detection (bounding box + present). Null markers excluded. + +Both translate to an `EXISTS` / `NOT EXISTS` subquery against `main_detection`. 
The +**page of rows** is cheap (the `LIMIT` prunes early), but the **pagination COUNT** +has no `LIMIT`, so `NOT EXISTS` becomes an anti-join over the whole source-image +table. + +## What shipped (PR #1326) + +- `?processed=` / `?has_detections=` filters. +- Sortable `last_processed` column (correlated subquery: most recent detection + `created_at`). +- Index `det_srcimg_created_idx` on `Detection(source_image, -created_at)` + (migration `0088`) — supports the `last_processed` **sort**. +- The pagination COUNT uses the **default DRF count** (the plain anti-join). No + custom count strategy. + +## The strategy that was NOT shipped: count by subtraction + +Prototype (reverted): a `SourceImagePagination` whose `get_count` computed the +existence-filter count without the anti-join: + +``` +total = COUNT(*) over the project/event/deployment/collection-scoped captures +processed_count = COUNT(DISTINCT source_image_id) off main_detection, scoped to the same captures + (has_detections: also .exclude(NULL_DETECTIONS_FILTER)) + +processed=true -> processed_count +processed=false -> total - processed_count +``` + +Both counts are exact. The cost scales with the number of *detection* rows rather +than the processed/unprocessed ratio, so it is symmetric (fast in both directions). +Implementation notes if it is ever revived: + +- DRF sets `paginator.request` *after* `get_count()` runs, so `paginate_queryset` + must stash `request` + `view` first. +- The base queryset (scoped, but *without* the processed/has_detections predicate) + was rebuilt in the view by applying only `DjangoFilterBackend` — *not* the + ordering backend, which would reference the absent `last_processed` annotation. +- The detection-side count used `Detection.objects.filter(source_image__in=base.values("pk"))`. + This is an `IN (subquery)` semi-join; its plan is less predictable than a direct + `source_image__project_id=` join (see cold-spike below). 
+ +## Why it was reverted + +The original justification was a **12.8s** COUNT for `processed=false` on the +929k-capture project. Deploy-time benchmarking on the Serbia dev box (hardware +comparable to production) showed that number does not reproduce there. + +### Benchmarks + +Local dev box (cold, low RAM, 8 GB source-image table not cached) — the numbers +that originally motivated subtraction: + +| project | filter | default anti-join | subtraction | +|---|---|---|---| +| 18 / 929k (local) | processed=false | **12.8s** | ~1.7s | +| 18 / 929k (local) | processed=true | 4.8s | ~1.7s | +| 18 / 929k (local) | has_detections=false | 11.5s | ~1.9s | +| 18 / 929k (local) | has_detections=true | 3.5s | 0.2s | + +Serbia dev box (cold), real data — the numbers that changed the decision: + +| project | filter | default anti-join | subtraction | +|---|---|---|---| +| 18 / 929k | processed=false | **1.38s** | 0.58s | +| 18 / 929k | processed=true | 1.52s | 0.58s | +| 20 / 105k | processed=true | 0.44s | **7.71s cold** / 0.01s warm | +| 20 / 105k | processed=false | 0.27s | 0.04s | + +Counts matched exactly across both approaches (subtraction is correct): +project 18 → 17938 / 910996; project 20 → 8517 / 96574 (processed), +8476 / 96615 (has_detections). + +### Findings + +1. **The 12.8s was environment-dependent, not algorithmic.** `EXPLAIN (ANALYZE)` + for `processed=false` on project 18 (Serbia): + + ``` + Finalize Aggregate (actual time=1541..1567) + -> Parallel Hash Right Anti Join (rows=303665) + -> Parallel Seq Scan on main_detection (rows=455239) + -> Parallel Hash + -> Parallel Seq Scan on main_sourceimage (Filter: project_id = 18) + Execution Time: 1609 ms + ``` + + The anti-join seq-scans the wide source-image table. Serbia's RAM / OS cache / + parallel workers do it in ~1.6s; the local box did it in 12.8s cold. Serbia ≈ + production, so the real-world cost is far smaller than the local measurement. + +2. 
**`det_srcimg_created_idx` is not used by the COUNT** — the anti-join plan + ignores it. It only helps the `last_processed` sort. So the index already in the + PR does nothing for the count either way. + +3. **Subtraction has its own cold-plan risk.** On the *smaller* project 20 the + detection-side `IN (subquery)` distinct spiked to 7.71s on first disk touch + (cold seq scan of `main_detection`), settling to sub-second warm — *slower* than + the 0.44s default for that case. `EXPLAIN` (warm) = 634ms via a nested-loop + + pkey memoize + distinct. + +Net: subtraction is a modest, real win on the largest project (0.58 vs 1.38s) and +would protect a cold / memory-pressured environment, but it adds a custom paginator ++ base-queryset rebuild + a second query and an unpredictable cold-plan, for a +benefit that is small on production-class hardware. Not worth it for this PR. + +## General direction + +The durable fix for "COUNT is slow on huge filtered lists" is not per-filter +bespoke counting — it is an **estimated-count paginator** (ticket #1328): use the +PostgreSQL planner's row estimate (`EXPLAIN (FORMAT JSON)` → `Plan["Plan Rows"]`, +<15ms, ~3% accurate where it matters) with an exact-count fallback below a +threshold. That handles *any* filter, not just existence filters. Subtraction +(exact, existence-filters-only) remains a possible fast path to layer underneath it +if exactness is required. See also the annotation-strip count trick in +`ProjectPagination.get_count` and PR #1317. 
diff --git a/ui/src/components/filtering/filter-control.tsx b/ui/src/components/filtering/filter-control.tsx index 36bf832ae..0e0944919 100644 --- a/ui/src/components/filtering/filter-control.tsx +++ b/ui/src/components/filtering/filter-control.tsx @@ -16,6 +16,7 @@ import { TaxaListFilter } from './filters/taxa-list-filter' import { TaxonFilter } from './filters/taxon-filter' import { TypeFilter } from './filters/type-filter' import { FilterProps } from './filters/types' +import { ProcessingStatusFilter } from './filters/processing-status-filter' import { VerificationStatusFilter } from './filters/verification-status-filter' import { VerifiedByFilter } from './filters/verified-by-filter' @@ -30,6 +31,7 @@ const ComponentMap: { deployment: StationFilter, detections__source_image: ImageFilter, event: SessionFilter, + processed: ProcessingStatusFilter, include_unobserved: BooleanFilter, job_type_key: TypeFilter, not_algorithm: NotAlgorithmFilter, diff --git a/ui/src/components/filtering/filters/processing-status-filter.tsx b/ui/src/components/filtering/filters/processing-status-filter.tsx new file mode 100644 index 000000000..edfe33af9 --- /dev/null +++ b/ui/src/components/filtering/filters/processing-status-filter.tsx @@ -0,0 +1,30 @@ +import { Select } from 'nova-ui-kit' +import { STRING, translate } from 'utils/language' +import { booleanToString, stringToBoolean } from '../utils' +import { FilterProps } from './types' + +export const ProcessingStatusFilter = ({ value, onAdd }: FilterProps) => { + const booleanValue = stringToBoolean(value) + const options = [ + { value: true, label: translate(STRING.PROCESSED) }, + { value: false, label: translate(STRING.NOT_PROCESSED) }, + ] + + return ( + + + + + + {options.map((option) => ( + + {option.label} + + ))} + + + ) +} diff --git a/ui/src/data-services/models/capture.ts b/ui/src/data-services/models/capture.ts index c2f80bdec..054c6b1e7 100644 --- a/ui/src/data-services/models/capture.ts +++ 
b/ui/src/data-services/models/capture.ts @@ -96,6 +96,12 @@ export class Capture { }) } + get lastProcessed(): Date | undefined { + return this._capture.last_processed + ? new Date(this._capture.last_processed) + : undefined + } + get deploymentId(): string | undefined { return this._capture.deployment ? `${this._capture.deployment.id}` diff --git a/ui/src/pages/captures/capture-columns.tsx b/ui/src/pages/captures/capture-columns.tsx index ca27ef4d1..efb60be38 100644 --- a/ui/src/pages/captures/capture-columns.tsx +++ b/ui/src/pages/captures/capture-columns.tsx @@ -3,6 +3,7 @@ import { Capture } from 'data-services/models/capture' import { BasicTableCell, CellTheme, + DateTableCell, ImageCellTheme, ImageTableCell, TableColumn, @@ -151,6 +152,12 @@ export const columns = ({ sortField: 'path', renderCell: (item: Capture) => , }, + { + id: 'last-processed', + name: translate(STRING.FIELD_LABEL_LAST_PROCESSED), + sortField: 'last_processed', + renderCell: (item: Capture) => , + }, { id: 'occurrences', name: translate(STRING.FIELD_LABEL_OCCURRENCES), diff --git a/ui/src/pages/captures/captures.tsx b/ui/src/pages/captures/captures.tsx index 00f4b2558..71f2009dc 100644 --- a/ui/src/pages/captures/captures.tsx +++ b/ui/src/pages/captures/captures.tsx @@ -37,6 +37,7 @@ export const Captures = () => { dimensions: true, filename: false, path: false, + 'last-processed': true, }) const { selectedView, setSelectedView } = useSelectedView('table') const { filters } = useFilters() @@ -65,6 +66,7 @@ export const Captures = () => { +
diff --git a/ui/src/utils/language.ts b/ui/src/utils/language.ts index 5d46189aa..c07f6038c 100644 --- a/ui/src/utils/language.ts +++ b/ui/src/utils/language.ts @@ -115,6 +115,7 @@ export enum STRING { FIELD_LABEL_JOBS, FIELD_LABEL_KEY, FIELD_LABEL_LAST_DATE, + FIELD_LABEL_LAST_PROCESSED, FIELD_LABEL_LAST_SEEN, FIELD_LABEL_LAST_SYNCED, FIELD_LABEL_LATEST_JOB_STATUS, @@ -308,11 +309,13 @@ export enum STRING { MOST_OBSERVED_TAXA, NEW_ID, NOT_CONNECTED, + NOT_PROCESSED, NOT_VERIFIED, OR, OVERVIEW, PIPELINES, PROCESS, + PROCESSED, RECENT, REJECT_ID_SHORT, REJECT_ID, @@ -448,6 +451,7 @@ const ENGLISH_STRINGS: { [key in STRING]: string } = { [STRING.FIELD_LABEL_JOBS]: 'Jobs', [STRING.FIELD_LABEL_KEY]: 'Key', [STRING.FIELD_LABEL_LAST_DATE]: 'Last date', + [STRING.FIELD_LABEL_LAST_PROCESSED]: 'Last processed', [STRING.FIELD_LABEL_LAST_SEEN]: 'Last seen', [STRING.FIELD_LABEL_LAST_SYNCED]: 'Last synced with data source', [STRING.FIELD_LABEL_LATEST_JOB_STATUS]: 'Latest job status', @@ -702,10 +706,12 @@ const ENGLISH_STRINGS: { [key in STRING]: string } = { [STRING.MOST_OBSERVED_TAXA]: 'Most observed taxa', [STRING.NEW_ID]: 'New ID', [STRING.NOT_CONNECTED]: 'Not connected', + [STRING.NOT_PROCESSED]: 'Not processed', [STRING.NOT_VERIFIED]: 'Not verified', [STRING.OR]: 'Or', [STRING.OVERVIEW]: 'Overview', [STRING.PIPELINES]: 'Pipelines', + [STRING.PROCESSED]: 'Processed', [STRING.RECENT]: 'Recent', [STRING.REJECT_ID_SHORT]: 'Reject', [STRING.REJECT_ID]: 'Reject ID', diff --git a/ui/src/utils/useFilters.ts b/ui/src/utils/useFilters.ts index 656e5c6f0..b028a2592 100644 --- a/ui/src/utils/useFilters.ts +++ b/ui/src/utils/useFilters.ts @@ -136,6 +136,13 @@ export const AVAILABLE_FILTERS = (projectId: string): FilterConfig[] => [ }, }, }, + { + label: 'Processing status', + field: 'processed', + tooltip: { + text: 'Filter captures by whether they have been processed by a detection pipeline.', + }, + }, { label: 'Pipeline', field: 'pipeline',