diff --git a/ami/main/api/views.py b/ami/main/api/views.py index 5a6d5aece..d86962097 100644 --- a/ami/main/api/views.py +++ b/ami/main/api/views.py @@ -37,7 +37,6 @@ from ami.utils.storages import ConnectionTestResult from ..models import ( - NULL_DETECTIONS_FILTER, Classification, Deployment, Detection, @@ -581,7 +580,7 @@ def filter_by_has_detections(self, queryset: QuerySet) -> QuerySet: if has_detections is not None: has_detections = BooleanField(required=False).clean(has_detections) queryset = queryset.annotate( - has_detections=models.Exists(Detection.objects.filter(source_image=models.OuterRef("pk"))), + has_detections=models.Exists(Detection.objects.valid().filter(source_image=models.OuterRef("pk"))), ).filter(has_detections=has_detections) return queryset @@ -611,7 +610,7 @@ def prefetch_detections(self, queryset: QuerySet, project: Project | None = None score = get_default_classification_threshold(project, self.request) prefetch_queryset = ( - Detection.objects.exclude(NULL_DETECTIONS_FILTER) + Detection.objects.valid() .annotate( determination_score=models.Max("occurrence__detections__classifications__score"), # Store whether this occurrence should be included based on default filters @@ -910,7 +909,7 @@ class DetectionViewSet(DefaultViewSet, ProjectMixin): """ require_project_for_list = True # Unfiltered list scans are too expensive on this table - queryset = Detection.objects.exclude(NULL_DETECTIONS_FILTER).select_related("source_image", "detection_algorithm") + queryset = Detection.objects.valid().select_related("source_image", "detection_algorithm") serializer_class = DetectionSerializer filterset_fields = ["source_image", "detection_algorithm", "source_image__project"] ordering_fields = ["created_at", "updated_at", "detection_score", "timestamp"] diff --git a/ami/main/management/commands/cleanup_null_only_occurrences.py b/ami/main/management/commands/cleanup_null_only_occurrences.py new file mode 100644 index 000000000..a974c9fec --- /dev/null +++ b/ami/main/management/commands/cleanup_null_only_occurrences.py @@ -0,0 +1,86 @@ +""" +Delete phantom Occurrences and orphan null-marker Detections left by the Issue #1310 +field bug, on a per-project basis. + +The bug created two categories of rows that should never have been persisted: +- Occurrence rows with no real detections (or with determination=NULL), surfaced as + ghost rows in the API. +- Detection rows that mark a SourceImage as "processed" while no real detections + exist for it — these prevent filter_processed_images from re-yielding the image + on the next ML run. + +After cleanup, the source images become eligible for re-processing. + +Dry-run by default. Pass --commit to delete. +""" + +from django.core.management.base import BaseCommand, CommandError +from django.db import transaction +from django.db.models import Exists, OuterRef + +from ami.main.models import Detection, Occurrence, Project + + +class Command(BaseCommand): + help = "Delete phantom Occurrences and orphan null-marker Detections (Issue #1310)." + + def add_arguments(self, parser): + parser.add_argument( + "--project", + type=int, + required=True, + help="Project ID to clean up.", + ) + parser.add_argument( + "--commit", + action="store_true", + help="Actually delete. Defaults to dry-run.", + ) + + def handle(self, *args, **options): + project_id: int = options["project"] + commit: bool = options["commit"] + + try: + project = Project.objects.get(pk=project_id) + except Project.DoesNotExist as err: + raise CommandError(f"Project {project_id} does not exist") from err + + all_occs = Occurrence.objects.filter(project=project) + valid_occs = all_occs.valid() + phantom_occs = all_occs.exclude(pk__in=valid_occs.values("pk")) + + has_valid_detection = Detection.objects.valid().filter(source_image_id=OuterRef("source_image_id")) + orphan_null_markers = ( + Detection.objects.filter(source_image__project=project) + .null_markers() + .annotate(_has_valid=Exists(has_valid_detection)) + .filter(_has_valid=False) + ) + + phantom_count = phantom_occs.count() + null_count = orphan_null_markers.count() + + self.stdout.write(f"Project #{project.pk} ({project.name}):") + self.stdout.write(f" Phantom occurrences (no valid detection or null determination): {phantom_count}") + self.stdout.write(f" Orphan null-marker detections on images with no real detections: {null_count}") + + if phantom_count == 0 and null_count == 0: + self.stdout.write(self.style.SUCCESS("Nothing to clean up.")) + return + + if not commit: + self.stdout.write(self.style.WARNING("Dry run — pass --commit to delete.")) + return + + with transaction.atomic(): + orphan_null_markers.delete() + phantom_occs.delete() + + # Report the pre-calculated counts of the rows we targeted directly. The tuple from + # .delete() also counts cascade-deleted related rows (e.g. classifications under a + # phantom occurrence's detections), which would inflate the numbers and confuse the + # operator about what the command actually targeted. + self.stdout.write( + self.style.SUCCESS(f"Deleted {phantom_count} phantom occurrences and {null_count} orphan null markers.") + ) diff --git a/ami/main/models.py b/ami/main/models.py index b30b4e645..655ee60a0 100644 --- a/ami/main/models.py +++ b/ami/main/models.py @@ -98,6 +98,16 @@ class TaxonRank(OrderedEnum): NULL_DETECTIONS_FILTER = Q(bbox__isnull=True) | Q(bbox=[]) +def null_detections_q(prefix: str = "") -> Q: + """ + Return a Q expression matching null-marker Detection rows, optionally prefixed + for use across relations (e.g. null_detections_q("images__detections__") for an + aggregate filter on a parent table). For Detection queries directly, prefer + Detection.objects.null_markers() / .valid() instead. + """ + return Q(**{f"{prefix}bbox__isnull": True}) | Q(**{f"{prefix}bbox": []}) + + def get_media_url(path: str) -> str: """ If path is a full URL, return it as-is. @@ -814,7 +824,7 @@ def get_detections_count(self) -> int | None: was processed and no detections were found) to stay consistent with ``SourceImage.get_detections_count`` and ``Event.get_detections_count``. """ - qs = Detection.objects.filter(source_image__deployment=self).exclude(NULL_DETECTIONS_FILTER) + qs = Detection.objects.filter(source_image__deployment=self).valid() filter_q = build_occurrence_default_filters_q( project=self.project, request=None, @@ -1226,7 +1236,7 @@ def get_detections_count(self) -> int | None: Excludes null-bbox placeholder detections to stay consistent with ``SourceImage.get_detections_count`` and ``Deployment.get_detections_count``. """ - qs = Detection.objects.filter(source_image__event=self).exclude(NULL_DETECTIONS_FILTER) + qs = Detection.objects.filter(source_image__event=self).valid() filter_q = build_occurrence_default_filters_q( project=self.project, request=None, @@ -2034,7 +2044,7 @@ def get_detections_count(self) -> int: Excludes detections without bounding boxes — those are placeholder records indicating the image was successfully processed and no detections were found. """ - qs = self.detections.exclude(NULL_DETECTIONS_FILTER) + qs = self.detections.all().valid() project = self.project if not project: return qs.distinct().count() @@ -2240,7 +2250,7 @@ def update_detection_counts( if null_only: qs = qs.filter(detections_count__isnull=True) - detection_qs = Detection.objects.filter(source_image_id=models.OuterRef("pk")).exclude(NULL_DETECTIONS_FILTER) + detection_qs = Detection.objects.filter(source_image_id=models.OuterRef("pk")).valid() if project is not None: filter_q = build_occurrence_default_filters_q( project=project, @@ -2718,7 +2728,23 @@ def save(self, *args, **kwargs): class DetectionQuerySet(BaseQuerySet): - def null_detections(self): + def valid(self): + """ + Detections suitable for consumer queries — excludes null-marker sentinels. + + Null markers are rows that record "an algorithm ran against this image and + found nothing." Consumers asking "give me detections" should always go + through .valid(). Future predicates to fold in here: soft-delete tombstones, + detections missing an algorithm reference, detections missing classifications. + """ + return self.exclude(NULL_DETECTIONS_FILTER) + + def null_markers(self): + """ + Sentinel rows that record "this algorithm ran against this image and found + nothing." Only relevant for SourceImage-level "has this been processed?" + questions. Detection consumers should use .valid() instead. + """ return self.filter(NULL_DETECTIONS_FILTER) @@ -2796,6 +2822,26 @@ class Detection(BaseModel): objects = DetectionManager() + NULL_BBOX = None + """Canonical bbox value for null markers (rows that record 'an algorithm ran but + found nothing'). Use Detection.build_null_marker() to construct them. The legacy + bbox=[] form is still recognised by .null_markers() / .is_null_marker for + backwards compatibility with historical rows.""" + + @property + def is_null_marker(self) -> bool: + """True for sentinel rows representing 'no detections found by this algorithm.'""" + return self.bbox is None or self.bbox == [] + + @classmethod + def build_null_marker(cls, source_image, detection_algorithm) -> "Detection": + """Construct (without saving) a null-marker Detection for the given image+algorithm.""" + return cls( + source_image=source_image, + bbox=cls.NULL_BBOX, + detection_algorithm=detection_algorithm, + ) + def get_bbox(self): if self.bbox: return BoundingBox( @@ -2911,7 +2957,20 @@ def __str__(self) -> str: class OccurrenceQuerySet(BaseQuerySet): def valid(self): - return self.exclude(detections__isnull=True) + """ + Occurrences fit to surface in API responses: at least one real detection AND + a determination set. + + Excludes: + - Occurrences with no detections at all (orphans) + - Occurrences whose only detections are null-marker sentinels (Issue #1310: + field bug created phantom occurrences with no real bounding box backing + them) + - Occurrences with determination__isnull=True (no taxonomic identification, + same field bug shape) + """ + has_valid_detection = Exists(Detection.objects.valid().filter(occurrence_id=OuterRef("pk"))) + return self.filter(has_valid_detection).exclude(determination__isnull=True) def with_detections_count(self): return self.annotate(detections_count=models.Count("detections", distinct=True)) @@ -4105,7 +4164,7 @@ def with_source_images_with_detections_count(self): return self.annotate( source_images_with_detections_count=models.Count( "images", - filter=(~models.Q(images__detections__bbox__isnull=True) & ~models.Q(images__detections__bbox=[])), + filter=~null_detections_q("images__detections__"), distinct=True, ) ) @@ -4497,10 +4556,11 @@ def sample_greatest_file_size_from_each_event(self, num_each: int = 1): return captures def sample_detections_only(self): - """Sample all source images with detections""" + """Sample all source images with at least one real (non-null-marker) detection.""" qs = self.get_queryset() - return qs.filter(detections__isnull=False).distinct() + valid_detection_image_ids = Detection.objects.valid().values("source_image_id") + return qs.filter(pk__in=valid_detection_image_ids).distinct() def sample_full( self, diff --git a/ami/main/tests.py b/ami/main/tests.py index 517c4c87b..cf5be71e0 100644 --- a/ami/main/tests.py +++ b/ami/main/tests.py @@ -4761,3 +4761,270 @@ def test_registration_order_preserves_occurrence_retrieve(self): retrieve_response = self.client.get(f"/api/v2/occurrences/{occurrence.pk}/?project_id={self.project.pk}") self.assertEqual(stats_response.status_code, 200, "stats URL must resolve") self.assertEqual(retrieve_response.status_code, 200, "occurrence retrieve must still work") + + +class TestDetectionNullMarker(TestCase): + """ + Covers the null-marker abstraction added for Issue #1310 follow-up: + Detection.is_null_marker, Detection.build_null_marker, and + DetectionQuerySet.valid() / .null_markers(). + """ + + def setUp(self): + from ami.ml.models.algorithm import Algorithm + + self.project = Project.objects.create(name="Null Marker Test Project") + self.deployment = Deployment.objects.create(project=self.project, name="dep") + self.source_image = SourceImage.objects.create( + deployment=self.deployment, + project=self.project, + path="nullmarker-test.jpg", + ) + self.algorithm = Algorithm.objects.create(name="test-detector", key="test-detector", task_type="detection") + + def test_is_null_marker_for_bbox_none(self): + det = Detection.objects.create( + source_image=self.source_image, + bbox=None, + detection_algorithm=self.algorithm, + ) + self.assertTrue(det.is_null_marker) + + def test_is_null_marker_for_bbox_empty_list_legacy(self): + det = Detection.objects.create( + source_image=self.source_image, + bbox=[], + detection_algorithm=self.algorithm, + ) + self.assertTrue(det.is_null_marker) + + def test_is_null_marker_false_for_real_detection(self): + det = Detection.objects.create( + source_image=self.source_image, + bbox=[0.0, 0.0, 1.0, 1.0], + detection_algorithm=self.algorithm, + ) + self.assertFalse(det.is_null_marker) + + def test_build_null_marker_sets_canonical_fields(self): + det = Detection.build_null_marker(self.source_image, self.algorithm) + self.assertIsNone(det.bbox) + self.assertEqual(det.source_image, self.source_image) + self.assertEqual(det.detection_algorithm, self.algorithm) + # timestamp is intentionally not set on the builder — Detection.save() backfills + # it from the source image's capture time, so the marker sorts by capture time + # rather than processing time. + self.assertIsNone(det.timestamp) + self.assertTrue(det.is_null_marker) + + def test_valid_and_null_markers_are_disjoint_and_complete(self): + real = Detection.objects.create( + source_image=self.source_image, + bbox=[0.0, 0.0, 1.0, 1.0], + detection_algorithm=self.algorithm, + ) + null_via_none = Detection.objects.create( + source_image=self.source_image, + bbox=None, + detection_algorithm=self.algorithm, + ) + null_via_empty = Detection.objects.create( + source_image=self.source_image, + bbox=[], + detection_algorithm=self.algorithm, + ) + scoped = Detection.objects.filter(source_image=self.source_image) + valid_pks = set(scoped.valid().values_list("pk", flat=True)) + null_pks = set(scoped.null_markers().values_list("pk", flat=True)) + self.assertEqual(valid_pks, {real.pk}) + self.assertEqual(null_pks, {null_via_none.pk, null_via_empty.pk}) + self.assertEqual(valid_pks & null_pks, set()) + + +class TestOccurrenceValidQuerySet(TestCase): + """ + Covers OccurrenceQuerySet.valid() tightening for Issue #1310: + excludes occurrences with no real detections and occurrences missing a + determination, in addition to the existing detections__isnull=True + exclusion. + """ + + def setUp(self): + from ami.main.models import Taxon + from ami.ml.models.algorithm import Algorithm + + self.project = Project.objects.create(name="Occurrence Valid Test Project") + self.deployment = Deployment.objects.create(project=self.project, name="dep") + self.event = Event.objects.create( + project=self.project, + deployment=self.deployment, + group_by="2024-01-01", + start=datetime.datetime(2024, 1, 1, 0, 0), + ) + self.source_image = SourceImage.objects.create( + deployment=self.deployment, + project=self.project, + event=self.event, + path="occvalid-test.jpg", + ) + self.algorithm = Algorithm.objects.create(name="valid-detector", key="valid-detector", task_type="detection") + self.taxon = Taxon.objects.create(name="Occurrence Valid Test Taxon") + + def _make_occurrence_with_real_detection(self) -> Occurrence: + occ = Occurrence.objects.create( + project=self.project, + event=self.event, + deployment=self.deployment, + determination=self.taxon, + ) + Detection.objects.create( + source_image=self.source_image, + bbox=[0.0, 0.0, 1.0, 1.0], + detection_algorithm=self.algorithm, + occurrence=occ, + ) + return occ + + def _make_occurrence_with_only_null_detection(self) -> Occurrence: + occ = Occurrence.objects.create( + project=self.project, + event=self.event, + deployment=self.deployment, + determination=self.taxon, + ) + Detection.objects.create( + source_image=self.source_image, + bbox=None, + detection_algorithm=self.algorithm, + occurrence=occ, + ) + return occ + + def _make_occurrence_with_null_determination(self) -> Occurrence: + occ = Occurrence.objects.create( + project=self.project, + event=self.event, + deployment=self.deployment, + determination=None, + ) + Detection.objects.create( + source_image=self.source_image, + bbox=[0.0, 0.0, 1.0, 1.0], + detection_algorithm=self.algorithm, + occurrence=occ, + ) + return occ + + def test_valid_returns_only_real_with_determination(self): + real = self._make_occurrence_with_real_detection() + null_only = self._make_occurrence_with_only_null_detection() + no_determination = self._make_occurrence_with_null_determination() + + valid_qs = Occurrence.objects.filter(project=self.project).valid() + valid_pks = set(valid_qs.values_list("pk", flat=True)) + + self.assertIn(real.pk, valid_pks) + self.assertNotIn(null_only.pk, valid_pks) + self.assertNotIn(no_determination.pk, valid_pks) + + +class TestCleanupNullOnlyOccurrencesCommand(TestCase): + """ + Covers ami/main/management/commands/cleanup_null_only_occurrences.py. + Verifies dry-run reports counts without deleting and --commit deletes + phantom occurrences and orphan null markers, leaving valid rows alone. + """ + + def setUp(self): + from ami.main.models import Taxon + from ami.ml.models.algorithm import Algorithm + + self.project = Project.objects.create(name="Cleanup Cmd Test Project") + self.deployment = Deployment.objects.create(project=self.project, name="dep") + self.event = Event.objects.create( + project=self.project, + deployment=self.deployment, + group_by="2024-01-01", + start=datetime.datetime(2024, 1, 1, 0, 0), + ) + self.algorithm = Algorithm.objects.create( + name="cleanup-detector", key="cleanup-detector", task_type="detection" + ) + self.taxon = Taxon.objects.create(name="Cleanup Test Taxon") + + self.img_with_real = SourceImage.objects.create( + deployment=self.deployment, + project=self.project, + event=self.event, + path="with-real.jpg", + ) + self.img_only_null = SourceImage.objects.create( + deployment=self.deployment, + project=self.project, + event=self.event, + path="only-null.jpg", + ) + + self.valid_occurrence = Occurrence.objects.create( + project=self.project, + event=self.event, + deployment=self.deployment, + determination=self.taxon, + ) + self.real_detection = Detection.objects.create( + source_image=self.img_with_real, + bbox=[0.0, 0.0, 1.0, 1.0], + detection_algorithm=self.algorithm, + occurrence=self.valid_occurrence, + ) + self.null_on_processed_image = Detection.objects.create( + source_image=self.img_with_real, + bbox=None, + detection_algorithm=self.algorithm, + ) + + self.phantom_occurrence = Occurrence.objects.create( + project=self.project, + event=self.event, + deployment=self.deployment, + determination=self.taxon, + ) + self.phantom_null = Detection.objects.create( + source_image=self.img_only_null, + bbox=None, + detection_algorithm=self.algorithm, + occurrence=self.phantom_occurrence, + ) + + def _call_command(self, *args): + from io import StringIO + + from django.core.management import call_command + + out = StringIO() + call_command("cleanup_null_only_occurrences", *args, stdout=out) + return out.getvalue() + + def test_dry_run_reports_counts_without_deleting(self): + output = self._call_command(f"--project={self.project.pk}") + self.assertIn("Phantom occurrences", output) + self.assertIn("Orphan null-marker detections", output) + self.assertIn("Dry run", output) + self.assertTrue(Occurrence.objects.filter(pk=self.phantom_occurrence.pk).exists()) + self.assertTrue(Detection.objects.filter(pk=self.phantom_null.pk).exists()) + + def test_commit_deletes_phantoms_and_orphan_null_markers(self): + self._call_command(f"--project={self.project.pk}", "--commit") + self.assertFalse(Occurrence.objects.filter(pk=self.phantom_occurrence.pk).exists()) + self.assertFalse(Detection.objects.filter(pk=self.phantom_null.pk).exists()) + self.assertTrue(Occurrence.objects.filter(pk=self.valid_occurrence.pk).exists()) + self.assertTrue(Detection.objects.filter(pk=self.real_detection.pk).exists()) + self.assertTrue( + Detection.objects.filter(pk=self.null_on_processed_image.pk).exists(), + "Null markers on images with at least one real detection must be kept", + ) + + def test_commit_is_idempotent(self): + self._call_command(f"--project={self.project.pk}", "--commit") + second_run = self._call_command(f"--project={self.project.pk}", "--commit") + self.assertIn("Nothing to clean up", second_run) diff --git a/ami/ml/models/pipeline.py b/ami/ml/models/pipeline.py index c259e4aea..9b0da5b3a 100644 --- a/ami/ml/models/pipeline.py +++ b/ami/ml/models/pipeline.py @@ -23,7 +23,6 @@ from ami.base.models import BaseModel, BaseQuerySet from ami.base.schemas import ConfigurableStage, default_stages from ami.main.models import ( - NULL_DETECTIONS_FILTER, Classification, Deployment, Detection, @@ -96,11 +95,11 @@ def filter_processed_images( task_logger.debug(f"Image {image} needs processing: has no existing detections from pipeline's detector") # If there are no existing detections from this pipeline, send the image yield image - elif not existing_detections.exclude(NULL_DETECTIONS_FILTER).exists(): # type: ignore + elif not existing_detections.valid().exists(): # All detections for this image are null (processed but nothing found) — skip task_logger.debug(f"Image {image} has only null detections from pipeline {pipeline}, skipping!") continue - elif existing_detections.exclude(NULL_DETECTIONS_FILTER).filter(classifications__isnull=True).exists(): + elif existing_detections.valid().filter(classifications__isnull=True).exists(): # Check if any real detections (non-null) have no classifications task_logger.debug( f"Image {image} needs processing: has existing detections with no classifications " @@ -439,9 +438,10 @@ def get_or_create_detection( ), f"Detection belongs to a different source image: {detection_repr}" if serialized_bbox is None: - # Null detection: algorithm-specific lookup so different pipelines don't share sentinels. - # Use bbox__isnull=True because JSONField filter(bbox=None) matches JSON null literal, - # not SQL NULL which is what Detection(bbox=None) stores. + # Null marker: algorithm-specific lookup so different pipelines don't share sentinels. + # Use .null_markers() so legacy bbox=[] sentinels from older runs are also matched and + # re-used instead of producing duplicate rows. Detection.NULL_BBOX is the canonical + # sentinel value used for new writes; .null_markers() recognises both forms. assert detection_resp.algorithm, f"No detection algorithm was specified for detection {detection_repr}" try: detection_algo = algorithms_known[detection_resp.algorithm.key] @@ -451,11 +451,14 @@ def get_or_create_detection( "The processing service must declare it in the /info endpoint. " f"Known algorithms: {list(algorithms_known.keys())}" ) from err - existing_detection = Detection.objects.filter( - source_image=source_image, - bbox__isnull=True, - detection_algorithm=detection_algo, - ).first() + existing_detection = ( + Detection.objects.filter( + source_image=source_image, + detection_algorithm=detection_algo, + ) + .null_markers() + .first() + ) else: # Real detection: algorithm-agnostic — same bbox = same physical detection existing_detection = Detection.objects.filter( @@ -952,15 +955,6 @@ def save_results( "Algorithms and category maps must be registered before processing, using /info endpoint." ) - # Ensure all images have detections - # if not, add a NULL detection (empty bbox) to the results - null_detections = create_null_detections_for_undetected_images( - results=results, - detection_algorithm=detection_algorithm, - logger=job_logger, - ) - results.detections = results.detections + null_detections - detections = create_detections( detections=results.detections, algorithms_known=algorithms_known, @@ -999,6 +993,22 @@ def save_results( for deployment in Deployment.objects.filter(pk__in=deployment_ids): deployment.update_calculated_fields(save=True) + # Mark images with no real detections as processed by creating null-bbox sentinels. + # Issue #1310: MUST be the final write. Persisting a null marker is the signal that + # filter_processed_images uses to skip an image on future runs, so it can only be + # written after every step that might fail has succeeded. Any raise earlier in the + # pipeline leaves the image unmarked so it gets retried on the next run. + null_detection_responses = create_null_detections_for_undetected_images( + results=results, + detection_algorithm=detection_algorithm, + logger=job_logger, + ) + create_detections( + detections=null_detection_responses, + algorithms_known=algorithms_known, + logger=job_logger, + ) + total_time = time.time() - start_time job_logger.info(f"Saved results from pipeline {pipeline} in {total_time:.2f} seconds") diff --git a/ami/ml/tests.py b/ami/ml/tests.py index 0eaa0a237..83b4907fb 100644 --- a/ami/ml/tests.py +++ b/ami/ml/tests.py @@ -13,6 +13,7 @@ Deployment, Detection, Event, + Occurrence, Project, SourceImage, SourceImageCollection, @@ -1024,6 +1025,117 @@ def test_null_detection_deduplication_same_pipeline(self): null_detections = image.detections.filter(bbox__isnull=True) self.assertEqual(null_detections.count(), 1, "Same pipeline should not create duplicate null detections") + def test_null_detection_does_not_create_phantom_occurrence(self): + """ + Issue #1310: a null detection (empty-bbox sentinel marking "image processed, + nothing found") must NOT spawn an Occurrence. Occurrences with no + determination and no real detections leak to the API as ghost rows. + """ + image = self.test_images[0] + results = self.fake_pipeline_results([image], self.pipeline) + results.detections = [] # pipeline found nothing + + save_results(results) + + null_dets = image.detections.filter(bbox__isnull=True) + self.assertEqual(null_dets.count(), 1, "Null marker should still be created") + self.assertIsNone( + null_dets.first().occurrence, + "Null detection must NOT be associated with an Occurrence", + ) + # No phantom Occurrence in DB tied to this image at all + phantom_occs = Occurrence.objects.filter(detections__source_image=image, determination__isnull=True) + self.assertEqual( + phantom_occs.count(), + 0, + "No Occurrence with NULL determination should exist for an image that had no detections", + ) + + def test_captures_not_marked_processed_after_failure(self): + """ + Issue #1310: null markers should only flag images as processed AFTER all + downstream save steps (classifications, occurrences) succeed. If any + downstream step raises, the image must remain unmarked so the next run + re-processes it. + + Reproduces the field bug where 400 images ended up with null markers but + no real detections — created when null-creation ran ahead of a later step + that failed. + """ + from unittest.mock import patch + + from ami.ml.models.pipeline import filter_processed_images + + # Mix: image_with_real has a detection in the response, image_without_real does not. + # The without-real image is the one that would get a null marker. + image_with_real, image_without_real = self.test_images + results = self.fake_pipeline_results(self.test_images, self.pipeline) + # Trim detections to only the first image so the second qualifies for null-marker creation + results.detections = [d for d in results.detections if str(d.source_image_id) == str(image_with_real.pk)] + + # Inject failure in a step that runs AFTER detection bulk_create + with patch( + "ami.ml.models.pipeline.create_classifications", + side_effect=RuntimeError("simulated classification failure"), + ): + with self.assertRaises(RuntimeError): + save_results(results) + + # The image with no real detection must NOT have a null marker — + # the run failed, so it should be re-tried. + null_dets = image_without_real.detections.filter(bbox__isnull=True) + self.assertEqual( + null_dets.count(), + 0, + "Image without real detections must not be marked processed when downstream step fails", + ) + # filter_processed_images should still yield it for the next run + retry_yield = list(filter_processed_images([image_without_real], self.pipeline)) + self.assertEqual( + retry_yield, + [image_without_real], + "Image with failed run must be re-yielded for processing", + ) + + def test_null_marker_not_persisted_when_broker_dispatch_fails(self): + """ + Issue #1310 (takeaway-review follow-up): null markers must be the FINAL + write in save_results. Failures in any of the trailing steps — + create_detection_images.delay (broker outage), update_calculated_fields_for_events + (DB error), Deployment.update_calculated_fields (DB error) — must leave the + image unmarked. + + This test patches the celery dispatch to raise, simulating a broker + outage between the real-detection save and the null-marker save. + """ + from unittest.mock import patch + + from ami.ml.models.pipeline import filter_processed_images + + image_with_real, image_without_real = self.test_images + results = self.fake_pipeline_results(self.test_images, self.pipeline) + results.detections = [d for d in results.detections if str(d.source_image_id) == str(image_with_real.pk)] + + with patch( + "ami.ml.models.pipeline.create_detection_images.delay", + side_effect=RuntimeError("simulated broker outage"), + ): + with self.assertRaises(RuntimeError): + save_results(results) + + null_dets = image_without_real.detections.filter(bbox__isnull=True) + self.assertEqual( + null_dets.count(), + 0, + "Null marker must not be persisted when create_detection_images.delay fails", + ) + retry_yield = list(filter_processed_images([image_without_real], self.pipeline)) + self.assertEqual( + retry_yield, + [image_without_real], + "Image with failed broker dispatch must be re-yielded for processing", + ) + class TestAlgorithmCategoryMaps(TestCase): def setUp(self): diff --git a/docs/claude/planning/pr-1312-null-marker-followup.md b/docs/claude/planning/pr-1312-null-marker-followup.md new file mode 100644 index 000000000..e0b6a9964 --- /dev/null +++ b/docs/claude/planning/pr-1312-null-marker-followup.md @@ -0,0 +1,154 @@ +# PR #1312 follow-up — null-marker abstraction + move-to-end + cleanup + +Created 2026-05-20. Builds on existing branch `fix/premptive-processed-marker`. + +Splits remaining work from the takeaway review of PR #1312 into two PRs. + +## PR-A (this plan): move null to end + null-detection abstraction + project 171 cleanup + +Extends PR #1312. Adds commits on top of the existing `4e33f96` ordering fix. + +### Scope rationale + +Current PR moves null markers AFTER real detection / classification / occurrence saves but BEFORE `source_image.save()` loop, `create_detection_images.delay()`, `update_calculated_fields_for_events`, and `Deployment.update_calculated_fields`. Per Copilot's review: any raise in those four steps still leaves a null marker persisted while the function failed. + +PR-A closes that window by moving null persistence to the absolute final step. Failure window left = the return statement — effectively zero. + +Bundled because all changes touch the same call graph (`save_results` body + `Detection` manager + `OccurrenceQuerySet.valid()` + cleanup of the rows the bug produced). Reviewing them together gives a single coherent semantic for what "null marker" means going forward. + +### Final order in `save_results` + +1. `create_detections(real_responses, ...)` +2. `create_classifications(...)` +3. `create_and_update_occurrences_for_detections(...)` +4. `source_image.save()` loop +5. `create_detection_images.delay(...)` +6. `update_calculated_fields_for_events(pks=event_ids)` +7. `Deployment.update_calculated_fields(save=True)` loop +8. `create_detections(null_responses, ...)` ← was step 0, now last write +9. return + +### Null-detection abstraction + +`ami/main/models.py`: + +```python +class Detection(BaseModel): + NULL_BBOX = None # canonical sentinel value for new rows + + @property + def is_null_marker(self) -> bool: + return self.bbox is None or self.bbox == [] + + @classmethod + def build_null_marker(cls, source_image, detection_algorithm) -> "Detection": + return cls( + source_image=source_image, + bbox=cls.NULL_BBOX, + detection_algorithm=detection_algorithm, + timestamp=now(), + ) + + +class DetectionQuerySet(BaseQuerySet): + def valid(self): + """ + Detections suitable for consumer queries — excludes null-marker sentinels. + Future predicates to fold in here: soft-delete tombstones, missing + detection_algorithm, missing classifications. + Consumers asking 'give me detections' should always go through .valid(). + """ + return self.exclude(NULL_DETECTIONS_FILTER) + + def null_markers(self): + """ + Sentinel rows recording 'this algorithm ran against this image and + found nothing.' Only for SourceImage-level 'has this been processed?' + questions. Detection consumers should use .valid() instead. + """ + return self.filter(NULL_DETECTIONS_FILTER) +``` + +Rename existing `DetectionQuerySet.null_detections()` (`ami/main/models.py:2721`) → `null_markers()`. Single sweep across codebase. + +### Call-site sweep + +`.exclude(NULL_DETECTIONS_FILTER)` → `.valid()`: +- `ami/main/models.py:817, 1229, 2037, 2243` +- `ami/main/api/views.py:614, 913` +- `ami/ml/models/pipeline.py:99, 103` + +`.filter(NULL_DETECTIONS_FILTER)` → `.null_markers()`: +- `ami/main/models.py:2722` (the method itself, becomes the body of `null_markers()`) + +Drifted inline at `ami/main/models.py:4108` (`~Q(...bbox__isnull=True) & ~Q(...bbox=[])`) → rewrite to use `.valid()` via subquery or join-based predicate. Verify it still works against ORM aggregation. + +Inline `bbox__isnull=True` at `ami/ml/models/pipeline.py:454-458` — intentional (only `IS NULL` form, comment explains). Leave alone but add a one-line comment pointing readers to `Detection.NULL_BBOX` for the canonical sentinel. + +### Occurrence valid() tightening + +`OccurrenceQuerySet.valid()` at `ami/main/models.py:2913` currently only excludes `detections__isnull=True`. Extend to also exclude: +- Occurrences whose only detections are null markers +- Occurrences with `determination__isnull=True` + +Use new `Detection.objects.valid()` helper to express "has at least one valid detection." + +### Project 171 cleanup + +Management command `ami/main/management/commands/cleanup_null_only_occurrences.py`: +- For each Occurrence in given project with no `Detection.objects.valid()` rows: delete the Occurrence and its null-marker Detection rows +- Source images then re-yield from `filter_processed_images` on next pipeline run +- Idempotent: re-running on a cleaned project is a no-op +- Dry-run mode by default + +### TDD test plan + +Tests added to `ami/ml/tests.py::TestPipeline` (extends the two already added in `4e33f96`): + +1. **Broker outage path** — patch `create_detection_images.delay` to raise `OperationalError`. Assert: + - `RuntimeError`/`OperationalError` propagates + - Zero null markers on the undetected image + - `filter_processed_images` yields the image on a second run + +2. **Calc-field DB error path** — patch `update_calculated_fields_for_events` to raise. Same assertions as #1. + +3. **`Detection.is_null_marker` property** — direct property test for both `bbox=None` and `bbox=[]` legacy form. + +4. **`Detection.objects.valid()` and `.null_markers()`** — assert disjoint querysets covering full Detection set for a fixture image with one real + one null detection. + +5. **OccurrenceQuerySet.valid() exclusion** — fixture with three occurrences: real detection only / null detection only / null determination. Assert `valid()` returns only the first. + +6. **Cleanup management command dry-run** — fixture project with phantom occurrences. Run command with `--dry-run`. Assert reported counts match. Run without dry-run. Assert deletion. + +### Commit shape + +Roughly: +1. `test(ml): RED test for broker-outage leaving null marker` +2. `fix(ml): move null-marker creation to final step in save_results` +3. `refactor(main): add DetectionQuerySet.valid()/.null_markers() + Detection.is_null_marker + build_null_marker` +4. `refactor(main): sweep call sites to use .valid() / .null_markers()` +5. `fix(main): tighten OccurrenceQuerySet.valid() to exclude null-only and null-determination` +6. `feat(main): cleanup_null_only_occurrences management command` + +### Out of scope for PR-A + +- `transaction.atomic()` + `transaction.on_commit` wrap — see PR-B below +- Re-classification gap in `filter_processed_images` — separate ticket +- `bbox = '[]'` legacy data migration — `NULL_DETECTIONS_FILTER` absorbs it; rewrite is unnecessary churn + +## PR-B (follow-up): narrow transaction.atomic() + on_commit + +After PR-A merges. Tracked at `docs/claude/planning/pr-1312-tx-wrap-followup.md` (to be written). + +Wraps `create_detections` + `create_classifications` + `create_and_update_occurrences_for_detections` + final `create_detections(null_responses, ...)` in `transaction.atomic()`. Dispatches `create_detection_images.delay` via `transaction.on_commit`. Calc-field updates stay outside. + +Closes the narrow "real detection committed but classification failed mid-bulk-create" window that PR-A leaves open. Separate PR because tx changes carry concurrency risk (PR-1261 scar) — needs its own multi-worker contention e2e plan and clean revert path. + +## E2E validation plan (PR-A) + +Reusing the dev-box atomic-rollback testing pattern from the previous session: + +1. Happy path: full async_api job. Assert no new phantom occurrences, all real detections persist, null markers on undetected images persist. +2. Broker-outage simulation: kill RabbitMQ mid-job or patch `delay()` to raise. Assert image stays in `filter_processed_images` yield list. +3. Calc-field DB error: patch `update_calculated_fields_for_events` to raise. Same assertion. +4. Cleanup command: run dry-run on project 171, capture counts, run for real, assert API no longer returns null-only Occurrences. diff --git a/docs/claude/sessions/2026-05-19-pr-1312-premptive-processed-marker.md b/docs/claude/sessions/2026-05-19-pr-1312-premptive-processed-marker.md new file mode 100644 index 000000000..09d700c68 --- /dev/null +++ b/docs/claude/sessions/2026-05-19-pr-1312-premptive-processed-marker.md @@ -0,0 +1,97 @@ +# PR #1312 — fix preemptive null-detection marker (Issue #1310) + +Created 2026-05-19. Branch: `fix/premptive-processed-marker`. URL: https://github.com/RolnickLab/antenna/pull/1312 + +## Status + +- ✅ PR opened against `main` +- ✅ TDD: 2 new failing tests added (`ami/ml/tests.py::TestPipeline`), then code fix made them pass +- ✅ Local test suites: `ami.ml.tests` + `ami.jobs.tests` = 170/170 pass; 18/18 in `TestPipeline` +- ✅ Manual e2e on serbia dev server (2026-05-19) — 3 paths validated, see below + +## Bug + +Issue #1310: project 171 had ~400 captures marked as processed with only null-bbox detections, no real detections, and phantom Occurrences with `determination=NULL` leaking to API. + +Root cause in `ami/ml/models/pipeline.py::save_results` (pre-fix order): +1. `create_null_detections_for_undetected_images` ran FIRST, built null `DetectionResponse`s for images with no real detections +2. Null responses merged into `results.detections` (`results.detections = results.detections + null_detections`) +3. `create_detections` bulk-saved the merged list (null + real) +4. `create_classifications` and `create_and_update_occurrences_for_detections` then iterated **all** detections — including nulls — so each null spawned an Occurrence with `determination=NULL` + +Failure mode: any exception in steps 3-4 left the image with a null marker already in DB but no real detections + phantom Occurrences. `filter_processed_images` then permanently skipped these images. + +## Fix (PR #1312) + +`ami/ml/models/pipeline.py::save_results`: +- Real `create_detections` / `create_classifications` / `create_and_update_occurrences_for_detections` run first on real DetectionResponses only +- THEN `create_null_detections_for_undetected_images` builds null responses, passed to a SECOND `create_detections` call +- Nulls never enter the classification or occurrence loops → no phantom Occurrences even on happy path +- If any earlier step raises, null markers are never persisted → `filter_processed_images` re-yields the image on retry + +## Tests added + +`ami/ml/tests.py::TestPipeline`: +- `test_null_detection_does_not_create_phantom_occurrence` (line ~1028) — happy path: pipeline finds nothing, null marker created, no Occurrence +- `test_captures_not_marked_processed_after_failure` (line ~1054) — patches `create_classifications` to raise; asserts no null marker persisted, `filter_processed_images` re-yields image + +Both tests confirmed RED against pre-fix code, GREEN after fix. + +## Verified safe (during brainstorm) + +Skipping Occurrence creation for null detections is safe because: +- `Detection.occurrence` FK is `null=True, on_delete=SET_NULL` (`ami/main/models.py:2764-2770`) +- Tracking not yet implemented — `create_and_update_occurrences_for_detections` carries `@TODO remove when we implement tracking!` comment +- All `Detection.objects.filter(occurrence=…)` traversals start FROM an Occurrence; never traverse from a null detection +- `create_classifications` loops paired responses; null DetectionResponses have empty `.classifications` → never iterated +- `seed_synthetic_occurrences` already excludes `occurrence__isnull=True` + +## Out-of-scope follow-ups (in PR body) + +1. **API filter for null-only / no-determination occurrences.** `OccurrenceQuerySet.valid()` (`ami/main/models.py:2913-2914`) currently only excludes `detections__isnull=True`. Doesn't exclude null-only-detection occurrences or `determination__isnull=True`. With this PR no new phantom Occurrences will be created, but project 171's existing phantoms still surface via API until `valid()` is tightened or `OccurrenceViewSet.get_queryset` (`ami/main/api/views.py:1220-1238`) adds the exclusion. + +2. **Cleanup of project 171's broken state.** ~400 source images have null-only detections + phantom Occurrences. Need management command to delete null-only Detection rows + their phantom Occurrences so `filter_processed_images` re-processes them. + +3. **TODO ticket: re-classification gap.** `filter_processed_images` notes "we don't yet have a mechanism to reclassify detections" — current behavior is to reprocess from scratch. Not blocking but worth tracking. + +## E2E results (serbia dev box, 2026-05-19) + +Serbia originally on `copilot/implement-option-a-job-logs`. Fetched + checked out `fix/premptive-processed-marker`, restarted django + celeryworker, confirmed reordered `save_results` loaded via `inspect.getsource`. After testing, restored to original branch. + +### Path 1: happy path — async_api job (full stack) + +Triggered via `manage.py test_ml_job_e2e --project 9 --collection 38 --pipeline quebec_vermont_moths_2023 --dispatch-mode async_api`. Job 162 succeeded in 44s. NATS path, 10 images, 8 save_results batches. Detection / occurrence counts unchanged (project 9 has `reprocess_existing_detections=False`, so existing rows were not duplicated). No new phantom occurrences. + +### Path 2: zero-detection path — synthetic save_results inside atomic + +Live script `/tmp/test_save_results_live.py` (copied into django container). Built a `PipelineResultsResponse` with two source images: 173110 (one bbox detection) and 173740 (no detection in `results.detections`). Called `save_results` inside `transaction.atomic()` with explicit rollback at end. + +In-flight state observed: +- si=173110: 1 new real detection + pre-existing pristine null det (52287 → pre-existing phantom occ 52073) +- si=173740: 1 new null marker (det 61357, `bbox=None`, `occurrence_id=None`) +- New phantom occurrences (excluding pre-existing 52073): **0** + +Post-rollback: baseline restored. + +### Path 3: failure path — patch `create_classifications` to raise + +Live script `/tmp/test_failure_path.py`. Same `PipelineResultsResponse` as path 2, but wrapped `save_results` call in `patch("ami.ml.models.pipeline.create_classifications", side_effect=RuntimeError(...))`. Confirmed RuntimeError propagates up out of `save_results`. In-flight state: +- si=173110: 1 new real detection created (create_detections ran before classification raised) +- si=173740: **0 new null markers** — `create_null_detections_for_undetected_images` never ran because the exception fired before it. + +This confirms the core fix: image without real detections stays unmarked when downstream save fails, so `filter_processed_images` will re-yield it on retry. + +### Same `save_results` is shared by v1 sync + v2 async (NATS) + +Confirmed: path 1 was async_api (NATS), but the function under test is identical for sync_api too. Single e2e on either dispatch path validates both. + +## Files touched + +- `ami/ml/models/pipeline.py` — `save_results` body reordered, +null_detection_responses path +- `ami/ml/tests.py` — Occurrence import + 2 new tests + +## Compose gotcha (worth remembering) + +For tests on main repo while a worktree override is active in `docker-compose.override.yml`, use `docker-compose.ci.yml` which doesn't pull the override (only mounts `.:/app:z` from current directory). The local dev stack will read the bound worktree subdir instead of the main repo. + +Stash trap encountered: `git stash` on the main repo branch did NOT stash uncommitted edits to `ami/ml/tests.py` + `ami/ml/models/pipeline.py` cleanly during a baseline-verification run. Edits were lost on `stash pop`. Workaround: don't stash to compare against baseline; instead run the test pre-edit on a clean checkout, or `git diff` the file directly.