Merge pull request #44 from MozillaSecurity/import-ml-classification

denschub · web-flow · commit 927abeca3593 · 2025-07-10T11:40:24.000+02:00
Import BugBot ML spam score, and only show reports with >=95% valid probability.
diff --git a/server/frontend/src/components/Buckets/View.vue b/server/frontend/src/components/Buckets/View.vue
@@ -247,6 +247,7 @@ export default {
         query: JSON.stringify({
           op: "AND",
           comments__length__gt: 0,
+          ml_valid_probability__gt: 0.95,
           bucket_id: this.bucket.id,
         }),
       };
diff --git a/server/reportmanager/management/commands/import_reports_from_bigquery.py b/server/reportmanager/management/commands/import_reports_from_bigquery.py
@@ -38,10 +38,14 @@ def handle(self, *args, **options):
         # These shouldn't even exist, but we have quite a few rows like that
         # anyway. Since they're most likely just broken reports, we don't care.
         result = client.query_and_wait(
-            f"""SELECT r.*, t.language_code, t.translated_text
+            f"""SELECT
+                    r.*, t.language_code, t.translated_text,
+                    c.label as ml_label, c.probability as ml_probability
                 FROM `{settings.BIGQUERY_TABLE}` as r
                 LEFT JOIN `{settings.BIGQUERY_TRANSLATIONS_TABLE}` t
                     ON r.uuid = t.report_uuid
+                LEFT JOIN `{settings.BIGQUERY_CLASSIFICATION_TABLE}` c
+                    ON r.uuid = c.report_uuid
                 WHERE r.url IS NOT NULL
                     AND r.comments IS NOT NULL
                     AND r.reported_at >= @since;""",
@@ -53,6 +57,20 @@ def handle(self, *args, **options):
         )
 
         for row in result:
+            # The BugBot ML prediction can assign two labels, invalid or valid,
+            # with a probability between 0 and 1. Having two labels makes
+            # filtering and sorting harder, so let's transform "invalid 95%"
+            # into "valid 5%".
+            # There is a rare chance that a bug will have no score. In this case,
+            # we just assign None, which will get treated as invalid in the
+            # frontend.
+            ml_valid_probability = None
+            match row.ml_label:
+                case "invalid":
+                    ml_valid_probability = 1 - row.ml_probability
+                case "valid":
+                    ml_valid_probability = row.ml_probability
+
             report_obj = Report(
                 app_name=row.app_name,
                 app_channel=row.app_channel,
@@ -66,6 +84,7 @@ def handle(self, *args, **options):
                 url=urlsplit(row.url),
                 os=row.os,
                 uuid=row.uuid,
+                ml_valid_probability=ml_valid_probability,
             )
             with suppress(IntegrityError):
                 ReportEntry.objects.create_from_report(report_obj)
diff --git a/server/reportmanager/migrations/0010_reportentry_ml_valid_probability.py b/server/reportmanager/migrations/0010_reportentry_ml_valid_probability.py
@@ -0,0 +1,18 @@
+# Generated by Django 4.2.17 on 2025-07-09 11:46
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('reportmanager', '0009_reportentry_comments_original_language_and_more'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='reportentry',
+            name='ml_valid_probability',
+            field=models.FloatField(null=True),
+        ),
+    ]
diff --git a/server/reportmanager/models.py b/server/reportmanager/models.py
@@ -451,6 +451,7 @@ def create_from_report(self, report):
             comments=report.comments,
             comments_translated=report.comments_translated,
             comments_original_language=report.comments_original_language,
+            ml_valid_probability=report.ml_valid_probability,
         )
 
 
@@ -468,6 +469,7 @@ class ReportEntry(models.Model):
     reported_at = models.DateTimeField()
     url = models.URLField(max_length=8192)
     uuid = models.UUIDField(unique=True)
+    ml_valid_probability = models.FloatField(null=True)
 
     objects = ReportEntryManager()
 
@@ -524,6 +526,7 @@ def get_report(self):
                 breakage_category=self.breakage_category.value
                 if self.breakage_category is not None
                 else None,
+                ml_valid_probability=self.ml_valid_probability,
             )
         return self._cached_report
 
diff --git a/server/reportmanager/serializers.py b/server/reportmanager/serializers.py
@@ -203,6 +203,7 @@ class Meta:
             "comments_original_language",
             "details",
             "id",
+            "ml_valid_probability",
             "os",
             "reported_at",
             "url",
diff --git a/server/reportmanager/views.py b/server/reportmanager/views.py
@@ -708,6 +708,7 @@ class ReportEntryViewSet(
         "comments",
         "comments__length",
         "details",
+        "ml_valid_probability",
         "os__name",
         "reported_at",
         "url",
@@ -1094,7 +1095,7 @@ def json_to_query(json_str):
         raise RuntimeError(f"Invalid JSON: {e}")
 
     def get_query_obj(obj, key=None):
-        if obj is None or isinstance(obj, str | list | int):
+        if obj is None or isinstance(obj, str | list | int | float):
             kwargs = {key: obj}
             qobj = Q(**kwargs)
             return qobj
diff --git a/server/server/settings.py b/server/server/settings.py
@@ -130,6 +130,9 @@ def resolver_context_processor(request):
 BIGQUERY_TRANSLATIONS_TABLE = (
     "moz-fx-dev-dschubert-wckb.webcompat_user_reports.translations"
 )
+BIGQUERY_CLASSIFICATION_TABLE = (
+    "moz-fx-dev-dschubert-wckb.webcompat_user_reports.bugbug_predictions"
+)
 BIGQUERY_SERVICE_ACCOUNT = ""
 
 # Modify the way we generate our usernames, based on the email address
diff --git a/src/webcompat/models.py b/src/webcompat/models.py
@@ -53,6 +53,7 @@ class Report:
     comments_original_language: str | None = None
     app_channel: str | None = None
     breakage_category: str | None = None
+    ml_valid_probability: float | None = None
 
     @classmethod
     def load(cls, data: str) -> Report:

Original file line number	Diff line number	Diff line change
`@@ -130,6 +130,9 @@ def resolver_context_processor(request):`
`130`	`130`	`BIGQUERY_TRANSLATIONS_TABLE = (`
`131`	`131`	`"moz-fx-dev-dschubert-wckb.webcompat_user_reports.translations"`
`132`	`132`	`)`
	`133`	`+BIGQUERY_CLASSIFICATION_TABLE = (`
	`134`	`+ "moz-fx-dev-dschubert-wckb.webcompat_user_reports.bugbug_predictions"`
	`135`	`+)`
`133`	`136`	`BIGQUERY_SERVICE_ACCOUNT = ""`
`134`	`137`
`135`	`138`	`# Modify the way we generate our usernames, based on the email address`