Skip to content

Commit 927abec

Browse files
authored
Merge pull request #44 from MozillaSecurity/import-ml-classification
Import BugBot ML spam score, and only show reports with >95% valid probability.
2 parents 785d1e7 + 74f796e commit 927abec

8 files changed

Lines changed: 49 additions & 2 deletions

File tree

server/frontend/src/components/Buckets/View.vue

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -247,6 +247,7 @@ export default {
247247
query: JSON.stringify({
248248
op: "AND",
249249
comments__length__gt: 0,
250+
ml_valid_probability__gt: 0.95,
250251
bucket_id: this.bucket.id,
251252
}),
252253
};

server/reportmanager/management/commands/import_reports_from_bigquery.py

Lines changed: 20 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -38,10 +38,14 @@ def handle(self, *args, **options):
3838
# These shouldn't even exist, but we have quite a few rows like that
3939
# anyway. Since they're most likely just broken reports, we don't care.
4040
result = client.query_and_wait(
41-
f"""SELECT r.*, t.language_code, t.translated_text
41+
f"""SELECT
42+
r.*, t.language_code, t.translated_text,
43+
c.label as ml_label, c.probability as ml_probability
4244
FROM `{settings.BIGQUERY_TABLE}` as r
4345
LEFT JOIN `{settings.BIGQUERY_TRANSLATIONS_TABLE}` t
4446
ON r.uuid = t.report_uuid
47+
LEFT JOIN `{settings.BIGQUERY_CLASSIFICATION_TABLE}` c
48+
ON r.uuid = c.report_uuid
4549
WHERE r.url IS NOT NULL
4650
AND r.comments IS NOT NULL
4751
AND r.reported_at >= @since;""",
@@ -53,6 +57,20 @@ def handle(self, *args, **options):
5357
)
5458

5559
for row in result:
60+
# The BugBot ML prediction can assign two labels, invalid or valid,
61+
# with a probability between 0 and 1. Having two labels makes
62+
# filtering and sorting harder, so let's transform "invalid 95%"
63+
# into "valid 5%".
64+
# There is a rare chance that a report will have no score. In this case,
65+
# we just assign None, which will get treated as invalid in the
66+
# frontend.
67+
ml_valid_probability = None
68+
match row.ml_label:
69+
case "invalid":
70+
ml_valid_probability = 1 - row.ml_probability
71+
case "valid":
72+
ml_valid_probability = row.ml_probability
73+
5674
report_obj = Report(
5775
app_name=row.app_name,
5876
app_channel=row.app_channel,
@@ -66,6 +84,7 @@ def handle(self, *args, **options):
6684
url=urlsplit(row.url),
6785
os=row.os,
6886
uuid=row.uuid,
87+
ml_valid_probability=ml_valid_probability,
6988
)
7089
with suppress(IntegrityError):
7190
ReportEntry.objects.create_from_report(report_obj)
Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,18 @@
1+
# Generated by Django 4.2.17 on 2025-07-09 11:46
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
7+
8+
dependencies = [
9+
('reportmanager', '0009_reportentry_comments_original_language_and_more'),
10+
]
11+
12+
operations = [
13+
migrations.AddField(
14+
model_name='reportentry',
15+
name='ml_valid_probability',
16+
field=models.FloatField(null=True),
17+
),
18+
]

server/reportmanager/models.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -451,6 +451,7 @@ def create_from_report(self, report):
451451
comments=report.comments,
452452
comments_translated=report.comments_translated,
453453
comments_original_language=report.comments_original_language,
454+
ml_valid_probability=report.ml_valid_probability,
454455
)
455456

456457

@@ -468,6 +469,7 @@ class ReportEntry(models.Model):
468469
reported_at = models.DateTimeField()
469470
url = models.URLField(max_length=8192)
470471
uuid = models.UUIDField(unique=True)
472+
ml_valid_probability = models.FloatField(null=True)
471473

472474
objects = ReportEntryManager()
473475

@@ -524,6 +526,7 @@ def get_report(self):
524526
breakage_category=self.breakage_category.value
525527
if self.breakage_category is not None
526528
else None,
529+
ml_valid_probability=self.ml_valid_probability,
527530
)
528531
return self._cached_report
529532

server/reportmanager/serializers.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -203,6 +203,7 @@ class Meta:
203203
"comments_original_language",
204204
"details",
205205
"id",
206+
"ml_valid_probability",
206207
"os",
207208
"reported_at",
208209
"url",

server/reportmanager/views.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -708,6 +708,7 @@ class ReportEntryViewSet(
708708
"comments",
709709
"comments__length",
710710
"details",
711+
"ml_valid_probability",
711712
"os__name",
712713
"reported_at",
713714
"url",
@@ -1094,7 +1095,7 @@ def json_to_query(json_str):
10941095
raise RuntimeError(f"Invalid JSON: {e}")
10951096

10961097
def get_query_obj(obj, key=None):
1097-
if obj is None or isinstance(obj, str | list | int):
1098+
if obj is None or isinstance(obj, str | list | int | float):
10981099
kwargs = {key: obj}
10991100
qobj = Q(**kwargs)
11001101
return qobj

server/server/settings.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -130,6 +130,9 @@ def resolver_context_processor(request):
130130
BIGQUERY_TRANSLATIONS_TABLE = (
131131
"moz-fx-dev-dschubert-wckb.webcompat_user_reports.translations"
132132
)
133+
BIGQUERY_CLASSIFICATION_TABLE = (
134+
"moz-fx-dev-dschubert-wckb.webcompat_user_reports.bugbug_predictions"
135+
)
133136
BIGQUERY_SERVICE_ACCOUNT = ""
134137

135138
# Modify the way we generate our usernames, based on the email address

src/webcompat/models.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,7 @@ class Report:
5353
comments_original_language: str | None = None
5454
app_channel: str | None = None
5555
breakage_category: str | None = None
56+
ml_valid_probability: float | None = None
5657

5758
@classmethod
5859
def load(cls, data: str) -> Report:

0 commit comments

Comments
 (0)