Skip to content

Commit ea5508a

Browse files
authored
Import report country code data (#170)
* Import report country code data
* Update backfill command to import country data
1 parent bbc621d commit ea5508a

5 files changed

Lines changed: 51 additions & 12 deletions

File tree

server/reportmanager/management/commands/backfill_missing_report_data.py

Lines changed: 28 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747

4848
from django.conf import settings
4949
from django.core.management import BaseCommand
50+
from django.db.models import Q
5051
from google.cloud import bigquery
5152
from google.oauth2 import service_account
5253

@@ -62,6 +63,7 @@ class BackfillData:
6263
ml_valid_probability: float | None
6364
language_code: str | None
6465
translated_text: str | None
66+
country: str | None
6567

6668

6769
class Command(BaseCommand):
@@ -80,17 +82,19 @@ def handle(self, *args, **options) -> None:
8082

8183
def run_backfill(self) -> None:
8284
# Find reports missing ML or country data (only those with non-empty comments)
83-
reports_to_update = ReportEntry.objects.filter(
84-
ml_valid_probability__isnull=True, comments__isnull=False
85-
).exclude(comments="")
85+
reports_to_update = (
86+
ReportEntry.objects.filter(comments__isnull=False)
87+
.exclude(comments="")
88+
.filter(Q(ml_valid_probability__isnull=True) | Q(country__isnull=True))
89+
)
8690

8791
total_reports = reports_to_update.count()
8892

8993
if total_reports == 0:
90-
LOG.info("No reports need ML backfill")
94+
LOG.info("No reports need backfill")
9195
return
9296

93-
LOG.info("Found %d reports needing ML backfill", total_reports)
97+
LOG.info("Found %d reports needing backfill", total_reports)
9498

9599
all_reports = list(reports_to_update)
96100
batches = list(batched(all_reports, self.BQ_BATCH_SIZE))
@@ -114,12 +118,12 @@ def run_backfill(self) -> None:
114118
len(report_batch),
115119
)
116120

117-
uuid_batch: list[str] = [str(report.uuid) for report in report_batch]
121+
uuid_batch = {str(report.uuid): report for report in report_batch}
118122

119123
query: str = f"""
120124
SELECT r.uuid,
121125
c.label as ml_label, c.probability as ml_probability,
122-
t.language_code, t.translated_text
126+
t.language_code, t.translated_text, r.country
123127
FROM `{settings.BIGQUERY_TABLE}` as r
124128
INNER JOIN `{settings.BIGQUERY_CLASSIFICATION_TABLE}` c
125129
ON r.uuid = c.report_uuid
@@ -138,13 +142,18 @@ def run_backfill(self) -> None:
138142

139143
bq_data: dict[str, BackfillData] = {}
140144
for row in result:
141-
ml_valid_probability = transform_ml_label(
142-
row.ml_label, row.ml_probability
143-
)
145+
report = uuid_batch[row.uuid]
146+
if report.ml_valid_probability is None:
147+
ml_valid_probability = transform_ml_label(
148+
row.ml_label, row.ml_probability
149+
)
150+
else:
151+
ml_valid_probability = report.ml_valid_probability
144152
bq_data[row.uuid] = BackfillData(
145153
ml_valid_probability=ml_valid_probability,
146154
language_code=row.language_code,
147155
translated_text=row.translated_text,
156+
country=row.country,
148157
)
149158

150159
LOG.info("Fetched data for %d reports from BigQuery", len(bq_data))
@@ -160,13 +169,15 @@ def run_backfill(self) -> None:
160169
if uuid in bq_data:
161170
data = bq_data[uuid]
162171
updated = False
172+
retriage = False
163173

164174
if (
165175
report.ml_valid_probability is None
166176
and data.ml_valid_probability is not None
167177
):
168178
report.ml_valid_probability = data.ml_valid_probability
169179
updated = True
180+
retriage = True
170181

171182
if (
172183
report.comments_translated is None
@@ -178,12 +189,17 @@ def run_backfill(self) -> None:
178189
data.translated_text
179190
)
180191
updated = True
192+
retriage = True
193+
194+
if report.country is None and data.country is not None:
195+
report.country = data.country
196+
updated = True
181197

182198
if updated:
183199
reports_to_update.append(report)
184200

185201
# Clear bucket assignment to re-triage reports whose ML or translation data changed
186-
if report.cluster_id is None:
202+
if retriage and report.cluster_id is None:
187203
report.bucket_id = None
188204

189205
if reports_to_update:
@@ -195,6 +211,7 @@ def run_backfill(self) -> None:
195211
"comments_original_language",
196212
"comments_preprocessed",
197213
"bucket_id",
214+
"country",
198215
],
199216
batch_size=self.DB_BATCH_SIZE,
200217
)

server/reportmanager/management/commands/import_reports_from_bigquery.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def handle(self, *args, **options):
4141
result = client.query(
4242
f"""SELECT
4343
r.*, t.language_code, t.translated_text,
44-
c.label as ml_label, c.probability as ml_probability
44+
c.label as ml_label, c.probability as ml_probability,
4545
FROM `{settings.BIGQUERY_TABLE}` as r
4646
LEFT JOIN `{settings.BIGQUERY_TRANSLATIONS_TABLE}` t
4747
ON r.uuid = t.report_uuid
@@ -68,6 +68,7 @@ def handle(self, *args, **options):
6868
comments=row.comments,
6969
comments_translated=row.translated_text,
7070
comments_original_language=row.language_code,
71+
country=row.country,
7172
details=row.details,
7273
reported_at=row.reported_at.replace(tzinfo=UTC),
7374
url=urlsplit(row.url),
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Generated by Django 6.0.3 on 2026-04-23 19:03
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
7+
8+
dependencies = [
9+
('reportmanager', '0016_joblock'),
10+
]
11+
12+
operations = [
13+
migrations.AddField(
14+
model_name='reportentry',
15+
name='country',
16+
field=models.TextField(max_length=2, null=True),
17+
),
18+
]

server/reportmanager/models.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -655,6 +655,7 @@ def create_from_report(self, report, bucket_id=None, cluster_id=None):
655655
cluster_id=cluster_id,
656656
domain=domain,
657657
comments_preprocessed=preprocessed,
658+
country=report.country,
658659
)
659660

660661

@@ -680,6 +681,7 @@ class ReportEntry(models.Model):
680681
)
681682
domain: models.CharField = models.CharField(max_length=255, null=True)
682683
comments_preprocessed: models.TextField = models.TextField(null=True)
684+
country: models.TextField = models.TextField(max_length=2, null=True)
683685

684686
objects = ReportEntryManager()
685687

src/webcompat/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ class Report:
5555
breakage_category: str | None = None
5656
ml_valid_probability: float | None = None
5757
cluster_id: str | None = None
58+
country: str | None = None
5859

5960
@classmethod
6061
def load(cls, data: str) -> Report:

0 commit comments

Comments
 (0)