Skip to content

Commit d0fb79b

Browse files
committed
Update backfill command to import country data
1 parent 524b3e1 commit d0fb79b

1 file changed

Lines changed: 28 additions & 11 deletions

File tree

server/reportmanager/management/commands/backfill_missing_report_data.py

Lines changed: 28 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,7 @@
4747

4848
from django.conf import settings
4949
from django.core.management import BaseCommand
50+
from django.db.models import Q
5051
from google.cloud import bigquery
5152
from google.oauth2 import service_account
5253

@@ -62,6 +63,7 @@ class BackfillData:
6263
ml_valid_probability: float | None
6364
language_code: str | None
6465
translated_text: str | None
66+
country: str | None
6567

6668

6769
class Command(BaseCommand):
@@ -80,17 +82,19 @@ def handle(self, *args, **options) -> None:
8082

8183
def run_backfill(self) -> None:
8284
# Find reports missing ML or country data (only those with non-empty comments)
83-
reports_to_update = ReportEntry.objects.filter(
84-
ml_valid_probability__isnull=True, comments__isnull=False
85-
).exclude(comments="")
85+
reports_to_update = (
86+
ReportEntry.objects.filter(comments__isnull=False)
87+
.exclude(comments="")
88+
.filter(Q(ml_valid_probability__isnull=True) | Q(country__isnull=True))
89+
)
8690

8791
total_reports = reports_to_update.count()
8892

8993
if total_reports == 0:
90-
LOG.info("No reports need ML backfill")
94+
LOG.info("No reports need backfill")
9195
return
9296

93-
LOG.info("Found %d reports needing ML backfill", total_reports)
97+
LOG.info("Found %d reports needing backfill", total_reports)
9498

9599
all_reports = list(reports_to_update)
96100
batches = list(batched(all_reports, self.BQ_BATCH_SIZE))
@@ -114,12 +118,12 @@ def run_backfill(self) -> None:
114118
len(report_batch),
115119
)
116120

117-
uuid_batch: list[str] = [str(report.uuid) for report in report_batch]
121+
uuid_batch = {str(report.uuid): report for report in report_batch}
118122

119123
query: str = f"""
120124
SELECT r.uuid,
121125
c.label as ml_label, c.probability as ml_probability,
122-
t.language_code, t.translated_text
126+
t.language_code, t.translated_text, r.country
123127
FROM `{settings.BIGQUERY_TABLE}` as r
124128
INNER JOIN `{settings.BIGQUERY_CLASSIFICATION_TABLE}` c
125129
ON r.uuid = c.report_uuid
@@ -138,13 +142,18 @@ def run_backfill(self) -> None:
138142

139143
bq_data: dict[str, BackfillData] = {}
140144
for row in result:
141-
ml_valid_probability = transform_ml_label(
142-
row.ml_label, row.ml_probability
143-
)
145+
report = uuid_batch[row.uuid]
146+
if report.ml_valid_probability is None:
147+
ml_valid_probability = transform_ml_label(
148+
row.ml_label, row.ml_probability
149+
)
150+
else:
151+
ml_valid_probability = report.ml_valid_probability
144152
bq_data[row.uuid] = BackfillData(
145153
ml_valid_probability=ml_valid_probability,
146154
language_code=row.language_code,
147155
translated_text=row.translated_text,
156+
country=row.country,
148157
)
149158

150159
LOG.info("Fetched data for %d reports from BigQuery", len(bq_data))
@@ -160,13 +169,15 @@ def run_backfill(self) -> None:
160169
if uuid in bq_data:
161170
data = bq_data[uuid]
162171
updated = False
172+
retriage = False
163173

164174
if (
165175
report.ml_valid_probability is None
166176
and data.ml_valid_probability is not None
167177
):
168178
report.ml_valid_probability = data.ml_valid_probability
169179
updated = True
180+
retriage = True
170181

171182
if (
172183
report.comments_translated is None
@@ -178,12 +189,17 @@ def run_backfill(self) -> None:
178189
data.translated_text
179190
)
180191
updated = True
192+
retriage = True
193+
194+
if report.country is None and data.country is not None:
195+
report.country = data.country
196+
updated = True
181197

182198
if updated:
183199
reports_to_update.append(report)
184200

185201
# Clear bucket assignment to re-triage these reports
186-
if report.cluster_id is None:
202+
if retriage and report.cluster_id is None:
187203
report.bucket_id = None
188204

189205
if reports_to_update:
@@ -195,6 +211,7 @@ def run_backfill(self) -> None:
195211
"comments_original_language",
196212
"comments_preprocessed",
197213
"bucket_id",
214+
"country",
198215
],
199216
batch_size=self.DB_BATCH_SIZE,
200217
)

0 commit comments

Comments
 (0)