4747
4848from django .conf import settings
4949from django .core .management import BaseCommand
50+ from django .db .models import Q
5051from google .cloud import bigquery
5152from google .oauth2 import service_account
5253
@@ -62,6 +63,7 @@ class BackfillData:
6263 ml_valid_probability : float | None
6364 language_code : str | None
6465 translated_text : str | None
66+ country : str | None
6567
6668
6769class Command (BaseCommand ):
@@ -80,17 +82,19 @@ def handle(self, *args, **options) -> None:
8082
8183 def run_backfill (self ) -> None :
8284 # Find reports missing ML data or country (only those with non-empty comments)
83- reports_to_update = ReportEntry .objects .filter (
84- ml_valid_probability__isnull = True , comments__isnull = False
85- ).exclude (comments = "" )
85+ reports_to_update = (
86+ ReportEntry .objects .filter (comments__isnull = False )
87+ .exclude (comments = "" )
88+ .filter (Q (ml_valid_probability__isnull = True ) | Q (country__isnull = True ))
89+ )
8690
8791 total_reports = reports_to_update .count ()
8892
8993 if total_reports == 0 :
90- LOG .info ("No reports need ML backfill" )
94+ LOG .info ("No reports need backfill" )
9195 return
9296
93- LOG .info ("Found %d reports needing ML backfill" , total_reports )
97+ LOG .info ("Found %d reports needing backfill" , total_reports )
9498
9599 all_reports = list (reports_to_update )
96100 batches = list (batched (all_reports , self .BQ_BATCH_SIZE ))
@@ -114,12 +118,12 @@ def run_backfill(self) -> None:
114118 len (report_batch ),
115119 )
116120
117- uuid_batch : list [ str ] = [ str (report .uuid ) for report in report_batch ]
121+ uuid_batch = { str (report .uuid ): report for report in report_batch }
118122
119123 query : str = f"""
120124 SELECT r.uuid,
121125 c.label as ml_label, c.probability as ml_probability,
122- t.language_code, t.translated_text
126+ t.language_code, t.translated_text, r.country
123127 FROM `{ settings .BIGQUERY_TABLE } ` as r
124128 INNER JOIN `{ settings .BIGQUERY_CLASSIFICATION_TABLE } ` c
125129 ON r.uuid = c.report_uuid
@@ -138,13 +142,18 @@ def run_backfill(self) -> None:
138142
139143 bq_data : dict [str , BackfillData ] = {}
140144 for row in result :
141- ml_valid_probability = transform_ml_label (
142- row .ml_label , row .ml_probability
143- )
145+ report = uuid_batch [row .uuid ]
146+ if report .ml_valid_probability is None :
147+ ml_valid_probability = transform_ml_label (
148+ row .ml_label , row .ml_probability
149+ )
150+ else :
151+ ml_valid_probability = report .ml_valid_probability
144152 bq_data [row .uuid ] = BackfillData (
145153 ml_valid_probability = ml_valid_probability ,
146154 language_code = row .language_code ,
147155 translated_text = row .translated_text ,
156+ country = row .country ,
148157 )
149158
150159 LOG .info ("Fetched data for %d reports from BigQuery" , len (bq_data ))
@@ -160,13 +169,15 @@ def run_backfill(self) -> None:
160169 if uuid in bq_data :
161170 data = bq_data [uuid ]
162171 updated = False
172+ retriage = False
163173
164174 if (
165175 report .ml_valid_probability is None
166176 and data .ml_valid_probability is not None
167177 ):
168178 report .ml_valid_probability = data .ml_valid_probability
169179 updated = True
180+ retriage = True
170181
171182 if (
172183 report .comments_translated is None
@@ -178,12 +189,17 @@ def run_backfill(self) -> None:
178189 data .translated_text
179190 )
180191 updated = True
192+ retriage = True
193+
194+ if report .country is None and data .country is not None :
195+ report .country = data .country
196+ updated = True
181197
182198 if updated :
183199 reports_to_update .append (report )
184200
185201 # Clear bucket assignment to re-triage these reports
186- if report .cluster_id is None :
202+ if retriage and report .cluster_id is None :
187203 report .bucket_id = None
188204
189205 if reports_to_update :
@@ -195,6 +211,7 @@ def run_backfill(self) -> None:
195211 "comments_original_language" ,
196212 "comments_preprocessed" ,
197213 "bucket_id" ,
214+ "country" ,
198215 ],
199216 batch_size = self .DB_BATCH_SIZE ,
200217 )
0 commit comments