-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathio_utils.py
More file actions
70 lines (55 loc) · 2.12 KB
/
io_utils.py
File metadata and controls
70 lines (55 loc) · 2.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
"""I/O utilities: progress tracking and CSV writing."""
from __future__ import annotations
import csv
import json
import os
from datetime import datetime, timezone
from typing import Dict, List
from config import CSV_FIELDS, LOCK, PROGRESS_FILE
def save_progress(last_id: int) -> None:
    """
    Save the current scraping progress to a JSON file.

    This function is thread-safe (serialized via the module-level LOCK) and
    stores both the last processed athlete ID and a UTC timestamp for
    tracking purposes.

    The file is written atomically: data is dumped to a temporary sibling
    file first and then moved into place with os.replace, so a crash
    mid-write can never leave a truncated/corrupt progress file behind.

    Args:
        last_id: ID of the last athlete that was processed.
    """
    with LOCK:
        progress_data = {
            "last_id": int(last_id),
            "updated_at": datetime.now(timezone.utc).isoformat(),
        }
        # Write to a temp file, then atomically swap it into place.
        tmp_path = str(PROGRESS_FILE) + ".tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(progress_data, f, indent=2)
        os.replace(tmp_path, PROGRESS_FILE)
def load_progress() -> int:
    """
    Load the last processed athlete ID from the progress file.

    Returns:
        The last processed athlete ID, or 0 if no progress file exists or
        if there's an error reading/parsing it.
    """
    if not os.path.exists(PROGRESS_FILE):
        return 0
    try:
        with open(PROGRESS_FILE, "r", encoding="utf-8") as f:
            progress_data = json.load(f)
        return int(progress_data.get("last_id", 0))
    # OSError: file vanished or became unreadable between the exists()
    # check and open() (race / permissions). TypeError: "last_id" holds a
    # non-numeric JSON value such as null. Both fall back to 0 as the
    # docstring promises; AttributeError covers a non-dict top-level JSON
    # value (e.g. a bare list), where .get does not exist.
    except (OSError, json.JSONDecodeError, ValueError, KeyError,
            TypeError, AttributeError):
        return 0
def ensure_csv_header(file_path: str) -> None:
    """Create the CSV file and write its header row if it doesn't exist yet."""
    if os.path.exists(file_path):
        return
    with open(file_path, "w", newline="", encoding="utf-8") as out:
        csv.DictWriter(out, fieldnames=CSV_FIELDS).writeheader()
def append_rows(file_path: str, rows: List[Dict[str, str]]) -> None:
    """
    Thread-safely append athlete data rows to the CSV file.

    Ensures the header exists first, restricts each row to the configured
    CSV_FIELDS, and converts None/missing values to empty strings.
    """
    if not rows:
        return
    # Normalize every row up front: only known fields, never None.
    sanitized = [
        {field: (row.get(field) or "") for field in CSV_FIELDS}
        for row in rows
    ]
    with LOCK:
        ensure_csv_header(file_path)
        with open(file_path, "a", newline="", encoding="utf-8") as out:
            csv.DictWriter(out, fieldnames=CSV_FIELDS).writerows(sanitized)