-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathio_utils.py
More file actions
70 lines (55 loc) · 2.12 KB
/
io_utils.py
File metadata and controls
70 lines (55 loc) · 2.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
"""I/O utilities: progress tracking and CSV writing."""
from __future__ import annotations
import csv
import json
import os
from datetime import datetime, timezone
from typing import Dict, List
from config import CSV_FIELDS, LOCK, PROGRESS_FILE
def save_progress(last_id: int) -> None:
    """
    Save the current scraping progress to a JSON file.

    This function is thread-safe (serialized via the module-level LOCK) and
    stores both the last processed athlete ID and a UTC timestamp for
    tracking purposes.

    The file is written atomically: data is dumped to a temporary sibling
    file first and then moved into place with os.replace, so a crash
    mid-write can never leave a truncated/corrupt progress file behind.

    Args:
        last_id: ID of the last athlete that was processed.
    """
    with LOCK:
        progress_data = {
            "last_id": int(last_id),
            "updated_at": datetime.now(timezone.utc).isoformat(),
        }
        # Write to a temp file, then atomically swap it into place.
        tmp_path = str(PROGRESS_FILE) + ".tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(progress_data, f, indent=2)
        os.replace(tmp_path, PROGRESS_FILE)
def load_progress() -> int:
    """
    Load the last processed athlete ID from the progress file.

    Returns:
        The last processed athlete ID, or 0 if no progress file exists or
        if there's an error reading/parsing it.
    """
    if not os.path.exists(PROGRESS_FILE):
        return 0
    try:
        with open(PROGRESS_FILE, "r", encoding="utf-8") as f:
            progress_data = json.load(f)
        return int(progress_data.get("last_id", 0))
    # OSError: file vanished or became unreadable between the exists()
    # check and open() (race / permissions). TypeError: "last_id" holds a
    # non-numeric JSON value such as null. Both fall back to 0 as the
    # docstring promises; AttributeError covers a non-dict top-level JSON
    # value (e.g. a bare list), where .get does not exist.
    except (OSError, json.JSONDecodeError, ValueError, KeyError,
            TypeError, AttributeError):
        return 0
def ensure_csv_header(file_path: str) -> None:
    """Create the CSV file and write its header row if it doesn't exist yet."""
    if os.path.exists(file_path):
        return
    with open(file_path, "w", newline="", encoding="utf-8") as out:
        csv.DictWriter(out, fieldnames=CSV_FIELDS).writeheader()
def append_rows(file_path: str, rows: List[Dict[str, str]]) -> None:
    """
    Thread-safely append athlete data rows to the CSV file.

    Ensures the header exists first, restricts each row to the configured
    CSV_FIELDS, and converts None/missing values to empty strings.
    """
    if not rows:
        return
    # Normalize every row up front: only known fields, never None.
    sanitized = [
        {field: (row.get(field) or "") for field in CSV_FIELDS}
        for row in rows
    ]
    with LOCK:
        ensure_csv_header(file_path)
        with open(file_path, "a", newline="", encoding="utf-8") as out:
            csv.DictWriter(out, fieldnames=CSV_FIELDS).writerows(sanitized)