-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
86 lines (66 loc) · 3.06 KB
/
scraper.py
File metadata and controls
86 lines (66 loc) · 3.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/env python3
"""
CLI entrypoint for the Olympedia athlete scraper.
This thin module parses CLI args and delegates to the core modules.
"""
from __future__ import annotations
import argparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone
from config import STOP_THRESHOLD
from io_utils import ensure_csv_header, load_progress, save_progress
from scrape import worker_task
def main() -> None:
    """Parse CLI arguments and scrape athlete IDs in concurrent batches.

    Scraping begins at ``--start``; with ``--resume`` it begins one past the
    value returned by ``load_progress()`` when that value is not behind
    ``--start``. IDs are submitted to a thread pool ``--concurrency`` at a
    time, each handled by ``worker_task(athlete_id, csv_path, delay)``.

    Results of a batch are evaluated in ascending ID order so the
    missing-athlete counter reflects truly consecutive IDs rather than the
    order in which threads happened to finish. Once ``STOP_THRESHOLD``
    consecutive IDs are missing (a worker exception counts as missing), the
    ID that reached the threshold is passed to ``save_progress`` and the
    function returns.
    """
    parser = argparse.ArgumentParser(
        description="Scrape Olympic athlete data from Olympedia.org",
        epilog="Example: python scraper.py --start 1 --concurrency 10 --delay 0.4",
    )
    parser.add_argument("--start", type=int, default=1, help="Starting athlete ID (default: 1)")
    parser.add_argument("--concurrency", type=int, default=10, help="Number of concurrent threads (default: 10)")
    parser.add_argument("--delay", type=float, default=0.4, help="Base delay between requests in seconds (default: 0.4)")
    parser.add_argument("--csv", type=str, default="athletes.csv", help="Output CSV file path (default: athletes.csv)")
    parser.add_argument("--resume", action="store_true", help="Resume from last saved progress")
    args = parser.parse_args()

    # Fail with a usage message up front; ThreadPoolExecutor would otherwise
    # raise ValueError only after the CSV header has already been touched.
    if args.concurrency < 1:
        parser.error("--concurrency must be at least 1")

    start_id = args.start
    csv_path = args.csv

    if args.resume:
        last_processed = load_progress()
        # Saved progress only wins when it is at or beyond the requested start.
        if last_processed >= start_id:
            start_id = last_processed + 1
            print(f"Resuming from athlete ID {start_id}")

    ensure_csv_header(csv_path)

    consecutive_missing = 0
    current_id = start_id

    print(f"Starting scraper with {args.concurrency} threads, {args.delay}s delay")
    print(f"Output file: {csv_path}")
    print(f"Will stop after {STOP_THRESHOLD} consecutive missing athletes")

    with ThreadPoolExecutor(max_workers=args.concurrency) as executor:
        while True:
            # Submit one batch of consecutive IDs, remembering each future
            # alongside its ID in ascending order.
            batch = [
                (athlete_id, executor.submit(worker_task, athlete_id, csv_path, args.delay))
                for athlete_id in range(current_id, current_id + args.concurrency)
            ]

            # The whole batch is awaited before the next one is submitted, so
            # reading results in ID order costs no throughput, and it keeps
            # "consecutive" meaning consecutive IDs instead of consecutive
            # completions.
            for athlete_id, future in batch:
                try:
                    athlete_found = future.result()
                except Exception as e:
                    # Worker boundary: report the failure and keep going.
                    # NOTE(review): a failed request is counted as a missing
                    # athlete, so a run of transient errors can end the scrape.
                    print(f"Error processing athlete {athlete_id}: {e}")
                    athlete_found = False

                if athlete_found:
                    consecutive_missing = 0
                    if athlete_id % 100 == 0:
                        print(f"Processed athlete ID {athlete_id}")
                else:
                    consecutive_missing += 1
                    if consecutive_missing >= STOP_THRESHOLD:
                        print(
                            f"[{datetime.now(timezone.utc)}] "
                            f"{STOP_THRESHOLD} consecutive missing athletes reached. "
                            f"Stopping at ID {athlete_id}."
                        )
                        save_progress(athlete_id)
                        # Leaving the ``with`` block waits for any futures of
                        # this batch that are still running.
                        return

            current_id += args.concurrency
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()