-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathoutput_csv.py
More file actions
121 lines (99 loc) · 3.51 KB
/
output_csv.py
File metadata and controls
121 lines (99 loc) · 3.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "python-dateutil",
# ]
# ///
"""
Output councillor data as CSV, filtering to current councils only.
Usage:
uv run output_csv.py > councillors.csv
uv run output_csv.py --all > all_councillors.csv # include non-current
"""
import csv
import glob
import json
import sys
from datetime import date
from pathlib import Path
from dateutil.parser import parse
def is_current_council(metadata: dict) -> bool:
    """Return True when a council's metadata says it is active today.

    Start and end dates are read from the nested ``everyelectiion_data``
    mapping first (new format), falling back to the top-level
    ``start_date`` / ``end_date`` keys. A missing or empty date places no
    bound on that side of the range.

    Args:
        metadata: Parsed contents of a council's metadata file.

    Returns:
        False if the end date is before today or the start date is after
        today; True otherwise.

    Raises:
        Whatever ``parse`` raises for a date string it cannot interpret;
        no parsing errors are handled here.
    """
    today = date.today()

    # ``or {}`` also covers an explicit JSON ``null`` for the nested block,
    # which ``.get(key, {})`` alone would hand back as None and then fail
    # on the attribute lookup below.
    nested = metadata.get("everyelectiion_data") or {}
    end_date = nested.get("end_date") or metadata.get("end_date")
    start_date = nested.get("start_date") or metadata.get("start_date")

    # The council has already been wound up.
    if end_date and parse(end_date).date() < today:
        return False
    # The council does not exist yet.
    if start_date and parse(start_date).date() > today:
        return False
    return True
def get_current_council_ids() -> set[str]:
    """Collect the IDs of every council whose metadata marks it as current.

    Looks at ``scrapers/*/metadata.json`` relative to the working directory.
    A council's ID is its scraper directory name up to the first hyphen,
    upper-cased. Metadata files that cannot be read or decoded as JSON are
    skipped.
    """
    active: set[str] = set()
    for meta_path in Path("scrapers").glob("*/metadata.json"):
        folder_name = meta_path.parent.name
        identifier = folder_name.partition("-")[0].upper()
        try:
            payload = json.loads(meta_path.read_text())
            if is_current_council(payload):
                active.add(identifier)
        except (json.JSONDecodeError, OSError):
            # Best effort: an unreadable or malformed file is ignored.
            continue
    return active
# Column order of the emitted CSV; councillor keys outside this list are dropped.
field_names = [
    "council_id",
    "raw_division",
    "raw_identifier",
    "email",
    "url",
    "raw_name",
    "raw_party",
]

csvout = csv.DictWriter(sys.stdout, fieldnames=field_names)
csvout.writeheader()

# Check for --all flag
include_all = "--all" in sys.argv

# None means "do not filter"; otherwise only these council IDs are written.
current_council_ids = None if include_all else get_current_council_ids()

for file_name in glob.glob("./data/**/json/*.json"):
    # The council ID is the directory two levels above the councillor file.
    # Using Path keeps this independent of the path separator glob returns.
    council_id = Path(file_name).parent.parent.name

    # Skip non-current councils unless --all
    if current_council_ids is not None and council_id not in current_council_ids:
        continue

    # Context manager closes each file promptly instead of leaking the handle.
    with open(file_name) as f:
        councillor = json.load(f)

    # Keep only the CSV columns, then stamp the ID derived from the path
    # (it overrides any "council_id" stored inside the JSON itself).
    row = {key: value for key, value in councillor.items() if key in field_names}
    row["council_id"] = council_id
    csvout.writerow(row)
# council_id = file_name.split("/")[-3]
# if not council_id in councillor_counter:
# councillor_counter[council_id] = 0
# councillor_counter[council_id] += 1
# with open(file_name) as f:
# json_data = json.loads(f.read())
# if not json_data["FaceDetails"]:
# continue
# face = json_data["FaceDetails"][0]
# out_dict = json_data['councillor_json']
# out_dict.update({
# "gender": face["Gender"]["Value"],
# "age_low": face["AgeRange"]["Low"],
# "age_high": face["AgeRange"]["High"],
# "smile": face["Smile"]["Value"],
# "glasses": face["Eyeglasses"]["Value"],
# "beard": face["Beard"]["Value"],
# "happy": any(
# [
# x
# for x in face["Emotions"]
# if x["Type"] == "HAPPY" and x["Confidence"] > 70
# ]
# )
# })
# csvout.writerow(out_dict)
# for council_id, count in councillor_counter.items():
# print(",".join((council_id, str(count))))