# LGSF/output_csv.py at master · DemocracyClub/LGSF · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "python-dateutil",
# ]
# ///
"""
Output councillor data as CSV, filtering to current councils only.

Usage:
    uv run output_csv.py > councillors.csv
    uv run output_csv.py --all > all_councillors.csv  # include non-current
"""

import csv
import glob
import json
import sys
from datetime import date
from pathlib import Path

from dateutil.parser import parse


def is_current_council(metadata: dict) -> bool:
    """Return whether a council is active today according to its metadata.

    Start and end dates are read from the nested ``everyelectiion_data``
    mapping first, falling back to the top-level ``start_date`` and
    ``end_date`` keys. A missing or empty date is treated as "no bound".

    Args:
        metadata: Parsed contents of a scraper's ``metadata.json``.

    Returns:
        ``False`` if the end date is before today or the start date is after
        today; ``True`` otherwise.

    Any exception raised by ``parse`` for an unparseable date string is not
    caught here and propagates to the caller.
    """
    today = date.today()

    # NOTE(review): the key spelling "everyelectiion_data" is kept exactly as
    # this script has always read it - confirm it matches the metadata files.
    # ``or {}`` (rather than a ``.get`` default) also covers the key being
    # present with a JSON ``null`` value, which would otherwise make the
    # lookups below fail with AttributeError.
    everyelectiion_data = metadata.get("everyelectiion_data") or {}
    end_date = everyelectiion_data.get("end_date") or metadata.get("end_date")
    start_date = everyelectiion_data.get("start_date") or metadata.get("start_date")

    # The council has already been abolished / superseded.
    if end_date and parse(end_date).date() < today:
        return False
    # The council does not exist yet.
    if start_date and parse(start_date).date() > today:
        return False
    return True


def get_current_council_ids() -> set[str]:
    """Collect the IDs of councils whose scraper metadata marks them current.

    Every ``scrapers/*/metadata.json`` (relative to the working directory) is
    read and passed to ``is_current_council``. The council ID is the part of
    the scraper directory name before the first ``-``, upper-cased.

    Metadata files that cannot be read or decoded are skipped with a warning
    on stderr (stdout is reserved for the CSV output), so one broken file
    does not abort the whole export.

    Returns:
        The set of council IDs considered current. Empty if there is no
        ``scrapers`` directory or no readable metadata.
    """
    current_ids: set[str] = set()
    scrapers_dir = Path("scrapers")

    for metadata_file in scrapers_dir.glob("*/metadata.json"):
        council_id = metadata_file.parent.name.split("-")[0].upper()
        try:
            # JSON is UTF-8 by specification; do not depend on the locale's
            # default encoding.
            with open(metadata_file, encoding="utf-8") as f:
                metadata = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError, OSError) as exc:
            print(f"Skipping {metadata_file}: {exc}", file=sys.stderr)
            continue

        if is_current_council(metadata):
            current_ids.add(council_id)

    return current_ids


# Columns written to the CSV, in output order. Any other keys found in a
# councillor JSON file are dropped before the row is written.
field_names = [
    "council_id",
    "raw_division",
    "raw_identifier",
    "email",
    "url",
    "raw_name",
    "raw_party",
]

# The CSV goes to stdout so the caller can redirect it to a file.
csvout = csv.DictWriter(sys.stdout, fieldnames=field_names)
csvout.writeheader()

# Check for --all flag
include_all = "--all" in sys.argv

# Get current councils (unless --all is specified); None means "no filter".
current_council_ids = None if include_all else get_current_council_ids()

# Without recursive=True, "**" matches a single directory level, so this
# walks ./data/<council_id>/json/<file>.json.
for file_name in glob.glob("./data/**/json/*.json"):
    # The council directory is the third path component from the end.
    # Path.parts is used instead of splitting on "/" so this also works when
    # glob returns OS-native separators.
    council_id = Path(file_name).parts[-3]

    # Skip non-current councils unless --all
    if current_council_ids is not None and council_id not in current_council_ids:
        continue

    # Close each file promptly; one leaked handle per councillor adds up.
    with open(file_name, encoding="utf-8") as f:
        record = json.load(f)

    # Keep only the known columns; the directory name is the authoritative
    # council_id and overrides any value stored in the JSON itself.
    councillor = {k: v for k, v in record.items() if k in field_names}
    councillor["council_id"] = council_id
    csvout.writerow(councillor)

    # council_id = file_name.split("/")[-3]
    # if not council_id in councillor_counter:
    #     councillor_counter[council_id] = 0
    # councillor_counter[council_id] += 1

    # with open(file_name) as f:
    #     json_data = json.loads(f.read())
    #     if not json_data["FaceDetails"]:
    #         continue
    #     face = json_data["FaceDetails"][0]
    #     out_dict = json_data['councillor_json']
    #     out_dict.update({
    #         "gender": face["Gender"]["Value"],
    #         "age_low": face["AgeRange"]["Low"],
    #         "age_high": face["AgeRange"]["High"],
    #         "smile": face["Smile"]["Value"],
    #         "glasses": face["Eyeglasses"]["Value"],
    #         "beard": face["Beard"]["Value"],
    #         "happy": any(
    #             [
    #                 x
    #                 for x in face["Emotions"]
    #                 if x["Type"] == "HAPPY" and x["Confidence"] > 70
    #             ]
    #         )
    #     })
    #     csvout.writerow(out_dict)

# for council_id, count in councillor_counter.items():
#     print(",".join((council_id, str(count))))