-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathw4h_db_utils.py
More file actions
194 lines (149 loc) · 8.39 KB
/
w4h_db_utils.py
File metadata and controls
194 lines (149 loc) · 8.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
import datetime
from loguru import logger
import pandas as pd
from sqlalchemy import create_engine, text, MetaData, Table, Column, String, ForeignKey, DateTime, REAL
from sqlalchemy.orm import sessionmaker
from sqlalchemy_utils import database_exists, create_database
from geoalchemy2 import Geometry
from utils import load_config, get_db_engine
def create_tables(db_name: str, config_file='config.yaml'):
    """Create the W4H tables in the database with the given name based on the config file.

    Builds the user table, the time-series tables, and the geo tables declared
    under ``mapping.tables`` in the config, then issues ``CREATE TABLE`` for all
    of them against the target database.

    Args:
        db_name (str): Name of the database to create the tables in
        config_file (str, optional): Path to the config file. Defaults to 'config.yaml'.
    """
    metadata = MetaData()
    config = load_config(config_file=config_file)
    db_engine = get_db_engine(config_file, db_name=db_name)
    columns_config = config["mapping"]["columns"]

    # Column type names come from the config as strings (e.g. "String").
    # Evaluate them in a restricted namespace limited to the known SQLAlchemy
    # types instead of a bare eval() of arbitrary config text.
    safe_types = {'String': String, 'DateTime': DateTime, 'REAL': REAL, 'Geometry': Geometry}

    # Create the user table; the configured user_id column is the primary key.
    user_table_config = config["mapping"]["tables"]["user_table"]
    user_columns = [
        Column(col_name,
               eval(col_dtype, {'__builtins__': {}}, safe_types),
               primary_key=(col_name == columns_config["user_id"]))
        for col_name, col_dtype in user_table_config["columns"].items()
    ]
    user_table = Table(user_table_config["name"], metadata, *user_columns)

    # Create time series tables: (user_id, timestamp) composite PK, REAL value.
    for table_name in config["mapping"]["tables"]["time_series"]:
        table = Table(table_name, metadata,
                      Column(columns_config["user_id"], ForeignKey(user_table_config["name"] + '.' + columns_config["user_id"]), primary_key=True),
                      Column(columns_config["timestamp"], DateTime, primary_key=True),
                      Column(columns_config["value"], REAL),
                      )

    # Create geo tables: same composite PK, but the value is a PostGIS POINT.
    for table_name in config["mapping"]["tables"]["geo"]:
        table = Table(table_name, metadata,
                      Column(columns_config["user_id"], ForeignKey(user_table_config["name"] + '.' + columns_config["user_id"]), primary_key=True),
                      Column(columns_config["timestamp"], DateTime, primary_key=True),
                      Column(columns_config["value"], Geometry('POINT'))
                      )
    metadata.create_all(db_engine)
    db_engine.dispose()
def create_w4h_instance(db_name: str, config_file='config.yaml'):
    """Create a new W4H database instance with the given name and initialize the tables based on the config file.

    No-ops (with an error log) if a database with that name already exists.

    Args:
        db_name (str): Name of the database to create
        config_file (str, optional): Path to the config file. Defaults to 'config.yaml'.
    """
    db_engine_tmp = get_db_engine(config_file)
    logger.info('Database engine created!')
    # Create the database only if it doesn't already exist.
    if not database_exists(f'{db_engine_tmp.url}{db_name}'):
        create_database(f'{db_engine_tmp.url}{db_name}')
        logger.success(f"Database {db_name} created!")
        db_engine_tmp.dispose()
    else:
        logger.error(f"Database {db_name} already exists!")
        db_engine_tmp.dispose()
        return

    db_engine = get_db_engine(config_file, db_name=db_name)
    # Enable the PostGIS extension. IF NOT EXISTS makes this idempotent in case
    # the server's template database already ships with PostGIS installed.
    try:
        with db_engine.connect() as connection:
            connection.execute(text("CREATE EXTENSION IF NOT EXISTS postgis;"))
            logger.success(f"PostGIS extension enabled for {db_name}!")
            connection.commit()
    finally:
        # Dispose the engine even if enabling the extension fails.
        db_engine.dispose()

    # Create the W4H tables
    create_tables(config_file=config_file, db_name=db_name)
    logger.success("W4H tables initialized!")
def get_existing_databases(config_file='config.yaml') -> list:
    """Get a list of all existing (non-template) databases on the server.

    Args:
        config_file (str, optional): Path to the config file. Defaults to 'config.yaml'.

    Returns:
        list: List of all existing databases (strings)
    """
    db_engine = get_db_engine(config_file)
    try:
        with db_engine.connect() as connection:
            result = connection.execute(text("SELECT datname FROM pg_database WHERE datistemplate = false;"))
            databases = [row[0] for row in result]
    finally:
        # Always release pooled connections, even if the query fails.
        db_engine.dispose()
    return databases
def populate_tables(df: pd.DataFrame, db_name: str, mappings: dict, config_path='config.yaml'):
    """Populate the W4H tables in the given database with the data from the given dataframe based on
    the mappings between the CSV columns and the database tables.

    Inserts any users present in ``df`` but missing from the user table, then
    appends one (user_id, timestamp, value) row set per mapped table.

    Args:
        df (pd.DataFrame): Dataframe containing the data to be inserted into the database
        db_name (str): Name of the database to insert the data into
        mappings (dict): Dictionary containing the mappings between the CSV columns and the database tables
        config_path (str, optional): Path to the config file. Defaults to 'config.yaml'.
    """
    # Load the config
    config = load_config(config_path)

    # Extract default column names from the config
    default_user_id = config['mapping']['columns']['user_id']
    default_timestamp = config['mapping']['columns']['timestamp']
    default_value = config['mapping']['columns']['value']
    user_table_name = config['mapping']['tables']['user_table']['name']

    # Create a session
    engine = get_db_engine(config_path, db_name=db_name)
    Session = sessionmaker(bind=engine)
    session = Session()
    try:
        # Ensure all unique users from the dataframe exist in the user table.
        # user ids are compared as strings on both sides.
        unique_users = df[mappings[default_user_id]].unique().astype(str)
        existing_users = session.query(Table(user_table_name, MetaData(bind=engine), autoload=True).c[default_user_id]).all()
        existing_users = [x[0] for x in existing_users]

        # Identify users that are not yet in the database
        new_users = set(unique_users) - set(existing_users)
        if new_users:
            # Convert the set of new users into a DataFrame
            all_new_users = pd.DataFrame({default_user_id: list(new_users)})
            # Use to_sql to insert all new users into the user table
            all_new_users.to_sql(user_table_name, engine, if_exists='append', index=False)

        # Get the subset of mappings that doesn't include default_user_id and default_timestamp
        table_mappings = {k: v for k, v in mappings.items() if k not in [default_user_id, default_timestamp]}

        # Loop through each table in table_mappings
        for table_name, csv_column in table_mappings.items():
            # Check if the mapping is not NULL and exists in the df
            if csv_column and csv_column in df.columns:
                # Ensure that the dataframe columns match the user_id, timestamp, and value from your CSV
                columns_to_insert = [mappings[default_user_id], mappings[default_timestamp], csv_column]
                subset_df = df[columns_to_insert].copy()

                # Rename columns to match the table's column names using the defaults from config
                subset_df.columns = [default_user_id, default_timestamp, default_value]

                # dropping duplicate user_id and timestamp
                subset_df.drop_duplicates(subset=[default_user_id, default_timestamp], inplace=True)
                # subset_df = subset_df.groupby([default_user_id, default_timestamp]).mean().reset_index()

                # handling geometry data: convert "(lat lon)"-style tuples to WKT POINT text
                if table_name in config["mapping"]["tables"]["geo"]:
                    subset_df[default_value] = subset_df[default_value].apply(lambda x: f'POINT{x}'.replace(',', ''))

                # Insert data into the table
                subset_df.to_sql(table_name, engine, if_exists='append', index=False)

        # Commit the remaining changes
        session.commit()
    finally:
        # Close the session and dispose the engine even if an insert fails,
        # so connections are not leaked on error.
        session.close()
        engine.dispose()
def populate_subject_table(df: pd.DataFrame, db_name: str, config_path='config.yaml', user_tbl_name=None):
    """Populate the W4H subject table in the given database with the data from the given dataframe.

    The dataframe is pushed directly to the subject table. NOTE: the table is
    REPLACED (dropped and recreated), not appended to — any existing rows are lost.

    Args:
        df (pd.DataFrame): Dataframe containing the subject data to be inserted into the database
        db_name (str): Name of the subject database to insert the data into
        config_path (str, optional): Path to the config file. Defaults to 'config.yaml'.
        user_tbl_name (str, optional): Target table name. Defaults to the user
            table name from the config when not given.
    """
    # Load the config
    config = load_config(config_path)
    # Fall back to the configured user-table name; the previous behavior of
    # passing None straight to to_sql would raise.
    if user_tbl_name is None:
        user_tbl_name = config['mapping']['tables']['user_table']['name']

    engine = get_db_engine(config_path, db_name=db_name)
    try:
        # Push df to the table, replacing any existing table of that name.
        df.to_sql(user_tbl_name, engine, if_exists='replace', index=False)
    finally:
        # Release pooled connections even if the write fails.
        engine.dispose()