Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion infra/ansible/group_vars/all/all.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ keycloak_realm:
display_name: "ISIS Analytics Data Platform"
keycloak_realm_url: "{{ keycloak_url }}/realms/{{ keycloak_realm.name }}"
keycloak_bootstrap:
admin_user: "{{ secrets_keycloak['bootstrap_admin_user'] }}"
admin_user: "temp-admin"
admin_password: "{{ secrets_keycloak['bootstrap_admin_password'] }}"

lakekeeper_base_path: /iceberg
Expand Down
4 changes: 4 additions & 0 deletions infra/ansible/group_vars/keycloak.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ keycloak_db_name: "{{ secrets_keycloak['db_name'] }}"
keycloak_db_user: "{{ secrets_keycloak['db_user'] }}"
keycloak_db_password: "{{ secrets_keycloak['db_password'] }}"

keycloak_local_admin:
user: "{{ secrets_keycloak['master_local_admin_user'] }}"
password: "{{ secrets_keycloak['master_local_admin_password'] }}"

# Eveything is assigned to the realm defined by keycloak_realm.name in all/all.yml
keycloak_client_scopes:
- name: lakekeeper
Expand Down
70 changes: 68 additions & 2 deletions infra/ansible/roles/keycloak/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,24 @@
path: "{{ keycloak_working_dir }}"
rebuild: always

- name: Check if we need to bootstrap an admin
no_log: "{{ not (keycloak_bootstrap_logging | default(false)) }}"
ansible.builtin.uri:
url: "{{ keycloak_url }}/realms/master/protocol/openid-connect/token"
method: POST
body_format: form-urlencoded
body:
client_id: "admin-cli"
username: "{{ keycloak_local_admin.user }}"
password: "{{ keycloak_local_admin.password }}"
grant_type: "password"
ignore_errors: true
register: keycloak_token_response

- name: Set admin required fact
ansible.builtin.set_fact:
local_admin_user_exists: "{{ keycloak_token_response['status'] == 200 }}"

- name: Bootstrap Keycloak admin
no_log: "{{ not (keycloak_bootstrap_logging | default(false)) }}"
community.docker.docker_container:
Expand All @@ -53,7 +71,6 @@
[
"bootstrap-admin",
"user",
"--username={{ keycloak_bootstrap.admin_user }}",
"--password:env=KC_BOOTSTRAP_ADMIN_PASSWORD",
"--optimized",
"--no-prompt",
Expand All @@ -69,6 +86,7 @@
KC_DB_URL: "jdbc:postgresql://{{ keycloak_db_host }}:{{ keycloak_db_port }}/{{ keycloak_db_name }}"
KC_DB_USERNAME: "{{ keycloak_db_user }}"
KC_DB_PASSWORD: "{{ keycloak_db_password }}"
when: not local_admin_user_exists

- name: Run Keycloak
community.docker.docker_container:
Expand Down Expand Up @@ -105,6 +123,54 @@
timeout: 2s
retries: 15

- ansible.builtin.import_tasks: setup-realm.yml
# Configure master realm admin
- name: Create permanent admin user
no_log: true
community.general.keycloak_user:
auth_keycloak_url: "{{ keycloak_url }}"
auth_realm: master
auth_username: "{{ keycloak_bootstrap.admin_user }}"
auth_password: "{{ keycloak_bootstrap.admin_password }}"
realm: master
username: "{{ keycloak_local_admin.user }}"
enabled: true
emailVerified: true
credentials:
- type: password
value: "{{ keycloak_local_admin.password }}"
temporary: false
state: present
register: kc_new_admin
when: not local_admin_user_exists

- name: Assign admin realm role to permanent user
no_log: "{{ not (keycloak_bootstrap_logging | default(false)) }}"
community.general.keycloak_user_rolemapping:
auth_keycloak_url: "{{ keycloak_url }}"
auth_realm: master
auth_username: "{{ keycloak_bootstrap.admin_user }}"
auth_password: "{{ keycloak_bootstrap.admin_password }}"
realm: master
uid: "{{ kc_new_admin.end_state.id }}"
roles:
- name: "admin"
state: present
when: not local_admin_user_exists

- name: Disable temp-admin bootstrap account
no_log: "{{ not (keycloak_bootstrap_logging | default(false)) }}"
community.general.keycloak_user:
auth_keycloak_url: "{{ keycloak_url }}"
auth_realm: "master"
auth_username: "{{ keycloak_local_admin.user }}"
auth_password: "{{ keycloak_local_admin.password }}"
realm: master
username: "{{ keycloak_bootstrap.admin_user }}"
enabled: false
state: present
when: not local_admin_user_exists

# Configure custom realm
- ansible.builtin.import_tasks: setup-target-realm.yml
vars:
target_realm: "{{ keycloak_realm.name }}"
4 changes: 2 additions & 2 deletions infra/ansible/roles/keycloak/tasks/setup-ldap.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
auth_client_id: admin-cli
auth_keycloak_url: "{{ keycloak_url }}"
auth_realm: master
auth_username: "{{ keycloak_bootstrap.admin_user }}"
auth_password: "{{ keycloak_bootstrap.admin_password }}"
auth_username: "{{ keycloak_local_admin.user }}"
auth_password: "{{ keycloak_local_admin.password }}"
realm: "{{ target_realm }}"
name: "STFC LDAP"
state: present
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
auth_client_id: admin-cli
auth_keycloak_url: "{{ keycloak_url }}"
auth_realm: master
auth_username: "{{ keycloak_bootstrap.admin_user }}"
auth_password: "{{ keycloak_bootstrap.admin_password }}"
auth_username: "{{ keycloak_local_admin.user }}"
auth_password: "{{ keycloak_local_admin.password }}"
when: false

- name: Create Keycloak realm
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,21 +17,20 @@

import dlt
import dlt.common.logger as logger
from dlt.sources import TDataItems
import pendulum
import pandas as pd
import pendulum
import pyarrow.compute as pc
import pytz

from dlt.sources import TDataItems
from elt_common.cli_utils import cli_main
from elt_common.dlt_sources.m365 import (
sharepoint,
M365DriveItem,
)
from elt_common.dlt_destinations.pyiceberg.helpers import load_iceberg_table
from elt_common.dlt_destinations.pyiceberg.pyiceberg_adapter import (
pyiceberg_adapter,
PartitionTrBuilder,
pyiceberg_adapter,
)
from elt_common.dlt_sources.m365 import (
M365DriveItem,
sharepoint,
)

EXCEL_ENGINE = "calamine"
Expand All @@ -41,19 +40,45 @@
PIPELINE_NAME = "electricity_sharepoint"
RDM_TIMEZONE = "Europe/London"

# File information
# The file can contain a record from a single export or multiple records combined. Each record starts with
# a header:

# Source details
#
# There are 3 known formats for the data. Each contains a preamble of the form:
#
# Site Information:
# RAL ISIS RDM
# RAL ISIS RDM
# Controller: ISIS
# Controller description: ISIS Energy Totals
# Status: Online
#
# This preamble is discarded from all formats.
#
# 1. Excel (old, manual export format)
# ------------------------------------
#
# These were produced by manual export from the original system.
#
# There are two columns: "Time,Total Power (MW)". Time is in the format "YYYY-mm-DDTHH:MM:SS".
#
# 2. Automated CSV export
# -----------------------
#
# An automation deposits .csv files at regular intervals.
#
# There are three columns: "Time,Date,ISIS Elec Total Power", where Time is in format HH:MM:SS and Date is in format DD/mm/YY.
# Several of these files can be concatenated together to form larger timespan records - they are concatenated as is and repeated
# preamble sections are not discarded.
#
# 3. Manual CSV export
# --------------------
#
# These were produced more recently by manual export from the original system.
# There are three columns: "Time,ISIS Elec Total Energy,ISIS Elec Total Power", where Time is in format "DD/mm/YY HH:MM:SS".
# The second column is discarded.


# Time Date ISIS Elec Total Power {MW}
CSV_HEADER_ANCHOR = "time"
CSV_PREAMBLE_ANCHOR = "time"
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated


def to_utc(ts: pd.Series) -> pd.Series:
Expand All @@ -66,26 +91,38 @@ def csv_section_to_df(file_name: str, lines: Sequence[str]) -> pd.DataFrame | No
df = pd.read_csv(io.StringIO("\n".join(lines)))
# clean up column name (strip any whitespace)
df.columns = df.columns.str.strip()
cols = [c for c in df.columns]
assert len(cols) == 3

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🩺 Stability & Availability | 🟠 Major | ⚡ Quick win

Replace the assertion with recoverable CSV-shape validation.

assert can be stripped in optimised Python, and an uncaught AssertionError will abort ingestion for one malformed export instead of skipping that section like timestamp parse failures.

Proposed fix
     cols = [c for c in df.columns]
-    assert len(cols) == 3
+    if len(cols) != 3:
+        logger.warning(
+            f"Error loading section of {file_name!r}: expected 3 columns, got {len(cols)}. "
+            "There will be gaps in the data."
+        )
+        return None
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
cols = [c for c in df.columns]
assert len(cols) == 3
cols = [c for c in df.columns]
if len(cols) != 3:
logger.warning(
f"Error loading section of {file_name!r}: expected 3 columns, got {len(cols)}. "
"There will be gaps in the data."
)
return None
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In
`@warehouses/facility_ops_landing/ingest/estates/electricity_sharepoint/electricity_sharepoint.py`
around lines 94 - 95, Replace the assert-based column count check in the
electricity_sharepoint ingestion flow with explicit CSV-shape validation and
recovery. In the section that builds cols from df.columns, detect the unexpected
number of columns with a normal conditional, log or record the malformed export,
and skip that section instead of raising AssertionError. Keep the behavior
aligned with the existing recoverable timestamp parse handling in the same
ingest path so one bad file does not abort the whole ingestion.

try:
df["DateTime"] = to_utc(
pd.to_datetime(df["Date"] + " " + df["Time"], format="%d/%m/%y %H:%M:%S") # type: ignore
)
return df.drop(["Date", "Time"], axis=1)
if "Date" in cols:
# Automated CSV format
df["DateTime"] = to_utc(
pd.to_datetime(
df["Date"] + " " + df["Time"], format="%d/%m/%y %H:%M:%S"
) # type: ignore
)
df = df.drop(["Date", "Time"], axis=1)
else:
# Manual CSV format
df["DateTime"] = to_utc(
pd.to_datetime(df["Time"], format="%d/%m/%y %H:%M:%S")
)
# Drop time and energy columns
df = df.drop(["Time", cols[1]], axis=1)
except (pytz.exceptions.AmbiguousTimeError, ValueError) as exc:
logger.warning(
f"'Error loading section of {file_name}'. There will be gaps in the data.\nDetails: {str(exc)}"
)
return None

return df


def read_power_consumption_csv(
file_content: io.BytesIO,
file_name: str,
) -> pd.DataFrame | None:
"""Read csv-formatted power consumption records. This can return None if there is not data in the file.

The Date & Time columns together to create a single DateTime column.
"""
# See comment at the top of this describing the format

def _append_if_not_none(seq, df):
if df is not None:
Expand All @@ -97,7 +134,7 @@ def _append_if_not_none(seq, df):
line = line.strip()
line_lower = line.lower()

if line_lower.startswith(CSV_HEADER_ANCHOR):
if line_lower.startswith(CSV_PREAMBLE_ANCHOR):
# Save any previous section
if current_lines:
_append_if_not_none(
Expand Down Expand Up @@ -127,11 +164,8 @@ def _append_if_not_none(seq, df):


def read_power_consumption_excel(file_content: io.BytesIO) -> pd.DataFrame:
"""Read an excel-formatted power consumption record.
# See comment at the top of this describing the format

Expected columns: Time, Total Power (MW)
The Time column is renamed DateTime for consistency.
"""
df = pd.read_excel(file_content, engine=EXCEL_ENGINE, skiprows=EXCEL_SKIP_ROWS)
df = df.rename(columns={"Time": "DateTime"})
df["DateTime"] = to_utc(df["DateTime"])
Expand Down Expand Up @@ -179,42 +213,39 @@ def read_as_dataframe(file_obj: M365DriveItem) -> pd.DataFrame | None:
yield df_batch


@dlt.resource(merge_key="DateTime", columns={"DateTime": {"nullable": False}})
@dlt.resource(
primary_key="DateTime",
write_disposition={"disposition": "merge", "strategy": "upsert"},
)
def rdm_data(
site_url: str = dlt.config.value,
root_dir: str = dlt.config.value,
backfill: bool = False,
backfill_glob: str | None = None,
) -> Iterator[TDataItems]:
if backfill:
historic_xl = sharepoint(
if backfill_glob is not None:
file_globs = [backfill_glob]
else:
file_globs = [
f"{root_dir}/**/*.xlsx",
f"{root_dir}/**/*-daily.csv",
f"{root_dir}/**/*-manual-export.csv",
]
modified_after = None
else:
file_globs = [f"{root_dir}/*-ISIS.csv"]
modified_after = get_latest_timestamp(dlt.current.pipeline())

for file_glob in file_globs:
file_listing = sharepoint(
site_url=site_url,
file_glob=f"{root_dir}/**/*.xlsx",
file_glob=file_glob,
extract_content=False,
modified_after=modified_after,
)
reader = historic_xl | extract_content_and_read()
yield from reader.apply_hints(
write_disposition="merge",
)
historic_csv = sharepoint(
site_url=site_url,
file_glob=f"{root_dir}/**/*-daily.csv",
extract_content=False,
)
reader = historic_csv | extract_content_and_read()
yield from reader.apply_hints(
write_disposition="merge",
)

latest_files = sharepoint(
site_url=site_url,
file_glob=f"{root_dir}/*-ISIS.csv",
extract_content=False,
modified_after=get_latest_timestamp(dlt.current.pipeline()),
)
reader = latest_files | extract_content_and_read()
yield from reader.apply_hints(
write_disposition="merge",
)
reader = file_listing | extract_content_and_read()
yield from reader


def get_latest_timestamp(pipeline: dlt.Pipeline) -> pendulum.DateTime | None:
Expand Down