ISISNeutronMuon · martyngigg · Jun 30, 2026 · Jul 1, 2026 · Jul 2, 2026 · Jul 3, 2026
@@ -33,7 +33,7 @@ keycloak_realm:
   display_name: "ISIS Analytics Data Platform"
 keycloak_realm_url: "{{ keycloak_url }}/realms/{{ keycloak_realm.name }}"
 keycloak_bootstrap:
-  admin_user: "{{ secrets_keycloak['bootstrap_admin_user'] }}"
+  admin_user: "temp-admin"
   admin_password: "{{ secrets_keycloak['bootstrap_admin_password'] }}"
 
 lakekeeper_base_path: /iceberg

@@ -5,6 +5,10 @@ keycloak_db_name: "{{ secrets_keycloak['db_name'] }}"
 keycloak_db_user: "{{ secrets_keycloak['db_user'] }}"
 keycloak_db_password: "{{ secrets_keycloak['db_password'] }}"
 
+keycloak_local_admin:
+  user: "{{ secrets_keycloak['master_local_admin_user'] }}"
+  password: "{{ secrets_keycloak['master_local_admin_password'] }}"
+
 # Eveything is assigned to the realm defined by keycloak_realm.name in all/all.yml
 keycloak_client_scopes:
   - name: lakekeeper

@@ -44,6 +44,24 @@
     path: "{{ keycloak_working_dir }}"
     rebuild: always
 
+- name: Check if we need to bootstrap an admin
+  no_log: "{{ not (keycloak_bootstrap_logging | default(false)) }}"
+  ansible.builtin.uri:
+    url: "{{ keycloak_url }}/realms/master/protocol/openid-connect/token"
+    method: POST
+    body_format: form-urlencoded
+    body:
+      client_id: "admin-cli"
+      username: "{{ keycloak_local_admin.user }}"
+      password: "{{ keycloak_local_admin.password }}"
+      grant_type: "password"
+  ignore_errors: true
+  register: keycloak_token_response
+
+- name: Set admin required fact
+  ansible.builtin.set_fact:
+    local_admin_user_exists: "{{ keycloak_token_response['status'] == 200 }}"
+
 - name: Bootstrap Keycloak admin
   no_log: "{{ not (keycloak_bootstrap_logging | default(false)) }}"
   community.docker.docker_container:
@@ -53,7 +71,6 @@
       [
         "bootstrap-admin",
         "user",
-        "--username={{ keycloak_bootstrap.admin_user }}",
         "--password:env=KC_BOOTSTRAP_ADMIN_PASSWORD",
         "--optimized",
         "--no-prompt",
@@ -69,6 +86,7 @@
       KC_DB_URL: "jdbc:postgresql://{{ keycloak_db_host }}:{{ keycloak_db_port }}/{{ keycloak_db_name }}"
       KC_DB_USERNAME: "{{ keycloak_db_user }}"
       KC_DB_PASSWORD: "{{ keycloak_db_password }}"
+  when: not local_admin_user_exists
 
 - name: Run Keycloak
   community.docker.docker_container:
@@ -105,6 +123,54 @@
       timeout: 2s
       retries: 15
 
-- ansible.builtin.import_tasks: setup-realm.yml
+# Configure master realm admin
+- name: Create permanent admin user
+  no_log: true
+  community.general.keycloak_user:
+    auth_keycloak_url: "{{ keycloak_url }}"
+    auth_realm: master
+    auth_username: "{{ keycloak_bootstrap.admin_user }}"
+    auth_password: "{{ keycloak_bootstrap.admin_password }}"
+    realm: master
+    username: "{{ keycloak_local_admin.user }}"
+    enabled: true
+    emailVerified: true
+    credentials:
+      - type: password
+        value: "{{ keycloak_local_admin.password }}"
+        temporary: false
+    state: present
+  register: kc_new_admin
+  when: not local_admin_user_exists
+
+- name: Assign admin realm role to permanent user
+  no_log: "{{ not (keycloak_bootstrap_logging | default(false)) }}"
+  community.general.keycloak_user_rolemapping:
+    auth_keycloak_url: "{{ keycloak_url }}"
+    auth_realm: master
+    auth_username: "{{ keycloak_bootstrap.admin_user }}"
+    auth_password: "{{ keycloak_bootstrap.admin_password }}"
+    realm: master
+    uid: "{{ kc_new_admin.end_state.id }}"
+    roles:
+      - name: "admin"
+    state: present
+  when: not local_admin_user_exists
+
+- name: Disable temp-admin bootstrap account
+  no_log: "{{ not (keycloak_bootstrap_logging | default(false)) }}"
+  community.general.keycloak_user:
+    auth_keycloak_url: "{{ keycloak_url }}"
+    auth_realm: "master"
+    auth_username: "{{ keycloak_local_admin.user }}"
+    auth_password: "{{ keycloak_local_admin.password }}"
+    realm: master
+    username: "{{ keycloak_bootstrap.admin_user }}"
+    enabled: false
+    state: present
+  when: not local_admin_user_exists
+
+# Configure custom realm
+- ansible.builtin.import_tasks: setup-target-realm.yml
   vars:
     target_realm: "{{ keycloak_realm.name }}"
@@ -4,8 +4,8 @@
     auth_client_id: admin-cli
     auth_keycloak_url: "{{ keycloak_url }}"
     auth_realm: master
-    auth_username: "{{ keycloak_bootstrap.admin_user }}"
-    auth_password: "{{ keycloak_bootstrap.admin_password }}"
+    auth_username: "{{ keycloak_local_admin.user }}"
+    auth_password: "{{ keycloak_local_admin.password }}"
     realm: "{{ target_realm }}"
     name: "STFC LDAP"
     state: present

@@ -6,8 +6,8 @@
     auth_client_id: admin-cli
     auth_keycloak_url: "{{ keycloak_url }}"
     auth_realm: master
-    auth_username: "{{ keycloak_bootstrap.admin_user }}"
-    auth_password: "{{ keycloak_bootstrap.admin_password }}"
+    auth_username: "{{ keycloak_local_admin.user }}"
+    auth_password: "{{ keycloak_local_admin.password }}"
   when: false
 
 - name: Create Keycloak realm

@@ -17,21 +17,20 @@
 
 import dlt
 import dlt.common.logger as logger
-from dlt.sources import TDataItems
-import pendulum
 import pandas as pd
+import pendulum
 import pyarrow.compute as pc
 import pytz
-
+from dlt.sources import TDataItems
 from elt_common.cli_utils import cli_main
-from elt_common.dlt_sources.m365 import (
-    sharepoint,
-    M365DriveItem,
-)
 from elt_common.dlt_destinations.pyiceberg.helpers import load_iceberg_table
 from elt_common.dlt_destinations.pyiceberg.pyiceberg_adapter import (
-    pyiceberg_adapter,
     PartitionTrBuilder,
+    pyiceberg_adapter,
+)
+from elt_common.dlt_sources.m365 import (
+    M365DriveItem,
+    sharepoint,
 )
 
 EXCEL_ENGINE = "calamine"
@@ -41,19 +40,45 @@
 PIPELINE_NAME = "electricity_sharepoint"
 RDM_TIMEZONE = "Europe/London"
 
-# File information
-# The file can contain a record from a single export or multiple records combined. Each record starts with
-# a header:
-
+# Source details
+#
+# There are 3 known formats for the data. Each contains a preamble of the form:
+#
 # Site Information:
 # RAL ISIS RDM
 # RAL ISIS RDM
 # Controller: ISIS
 # Controller description: ISIS Energy Totals
 # Status: Online
+#
+# This preamble is discarded from all formats.
+#
+# 1. Excel (old, manual export format)
+# ------------------------------------
+#
+# These were produced by manual export from the original system.
+#
+# There are two columns: "Time,Total Power (MW)". Time is in the format "YYYY-mm-DDTHH:MM:SS".
+#
+# 2. Automated CSV export
+# -----------------------
+#
+# An automation deposits .csv files at regular intervals.
+#
+# There are three columns: "Time,Date,ISIS Elec Total Power", where Time is in format HH:MM:SS and Date is in format DD/mm/YY.
+# Several of these files can be concatenated together to form larger timespan records - they are concatenated as is and repeated
+# preamble sections are not discarded.
+#
+# 3. Manual CSV export
+# --------------------
+#
+# These were produced more recently by manual export from the original system.
+# There are three columns: "Time,ISIS Elec Total Energy,ISIS Elec Total Power", where Time is in format "DD/mm/YY HH:MM:SS".
+# The second column is discarded.
+
 
 # Time	Date	ISIS Elec Total Power {MW}
-CSV_HEADER_ANCHOR = "time"
+CSV_PREAMBLE_ANCHOR = "time"
 
 
 def to_utc(ts: pd.Series) -> pd.Series:
@@ -66,26 +91,38 @@ def csv_section_to_df(file_name: str, lines: Sequence[str]) -> pd.DataFrame | No
     df = pd.read_csv(io.StringIO("\n".join(lines)))
     # clean up column name (strip any whitespace)
     df.columns = df.columns.str.strip()
+    cols = [c for c in df.columns]
+    assert len(cols) == 3
-    cols = [c for c in df.columns]
-    assert len(cols) == 3
+    cols = [c for c in df.columns]
+    if len(cols) != 3:
+        logger.warning(
+            f"Error loading section of {file_name!r}: expected 3 columns, got {len(cols)}. "
+            "There will be gaps in the data."
+        )
+        return None
-    cols = [c for c in df.columns]
-    assert len(cols) == 3
+    cols = [c for c in df.columns]
+    if len(cols) != 3:
+        logger.warning(
+            f"Error loading section of {file_name!r}: expected 3 columns, got {len(cols)}. "
+            "There will be gaps in the data."
+        )
+        return None
     try:
-        df["DateTime"] = to_utc(
-            pd.to_datetime(df["Date"] + " " + df["Time"], format="%d/%m/%y %H:%M:%S")  # type: ignore
-        )
-        return df.drop(["Date", "Time"], axis=1)
+        if "Date" in cols:
+            # Automated CSV format
+            df["DateTime"] = to_utc(
+                pd.to_datetime(
+                    df["Date"] + " " + df["Time"], format="%d/%m/%y %H:%M:%S"
+                )  # type: ignore
+            )
+            df = df.drop(["Date", "Time"], axis=1)
+        else:
+            # Manual CSV format
+            df["DateTime"] = to_utc(
+                pd.to_datetime(df["Time"], format="%d/%m/%y %H:%M:%S")
+            )
+            # Drop time and energy columns
+            df = df.drop(["Time", cols[1]], axis=1)
     except (pytz.exceptions.AmbiguousTimeError, ValueError) as exc:
         logger.warning(
             f"'Error loading section of {file_name}'. There will be gaps in the data.\nDetails: {str(exc)}"
         )
         return None
 
+    return df
+
 
 def read_power_consumption_csv(
     file_content: io.BytesIO,
     file_name: str,
 ) -> pd.DataFrame | None:
-    """Read csv-formatted power consumption records. This can return None if there is not data in the file.
-
-    The Date & Time columns together to create a single DateTime column.
-    """
+    # See comment at the top of this describing the format
 
     def _append_if_not_none(seq, df):
         if df is not None:
@@ -97,7 +134,7 @@ def _append_if_not_none(seq, df):
         line = line.strip()
         line_lower = line.lower()
 
-        if line_lower.startswith(CSV_HEADER_ANCHOR):
+        if line_lower.startswith(CSV_PREAMBLE_ANCHOR):
             # Save any previous section
             if current_lines:
                 _append_if_not_none(
@@ -127,11 +164,8 @@ def _append_if_not_none(seq, df):
 
 
 def read_power_consumption_excel(file_content: io.BytesIO) -> pd.DataFrame:
-    """Read an excel-formatted power consumption record.
+    # See comment at the top of this describing the format
 
-    Expected columns: Time, Total Power (MW)
-    The Time column is renamed DateTime for consistency.
-    """
     df = pd.read_excel(file_content, engine=EXCEL_ENGINE, skiprows=EXCEL_SKIP_ROWS)
     df = df.rename(columns={"Time": "DateTime"})
     df["DateTime"] = to_utc(df["DateTime"])
@@ -179,42 +213,39 @@ def read_as_dataframe(file_obj: M365DriveItem) -> pd.DataFrame | None:
     yield df_batch
 
 
-@dlt.resource(merge_key="DateTime", columns={"DateTime": {"nullable": False}})
+@dlt.resource(
+    primary_key="DateTime",
+    write_disposition={"disposition": "merge", "strategy": "upsert"},
+)
 def rdm_data(
     site_url: str = dlt.config.value,
     root_dir: str = dlt.config.value,
     backfill: bool = False,
+    backfill_glob: str | None = None,
 ) -> Iterator[TDataItems]:
     if backfill:
-        historic_xl = sharepoint(
+        if backfill_glob is not None:
+            file_globs = [backfill_glob]
+        else:
+            file_globs = [
+                f"{root_dir}/**/*.xlsx",
+                f"{root_dir}/**/*-daily.csv",
+                f"{root_dir}/**/*-manual-export.csv",
+            ]
+        modified_after = None
+    else:
+        file_globs = [f"{root_dir}/*-ISIS.csv"]
+        modified_after = get_latest_timestamp(dlt.current.pipeline())
+
+    for file_glob in file_globs:
+        file_listing = sharepoint(
             site_url=site_url,
-            file_glob=f"{root_dir}/**/*.xlsx",
+            file_glob=file_glob,
             extract_content=False,
+            modified_after=modified_after,
         )
-        reader = historic_xl | extract_content_and_read()
-        yield from reader.apply_hints(
-            write_disposition="merge",
-        )
-        historic_csv = sharepoint(
-            site_url=site_url,
-            file_glob=f"{root_dir}/**/*-daily.csv",
-            extract_content=False,
-        )
-        reader = historic_csv | extract_content_and_read()
-        yield from reader.apply_hints(
-            write_disposition="merge",
-        )
-
-    latest_files = sharepoint(
-        site_url=site_url,
-        file_glob=f"{root_dir}/*-ISIS.csv",
-        extract_content=False,
-        modified_after=get_latest_timestamp(dlt.current.pipeline()),
-    )
-    reader = latest_files | extract_content_and_read()
-    yield from reader.apply_hints(
-        write_disposition="merge",
-    )
+        reader = file_listing | extract_content_and_read()
+        yield from reader
 
 
 def get_latest_timestamp(pipeline: dlt.Pipeline) -> pendulum.DateTime | None: