From 7e13dbfe9ac074c0963c85a94cef767b1de4a374 Mon Sep 17 00:00:00 2001 From: Jessica Scheick Date: Thu, 19 Mar 2026 12:00:35 -0400 Subject: [PATCH 01/11] add requirements file --- requirements.txt | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..9ee58dd --- /dev/null +++ b/requirements.txt @@ -0,0 +1,18 @@ +icepyx + +# Need to confirm which of these remain dependencies +hdbscan +netcdf4 +utm + +geopandas +h5py +matplotlib +numpy +pandas +pyproj +rasterio +Rtree +scikit_learn +scipy +Shapely \ No newline at end of file From c8d9629e84f2d8704030a34b903ec3b974fbf9d8 Mon Sep 17 00:00:00 2001 From: Jessica Scheick Date: Thu, 19 Mar 2026 12:25:37 -0400 Subject: [PATCH 02/11] add code files --- aok/core/kd_utils/Kd_analysis.py | 264 ++++ aok/core/kd_utils/README.md | 198 +++ aok/core/kd_utils/SeaSurfaceFlattening.py | 446 +++++++ .../kd_utils/SeafloorFilterByGEBCO_Module.py | 108 ++ .../IS2_BG_Rate_Land_Ocean_plot.py | 483 +++++++ .../analyze_nighttime_BG_rate_icesat2.py | 520 ++++++++ .../analyze_solarbckgrd_icesat2.py | 496 +++++++ aok/core/kd_utils/SolarBckgrd/config.py | 79 ++ aok/core/kd_utils/__init__.py | 8 + aok/core/kd_utils/bathy_processing.py | 647 +++++++++ aok/core/kd_utils/config.py | 247 ++++ aok/core/kd_utils/data_processing.py | 733 ++++++++++ aok/core/kd_utils/interpolation.py | 19 + aok/core/kd_utils/kd_utils.py | 1181 +++++++++++++++++ aok/core/kd_utils/sea_photons_analysis.py | 449 +++++++ aok/core/kd_utils/sliderule_adapter.py | 181 +++ aok/core/kd_utils/visualization.py | 263 ++++ 17 files changed, 6322 insertions(+) create mode 100644 aok/core/kd_utils/Kd_analysis.py create mode 100644 aok/core/kd_utils/README.md create mode 100644 aok/core/kd_utils/SeaSurfaceFlattening.py create mode 100644 aok/core/kd_utils/SeafloorFilterByGEBCO_Module.py create mode 100644 aok/core/kd_utils/SolarBckgrd/IS2_BG_Rate_Land_Ocean_plot.py 
create mode 100644 aok/core/kd_utils/SolarBckgrd/analyze_nighttime_BG_rate_icesat2.py create mode 100644 aok/core/kd_utils/SolarBckgrd/analyze_solarbckgrd_icesat2.py create mode 100644 aok/core/kd_utils/SolarBckgrd/config.py create mode 100644 aok/core/kd_utils/__init__.py create mode 100644 aok/core/kd_utils/bathy_processing.py create mode 100644 aok/core/kd_utils/config.py create mode 100644 aok/core/kd_utils/data_processing.py create mode 100644 aok/core/kd_utils/interpolation.py create mode 100644 aok/core/kd_utils/kd_utils.py create mode 100644 aok/core/kd_utils/sea_photons_analysis.py create mode 100644 aok/core/kd_utils/sliderule_adapter.py create mode 100644 aok/core/kd_utils/visualization.py diff --git a/aok/core/kd_utils/Kd_analysis.py b/aok/core/kd_utils/Kd_analysis.py new file mode 100644 index 0000000..6e28301 --- /dev/null +++ b/aok/core/kd_utils/Kd_analysis.py @@ -0,0 +1,264 @@ +# utils/Kd_analysis.py +# updated to perform a linear fit in log-space, just like MATLAB's polyfitn(zdepth, y, 1) for a first-order polynomial. 
+ +import numpy as np +import pandas as pd +import logging +# from scipy.optimize import curve_fit +from sklearn.linear_model import LinearRegression + +# def log_model(z, kd, e0): +# return np.log(e0) - kd * z + +## This is wrong because the input is already a bined data, +## it is not necessary to do a histogram again +# def CalculateKdFromFilteredSubsurfacePhoton(df, vertical_res=0.8): +# if df.empty or 'lat_bins' not in df.columns: +# return pd.DataFrame({'lat_bins': [np.nan], 'kd': [np.nan], 'e0': [np.nan], 'latitude': [np.nan], 'longitude': [np.nan]}) + +# # Get the latitude bin value +# lat_bin_value = df['lat_bins'].iloc[0] if not df['lat_bins'].empty else np.nan +# latitude = df['latitude'].mean() if 'latitude' in df.columns else df['lat'].mean() if 'lat' in df.columns else np.nan +# longitude = df['longitude'].mean() if 'longitude' in df.columns else df['lon'].mean() if 'lon' in df.columns else np.nan + +# # Calculate photon height range +# photon_height_min = df['photon_height'].min() +# photon_height_max = df['photon_height'].max() + +# # Check for sufficient data range +# if np.isnan(photon_height_min) or np.isnan(photon_height_max) or photon_height_min == photon_height_max: +# return pd.DataFrame({'lat_bins': [lat_bin_value], 'kd': [np.nan], 'e0': [np.nan]}) + +# height_bins_range = abs(photon_height_max - photon_height_min) +# height_bins_number = round(height_bins_range / vertical_res) + +# # Ensure there are enough bins +# if height_bins_number < 5: +# return pd.DataFrame({'lat_bins': [lat_bin_value], 'kd': [np.nan], 'e0': [np.nan]}) + +# # Create histogram of photon heights +# bin_edges = np.linspace(photon_height_min, photon_height_max, num=height_bins_number) +# counts, _ = np.histogram(df['photon_height'], bins=bin_edges) +# bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2 + +# # Store histogram data +# hist_df = pd.DataFrame({'zdepth': bin_centers, 'photon_counts': counts}) + +# # x value for model +# # Reverse zdepth to align with the 
MATLAB approach +# hist_df['zdepth'] = hist_df['zdepth'].max() - hist_df['zdepth'] + +# # Log-transform photon counts, replacing zeros with NaN +# # y value for model +# hist_df['log_photon_counts'] = np.log(hist_df['photon_counts'].replace(0, np.nan)) +# hist_df.loc[np.isinf(hist_df['log_photon_counts']), 'log_photon_counts'] = np.nan + +# # Filter out rows with NaNs in either column for regression +# valid_data = hist_df.dropna(subset=['zdepth', 'log_photon_counts']) + +# # Check for enough valid data points +# # Skip the regression if there are fewer than 5 datapoints +# if valid_data['log_photon_counts'].notna().sum() > 3: +# # Drop NaNs for regression +# zdepth_valid = valid_data['zdepth'].values.reshape(-1, 1) +# log_counts_valid = valid_data['log_photon_counts'].values + +# # Perform linear regression +# model = LinearRegression() +# model.fit(zdepth_valid, log_counts_valid) + +# # Extract kd as the negative of the slope and e0 from the intercept +# kd = -model.coef_[0] + +# print('zdepth_valid:',zdepth_valid) +# print('photon_counts:',hist_df['photon_counts']) +# print('kd:',kd) + +# e0 = np.exp(model.intercept_) + +# # Set kd to NaN if negative +# if kd < 0: +# kd = np.nan +# else: +# kd, e0 = np.nan, np.nan + +# return pd.DataFrame({ +# 'lat_bins': [lat_bin_value], +# 'kd': [kd], +# 'e0': [e0], +# 'latitude': [latitude], +# 'longitude': [longitude] +# }) + + +# # one solution is to adjust the vertical_res to 0.25 +# def CalculateKdFromFilteredSubsurfacePhoton(df, vertical_res=0.25): +# if df.empty or 'lat_bins' not in df.columns: +# return pd.DataFrame({'lat_bins': [np.nan], 'kd': [np.nan], 'e0': [np.nan], 'latitude': [np.nan], 'longitude': [np.nan]}) + +# # Get the latitude bin value +# lat_bin_value = df['lat_bins'].iloc[0] if not df['lat_bins'].empty else np.nan +# latitude = df['latitude'].mean() if 'latitude' in df.columns else df['lat'].mean() if 'lat' in df.columns else np.nan +# longitude = df['longitude'].mean() if 'longitude' in df.columns else 
df['lon'].mean() if 'lon' in df.columns else np.nan + +# # Calculate photon height range +# photon_height_min = df['photon_height'].min() +# photon_height_max = df['photon_height'].max() + +# # Check for sufficient data range +# if np.isnan(photon_height_min) or np.isnan(photon_height_max) or photon_height_min == photon_height_max: +# return pd.DataFrame({'lat_bins': [lat_bin_value], 'kd': [np.nan], 'e0': [np.nan]}) + +# height_bins_range = abs(photon_height_max - photon_height_min) +# height_bins_number = round(height_bins_range / vertical_res) + +# # Ensure there are enough bins +# if height_bins_number < 5: +# return pd.DataFrame({'lat_bins': [lat_bin_value], 'kd': [np.nan], 'e0': [np.nan]}) + +# # Create histogram of photon heights +# bin_edges = np.linspace(photon_height_min, photon_height_max, num=height_bins_number) +# counts, _ = np.histogram(df['photon_height'], bins=bin_edges) +# bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2 + +# # Store histogram data +# hist_df = pd.DataFrame({'zdepth': bin_centers, 'photon_counts': counts}) + +# # x value for model +# # Reverse zdepth to align with the MATLAB approach +# hist_df['zdepth'] = hist_df['zdepth'].max() - hist_df['zdepth'] + +# # Log-transform photon counts, replacing zeros with NaN +# # y value for model +# hist_df['log_photon_counts'] = np.log(hist_df['photon_counts'].replace(0, np.nan)) +# hist_df.loc[np.isinf(hist_df['log_photon_counts']), 'log_photon_counts'] = np.nan + +# # Filter out rows with NaNs in either column for regression +# valid_data = hist_df.dropna(subset=['zdepth', 'log_photon_counts']) + +# # Check for enough valid data points +# # Skip the regression if there are fewer than 5 datapoints +# if valid_data['log_photon_counts'].notna().sum() > 3: +# # Drop NaNs for regression +# zdepth_valid = valid_data['zdepth'].values.reshape(-1, 1) +# log_counts_valid = valid_data['log_photon_counts'].values + +# # Perform linear regression +# model = LinearRegression() +# model.fit(zdepth_valid, 
log_counts_valid) + +# # Extract kd as the negative of the slope and e0 from the intercept +# kd = -model.coef_[0] + +# print('zdepth_valid:',zdepth_valid) +# print('photon_counts:',hist_df['photon_counts']) +# print('kd:',kd) + +# e0 = np.exp(model.intercept_) + +# # Set kd to NaN if negative +# if kd < 0: +# kd = np.nan +# else: +# kd, e0 = np.nan, np.nan + +# return pd.DataFrame({ +# 'lat_bins': [lat_bin_value], +# 'kd': [kd], +# 'e0': [e0], +# 'latitude': [latitude], +# 'longitude': [longitude] +# }) + + +# another solution is to calculate kd without hist +def CalculateKdFromFilteredSubsurfacePhoton(df, vertical_res=0.8): + # Early exit if DataFrame is empty or missing required column + if df.empty or 'lat_bins' not in df.columns: + return pd.DataFrame({'lat_bins': [np.nan], 'kd': [np.nan], 'e0': [np.nan], 'latitude': [np.nan], 'longitude': [np.nan]}) + + # Retrieve latitude and longitude + lat_bin_value = df['lat_bins'].iloc[0] if not df['lat_bins'].empty else np.nan + latitude = df['latitude'].mean() if 'latitude' in df.columns else df['lat'].mean() if 'lat' in df.columns else np.nan + longitude = df['longitude'].mean() if 'longitude' in df.columns else df['lon'].mean() if 'lon' in df.columns else np.nan + + # Use value_counts to get photon counts in each height bin + height_counts = df['height_bins'].value_counts().sort_index() + bin_centers = height_counts.index.astype(float) + + # Create a DataFrame for the height bins and counts + hist_df = pd.DataFrame({'zdepth': bin_centers, 'photon_counts': height_counts.values}) + + # Reverse zdepth for model alignment + hist_df['zdepth'] = hist_df['zdepth'].max() - hist_df['zdepth'] + + # Log-transform photon counts, replacing zeros with NaN for regression + hist_df['log_photon_counts'] = np.log(hist_df['photon_counts'].replace(0, np.nan)) + hist_df.loc[np.isinf(hist_df['log_photon_counts']), 'log_photon_counts'] = np.nan + + # Filter for rows without NaNs for regression + valid_data = 
hist_df.dropna(subset=['zdepth', 'log_photon_counts']) + + # Perform regression if there are sufficient valid data points + if valid_data['log_photon_counts'].notna().sum() > 3: + zdepth_valid = valid_data['zdepth'].values.reshape(-1, 1) + log_counts_valid = valid_data['log_photon_counts'].values + + # Perform linear regression + model = LinearRegression() + model.fit(zdepth_valid, log_counts_valid) + + # Calculate kd and e0 + kd = -model.coef_[0] + e0 = np.exp(model.intercept_) + + # Set kd to NaN if negative + if kd < 0: + kd = np.nan + else: + kd, e0 = np.nan, np.nan + + return pd.DataFrame({ + 'lat_bins': [lat_bin_value], + 'kd': [kd], + 'e0': [e0], + 'latitude': [latitude], + 'longitude': [longitude] + }) + + +# Original kd calculation function remains unchanged +def calculate_kd(filtered_seafloor_subsurface_photon_dataset): + logging.info("Calculating Kd from filtered subsurface photon dataset") + SubsurfacePhotonDFAddedKd = filtered_seafloor_subsurface_photon_dataset\ + .groupby('lat_bins', observed=False)\ + .apply(CalculateKdFromFilteredSubsurfacePhoton, include_groups=True) + + + # Remove the index without resetting it if 'lat_bins' already exists + SubsurfacePhotonDFAddedKd = SubsurfacePhotonDFAddedKd.droplevel(0) + return SubsurfacePhotonDFAddedKd + + +# Updated function to apply kd calculation beam-by-beam +def process_kd_calculation(Final_filtered_subsurface_photon_dataset): + # Initialize list to store results for each beam + kd_beam_datasets = [] + + # Group by 'beam_id' to process each beam independently + for beam_id, beam_data in Final_filtered_subsurface_photon_dataset.groupby('beam_id'): + logging.info(f"Calculating Kd for beam: {beam_id}") + + # Apply the calculate_kd function to the current beam's dataset + SubsurfacePhotonDFAddedKd = calculate_kd(beam_data) + + # Add a column to track the beam_id in the results + SubsurfacePhotonDFAddedKd['beam_id'] = beam_id + + # Append the result to the list + 
kd_beam_datasets.append(SubsurfacePhotonDFAddedKd) + + # Combine results from all beams into a single DataFrame + combined_kd_dataset = pd.concat(kd_beam_datasets, ignore_index=True) + + return combined_kd_dataset \ No newline at end of file diff --git a/aok/core/kd_utils/README.md b/aok/core/kd_utils/README.md new file mode 100644 index 0000000..28f6ee4 --- /dev/null +++ b/aok/core/kd_utils/README.md @@ -0,0 +1,198 @@ +# icesat-2_kdph_py + + +This repository contains Python code developed in collaboration to replicate the functionality of the original MATLAB scripts (https://github.com/emilyeidam/icesat-2_kdph +) by Dr. Emily Eidam (emily.eidam@oregonstate.edu). The code facilitates processing ICESat-2 data to calculate the diffuse attenuation coefficient (Kd) based on space-based lidar photon profiles, following methods described in the original MATLAB code. Please cite appropriately both Python and the MATLAB version under the GNU GPLv3 license if you use this code in your research. + +## Repository Structure + +Current structure: + +```text +icesat-2_kdph_py/ ++-- main.py ++-- config.py ++-- requirements.txt ++-- README.md ++-- kd_utils/ + +-- __init__.py + +-- data_processing.py + +-- sea_photons_analysis.py + +-- bathy_processing.py + +-- Kd_analysis.py + +-- interpolation.py + +-- visualization.py + +-- sliderule_adapter.py + +-- SeaSurfaceFlattening.py + +-- SolarBckgrd/ +``` + +Key modules: + +- `main.py`: End-to-end pipeline entrypoint for local ATL03 workflows. +- `config.py`: All CLI switches, paths, and thresholds. +- `kd_utils/data_processing.py`: ATL03 ingestion, land/sea masking, optional pre-filters. +- `kd_utils/sea_photons_analysis.py`: Binning and sea-surface detection. +- `kd_utils/bathy_processing.py`: Subsurface filtering and optional gap steps (GEBCO/ATL24/refraction/etc.). +- `kd_utils/Kd_analysis.py`: Kd fitting. +- `kd_utils/sliderule_adapter.py`: SlideRule ATL03 fetch + conversion to framework schema for notebook/script imports. 
+ +### Getting Started + +### Prerequisites + +- Python 3.x +- Libraries: `numpy`, `pandas`, `scipy`, `h5py`, `matplotlib` +- Ensure you have ICESat-2 ATL03 `.h5` files available in the appropriate directory, as specified in `config.py`; see also the original MATLAB code for reference. + +### Installation + +1. Clone the repository: + + ```bash + git clone https://github.com/ChaoEcohydroRS/icesat-2_kdph_py.git + cd icesat-2_kdph_py + ``` + + +2. Install the required packages: + ``` + pip install -r requirements.txt + ``` + +3. Ensure .h5 files are placed in the appropriate default folder structure. + +### Usage +1. Edit the config.py file to set up file paths and any parameters. +2. Run the entire workflow through main.py: + + ```bash + python main.py + ``` + +### SlideRule Ingestion (Notebook Import Path) + +You can reuse the same framework without local ATL03 `.h5` files by importing the SlideRule adapter: + +```python +from kd_utils.sliderule_adapter import fetch_and_prepare_sliderule_dataset +from kd_utils.sea_photons_analysis import process_sea_photon_binning +from kd_utils.bathy_processing import process_subsurface_photon_filtering +from kd_utils.Kd_analysis import process_kd_calculation + +region = [ + {"lon": -84.029, "lat": 29.732}, + {"lon": -84.029, "lat": 29.954}, + {"lon": -83.969, "lat": 29.954}, + {"lon": -83.969, "lat": 29.732}, + {"lon": -84.029, "lat": 29.732}, +] + +sea_photon_dataset = fetch_and_prepare_sliderule_dataset( + region=region, + t0="2025-05-22T00:00:00Z", + t1="2025-05-24T00:00:00Z", + rgt=1033, + strong_beams_only=True, +) + +binned = process_sea_photon_binning(sea_photon_dataset, horizontal_res=500, vertical_res=0.25) +sea_h, sea_lbl, subsurface = process_subsurface_photon_filtering( + binned, GEBCO_paths=[], subsurface_thresh=1.0, Ignore_Subsurface_Height_Thres=-6 +) +kd_df = process_kd_calculation(subsurface) +``` + +This path reuses your existing processing framework; only ingestion switches from local HDF5 to SlideRule API. 
+ +### Optional Gap Steps (Sensitivity On/Off) + +All extra processing steps are optional and disabled by default. + +- IR/AP proxy photon filtering: + ```bash + python main.py --enable_ir_ap_filter + ``` +- Solar background filtering: + ```bash + python main.py --enable_solar_background_filter + ``` +- GEBCO seafloor filtering: + ```bash + python main.py --enable_gebco_filter + ``` +- ATL24-first with GEBCO fallback: + ```bash + python main.py --enable_atl24_filter --atl24_file path/to/atl24_points.csv --enable_gebco_filter + ``` +- Sea surface flattening: + ```bash + python main.py --enable_sea_surface_flattening + ``` +- Convex hull reasonableness filter: + ```bash + python main.py --enable_convex_hull_filter + ``` +- Paired-beam combine before Kd fit: + ```bash + python main.py --enable_paired_beam_combine + ``` +- Histogram quality filter (<5% signal at 6-7 m depth band): + ```bash + python main.py --enable_histogram_quality_filter + ``` +- Surface Gaussian sigma filter: + ```bash + python main.py --enable_surface_sigma_filter + ``` +- Refraction correction: + ```bash + python main.py --enable_refraction_correction + ``` +- Post-refraction rebuild + re-fit: + ```bash + python main.py --enable_refraction_correction --enable_post_refraction_refit + ``` + +### Gap Implementation Progress + +The flowchart gaps are being implemented one-by-one with optional switches. 
+ +- [x] Gap 1: Remove photons flagged as IR/AP (proxy implementation via `quality_ph` and `photon_conf`) + - Switch: `--enable_ir_ap_filter` + - Tuning: `--ir_ap_quality_max`, `--ir_ap_min_signal_conf` +- [x] Gap 2: Histogram quality review (<5% signal at 6-7 m) + - Switch: `--enable_histogram_quality_filter` + - Tuning: + - `--histogram_quality_min_ratio` (default `0.05`) + - `--histogram_quality_depth_min` (default `6.0`) + - `--histogram_quality_depth_max` (default `7.0`) +- [x] Gap 3: Discard bins with high surface Gaussian std dev + - Switch: `--enable_surface_sigma_filter` + - Tuning: `--surface_sigma_max` (default `0.5`) +- [x] Gap 4: Refraction correction integration + - Switch: `--enable_refraction_correction` + - Tuning: + - `--refraction_water_temp_c` (default `20.0`) + - `--refraction_wavelength_nm` (default `532.0`) +- [x] Gap 5: Rebuild histogram and re-fit after refraction + - Switch: `--enable_post_refraction_refit` (requires `--enable_refraction_correction`) +- [x] Gap 6: ATL24 query + fallback orchestration + - Switch: `--enable_atl24_filter` + - Note: current implementation uses local ATL24 point files provided via `--atl24_file`. + - Inputs: + - `--atl24_file` (CSV/Parquet/GPKG/SHP with lon/lat + bathymetry field) + - `--atl24_max_match_distance_deg` (default `0.01`) + - Fallback: if ATL24 has no usable matches and `--enable_gebco_filter` is on, GEBCO filtering is used. +- [x] Gap 7: Combine paired beams + - Switch: `--enable_paired_beam_combine` + - Method: maps left/right beams to `gt1_pair`, `gt2_pair`, `gt3_pair` and re-bins by `relative_AT_dist` using `horizontal_res`. + +### Citation +If you use this code, please cite: + +Eidam, E.F., K. Bisson, C. Wang, C. Walker, and A. Gibbons (2024). ICESat-2 and ocean particulates: A roadmap for calculating Kd from space-based lidar photon profiles. Remote Sensing of Environment. Vol. 311. https://doi.org/10.1016/j.rse.2024.114222 + +### License +This project is licensed under the GNU GPLv3 license. 
+ diff --git a/aok/core/kd_utils/SeaSurfaceFlattening.py b/aok/core/kd_utils/SeaSurfaceFlattening.py new file mode 100644 index 0000000..6b3d64f --- /dev/null +++ b/aok/core/kd_utils/SeaSurfaceFlattening.py @@ -0,0 +1,446 @@ +# -*- coding: utf-8 -*- +""" +Created on Thu Jul 25 13:25:53 2024 + +@author: wayne +""" + +from kd_utils import * +import scipy.interpolate + + +path = 'C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/Dataset/ATL03_ICESat2/' +ATL03_h5_file = "processed_ATL03_20220122044818_04721407_006_01.h5" +ATL03_h5_file_path = path + ATL03_h5_file + + +# Set the shoreline data path +Current_Path='C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/Dataset/' + +#load the shoreline package +shoreline_data_path = Current_Path+'Shorelines/GeoPkgGlobalShoreline.gpkg' + + + +# speed of light +c = 299792458.0 + +# bin data first based on horizontal resolution +horizontal_res=10 + +# vertical resolution +# vertical_res=0.1 +vertical_res=0.25 +# vertical_res=1 + +# sea subsurface thresh +# determine subsurface by setting how depth below the sea surface +subsurface_thresh = 0.5 + + +# define specific rectangular target zone for batch analysis +# i.e., From subsurface depth 0.5 to specific KdMaxDepth depth (e.g, 6) +Kd_max_depth = 6 + + +######################################## +#readATL03_github +IS2_atl03_mds,IS2_atl03_attrs,IS2_atl03_beams = read_granule(ATL03_h5_file_path,ATTRIBUTES=True) + +# extract parameters from ICESat-2 ATLAS HDF5 file name +rx = re.compile(r'(processed_)?(ATL\d{2})_(\d{4})(\d{2})(\d{2})(\d{2})' + r'(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})_(\d{3})_(\d{2})(.*?).h5$') +SUB,PRD,YY,MM,DD,HH,MN,SS,TRK,CYCL,GRAN,RL,VERS,AUX = rx.findall(ATL03_h5_file_path).pop() + + +# initialize data storage variables +# variables of interest for generating three types classification +Segment_ID = {} +Segment_Index_begin = {} +Segment_PE_count = {} + +Equator_Segment_Distance = {} +Segment_Length = {} + +Segment_Is_Land={} + +# mean geolocation, height and delta time 
+Segment_Lon = {} +Segment_Lat = {} +Segment_Elev = {} +Segment_Time = {} + +Segment_ref_elev= {} +Segment_ref_azimuth= {} + +# relative_AT_dist = {} +# relative_seg_dist = {} + +# background photon rate +background_rate = {} +background_counts={} + + + +######################################## +#proc02_manually_select_water_forGitHub +#remove water surface based on simple 0.5 m + + +#raw beam-by-beam photon data after cutoff below the max value 0.5 below peak +IS2_atl03Cut50cmBelowPeak={} + +# for each input beam within the file +for gtx in sorted(IS2_atl03_beams[0:1]): +# for gtx in sorted(IS2_atl03_beams): + # data and attributes for beam gtx + IS2_val = IS2_atl03_mds[gtx] + IS2_attrs = IS2_atl03_attrs[gtx] + + # ATL03 Segment ID + Segment_ID[gtx] = IS2_val['geolocation']['segment_id'] + # number of valid overlapping ATL03 segments + n_seg = len(Segment_ID[gtx]) + # number of photon events + n_pe, = IS2_val['heights']['delta_time'].shape + + # first photon ID (1-based) in each segment (convert to 0-based indexing) + Segment_Index_begin[gtx] = IS2_val['geolocation']['ph_index_beg'] - 1 + + # number of photon events in the segment + Segment_PE_count[gtx] = IS2_val['geolocation']['segment_ph_cnt'] + + # along-track distance from the equator crossing to the start of each ATL03 20 meter geolocation segment + Equator_Segment_Distance[gtx] = IS2_val['geolocation']['segment_dist_x'] + + # along-track length for each ATL03 segment + Segment_Length[gtx] = IS2_val['geolocation']['segment_length'] + + # Transmit time of the reference photon + delta_time = IS2_val['geolocation']['delta_time'] + + # get geolocation lat/lon + segment_lat = IS2_val['geolocation']['reference_photon_lat'][:].copy() + segment_lon= IS2_val['geolocation']['reference_photon_lon'][:].copy() + + # get parameters ref elev and azimuth for refraction correction + ref_elev = IS2_val['geolocation']['ref_elev'][:].copy() + ref_azimuth = IS2_val['geolocation']['ref_azimuth'][:].copy() + + #get geoid for 
converting WGS84 ellipsoid to geoid correction + geoid = IS2_val['geophys_corr']['geoid'][:].copy() + + # photon event heights + h_ph = IS2_val['heights']['h_ph'][:].copy() + lat_ph = IS2_val['heights']['lat_ph'][:].copy() + lon_ph = IS2_val['heights']['lon_ph'][:].copy() +# dist_ph_along = IS2_val['heights']['dist_ph_along'][:].copy() + signal_conf_photon = IS2_val['heights']['signal_conf_ph'][...,0].copy() + + # along-track and across-track distance for photon events + x_atc = IS2_val['heights']['dist_ph_along'][:].copy() + y_atc = IS2_val['heights']['dist_ph_across'][:].copy() + + # photon quality + quality_ph=IS2_val['heights']['quality_ph'] + + #calculate along track distance + for seg_index in range(n_seg): + # index for 20m segment j + idx = Segment_Index_begin[gtx][seg_index] + # number of photons in 20m segment + cnt = Segment_PE_count[gtx][seg_index] + # add segment distance to along-track coordinates + x_atc[idx:idx+cnt] += Equator_Segment_Distance[gtx][seg_index] + + # calculate along track distance relative to the beginning of the cut segment + relative_AT_dist=(x_atc-x_atc[0])/1000 + + + relative_seg_dist=(Equator_Segment_Distance[gtx]-Equator_Segment_Distance[gtx][0])/1000 + + + # this function is a significant slowdown without pygeos + Segment_Is_Land['geometry'] = gpd.points_from_xy(segment_lon, segment_lat) + + # create a geo dataframe + ICESat2_GDF = gpd.GeoDataFrame(Segment_Is_Land,crs="EPSG:4326") + + #Step1: isolate water using landmask + # spatial joint to determine land/sea mask + Segment_Is_Land_Labels=isolate_sea_land_photons(shoreline_data_path,ICESat2_GDF) + +# # set the labels back to ICESat2 geopandas dataframe + ICESat2_GDF.loc[:, 'is_land'] = Segment_Is_Land_Labels + + # interpolate is_land labels based on photon lat + island_interp1d_model = scipy.interpolate.interp1d(segment_lat[:], + Segment_Is_Land_Labels[:], + fill_value="extrapolate") + + # Apply interpid model + is_land_label_interp1d = island_interp1d_model(lat_ph[:]) + + # 
Ref_elev on a per photon level (assign seg ref_elev to photons) + ref_elev_interp1d_model = scipy.interpolate.interp1d(segment_lat[:], + ref_elev[:], + fill_value="extrapolate") + # apply interpid model + ph_ref_elev = ref_elev_interp1d_model(lat_ph[:]) + + # Ref_azimuth on a per photon level (assign seg ref_azimuth to photons) + ref_azimuth_interp1d_model = scipy.interpolate.interp1d(segment_lat[:], + ref_azimuth[:], + fill_value="extrapolate") + + # apply interpid model + ph_ref_azimuth = ref_azimuth_interp1d_model(lat_ph[:]) + + + # interpolate geoid based on segment lat + geoid_interp1d_model = scipy.interpolate.interp1d(segment_lat[:], + geoid[:], + fill_value="extrapolate") + + # Apply interpid model + ph_geoid = geoid_interp1d_model(lat_ph[:]) + + + #geoid correct + h_ph_geoid_cor = h_ph[:] - ph_geoid[:] + + # Find the epsg code + epsg_code = convert_wgs_to_utm(lon_ph[0], lat_ph[0]) + epsg_num = int(epsg_code.split(':')[-1]) + + # Orthometrically correct the data using the epsg code + lat_utm, lon_utm, h_ph_cor = orthometric_correction(lat_ph, lon_ph, h_ph, epsg_code) + + # Aggregate data into dataframe + sea_photon_dataset = \ + pd.DataFrame({'latitude': lat_ph, 'longitude': lon_ph, + 'lat': lat_utm, 'lon': lon_utm, + 'photon_height': h_ph_geoid_cor, + 'quality_ph': quality_ph, + 'is_land_label': is_land_label_interp1d, + 'photon_conf': signal_conf_photon, + 'ref_elevation': ph_ref_elev, + 'ref_azimuth': ph_ref_azimuth, + 'relative_AT_dist':relative_AT_dist}, + columns=['latitude', 'longitude', + 'lat', 'lon', + 'photon_height', + 'quality_ph', + 'is_land_label', 'photon_conf', + 'ref_elevation', 'ref_azimuth', + 'relative_AT_dist']) + + # #remove the saturated photons + # sea_photon_dataset=sea_photon_dataset[sea_photon_dataset['quality_ph'] < 1] + + #remove the land photons + sea_photon_dataset=sea_photon_dataset[sea_photon_dataset['is_land_label'] != 1] + + # bin the data by 25 cm + 
binned_dataset_sea_surface=horizontal_vertical_bin_dataset(sea_photon_dataset, horizontal_res, vertical_res) + + # get sea surface height + # threshold to determine how much depth below sea surface will be accounted + # Output: + # ->final_sea_surface_height + # ->sea_surface_height_abnormal_label + # ->sea_surface_dominated_label + sea_surface_height, sea_surface_label, solo_sea_surface_label, subsurface_photon_dataset = \ + get_sea_surface_height(binned_dataset_sea_surface, subsurface_thresh) + + +sea_surface_height.reverse() + + + + + +#################################################################### +# Tasks: Sea Surface flattening +# I have a list value of sea_surface_height over each 10 m bin, I would like to +# 1) calculate the mean sea_surface_height value along each big bin with 500 m ; +# 2) Within each 10-m along track increment, +# adjust all of the photons (i.e., subsurface_photon_dataset) within that 10-m column either up or down +# based on the deviation of the local sea surface height from the mean sea surface height + +###################### +# Step 1: Creating a DataFrame for sea_surface_height with a corresponding 10 m small bin index +# each element in sea_surface_height corresponds to a 10 m bin +df_sea_surface = pd.DataFrame({ + 'sea_surface_height': sea_surface_height, # sea_surface_height is provided as a list + 'small_bins': np.arange(len(sea_surface_height)) # Index for each 10 m bin +}) + +# # Map 'small_bins'(i.e., 'lat_bins') to 500 m 'big_bins' +df_sea_surface['big_bins'] = df_sea_surface['small_bins'] // 50 # 50 10 m bins in each 500 m bin + + +###################### +# Step 2: Calculate Mean Sea_surface_height value for Each 500-m Big Bin +mean_sea_surface_elevation_bigBin = df_sea_surface.groupby('big_bins')['sea_surface_height'].mean().reset_index() +mean_sea_surface_elevation_bigBin.rename(columns={'sea_surface_height': 'mean_sea_surface_height_big_bins'}, inplace=True) + +# Prepare subsurface_photon_dataset for merging 
+subsurface_photon_dataset['big_bins'] = subsurface_photon_dataset['lat_bins'].astype(int) // 50 + +# Merge the mean elevation data with the subsurface_photon_dataset +subsurface_photon_dataset = subsurface_photon_dataset.merge(mean_sea_surface_elevation_bigBin,\ + on='big_bins', how='left') + +# Merge sea_surface_height data with subsurface_photon_dataset based on the small_bins +subsurface_photon_dataset = subsurface_photon_dataset.merge(df_sea_surface[['small_bins', 'sea_surface_height']], \ + left_on='lat_bins', right_on='small_bins', how='left') + + +####################### +# Step 3: Within each 10 m small bin, adjust the photon heights +# based on the deviation between their sea surface height at each small bin and the mean sea surface height within each 500-m bin +subsurface_photon_dataset['adjusted_photon_height'] = subsurface_photon_dataset['photon_height'] - \ + (subsurface_photon_dataset['mean_sea_surface_height_big_bins']-\ + subsurface_photon_dataset['sea_surface_height']\ + ) + +subsurface_photon_dataset['Dif_photon_height'] = (subsurface_photon_dataset['mean_sea_surface_height_big_bins']-\ + subsurface_photon_dataset['sea_surface_height']\ + ) +###################################### +################################### +###################################### +################################### +###Visualization + +import matplotlib.pyplot as plt +from mpl_toolkits.axes_grid1.inset_locator import inset_axes +from matplotlib.patches import Rectangle +import matplotlib.transforms as mtransforms + + + +sea_surface_x_axis_bins = np.linspace(relative_seg_dist.min(), + relative_seg_dist.max(), len(sea_surface_height)) + + + +mean_sea_surface_x_axis_bins = np.linspace(relative_seg_dist.min(), + relative_seg_dist.max(), len(mean_sea_surface_elevation_bigBin)) + + +# Adjusting the global font size +plt.rcParams['font.size'] = 18 # You can choose your own font size here + +# Define your zoom area limits for the x-axis +zoom_xlim = (10.5, 11.5) # for example, 
from 10 to 20 on your x-axis + +# Filter the data based on your x-axis limits +# Here, we're creating a boolean index that matches your zoom criteria +x_index_within_zoom = (sea_photon_dataset.relative_AT_dist >= zoom_xlim[0]) & (sea_photon_dataset.relative_AT_dist <= zoom_xlim[1]) + +# Apply the index to your x and y data +zoomed_relative_AT_dist = sea_photon_dataset.relative_AT_dist[x_index_within_zoom] +zoomed_photon_height = sea_photon_dataset.photon_height[x_index_within_zoom] + +#get the x index for adjusted subsurface_photon +subsurface_x_index_within_zoom = (subsurface_photon_dataset.relative_AT_dist >= zoom_xlim[0]) & (subsurface_photon_dataset.relative_AT_dist <= zoom_xlim[1]) +subsurface_zoomed_relative_AT_dist = subsurface_photon_dataset.relative_AT_dist[subsurface_x_index_within_zoom] +zoomed_photon_height_adjusted = subsurface_photon_dataset.adjusted_photon_height[subsurface_x_index_within_zoom] +zoomed_photon_height_Dif = subsurface_photon_dataset.Dif_photon_height[subsurface_x_index_within_zoom] + + + +# Now proceed with creating your main plot +hlims = [-40, 5] # height limits +sub_hlims = [-10, 3] # height limits + + + +######################### +fig = plt.figure(figsize=(16, 8), dpi=300, facecolor='w', edgecolor='k') +ax = fig.add_subplot(111) +ax.set_xlabel('Distance From Start of Track (km)') +ax.set_ylabel('Photon Height (m)') + +ax.scatter(sea_photon_dataset.relative_AT_dist, sea_photon_dataset.photon_height, + s=10, c='k', alpha=0.15, edgecolors='none', label='ATL03 Photons') +ax.plot(sea_surface_x_axis_bins, sea_surface_height, linewidth=0.8, color='b', label='Surface Peak') +ax.plot(sea_surface_x_axis_bins, [x - 0.5 for x in sea_surface_height], linewidth=0.8, color='#DD571C', label='0.5 m Below Surface Peak') +ax.plot(sea_surface_x_axis_bins, [x - 1 for x in sea_surface_height], linewidth=0.8, color='#FDA172', label='1 m Below Surface Peak') +ax.plot(sea_surface_x_axis_bins, [x - 2 for x in sea_surface_height], linewidth=0.8, 
color='#FCAE1E', label='2 m Below Surface Peak') + +# Create an inset axis for the zoomed area +# axins = inset_axes(ax, width=5, height=3, loc='lower left') # adjust the location as needed + +# Assuming 'ax' is your main axes +trans = mtransforms.blended_transform_factory(ax.figure.transFigure, ax.transAxes) +axins = inset_axes(ax, width=5, height=3, loc=3, + bbox_to_anchor=(0.2, 0.1, 0.4, 0.4), # The numbers are x0, y0, width, and height in percentages of the figure size. + bbox_transform=trans, + borderpad=0) + +# Plot the filtered data on the inset axes +axins.scatter(zoomed_relative_AT_dist, zoomed_photon_height, + s=10, c='k', alpha=0.15, edgecolors='none', label='Zoomed ATL03 Photons') + +axins.scatter(subsurface_zoomed_relative_AT_dist, zoomed_photon_height_adjusted, + s=10, c='r', alpha=0.15, edgecolors='none', label='Zoomed Adjusted ATL03 Photons') + + +axins.scatter(subsurface_zoomed_relative_AT_dist, zoomed_photon_height_Dif, + s=20, c='g', alpha=0.15, edgecolors='none', label='Zoomed Dif ATL03 Photons') + +# Add horizontal line at y=0 +axins.axhline(y=0, color='blue', alpha=0.15, linestyle='--', linewidth=1) + +# # For the lines, you might need to adjust your data arrays to match the zoomed range +# # assuming your sea_surface_x_axis_bins aligns directly with your sea_surface_height data +# zoomed_sea_surface_x = [x for x in sea_surface_x_axis_bins if zoom_xlim[0] <= x <= zoom_xlim[1]] +# zoomed_sea_surface_height = sea_surface_height[:len(zoomed_sea_surface_x)] # adjust based on your data alignment + +# First, we need to find the indices of the items in sea_surface_x_axis_bins that fall within the zoomed range. +zoomed_indices = [index for index, x in enumerate(sea_surface_x_axis_bins) if zoom_xlim[0] <= x <= zoom_xlim[1]] + +# Then, use these indices to select the corresponding items from both sea_surface_x_axis_bins and sea_surface_height. 
+zoomed_sea_surface_x = [sea_surface_x_axis_bins[i] for i in zoomed_indices] +zoomed_sea_surface_height = [sea_surface_height[i] for i in zoomed_indices] + + +#0197f6,#BE5504,#FA8128 +axins.plot(zoomed_sea_surface_x, zoomed_sea_surface_height, linewidth=1, color='b', label='Zoomed Surface Peak') +axins.plot(zoomed_sea_surface_x, [x - 0.5 for x in zoomed_sea_surface_height], linewidth=1, color='#DD571C', label='0.5 m Below Zoomed Surface Peak') +axins.plot(zoomed_sea_surface_x, [x - 1 for x in zoomed_sea_surface_height], linewidth=1, color='#FDA172', label='1 m Below Zoomed Surface Peak') +axins.plot(zoomed_sea_surface_x, [x - 2 for x in zoomed_sea_surface_height],linewidth=1, color='#FCAE1E', label='2 m Below Zoomed Surface Peak') + +#mean sea surface height at 500m bins +axins.plot(mean_sea_surface_x_axis_bins, mean_sea_surface_elevation_bigBin, linewidth=1, color='r', label='Zoomed Surface Peak 500m bin') + + +# Set the x limits for the inset axes; y limits remain unchanged +axins.set_xlim(zoom_xlim) +axins.set_ylim(sub_hlims) + +axins.set_title('Zoomed in') + + +# Adding the rectangle indicating the zoomed area +# Draw a rectangle on the main plot to indicate the zoomed area +# We use the data coordinates for the rectangle, as it needs to correspond to the actual data points on the main axes. 
+rec_hlims=[-10,3] +rectangle = Rectangle((zoom_xlim[0], rec_hlims[0]), zoom_xlim[1] - zoom_xlim[0], rec_hlims[1] - rec_hlims[0], + linewidth=1, edgecolor='r', facecolor='none', linestyle='--') +ax.add_patch(rectangle) + + +# Finalize the main plot adjustments +ax.legend(loc='lower right') +ax.set_ylim(hlims) + +plt.savefig(path+'/'+YY+MM+DD+'Track'+TRK+'Plot'+gtx+'_With_depth_colored.jpg', dpi=500,bbox_inches='tight') +# Display the plot +plt.show() + + + diff --git a/aok/core/kd_utils/SeafloorFilterByGEBCO_Module.py b/aok/core/kd_utils/SeafloorFilterByGEBCO_Module.py new file mode 100644 index 0000000..51a7396 --- /dev/null +++ b/aok/core/kd_utils/SeafloorFilterByGEBCO_Module.py @@ -0,0 +1,108 @@ +import rasterio +import numpy as np +import pandas as pd +from rtree import index +from shapely.geometry import box, Point + +# 1. Function to create an R-tree spatial index for raster bounds +def create_spatial_index(gebco_paths): + """ + Create an R-tree spatial index for raster bounds to quickly find relevant rasters. + + Parameters: + gebco_paths (list): List of paths to GEBCO raster files. + + Returns: + raster_data_dict (dict): Dictionary containing loaded rasters and their respective data. + spatial_index (rtree.index.Index): R-tree spatial index for raster bounds. + """ + idx = index.Index() + raster_data_dict = {} + for i, path in enumerate(gebco_paths): + with rasterio.Env(GTIFF_SRS_SOURCE='EPSG'): + gebco_raster = rasterio.open(path) + raster_data = gebco_raster.read(1) + raster_data_dict[path] = (gebco_raster, raster_data) + bounds = gebco_raster.bounds + idx.insert(i, (bounds.left, bounds.bottom, bounds.right, bounds.top), obj=path) + return raster_data_dict, idx + +# 2. 
# Function to determine which rasters are needed for a given set of coordinates
# using the spatial index in a batch manner.
def get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index):
    """
    Select the rasters whose bounds intersect the bounding box of the given points.

    A single bounding-box query is issued for the whole batch of points instead of
    one query per point.

    Parameters:
    lons (numpy.ndarray): Array of longitudes.
    lats (numpy.ndarray): Array of latitudes.
    raster_data_dict (dict): Maps raster path -> (raster dataset, raster data).
    spatial_index (rtree.index.Index): R-tree spatial index whose entries carry the
        raster path as their stored object.

    Returns:
    relevant_rasters (list): (raster dataset, raster data) tuples, one per matching
        raster, in the order the index reported them. Empty when there are no
        finite coordinates to query.
    """
    lons = np.asarray(lons, dtype=float)
    lats = np.asarray(lats, dtype=float)

    # Only finite coordinates can contribute to the query box; without any there is
    # nothing to look up (min()/max() would raise on an empty array).
    finite = np.isfinite(lons) & np.isfinite(lats)
    if not finite.any():
        return []

    # (minx, miny, maxx, maxy) is exactly what the index expects, so no intermediate
    # geometry object is needed.
    query_bounds = (
        float(lons[finite].min()),
        float(lats[finite].min()),
        float(lons[finite].max()),
        float(lats[finite].max()),
    )

    matches = spatial_index.intersection(query_bounds, objects=True)
    # dict.fromkeys de-duplicates while keeping a deterministic order (a set would not).
    relevant_paths = list(dict.fromkeys(match.object for match in matches))
    return [(raster_data_dict[path][0], raster_data_dict[path][1]) for path in relevant_paths]


# 3. Function to get seafloor elevation from the relevant rasters in a vectorized manner
def get_seafloor_elevation(lons, lats, relevant_rasters):
    """
    Get seafloor elevations for a batch of points from the relevant rasters.

    Each raster is only asked about points that are still unfilled and that fall
    inside its bounds, and samples equal to the raster's nodata value are treated
    as missing. This lets a later raster supply values for points the earlier ones
    do not cover; the first raster that yields a valid value for a point wins.

    Parameters:
    lons (numpy.ndarray): Array of longitudes.
    lats (numpy.ndarray): Array of latitudes.
    relevant_rasters (list): (raster dataset, raster data) tuples. The dataset must
        provide ``sample(points)``; ``bounds`` (left/bottom/right/top) and ``nodata``
        are used when present. The raster data element is not used here.

    Returns:
    seafloor_elevations (numpy.ndarray): Float array with one elevation per input
        point; NaN where no raster provided a valid value.
    """
    lons = np.asarray(lons, dtype=float)
    lats = np.asarray(lats, dtype=float)
    seafloor_elevations = np.full(len(lons), np.nan)

    for gebco_raster, _raster_data in relevant_rasters:
        # Points that still need a value...
        pending = np.isnan(seafloor_elevations)

        # ...and that this raster can actually answer. Sampling outside a raster's
        # extent returns fill values rather than failing, which would otherwise be
        # mistaken for real elevations.
        bounds = getattr(gebco_raster, 'bounds', None)
        if bounds is not None:
            pending &= (
                (lons >= bounds.left) & (lons <= bounds.right)
                & (lats >= bounds.bottom) & (lats <= bounds.top)
            )
        if not pending.any():
            continue

        points = np.column_stack((lons[pending], lats[pending]))
        # sample() yields one array of band values per point; band 1 is the elevation.
        sampled = np.array(
            [np.nan if value is None else value[0] for value in gebco_raster.sample(points)],
            dtype=float,
        )

        nodata = getattr(gebco_raster, 'nodata', None)
        if nodata is not None:
            sampled[sampled == nodata] = np.nan

        seafloor_elevations[pending] = sampled

    return seafloor_elevations


# 4. The main process function that ties everything together
def process_seafloor_data(gebco_paths, sea_photon_dataset):
    """
    Main function to process the seafloor data.

    Parameters:
    gebco_paths (list): List of paths to GEBCO raster files.
    sea_photon_dataset (pandas.DataFrame): Dataset with 'longitude', 'latitude' and
        'photon_height' columns. A 'seafloor_elevation' column is added to it in place.

    Returns:
    filtered_sea_photon_dataset (pandas.DataFrame): Independent copy of the rows whose
        photon height is strictly above the seafloor elevation. Rows without a
        seafloor elevation (NaN) are dropped, because the comparison is False for NaN.
        The 'seafloor_elevation' column is kept in the result.
    """
    # Create spatial index and load GEBCO raster data
    raster_data_dict, spatial_index = create_spatial_index(gebco_paths)
    try:
        # Get the relevant rasters using spatial index
        lons = sea_photon_dataset['longitude'].values
        lats = sea_photon_dataset['latitude'].values
        relevant_rasters = get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index)

        # Get seafloor elevation for all points
        sea_photon_dataset['seafloor_elevation'] = get_seafloor_elevation(lons, lats, relevant_rasters)
    finally:
        # The datasets were opened for this call only and are not returned, so
        # release the file handles here.
        for gebco_raster, _raster_data in raster_data_dict.values():
            gebco_raster.close()

    # Filter out points below the seafloor
    above_seafloor = sea_photon_dataset['photon_height'] > sea_photon_dataset['seafloor_elevation']
    # .copy() so callers can modify the result without touching (or being warned
    # about) the original frame.
    filtered_sea_photon_dataset = sea_photon_dataset[above_seafloor].copy()

    return filtered_sea_photon_dataset
b/aok/core/kd_utils/SolarBckgrd/IS2_BG_Rate_Land_Ocean_plot.py @@ -0,0 +1,483 @@ +import h5py +import os +import numpy as np +import pandas as pd +import geopandas as gpd +from shapely.geometry import Point +from scipy.interpolate import interp1d +from scipy.optimize import curve_fit +import matplotlib.pyplot as plt +from multiprocessing import Pool +from config import get_args +from noise_utils.interpolation import geoid_correction, refraction_correction, interpolate_labels, apply_interpolation +from noise_utils.data_processing import isolate_sea_land_photons +import random # Added for random selection + +import logging + +# Set up logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# Define model functions +def power_model(x, a, b, c): + return a * np.power(x, b) + c + +def exp_model(x, a, b, c): + """Exponential model: a * exp(b * x) + c""" + return a * np.exp(b * x) + c + +def twilight_model(x, a, b, c): + return a * np.exp(-b * x**2) + c # Gaussian form for twilight + +def process_file(args): + """Process a single ATL03 file, skipping files missing required datasets.""" + file, beam, shoreline_data_path = args + try: + with h5py.File(file, 'r') as f: + # Check for required datasets + required_keys = [ + f'{beam}/bckgrd_atlas/bckgrd_rate', + f'{beam}/bckgrd_atlas/delta_time', + f'{beam}/heights/delta_time', + f'{beam}/heights/h_ph', + f'{beam}/geolocation/reference_photon_lat', + f'{beam}/geolocation/reference_photon_lon', + f'{beam}/geolocation/solar_azimuth', + f'{beam}/geolocation/solar_elevation' + ] + for key in required_keys: + if key not in f: + raise KeyError(f"Missing dataset: {key}") + + bckgrd_rate = f[f'{beam}/bckgrd_atlas/bckgrd_rate'][:].astype(np.float64) + bckgrd_time = f[f'{beam}/bckgrd_atlas/delta_time'][:] + heights = f[f'{beam}/heights/h_ph'][:] + segment_lat = f[f'{beam}/geolocation/reference_photon_lat'][:] + segment_lon = f[f'{beam}/geolocation/reference_photon_lon'][:] + segment_delta_time = 
f[f'{beam}/geolocation/delta_time'][:] + + Segment_Is_Land = pd.DataFrame({ + 'latitude': segment_lat, + 'longitude': segment_lon, + 'delta_time': segment_delta_time + }) + + # Create a GeoDataFrame with geometry + Segment_Is_Land['geometry'] = gpd.points_from_xy(Segment_Is_Land['longitude'], Segment_Is_Land['latitude']) + ICESat2_GDF = gpd.GeoDataFrame(Segment_Is_Land, crs="EPSG:4326") + + # Determine land/ocean classification + Segment_Is_Land_Labels = isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF) + ICESat2_GDF['is_land'] = Segment_Is_Land_Labels + + # Interpolate is_land labels to bckgrd_time + interp_is_land = interp1d( + ICESat2_GDF['delta_time'], + ICESat2_GDF['is_land'].astype(int), + bounds_error=False, + fill_value=(ICESat2_GDF['is_land'].astype(int).iloc[0], ICESat2_GDF['is_land'].astype(int).iloc[-1]) + ) + is_land_interpolated = interp_is_land(bckgrd_time).round().astype(int) + + # Compute total background rates for land and ocean + land_mask = is_land_interpolated == 1 + ocean_mask = is_land_interpolated == 0 + total_background_rate_land = np.mean(bckgrd_rate[land_mask]) if np.sum(land_mask) > 0 else np.nan + total_background_rate_ocean = np.mean(bckgrd_rate[ocean_mask]) if np.sum(ocean_mask) > 0 else np.nan + + # Solar elevation handling + FILL_VALUE = 3.4028235e38 + solar_elevations = f[f'{beam}/geolocation/solar_elevation'][:].astype(np.float64) + valid_solar = solar_elevations[(solar_elevations != FILL_VALUE) & (np.abs(solar_elevations) < 1e30) & np.isfinite(solar_elevations)] + solar_elevation = np.nan if valid_solar.size == 0 else np.mean(valid_solar) + + mean_photon_height = np.mean(heights) if heights.size > 0 else np.nan + + result = { + 'file': os.path.basename(file), + 'total_background_rate_land': total_background_rate_land, + 'total_background_rate_ocean': total_background_rate_ocean, + 'solar_elevation': solar_elevation, + 'mean_photon_height': mean_photon_height + } + logger.info(f"Processed {file}: {result}") + return 
result + except Exception as e: + logger.error(f"Error processing {file}: {str(e)}") + return {'file': os.path.basename(file), 'error': str(e)} + +def fit_surface_model(df, classification, surface_type, model_func, p0, bounds, x_col='solar_elevation', y_col='solar_background_rate'): + """Fit a model to a specific classification and surface type.""" + subset = df[(df['classification'] == classification) & (df['surface_type'] == surface_type)].copy() + if len(subset) < 3: + print(f"Warning: Insufficient data for {classification} {surface_type} model.") + return None, None, None + + mask = subset[x_col].notna() & subset[y_col].notna() & np.isfinite(subset[x_col]) & np.isfinite(subset[y_col]) + valid_x = subset.loc[mask, x_col] + valid_y = subset.loc[mask, y_col] + + if len(valid_x) < 3: + print(f"Warning: Fewer than 3 valid points for {classification} {surface_type} model.") + return None, None, None + + try: + popt, _ = curve_fit(model_func, valid_x, valid_y, p0=p0, bounds=bounds, maxfev=20000) + y_pred = model_func(valid_x, *popt) + r2 = 1 - np.sum((valid_y - y_pred)**2) / np.sum((valid_y - np.mean(valid_y))**2) + return popt, r2, lambda x: model_func(x, *popt) + except RuntimeError as e: + print(f"Fit failed for {classification} {surface_type}: {e}") + return None, None, None + +def load_and_process_files(atl03_directory, beam, shoreline_data_path): + """Load and process ATL03 files in parallel.""" + atl03_files = [os.path.join(atl03_directory, f) for f in os.listdir(atl03_directory) if f.endswith('.h5') and 'ATL03' in f] + print(f"Found {len(atl03_files)} ATL03 files.") + + if not atl03_files: + raise ValueError("No ATL03 files found in the directory.") + + with Pool() as pool: + results = pool.map(process_file, [(f, beam, shoreline_data_path) for f in atl03_files]) + + print("Results from process_file:", results) + df = pd.DataFrame(results) + + if 'error' in df.columns: + error_files = df[df['error'].notna()]['file'].tolist() + print(f"Errors in 
{len(error_files)} files: {error_files}") + df = df[df['error'].isna()].drop(columns=['error']) + + if df.empty: + raise ValueError("No files were successfully processed. Check the error messages above.") + + print("DataFrame columns after processing:", df.columns) + return df + +def assign_surface_type(df): + """Assign surface type based on background rates.""" + required_columns = ['total_background_rate_land', 'total_background_rate_ocean'] + missing_columns = [col for col in required_columns if col not in df.columns] + if missing_columns: + raise KeyError(f"Missing required columns in DataFrame: {missing_columns}") + + df['surface_type'] = np.where( + df['total_background_rate_land'].notna() & (df['total_background_rate_land'] > 0), + 'Land', + np.where(df['total_background_rate_ocean'].notna() & (df['total_background_rate_ocean'] > 0), 'Ocean', 'Mixed') + ) + return df + +def classify_data(df, twilight_threshold=6.0): + """Classify data into Daytime, Nighttime, Twilight, or Unknown.""" + df['classification'] = np.where( + df['solar_elevation'].isna(), 'Unknown', + np.where(df['solar_elevation'] > twilight_threshold, 'Daytime', + np.where(df['solar_elevation'] < -twilight_threshold, 'Nighttime', 'Twilight')) + ) + daytime_df = df[df['classification'] == 'Daytime'].copy() + nighttime_df = df[df['classification'] == 'Nighttime'].copy() + twilight_df = df[df['classification'] == 'Twilight'].copy() + unknown_df = df[df['classification'] == 'Unknown'].copy() + print(f"Daytime: {len(daytime_df)}, Nighttime: {len(nighttime_df)}, Twilight: {len(twilight_df)}, Unknown: {len(unknown_df)}") + return df, daytime_df, nighttime_df, twilight_df, unknown_df + +def calculate_non_solar_rate(nighttime_df): + """Calculate the non-solar background rate from nighttime data.""" + non_solar_rate_land = np.nanmean(nighttime_df['total_background_rate_land']) + non_solar_rate_ocean = np.nanmean(nighttime_df['total_background_rate_ocean']) + non_solar_rate = 
np.nanmean([non_solar_rate_land, non_solar_rate_ocean]) + print(f"Non-solar background rate (land): {non_solar_rate_land:.2e} ph/s") + print(f"Non-solar background rate (ocean): {non_solar_rate_ocean:.2e} ph/s") + print(f"Non-solar background rate (combined): {non_solar_rate:.2e} ph/s") + return non_solar_rate + +def compute_solar_background_rates(df, non_solar_rate): + """Compute solar background rates for land and ocean.""" + df['solar_background_rate_land'] = (df['total_background_rate_land'] - non_solar_rate).clip(lower=0) + df['solar_background_rate_ocean'] = (df['total_background_rate_ocean'] - non_solar_rate).clip(lower=0) + df['solar_background_rate'] = np.where( + df['surface_type'] == 'Land', df['solar_background_rate_land'], + np.where(df['surface_type'] == 'Ocean', df['solar_background_rate_ocean'], np.nan) + ) + return df + +def fit_daytime_model(df, daytime_df, non_solar_rate): + """Fit daytime models for land and ocean.""" + daytime_models = {} + for surface_type in ['Land', 'Ocean']: + subset = daytime_df[daytime_df['surface_type'] == surface_type].copy() + if len(subset) < 3: + print(f"Warning: Insufficient daytime {surface_type} data for modeling.") + continue + + mask = subset['solar_elevation'].notna() & subset['solar_background_rate'].notna() & \ + np.isfinite(subset['solar_elevation']) & np.isfinite(subset['solar_background_rate']) + valid_x = subset.loc[mask, 'solar_elevation'] + valid_y = subset.loc[mask, 'solar_background_rate'] + + if len(valid_x) < 3: + print(f"Warning: Fewer than 3 valid points for daytime {surface_type} model.") + continue + + # Remove outliers + log_y = np.log10(valid_y + 1) + mean_log_y = np.mean(log_y) + std_log_y = np.std(log_y) + outlier_mask = (log_y > mean_log_y - 3 * std_log_y) & (log_y < mean_log_y + 3 * std_log_y) + valid_x = valid_x[outlier_mask] + valid_y = valid_y[outlier_mask] + + print(f"Daytime regression ({surface_type}): {len(valid_x)} valid data points") + try: + p0_exp = [np.max(valid_y) - 
non_solar_rate, 0.05, non_solar_rate] + bounds_exp = ([0, 0.01, non_solar_rate * 0.9], [1e8, 0.5, non_solar_rate * 1.1]) + popt_exp, _ = curve_fit(exp_model, valid_x, valid_y, p0=p0_exp, bounds=bounds_exp, maxfev=20000) + r2_exp = 1 - np.sum((valid_y - exp_model(valid_x, *popt_exp))**2) / np.sum((valid_y - np.mean(valid_y))**2) + + x_shifted = valid_x - min(valid_x) + 1 + p0_power = [np.max(valid_y) - non_solar_rate, 0.6, non_solar_rate] + bounds_power = ([0, 0.1, non_solar_rate * 0.9], [1e8, 1.0, non_solar_rate * 1.1]) + popt_power, _ = curve_fit(power_model, x_shifted, valid_y, p0=p0_power, bounds=bounds_power, maxfev=20000) + r2_power = 1 - np.sum((valid_y - power_model(x_shifted, *popt_power))**2) / np.sum((valid_y - np.mean(valid_y))**2) + + if r2_power > r2_exp: + print(f"Daytime model ({surface_type}, power law): rate = {popt_power[0]:.2e} * solar_elevation^{popt_power[1]:.2f} + {popt_power[2]:.2e}, R² = {r2_power:.3f}") + daytime_models[surface_type] = {'params': popt_power, 'r2': r2_power, 'model': lambda x: power_model(x - min(valid_x) + 1, *popt_power)} + else: + print(f"Daytime model ({surface_type}, exponential): rate = {popt_exp[0]:.2e} * exp({popt_exp[1]:.2f} * solar_elevation) + {popt_exp[2]:.2e}, R² = {r2_exp:.3f}") + daytime_models[surface_type] = {'params': popt_exp, 'r2': r2_exp, 'model': lambda x: exp_model(x, *popt_exp)} + + # Apply model + daytime_mask = (df['classification'] == 'Daytime') & (df['surface_type'] == surface_type) + valid_daytime_mask = daytime_mask & df['solar_elevation'].notna() & np.isfinite(df['solar_elevation']) + x_for_model = df.loc[valid_daytime_mask, 'solar_elevation'] + if r2_power > r2_exp: + x_for_model_shifted = x_for_model - min(valid_x) + 1 + df.loc[valid_daytime_mask, 'solar_background_rate'] = daytime_models[surface_type]['model'](x_for_model_shifted).clip(lower=0) + else: + df.loc[valid_daytime_mask, 'solar_background_rate'] = daytime_models[surface_type]['model'](x_for_model).clip(lower=0) + 
df.loc[daytime_mask & ~valid_daytime_mask, 'solar_background_rate'] = non_solar_rate + except RuntimeError as e: + print(f"Daytime model fitting failed for {surface_type}: {e}. Using non-solar rate as fallback.") + df.loc[(df['classification'] == 'Daytime') & (df['surface_type'] == surface_type), 'solar_background_rate'] = non_solar_rate + return df, daytime_models + +def fit_twilight_model(df, twilight_df, non_solar_rate): + """Fit twilight models for land and ocean.""" + twilight_models = {} + if len(twilight_df) > 3: + df.loc[df['classification'] == 'Twilight', 'solar_background_rate'] = ( + df['solar_background_rate'].fillna(0).clip(lower=0) + ) + for surface_type in ['Land', 'Ocean']: + subset = twilight_df[twilight_df['surface_type'] == surface_type].copy() + if len(subset) < 3: + print(f"Warning: Insufficient twilight {surface_type} data for modeling.") + continue + + mask = subset['solar_elevation'].notna() & subset['solar_background_rate'].notna() & \ + np.isfinite(subset['solar_elevation']) & np.isfinite(subset['solar_background_rate']) + valid_twilight_x = subset.loc[mask, 'solar_elevation'] + valid_twilight_y = subset.loc[mask, 'solar_background_rate'] + + if len(valid_twilight_x) < 3: + print(f"Warning: Fewer than 3 valid points for twilight {surface_type} model.") + continue + + print(f"Twilight regression ({surface_type}): {len(valid_twilight_x)} valid data points") + try: + initial_a = np.max(valid_twilight_y) - non_solar_rate + initial_b = 0.01 + initial_c = non_solar_rate + p0 = [initial_a, initial_b, initial_c] + bounds = ([0, 0.001, non_solar_rate * 0.9], [1e6, 1.0, non_solar_rate * 1.1]) + + popt, _ = curve_fit(twilight_model, valid_twilight_x, valid_twilight_y, p0=p0, bounds=bounds, maxfev=50000) + y_pred = twilight_model(valid_twilight_x, *popt) + r_squared = 1 - np.sum((valid_twilight_y - y_pred)**2) / np.sum((valid_twilight_y - np.mean(valid_twilight_y))**2) + + if r_squared < 0: + mean_rate = np.mean(valid_twilight_y) + 
twilight_models[surface_type] = { + 'params': [mean_rate], + 'r2': 0.0, + 'model': lambda x: np.full_like(x, mean_rate) + } + print(f"Twilight model ({surface_type}) failed with R² = {r_squared:.3f}. Using constant model: {mean_rate:.2e} ph/s") + + twilight_mask = (df['classification'] == 'Twilight') & (df['surface_type'] == surface_type) + valid_twilight_mask = twilight_mask & df['solar_elevation'].notna() & np.isfinite(df['solar_elevation']) + df.loc[valid_twilight_mask, 'solar_background_rate'] = mean_rate + df.loc[twilight_mask & ~valid_twilight_mask, 'solar_background_rate'] = non_solar_rate + else: + print(f"Twilight model ({surface_type}): rate = {popt[0]:.2e} * exp(-{popt[1]:.2f} * solar_elevation^2) + {popt[2]:.2e}, R² = {r_squared:.3f}") + twilight_models[surface_type] = {'params': popt, 'r2': r_squared, 'model': lambda x: twilight_model(x, *popt)} + + twilight_mask = (df['classification'] == 'Twilight') & (df['surface_type'] == surface_type) + valid_twilight_mask = twilight_mask & df['solar_elevation'].notna() & np.isfinite(df['solar_elevation']) + x_for_model = df.loc[valid_twilight_mask, 'solar_elevation'] + df.loc[valid_twilight_mask, 'solar_background_rate'] = twilight_models[surface_type]['model'](x_for_model).clip(lower=0) + df.loc[twilight_mask & ~valid_twilight_mask, 'solar_background_rate'] = non_solar_rate + except RuntimeError as e: + print(f"Twilight model fitting failed for {surface_type}: {e}. Setting twilight rates to non-solar rate.") + df.loc[(df['classification'] == 'Twilight') & (df['surface_type'] == surface_type), 'solar_background_rate'] = non_solar_rate + else: + print("Too few twilight points for modeling. 
Setting twilight rates to non-solar rate.") + df.loc[df['classification'] == 'Twilight', 'solar_background_rate'] = non_solar_rate + return df, twilight_models + +def create_diagnostic_plot(df, daytime_df, twilight_df, daytime_models, twilight_models, non_solar_rate, output_dir): + """Create diagnostic plot of background rate vs. solar elevation.""" + plt.figure(figsize=(10, 6)) + plt.scatter(df['solar_elevation'], df['total_background_rate'], c='gray', alpha=0.5, label='Total') + plt.scatter(daytime_df[daytime_df['surface_type'] == 'Land']['solar_elevation'], + daytime_df[daytime_df['surface_type'] == 'Land']['solar_background_rate'], + c='blue', alpha=0.5, label='Daytime Land') + plt.scatter(daytime_df[daytime_df['surface_type'] == 'Ocean']['solar_elevation'], + daytime_df[daytime_df['surface_type'] == 'Ocean']['solar_background_rate'], + c='cyan', alpha=0.5, label='Daytime Ocean') + plt.scatter(twilight_df[twilight_df['surface_type'] == 'Land']['solar_elevation'], + twilight_df[twilight_df['surface_type'] == 'Land']['solar_background_rate'], + c='orange', alpha=0.5, label='Twilight Land') + plt.scatter(twilight_df[twilight_df['surface_type'] == 'Ocean']['solar_elevation'], + twilight_df[twilight_df['surface_type'] == 'Ocean']['solar_background_rate'], + c='yellow', alpha=0.5, label='Twilight Ocean') + plt.axhline(non_solar_rate, color='r', linestyle='--', label=f'Non-Solar: {non_solar_rate:.2e} ph/s') + + # Plot daytime fits + for surface_type in daytime_models: + if daytime_models[surface_type]['model'] is not None: + x_fit = np.linspace(min(daytime_df['solar_elevation']), max(daytime_df['solar_elevation']), 100) + y_fit = daytime_models[surface_type]['model'](x_fit if 'power' not in daytime_models[surface_type] else x_fit - min(daytime_df['solar_elevation']) + 1) + plt.plot(x_fit, y_fit, label=f'Daytime {surface_type} Fit (R²={daytime_models[surface_type]["r2"]:.3f})', + linestyle='-' if surface_type == 'Land' else '--') + + # Plot twilight fits + for 
surface_type in twilight_models: + if twilight_models[surface_type]['model'] is not None: + x_twilight = np.linspace(min(twilight_df['solar_elevation']), max(twilight_df['solar_elevation']), 100) + y_twilight = twilight_models[surface_type]['model'](x_twilight) + plt.plot(x_twilight, y_twilight, label=f'Twilight {surface_type} Fit (R²={twilight_models[surface_type]["r2"]:.3f})', + linestyle='-' if surface_type == 'Land' else '--') + + plt.yscale('log') + plt.ylim(1e3, 1e8) + plt.xlabel('Solar Elevation (degrees)') + plt.ylabel('Background Rate (ph/s)') + plt.legend() + plt.title('Background Rate vs. Solar Elevation (Land vs. Ocean)') + plt.savefig(os.path.join(output_dir, "background_vs_elevation_exp_fit_2021.png")) + plt.close() + +def create_calibration_plot(df, daytime_df, output_dir): + """Create a calibration plot using mean photon height as a proxy.""" + if not daytime_df.empty: + sample_file = daytime_df.iloc[0] + heights = sample_file['mean_photon_height'] + if np.isfinite(heights): + solar_rate = sample_file['solar_background_rate'] + bin_size = 0.1 + hist, edges = np.histogram(np.array([heights] * 1000), bins=np.arange(min(heights) - 1, max(heights) + 1, bin_size)) + photons_per_bin = solar_rate * (1000 / np.sum(hist)) * bin_size / (max(heights) - min(heights) + 2) + calibrated_counts = np.maximum(hist - photons_per_bin, 0) + calibrated_heights = [] + for i, count in enumerate(calibrated_counts): + if count > 0: + calibrated_heights.extend(np.random.uniform(edges[i], edges[i+1], int(count))) + + plt.figure(figsize=(8, 6)) + plt.hist([heights] * 1000, bins=100, alpha=0.5, label='Original (Simulated)') + plt.hist(calibrated_heights, bins=100, alpha=0.5, label='Calibrated') + plt.xlabel('Photon Height (m)') + plt.ylabel('Count') + plt.legend() + plt.title(f'Calibration Example: {sample_file["file"]}') + plt.savefig(os.path.join(output_dir, "calibration_example.png")) + plt.close() + +def save_results(df, output_dir): + """Save the results to a CSV file.""" + 
df.to_csv(os.path.join(output_dir, "atl03_analysis_updated.csv"), index=False) + +# New function to calculate averages +def calculate_background_averages(daytime_df, nighttime_df): + """Calculate average background rates for daytime and nighttime, land and ocean.""" + # Calculate averages, ignoring NaN values + daytime_land_avg = np.nanmean(daytime_df['total_background_rate_land']) + daytime_ocean_avg = np.nanmean(daytime_df['total_background_rate_ocean']) + nighttime_land_avg = np.nanmean(nighttime_df['total_background_rate_land']) + nighttime_ocean_avg = np.nanmean(nighttime_df['total_background_rate_ocean']) + + # Log the averages + logger.info(f"Average Daytime Land Background Rate: {daytime_land_avg:.2e} ph/s") + logger.info(f"Average Daytime Ocean Background Rate: {daytime_ocean_avg:.2e} ph/s") + logger.info(f"Average Nighttime Land Background Rate: {nighttime_land_avg:.2e} ph/s") + logger.info(f"Average Nighttime Ocean Background Rate: {nighttime_ocean_avg:.2e} ph/s") + + return { + 'daytime_land': daytime_land_avg, + 'daytime_ocean': daytime_ocean_avg, + 'nighttime_land': nighttime_land_avg, + 'nighttime_ocean': nighttime_ocean_avg + } + +# New function to plot averages +def plot_background_averages(averages, output_dir): + """Plot the average background rates for daytime and nighttime, land and ocean.""" + labels = ['Daytime Land', 'Daytime Ocean', 'Nighttime Land', 'Nighttime Ocean'] + rates = [ + averages['daytime_land'] if not pd.isna(averages['daytime_land']) else 0, + averages['daytime_ocean'] if not pd.isna(averages['daytime_ocean']) else 0, + averages['nighttime_land'] if not pd.isna(averages['nighttime_land']) else 0, + averages['nighttime_ocean'] if not pd.isna(averages['nighttime_ocean']) else 0 + ] + + plt.figure(figsize=(8, 6)) + bars = plt.bar(labels, rates, color=['blue', 'cyan', 'purple', 'lightgreen']) + + plt.ylabel('Average Background Rate (ph/s)') + plt.title('Average Background Rate Comparison\n(Daytime vs. Nighttime, Land vs. 
def analyze_icesat2_data(atl03_directory, beam='gt1l', twilight_threshold=6.0, output_dir="output", shoreline_data_path=None):
    """Run the land/ocean average-background-rate workflow.

    Steps: create ``output_dir``, build the per-file table with
    ``load_and_process_files``, label rows with ``assign_surface_type``,
    derive ``total_background_rate`` from the matching land/ocean column,
    split rows with ``classify_data``, then compute and plot the averages.

    Args:
        atl03_directory: Directory passed to ``load_and_process_files``.
        beam: Beam group name passed to ``load_and_process_files``.
        twilight_threshold: Threshold passed to ``classify_data``.
        output_dir: Directory for generated plots; created if missing.
        shoreline_data_path: Path passed to ``load_and_process_files``.

    Returns:
        tuple: ``(df, averages)``, the processed table and the dict from
        ``calculate_background_averages``. Callers that ignore the return
        value are unaffected.

    Raises:
        ValueError: If no file produced a usable row.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    df = load_and_process_files(atl03_directory, beam, shoreline_data_path)
    if df.empty:
        raise ValueError("No data processed successfully. Check logs for errors.")

    df = assign_surface_type(df)

    # Pick the rate column that matches each row's surface label; rows that
    # are neither 'Land' nor 'Ocean' get NaN.
    df['total_background_rate'] = np.where(
        df['surface_type'] == 'Land', df['total_background_rate_land'],
        np.where(df['surface_type'] == 'Ocean', df['total_background_rate_ocean'], np.nan)
    )

    # Only the daytime and nighttime subsets feed the averages.
    df, daytime_df, nighttime_df, _twilight_df, _unknown_df = classify_data(df, twilight_threshold)

    averages = calculate_background_averages(daytime_df, nighttime_df)
    plot_background_averages(averages, output_dir)
    return df, averages
def power_model(x, a, b, c):
    """Power-law model: ``a * x**b + c``."""
    raised = np.power(x, b)
    return raised * a + c


def exp_model(x, a, b, c):
    """Exponential model: ``a * exp(b * x) + c``."""
    growth = np.exp(x * b)
    return growth * a + c


def twilight_model(x, a, b, c):
    """Gaussian-shaped twilight model: ``a * exp(-b * x**2) + c``."""
    exponent = -b * x ** 2
    return a * np.exp(exponent) + c
Segment_Is_Land = pd.DataFrame({ + 'latitude': segment_lat, # Use unique column names to avoid conflicts + 'longitude': segment_lon, + 'delta_time': segment_delta_time + }) + + # Create a GeoDataFrame with geometry + Segment_Is_Land['geometry'] = gpd.points_from_xy(Segment_Is_Land['longitude'], Segment_Is_Land['latitude']) + ICESat2_GDF = gpd.GeoDataFrame(Segment_Is_Land, crs="EPSG:4326") + + # Determine land/ocean classification + Segment_Is_Land_Labels = isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF) + ICESat2_GDF['is_land'] = Segment_Is_Land_Labels + + # Interpolate is_land labels to bckgrd_time + interp_is_land = interp1d( + ICESat2_GDF['delta_time'], + ICESat2_GDF['is_land'].astype(int), + bounds_error=False, + fill_value=(ICESat2_GDF['is_land'].astype(int).iloc[0], ICESat2_GDF['is_land'].astype(int).iloc[-1]) + ) + is_land_interpolated = interp_is_land(bckgrd_time).round().astype(int) + + # ##########Debug################################# + # # Interpolate latitude and longitude to bckgrd_time + # interp_lat = interp1d( + # ICESat2_GDF['delta_time'], + # ICESat2_GDF['latitude'], + # bounds_error=False, + # fill_value=(ICESat2_GDF['latitude'].iloc[0], ICESat2_GDF['latitude'].iloc[-1]) + # ) + # interp_lon = interp1d( + # ICESat2_GDF['delta_time'], + # ICESat2_GDF['longitude'], + # bounds_error=False, + # fill_value=(ICESat2_GDF['longitude'].iloc[0], ICESat2_GDF['longitude'].iloc[-1]) + # ) + # interpolated_lat = interp_lat(bckgrd_time) + # interpolated_lon = interp_lon(bckgrd_time) + + # # Create a GeoDataFrame for is_land_interpolated with lat/lon + # is_land_df = pd.DataFrame({ + # 'delta_time': bckgrd_time, + # 'latitude': interpolated_lat, + # 'longitude': interpolated_lon, + # 'is_land': is_land_interpolated + # }) + # is_land_gdf = gpd.GeoDataFrame( + # is_land_df, + # geometry=gpd.points_from_xy(is_land_df['longitude'], is_land_df['latitude']), + # crs="EPSG:4326" + # ) + + # # Save to a GeoPackage file + # file_dir = os.path.dirname(file) + 
def fit_surface_model(df, classification, surface_type, model_func, p0, bounds, x_col='solar_elevation', y_col='solar_background_rate'):
    """Fit ``model_func`` to the rows of one classification and surface type.

    Args:
        df: DataFrame with ``classification`` and ``surface_type`` columns
            plus the ``x_col`` and ``y_col`` columns.
        classification: Value matched against ``df['classification']``.
        surface_type: Value matched against ``df['surface_type']``.
        model_func: Callable ``f(x, *params)`` handed to ``curve_fit``.
        p0: Initial parameter guess.
        bounds: ``(lower, upper)`` parameter bounds for ``curve_fit``.
        x_col: Name of the independent-variable column.
        y_col: Name of the dependent-variable column.

    Returns:
        tuple: ``(popt, r2, model)`` where ``model(x)`` evaluates the fitted
        curve, or ``(None, None, None)`` when fewer than three usable points
        exist or the fit fails. ``r2`` is NaN when all y values are equal.
    """
    selected = df[(df['classification'] == classification) & (df['surface_type'] == surface_type)]
    if len(selected) < 3:
        print(f"Warning: Insufficient data for {classification} {surface_type} model.")
        return None, None, None

    # Plain float arrays: np.isfinite then covers NaN and +/-inf in one test.
    x_all = selected[x_col].to_numpy(dtype=float)
    y_all = selected[y_col].to_numpy(dtype=float)
    finite = np.isfinite(x_all) & np.isfinite(y_all)
    valid_x = x_all[finite]
    valid_y = y_all[finite]

    if valid_x.size < 3:
        print(f"Warning: Fewer than 3 valid points for {classification} {surface_type} model.")
        return None, None, None

    try:
        popt, _ = curve_fit(model_func, valid_x, valid_y, p0=p0, bounds=bounds, maxfev=20000)
    except (RuntimeError, ValueError) as e:
        # RuntimeError: the optimiser did not converge. ValueError: p0 lies
        # outside the bounds, the bounds are inconsistent, or the residuals
        # are not finite at the starting point.
        print(f"Fit failed for {classification} {surface_type}: {e}")
        return None, None, None

    residual_ss = np.sum((valid_y - model_func(valid_x, *popt)) ** 2)
    total_ss = np.sum((valid_y - np.mean(valid_y)) ** 2)
    # R² is undefined for constant y; report NaN instead of dividing by zero.
    r2 = 1 - residual_ss / total_ss if total_ss > 0 else np.nan
    return popt, r2, lambda x: model_func(x, *popt)
def classify_data(df, twilight_threshold=6.0):
    """Label each row by solar elevation and split the table by label.

    A ``classification`` column is written to ``df`` in place: 'Unknown' for
    a missing elevation, 'Daytime' above ``twilight_threshold``, 'Nighttime'
    below ``-twilight_threshold``, and 'Twilight' otherwise (both threshold
    values included).

    Returns:
        tuple: ``(df, daytime_df, nighttime_df, twilight_df, unknown_df)``;
        the four subsets are independent copies.
    """
    elevation = df['solar_elevation']
    # Conditions are tested in order, so a missing elevation wins first.
    rules = [
        (elevation.isna(), 'Unknown'),
        (elevation > twilight_threshold, 'Daytime'),
        (elevation < -twilight_threshold, 'Nighttime'),
    ]
    df['classification'] = np.select(
        [condition for condition, _ in rules],
        [label for _, label in rules],
        default='Twilight',
    )

    subsets = {
        label: df[df['classification'] == label].copy()
        for label in ('Daytime', 'Nighttime', 'Twilight', 'Unknown')
    }
    daytime_df = subsets['Daytime']
    nighttime_df = subsets['Nighttime']
    twilight_df = subsets['Twilight']
    unknown_df = subsets['Unknown']

    print(f"Daytime: {len(daytime_df)}, Nighttime: {len(nighttime_df)}, Twilight: {len(twilight_df)}, Unknown: {len(unknown_df)}")
    return df, daytime_df, nighttime_df, twilight_df, unknown_df
def fit_daytime_model(df, daytime_df, non_solar_rate):
    """Fit daytime background-rate models for land and ocean.

    For each surface type an exponential and a power-law model are fitted to
    ``solar_background_rate`` against ``solar_elevation``; the one with the
    higher R² is kept. The chosen model's predictions (clipped at zero) then
    overwrite ``solar_background_rate`` for that surface's daytime rows in
    ``df``. Daytime rows without a finite elevation, and all daytime rows of
    a surface whose fit fails, are set to ``non_solar_rate``.

    Args:
        df: Full table; modified in place.
        daytime_df: Daytime rows used for fitting.
        non_solar_rate: Non-solar background rate; seeds the offset term and
            its bounds (within ±10 %).

    Returns:
        tuple: ``(df, daytime_models)``. ``daytime_models`` maps 'Land' and
        'Ocean' to ``{'params', 'r2', 'model'}``; ``model`` takes raw solar
        elevations. Surfaces with too little data are absent.
    """
    daytime_models = {}
    for surface_type in ['Land', 'Ocean']:
        subset = daytime_df[daytime_df['surface_type'] == surface_type]
        if len(subset) < 3:
            print(f"Warning: Insufficient daytime {surface_type} data for modeling.")
            continue

        mask = subset['solar_elevation'].notna() & subset['solar_background_rate'].notna() & \
               np.isfinite(subset['solar_elevation']) & np.isfinite(subset['solar_background_rate'])
        valid_x = subset.loc[mask, 'solar_elevation']
        valid_y = subset.loc[mask, 'solar_background_rate']

        if len(valid_x) < 3:
            print(f"Warning: Fewer than 3 valid points for daytime {surface_type} model.")
            continue

        # Drop points more than 3 sigma from the mean in log space. With zero
        # spread the strict comparison would reject every point, so the
        # filter is skipped in that case.
        log_y = np.log10(valid_y + 1)
        mean_log_y = np.mean(log_y)
        std_log_y = np.std(log_y)
        if std_log_y > 0:
            inlier_mask = (log_y > mean_log_y - 3 * std_log_y) & (log_y < mean_log_y + 3 * std_log_y)
            valid_x = valid_x[inlier_mask]
            valid_y = valid_y[inlier_mask]
        if len(valid_x) < 3:
            print(f"Warning: Fewer than 3 valid points for daytime {surface_type} model.")
            continue

        daytime_mask = (df['classification'] == 'Daytime') & (df['surface_type'] == surface_type)
        # The power law is fitted on elevations shifted so the smallest is 1.
        x_offset = min(valid_x)

        print(f"Daytime regression ({surface_type}): {len(valid_x)} valid data points")
        try:
            p0_exp = [np.max(valid_y) - non_solar_rate, 0.05, non_solar_rate]
            bounds_exp = ([0, 0.01, non_solar_rate * 0.9], [1e8, 0.5, non_solar_rate * 1.1])
            popt_exp, _ = curve_fit(exp_model, valid_x, valid_y, p0=p0_exp, bounds=bounds_exp, maxfev=20000)

            x_shifted = valid_x - x_offset + 1
            p0_power = [np.max(valid_y) - non_solar_rate, 0.6, non_solar_rate]
            bounds_power = ([0, 0.1, non_solar_rate * 0.9], [1e8, 1.0, non_solar_rate * 1.1])
            popt_power, _ = curve_fit(power_model, x_shifted, valid_y, p0=p0_power, bounds=bounds_power, maxfev=20000)
        except (RuntimeError, ValueError) as e:
            # ValueError covers an initial guess outside the bounds and
            # degenerate bounds (e.g. a zero or NaN non_solar_rate).
            print(f"Daytime model fitting failed for {surface_type}: {e}. Using non-solar rate as fallback.")
            df.loc[daytime_mask, 'solar_background_rate'] = non_solar_rate
            continue

        total_ss = np.sum((valid_y - np.mean(valid_y))**2)
        r2_exp = 1 - np.sum((valid_y - exp_model(valid_x, *popt_exp))**2) / total_ss
        r2_power = 1 - np.sum((valid_y - power_model(x_shifted, *popt_power))**2) / total_ss

        # Fitted values are bound as default arguments so each callable keeps
        # its own surface's parameters; a plain closure would see whatever
        # the next loop iteration assigns.
        if r2_power > r2_exp:
            print(f"Daytime model ({surface_type}, power law): rate = {popt_power[0]:.2e} * solar_elevation^{popt_power[1]:.2f} + {popt_power[2]:.2e}, R² = {r2_power:.3f}")
            daytime_models[surface_type] = {
                'params': popt_power,
                'r2': r2_power,
                'model': lambda x, _p=tuple(popt_power), _x0=x_offset: power_model(x - _x0 + 1, *_p),
            }
        else:
            print(f"Daytime model ({surface_type}, exponential): rate = {popt_exp[0]:.2e} * exp({popt_exp[1]:.2f} * solar_elevation) + {popt_exp[2]:.2e}, R² = {r2_exp:.3f}")
            daytime_models[surface_type] = {
                'params': popt_exp,
                'r2': r2_exp,
                'model': lambda x, _p=tuple(popt_exp): exp_model(x, *_p),
            }

        # Apply the model. It takes raw elevations (the power-law callable
        # applies the offset itself), so x is never shifted here.
        valid_daytime_mask = daytime_mask & df['solar_elevation'].notna() & np.isfinite(df['solar_elevation'])
        x_for_model = df.loc[valid_daytime_mask, 'solar_elevation']
        predicted = daytime_models[surface_type]['model'](x_for_model)
        df.loc[valid_daytime_mask, 'solar_background_rate'] = predicted.clip(lower=0)
        df.loc[daytime_mask & ~valid_daytime_mask, 'solar_background_rate'] = non_solar_rate
    return df, daytime_models
def create_diagnostic_plot(df, daytime_df, twilight_df, nighttime_df, daytime_models, twilight_models, non_solar_rate, output_dir):
    """Create diagnostic plot of background rate vs. solar elevation, including nighttime comparison.

    Draws the total rate for every row, per-surface scatters for daytime,
    twilight and nighttime rows, a horizontal line at ``non_solar_rate``,
    and every fitted daytime/twilight curve, then saves the figure as
    ``background_vs_elevation_with_nighttime_2021.png`` in ``output_dir``.

    Args:
        df: Full table with ``solar_elevation`` and ``total_background_rate``.
        daytime_df: Daytime rows (``solar_background_rate`` is plotted).
        twilight_df: Twilight rows (``solar_background_rate`` is plotted).
        nighttime_df: Nighttime rows (``total_background_rate`` is plotted).
        daytime_models: Mapping of surface type to a dict with ``'model'``
            and ``'r2'`` entries.
        twilight_models: Same structure for twilight fits.
        non_solar_rate: Non-solar background rate for the reference line.
        output_dir: Directory that receives the PNG.
    """
    plt.figure(figsize=(10, 6))

    # Total background rate (all data)
    plt.scatter(df['solar_elevation'], df['total_background_rate'], c='gray', alpha=0.5, label='Total')

    # (rows, legend prefix, rate column, colour per surface). Nighttime uses
    # total_background_rate since the solar contribution is minimal.
    scatter_specs = [
        (daytime_df, 'Daytime', 'solar_background_rate', {'Land': 'blue', 'Ocean': 'cyan'}),
        (twilight_df, 'Twilight', 'solar_background_rate', {'Land': 'orange', 'Ocean': 'yellow'}),
        (nighttime_df, 'Nighttime', 'total_background_rate', {'Land': 'purple', 'Ocean': 'lightgreen'}),
    ]
    for frame, period, rate_col, colors in scatter_specs:
        for surface_type, color in colors.items():
            rows = frame[frame['surface_type'] == surface_type]
            plt.scatter(rows['solar_elevation'], rows[rate_col],
                        c=color, alpha=0.5, label=f'{period} {surface_type}')

    # Non-solar background rate line
    plt.axhline(non_solar_rate, color='r', linestyle='--', label=f'Non-Solar: {non_solar_rate:.2e} ph/s')

    # Fitted curves. Each stored model takes raw solar elevations (the
    # power-law callable applies its own offset), so x is passed unshifted.
    fit_specs = [
        (daytime_models, daytime_df, 'Daytime'),
        (twilight_models, twilight_df, 'Twilight'),
    ]
    for models, frame, period in fit_specs:
        for surface_type, entry in models.items():
            if entry['model'] is None:
                continue
            x_fit = np.linspace(min(frame['solar_elevation']), max(frame['solar_elevation']), 100)
            plt.plot(x_fit, entry['model'](x_fit),
                     label=f'{period} {surface_type} Fit (R²={entry["r2"]:.3f})',
                     linestyle='-' if surface_type == 'Land' else '--')

    # Set plot properties
    plt.yscale('log')
    plt.ylim(1e3, 1e8)
    plt.xlabel('Solar Elevation (degrees)')
    plt.ylabel('Background Rate (ph/s)')
    plt.legend()
    plt.title('Background Rate vs. Solar Elevation (Land vs. Ocean) with Nighttime Comparison')

    # Save the plot
    plt.savefig(os.path.join(output_dir, "background_vs_elevation_with_nighttime_2021.png"))
    plt.close()
def analyze_icesat2_data(atl03_directory, beam='gt1l', twilight_threshold=6.0, output_dir="output", shoreline_data_path=None):
    """Run the full background-rate workflow and return the results.

    Steps: create ``output_dir``, build the per-file table, assign surface
    types, classify rows by solar elevation, estimate the non-solar rate
    from nighttime rows, derive solar background rates, fit the daytime and
    twilight models, save the CSV, and draw the diagnostic plot.

    Args:
        atl03_directory: Directory passed to ``load_and_process_files``.
        beam: Beam group name passed to ``load_and_process_files``.
        twilight_threshold: Threshold passed to ``classify_data``.
        output_dir: Directory for the CSV and plot; created if missing.
        shoreline_data_path: Path passed to ``load_and_process_files``.

    Returns:
        tuple: ``(df, models)`` where ``models`` has the keys
        ``daytime_models``, ``twilight_models`` and ``non_solar_rate``.

    Raises:
        ValueError: If no file produced a usable row.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    df = load_and_process_files(atl03_directory, beam, shoreline_data_path)
    if df.empty:
        raise ValueError("No data processed successfully. Check logs for errors.")

    df = assign_surface_type(df)

    # Compute total_background_rate based on surface_type
    df['total_background_rate'] = np.where(
        df['surface_type'] == 'Land', df['total_background_rate_land'],
        np.where(df['surface_type'] == 'Ocean', df['total_background_rate_ocean'], np.nan)
    )

    # Only the nighttime subset is needed at this point.
    df, _, nighttime_df, _, _ = classify_data(df, twilight_threshold)

    non_solar_rate = calculate_non_solar_rate(nighttime_df)
    df = compute_solar_background_rates(df, non_solar_rate)

    # Re-split after compute_solar_background_rates so the per-class frames
    # carry the new solar-rate columns.
    daytime_df = df[df['classification'] == 'Daytime'].copy()
    nighttime_df = df[df['classification'] == 'Nighttime'].copy()
    twilight_df = df[df['classification'] == 'Twilight'].copy()

    df, daytime_models = fit_daytime_model(df, daytime_df, non_solar_rate)
    df, twilight_models = fit_twilight_model(df, twilight_df, non_solar_rate)
    save_results(df, output_dir)

    create_diagnostic_plot(df, daytime_df, twilight_df, nighttime_df, daytime_models, twilight_models, non_solar_rate, output_dir)

    return df, {
        'daytime_models': daytime_models,
        'twilight_models': twilight_models,
        'non_solar_rate': non_solar_rate
    }
def process_file(args):
    """Process a single ATL03 file, skipping files missing required datasets.

    Reads the beam's background rates, photon heights, reference-photon
    positions and solar elevations, labels each geolocation segment as land
    or ocean, carries those labels onto the background-rate time base, and
    averages the background rate separately over land and ocean.

    Args:
        args: Tuple ``(file, beam, shoreline_data_path)``; a single argument
            so the function can be used with ``Pool.map``.

    Returns:
        dict: On success, the keys ``file``, ``total_background_rate_land``,
        ``total_background_rate_ocean``, ``solar_elevation`` and
        ``mean_photon_height`` (NaN where nothing could be averaged). On any
        failure, ``{'file': ..., 'error': ...}``.
    """
    file, beam, shoreline_data_path = args
    try:
        with h5py.File(file, 'r') as f:
            # Exactly the datasets read below, so a file is rejected only for
            # data this function actually needs.
            required_keys = [
                f'{beam}/bckgrd_atlas/bckgrd_rate',
                f'{beam}/bckgrd_atlas/delta_time',
                f'{beam}/heights/h_ph',
                f'{beam}/geolocation/reference_photon_lat',
                f'{beam}/geolocation/reference_photon_lon',
                f'{beam}/geolocation/delta_time',
                f'{beam}/geolocation/solar_elevation'
            ]
            for key in required_keys:
                if key not in f:
                    raise KeyError(f"Missing dataset: {key}")

            bckgrd_rate = f[f'{beam}/bckgrd_atlas/bckgrd_rate'][:].astype(np.float64)
            bckgrd_time = f[f'{beam}/bckgrd_atlas/delta_time'][:]
            heights = f[f'{beam}/heights/h_ph'][:]
            segment_lat = f[f'{beam}/geolocation/reference_photon_lat'][:]
            segment_lon = f[f'{beam}/geolocation/reference_photon_lon'][:]
            segment_delta_time = f[f'{beam}/geolocation/delta_time'][:]
            solar_elevations = f[f'{beam}/geolocation/solar_elevation'][:].astype(np.float64)

        Segment_Is_Land = pd.DataFrame({
            'latitude': segment_lat,  # Use unique column names to avoid conflicts
            'longitude': segment_lon,
            'delta_time': segment_delta_time
        })

        # Create a GeoDataFrame with geometry
        Segment_Is_Land['geometry'] = gpd.points_from_xy(Segment_Is_Land['longitude'], Segment_Is_Land['latitude'])
        ICESat2_GDF = gpd.GeoDataFrame(Segment_Is_Land, crs="EPSG:4326")

        # Determine land/ocean classification.
        # NOTE(review): presumably one boolean-like land flag per row; verify
        # against isolate_sea_land_photons.
        Segment_Is_Land_Labels = isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF)
        ICESat2_GDF['is_land'] = Segment_Is_Land_Labels

        # Interpolate is_land labels to bckgrd_time. Times outside the
        # segment range take the first/last label; rounding snaps the
        # interpolated value back to 0 or 1.
        is_land_int = ICESat2_GDF['is_land'].astype(int)
        interp_is_land = interp1d(
            ICESat2_GDF['delta_time'],
            is_land_int,
            bounds_error=False,
            fill_value=(is_land_int.iloc[0], is_land_int.iloc[-1])
        )
        is_land_interpolated = interp_is_land(bckgrd_time).round().astype(int)

        # Compute total background rates for land and ocean
        land_mask = is_land_interpolated == 1
        ocean_mask = is_land_interpolated == 0
        total_background_rate_land = np.mean(bckgrd_rate[land_mask]) if land_mask.any() else np.nan
        total_background_rate_ocean = np.mean(bckgrd_rate[ocean_mask]) if ocean_mask.any() else np.nan

        # Solar elevation handling: drop the fill marker (float32 max), any
        # other huge magnitude, and non-finite values before averaging.
        FILL_VALUE = 3.4028235e38
        valid_solar = solar_elevations[(solar_elevations != FILL_VALUE) & (np.abs(solar_elevations) < 1e30) & np.isfinite(solar_elevations)]
        solar_elevation = np.nan if valid_solar.size == 0 else np.mean(valid_solar)

        mean_photon_height = np.mean(heights) if heights.size > 0 else np.nan

        result = {
            'file': os.path.basename(file),
            'total_background_rate_land': total_background_rate_land,
            'total_background_rate_ocean': total_background_rate_ocean,
            'solar_elevation': solar_elevation,
            'mean_photon_height': mean_photon_height
        }
        logger.info(f"Processed {file}: {result}")
        return result
    except Exception as e:
        # Worker boundary: one bad file must not abort the batch, so the
        # failure is logged with its traceback and returned as a record.
        logger.exception("Error processing %s: %s", file, e)
        return {'file': os.path.basename(file), 'error': str(e)}
def assign_surface_type(df):
    """
    Label every row of df as 'Land', 'Ocean' or 'Mixed' from its background rates.

    A row is 'Land' when its land rate is present and positive, otherwise 'Ocean' when its
    ocean rate is present and positive, otherwise 'Mixed'. The label is written to a new
    'surface_type' column on df itself, and the same DataFrame is returned.

    Raises:
    KeyError: if either background-rate column is missing.
    """
    land_col = 'total_background_rate_land'
    ocean_col = 'total_background_rate_ocean'
    missing_columns = [name for name in (land_col, ocean_col) if name not in df.columns]
    if missing_columns:
        raise KeyError(f"Missing required columns in DataFrame: {missing_columns}")

    has_land = (df[land_col].notna() & (df[land_col] > 0)).to_numpy(dtype=bool)
    has_ocean = (df[ocean_col].notna() & (df[ocean_col] > 0)).to_numpy(dtype=bool)

    # Conditions are tried in order, so land takes precedence when both rates are positive.
    df['surface_type'] = np.select([has_land, has_ocean], ['Land', 'Ocean'], default='Mixed')
    return df
def fit_daytime_model(df, daytime_df, non_solar_rate):
    """
    Fit daytime solar-background models for land and ocean and apply them to df.

    For each surface type an exponential model (exp_model) and a power-law model (power_model,
    fitted on solar elevation shifted so its smallest value maps to 1) are fitted to the daytime
    rows; the one with the higher R² is kept. The chosen model then overwrites
    'solar_background_rate' for the matching daytime rows of df.

    Parameters:
    df (pandas.DataFrame): Full table; needs 'classification', 'surface_type', 'solar_elevation'
        and 'solar_background_rate'. Modified in place.
    daytime_df (pandas.DataFrame): Daytime rows used for fitting.
    non_solar_rate (float): Non-solar background rate; used for the initial guess, for the bounds
        of the constant term, and as the fallback rate.

    Returns:
    tuple: (df, daytime_models) where daytime_models maps surface type to a dict with 'params',
        'r2' and 'model'. Every 'model' takes the raw (unshifted) solar elevation.
    """
    daytime_models = {}
    for surface_type in ['Land', 'Ocean']:
        subset = daytime_df[daytime_df['surface_type'] == surface_type].copy()
        if len(subset) < 3:
            print(f"Warning: Insufficient daytime {surface_type} data for modeling.")
            continue

        mask = subset['solar_elevation'].notna() & subset['solar_background_rate'].notna() & \
               np.isfinite(subset['solar_elevation']) & np.isfinite(subset['solar_background_rate'])
        valid_x = subset.loc[mask, 'solar_elevation']
        valid_y = subset.loc[mask, 'solar_background_rate']

        if len(valid_x) < 3:
            print(f"Warning: Fewer than 3 valid points for daytime {surface_type} model.")
            continue

        # Remove outliers: keep points within 3 standard deviations in log10 space.
        log_y = np.log10(valid_y + 1)
        mean_log_y = np.mean(log_y)
        std_log_y = np.std(log_y)
        outlier_mask = (log_y > mean_log_y - 3 * std_log_y) & (log_y < mean_log_y + 3 * std_log_y)
        valid_x = valid_x[outlier_mask]
        valid_y = valid_y[outlier_mask]

        # Both models have 3 parameters, so the filtered sample must still hold 3 points.
        if len(valid_x) < 3:
            print(f"Warning: Fewer than 3 valid points for daytime {surface_type} model.")
            continue

        daytime_mask = (df['classification'] == 'Daytime') & (df['surface_type'] == surface_type)

        print(f"Daytime regression ({surface_type}): {len(valid_x)} valid data points")
        try:
            p0_exp = [np.max(valid_y) - non_solar_rate, 0.05, non_solar_rate]
            bounds_exp = ([0, 0.01, non_solar_rate * 0.9], [1e8, 0.5, non_solar_rate * 1.1])
            popt_exp, _ = curve_fit(exp_model, valid_x, valid_y, p0=p0_exp, bounds=bounds_exp, maxfev=20000)
            r2_exp = 1 - np.sum((valid_y - exp_model(valid_x, *popt_exp))**2) / np.sum((valid_y - np.mean(valid_y))**2)

            x_min = valid_x.min()
            x_shifted = valid_x - x_min + 1
            p0_power = [np.max(valid_y) - non_solar_rate, 0.6, non_solar_rate]
            bounds_power = ([0, 0.1, non_solar_rate * 0.9], [1e8, 1.0, non_solar_rate * 1.1])
            popt_power, _ = curve_fit(power_model, x_shifted, valid_y, p0=p0_power, bounds=bounds_power, maxfev=20000)
            r2_power = 1 - np.sum((valid_y - power_model(x_shifted, *popt_power))**2) / np.sum((valid_y - np.mean(valid_y))**2)
        except (RuntimeError, ValueError) as e:
            # RuntimeError: no convergence. ValueError: infeasible p0 or degenerate bounds
            # (e.g. non_solar_rate <= 0 makes the lower bound >= the upper bound).
            print(f"Daytime model fitting failed for {surface_type}: {e}. Using non-solar rate as fallback.")
            df.loc[daytime_mask, 'solar_background_rate'] = non_solar_rate
            continue

        # Parameters and the shift are bound as lambda defaults so each stored model keeps the
        # values of its own loop iteration instead of whatever the last iteration left behind.
        if r2_power > r2_exp:
            print(f"Daytime model ({surface_type}, power law): rate = {popt_power[0]:.2e} * solar_elevation^{popt_power[1]:.2f} + {popt_power[2]:.2e}, R² = {r2_power:.3f}")
            model = lambda x, _params=tuple(popt_power), _x_min=x_min: power_model(x - _x_min + 1, *_params)
            daytime_models[surface_type] = {'params': popt_power, 'r2': r2_power, 'model': model}
        else:
            print(f"Daytime model ({surface_type}, exponential): rate = {popt_exp[0]:.2e} * exp({popt_exp[1]:.2f} * solar_elevation) + {popt_exp[2]:.2e}, R² = {r2_exp:.3f}")
            model = lambda x, _params=tuple(popt_exp): exp_model(x, *_params)
            daytime_models[surface_type] = {'params': popt_exp, 'r2': r2_exp, 'model': model}

        # Apply model. The stored model already performs the power-law shift, so it is always
        # given the raw solar elevation (shifting here as well would shift twice).
        valid_daytime_mask = daytime_mask & df['solar_elevation'].notna() & np.isfinite(df['solar_elevation'])
        x_for_model = df.loc[valid_daytime_mask, 'solar_elevation']
        df.loc[valid_daytime_mask, 'solar_background_rate'] = model(x_for_model).clip(lower=0)
        df.loc[daytime_mask & ~valid_daytime_mask, 'solar_background_rate'] = non_solar_rate
    return df, daytime_models
Gaussian + initial_c = non_solar_rate + p0 = [initial_a, initial_b, initial_c] + bounds = ([0, 0.001, non_solar_rate * 0.9], [1e6, 1.0, non_solar_rate * 1.1]) + + popt, _ = curve_fit(twilight_model, valid_twilight_x, valid_twilight_y, p0=p0, bounds=bounds, maxfev=50000) + y_pred = twilight_model(valid_twilight_x, *popt) + r_squared = 1 - np.sum((valid_twilight_y - y_pred)**2) / np.sum((valid_twilight_y - np.mean(valid_twilight_y))**2) + + if r_squared < 0: + # Fallback to a constant model (mean of the data) + mean_rate = np.mean(valid_twilight_y) + twilight_models[surface_type] = { + 'params': [mean_rate], + 'r2': 0.0, # R² of a constant model is 0 by definition + 'model': lambda x: np.full_like(x, mean_rate) + } + print(f"Twilight model ({surface_type}) failed with R² = {r_squared:.3f}. Using constant model: {mean_rate:.2e} ph/s") + + twilight_mask = (df['classification'] == 'Twilight') & (df['surface_type'] == surface_type) + valid_twilight_mask = twilight_mask & df['solar_elevation'].notna() & np.isfinite(df['solar_elevation']) + df.loc[valid_twilight_mask, 'solar_background_rate'] = mean_rate + df.loc[twilight_mask & ~valid_twilight_mask, 'solar_background_rate'] = non_solar_rate + else: + print(f"Twilight model ({surface_type}): rate = {popt[0]:.2e} * exp(-{popt[1]:.2f} * solar_elevation^2) + {popt[2]:.2e}, R² = {r_squared:.3f}") + twilight_models[surface_type] = {'params': popt, 'r2': r_squared, 'model': lambda x: twilight_model(x, *popt)} + + twilight_mask = (df['classification'] == 'Twilight') & (df['surface_type'] == surface_type) + valid_twilight_mask = twilight_mask & df['solar_elevation'].notna() & np.isfinite(df['solar_elevation']) + x_for_model = df.loc[valid_twilight_mask, 'solar_elevation'] + df.loc[valid_twilight_mask, 'solar_background_rate'] = twilight_models[surface_type]['model'](x_for_model).clip(lower=0) + df.loc[twilight_mask & ~valid_twilight_mask, 'solar_background_rate'] = non_solar_rate + except RuntimeError as e: + print(f"Twilight 
model fitting failed for {surface_type}: {e}. Setting twilight rates to non-solar rate.") + df.loc[(df['classification'] == 'Twilight') & (df['surface_type'] == surface_type), 'solar_background_rate'] = non_solar_rate + else: + print("Too few twilight points for modeling. Setting twilight rates to non-solar rate.") + df.loc[df['classification'] == 'Twilight', 'solar_background_rate'] = non_solar_rate + return df, twilight_models + +def create_diagnostic_plot(df, daytime_df, twilight_df, daytime_models, twilight_models, non_solar_rate, output_dir): + """Create diagnostic plot of background rate vs. solar elevation.""" + plt.figure(figsize=(10, 6)) + plt.scatter(df['solar_elevation'], df['total_background_rate'], c='gray', alpha=0.5, label='Total') + plt.scatter(daytime_df[daytime_df['surface_type'] == 'Land']['solar_elevation'], + daytime_df[daytime_df['surface_type'] == 'Land']['solar_background_rate'], + c='blue', alpha=0.5, label='Daytime Land') + plt.scatter(daytime_df[daytime_df['surface_type'] == 'Ocean']['solar_elevation'], + daytime_df[daytime_df['surface_type'] == 'Ocean']['solar_background_rate'], + c='cyan', alpha=0.5, label='Daytime Ocean') + plt.scatter(twilight_df[twilight_df['surface_type'] == 'Land']['solar_elevation'], + twilight_df[twilight_df['surface_type'] == 'Land']['solar_background_rate'], + c='orange', alpha=0.5, label='Twilight Land') + plt.scatter(twilight_df[twilight_df['surface_type'] == 'Ocean']['solar_elevation'], + twilight_df[twilight_df['surface_type'] == 'Ocean']['solar_background_rate'], + c='yellow', alpha=0.5, label='Twilight Ocean') + plt.axhline(non_solar_rate, color='r', linestyle='--', label=f'Non-Solar: {non_solar_rate:.2e} ph/s') + + # Plot daytime fits + for surface_type in daytime_models: + if daytime_models[surface_type]['model'] is not None: + x_fit = np.linspace(min(daytime_df['solar_elevation']), max(daytime_df['solar_elevation']), 100) + y_fit = daytime_models[surface_type]['model'](x_fit if 'power' not in 
def create_calibration_plot(df, daytime_df, output_dir):
    """
    Create a calibration plot using mean photon height as a proxy.

    Takes the first row of daytime_df, simulates 1000 photons at its 'mean_photon_height',
    subtracts a per-bin background estimate derived from its 'solar_background_rate', and saves
    the original and calibrated histograms to 'calibration_example.png' in output_dir.
    Nothing is plotted when daytime_df is empty or the height is not finite.

    Parameters:
    df (pandas.DataFrame): Unused; kept so the call signature stays the same.
    daytime_df (pandas.DataFrame): Daytime rows with 'mean_photon_height',
        'solar_background_rate' and 'file'.
    output_dir (str): Directory that receives the PNG.
    """
    if daytime_df.empty:
        return

    sample_file = daytime_df.iloc[0]
    heights = sample_file['mean_photon_height']

    # Simulate histogram with 1000 points. The stored height is a single mean per file, so the
    # range is taken from the simulated array; the builtin min()/max() fail on a scalar.
    simulated_heights = np.array([heights] * 1000, dtype=float).ravel()
    if not np.all(np.isfinite(simulated_heights)):
        return

    solar_rate = sample_file['solar_background_rate']
    bin_size = 0.1
    lowest = simulated_heights.min()
    highest = simulated_heights.max()
    hist, edges = np.histogram(simulated_heights, bins=np.arange(lowest - 1, highest + 1, bin_size))
    photons_per_bin = solar_rate * (1000 / np.sum(hist)) * bin_size / (highest - lowest + 2)
    calibrated_counts = np.maximum(hist - photons_per_bin, 0)

    # Redraw the surviving counts uniformly inside each bin.
    calibrated_heights = []
    for count, left_edge, right_edge in zip(calibrated_counts, edges[:-1], edges[1:]):
        if count > 0:
            calibrated_heights.extend(np.random.uniform(left_edge, right_edge, int(count)))

    plt.figure(figsize=(8, 6))
    plt.hist(simulated_heights, bins=100, alpha=0.5, label='Original (Simulated)')
    plt.hist(calibrated_heights, bins=100, alpha=0.5, label='Calibrated')
    plt.xlabel('Photon Height (m)')
    plt.ylabel('Count')
    plt.legend()
    plt.title(f'Calibration Example: {sample_file["file"]}')
    plt.savefig(os.path.join(output_dir, "calibration_example.png"))
    plt.close()
def get_args():
    """
    Build the command-line parser for the solar-background analysis and parse sys.argv.

    Returns:
    argparse.Namespace: Parsed options; every option has a default, so none is required.
    """
    parser = argparse.ArgumentParser(description="Analyze ICESat-2 ATL03 data for solar background.")

    # (flag, add_argument keyword arguments), registered in this order.
    option_table = (
        # task name
        ("--taskID", dict(type=str, default='2022_granules', help="task name")),
        # workspace path
        ("--workspace_path", dict(type=str, default='/work/users/w/a/wayne128/ICESat2/IS2_kd_py/',
                                  help="Root path for all data processing.")),
        # Other granule folders previously used as the default live under
        # Dataset/ATL03_ICESat2/Extracted_HDF_Files/: NC/NCCENTRAL/, NC/NCNORTH/, NC/NCSOUTH/
        # and CD/<2018..2021>_granules/h5_files/.
        ("--atl03_path", dict(type=str,
                              default='Dataset/ATL03_ICESat2/Extracted_HDF_Files/CD/2020_granules/h5_files/',
                              help="Base path for ICESat-2 ATL03 data.")),
        ("--beam", dict(default="gt1l", help="Beam to process (e.g., gt1l)")),
        ("--atl03_file", dict(type=str, default="processed_ATL03_20221001234100_01721701_006_01.h5",
                              help="Name of the ATL03 H5 file to process.")),
        ("--output_path", dict(type=str, default='Dataset/Results/',
                               help="Directory for saving results.")),
        # Shoreline data (alternative dataset: Shorelines/GeoPkgGlobalShoreline.gpkg)
        ("--shoreline_data", dict(type=str, default='Dataset/Shorelines/ne_10m_land/ne_10m_land.shp',
                                  help="Path to the global shoreline dataset.")),
        # Bathymetry datasets
        # NOTE(review): with nargs='+' a value given on the command line is a list, while this
        # default is a plain string - confirm that consumers handle both.
        ("--gebco_path", dict(nargs='+', type=str, default='Dataset/Bathy/gebco_2024_geotiff/',
                              help="Path to GEBCO bathymetry datasets.")),
        # Resolution settings
        ("--horizontal_res", dict(type=int, default=500,
                                  help="Horizontal resolution for the analysis (in meters).")),
        ("--vertical_res", dict(type=float, default=0.25,
                                help="Vertical resolution for the analysis.")),
        # Analysis parameters
        ("--subsurface_thresh", dict(type=float, default=0.5,
                                     help="Threshold for photons below the sea surface.")),
        ("--ignore_subsurface_height_thres", dict(type=float, default=-1,
                                                  help="Maximum depth thres for Kd calculations (in meters).")),
    )
    for flag, settings in option_table:
        parser.add_argument(flag, **settings)

    return parser.parse_args()
def apply_optional_surface_sigma_filter(
    binned_dataset_sea_surface,
    subsurface_photon_dataset,
    sea_surface_height,
    sigma_max=0.5
):
    """
    Optional Gaussian surface sigma filter.

    For each along-track bin, a normal distribution is fitted to the photon heights lying
    strictly within 1.0 of that bin's sea-surface height, and subsurface photons are kept only
    for bins whose fitted sigma is <= sigma_max. Bins with 10 or fewer photons in that window
    get a NaN sigma and are therefore discarded as well.

    Parameters:
    binned_dataset_sea_surface (pandas.DataFrame): Photons with 'lat_bins' and 'photon_height'.
    subsurface_photon_dataset (pandas.DataFrame): Photons to filter; needs 'lat_bins'.
    sea_surface_height (sequence): One surface height per 'lat_bins' group, in group order.
    sigma_max (float): Largest accepted surface sigma.

    Returns:
    pandas.DataFrame: Filtered copy, or the input itself when it is empty, when the number of
        surface heights does not match the number of bins, or when no bin could be evaluated.
    """
    if subsurface_photon_dataset.empty:
        return subsurface_photon_dataset

    bin_labels = list(binned_dataset_sea_surface.groupby(['lat_bins'], observed=False).groups.keys())
    if len(bin_labels) != len(sea_surface_height):
        return subsurface_photon_dataset

    half_window = 1.0
    min_peak_photons = 10
    sigma_by_bin = []
    for label, surface in zip(bin_labels, sea_surface_height):
        if pd.isna(surface):
            continue
        in_bin = binned_dataset_sea_surface[binned_dataset_sea_surface['lat_bins'] == label]
        if in_bin.empty:
            continue
        heights = in_bin['photon_height']
        near_surface = heights[(heights > surface - half_window) & (heights < surface + half_window)]
        # norm.fit returns (mean, sigma); only the spread is of interest here.
        sigma = norm.fit(near_surface)[1] if len(near_surface) > min_peak_photons else np.nan
        sigma_by_bin.append((label, sigma))

    if not sigma_by_bin:
        return subsurface_photon_dataset

    # A NaN sigma compares False, so bins without a usable fit are dropped.
    accepted_bins = {label for label, sigma in sigma_by_bin if sigma <= sigma_max}
    keep_rows = subsurface_photon_dataset['lat_bins'].isin(accepted_bins)
    return subsurface_photon_dataset[keep_rows].copy()
+ """ + required_cols = {'lat_bins', 'photon_height', 'ref_elevation', 'ref_azimuth', 'lon', 'lat'} + if subsurface_photon_dataset.empty or (not required_cols.issubset(set(subsurface_photon_dataset.columns))): + return subsurface_photon_dataset + + grouped = binned_dataset_sea_surface.groupby(['lat_bins'], observed=False) + lat_bin_keys = list(grouped.groups.keys()) + if len(lat_bin_keys) != len(sea_surface_height): + return subsurface_photon_dataset + + surface_df = pd.DataFrame({'lat_bins': lat_bin_keys, 'sea_surface_height': sea_surface_height}) + corrected = subsurface_photon_dataset.merge(surface_df, on='lat_bins', how='left').copy() + if corrected['sea_surface_height'].isna().all(): + return subsurface_photon_dataset + + corrected['photon_height_pre_refraction'] = corrected['photon_height'] + corrected['lon_pre_refraction'] = corrected['lon'] + corrected['lat_pre_refraction'] = corrected['lat'] + + # Refraction index parameterization from legacy module. + a = -0.000001501562500 + b = 0.000000107084865 + c = -0.000042759374989 + d = -0.000160475520686 + e = 1.398067112092424 + n1 = 1.00029 + n2 = (a * water_temp_c ** 2) + (b * wavelength_nm ** 2) + (c * water_temp_c) + (d * wavelength_nm) + e + + ref_elev = pd.to_numeric(corrected['ref_elevation'], errors='coerce').to_numpy(dtype=float) + ref_az = pd.to_numeric(corrected['ref_azimuth'], errors='coerce').to_numpy(dtype=float) + z = pd.to_numeric(corrected['photon_height'], errors='coerce').to_numpy(dtype=float) + ws = pd.to_numeric(corrected['sea_surface_height'], errors='coerce').to_numpy(dtype=float) + x = pd.to_numeric(corrected['lon'], errors='coerce').to_numpy(dtype=float) + y = pd.to_numeric(corrected['lat'], errors='coerce').to_numpy(dtype=float) + + # Convert to radians if values look like degrees. 
def apply_sea_surface_flattening(
    subsurface_photon_dataset,
    sea_surface_height,
    lat_bin_keys,
    horizontal_res,
    flattening_window_m=500
):
    """
    Shift photon heights by the difference between each bin's local sea surface and the mean
    sea surface of the larger along-track window that contains the bin.

    Parameters:
    subsurface_photon_dataset (pandas.DataFrame): Photons with 'lat_bins' and 'photon_height'.
    sea_surface_height (sequence): Local sea-surface height per entry of lat_bin_keys.
    lat_bin_keys (sequence): Bin labels; labels that are not numeric are ignored.
    horizontal_res (number): Size of one bin; values below 1 are treated as 1.
    flattening_window_m (number): Size of the averaging window, in the same unit.

    Returns:
    pandas.DataFrame: Copy of the photons with 'photon_height' adjusted and the columns
        'sea_surface_height_local', 'sea_surface_height_bigbin_mean',
        'sea_surface_flattening_offset' and 'photon_height_original' added. The input is
        returned unchanged when it is empty, when the two sequences differ in length, or when
        no bin has a usable surface height.
    """
    if subsurface_photon_dataset.empty or len(sea_surface_height) != len(lat_bin_keys):
        return subsurface_photon_dataset

    surfaces = pd.DataFrame({
        'lat_bins': lat_bin_keys,
        'sea_surface_height_local': sea_surface_height
    })
    surfaces = surfaces.dropna(subset=['sea_surface_height_local']).copy()
    if surfaces.empty:
        return subsurface_photon_dataset

    surfaces['lat_bins_num'] = pd.to_numeric(surfaces['lat_bins'], errors='coerce')
    surfaces = surfaces.dropna(subset=['lat_bins_num'])
    if surfaces.empty:
        return subsurface_photon_dataset

    # Number of small bins that make up one averaging window (at least one).
    bins_per_window = max(1, int(round(flattening_window_m / max(horizontal_res, 1))))
    surfaces = surfaces.assign(
        big_bin=(surfaces['lat_bins_num'].astype(int) // bins_per_window).astype(int)
    )
    window_means = (
        surfaces.groupby('big_bin', observed=False)['sea_surface_height_local']
        .mean()
        .rename('sea_surface_height_bigbin_mean')
        .reset_index()
    )
    surfaces = surfaces.merge(window_means, on='big_bin', how='left')
    surfaces['sea_surface_flattening_offset'] = (
        surfaces['sea_surface_height_bigbin_mean'] - surfaces['sea_surface_height_local']
    )

    photons = subsurface_photon_dataset.copy()
    # Temporary helper columns; removed again before returning.
    photons['lat_bins_num'] = pd.to_numeric(photons['lat_bins'], errors='coerce')
    photons['big_bin'] = (photons['lat_bins_num'].fillna(-1).astype(int) // bins_per_window).astype(int)

    surface_columns = [
        'lat_bins',
        'sea_surface_height_local',
        'sea_surface_height_bigbin_mean',
        'sea_surface_flattening_offset',
    ]
    photons = photons.merge(surfaces[surface_columns], on='lat_bins', how='left')
    photons['photon_height_original'] = photons['photon_height']

    # Photons in bins without a surface estimate get a zero offset, i.e. stay where they are.
    # NOTE(review): the offset is (window mean - local) and is subtracted here, which moves a
    # photon by (local - window mean); confirm this sign against SeaSurfaceFlattening.
    offset = photons['sea_surface_flattening_offset'].fillna(0.0)
    photons['photon_height'] = photons['photon_height'] - offset
    return photons.drop(columns=['lat_bins_num', 'big_bin'], errors='ignore')
def create_spatial_index(gebco_paths):
    """
    Open every GEBCO raster and register its bounding box in an R-tree.

    Parameters:
    gebco_paths (list): List of paths to GEBCO raster files.

    Returns:
    raster_data_dict (dict): Maps each path to (open rasterio dataset, band-1 array).
    spatial_index (rtree.index.Index): R-tree whose entries carry the raster path as object.

    The datasets are returned open and are not closed by this function.
    """
    spatial_index = index.Index()
    raster_data_dict = {}
    for position, raster_path in enumerate(gebco_paths):
        with rasterio.Env(GTIFF_SRS_SOURCE='EPSG'):
            dataset = rasterio.open(raster_path)
            first_band = dataset.read(1)
            raster_data_dict[raster_path] = (dataset, first_band)
            extent = dataset.bounds
            # rtree expects (min x, min y, max x, max y).
            spatial_index.insert(
                position,
                (extent.left, extent.bottom, extent.right, extent.top),
                obj=raster_path,
            )
    return raster_data_dict, spatial_index
def get_seafloor_elevation(lons, lats, relevant_rasters):
    """
    Sample seafloor elevations for a batch of points from the given rasters.

    Parameters:
    lons (numpy.ndarray): Array of longitudes.
    lats (numpy.ndarray): Array of latitudes.
    relevant_rasters (list): (raster, raster_data) pairs; only raster.sample() is used.

    Returns:
    seafloor_elevations (numpy.ndarray): One elevation per point. A point takes its value from
        the first raster in list order that yields a non-NaN sample; points no raster covers
        stay NaN.
    """
    coordinates = np.vstack((lons, lats)).T
    elevations = np.full(len(lons), np.nan)

    for raster, _band_data in relevant_rasters:
        # Points that already hold a value from an earlier raster are left untouched.
        still_missing = np.isnan(elevations)
        # Use rasterio.sample to get values for multiple points in a batch
        for position, sample in enumerate(list(raster.sample(coordinates))):
            if still_missing[position] and sample is not None:
                elevations[position] = sample[0]

    return elevations
+ """ + + # Create spatial index and load GEBCO Raster Data + raster_data_dict, spatial_index = create_spatial_index(gebco_paths) + + # Get the relevant rasters using spatial index + lons = sea_photon_dataset['longitude'].values + lats = sea_photon_dataset['latitude'].values + relevant_rasters = get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index) + + # Get seafloor elevation for all points + sea_photon_dataset['seafloor_elevation'] = get_seafloor_elevation(lons, lats, relevant_rasters) + + # Calculate depth (assuming seafloor_elevation is negative below sea level) + # If your elevation is positive below sea level, remove the negative sign + sea_photon_dataset['depth'] = -sea_photon_dataset['seafloor_elevation'] + + # Filter out points: + # 1. below the seafloor (photon_height > seafloor_elevation) + # 2. where depth is less than 6 meters (depth >= 6) + filtered_sea_photon_dataset = sea_photon_dataset[ + (sea_photon_dataset['photon_height'] > sea_photon_dataset['seafloor_elevation']) & + (sea_photon_dataset['depth'] >= Ignore_Subsurface_Height_Thres) + ] + + return filtered_sea_photon_dataset + + +def load_atl24_points(atl24_file_path): + """ + Load ATL24 point dataset and normalize to columns: longitude, latitude, seafloor_elevation_atl24. 
+ """ + if (not atl24_file_path) or (not os.path.exists(atl24_file_path)): + return pd.DataFrame(columns=['longitude', 'latitude', 'seafloor_elevation_atl24']) + + lower_path = atl24_file_path.lower() + if lower_path.endswith(('.csv', '.txt')): + atl24_df = pd.read_csv(atl24_file_path) + elif lower_path.endswith('.parquet'): + atl24_df = pd.read_parquet(atl24_file_path) + elif lower_path.endswith(('.gpkg', '.shp', '.geojson')): + try: + import geopandas as gpd + atl24_gdf = gpd.read_file(atl24_file_path) + atl24_df = pd.DataFrame(atl24_gdf) + if ('longitude' not in atl24_df.columns or 'latitude' not in atl24_df.columns) and ('geometry' in atl24_gdf.columns): + atl24_df['longitude'] = atl24_gdf.geometry.x + atl24_df['latitude'] = atl24_gdf.geometry.y + except Exception: + return pd.DataFrame(columns=['longitude', 'latitude', 'seafloor_elevation_atl24']) + else: + return pd.DataFrame(columns=['longitude', 'latitude', 'seafloor_elevation_atl24']) + + lon_candidates = ['longitude', 'lon', 'x', 'LONGITUDE', 'LON'] + lat_candidates = ['latitude', 'lat', 'y', 'LATITUDE', 'LAT'] + elev_candidates = [ + 'seafloor_elevation', 'bottom_elevation', 'bathymetry', 'elevation', 'z', + 'depth', 'water_depth' + ] + + lon_col = next((c for c in lon_candidates if c in atl24_df.columns), None) + lat_col = next((c for c in lat_candidates if c in atl24_df.columns), None) + elev_col = next((c for c in elev_candidates if c in atl24_df.columns), None) + if lon_col is None or lat_col is None or elev_col is None: + return pd.DataFrame(columns=['longitude', 'latitude', 'seafloor_elevation_atl24']) + + out_df = pd.DataFrame({ + 'longitude': pd.to_numeric(atl24_df[lon_col], errors='coerce'), + 'latitude': pd.to_numeric(atl24_df[lat_col], errors='coerce'), + 'atl24_raw_bathy': pd.to_numeric(atl24_df[elev_col], errors='coerce') + }).dropna(subset=['longitude', 'latitude', 'atl24_raw_bathy']).copy() + + if 'depth' in elev_col.lower() and ('elev' not in elev_col.lower()): + 
out_df['seafloor_elevation_atl24'] = -out_df['atl24_raw_bathy'].abs() + else: + out_df['seafloor_elevation_atl24'] = out_df['atl24_raw_bathy'] + + return out_df[['longitude', 'latitude', 'seafloor_elevation_atl24']] + + +def process_seafloor_data_atl24( + sea_photon_dataset, + atl24_file_path, + Ignore_Subsurface_Height_Thres, + max_match_distance_deg=0.01 +): + """ + Match ATL24 bathymetry points to photons, then filter below seafloor and shallow bins. + Returns filtered dataset and number of matched photons. + """ + atl24_points = load_atl24_points(atl24_file_path) + if atl24_points.empty: + return sea_photon_dataset, 0 + + photon_coords = sea_photon_dataset[['longitude', 'latitude']].to_numpy(dtype=float) + atl24_coords = atl24_points[['longitude', 'latitude']].to_numpy(dtype=float) + if len(photon_coords) == 0 or len(atl24_coords) == 0: + return sea_photon_dataset, 0 + + tree = cKDTree(atl24_coords) + distances, indices = tree.query(photon_coords, k=1, distance_upper_bound=max_match_distance_deg) + valid_match = np.isfinite(distances) & (indices < len(atl24_points)) + matched_count = int(valid_match.sum()) + if matched_count == 0: + return sea_photon_dataset, 0 + + matched_dataset = sea_photon_dataset.copy() + matched_dataset['seafloor_elevation'] = np.nan + matched_dataset.loc[valid_match, 'seafloor_elevation'] = atl24_points['seafloor_elevation_atl24'].to_numpy()[indices[valid_match]] + matched_dataset = matched_dataset.dropna(subset=['seafloor_elevation']).copy() + matched_dataset['depth'] = -matched_dataset['seafloor_elevation'] + + filtered_dataset = matched_dataset[ + (matched_dataset['photon_height'] > matched_dataset['seafloor_elevation']) & + (matched_dataset['depth'] >= Ignore_Subsurface_Height_Thres) + ] + return filtered_dataset, matched_count + + +# 5. 
The function to get the subsurface photon dataset +def get_subsurface_photon( + binned_dataset_sea_surface, + GEBCO_paths, + subsurface_thresh, + Ignore_Subsurface_Height_Thres, + use_atl24_filter=False, + atl24_file_path='', + atl24_max_match_distance_deg=0.01, + use_gebco_filter=False, + apply_histogram_quality_filter=False, + histogram_quality_min_ratio=0.05, + histogram_quality_depth_min=6.0, + histogram_quality_depth_max=7.0, + apply_surface_sigma_filter=False, + surface_sigma_max=0.5, + apply_refraction_correction=False, + refraction_water_temp_c=20.0, + refraction_wavelength_nm=532.0, + apply_post_refraction_refit=False, + apply_flattening=False, + flattening_window_m=500, + horizontal_res=500, + vertical_res=0.25 +): + # get sea surface height + # threshold to determine how much depth below sea surface will be accounted + # Output: + # ->final_sea_surface_height + # ->sea_surface_height_abnormal_label + # ->sea_surface_dominated_label + # # static threshold + # sea_surface_height, sea_surface_label, solo_sea_surface_label, subsurface_photon_dataset = \ + # get_sea_surface_height_static(binned_dataset_sea_surface, subsurface_thresh) + # adaptive threshold + sea_surface_height, sea_surface_label, solo_sea_surface_label, subsurface_photon_dataset = \ + get_sea_surface_height_adaptive(binned_dataset_sea_surface) + + if apply_histogram_quality_filter: + subsurface_photon_dataset = apply_optional_histogram_quality_filter( + binned_dataset_sea_surface, + subsurface_photon_dataset, + sea_surface_height, + min_ratio=histogram_quality_min_ratio, + depth_min=histogram_quality_depth_min, + depth_max=histogram_quality_depth_max + ) + + if apply_surface_sigma_filter: + subsurface_photon_dataset = apply_optional_surface_sigma_filter( + binned_dataset_sea_surface, + subsurface_photon_dataset, + sea_surface_height, + sigma_max=surface_sigma_max + ) + + if apply_refraction_correction: + subsurface_photon_dataset = apply_optional_refraction_correction( + 
binned_dataset_sea_surface, + subsurface_photon_dataset, + sea_surface_height, + water_temp_c=refraction_water_temp_c, + wavelength_nm=refraction_wavelength_nm + ) + if apply_post_refraction_refit: + corrected_full_dataset = apply_optional_refraction_correction( + binned_dataset_sea_surface, + binned_dataset_sea_surface.copy(), + sea_surface_height, + water_temp_c=refraction_water_temp_c, + wavelength_nm=refraction_wavelength_nm + ) + rebinned_corrected = horizontal_vertical_bin_dataset( + corrected_full_dataset, + horizontal_res, + vertical_res + ) + sea_surface_height, sea_surface_label, solo_sea_surface_label, subsurface_photon_dataset = \ + get_sea_surface_height_adaptive(rebinned_corrected) + + if use_atl24_filter: + atl24_filtered, atl24_match_count = process_seafloor_data_atl24( + subsurface_photon_dataset, + atl24_file_path, + abs(Ignore_Subsurface_Height_Thres), + max_match_distance_deg=atl24_max_match_distance_deg + ) + if atl24_match_count > 0: + filtered_seafloor_subsurface_photon_dataset = atl24_filtered + elif use_gebco_filter: + filtered_seafloor_subsurface_photon_dataset = process_seafloor_data( + subsurface_photon_dataset, GEBCO_paths, abs(Ignore_Subsurface_Height_Thres) + ) + else: + filtered_seafloor_subsurface_photon_dataset = subsurface_photon_dataset + elif use_gebco_filter: + filtered_seafloor_subsurface_photon_dataset = process_seafloor_data( + subsurface_photon_dataset, GEBCO_paths, abs(Ignore_Subsurface_Height_Thres) + ) + else: + filtered_seafloor_subsurface_photon_dataset = subsurface_photon_dataset + + if apply_flattening: + lat_bin_keys = list( + binned_dataset_sea_surface.groupby(['lat_bins'], observed=False).groups.keys() + ) + filtered_seafloor_subsurface_photon_dataset = apply_sea_surface_flattening( + filtered_seafloor_subsurface_photon_dataset, + sea_surface_height, + lat_bin_keys, + horizontal_res=horizontal_res, + flattening_window_m=flattening_window_m + ) + return sea_surface_height, sea_surface_label, 
filtered_seafloor_subsurface_photon_dataset + + +# Main processing function to apply the subsurface photon filtering beam-by-beam +def process_subsurface_photon_filtering( + binned_dataset_sea_surface, + GEBCO_paths, + subsurface_thresh, + Ignore_Subsurface_Height_Thres, + use_atl24_filter=False, + atl24_file_path='', + atl24_max_match_distance_deg=0.01, + use_gebco_filter=False, + apply_histogram_quality_filter=False, + histogram_quality_min_ratio=0.05, + histogram_quality_depth_min=6.0, + histogram_quality_depth_max=7.0, + apply_surface_sigma_filter=False, + surface_sigma_max=0.5, + apply_refraction_correction=False, + refraction_water_temp_c=20.0, + refraction_wavelength_nm=532.0, + apply_post_refraction_refit=False, + apply_flattening=False, + flattening_window_m=500, + horizontal_res=500, + vertical_res=0.25 +): + # Initialize lists to store results for each beam + sea_surface_heights = [] + sea_surface_labels = [] + filtered_beam_datasets = [] + + # Group the binned dataset by 'beam_id' and process each group separately + for beam_id, beam_data in binned_dataset_sea_surface.groupby('beam_id'): + print(f'Processing subsurface filtering for beam: {beam_id}') + + # Apply get_subsurface_photon to the current beam's dataset + sea_surface_height, sea_surface_label, filtered_seafloor_subsurface_photon_dataset = \ + get_subsurface_photon( + beam_data, + GEBCO_paths, + subsurface_thresh, + Ignore_Subsurface_Height_Thres, + use_atl24_filter=use_atl24_filter, + atl24_file_path=atl24_file_path, + atl24_max_match_distance_deg=atl24_max_match_distance_deg, + use_gebco_filter=use_gebco_filter, + apply_histogram_quality_filter=apply_histogram_quality_filter, + histogram_quality_min_ratio=histogram_quality_min_ratio, + histogram_quality_depth_min=histogram_quality_depth_min, + histogram_quality_depth_max=histogram_quality_depth_max, + apply_surface_sigma_filter=apply_surface_sigma_filter, + surface_sigma_max=surface_sigma_max, + 
apply_refraction_correction=apply_refraction_correction, + refraction_water_temp_c=refraction_water_temp_c, + refraction_wavelength_nm=refraction_wavelength_nm, + apply_post_refraction_refit=apply_post_refraction_refit, + apply_flattening=apply_flattening, + flattening_window_m=flattening_window_m, + horizontal_res=horizontal_res, + vertical_res=vertical_res + ) + + # Append each result to the lists + sea_surface_heights.append(sea_surface_height) + sea_surface_labels.append(sea_surface_label) + filtered_beam_datasets.append(filtered_seafloor_subsurface_photon_dataset) + + # Combine all filtered beam datasets into a single DataFrame + combined_filtered_dataset = pd.concat(filtered_beam_datasets, ignore_index=True) + + return sea_surface_heights, sea_surface_labels, combined_filtered_dataset diff --git a/aok/core/kd_utils/config.py b/aok/core/kd_utils/config.py new file mode 100644 index 0000000..a3ce3af --- /dev/null +++ b/aok/core/kd_utils/config.py @@ -0,0 +1,247 @@ +# # config.py + +# path = 'C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/Dataset/ATL03_ICESat2/' + +# #US, Orgon +# # ATL03_h5_file = "processed_ATL03_20181206092124_10500106_005_01.h5"# +# #South America +# # ATL03_h5_file = "processed_ATL03_20220530041141_10391513_005_02.h5"# + + +# #Alaska Cook Inlet +# # ATL03_h5_file = "processed_ATL03_20210801125753_05941205_005_01.h5"# +# ATL03_20200805175053_06320803_006_01_subsetted +# ATL03_20210828231447_10131203_006_01_subsetted + +# # Hawaii line without afterpulses +# # ATL03_h5_file = "processed_ATL03_20220122044818_04721407_006_01.h5"# + + +# # ChesapeakeBay +# ATL03_h5_file = "processed_ATL03_20230825074121_10102002_006_02.h5"# + + +# # India +# # ATL03_h5_file = "processed_ATL03_20200331204156_00810707_006_01.h5"# + + + +# #China Bohai +# # ATL03_h5_file = "processed_ATL03_20191128115256_09560502_006_01.h5"# +# # ATL03_h5_file = "processed_ATL03_20190530203316_09560302_006_02.h5"# + +# ATL03_h5_file = 
"processed_ATL03_20190829161305_09560402_006_02.h5"# + + + +# ATL03_h5_file_path = path + ATL03_h5_file + +# Current_Path = 'C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/Dataset/' + +# shoreline_data_path = Current_Path + 'Shorelines/GeoPkgGlobalShoreline.gpkg' + +# #load the Global bathy dataset +# GEBCO_paths = [ +# Current_Path + "Bathy/gebco_2024_geotiff/gebco_2024_n0.0_s-90.0_w0.0_e90.0.tif", +# Current_Path + "Bathy/gebco_2024_geotiff/gebco_2024_n0.0_s-90.0_w-90.0_e0.0.tif", +# Current_Path + "Bathy/gebco_2024_geotiff/gebco_2024_n0.0_s-90.0_w90.0_e180.0.tif", +# Current_Path + "Bathy/gebco_2024_geotiff/gebco_2024_n0.0_s-90.0_w-180.0_e-90.0.tif", +# Current_Path + "Bathy/gebco_2024_geotiff/gebco_2024_n90.0_s0.0_w0.0_e90.0.tif", +# Current_Path + "Bathy/gebco_2024_geotiff/gebco_2024_n90.0_s0.0_w-90.0_e0.0.tif", +# Current_Path + "Bathy/gebco_2024_geotiff/gebco_2024_n90.0_s0.0_w90.0_e180.0.tif", +# Current_Path + "Bathy/gebco_2024_geotiff/gebco_2024_n90.0_s0.0_w-180.0_e-90.0.tif", +# ] + +# horizontal_res = 500 + +# vertical_res = 0.25 + +# # Use calculated sea height to determine photons at 0.5m below peak +# subsurface_thresh = 0.5 + +# # subsurface distance below the sea surface beginning to account +# Kd_max_depth = -1 + +# OutputPath='C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/Results/' + +import argparse + +def get_args(): + parser = argparse.ArgumentParser(description="Configure ICESat-2 HLS analysis script.") + + # General paths + parser.add_argument("--workspace_path", type=str, default='C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/', + help="Root path for all data processing.") + + # parser.add_argument("--atl03_path", type=str, default='Dataset/ATL03_ICESat2/New/', + # help="Base path for ICESat-2 ATL03 data.") + parser.add_argument("--atl03_path", type=str, default='Dataset/ATL03_ICESat2/', + help="Base path for ICESat-2 ATL03 data.") + + + + # #ChesapeakeBay + # parser.add_argument("--atl03_file", type=str, 
default="processed_ATL03_20230825074121_10102002_006_02.h5", + # help="Name of the ATL03 H5 file to process.") + + # parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20221001113813_01641706_006_01.h5", + # help="Name of the ATL03 H5 file to process.") + + # # India: processed_ATL03_20200331204156_00810707_006_01.h5 + # parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20200331204156_00810707_006_01.h5", + # help="Name of the ATL03 H5 file to process.") + + # #ATL03_20210628230109_00811207_006_01_subsetted + + # parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20210628230109_00811207_006_01.h5", + # help="Name of the ATL03 H5 file to process.") + + + # # WaxDelta + # parser.add_argument("--atl03_file", type=str, default="ATL03_20231103172707_06982106_006_01_subsetted.h5", + # help="Name of the ATL03 H5 file to process.") + + + # # Bohai + parser.add_argument("--atl03_file", type=str, default="ATL03_20190829161305_09560402_006_02_subsetted.h5", + help="Name of the ATL03 H5 file to process.") + + # # Cook Inlet + # parser.add_argument("--atl03_file", type=str, default="ATL03_20220507112145_06931503_006_01_subsetted.h5", + # help="Name of the ATL03 H5 file to process.") + + + + # #Core Sound + # parser.add_argument("--atl03_file", type=str, default="105430185_ATL03_20220423191113_04841506_006_02_subsetted.h5", + # help="Name of the ATL03 H5 file to process.") + + # # Pamlico Sound + # parser.add_argument("--atl03_file", type=str, default="ATL03_20240415083629_04232306_006_01_subsetted.h5", + # help="Name of the ATL03 H5 file to process.") + + + # + # #Cook Inlet: processed_ATL03_20210801125753_05941205_005_01 + # parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20210801125753_05941205_005_01.h5", + # help="Name of the ATL03 H5 file to process.") + + parser.add_argument("--other_data_path", type=str, default='Dataset/', + help="Path to the current working directory.") + + 
parser.add_argument("--output_path", type=str, default='Results/', + help="Directory for saving results.") + + # Shoreline data + parser.add_argument("--shoreline_data", type=str, + # default='Shorelines/GeoPkgGlobalShoreline.gpkg', + default='Shorelines/ne_10m_land/ne_10m_land.shp', + help="Path to the global shoreline dataset.") + + # Bathymetry datasets + parser.add_argument("--gebco_path", nargs='+', type=str, + default='Bathy/gebco_2024_geotiff/', + help="Path to GEBCO bathymetry datasets.") + + # Resolution settings + parser.add_argument("--horizontal_res", type=int, default=500, + help="Horizontal resolution for the analysis (in meters).") + + parser.add_argument("--vertical_res", type=float, default=0.25, + help="Vertical resolution for the analysis.") + + # Analysis parameters + parser.add_argument("--subsurface_thresh", type=float, default=1.0, + help="Threshold for photons below the sea surface.") + + parser.add_argument("--ignore_subsurface_height_thres", type=float, default=-6, + help="Maximum depth thres for Kd calculations (in meters).") #5m + + parser.add_argument("--target_beams", type=str, default='', + help="Comma-separated beam IDs to process. 
Empty means auto-select strong beams.") + + parser.add_argument("--enable_solar_background_filter", action="store_true", + help="Optional: apply solar background filtering before binning.") + parser.add_argument("--solar_elevation_day_threshold", type=float, default=6.0, + help="Solar elevation threshold (deg) to define daytime for optional filtering.") + parser.add_argument("--solar_background_quantile", type=float, default=0.8, + help="Quantile threshold of daytime background rate considered high background.") + parser.add_argument("--solar_background_min_signal_conf", type=int, default=2, + help="Minimum photon signal confidence kept in high daytime background.") + + parser.add_argument("--enable_ir_ap_filter", action="store_true", + help="Optional: remove photons using IR/afterpulse proxy flags.") + parser.add_argument("--ir_ap_quality_max", type=int, default=0, + help="Maximum allowed quality_ph value when IR/AP filter is enabled.") + parser.add_argument("--ir_ap_min_signal_conf", type=int, default=0, + help="Minimum allowed photon_conf value when IR/AP filter is enabled.") + + parser.add_argument("--enable_gebco_filter", action="store_true", + help="Optional: remove photons below GEBCO seafloor and shallow bins.") + parser.add_argument("--enable_sea_surface_flattening", action="store_true", + help="Optional: flatten small-bin sea-surface variation using larger along-track windows.") + parser.add_argument("--sea_surface_flattening_window_m", type=int, default=500, + help="Window size in meters for sea-surface flattening mean level.") + + parser.add_argument("--enable_convex_hull_filter", action="store_true", + help="Optional: apply convex hull area filtering before Kd fitting.") + parser.add_argument("--convex_hull_area_threshold", type=float, default=100, + help="Minimum convex hull area to keep a horizontal bin when enabled.") + + parser.add_argument("--enable_histogram_quality_filter", action="store_true", + help="Optional: discard along-track bins with 
weak 6-7 m signal.") + parser.add_argument("--histogram_quality_min_ratio", type=float, default=0.05, + help="Minimum ratio of photons in depth band to keep a bin.") + parser.add_argument("--histogram_quality_depth_min", type=float, default=6.0, + help="Minimum depth (m below surface) of quality-check band.") + parser.add_argument("--histogram_quality_depth_max", type=float, default=7.0, + help="Maximum depth (m below surface) of quality-check band.") + + parser.add_argument("--enable_surface_sigma_filter", action="store_true", + help="Optional: discard bins where Gaussian surface sigma is too large.") + parser.add_argument("--surface_sigma_max", type=float, default=0.5, + help="Maximum allowed Gaussian sigma (m) for sea-surface peak.") + + parser.add_argument("--enable_refraction_correction", action="store_true", + help="Optional: apply refraction correction to subsurface photons.") + parser.add_argument("--refraction_water_temp_c", type=float, default=20.0, + help="Water temperature (deg C) used for refraction correction.") + parser.add_argument("--refraction_wavelength_nm", type=float, default=532.0, + help="Laser wavelength (nm) used for refraction correction.") + + parser.add_argument("--enable_post_refraction_refit", action="store_true", + help="Optional: rebuild histograms and re-fit surface after refraction correction.") + + parser.add_argument("--enable_atl24_filter", action="store_true", + help="Optional: use ATL24 bathymetry matching before GEBCO fallback.") + parser.add_argument("--atl24_file", type=str, default='', + help="Path to ATL24 point dataset (CSV/Parquet/GPKG/SHP) for bathymetry matching.") + parser.add_argument("--atl24_max_match_distance_deg", type=float, default=0.01, + help="Maximum lon/lat degree distance for ATL24 nearest-point matching.") + + parser.add_argument("--enable_paired_beam_combine", action="store_true", + help="Optional: combine paired beams (gt1, gt2, gt3) before Kd fitting.") + + # switch on consider the coastal water or 
inland water + # if Inland water, we should use one mask and for coastal water, it should be another mask + + # consider bathymetry or not manual setting + + + #night vs daytime + + #solar elevation detemine remove or not for solar background + + + # 1-2 hours + + + return parser.parse_args() + +if __name__ == "__main__": + args = get_args() + # Access parameters like this: + atl03_file_path = args.workspace_path + args.atl03_file + + print("Processing file:", atl03_file_path) + print("Output path:", args.output_path) diff --git a/aok/core/kd_utils/data_processing.py b/aok/core/kd_utils/data_processing.py new file mode 100644 index 0000000..42df0a8 --- /dev/null +++ b/aok/core/kd_utils/data_processing.py @@ -0,0 +1,733 @@ +# utils/data_processing.py + + +import re +import os +import io +import time +import math +import h5py +import logging +import netCDF4 +import numpy as np +import geopandas as gpd + +import pandas as pd +from datetime import datetime +import matplotlib.pyplot as plt +from sklearn.preprocessing import StandardScaler, MinMaxScaler +from scipy.spatial import ConvexHull + +import argparse +import subprocess +# import fiona +import utm +import pyproj +from pyproj import Transformer, Proj + +# from pyproj import Transformer +from matplotlib.widgets import LassoSelector +from matplotlib.path import Path + +from sklearn.cluster import DBSCAN +from .interpolation import * + + + +#this function from icesat2_toolkit +# PURPOSE: read ICESat-2 ATL03 HDF5 data files +def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): + """ + Reads ICESat-2 ATL03 Global Geolocated Photons data files + + Parameters + ---------- + FILENAME: str + full path to ATL03 file + ATTRIBUTES: bool, default False + read file, group and variable attributes + + Returns + ------- + IS2_atl03_mds: dict + ATL03 variables + IS2_atl03_attrs: dict + ATL03 attributes + IS2_atl03_beams: list + valid ICESat-2 beams within ATL03 file + """ + # Open the HDF5 file for reading + if 
isinstance(FILENAME, io.IOBase): + fileID = h5py.File(FILENAME, 'r') + else: + fileID = h5py.File(os.path.expanduser(FILENAME), 'r') + + # Output HDF5 file information + logging.info(fileID.filename) + logging.info(list(fileID.keys())) + + # allocate python dictionaries for ICESat-2 ATL03 variables and attributes + IS2_atl03_mds = {} + IS2_atl03_attrs = {} + + # read each input beam within the file + IS2_atl03_beams = [] + for gtx in [k for k in fileID.keys() if bool(re.match(r'gt\d[lr]',k))]: + # check if subsetted beam contains data + # check in both the geolocation and heights groups + try: + fileID[gtx]['geolocation']['segment_id'] + fileID[gtx]['heights']['delta_time'] + except KeyError: + pass + else: + IS2_atl03_beams.append(gtx) + + # for each included beam + for gtx in IS2_atl03_beams: + + # ------------------------------------------- + # 1. make sure the beam-level dict exists + IS2_atl03_attrs.setdefault(gtx, {}) + # 2. always save the two “must-have” attributes + for key in ('atlas_beam_type', 'atlas_spot_number'): + IS2_atl03_attrs[gtx][key] = fileID[gtx].attrs[key] + + # get each HDF5 variable + IS2_atl03_mds[gtx] = {} + IS2_atl03_mds[gtx]['heights'] = {} + IS2_atl03_mds[gtx]['geolocation'] = {} + IS2_atl03_mds[gtx]['bckgrd_atlas'] = {} + IS2_atl03_mds[gtx]['geophys_corr'] = {} + # ICESat-2 Measurement Group + for key,val in fileID[gtx]['heights'].items(): + IS2_atl03_mds[gtx]['heights'][key] = val[:] + # ICESat-2 Geolocation Group + for key,val in fileID[gtx]['geolocation'].items(): + IS2_atl03_mds[gtx]['geolocation'][key] = val[:] + # ICESat-2 Background Photon Rate Group + for key,val in fileID[gtx]['bckgrd_atlas'].items(): + IS2_atl03_mds[gtx]['bckgrd_atlas'][key] = val[:] + # ICESat-2 Geophysical Corrections Group: Values for tides (ocean, + # solid earth, pole, load, and equilibrium), inverted barometer (IB) + # effects, and range corrections for tropospheric delays + for key,val in fileID[gtx]['geophys_corr'].items(): + 
IS2_atl03_mds[gtx]['geophys_corr'][key] = val[:] + + # Getting attributes of included variables + if ATTRIBUTES: + # Getting attributes of IS2_atl03_mds beam variables + IS2_atl03_attrs[gtx] = {} + IS2_atl03_attrs[gtx]['heights'] = {} + IS2_atl03_attrs[gtx]['geolocation'] = {} + IS2_atl03_attrs[gtx]['bckgrd_atlas'] = {} + IS2_atl03_attrs[gtx]['geophys_corr'] = {} + + # Global Group Attributes + for att_name,att_val in fileID[gtx].attrs.items(): + IS2_atl03_attrs[gtx][att_name] = att_val + # ICESat-2 Measurement Group + for key,val in fileID[gtx]['heights'].items(): + IS2_atl03_attrs[gtx]['heights'][key] = {} + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]['heights'][key][att_name]=att_val + # ICESat-2 Geolocation Group + for key,val in fileID[gtx]['geolocation'].items(): + IS2_atl03_attrs[gtx]['geolocation'][key] = {} + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]['geolocation'][key][att_name]=att_val + # ICESat-2 Background Photon Rate Group + for key,val in fileID[gtx]['bckgrd_atlas'].items(): + IS2_atl03_attrs[gtx]['bckgrd_atlas'][key] = {} + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]['bckgrd_atlas'][key][att_name]=att_val + # ICESat-2 Geophysical Corrections Group + for key,val in fileID[gtx]['geophys_corr'].items(): + IS2_atl03_attrs[gtx]['geophys_corr'][key] = {} + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]['geophys_corr'][key][att_name]=att_val + + # ICESat-2 spacecraft orientation at time + IS2_atl03_mds['orbit_info'] = {} + IS2_atl03_attrs['orbit_info'] = {} + for key,val in fileID['orbit_info'].items(): + IS2_atl03_mds['orbit_info'][key] = val[:] + # Getting attributes of group and included variables + if ATTRIBUTES: + # Global Group Attributes + for att_name,att_val in fileID['orbit_info'].attrs.items(): + IS2_atl03_attrs['orbit_info'][att_name] = att_val + # Variable Attributes + IS2_atl03_attrs['orbit_info'][key] = {} + for att_name,att_val in val.attrs.items(): + 
IS2_atl03_attrs['orbit_info'][key][att_name] = att_val + + # information ancillary to the data product + # number of GPS seconds between the GPS epoch (1980-01-06T00:00:00Z UTC) + # and ATLAS Standard Data Product (SDP) epoch (2018-01-01T00:00:00Z UTC) + # Add this value to delta time parameters to compute full gps_seconds + # could alternatively use the Julian day of the ATLAS SDP epoch: 2458119.5 + # and add leap seconds since 2018-01-01T00:00:00Z UTC (ATLAS SDP epoch) + IS2_atl03_mds['ancillary_data'] = {} + IS2_atl03_attrs['ancillary_data'] = {} + ancillary_keys = ['atlas_sdp_gps_epoch','data_end_utc','data_start_utc', + 'end_cycle','end_geoseg','end_gpssow','end_gpsweek','end_orbit', + 'end_region','end_rgt','granule_end_utc','granule_start_utc','release', + 'start_cycle','start_geoseg','start_gpssow','start_gpsweek', + 'start_orbit','start_region','start_rgt','version'] + for key in ancillary_keys: + # get each HDF5 variable + IS2_atl03_mds['ancillary_data'][key] = fileID['ancillary_data'][key][:] + # Getting attributes of group and included variables + if ATTRIBUTES: + # Variable Attributes + IS2_atl03_attrs['ancillary_data'][key] = {} + for att_name,att_val in fileID['ancillary_data'][key].attrs.items(): + IS2_atl03_attrs['ancillary_data'][key][att_name] = att_val + + # transmit-echo-path (tep) parameters + IS2_atl03_mds['ancillary_data']['tep'] = {} + IS2_atl03_attrs['ancillary_data']['tep'] = {} + for key,val in fileID['ancillary_data']['tep'].items(): + # get each HDF5 variable + IS2_atl03_mds['ancillary_data']['tep'][key] = val[:] + # Getting attributes of group and included variables + if ATTRIBUTES: + # Variable Attributes + IS2_atl03_attrs['ancillary_data']['tep'][key] = {} + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs['ancillary_data']['tep'][key][att_name] = att_val + + # channel dead time and first photon bias derived from ATLAS calibration + cal1,cal2 = ('ancillary_data','calibrations') + for var in 
['dead_time','first_photon_bias']: + IS2_atl03_mds[cal1][var] = {} + IS2_atl03_attrs[cal1][var] = {} + for key,val in fileID[cal1][cal2][var].items(): + # get each HDF5 variable + if isinstance(val, h5py.Dataset): + IS2_atl03_mds[cal1][var][key] = val[:] + elif isinstance(val, h5py.Group): + IS2_atl03_mds[cal1][var][key] = {} + for k,v in val.items(): + IS2_atl03_mds[cal1][var][key][k] = v[:] + # Getting attributes of group and included variables + if ATTRIBUTES: + # Variable Attributes + IS2_atl03_attrs[cal1][var][key] = {} + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs[cal1][var][key][att_name] = att_val + if isinstance(val, h5py.Group): + for k,v in val.items(): + IS2_atl03_attrs[cal1][var][key][k] = {} + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs[cal1][var][key][k][att_name]=att_val + + # get ATLAS impulse response variables for the transmitter echo path (TEP) + tep1,tep2 = ('atlas_impulse_response','tep_histogram') + IS2_atl03_mds[tep1] = {} + IS2_atl03_attrs[tep1] = {} + for pce in ['pce1_spot1','pce2_spot3']: + IS2_atl03_mds[tep1][pce] = {tep2:{}} + IS2_atl03_attrs[tep1][pce] = {tep2:{}} + # for each TEP variable + for key,val in fileID[tep1][pce][tep2].items(): + IS2_atl03_mds[tep1][pce][tep2][key] = val[:] + # Getting attributes of included variables + if ATTRIBUTES: + # Global Group Attributes + for att_name,att_val in fileID[tep1][pce][tep2].attrs.items(): + IS2_atl03_attrs[tep1][pce][tep2][att_name] = att_val + # Variable Attributes + IS2_atl03_attrs[tep1][pce][tep2][key] = {} + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs[tep1][pce][tep2][key][att_name] = att_val + + # Global File Attributes + if ATTRIBUTES: + for att_name,att_val in fileID.attrs.items(): + IS2_atl03_attrs[att_name] = att_val + + # Closing the HDF5 file + fileID.close() + # Return the datasets and variables + return (IS2_atl03_mds,IS2_atl03_attrs,IS2_atl03_beams) + + + +# convert_wgs_to_utm function, see 
https://stackoverflow.com/a/40140326/4556479 +def convert_wgs_to_utm(lon: float, lat: float): + """Based on lat and lng, return best utm epsg-code""" + utm_band = str((math.floor((lon + 180) / 6) % 60) + 1) + if len(utm_band) == 1: + utm_band = '0' + utm_band + if lat >= 0: + epsg_code = 'epsg:326' + utm_band + return epsg_code + epsg_code = 'epsg:327' + utm_band + return epsg_code + + +def orthometric_correction(lat, lon, Z, epsg): + # Define the Proj string + #To transform from WGS84 ellipsoidal height + # to EGM2008 orthometric height using PyProj + # proj_string = '+proj=latlong +ellps=WGS84 +datum=WGS84 +vunits=m +no_defs +geoidgrids=egm2008-1.gtx' + # # Define the Proj string for WGS84 ellipsoidal height + # wgs84_proj_string = '+proj=latlong +ellps=WGS84 +datum=WGS84 +no_defs' + + # # Define the Proj string for EGM2008 orthometric height: egm08_25,egm2008-1 + # egm2008_proj_string = \ + # '+proj=latlong +ellps=WGS84 +datum=WGS84 +no_defs ' \ + # '+geoidgrids=C:/Workstation/ICESat2_HLS/Code/Geoids/egm08_25.gtx' + + # transform ellipsoid (WGS84) height to orthometric height + # transformer = Transformer.from_crs(wgs84_proj_string, egm2008_proj_string, always_xy=True) + transformer = Transformer.from_crs("EPSG:4326", "EPSG:3857", always_xy=True) + X_egm08, Y_egm08, Z_egm08 = transformer.transform(lon, lat, Z) + + # transform WGS84 proj to local UTM + myProj = Proj(epsg) + X_utm, Y_utm = myProj(lon, lat) + + return Y_utm, X_utm, Z_egm08 + + + +def load_data(file_path, read_attributes=False): + """ + Wrapper around read_granule that lets you decide + whether to pull the full attribute tree. + + Parameters + ---------- + file_path : str + Path to the ATL03 HDF5 granule. + read_attributes : bool, optional + If True, read_granule returns the full IS2_atl03_attrs tree. + Defaults to False. 
# requires that the input gdf has ranged index values i
# will need to change if index is changed to time or something
# this currently checks point in polygon for EVERY point
# would be significantly sped up if evaluated at 10m or something similar
# maybe later, fine for now
def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF):
    """Label every point of ``ICESat2_GDF`` as inside (1) or outside (0) land polygons.

    Parameters
    ----------
    shoreline_data_path : path-like
        Vector file with land polygons; it is read with
        ``gpd.read_file(..., bbox=ICESat2_GDF, engine='pyogrio')`` so only
        features intersecting the points' bounding box are loaded.
    ICESat2_GDF : geopandas.GeoDataFrame
        Point geometries. The frame is modified in place: ``lat``, ``lon``
        and ``is_land`` (initialised to -1) columns are inserted at the
        front when they are missing, and the CRS is set to EPSG:4326.

    Returns
    -------
    numpy.ndarray
        int64 array with one entry per row: 1 where the row's index appears
        in the ``within`` spatial join against the land polygons, 0
        otherwise. If anything in the lookup fails, the exception is
        printed and an array of -1 is returned instead (best-effort
        behaviour kept from the original implementation).
    """
    n_points = len(ICESat2_GDF)

    try:
        # Insert the helper columns only when absent so the function can be
        # called again on the same frame; ``DataFrame.insert`` refuses
        # duplicates, which previously sent a second call straight to the
        # fallback branch.
        helper_columns = (
            ('lat', lambda: ICESat2_GDF.geometry.y),
            ('lon', lambda: ICESat2_GDF.geometry.x),
            # Land flag initialized as -1; callers overwrite it with 0/1.
            ('is_land', lambda: np.full(n_points, -1, dtype=np.int64)),
        )
        for column_name, make_values in helper_columns:
            if column_name not in ICESat2_GDF.columns:
                ICESat2_GDF.insert(0, column_name, make_values(), False)

        # set the projection
        ICESat2_GDF.set_crs("EPSG:4326", inplace=True)

        # Load only the shoreline features that intersect the bounding box
        # of the points. The 'fiona' engine sometimes errors here, hence
        # 'pyogrio'.
        land_polygon_gdf = gpd.read_file(shoreline_data_path, bbox=ICESat2_GDF, engine='pyogrio')

        # Rows whose index shows up in the join result lie within a polygon.
        pts_in_land = gpd.sjoin(ICESat2_GDF, land_polygon_gdf, predicate='within')
        land_loc = ICESat2_GDF.index.isin(pts_in_land.index)

        return land_loc.astype(np.int64)

    except Exception as e:

        print(e)

        print("Error loading shoreline data, returning -1s for is_land flag")

        # Build the fallback from the row count rather than from the
        # ``is_land`` column: that column does not exist when the failure
        # happened before it was inserted.
        return np.full(n_points, -1, dtype=np.int64)
def interpolate_by_time(source_time, source_values, target_time):
    """Linearly interpolate ``source_values`` from ``source_time`` onto ``target_time``.

    Parameters
    ----------
    source_time, source_values : array-like
        Samples to interpolate from; both must have the same shape. Pairs
        in which either entry is non-finite are ignored. The samples do not
        need to be sorted.
    target_time : array-like
        Times at which values are wanted.

    Returns
    -------
    numpy.ndarray
        Float array shaped like ``target_time``.

        * fewer than two finite samples: all NaN;
        * finite samples that all share one timestamp: that timestamp's
          first value, everywhere;
        * otherwise ``np.interp`` output, which holds the first/last value
          constant outside the sampled time range.

    Raises
    ------
    ValueError
        If ``source_time`` and ``source_values`` differ in shape.
    """
    source_time = np.asarray(source_time)
    source_values = np.asarray(source_values)
    target_time = np.asarray(target_time)

    if source_time.shape != source_values.shape:
        raise ValueError(
            "source_time and source_values must have the same shape, got "
            f"{source_time.shape} and {source_values.shape}"
        )

    valid_mask = np.isfinite(source_time) & np.isfinite(source_values)
    if valid_mask.sum() < 2:
        return np.full_like(target_time, np.nan, dtype=float)

    valid_time = source_time[valid_mask]
    valid_values = source_values[valid_mask]

    # np.unique returns the times sorted and, with return_index, the position
    # of the FIRST occurrence of each time in input order. That makes a
    # separate argsort unnecessary and makes the choice among duplicate
    # timestamps deterministic.
    unique_time, first_idx = np.unique(valid_time, return_index=True)
    unique_values = valid_values[first_idx]
    if unique_time.size < 2:
        return np.full_like(target_time, unique_values[0], dtype=float)

    # np.interp clamps to the end values outside [unique_time[0], unique_time[-1]].
    return np.interp(target_time, unique_time, unique_values)
+ """ + if not enabled: + return sea_photon_dataset + + required_cols = {'solar_elevation', 'background_rate', 'photon_conf'} + if not required_cols.issubset(set(sea_photon_dataset.columns)): + logger.warning("Solar background filter skipped because required columns are missing.") + return sea_photon_dataset + + filtered_dataset = sea_photon_dataset.copy() + daytime_mask = filtered_dataset['solar_elevation'] >= day_threshold + daytime_background = filtered_dataset.loc[daytime_mask, 'background_rate'] + daytime_background = daytime_background[np.isfinite(daytime_background)] + + if daytime_background.empty: + logger.info("Solar background filter enabled, but no valid daytime photons were found.") + return filtered_dataset + + quantile_threshold = daytime_background.quantile(background_quantile) + noisy_daytime_mask = daytime_mask & (filtered_dataset['background_rate'] >= quantile_threshold) + keep_mask = (~noisy_daytime_mask) | (filtered_dataset['photon_conf'] >= min_signal_conf) + + before_count = len(filtered_dataset) + filtered_dataset = filtered_dataset.loc[keep_mask].copy() + after_count = len(filtered_dataset) + logger.info( + "Solar background filter removed %s photons (from %s to %s).", + before_count - after_count, before_count, after_count + ) + return filtered_dataset + + +def apply_optional_ir_ap_filter( + sea_photon_dataset, + enabled=False, + quality_max=0, + min_signal_conf=0 +): + """ + Optionally remove likely flagged photons as an IR/afterpulse proxy. 
+ This uses available ATL03 photon fields: + - quality_ph: keep <= quality_max + - photon_conf: keep >= min_signal_conf + """ + if not enabled: + return sea_photon_dataset + + filtered_dataset = sea_photon_dataset.copy() + keep_mask = np.ones(len(filtered_dataset), dtype=bool) + + if 'quality_ph' in filtered_dataset.columns: + quality_values = pd.to_numeric(filtered_dataset['quality_ph'], errors='coerce') + keep_mask &= quality_values <= quality_max + else: + logger.warning("IR/AP filter enabled, but quality_ph is missing.") + + if 'photon_conf' in filtered_dataset.columns: + conf_values = pd.to_numeric(filtered_dataset['photon_conf'], errors='coerce') + keep_mask &= conf_values >= min_signal_conf + else: + logger.warning("IR/AP filter enabled, but photon_conf is missing.") + + before_count = len(filtered_dataset) + filtered_dataset = filtered_dataset.loc[keep_mask].copy() + after_count = len(filtered_dataset) + logger.info( + "IR/AP proxy filter removed %s photons (from %s to %s).", + before_count - after_count, before_count, after_count + ) + return filtered_dataset + + +def Extract_sea_photons(IS2_atl03_mds, target_strong_beams, shoreline_data_path): + Segment_ID = {} + Segment_Index_begin = {} + Segment_PE_count = {} + Equator_Segment_Distance = {} + Segment_Length = {} + Segment_Is_Land = {} + Segment_Lon = {} + Segment_Lat = {} + Segment_Elev = {} + Segment_Time = {} + Segment_ref_elev = {} + Segment_ref_azimuth = {} + background_rate = {} + background_counts = {} + + # Initialize a list to store data for each beam + beam_datasets = [] + + + # Loop over each strong beam in target_strong_beams + for gtx in target_strong_beams: + print('Processing strong beam ID:', gtx) + + # Access the data for the current beam + IS2_val = IS2_atl03_mds[gtx] + + # Initialize dictionaries to store segment data for the beam + Segment_ID[gtx] = IS2_val['geolocation']['segment_id'] + n_seg = len(Segment_ID[gtx]) + n_pe, = IS2_val['heights']['delta_time'].shape + 
Segment_Index_begin[gtx] = IS2_val['geolocation']['ph_index_beg'] - 1 + Segment_PE_count[gtx] = IS2_val['geolocation']['segment_ph_cnt'] + Equator_Segment_Distance[gtx] = IS2_val['geolocation']['segment_dist_x'] + Segment_Length[gtx] = IS2_val['geolocation']['segment_length'] + delta_time = IS2_val['geolocation']['delta_time'] + segment_lat = IS2_val['geolocation']['reference_photon_lat'][:].copy() + segment_lon = IS2_val['geolocation']['reference_photon_lon'][:].copy() + ref_elev = IS2_val['geolocation']['ref_elev'][:].copy() + ref_azimuth = IS2_val['geolocation']['ref_azimuth'][:].copy() + geoid = IS2_val['geophys_corr']['geoid'][:].copy() + h_ph = IS2_val['heights']['h_ph'][:].copy() + photon_delta_time = IS2_val['heights']['delta_time'][:].copy() + lat_ph = IS2_val['heights']['lat_ph'][:].copy() + lon_ph = IS2_val['heights']['lon_ph'][:].copy() + signal_conf_photon = IS2_val['heights']['signal_conf_ph'][..., 0].copy() + x_atc = IS2_val['heights']['dist_ph_along'][:].copy() + y_atc = IS2_val['heights']['dist_ph_across'][:].copy() + quality_ph = IS2_val['heights']['quality_ph'] + + # Optional variables for solar background sensitivity testing + photon_solar_elevation = np.full_like(photon_delta_time, np.nan, dtype=float) + photon_background_rate = np.full_like(photon_delta_time, np.nan, dtype=float) + if 'solar_elevation' in IS2_val['geolocation']: + segment_solar_elevation = IS2_val['geolocation']['solar_elevation'][:].copy() + photon_solar_elevation = interpolate_by_time( + delta_time, segment_solar_elevation, photon_delta_time + ) + if 'bckgrd_atlas' in IS2_val and 'bckgrd_rate' in IS2_val['bckgrd_atlas'] and 'delta_time' in IS2_val['bckgrd_atlas']: + bckgrd_rate = IS2_val['bckgrd_atlas']['bckgrd_rate'][:].copy() + bckgrd_time = IS2_val['bckgrd_atlas']['delta_time'][:].copy() + photon_background_rate = interpolate_by_time( + bckgrd_time, bckgrd_rate, photon_delta_time + ) + + # Adjust x_atc based on segment distances + for seg_index in range(n_seg): + idx = 
Segment_Index_begin[gtx][seg_index] + cnt = Segment_PE_count[gtx][seg_index] + x_atc[idx:idx + cnt] += Equator_Segment_Distance[gtx][seg_index] + + # Calculate relative distances + relative_AT_dist = (x_atc - x_atc[0]) / 1000 + relative_seg_dist = (Equator_Segment_Distance[gtx] - Equator_Segment_Distance[gtx][0]) / 1000 + + # Create a GeoDataFrame to hold segment data for shoreline check + Segment_Is_Land['geometry'] = gpd.points_from_xy(segment_lon, segment_lat) + ICESat2_GDF = gpd.GeoDataFrame(Segment_Is_Land, crs="EPSG:4326") + + # Determine if it is land by the land/sea mask + Segment_Is_Land_Labels = isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF) + ICESat2_GDF.loc[:, 'is_land'] = Segment_Is_Land_Labels + + # Apply interpolations for required data + is_land_label_interp1d = apply_interpolation(interpolate_labels(segment_lat, Segment_Is_Land_Labels), lat_ph) + ph_ref_elev = apply_interpolation(interpolate_labels(segment_lat, ref_elev), lat_ph) + ph_ref_azimuth = apply_interpolation(interpolate_labels(segment_lat, ref_azimuth), lat_ph) + ph_geoid = apply_interpolation(interpolate_labels(segment_lat, geoid), lat_ph) + + + # Create photon DataFrame for the current beam + sea_photon_dataset = create_photon_dataframe(lat_ph=lat_ph, lon_ph=lon_ph, + ref_elev=ph_ref_elev,ref_azimuth=ph_ref_azimuth, + geoid=ph_geoid,h_ph=h_ph, + quality_ph=quality_ph, + is_land_label_interp1d=is_land_label_interp1d, + signal_conf_photon=signal_conf_photon, + x_atc=x_atc, # Note: x_atc is not used in the function. Remove if unnecessary. 
def filter_photon_dataset_by_hull_area(photon_dataset, hull_area_threshold=3000):
    """Drop latitude bins whose photon cloud spans too small a convex-hull area.

    For every ``lat_bins`` group with at least three rows, a convex hull is
    built over the (``lat``, ``photon_height``) points of that group.

    Parameters
    ----------
    photon_dataset : pandas.DataFrame
        Must contain ``lat_bins``, ``lat`` and ``photon_height`` columns.
        It is not modified.
    hull_area_threshold : float, default 3000
        Bins whose hull area is below this value are removed.

    Returns
    -------
    filtered_dataset : pandas.DataFrame
        Copy of ``photon_dataset`` without the rows of rejected bins. Bins
        with fewer than three rows are never evaluated and therefore kept.
    convex_hull_areas : dict
        ``{lat_bin: area}`` for every evaluated bin. Bins whose points are
        degenerate for Qhull (for example all collinear) are recorded with
        an area of ``0.0`` instead of aborting the whole filter.
    convex_hulls : dict
        ``{lat_bin: hull vertex coordinates}`` for bins that met the
        threshold and have a valid hull.
    """
    try:
        from scipy.spatial import QhullError
    except ImportError:
        # Fallback for SciPy versions that do not export QhullError from
        # scipy.spatial.
        from scipy.spatial.qhull import QhullError

    convex_hulls = {}
    convex_hull_areas = {}
    rejected_bins = []

    for lat_bin, bin_data in photon_dataset.groupby('lat_bins', observed=False):
        if len(bin_data) < 3:
            # A 2-D hull needs at least three points; leave such bins alone.
            continue

        points = bin_data[['lat', 'photon_height']].to_numpy()
        try:
            hull = ConvexHull(points)
        except QhullError:
            # Degenerate geometry: no hull can be built, treat as zero area.
            hull = None

        # For 2-D input ``ConvexHull.volume`` is the enclosed area
        # (``ConvexHull.area`` would be the perimeter).
        area = 0.0 if hull is None else hull.volume
        convex_hull_areas[lat_bin] = area

        if area < hull_area_threshold:
            rejected_bins.append(lat_bin)
        elif hull is not None:
            convex_hulls[lat_bin] = points[hull.vertices]

    # One pass over the frame instead of re-filtering it per rejected bin.
    keep_rows = ~photon_dataset['lat_bins'].isin(rejected_bins)
    filtered_dataset = photon_dataset[keep_rows].copy()

    return filtered_dataset, convex_hull_areas, convex_hulls
# Compiled once at import time. The dot before "h5" is escaped so that only a
# literal ".h5" suffix is accepted.
_ATL_FILENAME_PATTERN = re.compile(
    r'(processed_)?(ATL\d{2})_(\d{4})(\d{2})(\d{2})(\d{2})'
    r'(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})_(\d{3})_(\d{2})(.*?)\.h5$'
)


def extract_file_params(file_path):
    '''
    Split an ATL granule file name (or a path ending in one) into its fields.

    Example: input "processed_ATL06_20231115120000_20231115_001_12_data.h5"
    Output: ('processed_', 'ATL06', '2023', '11', '15', '12', '00', '00',
             '2023', '11', '15', '001', '12', '_data')

    Parameters
    ----------
    file_path : str
        File name or path; the pattern is searched anywhere in the string
        but must end at ".h5".

    Returns
    -------
    tuple of str
        The 14 captured groups in pattern order. Optional parts that are
        absent (the "processed_" prefix, the trailing text before ".h5")
        are returned as empty strings.

    Raises
    ------
    IndexError
        If the string does not match the pattern. The exception type is the
        one the previous ``findall(...).pop()`` implementation raised, kept
        so existing callers behave the same; only the message is new.
    '''
    match = _ATL_FILENAME_PATTERN.search(file_path)
    if match is None:
        raise IndexError(f"no ATL granule parameters found in {file_path!r}")
    # default='' mirrors re.findall, which reports unmatched groups as ''.
    return match.groups(default='')
only those within valid index ranges + valid_bins = [(bin[0], bin[1]) for bin in unique_bins if 1 <= bin[0] < len(lat_bins) and 1 <= bin[1] < len(height_bins)] + + polygons = [] + for bin_lat, bin_height in valid_bins: + if bin_lat > 0 and bin_lat < len(lat_bins) and bin_height > 0 and bin_height < len(height_bins): + bin_points = np.array([(lat[mask][i], height[mask][i]) for i in range(sum(mask)) + if lat_bin_indices[i] == bin_lat and height_bin_indices[i] == bin_height]) + if len(bin_points) > 2: + hull = ConvexHull(bin_points) + polygons.append(bin_points[hull.vertices]) + + return lat_bins, height_bins, valid_bins, polygons + + + diff --git a/aok/core/kd_utils/interpolation.py b/aok/core/kd_utils/interpolation.py new file mode 100644 index 0000000..59b04b4 --- /dev/null +++ b/aok/core/kd_utils/interpolation.py @@ -0,0 +1,19 @@ +# utils/interpolation.py + +import scipy.interpolate + +def interpolate_labels(segment_lat, labels): + model = scipy.interpolate.interp1d(segment_lat, labels, fill_value="extrapolate") + return model + +def apply_interpolation(model, lat_ph): + return model(lat_ph) + +def geoid_correction(lat_ph, segment_lat, geoid): + model = interpolate_labels(segment_lat, geoid) + return apply_interpolation(model, lat_ph) + +def refraction_correction(lat_ph, segment_lat, ref_elev, ref_azimuth): + elev_model = interpolate_labels(segment_lat, ref_elev) + azimuth_model = interpolate_labels(segment_lat, ref_azimuth) + return apply_interpolation(elev_model, lat_ph), apply_interpolation(azimuth_model, lat_ph) diff --git a/aok/core/kd_utils/kd_utils.py b/aok/core/kd_utils/kd_utils.py new file mode 100644 index 0000000..39c3687 --- /dev/null +++ b/aok/core/kd_utils/kd_utils.py @@ -0,0 +1,1181 @@ +#!/usr/bin/env python +# coding: utf-8 + +##### +# This group of functions processes ICESat-2 data and creates a bathymetric model. +# To do this, it follows a number of steps in the form of functions, including: +# 1. Reading data (ReadATL03()) +# 2. 
def _read_datasets(h5_group):
    """Return ``{name: member[:]}`` for every member of an HDF5 group."""
    return {key: val[:] for key, val in h5_group.items()}


#this function from icesat2_toolkit
# PURPOSE: read ICESat-2 ATL03 HDF5 data files
def read_granule(FILENAME, ATTRIBUTES=False, **kwargs):
    """
    Reads ICESat-2 ATL03 Global Geolocated Photons data files

    The file is opened in a ``with`` block, so the HDF5 handle is closed
    even when one of the expected groups or variables is missing and the
    read raises.

    Parameters
    ----------
    FILENAME: str or file-like
        full path to ATL03 file (``~`` is expanded), or an ``io.IOBase``
        object that is handed to ``h5py.File`` unchanged
    ATTRIBUTES: bool, default False
        read file, group and variable attributes
    **kwargs
        accepted and ignored

    Returns
    -------
    IS2_atl03_mds: dict
        ATL03 variables
    IS2_atl03_attrs: dict
        ATL03 attributes; when ``ATTRIBUTES`` is False only the empty
        placeholder dictionaries for the non-beam groups are present
    IS2_atl03_beams: list
        valid ICESat-2 beams within ATL03 file

    Raises
    ------
    KeyError
        If a group or variable read below is absent from the file.
    """
    # Open the HDF5 file for reading
    source = FILENAME if isinstance(FILENAME, io.IOBase) else os.path.expanduser(FILENAME)

    with h5py.File(source, 'r') as fileID:
        # Output HDF5 file information
        logging.info(fileID.filename)
        logging.info(list(fileID.keys()))

        # allocate python dictionaries for ICESat-2 ATL03 variables and attributes
        IS2_atl03_mds = {}
        IS2_atl03_attrs = {}

        # read each input beam within the file
        IS2_atl03_beams = []
        for gtx in fileID.keys():
            if not re.match(r'gt\d[lr]', gtx):
                continue
            # check if subsetted beam contains data
            # check in both the geolocation and heights groups
            try:
                fileID[gtx]['geolocation']['segment_id']
                fileID[gtx]['heights']['delta_time']
            except KeyError:
                continue
            IS2_atl03_beams.append(gtx)

        # Per-beam groups: measurements, geolocation, background photon rate
        # and geophysical corrections (tides, inverted barometer effects,
        # range corrections for tropospheric delays).
        beam_groups = ('heights', 'geolocation', 'bckgrd_atlas', 'geophys_corr')
        for gtx in IS2_atl03_beams:
            IS2_atl03_mds[gtx] = {
                group: _read_datasets(fileID[gtx][group]) for group in beam_groups
            }
            # Getting attributes of included variables
            if ATTRIBUTES:
                IS2_atl03_attrs[gtx] = {group: {} for group in beam_groups}
                # Global Group Attributes
                IS2_atl03_attrs[gtx].update(fileID[gtx].attrs.items())
                for group in beam_groups:
                    for key, val in fileID[gtx][group].items():
                        IS2_atl03_attrs[gtx][group][key] = dict(val.attrs.items())

        # ICESat-2 spacecraft orientation at time
        IS2_atl03_mds['orbit_info'] = {}
        IS2_atl03_attrs['orbit_info'] = {}
        for key, val in fileID['orbit_info'].items():
            IS2_atl03_mds['orbit_info'][key] = val[:]
            # Getting attributes of group and included variables
            if ATTRIBUTES:
                # Global Group Attributes (copied inside the member loop, as
                # before, so the resulting dictionary is unchanged)
                IS2_atl03_attrs['orbit_info'].update(fileID['orbit_info'].attrs.items())
                # Variable Attributes
                IS2_atl03_attrs['orbit_info'][key] = dict(val.attrs.items())

        # information ancillary to the data product
        # number of GPS seconds between the GPS epoch (1980-01-06T00:00:00Z UTC)
        # and ATLAS Standard Data Product (SDP) epoch (2018-01-01T00:00:00Z UTC)
        # Add this value to delta time parameters to compute full gps_seconds
        # could alternatively use the Julian day of the ATLAS SDP epoch: 2458119.5
        # and add leap seconds since 2018-01-01T00:00:00Z UTC (ATLAS SDP epoch)
        IS2_atl03_mds['ancillary_data'] = {}
        IS2_atl03_attrs['ancillary_data'] = {}
        ancillary_keys = ['atlas_sdp_gps_epoch','data_end_utc','data_start_utc',
            'end_cycle','end_geoseg','end_gpssow','end_gpsweek','end_orbit',
            'end_region','end_rgt','granule_end_utc','granule_start_utc','release',
            'start_cycle','start_geoseg','start_gpssow','start_gpsweek',
            'start_orbit','start_region','start_rgt','version']
        for key in ancillary_keys:
            # get each HDF5 variable
            val = fileID['ancillary_data'][key]
            IS2_atl03_mds['ancillary_data'][key] = val[:]
            # Getting attributes of group and included variables
            if ATTRIBUTES:
                # Variable Attributes
                IS2_atl03_attrs['ancillary_data'][key] = dict(val.attrs.items())

        # transmit-echo-path (tep) parameters
        IS2_atl03_mds['ancillary_data']['tep'] = {}
        IS2_atl03_attrs['ancillary_data']['tep'] = {}
        for key, val in fileID['ancillary_data']['tep'].items():
            # get each HDF5 variable
            IS2_atl03_mds['ancillary_data']['tep'][key] = val[:]
            # Getting attributes of group and included variables
            if ATTRIBUTES:
                # Variable Attributes
                IS2_atl03_attrs['ancillary_data']['tep'][key] = dict(val.attrs.items())

        # channel dead time and first photon bias derived from ATLAS calibration
        cal1, cal2 = ('ancillary_data', 'calibrations')
        for var in ['dead_time', 'first_photon_bias']:
            IS2_atl03_mds[cal1][var] = {}
            IS2_atl03_attrs[cal1][var] = {}
            for key, val in fileID[cal1][cal2][var].items():
                # get each HDF5 variable
                if isinstance(val, h5py.Dataset):
                    IS2_atl03_mds[cal1][var][key] = val[:]
                elif isinstance(val, h5py.Group):
                    IS2_atl03_mds[cal1][var][key] = {k: v[:] for k, v in val.items()}
                # Getting attributes of group and included variables
                if ATTRIBUTES:
                    # Variable Attributes
                    IS2_atl03_attrs[cal1][var][key] = dict(val.attrs.items())
                    if isinstance(val, h5py.Group):
                        for k, v in val.items():
                            # Attributes of the member itself; the earlier
                            # code copied the parent group's attributes into
                            # every member entry and never used ``v``.
                            IS2_atl03_attrs[cal1][var][key][k] = dict(v.attrs.items())

        # get ATLAS impulse response variables for the transmitter echo path (TEP)
        tep1, tep2 = ('atlas_impulse_response', 'tep_histogram')
        IS2_atl03_mds[tep1] = {}
        IS2_atl03_attrs[tep1] = {}
        for pce in ['pce1_spot1', 'pce2_spot3']:
            IS2_atl03_mds[tep1][pce] = {tep2: {}}
            IS2_atl03_attrs[tep1][pce] = {tep2: {}}
            # for each TEP variable
            for key, val in fileID[tep1][pce][tep2].items():
                IS2_atl03_mds[tep1][pce][tep2][key] = val[:]
                # Getting attributes of included variables
                if ATTRIBUTES:
                    # Global Group Attributes
                    IS2_atl03_attrs[tep1][pce][tep2].update(
                        fileID[tep1][pce][tep2].attrs.items())
                    # Variable Attributes
                    IS2_atl03_attrs[tep1][pce][tep2][key] = dict(val.attrs.items())

        # Global File Attributes
        if ATTRIBUTES:
            IS2_atl03_attrs.update(fileID.attrs.items())

    # The HDF5 file is closed here by the ``with`` block.
    # Return the datasets and variables
    return (IS2_atl03_mds, IS2_atl03_attrs, IS2_atl03_beams)
+datum=WGS84 +vunits=m +no_defs +geoidgrids=egm2008-1.gtx' + # # Define the Proj string for WGS84 ellipsoidal height + # wgs84_proj_string = '+proj=latlong +ellps=WGS84 +datum=WGS84 +no_defs' + + # # Define the Proj string for EGM2008 orthometric height: egm08_25,egm2008-1 + # egm2008_proj_string = \ + # '+proj=latlong +ellps=WGS84 +datum=WGS84 +no_defs ' \ + # '+geoidgrids=C:/Workstation/ICESat2_HLS/Code/Geoids/egm08_25.gtx' + + # transform ellipsoid (WGS84) height to orthometric height + # transformer = Transformer.from_crs(wgs84_proj_string, egm2008_proj_string, always_xy=True) + transformer = Transformer.from_crs("EPSG:4326", "EPSG:3857", always_xy=True) + X_egm08, Y_egm08, Z_egm08 = transformer.transform(lon, lat, Z) + + # transform WGS84 proj to local UTM + myProj = Proj(epsg) + X_utm, Y_utm = myProj(lon, lat) + + return Y_utm, X_utm, Z_egm08 + + +# Snippet by Eric Guenther (via Amy N.) for assigning photons to a segment +def get_atl03_seg_id(atl03_ph_index_beg, atl03_segment_id, atl03_heights_len): + # We need to know spacecraft orbit info, + # which is provided across segments. + # This first function assigns photons to the segment they belong to. + # We end up making a new array + # that has more points to match the photon. Segment is defined as every 100m in the long track. + + # Filter all data where atl03_ph_index starts at 0 (0 indicates errors) + indsNotZero = atl03_ph_index_beg != 0 + atl03_ph_index_beg = atl03_ph_index_beg[indsNotZero] + atl03_segment_id = atl03_segment_id[indsNotZero] + + # Subtract 1 from ph_index_beg to start at python 0th pos + atl03_ph_index_beg = atl03_ph_index_beg - 1 + + # Sometimes the ph_index_beg is not at the 0th position, it is not, + # add it in and then add the associated segment id + # Warning, this is assuming that the segment id for the points are from + # the segment id directly before it, this assumption might fail, but I have + # not come across a case yet where it does. 
If you want to play it safe + # you could comment this section out and then if the first position is not + # 0 then all photons before the first position will not be assigned a + # segment id. + # if atl03_ph_index_beg[0] != 0: + # atl03_ph_index_beg = np.append(0,atl03_ph_index_beg) + # first_seg_id = atl03_segment_id[0] -1 + # atl03_segment_id = np.append(first_seg_id,atl03_segment_id) + + # Append atl03_height_len to end of array for final position + atl03_ph_index_beg = np.append(atl03_ph_index_beg, atl03_heights_len) + + # Make array equal to the length of the atl03_heights photon level data + ph_segment_id = np.zeros(atl03_heights_len) + + # Iterate through ph_index_beg, from the first to second to last number + # and set the photons between ph_index_beg i to ph_index_beg i + 1 to + # segment id i + for i in range(0, len(atl03_ph_index_beg) - 1): + ph_segment_id[atl03_ph_index_beg[i]:atl03_ph_index_beg[i + 1]] = atl03_segment_id[i] + + # Return list of segment_id at the photon level + return ph_segment_id + + +def ref_linear_interp(x, y): + # initialize an empty list + arr = [] + + # get unique x values + ux = np.unique(x) + for u in ux: + # get y values for x=u + idx = y[x == u] + + # try to get the y values for x=u-1 and x=u + # if this is not possible, set min and max to y values for x=u + try: + min = y[x == u - 1][0] + max = y[x == u][0] + except: + min = y[x == u][0] + max = y[x == u][0] + + # try to get the y values for x=u and x=u+1 + # if this is not possible, set min and max to y values for x=u + try: + min = y[x == u][0] + max = y[x == u + 1][0] + except: + min = y[x == u][0] + max = y[x == u][0] + + # if the min and max values are the same, + # fill the sub array with the value + if min == max: + sub = np.full((len(idx)), min) + arr.append(sub) + + # if min and max are different, + # create a sub array using linear interpolation + else: + sub = np.linspace(min, max, len(idx)) + arr.append(sub) + + # concatenate all the sub arrays into a single 
array and return + return np.concatenate(arr, axis=None).ravel() + + +# Bin data along vertical and horizontal scales +def horizontal_vertical_bin_dataset(dataset, lat_res, vertical_res): + """Bin data along vertical and horizontal scales + for later segmentation""" + + # Filter values within the range (-50, 10), because photons elevation outside this range will be real noise + valid_range = (-50, 10) + valid_mask = (dataset['photon_height'] > valid_range[0]) & (dataset['photon_height'] < valid_range[1]) + + # Apply the valid_mask to filter unwanted values + filtered_dataset = dataset[valid_mask] + + # Calculate the number of height bins + height_range = abs(filtered_dataset['photon_height'].max() - filtered_dataset['photon_height'].min()) + height_bin_number = max(1, round(height_range / vertical_res)) # Ensure at least one bin + + # Calculate the number of latitude bins + lat_range = abs(filtered_dataset['lat'].max() - filtered_dataset['lat'].min()) + lat_bin_number = max(1, round(lat_range / lat_res)) # Ensure at least one bin + + # Create bins for latitude + lat_bins = pd.cut(filtered_dataset['lat'], bins=lat_bin_number, labels=np.arange(lat_bin_number)) + + # Create bins for height + height_bins = pd.cut(filtered_dataset['photon_height'], bins=height_bin_number, + labels=np.round(np.linspace(filtered_dataset['photon_height'].min(), + filtered_dataset['photon_height'].max(), + num=height_bin_number), decimals=1)) + + # Add bins to dataframe using .loc to avoid SettingWithCopyWarning + filtered_dataset.loc[:, 'lat_bins'] = lat_bins + filtered_dataset.loc[:, 'height_bins'] = height_bins + filtered_dataset = filtered_dataset.reset_index(drop=True) + + return filtered_dataset + + +# thinking about grid searching to detect bathymetric directly +# rather than bin and then search +def horizontal_vertical_grid_density_cal(dataset, lat_res, vertical_res, density_threshold): + """Bin data along vertical and horizontal scales + and calculate high-density points using a 
# Bin data along horizontal scale only
def horizontal_bin_dataset(dataset, lat_res):
    """Bin photons along the horizontal (``lat_utm``) axis only.

    Parameters:
        dataset: DataFrame with a ``lat_utm`` column.
        lat_res: width of one horizontal bin, in the units of ``lat_utm``.

    Returns:
        A copy of ``dataset`` with a fresh 0..n-1 index and a categorical
        ``lat_bins`` column labelled ``0 .. n_bins - 1``.  At least one bin is
        always used, so a track shorter than ``lat_res`` ends up in bin 0.
    """
    binned = dataset.copy()

    if binned.empty:
        binned['lat_bins'] = pd.Categorical([])
        return binned.reset_index(drop=True)

    span = abs(dataset['lat_utm'].max() - dataset['lat_utm'].min())
    # pd.cut rejects a bin count of 0, which round() yields for short tracks.
    lat_bin_number = max(1, int(round(span / lat_res)))

    binned['lat_bins'] = pd.cut(dataset['lat_utm'], bins=lat_bin_number,
                                labels=np.arange(lat_bin_number))
    return binned.reset_index(drop=True)


# Bin data first, and then throw away only the surface bin
def get_rm_sea_surface_bin(binned_dataset):
    """Drop the sea-surface height bin (and everything above it) per lat bin.

    For every ``lat_bins`` group the ``height_bins`` category holding the most
    photons is taken as the sea surface; only photons whose ``height_bins``
    value sorts strictly below that category are kept.

    Parameters:
        binned_dataset: DataFrame with ``lat``, ``lat_bins`` and an ordered
            categorical ``height_bins`` column (as produced by ``pd.cut``).

    Returns:
        DataFrame of the retained photons, original index labels preserved.
        Empty (same columns) when nothing is retained.
    """
    kept = []

    # Unobserved lat bins hold no photons and therefore contribute nothing.
    for _, photons in binned_dataset.groupby('lat_bins', observed=True):
        # Photon count per height bin; empty height bins count as 0.
        counts = photons.groupby('height_bins', observed=False)['lat'].count()
        if counts.empty:
            continue

        peak_bin = counts.index[int(counts.to_numpy().argmax())]
        kept.append(photons.loc[photons['height_bins'] < peak_bin])

    if not kept:
        return binned_dataset.iloc[0:0]

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(kept)
lat_utm + lat_bin_average = v['lat'].mean() + + # Create new dataframe based on occurrence of photons per height bin + new_df = pd.DataFrame(v.groupby(['height_bins'], observed=False).count()) + + # Return the bin with the highest count + largest_h_bin = new_df['lat'].argmax() + + # Select the index of the bin with the highest count + largest_h_index = new_df.index[largest_h_bin] + + # Calculate the median value of all photon height values within this bin + photons_sea_surface = v.loc[v['height_bins'] == largest_h_index, 'photon_height'] + lat_bin_sea_median = photons_sea_surface.median() + + # Append to sea height list + sea_surface_height.append(lat_bin_sea_median) + mean_lat_bins_seq.append(lat_bin_average) + del new_df + + # Get all photons below sea surface + # to determine segment type of each subsurface water column + # Use calculated sea height to determine photons at 0.5m below peak + photons_sea_surface_up = \ + v.loc[(v['photon_height'] > (lat_bin_sea_median - threshold)) & + (v['photon_height'] < (lat_bin_sea_median + 2*threshold))] + + # Calculate the photon ratio between surface and whole photons + if v['photon_height'].shape[0] > 0: + new_photons_ratio_sea_surface = \ + photons_sea_surface_up.shape[0] / v['photon_height'].shape[0] + else: + new_photons_ratio_sea_surface = np.nan + + sea_surface_subsurface_photons_ratio.append((1 - new_photons_ratio_sea_surface)) + + # Filter out sea height bin values outside 2 SD of mean. 
# Arbitrary cutoff below the max value - 0.5 m below peak
# (we may also use 1 m but we can add that later) (apply to raw photon data, each of 6 beams)
def get_photon_below_sea_surface(binned_data, threshold):
    """Return the photons lying more than ``threshold`` below the sea surface.

    For every ``lat_bins`` group the sea surface is the median
    ``photon_height`` of the photons in the most populated ``height_bins``
    category; photons with ``photon_height < surface - threshold`` are kept.

    Parameters:
        binned_data: DataFrame with ``lat_utm``, ``photon_height``,
            ``lat_bins`` and ``height_bins`` columns.
        threshold: vertical offset below the per-bin surface height.

    Returns:
        DataFrame of the retained photons in group order, original index
        labels preserved; empty (same columns) when there are no groups.
    """
    # NOTE(review): the per-bin surface height is used unfiltered here, while
    # get_sea_surface_height first rejects bins outside 2 SD of the mean.
    # Confirm which behaviour is wanted before aligning the two.
    below_surface = []

    for _, photons in binned_data.groupby('lat_bins', observed=False):
        # Photon count per height bin; the fullest bin marks the surface.
        counts = photons.groupby('height_bins', observed=False)['lat_utm'].count()
        if counts.empty:
            continue

        peak_bin = counts.index[int(counts.to_numpy().argmax())]
        surface = photons.loc[photons['height_bins'] == peak_bin,
                              'photon_height'].median()

        # The surface height is paired with its own group here; looking it up
        # by group key breaks when the key is a tuple or not 0..n-1.
        below_surface.append(
            photons.loc[photons['photon_height'] < (surface - threshold)])

    if not below_surface:
        return binned_data.iloc[0:0]

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(below_surface)


# Function to get elevation for multiple points
def get_seafloor_bathy_GEBCO_batch(lons, lats, raster, raster_data):
    """Sample ``raster_data`` at many (lon, lat) points.

    Parameters:
        lons: array-like of x coordinates.
        lats: array-like of y coordinates, same shape as ``lons``.
        raster: object exposing ``index(xs, ys) -> (rows, cols)`` plus
            ``height`` and ``width`` attributes (presumably an open rasterio
            dataset - confirm against the caller).
        raster_data: 2-D array indexed as ``[row, col]``.

    Returns:
        Float array shaped like ``lons``; points whose row/col fall outside
        the raster are NaN.
    """
    # Map the coordinates to row/column indices of the raster grid.
    rows, cols = raster.index(lons, lats)
    rows, cols = np.asarray(rows), np.asarray(cols)

    in_bounds = (rows >= 0) & (rows < raster.height) & (cols >= 0) & (cols < raster.width)

    elevations = np.full(np.shape(lons), np.nan)
    # Fancy indexing needs integer indices even if index() returned floats.
    elevations[in_bounds] = raster_data[rows[in_bounds].astype(np.intp),
                                        cols[in_bounds].astype(np.intp)]
    return elevations


def get_water_temp(date_year, date_month, date_day, latitude, longitude):
    """Fetch the sea surface temperature at the track mid-point, in Celsius.

    Pulls the JPL GHRSST MUR L4 product for the given date through OPeNDAP.
    The mean latitude/longitude of the track is rescaled linearly from
    [-90, 90] / [-180, 180] onto grid indices 0..17998 / 0..35999.

    Parameters:
        date_year, date_month, date_day: date parts as strings that
            concatenate to ``YYYYMMDD``.
        latitude, longitude: array-likes with a ``mean()`` method.

    Returns:
        ``analysed_sst`` at the selected grid cell minus 273.15 (K to C).
    """
    date = date_year + date_month + date_day
    # The URL needs the day of year, zero padded to three digits.
    day_of_year = str(datetime.strptime(date, '%Y%m%d').timetuple().tm_yday).zfill(3)

    # NOTE(review): latitude +90 / longitude +180 map to index 17998 / 35999;
    # confirm these are valid indices of the product grid.
    lat_index = round(((latitude.mean() + 90) / 180) * 17998)
    lon_index = round(((longitude.mean() + 180) / 360) * 35999)

    url = 'https://opendap.jpl.nasa.gov/opendap/OceanTemperature/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/' \
          + str(date_year) + '/' + day_of_year + '/' + str(date) \
          + '090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc'

    # Close the remote dataset even if the read fails.
    with netCDF4.Dataset(url) as dataset:
        water_temp = dataset['analysed_sst'][0, lat_index, lon_index] - 273.15
    return water_temp
Ph_ref_azimuth, + PhotonZ, PhotonX, PhotonY, Ph_Conf): + """ + WTemp; there is python library that pulls water temp data + WSmodel is the value surface height + Wavelength is fixed + """ + + # Only process photons below water surface model + PhotonX = PhotonX[PhotonZ <= WSmodel] + PhotonY = PhotonY[PhotonZ <= WSmodel] + Photon_ref_elev = Photon_ref_elev[PhotonZ <= WSmodel] + Ph_ref_azimuth = Ph_ref_azimuth[PhotonZ <= WSmodel] + Ph_Conf = Ph_Conf[PhotonZ <= WSmodel] + PhotonZ = PhotonZ[PhotonZ <= WSmodel] + + # water temp for refraction correction + WaterTemp = WTemp + + # Refraction coefficient # + a = -0.000001501562500 + b = 0.000000107084865 + c = -0.000042759374989 + d = -0.000160475520686 + e = 1.398067112092424 + wl = Wavelength + + # refractive index of air + n1 = 1.00029 + + # refractive index of water + n2 = (a * WaterTemp ** 2) + (b * wl ** 2) + (c * WaterTemp) + (d * wl) + e + + # assumption is 0.25416 + # This example is refractionCoef = 0.25449 + # 1.00029 is refraction of air constant + correction_coef = (1 - (n1 / n2)) + + # read photon ref_elev to get theta1 + theta1 = np.pi / 2 - Photon_ref_elev + + # eq 1. Theta2 + theta2 = np.arcsin(((n1 * np.sin(theta1)) / n2)) + + # eq 3. S + # Approximate water Surface = 1.5 + # D = raw uncorrected depth + D = WSmodel - PhotonZ + + # For Triangle DTS + S = D / np.cos(theta1) + + # eq 2. 
def get_bath_height_percentile_thresh(binned_data, percentile_thresh, sea_surface_height, vertical_res):
    """Detect the bathymetric level per latitude bin with a count threshold.

    Steps:
      1. Count photons per (``lat_bins``, ``height_bins``) cell over ALL of
         ``binned_data`` and take the ``percentile_thresh`` percentile of the
         per-lat-bin maximum cell count as the count threshold.
      2. Keep only photons more than two vertical bins below
         ``sea_surface_height``.
      3. In every lat bin pick the height bin with the most remaining
         photons; if its count reaches the threshold, its photons are
         classified as bathymetry.

    Parameters:
        binned_data: DataFrame with ``lat``, ``lon``, ``photon_height``,
            ``cor_photon_height``, ``lat_bins`` and ``height_bins`` columns.
        percentile_thresh: percentile (0-100) passed to ``np.percentile``.
        sea_surface_height: scalar sea surface height.
        vertical_res: vertical bin size.

    Returns:
        ``(bath_height, geo_df)``: ``bath_height`` has one entry per lat-bin
        group (median ``cor_photon_height`` of the chosen bin, or NaN);
        ``geo_df`` lists the classified photons with ``lon``, ``lat``,
        ``photon_height`` (the corrected height) and ``depth``.  ``geo_df`` is
        empty when no bin reaches the threshold.
    """
    def _flatten(chunks):
        # np.concatenate refuses an empty list.
        return np.concatenate(chunks).ravel().tolist() if chunks else []

    bath_height = []
    geo_photon_height = []
    geo_longitude = []
    geo_latitude = []

    # Drop the surface: keep photons two bins below the sea surface height.
    binned_data_bath = binned_data[(binned_data['photon_height'] <
                                    sea_surface_height - (vertical_res * 2))]

    # observed=False is spelled out so empty bins are handled the same way on
    # every pandas version (it is the historical default).
    cell_counts = (binned_data
                   .groupby(['lat_bins', 'height_bins'], observed=False)
                   .size()
                   .reset_index(name='n_photons'))
    peak_per_lat_bin = cell_counts.groupby('lat_bins', observed=False)['n_photons'].max()
    if peak_per_lat_bin.empty:
        count_threshold = np.nan  # nothing can pass the comparison below
    else:
        count_threshold = np.percentile(peak_per_lat_bin.to_numpy(), percentile_thresh)

    for _, photons in binned_data_bath.groupby('lat_bins', observed=False):
        counts = photons.groupby('height_bins', observed=False)['lat'].count()
        if counts.empty:
            bath_height.append(np.nan)
            continue

        peak_pos = int(counts.to_numpy().argmax())

        # Too few photons in the fullest sub-surface bin: no bathymetry here.
        if not counts.iloc[peak_pos] >= count_threshold:
            bath_height.append(np.nan)
            continue

        in_peak = photons['height_bins'] == counts.index[peak_pos]
        heights = photons.loc[in_peak, 'cor_photon_height']

        geo_photon_height.append(heights.values)
        geo_longitude.append(photons.loc[in_peak, 'lon'].values)
        geo_latitude.append(photons.loc[in_peak, 'lat'].values)
        bath_height.append(heights.median())

    geo_longitude_list = _flatten(geo_longitude)
    geo_latitude_list = _flatten(geo_latitude)
    geo_photon_list = _flatten(geo_photon_height)

    # Convert to an array first: a plain float minus a list raises TypeError.
    geo_depth = sea_surface_height - np.asarray(geo_photon_list, dtype=float)

    geo_df = pd.DataFrame(
        {'lon': geo_longitude_list, 'lat': geo_latitude_list,
         'photon_height': geo_photon_list,
         'depth': geo_depth})

    return bath_height, geo_df
+ binned_data_bath = lat_binned_data[(lat_binned_data['photon_height'] < + sea_surface_height - (vertical_res * 2))] + + grouped_data = binned_data_bath.groupby(['lat_bins'], group_keys=True) + data_groups = dict(list(grouped_data)) + + # Loop through groups and return average bathymetric height + for k, v in data_groups.items(): + + # assign each group of dataset to a new dataframe + new_df = pd.DataFrame(v) + + # Check if the DataFrame is empty + if new_df.empty: + # If the DataFrame is empty, append null values to the lists and skip to the next iteration + bath_height.append(np.nan) + geo_photon_height.append([]) + geo_longitude.append([]) + geo_latitude.append([]) + del new_df + continue + + lat_height_pairs = list(zip(new_df['lat_utm'], new_df['cor_photon_height'])) + + # Convert to a numpy array for sklearn + lat_height_pairs_array = np.array(lat_height_pairs) + + # Perform HDBSCAN clustering on the photons below sea surface for each lat bin + # scaler = StandardScaler() + scaler = MinMaxScaler() + data_scaled = scaler.fit_transform(lat_height_pairs_array) + + min_cluster_size = 4 + + # Check the number of data points is greater than the initial min_cluster_size + if len(data_scaled) < min_cluster_size: + min_cluster_size = len(data_scaled) // 2 + if min_cluster_size < 2: + bath_height.append(np.nan) + geo_photon_height.append([]) + geo_longitude.append([]) + geo_latitude.append([]) + del new_df + continue + + # cluster = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, leaf_size=20) + cluster = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, + min_samples=3, cluster_selection_epsilon=20, + leaf_size=25, core_dist_n_jobs=-1) + cluster.fit(data_scaled) + + # Get the labels assigned to each point by the HDBSCAN model + labels = cluster.labels_ + + # Count the number of points assigned to each cluster + unique, counts = np.unique(labels, return_counts=True) + + # Create a dictionary that maps each cluster label to the count of points assigned to it + 
# requires that the input gdf has ranged index values i
# will need to change if index is changed to time or something
# this currently checks point in polygon for EVERY point
# would be significantly sped up if evaluated at 10m or something similar
# maybe later, fine for now
def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF):
    """Label every photon as land (1) or not land (0) using shoreline polygons.

    Side effects on ``ICESat2_GDF``: the columns ``lat``, ``lon`` (taken from
    the geometry) and ``is_land`` (initialised to -1) are inserted at the
    front when they are missing, and the CRS is set to EPSG:4326.

    Parameters:
        shoreline_data_path: path of the shoreline polygon dataset, read with
            ``gpd.read_file`` restricted to the bounding box of the photons.
        ICESat2_GDF: GeoDataFrame of photon points.

    Returns:
        int64 array with one label per row: 1 where the point lies within a
        land polygon, 0 otherwise.  If anything fails (for example the
        shoreline data cannot be read) the error is printed and an array of
        -1 is returned instead.
    """
    n_photons = len(ICESat2_GDF)

    try:
        # Only insert what is missing, so the function can be called again on
        # a frame it has already prepared.
        if 'lat' not in ICESat2_GDF.columns:
            ICESat2_GDF.insert(0, 'lat', ICESat2_GDF.geometry.y, False)
        if 'lon' not in ICESat2_GDF.columns:
            ICESat2_GDF.insert(0, 'lon', ICESat2_GDF.geometry.x, False)

        # Land flag initialized as -1 (unknown)
        if 'is_land' not in ICESat2_GDF.columns:
            ICESat2_GDF.insert(0, 'is_land',
                               np.full(n_photons, -1, dtype=np.int64), False)

        # set the projection
        ICESat2_GDF.set_crs("EPSG:4326", inplace=True)

        # Load only the shoreline features intersecting the photon bounding
        # box.  The pyogrio engine is used because fiona sometimes errors here.
        land_polygon_gdf = gpd.read_file(shoreline_data_path, bbox=ICESat2_GDF, engine='pyogrio')

        # Points that fall within any land polygon
        pts_in_land = gpd.sjoin(ICESat2_GDF, land_polygon_gdf, predicate='within')
        land_loc = ICESat2_GDF.index.isin(pts_in_land.index)

        return land_loc.astype(np.int64)

    except Exception as e:
        # Deliberate best effort: report and fall back to "unknown" labels.
        print(e)
        print("Error loading shoreline data, returning -1s for is_land flag")

        # Built from the row count so the fallback cannot fail itself when
        # the is_land column was never inserted.
        return np.full(n_photons, -1, dtype=np.int64)
np.linspace(binned_data.lat.min(), + binned_data.lat.max(), len(bath_height))+20 + + sea_surface_x_axis_bins = np.linspace(binned_data.lat.min(), + binned_data.lat.max(), len(sea_height))+10 + + # Create new dataframes for median values + bath_median_df = pd.DataFrame({'x': bath_x_axis_bins, 'y': bath_height}) + + # Create uniform sea surface based on median sea surface values and filter out surface breaching + sea_height1 = [np.nanmedian(sea_height) if i == i else np.nan for i in sea_height] + sea_median_df = pd.DataFrame({'x': sea_surface_x_axis_bins, 'y': sea_height1}) + + # Create uniform solo sea surface label + sea_surface_label = solo_sea_surface_label + sea_surface_label_df = pd.DataFrame({'x': sea_surface_x_axis_bins, 'y': sea_surface_label}) + idx_1 = np.where(sea_surface_label_df.y == 1) + idx_0 = np.where(sea_surface_label_df.y == 0) + + # Define figure size + fig = plt.rcParams["figure.figsize"] = (40, 25) + + # Plot raw points + # plt.scatter(x=binned_data.lat, + # y = binned_data.photon_height, marker='o', lw=0, s=1, alpha = 0.8, + # c = 'yellow', label = 'Raw photon height') + plt.scatter(ref_y, ref_z, s=0.5, alpha=0.1, c='black') + plt.scatter(geo_df.lat, geo_df.photon_height, s=0.8, marker = 'o', + alpha=0.1, c='red', label='Classified Photons') + + # plt.scatter(x=geo_df.lat, + # y = geo_df.photon_height, marker='o', lw=0, s=0.8, + # alpha = 0.8, c = 'black', label = 'Corrected photon bin') + + # Plot median values + plt.scatter(bath_median_df.x, bath_median_df.y, + marker='o', c='r', alpha=0.8, s=2, label='Median bathymetry') + + plt.scatter(sea_median_df.x, sea_median_df.y, + marker='o', c='b', alpha=1, s=2, label='Median sea surface') + + plt.scatter(sea_surface_label_df.iloc[idx_1].x, sea_surface_label_df.iloc[idx_1].y, + marker='o', c='pink', alpha=1, s=3, label='solo_sea_surface') + plt.scatter(sea_surface_label_df.iloc[idx_0].x, sea_surface_label_df.iloc[idx_0].y, + marker='o', c='g', alpha=1, s=3, label='non_solo_sea_surface') + + # 
# Function to apply binning beam-by-beam by calling the function of horizontal_vertical_bin_dataset
def process_sea_photon_binning(sea_photon_dataset, horizontal_res, vertical_res):
    """Bin sea photons beam by beam and stack the results.

    Each ``beam_id`` group is passed to ``horizontal_vertical_bin_dataset``
    separately, so the bins are derived from that beam's own extent.

    Parameters:
        sea_photon_dataset: DataFrame with a ``beam_id`` column plus the
            columns required by ``horizontal_vertical_bin_dataset``.
        horizontal_res: latitude bin size forwarded to the binning function.
        vertical_res: height bin size forwarded to the binning function.

    Returns:
        One DataFrame with the binned photons of all beams and a fresh
        0..n-1 index.  When the input holds no beams an empty frame with
        empty ``lat_bins`` / ``height_bins`` columns is returned.
    """
    binned_beam_datasets = []

    for beam_id, beam_data in sea_photon_dataset.groupby('beam_id'):
        print(f'Processing binning for beam: {beam_id}')
        binned_beam_datasets.append(
            horizontal_vertical_bin_dataset(beam_data, horizontal_res, vertical_res))

    if not binned_beam_datasets:
        # pd.concat raises on an empty list; hand back an empty, well-formed frame.
        return sea_photon_dataset.iloc[0:0].assign(lat_bins=pd.Categorical([]),
                                                   height_bins=pd.Categorical([]))

    return pd.concat(binned_beam_datasets, ignore_index=True)
def get_sea_surface_height_static(binned_data, threshold):
    """
    Calculate sea surface height per latitude bin and return subsurface photons,
    using a fixed (static) vertical threshold.

    Parameters:
        binned_data (pandas.DataFrame): Binned photon data with lat, lat_bins,
            height_bins, photon_height
        threshold (float): Vertical offset. Photons within
            (surface - threshold, surface + 2 * threshold) count as surface
            photons; photons below surface - threshold are returned as
            subsurface photons.

    Returns:
        final_sea_surface_height (list): Per lat bin, the median photon height
            of the most populated height bin; NaN for empty bins and for bins
            further than 2 SD from the mean of all bins
        sea_surface_height_abnormal_label (array): 0 where the height is NaN,
            1 otherwise
        sea_surface_dominated_label (array): 0 where the fraction of photons
            outside the surface window is >= 0.2, 1 otherwise
        PhotonDFBelowThresholdPeak (pandas.DataFrame): Photons below
            surface - threshold from every bin with a valid height (an empty
            DataFrame when there are none)
    """
    # Group dataset along horizontal (latitude bins)
    groups = [photons for _, photons in
              binned_data.groupby('lat_bins', observed=False)]

    sea_surface_height = []
    sea_surface_subsurface_photons_ratio = []

    for photons in groups:
        # Photon count per height bin; the fullest bin marks the surface.
        counts = photons.groupby('height_bins', observed=False)['lat'].count()

        if counts.empty:
            sea_surface_height.append(np.nan)
            sea_surface_subsurface_photons_ratio.append(np.nan)
            continue

        peak_bin = counts.index[int(counts.to_numpy().argmax())]
        surface = photons.loc[photons['height_bins'] == peak_bin,
                              'photon_height'].median()
        sea_surface_height.append(surface)

        heights = photons['photon_height']
        n_total = heights.shape[0]
        if n_total > 0:
            n_surface = ((heights > (surface - threshold)) &
                         (heights < (surface + 2 * threshold))).sum()
            surface_ratio = n_surface / n_total
        else:
            surface_ratio = np.nan

        # Stored as the share of photons that are NOT in the surface window.
        sea_surface_subsurface_photons_ratio.append(1 - surface_ratio)

    # Filter out sea height bin values outside 2 SD of mean.
    heights_arr = np.asarray(sea_surface_height, dtype=float)
    if heights_arr.size and not np.all(np.isnan(heights_arr)):
        mean = np.nanmean(heights_arr)
        sd = np.nanstd(heights_arr)
        is_outlier = (heights_arr > (mean + 2 * sd)) | (heights_arr < (mean - 2 * sd))
        final_sea_surface_height = np.where(is_outlier, np.nan, heights_arr).tolist()
    else:
        # Nothing to compare against; keep the (all-NaN or empty) values.
        final_sea_surface_height = heights_arr.tolist()

    sea_surface_height_abnormal_label = np.where(
        np.isnan(np.asarray(final_sea_surface_height, dtype=float)), 0, 1)

    # Determine label based on ratio of sea surface photons and subsurface photons
    sea_surface_dominated_label = \
        np.where(np.array(sea_surface_subsurface_photons_ratio) >= 0.2, 0, 1)

    # Collect the photons below (surface - threshold) for every valid bin and
    # concatenate once instead of growing a frame inside the loop.
    below = [photons.loc[photons['photon_height'] < (surface - threshold)]
             for photons, surface in zip(groups, final_sea_surface_height)
             if not np.isnan(surface)]
    PhotonDFBelowThresholdPeak = pd.concat(below) if below else pd.DataFrame()

    return final_sea_surface_height, \
        sea_surface_height_abnormal_label, \
        sea_surface_dominated_label, \
        PhotonDFBelowThresholdPeak
+ + Parameters: + binned_data (pandas.DataFrame): Binned photon data with lat, height_bins, photon_height + + Returns: + final_sea_surface_height (list): Detected sea surface heights + sea_surface_height_abnormal_label (array): Labels for abnormal heights + sea_surface_dominated_label (array): Labels for surface-dominated bins + PhotonDFBelowSurface (pandas.DataFrame): Photons below detected surface with waves removed + """ + + firstTimeIndex = True + sea_surface_height = [] + mean_lat_bins_seq = [] + sea_surface_subsurface_photons_ratio = [] + + grouped_data = binned_data.groupby(['lat_bins'], group_keys=True, observed=False) + data_groups = dict(list(grouped_data)) + + for k, v in data_groups.items(): + lat_bin_average = v['lat'].mean() + + if len(v) < 20: # Increase minimum photon count for robustness + sea_surface_height.append(np.nan) + mean_lat_bins_seq.append(np.nan) + sea_surface_subsurface_photons_ratio.append(np.nan) + continue + + # Finer histogram for better peak resolution + hist, bin_edges = np.histogram(v['photon_height'], + bins=100, # Finer bins + density=True, + range=(v['photon_height'].min(), v['photon_height'].max())) + bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2 + + # Enhanced peak detection + peaks, properties = find_peaks(hist, + height=np.max(hist)*0.3, # Stricter peak height + distance=10, # Wider separation + prominence=np.max(hist)*0.1) # Require prominent peaks + + if len(peaks) == 0: + sea_surface_height.append(np.nan) + mean_lat_bins_seq.append(np.nan) + sea_surface_subsurface_photons_ratio.append(np.nan) + continue + + # Use strongest peak as surface, consider nearby peaks as wave effects + surface_peak_idx = peaks[np.argmax(hist[peaks])] + surface_height = bin_centers[surface_peak_idx] + + # Wider window for Gaussian fit to capture wave effects + peak_data = v[(v['photon_height'] > surface_height - 1.0) & + (v['photon_height'] < surface_height + 1.0)] + if len(peak_data) > 10: + mu, sigma = 
norm.fit(peak_data['photon_height']) + else: + mu, sigma = surface_height, 0.2 # More conservative default + + # Stricter threshold: 3σ below mean, plus minimum depth + # Waves can extend beyond 2.5σ, especially in rough conditions. + # 3σ is a common statistical cutoff for excluding outliers + adaptive_threshold = min(mu - 3.0 * sigma, mu - 1.0) # Ensure at least 1m below + + # Extended surface layer to remove wave effects + surface_photons = v[(v['photon_height'] > adaptive_threshold) & + (v['photon_height'] < mu + 3.0 * sigma)] # Wider upper bound + ratio = len(surface_photons) / len(v) if len(v) > 0 else np.nan + + sea_surface_height.append(mu) + mean_lat_bins_seq.append(lat_bin_average) + sea_surface_subsurface_photons_ratio.append(1 - ratio) + + # Outlier filtering + mean = np.nanmean(sea_surface_height) + sd = np.nanstd(sea_surface_height) + final_sea_surface_height = np.where((sea_surface_height > mean + 2*sd) | + (sea_surface_height < mean - 2*sd), + np.nan, + sea_surface_height).tolist() + + sea_surface_height_abnormal_label = np.where(np.isnan(final_sea_surface_height), 0, 1) + sea_surface_dominated_label = np.where(np.array(sea_surface_subsurface_photons_ratio) >= 0.2, 0, 1) + + # Filter subsurface photons with diagnostics + PhotonDFBelowSurface = pd.DataFrame() + for i, (k, v) in enumerate(data_groups.items()): + if not np.isnan(final_sea_surface_height[i]): + peak_data = v[(v['photon_height'] > final_sea_surface_height[i] - 1.0) & + (v['photon_height'] < final_sea_surface_height[i] + 1.0)] + if len(peak_data) > 10: + mu, sigma = norm.fit(peak_data['photon_height']) + adaptive_threshold = min(mu - 3.0 * sigma, mu - 1.0) + else: + adaptive_threshold = final_sea_surface_height[i] - 1.0 # Stricter default + + NewPhotonDFBelowSurface = v[v['photon_height'] < adaptive_threshold] + + if firstTimeIndex: + PhotonDFBelowSurface = NewPhotonDFBelowSurface + firstTimeIndex = False + else: + PhotonDFBelowSurface = pd.concat([PhotonDFBelowSurface, 
NewPhotonDFBelowSurface]) + + return (final_sea_surface_height, + sea_surface_height_abnormal_label, + sea_surface_dominated_label, + PhotonDFBelowSurface) + + +def get_water_temp(date_year, date_month, date_day, latitude, longitude): + """ + Pull down surface water temperature along the track from the JPL GHRSST opendap website. + + The GHRSST data are gridded tiles with dimension 17998 x 35999. + To get the specific grid tile of the SST, you must convert from lat, lon coordinates + to the gridded tile ratio of the SST data product using the coordinates of the IS2 data. + """ + # Get date from data filename + # data_path[-33:-25] + date = date_year + date_month + date_day + # date[0:4] + year = date_year + # date[4:6] + month = date_month + # date[6:8] + day = date_day + day_of_year = str(datetime.strptime(date, '%Y%m%d').timetuple().tm_yday) + # Add zero in front of day of year string + zero_day_of_year = day_of_year.zfill(3) + + # Calculate ratio of latitude from mid-point of IS2 track + old_lat = latitude.mean() + old_lat_min = -90 + old_lat_max = 90 + new_lat_min = 0 + new_lat_max = 17998 + + new_lat = round(((old_lat - old_lat_min) / (old_lat_max - old_lat_min)) * + (new_lat_max - new_lat_min) + new_lat_min) + + # Calculate ratio of longitude from mid-point of IS2 track + old_lon = longitude.mean() + old_lon_min = -180 + old_lon_max = 180 + new_lon_min = 0 + new_lon_max = 35999 + + new_lon = round(((old_lon - old_lon_min) / (old_lon_max - old_lon_min)) * + (new_lon_max - new_lon_min) + new_lon_min) + + # Access the SST data using the JPL OpenDap interface + url = 'https://opendap.jpl.nasa.gov/opendap/OceanTemperature/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/' \ + + str(year) + '/' + str(zero_day_of_year) + '/' + str(date) \ + + '090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc' + + dataset = netCDF4.Dataset(url) + + # Access the data and convert the temperature from K to C + water_temp = dataset['analysed_sst'][0, new_lat, new_lon] - 273.15 + return 
water_temp + + +def refraction_correction(WTemp, WSmodel, Wavelength, + Photon_ref_elev, Ph_ref_azimuth, + PhotonZ, PhotonX, PhotonY, Ph_Conf): + """ + WTemp; there is python library that pulls water temp data + WSmodel is the value surface height + Wavelength is fixed + """ + + # Only process photons below water surface model + PhotonX = PhotonX[PhotonZ <= WSmodel] + PhotonY = PhotonY[PhotonZ <= WSmodel] + Photon_ref_elev = Photon_ref_elev[PhotonZ <= WSmodel] + Ph_ref_azimuth = Ph_ref_azimuth[PhotonZ <= WSmodel] + Ph_Conf = Ph_Conf[PhotonZ <= WSmodel] + PhotonZ = PhotonZ[PhotonZ <= WSmodel] + + # water temp for refraction correction + WaterTemp = WTemp + + # Refraction coefficient # + a = -0.000001501562500 + b = 0.000000107084865 + c = -0.000042759374989 + d = -0.000160475520686 + e = 1.398067112092424 + wl = Wavelength + + # refractive index of air + n1 = 1.00029 + + # refractive index of water + n2 = (a * WaterTemp ** 2) + (b * wl ** 2) + (c * WaterTemp) + (d * wl) + e + + # assumption is 0.25416 + # This example is refractionCoef = 0.25449 + # 1.00029 is refraction of air constant + correction_coef = (1 - (n1 / n2)) + + # read photon ref_elev to get theta1 + theta1 = np.pi / 2 - Photon_ref_elev + + # eq 1. Theta2 + theta2 = np.arcsin(((n1 * np.sin(theta1)) / n2)) + + # eq 3. S + # Approximate water Surface = 1.5 + # D = raw uncorrected depth + D = WSmodel - PhotonZ + + # For Triangle DTS + S = D / np.cos(theta1) + + # eq 2. 
R + R = (S * n1) / n2 + Gamma = (np.pi / 2) - theta1 + + # For triangle RPS + # phi is an angle needed + phi = theta1 - theta2 + + # P is the difference between raw and corrected YZ location + P = np.sqrt(R ** 2 + S ** 2 - 2 * R * S * np.cos(phi)) + + # alpha is an angle needed + alpha = np.arcsin((R * np.sin(phi)) / P) + + # Beta angle needed for Delta Y an d Delta Z + Beta = Gamma - alpha + + # Delta Y + DY = P * np.cos(Beta) + + # Delta Z + DZ = P * np.sin(Beta) + + # Delta Easting + DE = DY * np.sin(Ph_ref_azimuth) + + # Delta Northing + DN = DY * np.cos(Ph_ref_azimuth) + + outX = PhotonX + DE + outY = PhotonY + DN + outZ = PhotonZ + DZ + + ''' + print('For selected Bathy photon:') + print('lat = ', PhotonY[9000]) + print('long = ', PhotonX[9000]) + print('Raw Depth = ', PhotonZ[9000]) + print('D = ', D[9000]) + + print('ref_elev = ', Photon_ref_elev[9000]) + + print('Delta East = ', DE[9000]) + print('Delta North = ', DN[9000]) + print('Delta Z = ', DZ[9000]) + ''' + return (outX, outY, outZ, Ph_Conf, PhotonX, PhotonY, PhotonZ, Ph_ref_azimuth, + Photon_ref_elev) # We are most interested in out-x, out-y, out-z + diff --git a/aok/core/kd_utils/sliderule_adapter.py b/aok/core/kd_utils/sliderule_adapter.py new file mode 100644 index 0000000..47c6e35 --- /dev/null +++ b/aok/core/kd_utils/sliderule_adapter.py @@ -0,0 +1,181 @@ +import logging +from typing import Dict, List, Optional + +import numpy as np +import pandas as pd +from pyproj import Geod, Proj + +from kd_utils.data_processing import convert_wgs_to_utm + +logger = logging.getLogger(__name__) + + +def _get_beam_id_from_track_pair(df: pd.DataFrame) -> pd.Series: + if 'gt' in df.columns: + return df['gt'].astype(str) + if 'track' in df.columns and 'pair' in df.columns: + suffix = df['pair'].map({0: 'l', 1: 'r'}).fillna('x') + return 'gt' + df['track'].astype(int).astype(str) + suffix + return pd.Series(['unknown'] * len(df), index=df.index, dtype='object') + + +def _infer_strong_beam_suffix(df: pd.DataFrame) 
-> Optional[str]: + if 'sc_orient' not in df.columns or df.empty: + return None + orient = df['sc_orient'].dropna() + if orient.empty: + return None + # Same logic as your Florida script. + return 'l' if int(orient.iloc[0]) == 0 else 'r' + + +def _normalize_signal_conf(values: pd.Series) -> pd.Series: + def to_scalar(v): + if isinstance(v, (list, tuple, np.ndarray)): + if len(v) == 0: + return np.nan + return float(v[0]) + try: + return float(v) + except Exception: + return np.nan + return values.apply(to_scalar) + + +def _relative_distance_km_per_beam(df: pd.DataFrame) -> pd.Series: + out = pd.Series(np.nan, index=df.index, dtype=float) + for beam_id, beam_df in df.groupby('beam_id'): + beam_df = beam_df.sort_values('delta_time') + if 'segment_dist' in beam_df.columns: + rel_km = (beam_df['segment_dist'] - beam_df['segment_dist'].min()) / 1000.0 + out.loc[beam_df.index] = rel_km.values + continue + + lons = beam_df['longitude'].to_numpy(dtype=float) + lats = beam_df['latitude'].to_numpy(dtype=float) + if len(lons) == 0: + continue + geod = Geod(ellps='WGS84') + cumulative_m = np.zeros(len(lons), dtype=float) + for i in range(1, len(lons)): + _, _, dist_m = geod.inv(lons[i - 1], lats[i - 1], lons[i], lats[i]) + cumulative_m[i] = cumulative_m[i - 1] + max(dist_m, 0.0) + out.loc[beam_df.index] = cumulative_m / 1000.0 + return out + + +def _project_to_utm(df: pd.DataFrame) -> pd.DataFrame: + projected = df.copy() + projected['lon'] = np.nan + projected['lat'] = np.nan + for beam_id, beam_df in projected.groupby('beam_id'): + if beam_df.empty: + continue + lon0 = float(beam_df['longitude'].iloc[0]) + lat0 = float(beam_df['latitude'].iloc[0]) + epsg = convert_wgs_to_utm(lon0, lat0) + proj = Proj(epsg) + x, y = proj(beam_df['longitude'].to_numpy(dtype=float), beam_df['latitude'].to_numpy(dtype=float)) + projected.loc[beam_df.index, 'lon'] = x + projected.loc[beam_df.index, 'lat'] = y + return projected + + +def sliderule_to_framework_dataset( + atl03_gdf, + 
strong_beams_only: bool = True, +) -> pd.DataFrame: + """ + Convert SlideRule ATL03 photon GeoDataFrame to the framework's sea_photon_dataset schema. + """ + if atl03_gdf is None or len(atl03_gdf) == 0: + return pd.DataFrame(columns=[ + 'beam_id', 'latitude', 'longitude', 'lat', 'lon', 'photon_height', + 'quality_ph', 'photon_conf', 'ref_elevation', 'ref_azimuth', + 'relative_AT_dist', 'solar_elevation', 'background_rate', 'is_land_label' + ]) + + src = pd.DataFrame(atl03_gdf).copy() + src['beam_id'] = _get_beam_id_from_track_pair(src) + if strong_beams_only: + strong_suffix = _infer_strong_beam_suffix(src) + if strong_suffix in ('l', 'r'): + src = src[src['beam_id'].str.endswith(strong_suffix)] + + out = pd.DataFrame({ + 'beam_id': src['beam_id'].astype(str), + 'latitude': pd.to_numeric(src.get('lat_ph', np.nan), errors='coerce'), + 'longitude': pd.to_numeric(src.get('lon_ph', np.nan), errors='coerce'), + 'photon_height': pd.to_numeric(src.get('h_ph', np.nan), errors='coerce'), + 'quality_ph': pd.to_numeric(src.get('quality_ph', np.nan), errors='coerce'), + 'ref_elevation': pd.to_numeric(src.get('ref_elev', np.nan), errors='coerce'), + 'ref_azimuth': pd.to_numeric(src.get('ref_azimuth', np.nan), errors='coerce'), + 'solar_elevation': pd.to_numeric(src.get('solar_elevation', np.nan), errors='coerce'), + 'background_rate': pd.to_numeric(src.get('bckgrd_rate', np.nan), errors='coerce'), + 'delta_time': pd.to_numeric(src.get('delta_time', np.nan), errors='coerce'), + 'segment_dist': pd.to_numeric(src.get('segment_dist', np.nan), errors='coerce'), + }) + + if 'signal_conf_ph' in src.columns: + out['photon_conf'] = _normalize_signal_conf(src['signal_conf_ph']) + else: + out['photon_conf'] = np.nan + + out = out.dropna(subset=['latitude', 'longitude', 'photon_height']).copy() + out = _project_to_utm(out) + out['relative_AT_dist'] = _relative_distance_km_per_beam(out) + out['is_land_label'] = 0 + + return out.drop(columns=['delta_time', 'segment_dist'], errors='ignore') + 
+ +def fetch_sliderule_atl03( + region: List[Dict[str, float]], + t0: str, + t1: str, + rgt: Optional[int] = None, + ph_fields: Optional[List[str]] = None, + sliderule_url: str = "slideruleearth.io", + verbose: bool = False, +): + """ + Fetch ATL03 photons from SlideRule. + """ + try: + from sliderule import sliderule, icesat2 + except Exception as e: + raise ImportError("sliderule package is required for SlideRule ingestion.") from e + + sliderule.init(sliderule_url, verbose=verbose) + fields = ph_fields or [ + "delta_time", "lat_ph", "lon_ph", "h_ph", "quality_ph", "signal_conf_ph", + "segment_dist", "ref_elev", "ref_azimuth", "solar_elevation", "bckgrd_rate", + ] + params = {"poly": region, "t0": t0, "t1": t1, "atl03_ph_fields": fields} + if rgt is not None: + params["rgt"] = int(rgt) + return icesat2.atl03sp(params) + + +def fetch_and_prepare_sliderule_dataset( + region: List[Dict[str, float]], + t0: str, + t1: str, + rgt: Optional[int] = None, + strong_beams_only: bool = True, + sliderule_url: str = "slideruleearth.io", + verbose: bool = False, +) -> pd.DataFrame: + """ + One-call helper: + fetch ATL03 photons from SlideRule and convert to framework dataset schema. 
+ """ + atl03_gdf = fetch_sliderule_atl03( + region=region, + t0=t0, + t1=t1, + rgt=rgt, + sliderule_url=sliderule_url, + verbose=verbose, + ) + return sliderule_to_framework_dataset(atl03_gdf, strong_beams_only=strong_beams_only) diff --git a/aok/core/kd_utils/visualization.py b/aok/core/kd_utils/visualization.py new file mode 100644 index 0000000..64034d4 --- /dev/null +++ b/aok/core/kd_utils/visualization.py @@ -0,0 +1,263 @@ +# utils/visualization.py + +import numpy as np +import os +import pandas as pd +import time +import geopandas as gpd +from pyproj import Transformer, Proj +import matplotlib.pyplot as plt +from scipy.spatial import ConvexHull + +def plot_photon_height(sea_photon_dataset, hlims=[-25, 10]): + fig, ax = plt.subplots() + ax.scatter(sea_photon_dataset['latitude'], sea_photon_dataset['photon_height'], s=1, c='k', alpha=0.15, edgecolors='none', label='ATL03 Photons') + ax.set_xlabel('Relative AT Distance') + ax.set_ylabel('Height (h_ph)') + ax.set_title('Scatter Plot of ATL03 Photons') + ax.set_ylim(hlims) + ax.legend() + plt.show() + +def plot_filtered_seafloor_photons(filtered_seafloor_subsurface_dataset, sea_photon_dataset, sea_surface_height, output_path): + """ + Plots the filtered seafloor photon data along with sea surface and seafloor elevation data. + + Parameters: + - filtered_seafloor_subsurface_dataset: DataFrame containing the filtered subsurface photon data. + - sea_photon_dataset: DataFrame containing the original sea photon data. + - sea_surface_height: Array of sea surface height values. + - output_path: Path to save the output plot. 
+ """ + + # get sea_surface_x_axis_bins + sea_surface_x_axis_bins = np.linspace(filtered_seafloor_subsurface_dataset['relative_AT_dist'].min(), + filtered_seafloor_subsurface_dataset['relative_AT_dist'].max(), + len(sea_surface_height)) + + hlims = [-25, 10] + fig, ax = plt.subplots() + + # Scatter plot of subsurface photons + ax.scatter(sea_photon_dataset['relative_AT_dist'], sea_photon_dataset['photon_height'], + s=1, c='k', alpha=0.15, edgecolors='none', label='Subsurface ATL03 Photons') + + # Overlay the seafloor elevation data as points + ax.plot(filtered_seafloor_subsurface_dataset['relative_AT_dist'], filtered_seafloor_subsurface_dataset['seafloor_elevation'], + linewidth=0.8, c='b', alpha=0.4, label='Seafloor Elevation') + + ax.plot(sea_surface_x_axis_bins, [x - 0.5 for x in sea_surface_height], + linewidth=0.8, color='#DD571C', alpha=0.4, label='0.5 m Below Surface Peak') + + # Set labels and title + ax.set_xlabel('Distance From Start of Track (km)') + ax.set_ylabel('Photon Height (m)') + ax.set_title('Scatter Plot of Filtered Sea Photon Dataset') + ax.set_ylim(hlims) + + # Add a legend + ax.legend() + + # Save the plot + plt.legend(loc='upper right') + plt.savefig(output_path, dpi=400, format='jpeg') + + # Show the plot + plt.show() + + +def plot_convex_hulls(photon_dataset, target_beam_ids, convex_hulls, convex_hull_areas): + """ + Plots ConvexHull polygons and annotated areas for each lat_bin group in the dataset. 
+ """ + #filter beam type + # photon_dataset + photon_dataset = photon_dataset[photon_dataset['beam_id'].isin(target_beam_ids)] + + fig, ax = plt.subplots(figsize=(10, 6)) + hlims = [-10, 2] + + for lat_bin, hull_points in convex_hulls.items(): + hull = ConvexHull(hull_points) + ax.fill(hull_points[hull.vertices, 0], hull_points[hull.vertices, 1], alpha=0.3, label=f'Bin {lat_bin}') + + # Calculate centroid to place the label + centroid = np.mean(hull_points[hull.vertices], axis=0) + ax.text(centroid[0], centroid[1], f'{convex_hull_areas[lat_bin]:.2f}', + horizontalalignment='center', verticalalignment='center', fontsize=10, color='black') + + ax.scatter(photon_dataset['lat'], photon_dataset['photon_height'], + s=1, c='k', alpha=0.15, edgecolors='none', label='Subsurface ATL03 Photons') + ax.set_ylim(hlims) + ax.set_xlabel('Latitude') + ax.set_ylabel('Photon Height') + ax.set_title('ConvexHull of Photons within each Lat Bin') + plt.show() + +# plot the kd and photon +def plot_kd_photons(OutputPath, timestamp, target_beam_ids, subsurface_photon_dataset, Kd_DF_MergedDistance): + #filter beam type + # photon_dataset + subsurface_photon_dataset = subsurface_photon_dataset[subsurface_photon_dataset['beam_id'].isin(target_beam_ids)] + Kd_DF_MergedDistance = Kd_DF_MergedDistance[Kd_DF_MergedDistance['beam_id'].isin(target_beam_ids)] + + hlims = [-45, 5] + fig, ax1 = plt.subplots(figsize=(10, 6)) + ax1.scatter(subsurface_photon_dataset['relative_AT_dist'], subsurface_photon_dataset['photon_height'], + s=1.5, c='k', alpha=0.2, edgecolors='none', label='Subsurface ATL03 Photon Height') + ax1.set_xlabel('Relative Along-Track Distance', fontsize=18) + ax1.tick_params(axis='x', labelsize=18) # Added line for x-tick label fontsize + ax1.set_ylabel('Photon Height', color='b', fontsize=18) + ax1.tick_params(axis='y', labelcolor='b', labelsize=18) + ax1.plot(subsurface_photon_dataset['relative_AT_dist'], subsurface_photon_dataset['seafloor_elevation'], + linewidth=0.8, c='b', 
alpha=0.4, label='Seafloor Elevation') + # ax1.axhline(y=-6, color='blue', linestyle='--', label='y=-6 m') + # ax1.set_xlim([1800, 2100]) # Set x-axis limits + # ax1.set_ylim(hlims) + + ax2 = ax1.twinx() + ax2.scatter(Kd_DF_MergedDistance['relative_AT_dist'], Kd_DF_MergedDistance['kd'], + label='Kd values', color='r', alpha=0.6) + ax2.set_ylabel('Kd Value', color='r', fontsize=18) + ax2.tick_params(axis='y', labelcolor='r',labelsize=18) + # fig.suptitle('Photon Height and Kd Values along Relative Along-Track Distance') + handles1, labels1 = ax1.get_legend_handles_labels() + handles2, labels2 = ax2.get_legend_handles_labels() + fig.legend(handles1 + handles2, labels1 + labels2, loc='upper right', bbox_to_anchor=(0.85, 0.85)) + # fig.legend(handles1 + handles2, labels1 + labels2, loc='lower left', bbox_to_anchor=(0.08, 0.08)) + fig.tight_layout() + plt.savefig(os.path.join(OutputPath, f'{timestamp}_IS2_subsurface_kd_2.jpg'), dpi=400, format='jpeg') + plt.show() + + + +def plot_bin_polygon_data(lat, height, seafloor_height, mask, lat_bins, height_bins, valid_bins, polygons, y_min, y_max): + plt.figure(figsize=(12, 6)) + + plt.subplot(1, 2, 1) + plt.title("Original Height Data") + plt.scatter(lat, height, c=height, cmap='viridis', s=10) + plt.plot(np.sort(lat), seafloor_height[np.argsort(lat)], 'r-', label='Seafloor') + plt.colorbar(label='Height') + plt.xlabel('Latitude') + plt.ylabel('Height') + plt.ylim(y_min, y_max) + plt.legend() + + plt.subplot(1, 2, 2) + plt.title("Masked Region (ROI)") + plt.scatter(lat[mask], height[mask], c=height[mask], cmap='viridis', s=10) + plt.plot(np.sort(lat), seafloor_height[np.argsort(lat)], 'r-', label='Seafloor') + + for polygon in polygons: + plt.gca().add_patch(plt.Polygon(polygon, edgecolor='red', facecolor='none', linewidth=1)) + + for lb in lat_bins: + plt.axvline(x=lb, color='gray', linestyle='--', linewidth=0.5) + for hb in height_bins: + plt.axhline(y=hb, color='gray', linestyle='--', linewidth=0.5) + + 
plt.colorbar(label='Height') + plt.xlabel('Latitude') + plt.ylabel('Height') + plt.ylim(y_min, y_max) + plt.legend() + + plt.tight_layout() + plt.show() + +# +def produce_figures(binned_data, bath_height, sea_height, solo_sea_surface_label, + y_limit_top, y_limit_bottom, percentile, file, geo_df, + ref_y, ref_z, beam, epsg_num): + """Create figures""" + + # Create bins for latitude + bath_x_axis_bins = np.linspace(binned_data.lat.min(), + binned_data.lat.max(), len(bath_height))+20 + + sea_surface_x_axis_bins = np.linspace(binned_data.lat.min(), + binned_data.lat.max(), len(sea_height))+10 + + # Create new dataframes for median values + bath_median_df = pd.DataFrame({'x': bath_x_axis_bins, 'y': bath_height}) + + # Create uniform sea surface based on median sea surface values and filter out surface breaching + sea_height1 = [np.nanmedian(sea_height) if i == i else np.nan for i in sea_height] + sea_median_df = pd.DataFrame({'x': sea_surface_x_axis_bins, 'y': sea_height1}) + + # Create uniform solo sea surface label + sea_surface_label = solo_sea_surface_label + sea_surface_label_df = pd.DataFrame({'x': sea_surface_x_axis_bins, 'y': sea_surface_label}) + idx_1 = np.where(sea_surface_label_df.y == 1) + idx_0 = np.where(sea_surface_label_df.y == 0) + + # Define figure size + fig = plt.rcParams["figure.figsize"] = (40, 25) + + # Plot raw points + # plt.scatter(x=binned_data.lat, + # y = binned_data.photon_height, marker='o', lw=0, s=1, alpha = 0.8, + # c = 'yellow', label = 'Raw photon height') + plt.scatter(ref_y, ref_z, s=0.5, alpha=0.1, c='black') + plt.scatter(geo_df.lat, geo_df.photon_height, s=0.8, marker = 'o', + alpha=0.1, c='red', label='Classified Photons') + + # plt.scatter(x=geo_df.lat, + # y = geo_df.photon_height, marker='o', lw=0, s=0.8, + # alpha = 0.8, c = 'black', label = 'Corrected photon bin') + + # Plot median values + plt.scatter(bath_median_df.x, bath_median_df.y, + marker='o', c='r', alpha=0.8, s=2, label='Median bathymetry') + + 
plt.scatter(sea_median_df.x, sea_median_df.y, + marker='o', c='b', alpha=1, s=2, label='Median sea surface') + + plt.scatter(sea_surface_label_df.iloc[idx_1].x, sea_surface_label_df.iloc[idx_1].y, + marker='o', c='pink', alpha=1, s=3, label='solo_sea_surface') + plt.scatter(sea_surface_label_df.iloc[idx_0].x, sea_surface_label_df.iloc[idx_0].y, + marker='o', c='g', alpha=1, s=3, label='non_solo_sea_surface') + + # Insert titles and subtitles + plt.title('Icesat2 Bathymetry\n' + file) + plt.xlabel('Latitude', fontsize=25) + plt.ylabel('Photon Height (m)', fontsize=25) + plt.xticks(fontsize=16) + plt.yticks(fontsize=16) + + plt.legend(loc="upper left", prop={'size': 20}) + + # Limit the x and y axes using parameters + plt.xlim(left=binned_data.lat.min(), right=binned_data.lat.max()) + plt.ylim(top=y_limit_top, bottom=y_limit_bottom) + + timestr = time.strftime("%Y%m%d_%H%M%S") + file = file.replace('.h5', '') + # Define where to save file + plt.tight_layout() + plt.savefig("C:/Workstation/ICESat2_HLS/" + file + '_gt' + + str(beam) + '_' + str(percentile) + + '_EPSG' + str(epsg_num) + '_' + timestr + ".pdf") + # plt.show() + # plt.close() + + # convert corrected locations back to wgs84 (useful to contain) + transformer = Transformer.from_crs("EPSG:" + str(epsg_num), + "EPSG:4326", always_xy=True) + print(transformer) + lon_wgs84, lat_wgs84 = transformer.transform( + geo_df.lon.values, geo_df.lat.values) + + geo_df['lon_wgs84'] = lon_wgs84 + geo_df['lat_wgs84'] = lat_wgs84 + + geodf = gpd.GeoDataFrame(geo_df, + geometry=gpd.points_from_xy(geo_df.lon_wgs84, + geo_df.lat_wgs84)) + + geodf.set_crs(epsg=4326, inplace=True) + + # geodf.to_file("C:/Workstation/ICESat2_HLS/" +file + '_gt' + '_' + str(percentile) + '_EPSG' + + # str(epsg_num) + '_' + timestr + ".gpkg", + # driver="GPKG") \ No newline at end of file From e48c0f164d37e50636d189bad53d64d04325f65a Mon Sep 17 00:00:00 2001 From: Jessica Scheick Date: Thu, 26 Mar 2026 12:14:39 -0400 Subject: [PATCH 03/11] add 
main.py --- aok/core/kd_utils/main.py | 221 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 221 insertions(+) create mode 100644 aok/core/kd_utils/main.py diff --git a/aok/core/kd_utils/main.py b/aok/core/kd_utils/main.py new file mode 100644 index 0000000..e38729f --- /dev/null +++ b/aok/core/kd_utils/main.py @@ -0,0 +1,221 @@ +import glob +import logging +import os +import re +import sys + +import pandas as pd + +from config import get_args +from kd_utils.Kd_analysis import process_kd_calculation +from kd_utils.bathy_processing import process_subsurface_photon_filtering +from kd_utils.data_processing import ( + Extract_sea_photons, + apply_optional_ir_ap_filter, + apply_optional_solar_background_filter, + filter_photon_dataset_by_hull_area, + load_data, +) +from kd_utils.sea_photons_analysis import process_sea_photon_binning +from kd_utils.visualization import plot_convex_hulls, plot_kd_photons + + +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + + +PAIR_ID_MAP = { + 'gt1l': 'gt1_pair', + 'gt1r': 'gt1_pair', + 'gt2l': 'gt2_pair', + 'gt2r': 'gt2_pair', + 'gt3l': 'gt3_pair', + 'gt3r': 'gt3_pair', +} + + +def get_target_beams(all_beams, beam_attrs, target_beams_arg): + strong_beams = [ + gtx for gtx in all_beams + if beam_attrs[gtx]['atlas_beam_type'].decode('utf-8') == 'strong' + ] + if not target_beams_arg: + return strong_beams + + requested = [b.strip() for b in target_beams_arg.split(',') if b.strip()] + return [b for b in requested if b in strong_beams] + + +def optionally_combine_paired_beams_for_kd(dataset, horizontal_res, enabled=False): + """ + Optionally combine left/right beams into pair groups before Kd fitting. + Uses shared along-track bins from relative_AT_dist when available. 
+ """ + if (not enabled) or dataset.empty: + return dataset + + combined = dataset.copy() + combined['source_beam_id'] = combined['beam_id'] + combined['beam_id'] = combined['beam_id'].map(PAIR_ID_MAP).fillna(combined['beam_id']) + + if 'relative_AT_dist' in combined.columns: + rel_m = pd.to_numeric(combined['relative_AT_dist'], errors='coerce') * 1000.0 + pair_bins = (rel_m / max(horizontal_res, 1)).astype('Int64') + pair_bins = pair_bins.fillna(-1).astype(int) + combined['lat_bins'] = pair_bins + + return combined + + +def run_pipeline(args): + atl03_h5_file_path = os.path.join(args.workspace_path, args.atl03_path, args.atl03_file) + shoreline_data_path = os.path.join(args.workspace_path, args.other_data_path, args.shoreline_data) + gebco_full_path = os.path.join(args.workspace_path, args.other_data_path, args.gebco_path) + atl24_file_path = args.atl24_file + if atl24_file_path and (not os.path.isabs(atl24_file_path)): + atl24_file_path = os.path.join(args.workspace_path, atl24_file_path) + output_path = os.path.join(args.workspace_path, args.output_path) + os.makedirs(output_path, exist_ok=True) + + match = re.search(r"_(\d{14})_", atl03_h5_file_path) + timestamp = match.group(1) if match else "unknown" + + gebco_pattern = os.path.join(gebco_full_path, "gebco_*.tif") + gebco_file_path_lists = [p for p in glob.glob(gebco_pattern)] + + is2_mds, is2_attrs, is2_beams = load_data(atl03_h5_file_path, False) + target_strong_beams = get_target_beams(is2_beams, is2_attrs, args.target_beams) + if not target_strong_beams: + logger.error("No target strong beams found. 
Check input file and --target_beams.") + sys.exit(1) + logger.info("Strong beams selected: %s", target_strong_beams) + plot_target_beam = [target_strong_beams[0]] + + sea_photon_dataset = Extract_sea_photons(is2_mds, target_strong_beams, shoreline_data_path) + sea_photon_dataset = apply_optional_ir_ap_filter( + sea_photon_dataset, + enabled=args.enable_ir_ap_filter, + quality_max=args.ir_ap_quality_max, + min_signal_conf=args.ir_ap_min_signal_conf, + ) + sea_photon_dataset = apply_optional_solar_background_filter( + sea_photon_dataset, + enabled=args.enable_solar_background_filter, + day_threshold=args.solar_elevation_day_threshold, + background_quantile=args.solar_background_quantile, + min_signal_conf=args.solar_background_min_signal_conf, + ) + + binned_dataset_sea_surface = process_sea_photon_binning( + sea_photon_dataset, horizontal_res=args.horizontal_res, vertical_res=args.vertical_res + ) + + post_refraction_refit_enabled = args.enable_post_refraction_refit + if post_refraction_refit_enabled and (not args.enable_refraction_correction): + logger.warning( + "--enable_post_refraction_refit requested without --enable_refraction_correction. " + "The post-refraction refit step will be skipped." 
+ ) + post_refraction_refit_enabled = False + + sea_surface_height, sea_surface_label, filtered_seafloor_subsurface_photon_dataset = \ + process_subsurface_photon_filtering( + binned_dataset_sea_surface, + gebco_file_path_lists, + args.subsurface_thresh, + args.ignore_subsurface_height_thres, + use_atl24_filter=args.enable_atl24_filter, + atl24_file_path=atl24_file_path, + atl24_max_match_distance_deg=args.atl24_max_match_distance_deg, + use_gebco_filter=args.enable_gebco_filter, + apply_histogram_quality_filter=args.enable_histogram_quality_filter, + histogram_quality_min_ratio=args.histogram_quality_min_ratio, + histogram_quality_depth_min=args.histogram_quality_depth_min, + histogram_quality_depth_max=args.histogram_quality_depth_max, + apply_surface_sigma_filter=args.enable_surface_sigma_filter, + surface_sigma_max=args.surface_sigma_max, + apply_refraction_correction=args.enable_refraction_correction, + refraction_water_temp_c=args.refraction_water_temp_c, + refraction_wavelength_nm=args.refraction_wavelength_nm, + apply_post_refraction_refit=post_refraction_refit_enabled, + apply_flattening=args.enable_sea_surface_flattening, + flattening_window_m=args.sea_surface_flattening_window_m, + horizontal_res=args.horizontal_res, + vertical_res=args.vertical_res, + ) + + final_filtered_subsurface_photon_dataset = filtered_seafloor_subsurface_photon_dataset[ + filtered_seafloor_subsurface_photon_dataset['beam_id'].isin(target_strong_beams) + ].copy() + + if args.enable_convex_hull_filter: + final_filtered_subsurface_photon_dataset, convex_hull_areas, convex_hulls = \ + filter_photon_dataset_by_hull_area( + final_filtered_subsurface_photon_dataset, + hull_area_threshold=args.convex_hull_area_threshold + ) + if plot_target_beam: + plot_convex_hulls( + final_filtered_subsurface_photon_dataset, + plot_target_beam, + convex_hulls, + convex_hull_areas + ) + + subsurface_output_path = os.path.join( + output_path, + 
f"{timestamp}_strongBeam_{'_'.join(target_strong_beams)}_subsurface_photons.csv", + ) + final_filtered_subsurface_photon_dataset.to_csv(subsurface_output_path, index=False) + + kd_input_dataset = optionally_combine_paired_beams_for_kd( + final_filtered_subsurface_photon_dataset, + horizontal_res=args.horizontal_res, + enabled=args.enable_paired_beam_combine + ) + subsurface_photon_df_added_kd = process_kd_calculation(kd_input_dataset) + kd_output_path = os.path.join(output_path, f"{timestamp}_AddedKdDataset_strongBeams_Further.csv") + subsurface_photon_df_added_kd.to_csv(kd_output_path, index=False) + + if 'relative_AT_dist' in kd_input_dataset.columns: + unique_photon_dataset = kd_input_dataset[ + ['relative_AT_dist', 'lat_bins', 'photon_height'] + ].drop_duplicates() + unique_photon_dataset['relative_AT_dist_center'] = unique_photon_dataset.groupby( + 'lat_bins', observed=False + )['relative_AT_dist'].transform('mean') + + def find_closest(group): + center = group['relative_AT_dist_center'].iloc[0] + group = group.copy() + group['dist_to_center'] = abs(group['relative_AT_dist'] - center) + return group.loc[[group['dist_to_center'].idxmin()]] + + closest_to_center = unique_photon_dataset.groupby('lat_bins', observed=False).apply( + find_closest, include_groups=True + ) + closest_to_center.index = closest_to_center.index.droplevel(0) + kd_df_merged_distance = closest_to_center.merge( + subsurface_photon_df_added_kd, + on='lat_bins', + how='left' + ).drop(columns=['relative_AT_dist_center', 'dist_to_center'], errors='ignore') + + plot_kd_photons( + output_path, + timestamp, + plot_target_beam, + kd_input_dataset, + kd_df_merged_distance, + ) + + logger.info("SUCCESS! 
Kd output: %s", kd_output_path) + + +if __name__ == '__main__': + cli_args = get_args() + try: + run_pipeline(cli_args) + except Exception as e: + logger.error("An error occurred: %s", e) + raise From 1c2efe7d300375716411aa6e661ff6a8d41d789c Mon Sep 17 00:00:00 2001 From: Chao Date: Mon, 30 Mar 2026 14:06:31 -0400 Subject: [PATCH 04/11] description of changes --- icesat-2_kdph_py/config.py | 298 +++++ icesat-2_kdph_py/kd_utils/Kd_analysis.py | 879 ++++++++++++ .../kd_utils/SeafloorFilterByGEBCO_Module.py | 108 ++ icesat-2_kdph_py/kd_utils/__init__.py | 8 + icesat-2_kdph_py/kd_utils/bathy_processing.py | 953 +++++++++++++ icesat-2_kdph_py/kd_utils/data_processing.py | 810 +++++++++++ icesat-2_kdph_py/kd_utils/interpolation.py | 19 + icesat-2_kdph_py/kd_utils/kd_utils.py | 1181 +++++++++++++++++ .../kd_utils/sea_photons_analysis.py | 489 +++++++ icesat-2_kdph_py/kd_utils/visualization.py | 317 +++++ icesat-2_kdph_py/main.py | 288 ++++ 11 files changed, 5350 insertions(+) create mode 100644 icesat-2_kdph_py/config.py create mode 100644 icesat-2_kdph_py/kd_utils/Kd_analysis.py create mode 100644 icesat-2_kdph_py/kd_utils/SeafloorFilterByGEBCO_Module.py create mode 100644 icesat-2_kdph_py/kd_utils/__init__.py create mode 100644 icesat-2_kdph_py/kd_utils/bathy_processing.py create mode 100644 icesat-2_kdph_py/kd_utils/data_processing.py create mode 100644 icesat-2_kdph_py/kd_utils/interpolation.py create mode 100644 icesat-2_kdph_py/kd_utils/kd_utils.py create mode 100644 icesat-2_kdph_py/kd_utils/sea_photons_analysis.py create mode 100644 icesat-2_kdph_py/kd_utils/visualization.py create mode 100644 icesat-2_kdph_py/main.py diff --git a/icesat-2_kdph_py/config.py b/icesat-2_kdph_py/config.py new file mode 100644 index 0000000..863cc84 --- /dev/null +++ b/icesat-2_kdph_py/config.py @@ -0,0 +1,298 @@ +# # config.py + +# path = 'C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/Dataset/ATL03_ICESat2/' + +# #US, Orgon +# # ATL03_h5_file = 
"processed_ATL03_20181206092124_10500106_005_01.h5"# +# #South America +# # ATL03_h5_file = "processed_ATL03_20220530041141_10391513_005_02.h5"# + + +# #Alaska Cook Inlet +# # ATL03_h5_file = "processed_ATL03_20210801125753_05941205_005_01.h5"# +# ATL03_20200805175053_06320803_006_01_subsetted +# ATL03_20210828231447_10131203_006_01_subsetted + +# # Hawaii line without afterpulses +# # ATL03_h5_file = "processed_ATL03_20220122044818_04721407_006_01.h5"# + + +# # ChesapeakeBay +# ATL03_h5_file = "processed_ATL03_20230825074121_10102002_006_02.h5"# + + +# # India +# # ATL03_h5_file = "processed_ATL03_20200331204156_00810707_006_01.h5"# + + + +# #China Bohai +# # ATL03_h5_file = "processed_ATL03_20191128115256_09560502_006_01.h5"# +# # ATL03_h5_file = "processed_ATL03_20190530203316_09560302_006_02.h5"# + +# ATL03_h5_file = "processed_ATL03_20190829161305_09560402_006_02.h5"# + + + +# ATL03_h5_file_path = path + ATL03_h5_file + +# Current_Path = 'C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/Dataset/' + +# shoreline_data_path = Current_Path + 'Shorelines/GeoPkgGlobalShoreline.gpkg' + +# #load the Global bathy dataset +# GEBCO_paths = [ +# Current_Path + "Bathy/gebco_2024_geotiff/gebco_2024_n0.0_s-90.0_w0.0_e90.0.tif", +# Current_Path + "Bathy/gebco_2024_geotiff/gebco_2024_n0.0_s-90.0_w-90.0_e0.0.tif", +# Current_Path + "Bathy/gebco_2024_geotiff/gebco_2024_n0.0_s-90.0_w90.0_e180.0.tif", +# Current_Path + "Bathy/gebco_2024_geotiff/gebco_2024_n0.0_s-90.0_w-180.0_e-90.0.tif", +# Current_Path + "Bathy/gebco_2024_geotiff/gebco_2024_n90.0_s0.0_w0.0_e90.0.tif", +# Current_Path + "Bathy/gebco_2024_geotiff/gebco_2024_n90.0_s0.0_w-90.0_e0.0.tif", +# Current_Path + "Bathy/gebco_2024_geotiff/gebco_2024_n90.0_s0.0_w90.0_e180.0.tif", +# Current_Path + "Bathy/gebco_2024_geotiff/gebco_2024_n90.0_s0.0_w-180.0_e-90.0.tif", +# ] + +# horizontal_res = 500 + +# vertical_res = 0.25 + +# # Use calculated sea height to determine photons at 0.5m below peak +# subsurface_thresh = 0.5 + +# # 
import argparse


def get_args(argv=None):
    """
    Build the command-line interface for the ICESat-2 Kd pipeline and parse it.

    Parameters
    ----------
    argv : list[str] or None, optional
        Argument strings to parse. ``None`` (default) makes argparse read
        ``sys.argv[1:]``, which is the original behaviour; passing an explicit
        list (e.g. ``[]``) allows programmatic use and testing.

    Returns
    -------
    argparse.Namespace
        One attribute per option defined below (option name without the
        leading ``--``).
    """
    parser = argparse.ArgumentParser(description="Configure ICESat-2 HLS analysis script.")

    # ------------------------------------------------------------------
    # General paths
    # ------------------------------------------------------------------
    parser.add_argument("--workspace_path", type=str,
                        default='C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/',
                        help="Root path for all data processing.")

    # Default points at the Wax Delta (v007) granules, required for the
    # IR/AP filter sensitivity test. Earlier defaults kept for reference:
    #   'Dataset/ATL03_ICESat2/New/'
    #   'Dataset/ATL03_ICESat2/'            (Bohai Sea, v006)
    parser.add_argument("--atl03_path", type=str, default='Dataset/ATL03_ICESat2/Wax_Delta/',
                        help="Base path for ICESat-2 ATL03 data.")

    # Default granule: Wax Delta (v007); use with
    # atl03_path = Dataset/ATL03_ICESat2/Wax_Delta/.
    # Granules used as defaults in earlier experiments (kept for reference):
    #   Chesapeake Bay : processed_ATL03_20230825074121_10102002_006_02.h5
    #   (unlabelled)   : processed_ATL03_20221001113813_01641706_006_01.h5
    #   India          : processed_ATL03_20200331204156_00810707_006_01.h5
    #   (unlabelled)   : processed_ATL03_20210628230109_00811207_006_01.h5
    #   Wax Delta v006 : ATL03_20231103172707_06982106_006_01_subsetted.h5
    #   Bohai (v006)   : ATL03_20190829161305_09560402_006_02_subsetted.h5
    #   Cook Inlet     : ATL03_20220507112145_06931503_006_01_subsetted.h5
    #   Cook Inlet     : processed_ATL03_20210801125753_05941205_005_01.h5
    #   Core Sound     : 105430185_ATL03_20220423191113_04841506_006_02_subsetted.h5
    #   Pamlico Sound  : ATL03_20240415083629_04232306_006_01_subsetted.h5
    parser.add_argument("--atl03_file", type=str,
                        default="ATL03_20231103172707_06982106_007_01_subsetted.h5",
                        help="Name of the ATL03 H5 file to process.")

    parser.add_argument("--other_data_path", type=str, default='Dataset/',
                        help="Path to the current working directory.")

    parser.add_argument("--output_path", type=str, default='Results/',
                        help="Directory for saving results.")

    # Shoreline data (previous default: 'Shorelines/GeoPkgGlobalShoreline.gpkg')
    parser.add_argument("--shoreline_data", type=str,
                        default='Shorelines/ne_10m_land/ne_10m_land.shp',
                        help="Path to the global shoreline dataset.")

    # Bathymetry datasets
    # NOTE(review): with nargs='+' a value given on the command line is a
    # list, while this default is a plain string. The default is left
    # unchanged because consumers are not visible here - confirm they handle
    # both types.
    parser.add_argument("--gebco_path", nargs='+', type=str,
                        default='Bathy/gebco_2024_geotiff/',
                        help="Path to GEBCO bathymetry datasets.")

    # ------------------------------------------------------------------
    # Resolution settings
    # ------------------------------------------------------------------
    parser.add_argument("--horizontal_res", type=int, default=500,
                        help="Horizontal resolution for the analysis (in meters).")

    parser.add_argument("--vertical_res", type=float, default=0.25,
                        help="Vertical resolution for the analysis.")

    # ------------------------------------------------------------------
    # Analysis parameters
    # ------------------------------------------------------------------
    parser.add_argument("--subsurface_thresh", type=float, default=1.0,
                        help="Threshold for photons below the sea surface.")

    parser.add_argument("--ignore_subsurface_height_thres", type=float, default=-6,
                        help="Maximum depth thres for Kd calculations (in meters).")

    # Empty string '' auto-selects the first strong beam. Pass comma-separated IDs to override.
    # Valid IDs: gt1l, gt1r, gt2l, gt2r, gt3l, gt3r - only strong beams for the granule are kept.
    # To process all strong beams, use --target_beams "gt1r,gt2r,gt3r".
    parser.add_argument("--target_beams", type=str, default='',
                        help="Comma-separated beam IDs to process. Empty means auto-select first strong beam.")

    # ------------------------------------------------------------------
    # Solar background filter
    # ------------------------------------------------------------------
    parser.add_argument("--disable_solar_background_filter", action="store_true",
                        help="Disable solar background filtering (enabled by default; "
                             "self-gates on solar elevation so has no effect on nighttime passes).")
    parser.add_argument("--solar_elevation_day_threshold", type=float, default=6.0,
                        help="Solar elevation threshold (deg) to define daytime for optional filtering.")
    parser.add_argument("--solar_bg_median_window_deg", type=float, default=10.0,
                        help="Rolling median window width (degrees of solar elevation) for "
                             "the solar-elevation-aware background filter.")
    parser.add_argument("--solar_bg_noise_multiplier", type=float, default=1.5,
                        help="Ratio of actual/expected background rate above which a photon "
                             "is flagged as noisy by the solar background filter.")
    parser.add_argument("--solar_background_min_signal_conf", type=int, default=2,
                        help="Minimum photon signal confidence kept in high daytime background.")

    # ------------------------------------------------------------------
    # IR / afterpulse filter
    # ------------------------------------------------------------------
    parser.add_argument("--enable_ir_ap_filter", action="store_true",
                        help="Optional: remove photons using IR/afterpulse proxy flags.")
    parser.add_argument("--ir_ap_quality_max", type=int, default=0,
                        help="Maximum allowed quality_ph value when IR/AP filter is enabled.")
    parser.add_argument("--ir_ap_min_signal_conf", type=int, default=0,
                        help="Minimum allowed photon_conf value when IR/AP filter is enabled.")

    # ------------------------------------------------------------------
    # Seafloor / sea-surface handling
    # ------------------------------------------------------------------
    parser.add_argument("--enable_gebco_filter", action="store_true",
                        help="Optional: remove photons below GEBCO seafloor and shallow bins.")
    parser.add_argument("--enable_sea_surface_flattening", action="store_true",
                        help="Optional: flatten small-bin sea-surface variation using larger along-track windows.")
    parser.add_argument("--sea_surface_flattening_window_m", type=int, default=5000,
                        help="Window size in meters for sea-surface flattening mean level. "
                             "Must be larger than horizontal_res to have any effect.")

    parser.add_argument("--enable_convex_hull_filter", action="store_true",
                        help="Optional: apply convex hull area filtering before Kd fitting.")
    parser.add_argument("--convex_hull_area_threshold", type=float, default=100,
                        help="Minimum convex hull area to keep a horizontal bin when enabled.")

    # ------------------------------------------------------------------
    # Histogram quality filter
    # ------------------------------------------------------------------
    parser.add_argument("--enable_histogram_quality_filter", action="store_true",
                        help="Optional: discard along-track bins with weak 6-7 m signal.")
    parser.add_argument("--histogram_quality_min_ratio", type=float, default=0.05,
                        help="Minimum ratio of photons in depth band to keep a bin.")
    parser.add_argument("--histogram_quality_depth_min", type=float, default=6.0,
                        help="Minimum depth (m below surface) of quality-check band.")
    parser.add_argument("--histogram_quality_depth_max", type=float, default=7.0,
                        help="Maximum depth (m below surface) of quality-check band.")
    parser.add_argument("--histogram_quality_ref_depth_min", type=float, default=0.0,
                        help="Min depth (m) of near-surface reference band for decay ratio.")
    parser.add_argument("--histogram_quality_ref_depth_max", type=float, default=1.0,
                        help="Max depth (m) of near-surface reference band for decay ratio.")

    # ------------------------------------------------------------------
    # Surface sigma filter
    # ------------------------------------------------------------------
    parser.add_argument("--enable_surface_sigma_filter", action="store_true",
                        help="Optional: discard bins where Gaussian surface sigma is too large.")
    parser.add_argument("--surface_sigma_max", type=float, default=0.5,
                        help="Maximum allowed Gaussian sigma (m) for sea-surface peak.")

    # ------------------------------------------------------------------
    # Refraction correction
    # ------------------------------------------------------------------
    parser.add_argument("--enable_refraction_correction", action="store_true",
                        help="Optional: apply refraction correction to subsurface photons.")
    parser.add_argument("--refraction_water_temp_c", type=float, default=20.0,
                        help="Water temperature (deg C) used for refraction correction.")
    parser.add_argument("--refraction_wavelength_nm", type=float, default=532.0,
                        help="Laser wavelength (nm) used for refraction correction.")

    parser.add_argument("--enable_post_refraction_refit", action="store_true",
                        help="Optional: rebuild histograms and re-fit surface after refraction correction.")

    # ------------------------------------------------------------------
    # ATL24 bathymetry matching
    # ------------------------------------------------------------------
    parser.add_argument("--enable_atl24_filter", action="store_true",
                        help="Optional: use ATL24 bathymetry matching before GEBCO fallback.")
    parser.add_argument("--atl24_file", type=str, default='',
                        help="Path to ATL24 point dataset (CSV/Parquet/GPKG/SHP) for bathymetry matching.")
    parser.add_argument("--atl24_max_match_distance_deg", type=float, default=0.01,
                        help="Maximum lon/lat degree distance for ATL24 nearest-point matching.")

    # ------------------------------------------------------------------
    # Kd fitting
    # ------------------------------------------------------------------
    # '%%' is required because argparse applies %-formatting to help strings.
    parser.add_argument("--decay_zone_threshold", type=float, default=0.0,
                        help="Fraction of peak photon count below which depth bins are excluded "
                             "in Step 16 decay zone detection. 0.0 = original behaviour (only "
                             "zero-count bins removed). E.g. 0.01 removes bins with < 1%% of peak.")

    parser.add_argument("--kd_fit_method", type=str, default="log_linear",
                        choices=["log_linear", "bg_subtract", "breakpoint", "nonlinear", "hybrid"],
                        help="Beer's Law fitting strategy for Kd calculation. "
                             "log_linear = original log-space linear regression; "
                             "bg_subtract = subtract noise floor then log-linear; "
                             "breakpoint = segmented regression with breakpoint detection; "
                             "nonlinear = fit C(z) = A*exp(-Kd*z) + N directly; "
                             "hybrid = stabilised breakpoint zone detection then log-linear "
                             "fit on the decay segment only.")

    parser.add_argument("--enable_wave_adaptive_fit", action="store_true",
                        help="Optional: trim shallow depth bins by wave amplitude before Beer's Law fit. "
                             "Uses per-bin surface sigma to skip the top k*sigma metres in wavy bins.")
    parser.add_argument("--wave_exclusion_multiplier", type=float, default=4.0,
                        help="Number of surface sigmas to skip from the actual water surface when "
                             "wave-adaptive fit is enabled. The adaptive surface detection already "
                             "removes 3*sigma; this adds (k-3)*sigma additional margin for bubble "
                             "injection zone. Default 4.0 adds 1*sigma beyond the 3-sigma cutoff.")
    parser.add_argument("--wave_sigma_calm_threshold", type=float, default=0.1,
                        help="Surface sigma (m) below which a bin is considered calm and no "
                             "wave trimming is applied. Default 0.1 m.")

    parser.add_argument("--enable_paired_beam_combine", action="store_true",
                        help="Optional: combine paired beams (gt1, gt2, gt3) before Kd fitting.")

    parser.add_argument("--no_plot", action="store_true",
                        help="Suppress all plot generation (useful for batch runs).")

    # Open design notes carried over from the original author (not implemented):
    #   - add a switch for coastal vs inland water; each should use its own mask
    #   - make "consider bathymetry or not" a manual setting
    #   - night vs daytime handling
    #   - let solar elevation determine whether solar background is removed
    #   - "1-2 hours" (meaning not stated in the original note)

    return parser.parse_args(argv)


if __name__ == "__main__":
    args = get_args()
    # Access parameters like this. The granule lives under atl03_path, which
    # is itself relative to workspace_path, so all three parts are needed.
    atl03_file_path = args.workspace_path + args.atl03_path + args.atl03_file

    print("Processing file:", atl03_file_path)
    print("Output path:", args.output_path)
+ +import numpy as np +import pandas as pd +import logging +from scipy.optimize import curve_fit +from sklearn.linear_model import LinearRegression + +# def log_model(z, kd, e0): +# return np.log(e0) - kd * z + +## This is wrong because the input is already a bined data, +## it is not necessary to do a histogram again +# def CalculateKdFromFilteredSubsurfacePhoton(df, vertical_res=0.8): +# if df.empty or 'lat_bins' not in df.columns: +# return pd.DataFrame({'lat_bins': [np.nan], 'kd': [np.nan], 'e0': [np.nan], 'latitude': [np.nan], 'longitude': [np.nan]}) + +# # Get the latitude bin value +# lat_bin_value = df['lat_bins'].iloc[0] if not df['lat_bins'].empty else np.nan +# latitude = df['latitude'].mean() if 'latitude' in df.columns else df['lat'].mean() if 'lat' in df.columns else np.nan +# longitude = df['longitude'].mean() if 'longitude' in df.columns else df['lon'].mean() if 'lon' in df.columns else np.nan + +# # Calculate photon height range +# photon_height_min = df['photon_height'].min() +# photon_height_max = df['photon_height'].max() + +# # Check for sufficient data range +# if np.isnan(photon_height_min) or np.isnan(photon_height_max) or photon_height_min == photon_height_max: +# return pd.DataFrame({'lat_bins': [lat_bin_value], 'kd': [np.nan], 'e0': [np.nan]}) + +# height_bins_range = abs(photon_height_max - photon_height_min) +# height_bins_number = round(height_bins_range / vertical_res) + +# # Ensure there are enough bins +# if height_bins_number < 5: +# return pd.DataFrame({'lat_bins': [lat_bin_value], 'kd': [np.nan], 'e0': [np.nan]}) + +# # Create histogram of photon heights +# bin_edges = np.linspace(photon_height_min, photon_height_max, num=height_bins_number) +# counts, _ = np.histogram(df['photon_height'], bins=bin_edges) +# bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2 + +# # Store histogram data +# hist_df = pd.DataFrame({'zdepth': bin_centers, 'photon_counts': counts}) + +# # x value for model +# # Reverse zdepth to align with the MATLAB 
approach +# hist_df['zdepth'] = hist_df['zdepth'].max() - hist_df['zdepth'] + +# # Log-transform photon counts, replacing zeros with NaN +# # y value for model +# hist_df['log_photon_counts'] = np.log(hist_df['photon_counts'].replace(0, np.nan)) +# hist_df.loc[np.isinf(hist_df['log_photon_counts']), 'log_photon_counts'] = np.nan + +# # Filter out rows with NaNs in either column for regression +# valid_data = hist_df.dropna(subset=['zdepth', 'log_photon_counts']) + +# # Check for enough valid data points +# # Skip the regression if there are fewer than 5 datapoints +# if valid_data['log_photon_counts'].notna().sum() > 3: +# # Drop NaNs for regression +# zdepth_valid = valid_data['zdepth'].values.reshape(-1, 1) +# log_counts_valid = valid_data['log_photon_counts'].values + +# # Perform linear regression +# model = LinearRegression() +# model.fit(zdepth_valid, log_counts_valid) + +# # Extract kd as the negative of the slope and e0 from the intercept +# kd = -model.coef_[0] + +# print('zdepth_valid:',zdepth_valid) +# print('photon_counts:',hist_df['photon_counts']) +# print('kd:',kd) + +# e0 = np.exp(model.intercept_) + +# # Set kd to NaN if negative +# if kd < 0: +# kd = np.nan +# else: +# kd, e0 = np.nan, np.nan + +# return pd.DataFrame({ +# 'lat_bins': [lat_bin_value], +# 'kd': [kd], +# 'e0': [e0], +# 'latitude': [latitude], +# 'longitude': [longitude] +# }) + + +# # one solution is to adjust the vertical_res to 0.25 +# def CalculateKdFromFilteredSubsurfacePhoton(df, vertical_res=0.25): +# if df.empty or 'lat_bins' not in df.columns: +# return pd.DataFrame({'lat_bins': [np.nan], 'kd': [np.nan], 'e0': [np.nan], 'latitude': [np.nan], 'longitude': [np.nan]}) + +# # Get the latitude bin value +# lat_bin_value = df['lat_bins'].iloc[0] if not df['lat_bins'].empty else np.nan +# latitude = df['latitude'].mean() if 'latitude' in df.columns else df['lat'].mean() if 'lat' in df.columns else np.nan +# longitude = df['longitude'].mean() if 'longitude' in df.columns else 
df['lon'].mean() if 'lon' in df.columns else np.nan + +# # Calculate photon height range +# photon_height_min = df['photon_height'].min() +# photon_height_max = df['photon_height'].max() + +# # Check for sufficient data range +# if np.isnan(photon_height_min) or np.isnan(photon_height_max) or photon_height_min == photon_height_max: +# return pd.DataFrame({'lat_bins': [lat_bin_value], 'kd': [np.nan], 'e0': [np.nan]}) + +# height_bins_range = abs(photon_height_max - photon_height_min) +# height_bins_number = round(height_bins_range / vertical_res) + +# # Ensure there are enough bins +# if height_bins_number < 5: +# return pd.DataFrame({'lat_bins': [lat_bin_value], 'kd': [np.nan], 'e0': [np.nan]}) + +# # Create histogram of photon heights +# bin_edges = np.linspace(photon_height_min, photon_height_max, num=height_bins_number) +# counts, _ = np.histogram(df['photon_height'], bins=bin_edges) +# bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2 + +# # Store histogram data +# hist_df = pd.DataFrame({'zdepth': bin_centers, 'photon_counts': counts}) + +# # x value for model +# # Reverse zdepth to align with the MATLAB approach +# hist_df['zdepth'] = hist_df['zdepth'].max() - hist_df['zdepth'] + +# # Log-transform photon counts, replacing zeros with NaN +# # y value for model +# hist_df['log_photon_counts'] = np.log(hist_df['photon_counts'].replace(0, np.nan)) +# hist_df.loc[np.isinf(hist_df['log_photon_counts']), 'log_photon_counts'] = np.nan + +# # Filter out rows with NaNs in either column for regression +# valid_data = hist_df.dropna(subset=['zdepth', 'log_photon_counts']) + +# # Check for enough valid data points +# # Skip the regression if there are fewer than 5 datapoints +# if valid_data['log_photon_counts'].notna().sum() > 3: +# # Drop NaNs for regression +# zdepth_valid = valid_data['zdepth'].values.reshape(-1, 1) +# log_counts_valid = valid_data['log_photon_counts'].values + +# # Perform linear regression +# model = LinearRegression() +# model.fit(zdepth_valid, 
def find_exponential_decay_zone(hist_df, decay_threshold=0.0):
    """
    Select the depth bins usable for a log-space Beer's Law fit.

    Flowchart step: 16 - Determine depth where signal decays to 1% of incoming
    signal or where photon decay is no longer exponential. Bins whose photon
    count is zero (log undefined) are dropped, as are bins below the optional
    ``decay_threshold`` fraction of the peak count. Note that the surviving
    bins are not required to be contiguous in depth.

    Parameters
    ----------
    hist_df : pd.DataFrame
        Columns: 'zdepth' (depth from surface, ascending), 'photon_counts'.
        The input frame is not modified.
    decay_threshold : float, optional
        Fraction of peak photon count below which bins are excluded.
        0.0 (default) preserves original behaviour (only zero-count bins removed).
        E.g. 0.01 removes bins with < 1% of the peak signal.

    Returns
    -------
    pd.DataFrame
        Copy of the kept rows with a 'log_photon_counts' column added
        (natural log of 'photon_counts'); rows with NaN 'zdepth' or an
        undefined/infinite log are removed.
    """
    df = hist_df.copy()

    # Apply decay threshold: zero out bins below the threshold fraction of the
    # peak so they are discarded together with the genuinely empty bins below.
    if decay_threshold > 0.0:
        peak_count = df['photon_counts'].max()
        if peak_count > 0:
            df.loc[df['photon_counts'] < decay_threshold * peak_count, 'photon_counts'] = 0

    # Zero counts become NaN before the log so they drop out cleanly.
    df['log_photon_counts'] = np.log(df['photon_counts'].replace(0, np.nan))
    df.loc[np.isinf(df['log_photon_counts']), 'log_photon_counts'] = np.nan
    return df.dropna(subset=['zdepth', 'log_photon_counts'])


def fit_beers_law(valid_data):
    """
    Fit Beer's Law exponential decay curve to the identified decay zone.

    Flowchart step: 17 - Fit exponential decay curve (Beer's Law) to data
    within the zone of exponential decay. Implemented as linear regression on
    log(photon_counts) vs depth; kd = -slope, e0 = exp(intercept).

    Parameters
    ----------
    valid_data : pd.DataFrame
        Output of find_exponential_decay_zone; columns 'zdepth' and
        'log_photon_counts'. Rows with NaN/inf in either column are ignored.

    Returns
    -------
    tuple[float, float]
        (kd, e0). Both are np.nan when fewer than 4 usable points remain.
        kd alone is set to np.nan when the fitted slope implies a negative kd
        (e0 is still returned in that case).
    """
    # Fit only on rows that are finite in BOTH columns. The size check used to
    # look at 'log_photon_counts' alone while the regression received every
    # row, so a stray NaN/inf made LinearRegression raise.
    fit_rows = (
        valid_data[['zdepth', 'log_photon_counts']]
        .replace([np.inf, -np.inf], np.nan)
        .dropna()
    )
    if len(fit_rows) <= 3:
        return np.nan, np.nan

    zdepth_valid = fit_rows['zdepth'].values.reshape(-1, 1)
    log_counts_valid = fit_rows['log_photon_counts'].values

    model = LinearRegression()
    model.fit(zdepth_valid, log_counts_valid)

    kd = -model.coef_[0]
    e0 = np.exp(model.intercept_)

    # A negative attenuation coefficient is not physical.
    if kd < 0:
        kd = np.nan
    return kd, e0


# ---------------------------------------------------------------------------
# Strategy A: Background subtraction + log-linear fit
# ---------------------------------------------------------------------------
def fit_beers_law_bg_subtract(hist_df, bg_fraction=0.2):
    """
    Estimate the noise floor from the deepest bins, subtract it, then fit
    Beer's Law in log-space on the noise-subtracted counts.

    Parameters
    ----------
    hist_df : pd.DataFrame
        Columns: 'zdepth' (ascending from surface), 'photon_counts'.
    bg_fraction : float
        Fraction of the deepest bins used to estimate the background level
        (at least 2 bins are always used).

    Returns
    -------
    tuple[float, float, float]
        (kd, e0, noise_floor). All three are np.nan when fewer than 5 bins
        are supplied; kd and e0 are np.nan when fewer than 4 bins remain
        above the noise floor; kd is np.nan when it would be negative.
    """
    df = hist_df.copy().sort_values('zdepth')
    n_bins = len(df)
    if n_bins < 5:
        return np.nan, np.nan, np.nan

    # Estimate noise floor as the median of the deepest bg_fraction of bins.
    n_bg = max(int(n_bins * bg_fraction), 2)
    noise_floor = float(df.tail(n_bg)['photon_counts'].median())

    # Subtract noise and keep only positive residuals (log must be defined).
    df['signal'] = df['photon_counts'] - noise_floor
    df = df[df['signal'] > 0].copy()
    if len(df) < 4:
        return np.nan, np.nan, noise_floor

    df['log_signal'] = np.log(df['signal'])

    zdepth = df['zdepth'].values.reshape(-1, 1)
    log_signal = df['log_signal'].values

    model = LinearRegression()
    model.fit(zdepth, log_signal)

    kd = -model.coef_[0]
    e0 = np.exp(model.intercept_)

    if kd < 0:
        kd = np.nan
    return kd, e0, noise_floor


# ---------------------------------------------------------------------------
# Strategy B: Breakpoint / segmented regression
# ---------------------------------------------------------------------------
def fit_beers_law_breakpoint(hist_df):
    """
    Find the optimal breakpoint between exponential decay and noise floor,
    then fit Beer's Law only to the decay segment.

    The breakpoint is chosen by testing every candidate position and selecting
    the one that minimises total residual sum of squares of a two-segment
    model in log-space: a fitted line over the shallow (decay) bins and a
    flat constant over the deeper (noise) bins.

    Parameters
    ----------
    hist_df : pd.DataFrame
        Columns: 'zdepth' (ascending from surface), 'photon_counts'.
        Zero-count bins are ignored.

    Returns
    -------
    tuple[float, float, float, float]
        (kd, e0, breakpoint_depth, noise_floor). breakpoint_depth is the
        depth of the first bin assigned to the noise segment and noise_floor
        is exp(mean log-count) of that segment. All four are np.nan when
        there are too few non-zero bins to place a breakpoint; kd is np.nan
        when it would be negative.
    """
    df = hist_df.copy().sort_values('zdepth')
    df = df[df['photon_counts'] > 0].copy()
    n = len(df)
    # The search below needs >= 4 decay bins and >= 2 noise bins, so fewer
    # than 6 non-zero bins can never yield a candidate breakpoint.
    if n < 6:
        return np.nan, np.nan, np.nan, np.nan

    df['log_counts'] = np.log(df['photon_counts'])
    depths = df['zdepth'].values
    log_counts = df['log_counts'].values

    best_rss = np.inf
    best_bp_idx = None

    # Try each candidate breakpoint: bins [0, bp_idx) form the decay segment
    # (>= 4 points) and bins [bp_idx, n) the noise segment (>= 2 points).
    for bp_idx in range(4, n - 1):
        # Decay segment: linear fit on bins 0..bp_idx-1
        z_decay = depths[:bp_idx].reshape(-1, 1)
        lc_decay = log_counts[:bp_idx]
        model = LinearRegression()
        model.fit(z_decay, lc_decay)
        rss_decay = float(np.sum((model.predict(z_decay) - lc_decay) ** 2))

        # Noise segment: flat at the mean of bins bp_idx..end
        lc_noise = log_counts[bp_idx:]
        noise_mean = lc_noise.mean()
        rss_noise = float(np.sum((lc_noise - noise_mean) ** 2))

        total_rss = rss_decay + rss_noise
        if total_rss < best_rss:
            best_rss = total_rss
            best_bp_idx = bp_idx

    if best_bp_idx is None:
        return np.nan, np.nan, np.nan, np.nan

    # Final fit on the decay segment
    z_decay = depths[:best_bp_idx].reshape(-1, 1)
    lc_decay = log_counts[:best_bp_idx]
    model = LinearRegression()
    model.fit(z_decay, lc_decay)

    kd = -model.coef_[0]
    e0 = np.exp(model.intercept_)
    breakpoint_depth = float(depths[best_bp_idx])
    noise_floor = float(np.exp(log_counts[best_bp_idx:].mean()))

    if kd < 0:
        kd = np.nan
    return kd, e0, breakpoint_depth, noise_floor


# ---------------------------------------------------------------------------
# Strategy C: Nonlinear fit C(z) = A * exp(-Kd * z) + N
# ---------------------------------------------------------------------------
def _beer_plus_noise(z, A, kd, N):
    """Model: signal = A * exp(-kd * z) + N."""
    return A * np.exp(-kd * z) + N


def fit_beers_law_nonlinear(hist_df):
    """
    Fit the model C(z) = A*exp(-Kd*z) + N directly to raw photon counts
    using nonlinear least-squares (no log transform).

    Parameters
    ----------
    hist_df : pd.DataFrame
        Columns: 'zdepth' (ascending from surface), 'photon_counts'.

    Returns
    -------
    tuple[float, float, float]
        (kd, e0, noise_floor), i.e. the fitted (Kd, A, N). All three are
        np.nan when fewer than 5 bins are supplied or the optimiser fails;
        kd is np.nan when the fitted value is not strictly positive.
    """
    df = hist_df.copy().sort_values('zdepth')
    depths = df['zdepth'].values
    counts = df['photon_counts'].values.astype(float)

    if len(depths) < 5:
        return np.nan, np.nan, np.nan

    # Initial guesses
    A0 = float(counts.max())
    N0 = float(np.median(counts[len(counts) * 3 // 4:]))  # deepest 25%
    # Quick log-linear Kd estimate for initial guess (ignore noise): slope
    # between the shallowest and deepest non-zero bins, floored at 0.01.
    pos = counts > 0
    if pos.sum() >= 2:
        log_c = np.log(counts[pos])
        z_pos = depths[pos]
        kd0 = max(float(-(log_c[-1] - log_c[0]) / (z_pos[-1] - z_pos[0] + 1e-9)), 0.01)
    else:
        kd0 = 0.1

    try:
        # All three parameters are constrained to be non-negative.
        popt, _ = curve_fit(
            _beer_plus_noise, depths, counts,
            p0=[A0, kd0, N0],
            bounds=([0, 0, 0], [np.inf, np.inf, np.inf]),
            maxfev=5000
        )
        A_fit, kd_fit, N_fit = popt
        if kd_fit <= 0:
            kd_fit = np.nan
        return float(kd_fit), float(A_fit), float(N_fit)
    except (RuntimeError, ValueError):
        # curve_fit raises RuntimeError on non-convergence and ValueError on
        # invalid input (e.g. NaN counts); report "no fit" instead.
        return np.nan, np.nan, np.nan
+ * BIC model selection penalises over-fitting, preventing the breakpoint + from drifting too deep (where the decay segment gets long and noisy). + * When expected_noise_floor is provided (from beam-level median), + candidate breakpoints whose noise segment deviates too far from the + expected value are penalised. + * Falls back to full log-linear if no valid breakpoint improves BIC + over a single-line model. + + Parameters + ---------- + hist_df : pd.DataFrame + Columns: 'zdepth' (ascending from surface), 'photon_counts'. + min_breakpoint_depth : float + Minimum depth (m) below surface for a valid breakpoint. Prevents + breakpoints landing in the first few bins where regression is + unstable. Default 2.0 m. + expected_noise_floor : float or None + If provided, the beam-level median noise floor (photon counts). + Candidate breakpoints whose noise segment mean deviates by more + than ``noise_floor_tolerance`` times from the expected value + receive a BIC penalty. + noise_floor_tolerance : float + Factor controlling how far the candidate noise segment mean may + deviate from ``expected_noise_floor`` before a penalty is applied. + Default 3.0 (allow 3x variation). + + Returns + ------- + tuple[float, float, float, float] + (kd, e0, breakpoint_depth, noise_floor). + breakpoint_depth and noise_floor are np.nan if fallback to full fit. 
+ """ + df = hist_df.copy().sort_values('zdepth') + df = df[df['photon_counts'] > 0].copy() + n = len(df) + if n < 6: + return np.nan, np.nan, np.nan, np.nan + + df['log_counts'] = np.log(df['photon_counts']) + depths = df['zdepth'].values + log_counts = df['log_counts'].values + + # ------------------------------------------------------------------ + # Reference: BIC for a single-line model (no breakpoint) + # ------------------------------------------------------------------ + model_full = LinearRegression() + model_full.fit(depths.reshape(-1, 1), log_counts) + rss_full = float(np.sum((model_full.predict(depths.reshape(-1, 1)) - log_counts) ** 2)) + k_full = 2 # slope + intercept + bic_full = n * np.log(rss_full / n + 1e-10) + k_full * np.log(n) + + # ------------------------------------------------------------------ + # Search: best breakpoint using BIC + # ------------------------------------------------------------------ + best_bic = bic_full # must beat the single-line model + best_bp_idx = None + best_model = None + + for bp_idx in range(4, n - 1): + # -- Minimum breakpoint depth constraint -- + if depths[bp_idx] < min_breakpoint_depth: + continue + + # -- Decay segment (surface to breakpoint) -- + z_decay = depths[:bp_idx].reshape(-1, 1) + lc_decay = log_counts[:bp_idx] + model = LinearRegression() + model.fit(z_decay, lc_decay) + + # Physical constraint: slope must be negative + if model.coef_[0] >= 0: + continue + + pred_decay = model.predict(z_decay) + rss_decay = float(np.sum((pred_decay - lc_decay) ** 2)) + + # -- Noise segment (breakpoint to bottom) -- + lc_noise = log_counts[bp_idx:] + noise_mean = float(lc_noise.mean()) + rss_noise = float(np.sum((lc_noise - noise_mean) ** 2)) + + # -- BIC for piecewise model (3 params: slope, intercept, noise level) + rss_total = rss_decay + rss_noise + k_bp = 3 + bic_bp = n * np.log(rss_total / n + 1e-10) + k_bp * np.log(n) + + # -- Noise floor consistency penalty -- + # When we have a beam-level expected noise 
floor, penalise + # candidates whose noise segment deviates substantially. + if expected_noise_floor is not None and expected_noise_floor > 0: + candidate_nf = float(np.exp(noise_mean)) + ratio = candidate_nf / expected_noise_floor + if ratio > noise_floor_tolerance or ratio < 1.0 / noise_floor_tolerance: + # Add a penalty proportional to log-deviation + bic_bp += n * abs(np.log(ratio)) + + if bic_bp < best_bic: + best_bic = bic_bp + best_bp_idx = bp_idx + best_model = model + + # ------------------------------------------------------------------ + # Result + # ------------------------------------------------------------------ + if best_bp_idx is None: + # No breakpoint beats the single-line model — fall back + kd = -model_full.coef_[0] + e0 = np.exp(model_full.intercept_) + if kd < 0: + kd = np.nan + return kd, e0, np.nan, np.nan + + kd = -best_model.coef_[0] + e0 = np.exp(best_model.intercept_) + breakpoint_depth = float(depths[best_bp_idx]) + noise_floor = float(np.exp(log_counts[best_bp_idx:].mean())) + + if kd < 0: + kd = np.nan + return kd, e0, breakpoint_depth, noise_floor + + +# --------------------------------------------------------------------------- +# Dispatcher: select fitting strategy by name +# --------------------------------------------------------------------------- +KD_FIT_METHODS = ('log_linear', 'bg_subtract', 'breakpoint', 'nonlinear', 'hybrid') + + +def _fit_kd_with_method(hist_df, method='log_linear', decay_threshold=0.0, + expected_noise_floor=None): + """ + Run the requested fitting strategy on a single histogram. + + Returns + ------- + tuple[float, float, float] + (kd, e0, noise_floor). noise_floor is np.nan for log_linear. 
+ """ + if method == 'log_linear': + valid = find_exponential_decay_zone(hist_df, decay_threshold=decay_threshold) + kd, e0 = fit_beers_law(valid) + return kd, e0, np.nan + + elif method == 'bg_subtract': + return fit_beers_law_bg_subtract(hist_df) + + elif method == 'breakpoint': + kd, e0, bp, nf = fit_beers_law_breakpoint(hist_df) + return kd, e0, nf + + elif method == 'nonlinear': + return fit_beers_law_nonlinear(hist_df) + + elif method == 'hybrid': + kd, e0, bp, nf = fit_beers_law_hybrid( + hist_df, expected_noise_floor=expected_noise_floor + ) + return kd, e0, nf + + else: + raise ValueError(f"Unknown kd_fit_method: {method!r}. " + f"Choose from {KD_FIT_METHODS}") + + +# another solution is to calculate kd without hist +def CalculateKdFromFilteredSubsurfacePhoton(df, vertical_res=0.8, decay_zone_threshold=0.0, + kd_fit_method='log_linear', expected_noise_floor=None, + surface_sigma=None, + wave_exclusion_multiplier=0.0, + wave_sigma_calm_threshold=0.1): + """ + Calculate Kd for a single along-track bin by fitting Beer's Law in log-space. + + Orchestrates steps 16 and 17 by calling find_exponential_decay_zone and + fit_beers_law in sequence. + + Flowchart steps: + 16 — find_exponential_decay_zone: determine depth range of exponential decay. + 17 — fit_beers_law: fit Beer's Law curve; kd = -slope, e0 = exp(intercept). 
+ """ + # Early exit if DataFrame is empty or missing required column + if df.empty or 'lat_bins' not in df.columns: + return pd.DataFrame({'lat_bins': [np.nan], 'kd': [np.nan], 'e0': [np.nan], + 'noise_floor': [np.nan], 'surface_sigma': [np.nan], + 'latitude': [np.nan], 'longitude': [np.nan]}) + + # Retrieve latitude and longitude + lat_bin_value = df['lat_bins'].iloc[0] if not df['lat_bins'].empty else np.nan + latitude = df['latitude'].mean() if 'latitude' in df.columns else df['lat'].mean() if 'lat' in df.columns else np.nan + longitude = df['longitude'].mean() if 'longitude' in df.columns else df['lon'].mean() if 'lon' in df.columns else np.nan + + # Use value_counts to get photon counts in each height bin + height_counts = df['height_bins'].value_counts().sort_index() + bin_centers = height_counts.index.astype(float) + + # Create a DataFrame for the height bins and counts + hist_df = pd.DataFrame({'zdepth': bin_centers, 'photon_counts': height_counts.values}) + + # Reverse zdepth for model alignment + hist_df['zdepth'] = hist_df['zdepth'].max() - hist_df['zdepth'] + + # Wave-adaptive fit: skip shallow bins contaminated by wave smearing/bubbles. + # zdepth=0 in the histogram corresponds to the adaptive surface detection + # threshold: max(3*sigma, 1.0 m) below the Gaussian surface peak + # (set by get_sea_surface_height_adaptive). The wave-adaptive trim adds + # an additional margin of (k - 3)*sigma beyond the 3-sigma cutoff to + # cover bubble injection beneath wave troughs and compensate for sigma + # underestimation from the truncated ±1 m Gaussian fit window. 
+ if surface_sigma is not None and wave_exclusion_multiplier > 0: + if not np.isnan(surface_sigma) and surface_sigma > wave_sigma_calm_threshold: + adaptive_offset = max(3.0 * surface_sigma, 1.0) + wave_skip = max(0.0, wave_exclusion_multiplier * surface_sigma - adaptive_offset) + trimmed = hist_df[hist_df['zdepth'] >= wave_skip] + if len(trimmed) >= 4: + logging.info("Wave-adaptive trim: sigma=%.3f, adaptive_offset=%.2f m, " + "skip=%.2f m, %d->%d bins", + surface_sigma, adaptive_offset, wave_skip, + len(hist_df), len(trimmed)) + hist_df = trimmed + # else: keep original hist_df (too few bins after trim) + + kd, e0, noise_floor = _fit_kd_with_method( + hist_df, method=kd_fit_method, decay_threshold=decay_zone_threshold, + expected_noise_floor=expected_noise_floor + ) + + return pd.DataFrame({ + 'lat_bins': [lat_bin_value], + 'kd': [kd], + 'e0': [e0], + 'noise_floor': [noise_floor], + 'surface_sigma': [surface_sigma if surface_sigma is not None else np.nan], + 'latitude': [latitude], + 'longitude': [longitude] + }) + + +# Original kd calculation function remains unchanged +def calculate_kd(filtered_seafloor_subsurface_photon_dataset, decay_zone_threshold=0.0, kd_fit_method='log_linear', + wave_exclusion_multiplier=0.0, wave_sigma_calm_threshold=0.1): + """ + Loop over along-track bins and call CalculateKdFromFilteredSubsurfacePhoton + for each bin. + + For the 'hybrid' method, a two-pass approach is used: + Pass 1 — Fit each bin independently to estimate per-bin noise floors. + Pass 2 — Compute the beam-level median noise floor, then re-fit each + bin with the expected noise floor as a stabilisation constraint. + + After fitting, IQR-based outlier filtering removes extreme Kd values + (for hybrid method only). + + Flowchart steps: + 16 — Determine depth of exponential decay zone (per bin, inside + CalculateKdFromFilteredSubsurfacePhoton). + 17 — Fit Beer's Law exponential decay curve (per bin). 
+ 18 — Save K_dph attenuation coefficient value and statistical terms + (kd, e0 columns in the returned DataFrame). + """ + logging.info("Calculating Kd from filtered subsurface photon dataset (method=%s)", kd_fit_method) + + empty_df = pd.DataFrame({'lat_bins': [], 'kd': [], 'e0': [], 'noise_floor': [], 'surface_sigma': [], 'latitude': [], 'longitude': []}) + + groups = list(filtered_seafloor_subsurface_photon_dataset.groupby('lat_bins', observed=False)) + if not groups: + return empty_df + + if kd_fit_method == 'hybrid': + # ------------------------------------------------------------------ + # Pass 1: estimate per-bin noise floors (no expected_noise_floor) + # ------------------------------------------------------------------ + logging.info("Hybrid pass 1: estimating per-bin noise floors") + pass1_results = [] + for _, group in groups: + sigma = float(group['surface_sigma'].iloc[0]) if ('surface_sigma' in group.columns and len(group) > 0) else None + pass1_results.append(CalculateKdFromFilteredSubsurfacePhoton( + group, decay_zone_threshold=decay_zone_threshold, + kd_fit_method=kd_fit_method, + surface_sigma=sigma, + wave_exclusion_multiplier=wave_exclusion_multiplier, + wave_sigma_calm_threshold=wave_sigma_calm_threshold, + )) + if not pass1_results: + return empty_df + pass1_df = pd.concat(pass1_results, ignore_index=True) + + # ------------------------------------------------------------------ + # Compute sliding-window noise floor for pass 2 + # ------------------------------------------------------------------ + # 10 km window = ±10 bins at 500 m resolution (20 bins total). + # Each bin's expected noise floor = median of valid noise floors + # within ±half_window bins. Falls back to beam-level median when + # the local window has < 3 valid estimates. 
+ HALF_WINDOW = 10 # ±10 bins = ±5 km at 500 m horizontal_res + MIN_LOCAL = 3 # minimum valid noise floors to use local median + + nf_array = pass1_df['noise_floor'].values.copy() + n_bins = len(nf_array) + + # Beam-level fallback + valid_nf_all = pass1_df['noise_floor'].dropna() + valid_nf_all = valid_nf_all[valid_nf_all > 0] + beam_median_nf = float(valid_nf_all.median()) if len(valid_nf_all) >= MIN_LOCAL else None + + if beam_median_nf is not None: + # Build per-bin expected noise floor via sliding window + expected_nf_per_bin = np.full(n_bins, np.nan) + for i in range(n_bins): + lo = max(0, i - HALF_WINDOW) + hi = min(n_bins, i + HALF_WINDOW + 1) + window_nf = nf_array[lo:hi] + valid = window_nf[~np.isnan(window_nf) & (window_nf > 0)] + if len(valid) >= MIN_LOCAL: + expected_nf_per_bin[i] = float(np.median(valid)) + else: + expected_nf_per_bin[i] = beam_median_nf # fallback + + logging.info("Hybrid pass 2: sliding window noise floor " + "(half_window=%d bins, beam_median=%.3f, " + "local range=%.3f-%.3f)", + HALF_WINDOW, beam_median_nf, + float(np.nanmin(expected_nf_per_bin)), + float(np.nanmax(expected_nf_per_bin))) + else: + expected_nf_per_bin = None + logging.info("Hybrid: insufficient noise floor estimates (%d), " + "skipping pass 2", len(valid_nf_all)) + + # ------------------------------------------------------------------ + # Pass 2: re-fit with per-bin expected noise floor constraint + # ------------------------------------------------------------------ + if expected_nf_per_bin is not None: + results = [] + for idx, (_, group) in enumerate(groups): + sigma = float(group['surface_sigma'].iloc[0]) if ('surface_sigma' in group.columns and len(group) > 0) else None + results.append(CalculateKdFromFilteredSubsurfacePhoton( + group, decay_zone_threshold=decay_zone_threshold, + kd_fit_method=kd_fit_method, + expected_noise_floor=float(expected_nf_per_bin[idx]), + surface_sigma=sigma, + wave_exclusion_multiplier=wave_exclusion_multiplier, + 
wave_sigma_calm_threshold=wave_sigma_calm_threshold + )) + SubsurfacePhotonDFAddedKd = pd.concat(results, ignore_index=True) + else: + SubsurfacePhotonDFAddedKd = pass1_df + + # ------------------------------------------------------------------ + # IQR-based outlier filtering (hybrid only) + # Uses 3×IQR to accommodate real spatial variation (e.g. turbidity + # gradients near river mouths) while removing fitting artefacts. + # ------------------------------------------------------------------ + kd_vals = SubsurfacePhotonDFAddedKd['kd'].dropna() + if len(kd_vals) >= 5: + q1 = float(kd_vals.quantile(0.25)) + q3 = float(kd_vals.quantile(0.75)) + iqr = q3 - q1 + lower = q1 - 3.0 * iqr + upper = q3 + 3.0 * iqr + # Also apply physical cap: Kd > 5.0 m⁻¹ is unrealistic + upper = min(upper, 5.0) + outlier_mask = (SubsurfacePhotonDFAddedKd['kd'] < lower) | (SubsurfacePhotonDFAddedKd['kd'] > upper) + n_outliers = int(outlier_mask.sum()) + if n_outliers > 0: + logging.info("Hybrid IQR filter: removed %d outliers (bounds: [%.4f, %.4f])", + n_outliers, lower, upper) + SubsurfacePhotonDFAddedKd.loc[outlier_mask, 'kd'] = np.nan + + else: + # ------------------------------------------------------------------ + # Non-hybrid methods: single pass, no IQR filtering + # ------------------------------------------------------------------ + results = [] + for _, group in groups: + sigma = float(group['surface_sigma'].iloc[0]) if ('surface_sigma' in group.columns and len(group) > 0) else None + results.append(CalculateKdFromFilteredSubsurfacePhoton( + group, decay_zone_threshold=decay_zone_threshold, kd_fit_method=kd_fit_method, + surface_sigma=sigma, + wave_exclusion_multiplier=wave_exclusion_multiplier, + wave_sigma_calm_threshold=wave_sigma_calm_threshold, + )) + if not results: + return empty_df + SubsurfacePhotonDFAddedKd = pd.concat(results, ignore_index=True) + + return SubsurfacePhotonDFAddedKd + + +# Updated function to apply kd calculation beam-by-beam +def 
process_kd_calculation(Final_filtered_subsurface_photon_dataset, decay_zone_threshold=0.0, kd_fit_method='log_linear', + wave_exclusion_multiplier=0.0, wave_sigma_calm_threshold=0.1): + """ + Beam-by-beam wrapper that calls calculate_kd for every beam and concatenates + the results into a single Kd output DataFrame. + + Flowchart steps: + 16 — Determine depth of exponential decay zone (delegated to + CalculateKdFromFilteredSubsurfacePhoton). + 17 — Fit Beer's Law exponential decay curve (delegated). + 18 — Save K_dph attenuation coefficient value (slope of the Beer's Law + curve) and statistical terms (kd, e0) — the returned DataFrame is + written to CSV by run_pipeline. + """ + # Initialize list to store results for each beam + kd_beam_datasets = [] + + # Group by 'beam_id' to process each beam independently + for beam_id, beam_data in Final_filtered_subsurface_photon_dataset.groupby('beam_id'): + logging.info(f"Calculating Kd for beam: {beam_id}") + + # Apply the calculate_kd function to the current beam's dataset + SubsurfacePhotonDFAddedKd = calculate_kd( + beam_data, decay_zone_threshold=decay_zone_threshold, kd_fit_method=kd_fit_method, + wave_exclusion_multiplier=wave_exclusion_multiplier, + wave_sigma_calm_threshold=wave_sigma_calm_threshold, + ) + + # Add a column to track the beam_id in the results + SubsurfacePhotonDFAddedKd['beam_id'] = beam_id + + # Append the result to the list + kd_beam_datasets.append(SubsurfacePhotonDFAddedKd) + + if not kd_beam_datasets: + logging.warning( + "process_kd_calculation: no beam data remained after filtering. " + "All bins may have been discarded by an upstream filter (e.g. histogram quality). " + "Returning empty DataFrame." 
+ ) + return pd.DataFrame(columns=['lat_bins', 'kd', 'e0', 'noise_floor', 'surface_sigma', 'latitude', 'longitude', 'beam_id']) + + # Combine results from all beams into a single DataFrame + combined_kd_dataset = pd.concat(kd_beam_datasets, ignore_index=True) + + return combined_kd_dataset \ No newline at end of file diff --git a/icesat-2_kdph_py/kd_utils/SeafloorFilterByGEBCO_Module.py b/icesat-2_kdph_py/kd_utils/SeafloorFilterByGEBCO_Module.py new file mode 100644 index 0000000..46f6a58 --- /dev/null +++ b/icesat-2_kdph_py/kd_utils/SeafloorFilterByGEBCO_Module.py @@ -0,0 +1,108 @@ +import rasterio +import numpy as np +import pandas as pd +from rtree import index +from shapely.geometry import box, Point + +# 1. Function to create an R-tree spatial index for raster bounds +def create_spatial_index(gebco_paths): + """ + Create an R-tree spatial index for raster bounds to quickly find relevant rasters. + + Parameters: + gebco_paths (list): List of paths to GEBCO raster files. + + Returns: + raster_data_dict (dict): Dictionary containing loaded rasters and their respective data. + spatial_index (rtree.index.Index): R-tree spatial index for raster bounds. + """ + idx = index.Index() + raster_data_dict = {} + for i, path in enumerate(gebco_paths): + with rasterio.Env(GTIFF_SRS_SOURCE='EPSG'): + gebco_raster = rasterio.open(path) + raster_data = gebco_raster.read(1) + raster_data_dict[path] = (gebco_raster, raster_data) + bounds = gebco_raster.bounds + idx.insert(i, (bounds.left, bounds.bottom, bounds.right, bounds.top), obj=path) + return raster_data_dict, idx + +# 2. Function to determine which rasters are needed for a given set of coordinates using spatial index in a batch manner +def get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index): + """ + Determine which raster datasets are relevant for the given coordinates using spatial index. + This function uses a more efficient approach by performing a bounding box query for batches of points. 
+ + Parameters: + lons (numpy.ndarray): Array of longitudes. + lats (numpy.ndarray): Array of latitudes. + raster_data_dict (dict): Dictionary containing raster datasets and their data. + spatial_index (rtree.index.Index): R-tree spatial index for raster bounds. + + Returns: + relevant_rasters (list): List of relevant raster datasets and their respective data. + """ + # Create a bounding box that covers all points + min_lon, max_lon = lons.min(), lons.max() + min_lat, max_lat = lats.min(), lats.max() + bounding_box = box(min_lon, min_lat, max_lon, max_lat) + + # Get all rasters that intersect with the bounding box + matches = list(spatial_index.intersection((bounding_box.bounds), objects=True)) + relevant_paths = {match.object for match in matches} + relevant_rasters = [(raster_data_dict[path][0], raster_data_dict[path][1]) for path in relevant_paths] + return relevant_rasters + +# 3. Function to get seafloor elevation from the relevant rasters in a vectorized manner +def get_seafloor_elevation(lons, lats, relevant_rasters): + """ + Get seafloor elevations for a batch of points based on longitude and latitude from relevant rasters. + + Parameters: + lons (numpy.ndarray): Array of longitudes. + lats (numpy.ndarray): Array of latitudes. + relevant_rasters (list): List of relevant raster datasets and their respective data. + + Returns: + seafloor_elevations (numpy.ndarray): Array of seafloor elevations. + """ + points = np.vstack((lons, lats)).T + seafloor_elevations = np.full(len(lons), np.nan) + + for gebco_raster, raster_data in relevant_rasters: + # Use rasterio.sample to get values for multiple points in a batch + values = list(gebco_raster.sample(points)) + for idx, value in enumerate(values): + if np.isnan(seafloor_elevations[idx]) and value is not None: + seafloor_elevations[idx] = value[0] + + return seafloor_elevations + +# 4. 
The main process function that ties everything together +def process_seafloor_data(gebco_paths, sea_photon_dataset): + """ + Main function to process the seafloor data. + + Parameters: + gebco_paths (list): List of paths to GEBCO raster files. + sea_photon_dataset (pandas.DataFrame): Dataset containing longitude, latitude, and photon height. + + Returns: + filtered_sea_photon_dataset (pandas.DataFrame): Filtered dataset containing points above the seafloor. + """ + # Create spatial index and load GEBCO Raster Data + raster_data_dict, spatial_index = create_spatial_index(gebco_paths) + + # Get the relevant rasters using spatial index + lons = sea_photon_dataset['longitude'].values + lats = sea_photon_dataset['latitude'].values + relevant_rasters = get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index) + + # Get seafloor elevation for all points + sea_photon_dataset['seafloor_elevation'] = get_seafloor_elevation(lons, lats, relevant_rasters) + + # Filter out points below the seafloor + filtered_sea_photon_dataset = sea_photon_dataset[sea_photon_dataset['photon_height'] > sea_photon_dataset['seafloor_elevation']] + # filtered_sea_photon_dataset.drop(columns=['seafloor_elevation'], inplace=True) + + return filtered_sea_photon_dataset \ No newline at end of file diff --git a/icesat-2_kdph_py/kd_utils/__init__.py b/icesat-2_kdph_py/kd_utils/__init__.py new file mode 100644 index 0000000..0c3915c --- /dev/null +++ b/icesat-2_kdph_py/kd_utils/__init__.py @@ -0,0 +1,8 @@ +# utils/__init__.py + +from .data_processing import load_data, extract_file_params, Extract_sea_photons, create_photon_dataframe +from .visualization import plot_photon_height, plot_kd_photons +from .Kd_analysis import CalculateKdFromFilteredSubsurfacePhoton +from .interpolation import interpolate_labels, apply_interpolation, geoid_correction, refraction_correction +from .bathy_processing import * +from .sea_photons_analysis import * \ No newline at end of file diff --git 
a/icesat-2_kdph_py/kd_utils/bathy_processing.py b/icesat-2_kdph_py/kd_utils/bathy_processing.py new file mode 100644 index 0000000..772f5cd --- /dev/null +++ b/icesat-2_kdph_py/kd_utils/bathy_processing.py @@ -0,0 +1,953 @@ +# utils/bathy_processing.py +import os +import logging +import pandas as pd +import rasterio +import numpy as np +from kd_utils.sea_photons_analysis import ( + get_sea_surface_height_adaptive, + get_sea_surface_height_static, + horizontal_vertical_bin_dataset, +) +from rtree import index +from shapely.geometry import box, Point +from scipy.spatial import cKDTree +from scipy.stats import norm + + +def apply_optional_histogram_quality_filter( + binned_dataset_sea_surface, + subsurface_photon_dataset, + sea_surface_height, + min_ratio=0.05, + depth_min=6.0, + depth_max=7.0, + reference_depth_min=0.0, + reference_depth_max=1.0 +): + """ + Optional histogram quality check: keep along-track bins only if enough + signal remains at the target depth band relative to near-surface signal. + + Flowchart step: 6 — Review histograms for quality: if <5% of signal remains + at an uncorrected water depth of 6-7 m below the surface, quality-flag this + along-track bin as low confidence and exclude it from further analyses + (Possible discard). + + The decay ratio is computed as: + photon_count_in_depth_band / photon_count_in_reference_band + + This measures how much signal has decayed at depth compared to near the + surface, consistent with Beer's Law attenuation. + + Parameters + ---------- + binned_dataset_sea_surface : pd.DataFrame + Full binned photon dataset with 'lat_bins' and 'photon_height'. + subsurface_photon_dataset : pd.DataFrame + Subsurface photon dataset to filter. + sea_surface_height : list + Sea surface height per along-track bin. + min_ratio : float + Minimum decay ratio to keep a bin (default 0.05 = 5%). + depth_min : float + Minimum depth of the quality-check band (default 6.0 m). 
+ depth_max : float + Maximum depth of the quality-check band (default 7.0 m). + reference_depth_min : float + Minimum depth of the near-surface reference band (default 0.0 m). + reference_depth_max : float + Maximum depth of the near-surface reference band (default 1.0 m). + """ + if subsurface_photon_dataset.empty: + return subsurface_photon_dataset + + grouped = binned_dataset_sea_surface.groupby(['lat_bins'], observed=False) + lat_bin_keys = list(grouped.groups.keys()) + if len(lat_bin_keys) != len(sea_surface_height): + return subsurface_photon_dataset + + quality_df = pd.DataFrame({ + 'lat_bins': lat_bin_keys, + 'sea_surface_height': sea_surface_height + }).dropna(subset=['sea_surface_height']).copy() + if quality_df.empty: + return subsurface_photon_dataset.iloc[0:0].copy() + + ratios = [] + for _, row in quality_df.iterrows(): + lat_bin = row['lat_bins'] + surface = row['sea_surface_height'] + bin_data = binned_dataset_sea_surface[binned_dataset_sea_surface['lat_bins'] == lat_bin] + if bin_data.empty: + ratios.append(0.0) + continue + + depth = surface - bin_data['photon_height'] + underwater_mask = depth > 0 + + # Reference band: near-surface photon count (incoming signal) + ref_mask = underwater_mask & (depth >= reference_depth_min) & (depth <= reference_depth_max) + ref_count = float(ref_mask.sum()) + if ref_count == 0: + ratios.append(0.0) + continue + + # Target band: photon count at the check depth + band_mask = underwater_mask & (depth >= depth_min) & (depth <= depth_max) + band_count = float(band_mask.sum()) + + # Normalize by band width so bands of different sizes are comparable + ref_width = max(reference_depth_max - reference_depth_min, 0.01) + band_width = max(depth_max - depth_min, 0.01) + decay_ratio = (band_count / band_width) / (ref_count / ref_width) + + ratios.append(decay_ratio) + + quality_df['hist_quality_ratio'] = ratios + valid_bins = set(quality_df.loc[quality_df['hist_quality_ratio'] >= min_ratio, 'lat_bins'].tolist()) + return 
subsurface_photon_dataset[subsurface_photon_dataset['lat_bins'].isin(valid_bins)].copy() + + +def compute_surface_sigma(binned_dataset_sea_surface, sea_surface_height): + """Compute per-bin Gaussian surface sigma from photons near the surface peak. + + Uses a two-pass fit: initial ±1 m window, then expands to ±min(2.5σ, 3 m) + if the initial sigma > 0.4 m (indicating the ±1 m window truncates the + distribution). Returns a dict mapping lat_bin -> sigma (float or NaN). + Bins with fewer than 10 surface photons get NaN. + """ + grouped = binned_dataset_sea_surface.groupby(['lat_bins'], observed=False) + lat_bin_keys = list(grouped.groups.keys()) + if len(lat_bin_keys) != len(sea_surface_height): + return {} + + sigma_map = {} + for lat_bin, surface in zip(lat_bin_keys, sea_surface_height): + if pd.isna(surface): + continue + bin_data = binned_dataset_sea_surface[binned_dataset_sea_surface['lat_bins'] == lat_bin] + if bin_data.empty: + continue + peak_data = bin_data[ + (bin_data['photon_height'] > surface - 1.0) & + (bin_data['photon_height'] < surface + 1.0) + ] + if len(peak_data) > 10: + mu, sigma = norm.fit(peak_data['photon_height']) + if sigma > 0.4: + half_win2 = min(2.5 * sigma, 3.0) + peak_data2 = bin_data[ + (bin_data['photon_height'] > mu - half_win2) & + (bin_data['photon_height'] < mu + half_win2) + ] + if len(peak_data2) > 10: + _, sigma = norm.fit(peak_data2['photon_height']) + else: + sigma = np.nan + sigma_map[lat_bin] = sigma + + return sigma_map + + +def apply_optional_surface_sigma_filter( + binned_dataset_sea_surface, + subsurface_photon_dataset, + sea_surface_height, + sigma_max=0.5 +): + """ + Optional Gaussian surface sigma filter: discard along-track bins if the + fitted surface Gaussian sigma exceeds sigma_max. + + Flowchart step: 8 — If standard deviation of Gaussian peak is >X m, + quality-flag this along-track bin as low confidence and discard it + (addresses surface waves) (Possible discard). 
+ """ + if subsurface_photon_dataset.empty: + return subsurface_photon_dataset + + sigma_map = compute_surface_sigma(binned_dataset_sea_surface, sea_surface_height) + if not sigma_map: + return subsurface_photon_dataset + + sigma_df = pd.DataFrame( + list(sigma_map.items()), columns=['lat_bins', 'surface_sigma'] + ) + valid_bins = set(sigma_df.loc[sigma_df['surface_sigma'] <= sigma_max, 'lat_bins'].tolist()) + return subsurface_photon_dataset[subsurface_photon_dataset['lat_bins'].isin(valid_bins)].copy() + + +def apply_optional_refraction_correction( + binned_dataset_sea_surface, + subsurface_photon_dataset, + sea_surface_height, + water_temp_c=20.0, + wavelength_nm=532.0 +): + """ + Optional refraction correction for subsurface photons. + Updates photon positions/heights using Snell-based geometry. + + Flowchart step: 9 — Compute water depths (distance below surface) and + correct depths for refraction. + """ + required_cols = {'lat_bins', 'photon_height', 'ref_elevation', 'ref_azimuth', 'lon', 'lat'} + if subsurface_photon_dataset.empty or (not required_cols.issubset(set(subsurface_photon_dataset.columns))): + return subsurface_photon_dataset + + grouped = binned_dataset_sea_surface.groupby(['lat_bins'], observed=False) + lat_bin_keys = list(grouped.groups.keys()) + if len(lat_bin_keys) != len(sea_surface_height): + return subsurface_photon_dataset + + surface_df = pd.DataFrame({'lat_bins': lat_bin_keys, 'sea_surface_height': sea_surface_height}) + corrected = subsurface_photon_dataset.merge(surface_df, on='lat_bins', how='left').copy() + if corrected['sea_surface_height'].isna().all(): + return subsurface_photon_dataset + + corrected['photon_height_pre_refraction'] = corrected['photon_height'] + corrected['lon_pre_refraction'] = corrected['lon'] + corrected['lat_pre_refraction'] = corrected['lat'] + + # Refraction index parameterization from legacy module. 
+ a = -0.000001501562500 + b = 0.000000107084865 + c = -0.000042759374989 + d = -0.000160475520686 + e = 1.398067112092424 + n1 = 1.00029 + n2 = (a * water_temp_c ** 2) + (b * wavelength_nm ** 2) + (c * water_temp_c) + (d * wavelength_nm) + e + + ref_elev = pd.to_numeric(corrected['ref_elevation'], errors='coerce').to_numpy(dtype=float) + ref_az = pd.to_numeric(corrected['ref_azimuth'], errors='coerce').to_numpy(dtype=float) + z = pd.to_numeric(corrected['photon_height'], errors='coerce').to_numpy(dtype=float) + ws = pd.to_numeric(corrected['sea_surface_height'], errors='coerce').to_numpy(dtype=float) + x = pd.to_numeric(corrected['lon'], errors='coerce').to_numpy(dtype=float) + y = pd.to_numeric(corrected['lat'], errors='coerce').to_numpy(dtype=float) + + # Convert to radians if values look like degrees. + if np.nanmax(np.abs(ref_elev)) > (2 * np.pi): + ref_elev = np.deg2rad(ref_elev) + if np.nanmax(np.abs(ref_az)) > (2 * np.pi): + ref_az = np.deg2rad(ref_az) + + valid_mask = np.isfinite(ref_elev) & np.isfinite(ref_az) & np.isfinite(z) & np.isfinite(ws) & (z <= ws) + if not np.any(valid_mask): + return subsurface_photon_dataset + + theta1 = (np.pi / 2.0) - ref_elev[valid_mask] + theta2_arg = (n1 * np.sin(theta1)) / n2 + theta2_arg = np.clip(theta2_arg, -1.0, 1.0) + theta2 = np.arcsin(theta2_arg) + + D = ws[valid_mask] - z[valid_mask] + cos_theta1 = np.cos(theta1) + cos_theta1 = np.where(np.abs(cos_theta1) < 1e-6, np.nan, cos_theta1) + S = D / cos_theta1 + R = (S * n1) / n2 + Gamma = (np.pi / 2.0) - theta1 + phi = theta1 - theta2 + P_sq = np.maximum(R ** 2 + S ** 2 - 2 * R * S * np.cos(phi), 0.0) + P = np.sqrt(P_sq) + alpha_arg = np.divide(R * np.sin(phi), P, out=np.zeros_like(P), where=(P != 0)) + alpha_arg = np.clip(alpha_arg, -1.0, 1.0) + alpha = np.arcsin(alpha_arg) + Beta = Gamma - alpha + + DY = P * np.cos(Beta) + DZ = P * np.sin(Beta) + DE = DY * np.sin(ref_az[valid_mask]) + DN = DY * np.cos(ref_az[valid_mask]) + + x_corr = x[valid_mask] + DE + y_corr = 
y[valid_mask] + DN + z_corr = z[valid_mask] + DZ + + corrected.loc[valid_mask, 'lon'] = x_corr + corrected.loc[valid_mask, 'lat'] = y_corr + corrected.loc[valid_mask, 'photon_height'] = z_corr + return corrected + + +def apply_sea_surface_flattening( + subsurface_photon_dataset, + sea_surface_height, + lat_bin_keys, + horizontal_res, + flattening_window_m=500 +): + """ + Flatten small-bin sea-surface variations to the mean sea level of larger along-track windows. + This follows the standalone SeaSurfaceFlattening module logic, but is optional. + """ + if subsurface_photon_dataset.empty: + return subsurface_photon_dataset + + if len(sea_surface_height) != len(lat_bin_keys): + return subsurface_photon_dataset + + surface_df = pd.DataFrame({ + 'lat_bins': lat_bin_keys, + 'sea_surface_height_local': sea_surface_height + }).dropna(subset=['sea_surface_height_local']).copy() + if surface_df.empty: + return subsurface_photon_dataset + + surface_df['lat_bins_num'] = pd.to_numeric(surface_df['lat_bins'], errors='coerce') + surface_df = surface_df.dropna(subset=['lat_bins_num']) + if surface_df.empty: + return subsurface_photon_dataset + + big_bin_size = max(1, int(round(flattening_window_m / max(horizontal_res, 1)))) + if big_bin_size <= 1: + logging.warning( + "Sea surface flattening window (%d m) <= horizontal_res (%d m) → " + "big_bin_size=1, flattening has no effect. 
" + "Use --sea_surface_flattening_window_m > --horizontal_res.", + flattening_window_m, horizontal_res + ) + surface_df['big_bin'] = (surface_df['lat_bins_num'].astype(int) // big_bin_size).astype(int) + mean_surface = ( + surface_df.groupby('big_bin', observed=False)['sea_surface_height_local'] + .mean() + .rename('sea_surface_height_bigbin_mean') + .reset_index() + ) + surface_df = surface_df.merge(mean_surface, on='big_bin', how='left') + surface_df['sea_surface_flattening_offset'] = ( + surface_df['sea_surface_height_bigbin_mean'] - surface_df['sea_surface_height_local'] + ) + + flattened = subsurface_photon_dataset.copy() + flattened['lat_bins_num'] = pd.to_numeric(flattened['lat_bins'], errors='coerce') + flattened['big_bin'] = (flattened['lat_bins_num'].fillna(-1).astype(int) // big_bin_size).astype(int) + flattened = flattened.merge( + surface_df[['lat_bins', 'sea_surface_height_local', 'sea_surface_height_bigbin_mean', 'sea_surface_flattening_offset']], + on='lat_bins', + how='left' + ) + flattened['photon_height_original'] = flattened['photon_height'] + flattened['photon_height'] = ( + flattened['photon_height'] - flattened['sea_surface_flattening_offset'].fillna(0.0) + ) + + # Re-bin height_bins from flattened photon_height so that downstream + # Kd calculation (which reads height_bins) uses the corrected depths. 
+ if 'height_bins' in flattened.columns: + vertical_res_actual = horizontal_res # not used; infer from existing bins + # Infer the vertical resolution from the original height_bins spacing + orig_bins = pd.to_numeric(flattened['height_bins'], errors='coerce').dropna().unique() + if len(orig_bins) >= 2: + sorted_bins = np.sort(orig_bins) + diffs = np.diff(sorted_bins) + v_res = float(np.median(diffs[diffs > 0])) if np.any(diffs > 0) else 0.25 + else: + v_res = 0.25 + h_min = flattened['photon_height'].min() + h_max = flattened['photon_height'].max() + n_hbins = max(1, int(round((h_max - h_min) / v_res))) + bin_edges = np.linspace(h_min, h_max, n_hbins + 1) + bin_labels = np.round((bin_edges[:-1] + bin_edges[1:]) / 2, decimals=1) + flattened['height_bins'] = pd.cut( + flattened['photon_height'], bins=bin_edges, labels=bin_labels, + include_lowest=True + ) + + return flattened.drop(columns=['lat_bins_num', 'big_bin'], errors='ignore') + +# 1. Function to create an R-tree spatial index for raster bounds +def create_spatial_index(gebco_paths): + """ + Create an R-tree spatial index for raster bounds to quickly find relevant rasters. + + Parameters: + gebco_paths (list): List of paths to GEBCO raster files. + + Returns: + raster_data_dict (dict): Dictionary containing loaded rasters and their respective data. + spatial_index (rtree.index.Index): R-tree spatial index for raster bounds. + """ + idx = index.Index() + raster_data_dict = {} + for i, path in enumerate(gebco_paths): + with rasterio.Env(GTIFF_SRS_SOURCE='EPSG'): + gebco_raster = rasterio.open(path) + raster_data = gebco_raster.read(1) + raster_data_dict[path] = (gebco_raster, raster_data) + bounds = gebco_raster.bounds + idx.insert(i, (bounds.left, bounds.bottom, bounds.right, bounds.top), obj=path) + return raster_data_dict, idx + +# 2. 
# Function to determine which rasters are needed for a given set of coordinates
# using the spatial index in a batch manner
def get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index):
    """
    Select the rasters whose bounds intersect the bounding box of the points.

    A single bounding-box query is issued for the whole batch of points
    instead of one query per point.

    Parameters
    ----------
    lons : numpy.ndarray
        Array of longitudes.
    lats : numpy.ndarray
        Array of latitudes.
    raster_data_dict : dict
        Maps raster path -> (open raster dataset, band-1 array), as built by
        create_spatial_index.
    spatial_index : rtree.index.Index
        R-tree index whose items carry the raster path as their object.

    Returns
    -------
    list
        (raster dataset, raster data) tuples, ordered by the index id of the
        raster so the result is deterministic. Empty when there are no
        finite coordinates to query.
    """
    lons = np.asarray(lons, dtype=float)
    lats = np.asarray(lats, dtype=float)
    finite = np.isfinite(lons) & np.isfinite(lats)
    if not finite.any():
        # Nothing to look up (empty batch or only NaN coordinates).
        return []

    # (min_x, min_y, max_x, max_y) of all usable points; this is exactly what
    # a Shapely box(...).bounds would return, without building a geometry.
    query_bounds = (
        float(lons[finite].min()),
        float(lats[finite].min()),
        float(lons[finite].max()),
        float(lats[finite].max()),
    )

    # De-duplicate paths while remembering the index id each one was inserted
    # with, so overlapping tiles are always consulted in the same order.
    first_id_by_path = {}
    for match in spatial_index.intersection(query_bounds, objects=True):
        first_id_by_path.setdefault(match.object, match.id)

    ordered_paths = sorted(first_id_by_path, key=first_id_by_path.get)
    return [(raster_data_dict[path][0], raster_data_dict[path][1]) for path in ordered_paths]


# 3. Function to get seafloor elevation from the relevant rasters in a vectorized manner
def get_seafloor_elevation(lons, lats, relevant_rasters):
    """
    Sample seafloor elevations for a batch of points from the relevant rasters.

    Each point takes its value from the first raster (in list order) that
    actually covers it. Samples that fall outside a raster's bounds or that
    equal the raster's nodata value are ignored, so a later raster can still
    supply the elevation.

    Parameters
    ----------
    lons : numpy.ndarray
        Array of longitudes.
    lats : numpy.ndarray
        Array of latitudes.
    relevant_rasters : list
        (raster dataset, raster data) tuples. Only the dataset is used; it
        must provide ``sample(points)`` and may provide ``bounds`` and
        ``nodata``.

    Returns
    -------
    numpy.ndarray
        Float array of seafloor elevations; NaN where no raster supplied a
        valid value.
    """
    lons = np.asarray(lons, dtype=float)
    lats = np.asarray(lats, dtype=float)
    seafloor_elevations = np.full(len(lons), np.nan)
    if len(lons) == 0:
        return seafloor_elevations

    has_coords = np.isfinite(lons) & np.isfinite(lats)

    for gebco_raster, _raster_data in relevant_rasters:
        # Only points that are still unfilled need to be sampled.
        pending = np.isnan(seafloor_elevations) & has_coords
        if not pending.any():
            break

        bounds = getattr(gebco_raster, 'bounds', None)
        if bounds is not None:
            # Sampling outside a tile yields a fill value, not real data, so
            # restrict the request to points inside this tile.
            left, bottom, right, top = bounds
            pending &= (lons >= left) & (lons <= right) & (lats >= bottom) & (lats <= top)

        target = np.flatnonzero(pending)
        if target.size == 0:
            continue

        points = np.column_stack((lons[target], lats[target]))
        # sample() yields one array per point; element 0 is the first band.
        sampled = np.array([value[0] for value in gebco_raster.sample(points)], dtype=float)

        usable = np.isfinite(sampled)
        nodata = getattr(gebco_raster, 'nodata', None)
        if nodata is not None and np.isfinite(nodata):
            usable &= sampled != float(nodata)

        seafloor_elevations[target[usable]] = sampled[usable]

    return seafloor_elevations


def query_gebco_seafloor_elevation(sea_photon_dataset, gebco_paths):
    """
    Query GEBCO raster tiles and attach a 'seafloor_elevation' column to the
    photon dataset. Does not remove any rows.

    Flowchart step: 12 — Retrieve GEBCO data (or other regional raster
    bathymetry data) for the region when ATL24 is unavailable or returns no
    matching points.

    Parameters
    ----------
    sea_photon_dataset : pd.DataFrame
        Must contain 'longitude' and 'latitude' columns.
    gebco_paths : list
        Paths to GEBCO GeoTIFF raster files.

    Returns
    -------
    pd.DataFrame
        Copy of the input with a 'seafloor_elevation' column added (negative =
        below sea level, NaN where no raster covers the point).
    """
    dataset = sea_photon_dataset.copy()
    raster_data_dict, spatial_index = create_spatial_index(gebco_paths)
    try:
        lons = dataset['longitude'].values
        lats = dataset['latitude'].values
        relevant_rasters = get_relevant_rasters_using_index(
            lons, lats, raster_data_dict, spatial_index
        )
        dataset['seafloor_elevation'] = get_seafloor_elevation(lons, lats, relevant_rasters)
    finally:
        # create_spatial_index leaves every raster dataset open; this function
        # runs once per beam, so release the handles when the lookup is done.
        for gebco_raster, _raster_data in raster_data_dict.values():
            gebco_raster.close()
    return dataset


def remove_photons_below_seafloor(sea_photon_dataset):
    """
    Remove photons whose height is at or below the seabed elevation.

    Flowchart step: 13 — Remove data below seabed (seabed defined by ATL24 or
    GEBCO).

    Parameters
    ----------
    sea_photon_dataset : pd.DataFrame
        Must contain 'photon_height' and 'seafloor_elevation' columns.

    Returns
    -------
    pd.DataFrame
        Copy holding only rows where photon_height > seafloor_elevation. Rows
        with a NaN in either column fail the comparison and are dropped too.
    """
    above_seabed = sea_photon_dataset['photon_height'] > sea_photon_dataset['seafloor_elevation']
    return sea_photon_dataset[above_seabed].copy()


def discard_shallow_seabed_bins(sea_photon_dataset, min_depth_m):
    """
    Discard photon data where the seabed is shallower than the minimum depth
    threshold. Kd is not calculated for the discarded data; it is labelled as
    low confidence by exclusion.

    Flowchart step: 14 — Discard all photon data in horizontal bins where the
    seabed is <5 m deep; Kd is not calculated for these bins — label discarded
    bins as low confidence.

    Parameters
    ----------
    sea_photon_dataset : pd.DataFrame
        Must contain a 'depth' column (= -seafloor_elevation, positive downward).
    min_depth_m : float
        Minimum seabed depth to retain (e.g., 5.0 m).

    Returns
    -------
    pd.DataFrame
        Copy holding only rows where depth >= min_depth_m. The test is applied
        row by row on the 'depth' column.
    """
    deep_enough = sea_photon_dataset['depth'] >= min_depth_m
    return sea_photon_dataset[deep_enough].copy()


# 4. The main process function that ties everything together
def process_seafloor_data(sea_photon_dataset, gebco_paths, Ignore_Subsurface_Height_Thres):
    """
    Query GEBCO bathymetry then apply seabed and shallow-water filters.

    Delegates to three single-step helpers:
        query_gebco_seafloor_elevation  → step 12
        remove_photons_below_seafloor   → step 13
        discard_shallow_seabed_bins     → step 14

    Parameters
    ----------
    sea_photon_dataset : pd.DataFrame
        Dataset containing 'longitude', 'latitude' and 'photon_height'.
    gebco_paths : list
        Paths to GEBCO raster files.
    Ignore_Subsurface_Height_Thres : float
        Shallow-water threshold in metres; its absolute value is used as the
        minimum seabed depth to keep.

    Returns
    -------
    pd.DataFrame
        Photons above the seafloor in water at least as deep as the threshold,
        with 'seafloor_elevation' and 'depth' columns added.
    """
    dataset = query_gebco_seafloor_elevation(sea_photon_dataset, gebco_paths)  # step 12
    dataset['depth'] = -dataset['seafloor_elevation']
    dataset = remove_photons_below_seafloor(dataset)  # step 13
    return discard_shallow_seabed_bins(dataset, abs(Ignore_Subsurface_Height_Thres))  # step 14


def _empty_atl24_frame():
    """Return an empty frame with the normalized ATL24 column layout."""
    return pd.DataFrame(columns=['longitude', 'latitude', 'seafloor_elevation_atl24'])


def _read_atl24_hdf5(atl24_file_path):
    """Read bathymetry-class photons of every beam group from an ATL24 HDF5 file."""
    import h5py

    beams = ('gt1l', 'gt1r', 'gt2l', 'gt2r', 'gt3l', 'gt3r')
    required = ('lon_ph', 'lat_ph', 'ortho_h', 'class_ph')
    frames = []
    with h5py.File(atl24_file_path, 'r') as f:
        for beam in beams:
            if beam not in f:
                continue
            grp = f[beam]
            if not all(k in grp for k in required):
                continue
            bathy_mask = grp['class_ph'][:] == 40  # ATL24 bathymetry class
            # Apply low_confidence_flag filter if available
            if 'low_confidence_flag' in grp:
                bathy_mask = bathy_mask & (grp['low_confidence_flag'][:] == 0)
            if not bathy_mask.any():
                continue
            frames.append(pd.DataFrame({
                'longitude': grp['lon_ph'][:][bathy_mask].astype(float),
                'latitude': grp['lat_ph'][:][bathy_mask].astype(float),
                'seafloor_elevation_atl24': grp['ortho_h'][:][bathy_mask].astype(float),
            }))
    if not frames:
        return _empty_atl24_frame()
    return pd.concat(frames, ignore_index=True).dropna()


def load_atl24_points(atl24_file_path):
    """
    Load ATL24 point dataset and normalize to columns: longitude, latitude,
    seafloor_elevation_atl24.

    Flowchart step: 12 — Query ATL24 data for bathymetry match; if no match,
    retrieve GEBCO data (or other regional raster bathymetry data) for region.

    Supported inputs are HDF5 (.h5/.hdf5), delimited text (.csv/.txt),
    .parquet and vector files (.gpkg/.shp/.geojson). Loading is best effort:
    a missing, unsupported or unreadable file yields an empty frame so the
    caller can fall back to another bathymetry source; read failures are
    logged as warnings rather than raised.

    Parameters
    ----------
    atl24_file_path : str or os.PathLike
        Path of the ATL24 file; may be empty.

    Returns
    -------
    pd.DataFrame
        Columns 'longitude', 'latitude', 'seafloor_elevation_atl24'
        (elevation, negative below the reference surface). Columns named
        like a depth are converted to negative elevations.
    """
    if (not atl24_file_path) or (not os.path.exists(atl24_file_path)):
        return _empty_atl24_frame()

    lower_path = os.fspath(atl24_file_path).lower()
    try:
        if lower_path.endswith(('.h5', '.hdf5')):
            return _read_atl24_hdf5(atl24_file_path)
        if lower_path.endswith(('.csv', '.txt')):
            atl24_df = pd.read_csv(atl24_file_path)
        elif lower_path.endswith('.parquet'):
            atl24_df = pd.read_parquet(atl24_file_path)
        elif lower_path.endswith(('.gpkg', '.shp', '.geojson')):
            import geopandas as gpd
            atl24_gdf = gpd.read_file(atl24_file_path)
            atl24_df = pd.DataFrame(atl24_gdf)
            if ('longitude' not in atl24_df.columns or 'latitude' not in atl24_df.columns) and ('geometry' in atl24_gdf.columns):
                atl24_df['longitude'] = atl24_gdf.geometry.x
                atl24_df['latitude'] = atl24_gdf.geometry.y
        else:
            return _empty_atl24_frame()
    except Exception:
        # Best effort by design, but never silent: record why ATL24 was skipped.
        logging.warning(
            "Could not read ATL24 file %s; continuing without ATL24 bathymetry.",
            atl24_file_path, exc_info=True
        )
        return _empty_atl24_frame()

    # Candidate names are tried in order. The ATL24-native names read by the
    # HDF5 branch (lon_ph / lat_ph / ortho_h) come last, so files that already
    # matched an earlier candidate resolve exactly as before.
    lon_candidates = ['longitude', 'lon', 'x', 'LONGITUDE', 'LON', 'lon_ph']
    lat_candidates = ['latitude', 'lat', 'y', 'LATITUDE', 'LAT', 'lat_ph']
    elev_candidates = [
        'seafloor_elevation', 'bottom_elevation', 'bathymetry', 'elevation', 'z',
        'depth', 'water_depth', 'ortho_h'
    ]

    lon_col = next((c for c in lon_candidates if c in atl24_df.columns), None)
    lat_col = next((c for c in lat_candidates if c in atl24_df.columns), None)
    elev_col = next((c for c in elev_candidates if c in atl24_df.columns), None)
    if lon_col is None or lat_col is None or elev_col is None:
        return _empty_atl24_frame()

    out_df = pd.DataFrame({
        'longitude': pd.to_numeric(atl24_df[lon_col], errors='coerce'),
        'latitude': pd.to_numeric(atl24_df[lat_col], errors='coerce'),
        'atl24_raw_bathy': pd.to_numeric(atl24_df[elev_col], errors='coerce')
    }).dropna(subset=['longitude', 'latitude', 'atl24_raw_bathy']).copy()

    # A "depth" column is positive downward; flip it to a negative elevation.
    if 'depth' in elev_col.lower() and ('elev' not in elev_col.lower()):
        out_df['seafloor_elevation_atl24'] = -out_df['atl24_raw_bathy'].abs()
    else:
        out_df['seafloor_elevation_atl24'] = out_df['atl24_raw_bathy']

    return out_df[['longitude', 'latitude', 'seafloor_elevation_atl24']]


def process_seafloor_data_atl24(
    sea_photon_dataset,
    atl24_file_path,
    Ignore_Subsurface_Height_Thres,
    max_match_distance_deg=0.01
):
    """
    Match ATL24 bathymetry points to photons, then filter below seafloor and
    shallow bins. Returns filtered dataset and number of matched photons.

    Flowchart steps:
      12 — Query ATL24 data for bathymetry match; if no match, retrieve GEBCO data.
      13 — Remove data below seabed (seabed defined by ATL24 or GEBCO).
      14 — Discard all photon data in horizontal bins where the seabed is <5 m
           deep; Kd is not calculated for these bins — label discarded bins as
           low confidence.

    Parameters
    ----------
    sea_photon_dataset : pd.DataFrame
        Must contain 'longitude', 'latitude' and 'photon_height'.
    atl24_file_path : str
        Path handed to load_atl24_points.
    Ignore_Subsurface_Height_Thres : float
        Shallow-water threshold in metres; its absolute value is the minimum
        seabed depth to keep.
    max_match_distance_deg : float, default 0.01
        Largest nearest-neighbour distance, in the units of the
        longitude/latitude columns, for a photon to count as matched.

    Returns
    -------
    tuple
        (filtered dataset, matched photon count). When nothing can be matched
        the input dataset is returned unchanged together with 0.
    """
    atl24_points = load_atl24_points(atl24_file_path)
    if atl24_points.empty:
        return sea_photon_dataset, 0

    photon_coords = sea_photon_dataset[['longitude', 'latitude']].to_numpy(dtype=float)
    atl24_coords = atl24_points[['longitude', 'latitude']].to_numpy(dtype=float)
    if len(photon_coords) == 0 or len(atl24_coords) == 0:
        return sea_photon_dataset, 0

    tree = cKDTree(atl24_coords)

    # Pre-fill with the values cKDTree itself reports for "no neighbour"
    # (infinite distance, index == number of points); photons with NaN
    # coordinates are then treated as unmatched instead of being sent to
    # query(), which rejects non-finite input.
    distances = np.full(len(photon_coords), np.inf)
    indices = np.full(len(photon_coords), len(atl24_coords), dtype=np.intp)
    queryable = np.isfinite(photon_coords).all(axis=1)
    if queryable.any():
        found_dist, found_idx = tree.query(
            photon_coords[queryable], k=1, distance_upper_bound=max_match_distance_deg
        )
        distances[queryable] = found_dist
        indices[queryable] = found_idx

    valid_match = np.isfinite(distances) & (indices < len(atl24_points))
    matched_count = int(valid_match.sum())
    if matched_count == 0:
        return sea_photon_dataset, 0

    matched_dataset = sea_photon_dataset.copy()
    matched_dataset['seafloor_elevation'] = np.nan
    matched_dataset.loc[valid_match, 'seafloor_elevation'] = atl24_points['seafloor_elevation_atl24'].to_numpy()[indices[valid_match]]
    matched_dataset = matched_dataset.dropna(subset=['seafloor_elevation']).copy()
    matched_dataset['depth'] = -matched_dataset['seafloor_elevation']

    above_floor = remove_photons_below_seafloor(matched_dataset)  # step 13
    filtered_dataset = discard_shallow_seabed_bins(above_floor, abs(Ignore_Subsurface_Height_Thres))  # step 14
    return filtered_dataset, matched_count


def rebuild_and_refit_surface_after_refraction(
    binned_dataset_sea_surface,
    sea_surface_height,
    water_temp_c,
    wavelength_nm,
    horizontal_res,
    vertical_res,
):
    """
    Apply refraction correction to the full binned dataset, rebuild histograms
    on the corrected depths, and re-fit the Gaussian surface peak.

    Flowchart steps:
      10 — Re-build histograms based on corrected depths
           (apply_optional_refraction_correction → horizontal_vertical_bin_dataset).
      11 — Re-fit Gaussian curve to surface to identify surface peak; compute
           standard deviation and remove histogram data within three standard
           deviations (get_sea_surface_height_adaptive on the rebinned data).

    Parameters
    ----------
    binned_dataset_sea_surface : pd.DataFrame
        Full binned photon dataset before refraction correction.
    sea_surface_height : list
        Per-bin surface heights from step 7.
    water_temp_c : float
        Water temperature for the refraction index calculation.
    wavelength_nm : float
        Laser wavelength (nm) for the refraction index calculation.
    horizontal_res : float
        Along-track bin size (m) for histogram rebuild.
    vertical_res : float
        Vertical bin size (m) for histogram rebuild.

    Returns
    -------
    tuple
        (sea_surface_height, sea_surface_height_abnormal_label,
        solo_sea_surface_label, subsurface_photon_dataset) — same 4-tuple as
        get_sea_surface_height_adaptive.
    """
    # The correction is given a copy as the photon set, so the caller's binned
    # dataset is not the object being corrected.
    corrected_full_dataset = apply_optional_refraction_correction(
        binned_dataset_sea_surface,
        binned_dataset_sea_surface.copy(),
        sea_surface_height,
        water_temp_c=water_temp_c,
        wavelength_nm=wavelength_nm,
    )
    rebinned_corrected = horizontal_vertical_bin_dataset(  # step 10
        corrected_full_dataset, horizontal_res, vertical_res
    )
    return get_sea_surface_height_adaptive(rebinned_corrected)  # step 11


# 5.
The function to get the subsurface photon dataset +def get_subsurface_photon( + binned_dataset_sea_surface, + GEBCO_paths, + subsurface_thresh, + Ignore_Subsurface_Height_Thres, + use_atl24_filter=False, + atl24_file_path='', + atl24_max_match_distance_deg=0.01, + use_gebco_filter=False, + apply_histogram_quality_filter=False, + histogram_quality_min_ratio=0.05, + histogram_quality_depth_min=6.0, + histogram_quality_depth_max=7.0, + histogram_quality_ref_depth_min=0.0, + histogram_quality_ref_depth_max=1.0, + apply_surface_sigma_filter=False, + surface_sigma_max=0.5, + apply_refraction_correction=False, + refraction_water_temp_c=20.0, + refraction_wavelength_nm=532.0, + apply_post_refraction_refit=False, + apply_flattening=False, + flattening_window_m=500, + horizontal_res=500, + vertical_res=0.25 +): + """ + Orchestrate per-beam subsurface photon extraction and all optional quality filters. + + Flowchart steps executed in order: + 7 — Fit Gaussian curve to identify surface elevation; compute std dev of + Gaussian peak (via get_sea_surface_height_adaptive). + 6 — Review histograms for quality; quality-flag low-confidence bins and + exclude (optional, apply_histogram_quality_filter). + 8 — If Gaussian std dev > X m, quality-flag and discard bin (optional, + apply_surface_sigma_filter). + 9 — Compute water depths and correct for refraction (optional, + apply_refraction_correction). + 10 — Re-build histograms based on corrected depths (optional, + apply_post_refraction_refit → rebuild_and_refit_surface_after_refraction). + 11 — Re-fit Gaussian curve to identify surface peak; remove histogram data + within three standard deviations (optional, + rebuild_and_refit_surface_after_refraction). + 12 — Query ATL24 / retrieve GEBCO bathymetry data (optional, + use_atl24_filter / use_gebco_filter). + 13 — Remove data below seabed (optional). + 14 — Discard bins where seabed < 5 m deep; label as low confidence + (applied unconditionally via Ignore_Subsurface_Height_Thres). 
+ """ + # adaptive threshold + sea_surface_height, sea_surface_label, solo_sea_surface_label, subsurface_photon_dataset = \ + get_sea_surface_height_adaptive(binned_dataset_sea_surface) + + # Always compute per-bin surface sigma for quality flagging and wave-adaptive fit + sigma_map = compute_surface_sigma(binned_dataset_sea_surface, sea_surface_height) + + if apply_histogram_quality_filter: + subsurface_photon_dataset = apply_optional_histogram_quality_filter( + binned_dataset_sea_surface, + subsurface_photon_dataset, + sea_surface_height, + min_ratio=histogram_quality_min_ratio, + depth_min=histogram_quality_depth_min, + depth_max=histogram_quality_depth_max, + reference_depth_min=histogram_quality_ref_depth_min, + reference_depth_max=histogram_quality_ref_depth_max + ) + + if apply_surface_sigma_filter: + subsurface_photon_dataset = apply_optional_surface_sigma_filter( + binned_dataset_sea_surface, + subsurface_photon_dataset, + sea_surface_height, + sigma_max=surface_sigma_max + ) + + if apply_refraction_correction: + subsurface_photon_dataset = apply_optional_refraction_correction( + binned_dataset_sea_surface, + subsurface_photon_dataset, + sea_surface_height, + water_temp_c=refraction_water_temp_c, + wavelength_nm=refraction_wavelength_nm + ) + if apply_post_refraction_refit: + sea_surface_height, sea_surface_label, solo_sea_surface_label, subsurface_photon_dataset = \ + rebuild_and_refit_surface_after_refraction( # steps 10 & 11 + binned_dataset_sea_surface, + sea_surface_height, + water_temp_c=refraction_water_temp_c, + wavelength_nm=refraction_wavelength_nm, + horizontal_res=horizontal_res, + vertical_res=vertical_res, + ) + + # Apply minimum depth threshold unconditionally — keeps photons at height + # >= Ignore_Subsurface_Height_Thres regardless of whether GEBCO/ATL24 is on. + # Without this, deep noise photons (down to -70 m) contaminate the Beer's Law fit + # and produce near-zero Kd values, especially at shallow turbid sites. 
+ subsurface_photon_dataset = subsurface_photon_dataset[ + subsurface_photon_dataset['photon_height'] >= Ignore_Subsurface_Height_Thres + ].copy() + + if use_atl24_filter: + atl24_filtered, atl24_match_count = process_seafloor_data_atl24( + subsurface_photon_dataset, + atl24_file_path, + abs(Ignore_Subsurface_Height_Thres), + max_match_distance_deg=atl24_max_match_distance_deg + ) + if atl24_match_count > 0: + filtered_seafloor_subsurface_photon_dataset = atl24_filtered + elif use_gebco_filter: + filtered_seafloor_subsurface_photon_dataset = process_seafloor_data( + subsurface_photon_dataset, GEBCO_paths, abs(Ignore_Subsurface_Height_Thres) + ) + else: + filtered_seafloor_subsurface_photon_dataset = subsurface_photon_dataset + elif use_gebco_filter: + filtered_seafloor_subsurface_photon_dataset = process_seafloor_data( + subsurface_photon_dataset, GEBCO_paths, abs(Ignore_Subsurface_Height_Thres) + ) + else: + filtered_seafloor_subsurface_photon_dataset = subsurface_photon_dataset + + if apply_flattening: + lat_bin_keys = list( + binned_dataset_sea_surface.groupby(['lat_bins'], observed=False).groups.keys() + ) + filtered_seafloor_subsurface_photon_dataset = apply_sea_surface_flattening( + filtered_seafloor_subsurface_photon_dataset, + sea_surface_height, + lat_bin_keys, + horizontal_res=horizontal_res, + flattening_window_m=flattening_window_m + ) + # Attach per-bin surface_sigma as a quality flag column + filtered_seafloor_subsurface_photon_dataset = filtered_seafloor_subsurface_photon_dataset.copy() + if sigma_map: + filtered_seafloor_subsurface_photon_dataset['surface_sigma'] = \ + filtered_seafloor_subsurface_photon_dataset['lat_bins'].map(sigma_map) + else: + filtered_seafloor_subsurface_photon_dataset['surface_sigma'] = np.nan + + return sea_surface_height, sea_surface_label, filtered_seafloor_subsurface_photon_dataset + + +# Main processing function to apply the subsurface photon filtering beam-by-beam +def process_subsurface_photon_filtering( + 
binned_dataset_sea_surface, + GEBCO_paths, + subsurface_thresh, + Ignore_Subsurface_Height_Thres, + use_atl24_filter=False, + atl24_file_path='', + atl24_max_match_distance_deg=0.01, + use_gebco_filter=False, + apply_histogram_quality_filter=False, + histogram_quality_min_ratio=0.05, + histogram_quality_depth_min=6.0, + histogram_quality_depth_max=7.0, + histogram_quality_ref_depth_min=0.0, + histogram_quality_ref_depth_max=1.0, + apply_surface_sigma_filter=False, + surface_sigma_max=0.5, + apply_refraction_correction=False, + refraction_water_temp_c=20.0, + refraction_wavelength_nm=532.0, + apply_post_refraction_refit=False, + apply_flattening=False, + flattening_window_m=500, + horizontal_res=500, + vertical_res=0.25 +): + """ + Beam-by-beam wrapper that calls get_subsurface_photon for every beam and + concatenates the results. + + Flowchart steps 6–14 are all executed inside this function (delegated to + get_subsurface_photon per beam). See get_subsurface_photon for the + step-by-step breakdown. 
+ """ + # Initialize lists to store results for each beam + sea_surface_heights = [] + sea_surface_labels = [] + filtered_beam_datasets = [] + + # Group the binned dataset by 'beam_id' and process each group separately + for beam_id, beam_data in binned_dataset_sea_surface.groupby('beam_id'): + print(f'Processing subsurface filtering for beam: {beam_id}') + + # Apply get_subsurface_photon to the current beam's dataset + sea_surface_height, sea_surface_label, filtered_seafloor_subsurface_photon_dataset = \ + get_subsurface_photon( + beam_data, + GEBCO_paths, + subsurface_thresh, + Ignore_Subsurface_Height_Thres, + use_atl24_filter=use_atl24_filter, + atl24_file_path=atl24_file_path, + atl24_max_match_distance_deg=atl24_max_match_distance_deg, + use_gebco_filter=use_gebco_filter, + apply_histogram_quality_filter=apply_histogram_quality_filter, + histogram_quality_min_ratio=histogram_quality_min_ratio, + histogram_quality_depth_min=histogram_quality_depth_min, + histogram_quality_depth_max=histogram_quality_depth_max, + histogram_quality_ref_depth_min=histogram_quality_ref_depth_min, + histogram_quality_ref_depth_max=histogram_quality_ref_depth_max, + apply_surface_sigma_filter=apply_surface_sigma_filter, + surface_sigma_max=surface_sigma_max, + apply_refraction_correction=apply_refraction_correction, + refraction_water_temp_c=refraction_water_temp_c, + refraction_wavelength_nm=refraction_wavelength_nm, + apply_post_refraction_refit=apply_post_refraction_refit, + apply_flattening=apply_flattening, + flattening_window_m=flattening_window_m, + horizontal_res=horizontal_res, + vertical_res=vertical_res + ) + + # Append each result to the lists + sea_surface_heights.append(sea_surface_height) + sea_surface_labels.append(sea_surface_label) + filtered_beam_datasets.append(filtered_seafloor_subsurface_photon_dataset) + + # Combine all filtered beam datasets into a single DataFrame + combined_filtered_dataset = pd.concat(filtered_beam_datasets, ignore_index=True) + + return 
sea_surface_heights, sea_surface_labels, combined_filtered_dataset diff --git a/icesat-2_kdph_py/kd_utils/data_processing.py b/icesat-2_kdph_py/kd_utils/data_processing.py new file mode 100644 index 0000000..388ce31 --- /dev/null +++ b/icesat-2_kdph_py/kd_utils/data_processing.py @@ -0,0 +1,810 @@ +# utils/data_processing.py + + +import re +import os +import io +import time +import math +import h5py +import logging +import netCDF4 +import numpy as np +import geopandas as gpd + +import pandas as pd +from datetime import datetime +import matplotlib.pyplot as plt +from sklearn.preprocessing import StandardScaler, MinMaxScaler +from scipy.spatial import ConvexHull + +import argparse +import subprocess +# import fiona +import utm +import pyproj +from pyproj import Transformer, Proj + +# from pyproj import Transformer +from matplotlib.widgets import LassoSelector +from matplotlib.path import Path + +from sklearn.cluster import DBSCAN +from .interpolation import * + + +logger = logging.getLogger(__name__) + + +#this function from icesat2_toolkit +# PURPOSE: read ICESat-2 ATL03 HDF5 data files +def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): + """ + Reads ICESat-2 ATL03 Global Geolocated Photons data files. + + Flowchart step: 1 — Acquire ATL03 data (download or use cloud services). 
+ + Parameters + ---------- + FILENAME: str + full path to ATL03 file + ATTRIBUTES: bool, default False + read file, group and variable attributes + + Returns + ------- + IS2_atl03_mds: dict + ATL03 variables + IS2_atl03_attrs: dict + ATL03 attributes + IS2_atl03_beams: list + valid ICESat-2 beams within ATL03 file + """ + # Open the HDF5 file for reading + if isinstance(FILENAME, io.IOBase): + fileID = h5py.File(FILENAME, 'r') + else: + fileID = h5py.File(os.path.expanduser(FILENAME), 'r') + + # Output HDF5 file information + logging.info(fileID.filename) + logging.info(list(fileID.keys())) + + # allocate python dictionaries for ICESat-2 ATL03 variables and attributes + IS2_atl03_mds = {} + IS2_atl03_attrs = {} + + # read each input beam within the file + IS2_atl03_beams = [] + for gtx in [k for k in fileID.keys() if bool(re.match(r'gt\d[lr]',k))]: + # check if subsetted beam contains data + # check in both the geolocation and heights groups + try: + fileID[gtx]['geolocation']['segment_id'] + fileID[gtx]['heights']['delta_time'] + except KeyError: + pass + else: + IS2_atl03_beams.append(gtx) + + # for each included beam + for gtx in IS2_atl03_beams: + + # ------------------------------------------- + # 1. make sure the beam-level dict exists + IS2_atl03_attrs.setdefault(gtx, {}) + # 2. 
always save the two “must-have” attributes + for key in ('atlas_beam_type', 'atlas_spot_number'): + IS2_atl03_attrs[gtx][key] = fileID[gtx].attrs[key] + + # get each HDF5 variable + IS2_atl03_mds[gtx] = {} + IS2_atl03_mds[gtx]['heights'] = {} + IS2_atl03_mds[gtx]['geolocation'] = {} + IS2_atl03_mds[gtx]['bckgrd_atlas'] = {} + IS2_atl03_mds[gtx]['geophys_corr'] = {} + # ICESat-2 Measurement Group + for key,val in fileID[gtx]['heights'].items(): + IS2_atl03_mds[gtx]['heights'][key] = val[:] + # ICESat-2 Geolocation Group + for key,val in fileID[gtx]['geolocation'].items(): + IS2_atl03_mds[gtx]['geolocation'][key] = val[:] + # ICESat-2 Background Photon Rate Group + for key,val in fileID[gtx]['bckgrd_atlas'].items(): + IS2_atl03_mds[gtx]['bckgrd_atlas'][key] = val[:] + # ICESat-2 Geophysical Corrections Group: Values for tides (ocean, + # solid earth, pole, load, and equilibrium), inverted barometer (IB) + # effects, and range corrections for tropospheric delays + for key,val in fileID[gtx]['geophys_corr'].items(): + IS2_atl03_mds[gtx]['geophys_corr'][key] = val[:] + + # Getting attributes of included variables + if ATTRIBUTES: + # Getting attributes of IS2_atl03_mds beam variables + IS2_atl03_attrs[gtx] = {} + IS2_atl03_attrs[gtx]['heights'] = {} + IS2_atl03_attrs[gtx]['geolocation'] = {} + IS2_atl03_attrs[gtx]['bckgrd_atlas'] = {} + IS2_atl03_attrs[gtx]['geophys_corr'] = {} + + # Global Group Attributes + for att_name,att_val in fileID[gtx].attrs.items(): + IS2_atl03_attrs[gtx][att_name] = att_val + # ICESat-2 Measurement Group + for key,val in fileID[gtx]['heights'].items(): + IS2_atl03_attrs[gtx]['heights'][key] = {} + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]['heights'][key][att_name]=att_val + # ICESat-2 Geolocation Group + for key,val in fileID[gtx]['geolocation'].items(): + IS2_atl03_attrs[gtx]['geolocation'][key] = {} + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]['geolocation'][key][att_name]=att_val + # ICESat-2 
Background Photon Rate Group + for key,val in fileID[gtx]['bckgrd_atlas'].items(): + IS2_atl03_attrs[gtx]['bckgrd_atlas'][key] = {} + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]['bckgrd_atlas'][key][att_name]=att_val + # ICESat-2 Geophysical Corrections Group + for key,val in fileID[gtx]['geophys_corr'].items(): + IS2_atl03_attrs[gtx]['geophys_corr'][key] = {} + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]['geophys_corr'][key][att_name]=att_val + + # ICESat-2 spacecraft orientation at time + IS2_atl03_mds['orbit_info'] = {} + IS2_atl03_attrs['orbit_info'] = {} + for key,val in fileID['orbit_info'].items(): + IS2_atl03_mds['orbit_info'][key] = val[:] + # Getting attributes of group and included variables + if ATTRIBUTES: + # Global Group Attributes + for att_name,att_val in fileID['orbit_info'].attrs.items(): + IS2_atl03_attrs['orbit_info'][att_name] = att_val + # Variable Attributes + IS2_atl03_attrs['orbit_info'][key] = {} + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs['orbit_info'][key][att_name] = att_val + + # information ancillary to the data product + # number of GPS seconds between the GPS epoch (1980-01-06T00:00:00Z UTC) + # and ATLAS Standard Data Product (SDP) epoch (2018-01-01T00:00:00Z UTC) + # Add this value to delta time parameters to compute full gps_seconds + # could alternatively use the Julian day of the ATLAS SDP epoch: 2458119.5 + # and add leap seconds since 2018-01-01T00:00:00Z UTC (ATLAS SDP epoch) + IS2_atl03_mds['ancillary_data'] = {} + IS2_atl03_attrs['ancillary_data'] = {} + ancillary_keys = ['atlas_sdp_gps_epoch','data_end_utc','data_start_utc', + 'end_cycle','end_geoseg','end_gpssow','end_gpsweek','end_orbit', + 'end_region','end_rgt','granule_end_utc','granule_start_utc','release', + 'start_cycle','start_geoseg','start_gpssow','start_gpsweek', + 'start_orbit','start_region','start_rgt','version'] + for key in ancillary_keys: + # get each HDF5 variable + 
IS2_atl03_mds['ancillary_data'][key] = fileID['ancillary_data'][key][:] + # Getting attributes of group and included variables + if ATTRIBUTES: + # Variable Attributes + IS2_atl03_attrs['ancillary_data'][key] = {} + for att_name,att_val in fileID['ancillary_data'][key].attrs.items(): + IS2_atl03_attrs['ancillary_data'][key][att_name] = att_val + + # transmit-echo-path (tep) parameters + IS2_atl03_mds['ancillary_data']['tep'] = {} + IS2_atl03_attrs['ancillary_data']['tep'] = {} + for key,val in fileID['ancillary_data']['tep'].items(): + # get each HDF5 variable + IS2_atl03_mds['ancillary_data']['tep'][key] = val[:] + # Getting attributes of group and included variables + if ATTRIBUTES: + # Variable Attributes + IS2_atl03_attrs['ancillary_data']['tep'][key] = {} + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs['ancillary_data']['tep'][key][att_name] = att_val + + # channel dead time and first photon bias derived from ATLAS calibration + cal1,cal2 = ('ancillary_data','calibrations') + for var in ['dead_time','first_photon_bias']: + IS2_atl03_mds[cal1][var] = {} + IS2_atl03_attrs[cal1][var] = {} + for key,val in fileID[cal1][cal2][var].items(): + # get each HDF5 variable + if isinstance(val, h5py.Dataset): + IS2_atl03_mds[cal1][var][key] = val[:] + elif isinstance(val, h5py.Group): + IS2_atl03_mds[cal1][var][key] = {} + for k,v in val.items(): + IS2_atl03_mds[cal1][var][key][k] = v[:] + # Getting attributes of group and included variables + if ATTRIBUTES: + # Variable Attributes + IS2_atl03_attrs[cal1][var][key] = {} + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs[cal1][var][key][att_name] = att_val + if isinstance(val, h5py.Group): + for k,v in val.items(): + IS2_atl03_attrs[cal1][var][key][k] = {} + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs[cal1][var][key][k][att_name]=att_val + + # get ATLAS impulse response variables for the transmitter echo path (TEP) + tep1,tep2 = ('atlas_impulse_response','tep_histogram') + 
# convert_wgs_to_utm function, see https://stackoverflow.com/a/40140326/4556479
def convert_wgs_to_utm(lon: float, lat: float):
    """Return the EPSG code string of the UTM zone containing (lon, lat).

    Parameters
    ----------
    lon : float
        Longitude in degrees.
    lat : float
        Latitude in degrees.

    Returns
    -------
    str
        ``'epsg:326NN'`` when ``lat >= 0`` and ``'epsg:327NN'`` otherwise,
        where ``NN`` is the zero-padded two-digit zone number (01-60).
    """
    # Zones are 6 degrees wide starting at -180; the modulo folds lon == 180
    # back into zone 1.
    zone_number = math.floor((lon + 180) / 6) % 60 + 1
    hemisphere_prefix = 'epsg:326' if lat >= 0 else 'epsg:327'
    return hemisphere_prefix + str(zone_number).zfill(2)
# requires that the input gdf has ranged index values i
# will need to change if index is changed to time or something
# this currently checks point in polygon for EVERY point
# would be significantly sped up if evaluated at 10m or something similar
# maybe later, fine for now
def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF):
    """Label each point of *ICESat2_GDF* as land (1) or not land (0).

    The shoreline polygons intersecting the bounding box of the points are
    read from *shoreline_data_path* and a ``within`` spatial join decides
    which points fall inside a land polygon.

    Side effects (kept from the original implementation): columns ``lat``,
    ``lon`` and ``is_land`` (initialised to -1) are added to *ICESat2_GDF*
    in place and its CRS is set to EPSG:4326.

    Parameters
    ----------
    shoreline_data_path : str
        Path to the shoreline (land polygon) dataset.
    ICESat2_GDF : geopandas.GeoDataFrame
        Point geometries, one per row.

    Returns
    -------
    numpy.ndarray
        int64 array with one entry per row: 1 for land, 0 otherwise.
        If anything fails, an array of -1 with the same length is returned
        (best effort: the error is printed, not raised).
    """
    # Built up front so the fallback never depends on how far the try block
    # got (the original read ICESat2_GDF.is_land in the except branch, which
    # raised AttributeError when the failure happened before that column
    # had been inserted).
    unknown_labels = np.full(len(ICESat2_GDF), -1, dtype=np.int64)

    try:
        point_geometry = ICESat2_GDF.geometry

        # Only insert coordinate columns that are missing, so calling the
        # function twice on the same frame no longer fails on a duplicate
        # column.
        if 'lat' not in ICESat2_GDF.columns:
            ICESat2_GDF.insert(0, 'lat', point_geometry.y, False)
        if 'lon' not in ICESat2_GDF.columns:
            ICESat2_GDF.insert(0, 'lon', point_geometry.x, False)

        # Land flag initialized as -1
        # If shorelines downloaded already, will be set to 0 or 1
        if 'is_land' in ICESat2_GDF.columns:
            ICESat2_GDF['is_land'] = unknown_labels.copy()
        else:
            ICESat2_GDF.insert(0, 'is_land', unknown_labels.copy(), False)

        # set the projection
        ICESat2_GDF.set_crs("EPSG:4326", inplace=True)

        # Load only the shoreline features that intersect the bounding box
        # of the points. The 'fiona' engine sometimes raises here, so
        # 'pyogrio' is used.
        land_polygon_gdf = gpd.read_file(
            shoreline_data_path, bbox=ICESat2_GDF, engine='pyogrio'
        )

        # A spatial join is only meaningful when both layers share a CRS.
        if (land_polygon_gdf.crs is not None
                and land_polygon_gdf.crs != ICESat2_GDF.crs):
            land_polygon_gdf = land_polygon_gdf.to_crs(ICESat2_GDF.crs)

        # Points that fall inside any land polygon
        pts_in_land = gpd.sjoin(ICESat2_GDF, land_polygon_gdf, predicate='within')

        # Boolean membership per input row, converted to 0/1 integer labels
        land_loc = ICESat2_GDF.index.isin(pts_in_land.index)
        return land_loc.astype(np.int64)

    except Exception as e:
        # Deliberate best effort: report the problem and hand back the
        # "unknown" label for every point.
        print(e)
        print("Error loading shoreline data, returning -1s for is_land flag")
        return unknown_labels
def interpolate_by_time(source_time, source_values, target_time):
    """Linearly interpolate source_values from source_time to target_time.

    Non-finite samples are ignored, samples are ordered by time, and repeated
    time stamps are collapsed to a single sample. Targets outside the source
    time range take the first / last sample value.

    Returns
    -------
    numpy.ndarray
        Float array shaped like *target_time*. It is all NaN when fewer than
        two finite samples exist, and constant when all finite samples share
        one time stamp.
    """
    times = np.asarray(source_time)
    values = np.asarray(source_values)
    targets = np.asarray(target_time)

    usable = np.isfinite(times) & np.isfinite(values)
    if usable.sum() < 2:
        return np.full_like(targets, np.nan, dtype=float)

    good_times = times[usable]
    good_values = values[usable]

    # Order by time first, then keep one sample per distinct time stamp.
    order = np.argsort(good_times)
    ordered_times = good_times[order]
    ordered_values = good_values[order]
    knots, first_occurrence = np.unique(ordered_times, return_index=True)
    knot_values = ordered_values[first_occurrence]

    if knots.size < 2:
        return np.full_like(targets, knot_values[0], dtype=float)

    # Hold the edge values outside the sampled time range.
    lower_fill = knot_values[0]
    upper_fill = knot_values[-1]
    return np.interp(targets, knots, knot_values, left=lower_fill, right=upper_fill)
+ Noisy photons are removed unless their signal confidence >= min_signal_conf. + + Flowchart step: 5 — Compute solar background from atmospheric signal (using + selected x and z bin length scales) and subtract from subsurface photon + histogram data. + """ + if not enabled: + return sea_photon_dataset + + required_cols = {'solar_elevation', 'background_rate', 'photon_conf'} + if not required_cols.issubset(set(sea_photon_dataset.columns)): + logger.warning("Solar background filter skipped because required columns are missing.") + return sea_photon_dataset + + filtered_dataset = sea_photon_dataset.copy() + daytime_mask = filtered_dataset['solar_elevation'] >= day_threshold + + # Select valid daytime photons for rolling median computation + daytime = filtered_dataset.loc[daytime_mask].copy() + valid = np.isfinite(daytime['background_rate']) & np.isfinite(daytime['solar_elevation']) + daytime = daytime.loc[valid] + + if len(daytime) < 10: + logger.info( + "Solar background filter enabled, but fewer than 10 valid daytime " + "photons were found. Skipping filter." + ) + return filtered_dataset + + # Clamp solar elevations to physical range [DAY_THRESHOLD, 90] to avoid + # HDF5 fill values (e.g. 3.4e+38) blowing up the bin array. + daytime = daytime.copy() + daytime['solar_elevation'] = daytime['solar_elevation'].clip( + lower=day_threshold, upper=90.0 + ) + + # Vectorized degree-based rolling median: + # Bin elevations into fine steps, compute one median per bin using + # np.searchsorted on the sorted array, then map back to photons. 
+ daytime = daytime.sort_values('solar_elevation') + elevations = daytime['solar_elevation'].values + bg_rates = daytime['background_rate'].values + half_window = median_window_deg / 2.0 + + bin_step = 0.1 # degrees — fine enough for smooth curve + bin_centers = np.arange( + elevations[0], elevations[-1] + bin_step, bin_step + ) + bin_medians = np.empty(len(bin_centers)) + for i in range(len(bin_centers)): + lo = np.searchsorted(elevations, bin_centers[i] - half_window, side='left') + hi = np.searchsorted(elevations, bin_centers[i] + half_window, side='right') + if lo < hi: + bin_medians[i] = np.median(bg_rates[lo:hi]) + else: + bin_medians[i] = np.nan + + # Map bin medians back to each photon via nearest-bin lookup (vectorized) + bin_indices = np.searchsorted(bin_centers, elevations, side='right') - 1 + bin_indices = np.clip(bin_indices, 0, len(bin_centers) - 1) + expected_bg = bin_medians[bin_indices] + + # Map expected background onto the full DataFrame. Non-daytime and + # non-finite daytime rows stay NaN, so they are never flagged noisy. 
def apply_optional_ir_ap_filter(
    sea_photon_dataset,
    enabled=False,
    quality_max=0,
    min_signal_conf=0
):
    """
    Optionally drop afterpulse photons and low-confidence photons.

    Flowchart step: 2 — Remove photons flagged as afterpulse or impulse response.

    ATL03 v007 quality_ph bit encoding (as noted by the original author):
        bit 0 (value 1) = possible afterpulse           <- targeted by this filter
        bit 1 (value 2) = possible impulse response effect
        bit 2 (value 4) = possible TEP

    What the code does when ``enabled`` is True:

    * rows whose ``quality_ph`` has bit 0 set are removed (values that cannot
      be parsed as numbers are treated as 0, i.e. kept);
    * rows whose ``photon_conf`` is below ``min_signal_conf`` are removed, as
      are rows whose ``photon_conf`` cannot be parsed as a number;
    * a missing ``quality_ph`` or ``photon_conf`` column only logs a warning.

    Parameters
    ----------
    sea_photon_dataset : pandas.DataFrame
        Photon table; returned unchanged (same object) when ``enabled`` is
        False.
    enabled : bool
        Switch for the whole filter.
    quality_max : int
        Accepted for interface compatibility; not used by the current
        implementation.
    min_signal_conf : int
        Minimum ``photon_conf`` a photon needs in order to be kept.

    Returns
    -------
    pandas.DataFrame
        Filtered copy of the input (or the input itself when disabled).
    """
    if not enabled:
        return sea_photon_dataset

    working = sea_photon_dataset.copy()
    total_rows = len(working)
    retain = np.ones(total_rows, dtype=bool)

    if 'quality_ph' not in working.columns:
        logger.warning("IR/AP filter enabled, but quality_ph is missing.")
    else:
        flags = pd.to_numeric(working['quality_ph'], errors='coerce')
        flags = flags.fillna(0).astype(int)
        is_afterpulse = (flags & 1) == 1  # bit 0 set = afterpulse
        retain &= ~is_afterpulse
        logger.info(
            "IR/AP filter: %d photons flagged as afterpulse (quality_ph bit 0) will be removed.",
            int(is_afterpulse.sum())
        )

    if 'photon_conf' not in working.columns:
        logger.warning("IR/AP filter enabled, but photon_conf is missing.")
    else:
        confidence = pd.to_numeric(working['photon_conf'], errors='coerce')
        retain &= confidence >= min_signal_conf

    working = working.loc[retain].copy()
    remaining_rows = len(working)
    logger.info(
        "IR/AP proxy filter removed %s photons (from %s to %s).",
        total_rows - remaining_rows, total_rows, remaining_rows
    )
    return working
n_seg = len(Segment_ID[gtx]) + n_pe, = IS2_val['heights']['delta_time'].shape + Segment_Index_begin[gtx] = IS2_val['geolocation']['ph_index_beg'] - 1 + Segment_PE_count[gtx] = IS2_val['geolocation']['segment_ph_cnt'] + Equator_Segment_Distance[gtx] = IS2_val['geolocation']['segment_dist_x'] + Segment_Length[gtx] = IS2_val['geolocation']['segment_length'] + delta_time = IS2_val['geolocation']['delta_time'] + segment_lat = IS2_val['geolocation']['reference_photon_lat'][:].copy() + segment_lon = IS2_val['geolocation']['reference_photon_lon'][:].copy() + ref_elev = IS2_val['geolocation']['ref_elev'][:].copy() + ref_azimuth = IS2_val['geolocation']['ref_azimuth'][:].copy() + geoid = IS2_val['geophys_corr']['geoid'][:].copy() + h_ph = IS2_val['heights']['h_ph'][:].copy() + photon_delta_time = IS2_val['heights']['delta_time'][:].copy() + lat_ph = IS2_val['heights']['lat_ph'][:].copy() + lon_ph = IS2_val['heights']['lon_ph'][:].copy() + signal_conf_photon = IS2_val['heights']['signal_conf_ph'][..., 0].copy() + x_atc = IS2_val['heights']['dist_ph_along'][:].copy() + y_atc = IS2_val['heights']['dist_ph_across'][:].copy() + quality_ph = IS2_val['heights']['quality_ph'] + + # Optional variables for solar background sensitivity testing + photon_solar_elevation = np.full_like(photon_delta_time, np.nan, dtype=float) + photon_background_rate = np.full_like(photon_delta_time, np.nan, dtype=float) + if 'solar_elevation' in IS2_val['geolocation']: + segment_solar_elevation = IS2_val['geolocation']['solar_elevation'][:].copy() + photon_solar_elevation = interpolate_by_time( + delta_time, segment_solar_elevation, photon_delta_time + ) + if 'bckgrd_atlas' in IS2_val and 'bckgrd_rate' in IS2_val['bckgrd_atlas'] and 'delta_time' in IS2_val['bckgrd_atlas']: + bckgrd_rate = IS2_val['bckgrd_atlas']['bckgrd_rate'][:].copy() + bckgrd_time = IS2_val['bckgrd_atlas']['delta_time'][:].copy() + photon_background_rate = interpolate_by_time( + bckgrd_time, bckgrd_rate, photon_delta_time + ) + + # 
Adjust x_atc based on segment distances + for seg_index in range(n_seg): + idx = Segment_Index_begin[gtx][seg_index] + cnt = Segment_PE_count[gtx][seg_index] + x_atc[idx:idx + cnt] += Equator_Segment_Distance[gtx][seg_index] + + # Calculate relative distances + relative_AT_dist = (x_atc - x_atc[0]) / 1000 + relative_seg_dist = (Equator_Segment_Distance[gtx] - Equator_Segment_Distance[gtx][0]) / 1000 + + # Create a GeoDataFrame to hold segment data for shoreline check + Segment_Is_Land['geometry'] = gpd.points_from_xy(segment_lon, segment_lat) + ICESat2_GDF = gpd.GeoDataFrame(Segment_Is_Land, crs="EPSG:4326") + + # Determine if it is land by the land/sea mask + Segment_Is_Land_Labels = isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF) + ICESat2_GDF.loc[:, 'is_land'] = Segment_Is_Land_Labels + + # Apply interpolations for required data + is_land_label_interp1d = apply_interpolation(interpolate_labels(segment_lat, Segment_Is_Land_Labels), lat_ph) + ph_ref_elev = apply_interpolation(interpolate_labels(segment_lat, ref_elev), lat_ph) + ph_ref_azimuth = apply_interpolation(interpolate_labels(segment_lat, ref_azimuth), lat_ph) + ph_geoid = apply_interpolation(interpolate_labels(segment_lat, geoid), lat_ph) + + + # Create photon DataFrame for the current beam + sea_photon_dataset = create_photon_dataframe(lat_ph=lat_ph, lon_ph=lon_ph, + ref_elev=ph_ref_elev,ref_azimuth=ph_ref_azimuth, + geoid=ph_geoid,h_ph=h_ph, + quality_ph=quality_ph, + is_land_label_interp1d=is_land_label_interp1d, + signal_conf_photon=signal_conf_photon, + x_atc=x_atc, # Note: x_atc is not used in the function. Remove if unnecessary. 
def filter_photon_dataset_by_hull_area(photon_dataset, hull_area_threshold=3000):
    """
    Drop latitude bins whose photon cloud covers too small an area.

    For every ``lat_bins`` group with at least three photons, the convex hull
    of the (``lat``, ``photon_height``) points is computed. Bins whose hull
    area is below ``hull_area_threshold`` are removed from the returned
    dataset. Bins with fewer than three photons are left untouched and get no
    entry in either dictionary.

    Degenerate bins (collinear or identical points, for which Qhull cannot
    build a hull) are treated as having area 0.0 instead of aborting the
    whole call; no hull vertices are stored for them.

    Parameters
    ----------
    photon_dataset : pandas.DataFrame
        Must provide the columns ``lat_bins``, ``lat`` and ``photon_height``.
        It is not modified.
    hull_area_threshold : float
        Minimum hull area a bin needs in order to be kept.

    Returns
    -------
    filtered_dataset : pandas.DataFrame
        Copy of the input without the rejected bins.
    convex_hull_areas : dict
        ``{lat_bin: area}`` for every bin with at least three photons.
    convex_hulls : dict
        ``{lat_bin: hull vertex coordinates}`` for every accepted,
        non-degenerate bin.
    """
    # Local import: only this function needs the exception type.
    from scipy.spatial import QhullError

    bin_labels = photon_dataset['lat_bins']
    # One boolean mask is accumulated and applied once at the end, instead
    # of re-slicing the whole frame for every rejected bin.
    keep_rows = np.ones(len(photon_dataset), dtype=bool)

    convex_hulls = {}
    convex_hull_areas = {}

    for lat_bin, bin_data in photon_dataset.groupby('lat_bins', observed=False):
        if len(bin_data) < 3:
            continue

        points = bin_data[['lat', 'photon_height']].to_numpy()
        try:
            hull = ConvexHull(points)
        except QhullError:
            # Flat or single-point cloud: no area, no hull polygon.
            hull = None
            area = 0.0
        else:
            # For 2-D input, ConvexHull.volume is the enclosed area.
            area = hull.volume
        convex_hull_areas[lat_bin] = area

        if area >= hull_area_threshold:
            if hull is not None:
                convex_hulls[lat_bin] = points[hull.vertices]
        else:
            keep_rows &= (bin_labels != lat_bin).to_numpy()

    filtered_dataset = photon_dataset.loc[keep_rows].copy()
    return filtered_dataset, convex_hull_areas, convex_hulls
# Compiled once at import time; the dot before "h5" is escaped so that only a
# real ".h5" extension is accepted.
_ATL_FILENAME_RX = re.compile(
    r'(processed_)?(ATL\d{2})_(\d{4})(\d{2})(\d{2})(\d{2})'
    r'(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})_(\d{3})_(\d{2})(.*?)\.h5$')


# Extracts key information from filenames,
# which could include processed status, product ID, timestamps, identifiers, or metadata.
def extract_file_params(file_path):
    '''
    Split an ATLxx granule file name into its 14 fields.

    Example: input "processed_ATL06_20231115120000_20231115_001_12_data.h5"
             Output: ('processed_', 'ATL06', '2023', '11', '15', '12', '00', '00',
                      '2023', '11', '15', '001', '12', '_data')

    Parameters
    ----------
    file_path : str or os.PathLike
        File name or full path of the granule.

    Returns
    -------
    tuple of str
        The 14 captured groups; an optional group that is absent (for
        example the "processed_" prefix) is returned as ''.

    Raises
    ------
    IndexError
        If the name does not follow the expected pattern. The exception type
        is the one the previous ``findall(...).pop()`` implementation raised.
    '''
    match = _ATL_FILENAME_RX.search(str(file_path))
    if match is None:
        raise IndexError(
            f"file name does not match the ATLxx granule pattern: {file_path!r}")
    # default='' mirrors re.findall, which reports unmatched groups as ''.
    return match.groups(default='')
def interpolate_labels(segment_lat, labels):
    """Build a 1-D interpolator mapping segment latitude to *labels*.

    The returned callable comes from ``scipy.interpolate.interp1d`` with
    ``fill_value="extrapolate"``, so it also accepts latitudes outside the
    range of *segment_lat*.
    """
    return scipy.interpolate.interp1d(segment_lat, labels, fill_value="extrapolate")


def apply_interpolation(model, lat_ph):
    """Evaluate an interpolator from ``interpolate_labels`` at *lat_ph*."""
    photon_values = model(lat_ph)
    return photon_values


def geoid_correction(lat_ph, segment_lat, geoid):
    """Interpolate per-segment *geoid* values to the photon latitudes."""
    geoid_model = scipy.interpolate.interp1d(segment_lat, geoid, fill_value="extrapolate")
    return geoid_model(lat_ph)


def refraction_correction(lat_ph, segment_lat, ref_elev, ref_azimuth):
    """Interpolate per-segment *ref_elev* and *ref_azimuth* to the photons.

    Returns
    -------
    tuple
        ``(elevation_at_photons, azimuth_at_photons)``.
    """
    per_photon = []
    for segment_values in (ref_elev, ref_azimuth):
        model = scipy.interpolate.interp1d(
            segment_lat, segment_values, fill_value="extrapolate")
        per_photon.append(model(lat_ph))
    elevation_at_photons, azimuth_at_photons = per_photon
    return elevation_at_photons, azimuth_at_photons
Reading data (ReadATL03()) +# 2. Orthometrically correcting the dataset (OrthometricCorrection()) +# 3. Pulling down the data segment ID (getAtl03SegID()) +# 4. Bin the data along latitudinal and height gradients (bin_data()) +# 5. Calculate sea height (get_sea_height()) +# 6. Get water temperature (get_water_temp()) +# 7. Correct bathymetric surface for refraction (RefractionCorrection()) +# 8. Calculate bathymetric height (get_bath_height()) +# 9. Produce figures (produce_figures()) +##### +import os +# os.environ["PROJ_LIB"] = r"C:\Users\wayne\anaconda3\Library\share\proj" + +import io +import os +import re +import time +import math +import h5py +import logging +import netCDF4 +import numpy as np +import geopandas as gpd + +import pandas as pd +from datetime import datetime +import matplotlib.pyplot as plt +import hdbscan +from sklearn.preprocessing import StandardScaler, MinMaxScaler + +import argparse +import subprocess +# import fiona +import utm +import pyproj +from pyproj import Transformer, Proj + +# from pyproj import Transformer +from matplotlib.widgets import LassoSelector +from matplotlib.path import Path + +from sklearn.cluster import DBSCAN + +#this function from icesat2_toolkit +# PURPOSE: read ICESat-2 ATL03 HDF5 data files +def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): + """ + Reads ICESat-2 ATL03 Global Geolocated Photons data files + + Parameters + ---------- + FILENAME: str + full path to ATL03 file + ATTRIBUTES: bool, default False + read file, group and variable attributes + + Returns + ------- + IS2_atl03_mds: dict + ATL03 variables + IS2_atl03_attrs: dict + ATL03 attributes + IS2_atl03_beams: list + valid ICESat-2 beams within ATL03 file + """ + # Open the HDF5 file for reading + if isinstance(FILENAME, io.IOBase): + fileID = h5py.File(FILENAME, 'r') + else: + fileID = h5py.File(os.path.expanduser(FILENAME), 'r') + + # Output HDF5 file information + logging.info(fileID.filename) + logging.info(list(fileID.keys())) + + # allocate 
python dictionaries for ICESat-2 ATL03 variables and attributes + IS2_atl03_mds = {} + IS2_atl03_attrs = {} + + # read each input beam within the file + IS2_atl03_beams = [] + for gtx in [k for k in fileID.keys() if bool(re.match(r'gt\d[lr]',k))]: + # check if subsetted beam contains data + # check in both the geolocation and heights groups + try: + fileID[gtx]['geolocation']['segment_id'] + fileID[gtx]['heights']['delta_time'] + except KeyError: + pass + else: + IS2_atl03_beams.append(gtx) + + # for each included beam + for gtx in IS2_atl03_beams: + # get each HDF5 variable + IS2_atl03_mds[gtx] = {} + IS2_atl03_mds[gtx]['heights'] = {} + IS2_atl03_mds[gtx]['geolocation'] = {} + IS2_atl03_mds[gtx]['bckgrd_atlas'] = {} + IS2_atl03_mds[gtx]['geophys_corr'] = {} + # ICESat-2 Measurement Group + for key,val in fileID[gtx]['heights'].items(): + IS2_atl03_mds[gtx]['heights'][key] = val[:] + # ICESat-2 Geolocation Group + for key,val in fileID[gtx]['geolocation'].items(): + IS2_atl03_mds[gtx]['geolocation'][key] = val[:] + # ICESat-2 Background Photon Rate Group + for key,val in fileID[gtx]['bckgrd_atlas'].items(): + IS2_atl03_mds[gtx]['bckgrd_atlas'][key] = val[:] + # ICESat-2 Geophysical Corrections Group: Values for tides (ocean, + # solid earth, pole, load, and equilibrium), inverted barometer (IB) + # effects, and range corrections for tropospheric delays + for key,val in fileID[gtx]['geophys_corr'].items(): + IS2_atl03_mds[gtx]['geophys_corr'][key] = val[:] + + # Getting attributes of included variables + if ATTRIBUTES: + # Getting attributes of IS2_atl03_mds beam variables + IS2_atl03_attrs[gtx] = {} + IS2_atl03_attrs[gtx]['heights'] = {} + IS2_atl03_attrs[gtx]['geolocation'] = {} + IS2_atl03_attrs[gtx]['bckgrd_atlas'] = {} + IS2_atl03_attrs[gtx]['geophys_corr'] = {} + # Global Group Attributes + for att_name,att_val in fileID[gtx].attrs.items(): + IS2_atl03_attrs[gtx][att_name] = att_val + # ICESat-2 Measurement Group + for key,val in 
fileID[gtx]['heights'].items(): + IS2_atl03_attrs[gtx]['heights'][key] = {} + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]['heights'][key][att_name]=att_val + # ICESat-2 Geolocation Group + for key,val in fileID[gtx]['geolocation'].items(): + IS2_atl03_attrs[gtx]['geolocation'][key] = {} + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]['geolocation'][key][att_name]=att_val + # ICESat-2 Background Photon Rate Group + for key,val in fileID[gtx]['bckgrd_atlas'].items(): + IS2_atl03_attrs[gtx]['bckgrd_atlas'][key] = {} + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]['bckgrd_atlas'][key][att_name]=att_val + # ICESat-2 Geophysical Corrections Group + for key,val in fileID[gtx]['geophys_corr'].items(): + IS2_atl03_attrs[gtx]['geophys_corr'][key] = {} + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]['geophys_corr'][key][att_name]=att_val + + # ICESat-2 spacecraft orientation at time + IS2_atl03_mds['orbit_info'] = {} + IS2_atl03_attrs['orbit_info'] = {} + for key,val in fileID['orbit_info'].items(): + IS2_atl03_mds['orbit_info'][key] = val[:] + # Getting attributes of group and included variables + if ATTRIBUTES: + # Global Group Attributes + for att_name,att_val in fileID['orbit_info'].attrs.items(): + IS2_atl03_attrs['orbit_info'][att_name] = att_val + # Variable Attributes + IS2_atl03_attrs['orbit_info'][key] = {} + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs['orbit_info'][key][att_name] = att_val + + # information ancillary to the data product + # number of GPS seconds between the GPS epoch (1980-01-06T00:00:00Z UTC) + # and ATLAS Standard Data Product (SDP) epoch (2018-01-01T00:00:00Z UTC) + # Add this value to delta time parameters to compute full gps_seconds + # could alternatively use the Julian day of the ATLAS SDP epoch: 2458119.5 + # and add leap seconds since 2018-01-01T00:00:00Z UTC (ATLAS SDP epoch) + IS2_atl03_mds['ancillary_data'] = {} + 
IS2_atl03_attrs['ancillary_data'] = {} + ancillary_keys = ['atlas_sdp_gps_epoch','data_end_utc','data_start_utc', + 'end_cycle','end_geoseg','end_gpssow','end_gpsweek','end_orbit', + 'end_region','end_rgt','granule_end_utc','granule_start_utc','release', + 'start_cycle','start_geoseg','start_gpssow','start_gpsweek', + 'start_orbit','start_region','start_rgt','version'] + for key in ancillary_keys: + # get each HDF5 variable + IS2_atl03_mds['ancillary_data'][key] = fileID['ancillary_data'][key][:] + # Getting attributes of group and included variables + if ATTRIBUTES: + # Variable Attributes + IS2_atl03_attrs['ancillary_data'][key] = {} + for att_name,att_val in fileID['ancillary_data'][key].attrs.items(): + IS2_atl03_attrs['ancillary_data'][key][att_name] = att_val + + # transmit-echo-path (tep) parameters + IS2_atl03_mds['ancillary_data']['tep'] = {} + IS2_atl03_attrs['ancillary_data']['tep'] = {} + for key,val in fileID['ancillary_data']['tep'].items(): + # get each HDF5 variable + IS2_atl03_mds['ancillary_data']['tep'][key] = val[:] + # Getting attributes of group and included variables + if ATTRIBUTES: + # Variable Attributes + IS2_atl03_attrs['ancillary_data']['tep'][key] = {} + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs['ancillary_data']['tep'][key][att_name] = att_val + + # channel dead time and first photon bias derived from ATLAS calibration + cal1,cal2 = ('ancillary_data','calibrations') + for var in ['dead_time','first_photon_bias']: + IS2_atl03_mds[cal1][var] = {} + IS2_atl03_attrs[cal1][var] = {} + for key,val in fileID[cal1][cal2][var].items(): + # get each HDF5 variable + if isinstance(val, h5py.Dataset): + IS2_atl03_mds[cal1][var][key] = val[:] + elif isinstance(val, h5py.Group): + IS2_atl03_mds[cal1][var][key] = {} + for k,v in val.items(): + IS2_atl03_mds[cal1][var][key][k] = v[:] + # Getting attributes of group and included variables + if ATTRIBUTES: + # Variable Attributes + IS2_atl03_attrs[cal1][var][key] = {} + for 
# convert_wgs_to_utm function, see https://stackoverflow.com/a/40140326/4556479
def convert_wgs_to_utm(lon: float, lat: float):
    """Return the EPSG code string of the UTM zone containing a WGS84 point.

    Parameters
    ----------
    lon, lat : float
        Longitude and latitude in decimal degrees.

    Returns
    -------
    str
        ``'epsg:326NN'`` when ``lat >= 0`` and ``'epsg:327NN'`` otherwise,
        where ``NN`` is the two-digit, zero-padded zone number.
    """
    # Zones are 6 degrees wide counted from lon = -180; the modulo folds
    # lon = 180 back onto zone 1.
    zone_number = math.floor((lon + 180) / 6) % 60 + 1
    hemisphere_prefix = 'epsg:326' if lat >= 0 else 'epsg:327'
    return hemisphere_prefix + str(zone_number).zfill(2)
# Snippet by Eric Guenther (via Amy N.) for assigning photons to a segment
def get_atl03_seg_id(atl03_ph_index_beg, atl03_segment_id, atl03_heights_len):
    """Expand per-segment ids to one id per photon.

    Parameters
    ----------
    atl03_ph_index_beg : numpy.ndarray
        1-based index of the first photon of each segment; entries equal
        to 0 are treated as invalid and dropped.
    atl03_segment_id : numpy.ndarray
        Segment id for each entry of ``atl03_ph_index_beg``.
    atl03_heights_len : int
        Number of photons in the photon-level arrays.

    Returns
    -------
    numpy.ndarray
        Float array of length ``atl03_heights_len`` holding the segment id
        of every photon. Photons located before the first valid segment
        start keep the initial value 0.
    """
    # Discard segments whose start index is 0 (flagged as errors upstream).
    usable = atl03_ph_index_beg != 0
    segment_ids = atl03_segment_id[usable]

    # Shift to 0-based positions and close the last segment at the array end.
    first_photon = atl03_ph_index_beg[usable] - 1
    boundaries = np.append(first_photon, atl03_heights_len)

    ph_segment_id = np.zeros(atl03_heights_len)

    # Each consecutive pair of boundaries delimits the photons of one segment.
    for start, stop, seg_id in zip(boundaries[:-1], boundaries[1:], segment_ids):
        ph_segment_id[start:stop] = seg_id

    return ph_segment_id
# Bin data along vertical and horizontal scales
def horizontal_vertical_bin_dataset(dataset, lat_res, vertical_res):
    """Bin photons along latitude and height for later segmentation.

    Photons whose ``photon_height`` lies outside the open interval
    (-50, 10) are discarded before binning.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns ``'photon_height'`` and ``'lat'``.
    lat_res : float
        Bin width along ``'lat'`` (same units as that column).
    vertical_res : float
        Bin width along ``'photon_height'``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with a fresh integer index
        and two added categorical columns: ``'lat_bins'`` (labels
        ``0..n-1``) and ``'height_bins'`` (labels are heights rounded to
        one decimal). An empty frame with both columns is returned when no
        photon survives the height filter.
    """
    # Photons outside this elevation window are treated as noise.
    valid_range = (-50, 10)
    heights = dataset['photon_height']
    in_range = (heights > valid_range[0]) & (heights < valid_range[1])

    # Explicit copy: columns are added below and must not touch (or warn
    # about) the caller's frame.
    filtered_dataset = dataset[in_range].copy()

    if filtered_dataset.empty:
        # min()/max() of an empty column are NaN and round(NaN) raises, so
        # short-circuit with empty bin columns instead.
        filtered_dataset['lat_bins'] = pd.Categorical([], ordered=True)
        filtered_dataset['height_bins'] = pd.Categorical([], ordered=True)
        return filtered_dataset.reset_index(drop=True)

    height_min = filtered_dataset['photon_height'].min()
    height_max = filtered_dataset['photon_height'].max()
    lat_min = filtered_dataset['lat'].min()
    lat_max = filtered_dataset['lat'].max()

    # Always use at least one bin per axis.
    height_bin_number = max(1, int(round(abs(height_max - height_min) / vertical_res)))
    lat_bin_number = max(1, int(round(abs(lat_max - lat_min) / lat_res)))

    filtered_dataset['lat_bins'] = pd.cut(
        filtered_dataset['lat'],
        bins=lat_bin_number,
        labels=np.arange(lat_bin_number))

    # NOTE(review): labels are rounded to one decimal, so neighbouring bins
    # can receive the same label when vertical_res is small; pd.cut is
    # expected to reject duplicate labels - confirm the intended precision.
    height_labels = np.round(
        np.linspace(height_min, height_max, num=height_bin_number), decimals=1)
    filtered_dataset['height_bins'] = pd.cut(
        filtered_dataset['photon_height'],
        bins=height_bin_number,
        labels=height_labels)

    return filtered_dataset.reset_index(drop=True)
grid method""" + + # Calculate the number of bins required both vertically + lat_bin_number = round(abs(dataset['lat'].min() - dataset['lat'].max()) / lat_res) + # and horizontally based on resolution size + height_bin_number = round(abs(dataset['photon_height'].min() - dataset['photon_height'].max()) / vertical_res) + + # Create the grid + grid = np.zeros((lat_bin_number, height_bin_number)) + + # Iterate over the dataset and assign points to cells + for _, row in dataset.iterrows(): + lat_index = int((row['lat'] - dataset['lat'].min()) / lat_res) + height_index = int((row['photon_height'] - dataset['photon_height'].min()) / vertical_res) + grid[lat_index, height_index] += 1 + + # Identify high-density cells + high_density_cells = np.argwhere(grid > density_threshold) + + # Create a copy of the dataset + dataset_copy = dataset.copy() + + # Assign the cell indices as bins to the dataset + dataset_copy['lat_bins'] = pd.cut(dataset['lat'], bins=lat_bin_number, + labels=np.arange(lat_bin_number)) + dataset_copy['height_bins'] = pd.cut(dataset['photon_height'], bins=height_bin_number, + labels=np.arange(height_bin_number)) + + # Reset the index of the copied dataset + dataset_copy = dataset_copy.reset_index(drop=True) + + return dataset_copy, high_density_cells + + +# Bin data along horizontal scale only +def horizontal_bin_dataset(dataset, lat_res): + """Bin data along the horizontal scale (lat) only + for later segmentation""" + + # Calculate the number of bins required horizontally based on resolution size + lat_bin_number = round(abs(dataset['lat_utm'].min() - dataset['lat_utm'].max()) / lat_res) + + # Cut lat bins + # lat_bins = pd.cut(dataset['lat'], bins=lat_bin_number, labels=np.array(range(lat_bin_number))) + lat_bins = pd.cut(dataset['lat_utm'], bins=lat_bin_number, labels=np.array(range(lat_bin_number))) + + # Create a copy of the dataset + dataset_copy = dataset.copy() + + # Add lat bins to the dataframe + dataset_copy['lat_bins'] = lat_bins + + # Reset the 
# Bin data first, and then throw away only the surface bin
def get_rm_sea_surface_bin(binned_dataset):
    """Return the photons lying below the most populated height bin.

    For every latitude bin the height bin holding the most photons is
    taken as the sea surface; only photons whose ``'height_bins'`` label is
    strictly below that bin are kept.

    Parameters
    ----------
    binned_dataset : pandas.DataFrame
        Must contain ``'lat'``, ``'lat_bins'`` and an ordered categorical
        ``'height_bins'`` column (the ``<`` comparison relies on the order).

    Returns
    -------
    pandas.DataFrame
        Concatenation of the kept rows of every latitude bin, or an empty
        frame with the input's columns when there is nothing to keep.
    """
    kept_per_lat_bin = []

    for _, group in binned_dataset.groupby('lat_bins', observed=False):
        if group.empty:
            continue

        # Photon count per height bin; the fullest bin marks the surface.
        counts = group.groupby('height_bins', observed=False)['lat'].count()
        peak_label = counts.index[counts.argmax()]

        kept_per_lat_bin.append(group.loc[group['height_bins'] < peak_label])

    if not kept_per_lat_bin:
        return binned_dataset.iloc[0:0]

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(kept_per_lat_bin)
lat_utm + lat_bin_average = v['lat'].mean() + + # Create new dataframe based on occurrence of photons per height bin + new_df = pd.DataFrame(v.groupby(['height_bins'], observed=False).count()) + + # Return the bin with the highest count + largest_h_bin = new_df['lat'].argmax() + + # Select the index of the bin with the highest count + largest_h_index = new_df.index[largest_h_bin] + + # Calculate the median value of all photon height values within this bin + photons_sea_surface = v.loc[v['height_bins'] == largest_h_index, 'photon_height'] + lat_bin_sea_median = photons_sea_surface.median() + + # Append to sea height list + sea_surface_height.append(lat_bin_sea_median) + mean_lat_bins_seq.append(lat_bin_average) + del new_df + + # Get all photons below sea surface + # to determine segment type of each subsurface water column + # Use calculated sea height to determine photons at 0.5m below peak + photons_sea_surface_up = \ + v.loc[(v['photon_height'] > (lat_bin_sea_median - threshold)) & + (v['photon_height'] < (lat_bin_sea_median + 2*threshold))] + + # Calculate the photon ratio between surface and whole photons + if v['photon_height'].shape[0] > 0: + new_photons_ratio_sea_surface = \ + photons_sea_surface_up.shape[0] / v['photon_height'].shape[0] + else: + new_photons_ratio_sea_surface = np.nan + + sea_surface_subsurface_photons_ratio.append((1 - new_photons_ratio_sea_surface)) + + # Filter out sea height bin values outside 2 SD of mean. 
#
# Arbitrary cutoff below the max value - 0.5 m below peak
# (we may also use 1 m but we can add that later) (apply to raw photon data, each of 6 beams)
def get_photon_below_sea_surface(binned_data, threshold):
    """Return the photons lying more than ``threshold`` below the sea surface.

    For every latitude bin the sea-surface height is the median
    ``'photon_height'`` of the most populated height bin; photons with
    ``photon_height < surface - threshold`` are kept.

    Parameters
    ----------
    binned_data : pandas.DataFrame
        Must contain ``'lat_utm'``, ``'photon_height'``, ``'lat_bins'`` and
        ``'height_bins'``.
    threshold : float
        Vertical offset below the per-bin surface height.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the kept rows of every latitude bin, or an empty
        frame with the input's columns when no latitude bin has photons.

    Notes
    -----
    The surface of each latitude bin is paired with its own group here.
    The earlier code looked the surface up with the group key as a list
    position, which is only correct when keys happen to be ``0..n-1``.
    The 2-SD outlier screening that the earlier code computed was never
    applied to the returned photons and is therefore not reproduced;
    ``get_sea_surface_height`` is the variant that applies it.
    """
    below_per_lat_bin = []

    for _, group in binned_data.groupby('lat_bins', observed=False):
        if group.empty:
            continue

        # Photon count per height bin; the fullest bin marks the surface.
        counts = group.groupby('height_bins', observed=False)['lat_utm'].count()
        peak_label = counts.index[counts.argmax()]

        surface_height = group.loc[
            group['height_bins'] == peak_label, 'photon_height'].median()

        below_per_lat_bin.append(
            group.loc[group['photon_height'] < (surface_height - threshold)])

    if not below_per_lat_bin:
        return binned_data.iloc[0:0]

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(below_per_lat_bin)
def get_water_temp(date_year, date_month, date_day, latitude, longitude):
    """
    Pull down surface water temperature along the track from the JPL GHRSST opendap website.

    The GHRSST data are gridded tiles with dimension 17998 x 35999.
    To get the specific grid tile of the SST, you must convert from lat, lon coordinates
    to the gridded tile ratio of the SST data product using the coordinates of the IS2 data.

    Parameters
    ----------
    date_year, date_month, date_day : str
        Date parts that concatenate to ``YYYYMMDD``.
    latitude, longitude : array-like with ``.mean()``
        Track coordinates in degrees; only their mean is used.

    Returns
    -------
    The ``analysed_sst`` value at the track mid-point minus 273.15
    (Kelvin converted to degrees Celsius).
    """
    date = date_year + date_month + date_day

    # Three-digit day of year, as used in the remote directory layout.
    zero_day_of_year = str(datetime.strptime(date, '%Y%m%d').timetuple().tm_yday).zfill(3)

    # Scale the track mid-point from degrees to a grid row.
    old_lat = latitude.mean()
    old_lat_min = -90
    old_lat_max = 90
    new_lat_min = 0
    new_lat_max = 17998
    new_lat = round(((old_lat - old_lat_min) / (old_lat_max - old_lat_min)) *
                    (new_lat_max - new_lat_min) + new_lat_min)

    # Scale the track mid-point from degrees to a grid column.
    old_lon = longitude.mean()
    old_lon_min = -180
    old_lon_max = 180
    new_lon_min = 0
    new_lon_max = 35999
    new_lon = round(((old_lon - old_lon_min) / (old_lon_max - old_lon_min)) *
                    (new_lon_max - new_lon_min) + new_lon_min)

    # Access the SST data using the JPL OpenDap interface
    url = 'https://opendap.jpl.nasa.gov/opendap/OceanTemperature/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/' \
          + str(date_year) + '/' + str(zero_day_of_year) + '/' + str(date) \
          + '090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc'

    # The context manager closes the remote dataset even if the read fails;
    # previously the handle was left open.
    with netCDF4.Dataset(url) as dataset:
        # Access the data and convert the temperature from K to C
        water_temp = dataset['analysed_sst'][0, new_lat, new_lon] - 273.15

    return water_temp
Ph_ref_azimuth, + PhotonZ, PhotonX, PhotonY, Ph_Conf): + """ + WTemp; there is python library that pulls water temp data + WSmodel is the value surface height + Wavelength is fixed + """ + + # Only process photons below water surface model + PhotonX = PhotonX[PhotonZ <= WSmodel] + PhotonY = PhotonY[PhotonZ <= WSmodel] + Photon_ref_elev = Photon_ref_elev[PhotonZ <= WSmodel] + Ph_ref_azimuth = Ph_ref_azimuth[PhotonZ <= WSmodel] + Ph_Conf = Ph_Conf[PhotonZ <= WSmodel] + PhotonZ = PhotonZ[PhotonZ <= WSmodel] + + # water temp for refraction correction + WaterTemp = WTemp + + # Refraction coefficient # + a = -0.000001501562500 + b = 0.000000107084865 + c = -0.000042759374989 + d = -0.000160475520686 + e = 1.398067112092424 + wl = Wavelength + + # refractive index of air + n1 = 1.00029 + + # refractive index of water + n2 = (a * WaterTemp ** 2) + (b * wl ** 2) + (c * WaterTemp) + (d * wl) + e + + # assumption is 0.25416 + # This example is refractionCoef = 0.25449 + # 1.00029 is refraction of air constant + correction_coef = (1 - (n1 / n2)) + + # read photon ref_elev to get theta1 + theta1 = np.pi / 2 - Photon_ref_elev + + # eq 1. Theta2 + theta2 = np.arcsin(((n1 * np.sin(theta1)) / n2)) + + # eq 3. S + # Approximate water Surface = 1.5 + # D = raw uncorrected depth + D = WSmodel - PhotonZ + + # For Triangle DTS + S = D / np.cos(theta1) + + # eq 2. 
def get_bath_height_percentile_thresh(binned_data, percentile_thresh, sea_surface_height, vertical_res):
    """Detect the bathymetric level per latitude bin from photon counts.

    Parameters
    ----------
    binned_data : pandas.DataFrame
        Must contain ``'lat'``, ``'lon'``, ``'photon_height'``,
        ``'cor_photon_height'``, ``'lat_bins'`` and ``'height_bins'``.
    percentile_thresh : float
        Percentile (0-100) taken over the per-latitude-bin maximum photon
        counts; a bin must reach this count to be accepted as bathymetry.
    sea_surface_height : float
        Sea-surface height used both to exclude near-surface photons and
        to convert heights to depths.
    vertical_res : float
        Vertical bin size; photons within two bins of the surface are
        excluded.

    Returns
    -------
    bath_height : list
        One entry per latitude bin below the surface: the median
        ``'cor_photon_height'`` of the accepted height bin, or NaN.
    geo_df : pandas.DataFrame
        Columns ``'lon'``, ``'lat'``, ``'photon_height'``, ``'depth'`` for
        the photons of the accepted bins (empty when no bin is accepted).
    """
    bath_height = []
    geo_photon_height = []
    geo_longitude = []
    geo_latitude = []

    # Drop photons within two vertical bins of the sea surface, then group by latitude.
    binned_data_bath = binned_data[(binned_data['photon_height'] <
                                    sea_surface_height - (vertical_res * 2))]
    data_groups = dict(list(binned_data_bath.groupby(['lat_bins'], group_keys=True)))

    # Count threshold: percentile of the largest (lat bin, height bin) cell
    # count found in each latitude bin of the unfiltered data.
    count_threshold = np.percentile(
        binned_data.groupby(['lat_bins', 'height_bins']).size().reset_index().groupby('lat_bins')[[0]].max(),
        percentile_thresh)

    for v in data_groups.values():
        counts = v.groupby('height_bins').count()
        bath_bin = counts['lat'].argmax()
        bath_bin_h = counts.index[bath_bin]

        # A bin is accepted as bathymetry only if its fullest height cell
        # reaches the count threshold.
        if counts.iloc[bath_bin]['lat'] >= count_threshold:
            in_bin = v.loc[v['height_bins'] == bath_bin_h]
            geo_photon_height.append(in_bin['cor_photon_height'].values)
            geo_longitude.append(in_bin['lon'].values)
            geo_latitude.append(in_bin['lat'].values)
            bath_height.append(in_bin['cor_photon_height'].median())
        else:
            bath_height.append(np.nan)

    def _flatten(chunks):
        # np.concatenate raises on an empty list, which happens when no
        # latitude bin passes the threshold.
        if not chunks:
            return np.array([], dtype=float)
        return np.concatenate(chunks).ravel()

    geo_longitude_arr = _flatten(geo_longitude)
    geo_latitude_arr = _flatten(geo_latitude)
    geo_photon_arr = _flatten(geo_photon_height)

    # Subtract from an array, not a list, so a plain Python float surface
    # height works (as get_bath_height_HDBSCAN already does).
    geo_depth = sea_surface_height - geo_photon_arr

    geo_df = pd.DataFrame(
        {'lon': geo_longitude_arr.tolist(), 'lat': geo_latitude_arr.tolist(),
         'photon_height': geo_photon_arr.tolist(),
         'depth': geo_depth.tolist()})

    return bath_height, geo_df
+ binned_data_bath = lat_binned_data[(lat_binned_data['photon_height'] < + sea_surface_height - (vertical_res * 2))] + + grouped_data = binned_data_bath.groupby(['lat_bins'], group_keys=True) + data_groups = dict(list(grouped_data)) + + # Loop through groups and return average bathymetric height + for k, v in data_groups.items(): + + # assign each group of dataset to a new dataframe + new_df = pd.DataFrame(v) + + # Check if the DataFrame is empty + if new_df.empty: + # If the DataFrame is empty, append null values to the lists and skip to the next iteration + bath_height.append(np.nan) + geo_photon_height.append([]) + geo_longitude.append([]) + geo_latitude.append([]) + del new_df + continue + + lat_height_pairs = list(zip(new_df['lat_utm'], new_df['cor_photon_height'])) + + # Convert to a numpy array for sklearn + lat_height_pairs_array = np.array(lat_height_pairs) + + # Perform HDBSCAN clustering on the photons below sea surface for each lat bin + # scaler = StandardScaler() + scaler = MinMaxScaler() + data_scaled = scaler.fit_transform(lat_height_pairs_array) + + min_cluster_size = 4 + + # Check the number of data points is greater than the initial min_cluster_size + if len(data_scaled) < min_cluster_size: + min_cluster_size = len(data_scaled) // 2 + if min_cluster_size < 2: + bath_height.append(np.nan) + geo_photon_height.append([]) + geo_longitude.append([]) + geo_latitude.append([]) + del new_df + continue + + # cluster = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, leaf_size=20) + cluster = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, + min_samples=3, cluster_selection_epsilon=20, + leaf_size=25, core_dist_n_jobs=-1) + cluster.fit(data_scaled) + + # Get the labels assigned to each point by the HDBSCAN model + labels = cluster.labels_ + + # Count the number of points assigned to each cluster + unique, counts = np.unique(labels, return_counts=True) + + # Create a dictionary that maps each cluster label to the count of points assigned to it + 
# requires that the input gdf has ranged index values i
# will need to change if index is changed to time or something
# this currently checks point in polygon for EVERY point
# would be significantly sped up if evaluated at 10m or something similar
# maybe later, fine for now
def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF):
    """Label every photon as land (1) or not land (0) using shoreline polygons.

    Side effect: the columns ``'lat'``, ``'lon'`` and ``'is_land'``
    (initialised to -1) are inserted into ``ICESat2_GDF`` and its CRS is
    set to EPSG:4326 in place.

    Parameters
    ----------
    shoreline_data_path : str
        Path of the land-polygon dataset; only features intersecting the
        bounding box of ``ICESat2_GDF`` are read.
    ICESat2_GDF : geopandas.GeoDataFrame
        Photon points.

    Returns
    -------
    numpy.ndarray
        One integer per row: 1 for points within a land polygon, 0
        otherwise. If any step fails, the error is printed and an array of
        -1 with one entry per row is returned instead.
    """
    try:
        ICESat2_GDF.insert(0, 'lat', ICESat2_GDF.geometry.y, False)
        ICESat2_GDF.insert(0, 'lon', ICESat2_GDF.geometry.x, False)

        # Land flag initialized as -1
        # If shorelines downloaded already, will be set to 0 or 1
        zero_int_array = np.int64(np.zeros_like(ICESat2_GDF.geometry.x))
        ICESat2_GDF.insert(0, 'is_land', zero_int_array - 1, False)

        # set the projection
        ICESat2_GDF.set_crs("EPSG:4326", inplace=True)

        # Read only the shoreline features intersecting the photons' bounding
        # box. 'pyogrio' is used because 'fiona' sometimes gives errors here.
        land_polygon_gdf = gpd.read_file(shoreline_data_path, bbox=ICESat2_GDF, engine='pyogrio')

        # Start from all-zero labels and flag the points that fall within a
        # land polygon.
        land_point_labels = np.zeros_like(ICESat2_GDF.is_land.values)
        pts_in_land = gpd.sjoin(ICESat2_GDF, land_polygon_gdf, predicate='within')
        land_loc = ICESat2_GDF.index.isin(pts_in_land.index)
        land_point_labels[land_loc] = 1

        return land_point_labels

    except Exception as e:
        # Deliberate best-effort: any failure degrades to "unknown" labels.
        print(e)
        print("Error loading shoreline data, returning -1s for is_land flag")

        # Build the fallback from the row count rather than from the
        # 'is_land' column, which does not exist when the failure happened
        # before it was inserted.
        return -np.ones(len(ICESat2_GDF), dtype=np.int64)
np.linspace(binned_data.lat.min(), + binned_data.lat.max(), len(bath_height))+20 + + sea_surface_x_axis_bins = np.linspace(binned_data.lat.min(), + binned_data.lat.max(), len(sea_height))+10 + + # Create new dataframes for median values + bath_median_df = pd.DataFrame({'x': bath_x_axis_bins, 'y': bath_height}) + + # Create uniform sea surface based on median sea surface values and filter out surface breaching + sea_height1 = [np.nanmedian(sea_height) if i == i else np.nan for i in sea_height] + sea_median_df = pd.DataFrame({'x': sea_surface_x_axis_bins, 'y': sea_height1}) + + # Create uniform solo sea surface label + sea_surface_label = solo_sea_surface_label + sea_surface_label_df = pd.DataFrame({'x': sea_surface_x_axis_bins, 'y': sea_surface_label}) + idx_1 = np.where(sea_surface_label_df.y == 1) + idx_0 = np.where(sea_surface_label_df.y == 0) + + # Define figure size + fig = plt.rcParams["figure.figsize"] = (40, 25) + + # Plot raw points + # plt.scatter(x=binned_data.lat, + # y = binned_data.photon_height, marker='o', lw=0, s=1, alpha = 0.8, + # c = 'yellow', label = 'Raw photon height') + plt.scatter(ref_y, ref_z, s=0.5, alpha=0.1, c='black') + plt.scatter(geo_df.lat, geo_df.photon_height, s=0.8, marker = 'o', + alpha=0.1, c='red', label='Classified Photons') + + # plt.scatter(x=geo_df.lat, + # y = geo_df.photon_height, marker='o', lw=0, s=0.8, + # alpha = 0.8, c = 'black', label = 'Corrected photon bin') + + # Plot median values + plt.scatter(bath_median_df.x, bath_median_df.y, + marker='o', c='r', alpha=0.8, s=2, label='Median bathymetry') + + plt.scatter(sea_median_df.x, sea_median_df.y, + marker='o', c='b', alpha=1, s=2, label='Median sea surface') + + plt.scatter(sea_surface_label_df.iloc[idx_1].x, sea_surface_label_df.iloc[idx_1].y, + marker='o', c='pink', alpha=1, s=3, label='solo_sea_surface') + plt.scatter(sea_surface_label_df.iloc[idx_0].x, sea_surface_label_df.iloc[idx_0].y, + marker='o', c='g', alpha=1, s=3, label='non_solo_sea_surface') + + # 
Insert titles and subtitles + plt.title('Icesat2 Bathymetry\n' + file) + plt.xlabel('Latitude', fontsize=25) + plt.ylabel('Photon Height (m)', fontsize=25) + plt.xticks(fontsize=16) + plt.yticks(fontsize=16) + + plt.legend(loc="upper left", prop={'size': 20}) + + # Limit the x and y axes using parameters + plt.xlim(left=binned_data.lat.min(), right=binned_data.lat.max()) + plt.ylim(top=y_limit_top, bottom=y_limit_bottom) + + timestr = time.strftime("%Y%m%d_%H%M%S") + file = file.replace('.h5', '') + # Define where to save file + plt.tight_layout() + plt.savefig("C:/Workstation/ICESat2_HLS/" + file + '_gt' + + str(beam) + '_' + str(percentile) + + '_EPSG' + str(epsg_num) + '_' + timestr + ".pdf") + # plt.show() + # plt.close() + + # convert corrected locations back to wgs84 (useful to contain) + transformer = Transformer.from_crs("EPSG:" + str(epsg_num), + "EPSG:4326", always_xy=True) + print(transformer) + lon_wgs84, lat_wgs84 = transformer.transform( + geo_df.lon.values, geo_df.lat.values) + + geo_df['lon_wgs84'] = lon_wgs84 + geo_df['lat_wgs84'] = lat_wgs84 + + geodf = gpd.GeoDataFrame(geo_df, + geometry=gpd.points_from_xy(geo_df.lon_wgs84, + geo_df.lat_wgs84)) + + geodf.set_crs(epsg=4326, inplace=True) + + # geodf.to_file("C:/Workstation/ICESat2_HLS/" +file + '_gt' + '_' + str(percentile) + '_EPSG' + + # str(epsg_num) + '_' + timestr + ".gpkg", + # driver="GPKG") diff --git a/icesat-2_kdph_py/kd_utils/sea_photons_analysis.py b/icesat-2_kdph_py/kd_utils/sea_photons_analysis.py new file mode 100644 index 0000000..9df2766 --- /dev/null +++ b/icesat-2_kdph_py/kd_utils/sea_photons_analysis.py @@ -0,0 +1,489 @@ + +import numpy as np +import geopandas as gpd +from datetime import datetime +import pandas as pd +import netCDF4 +from scipy.signal import find_peaks +from scipy.stats import norm + +# Function to apply binning beam-by-beam by calling the function of horizontal_vertical_bin_dataset +def process_sea_photon_binning(sea_photon_dataset, horizontal_res, 
def horizontal_vertical_bin_dataset(dataset, lat_res, vertical_res):
    """
    Bin data along vertical and horizontal scales for later segmentation.

    Flowchart steps:
      3  — Choose along-track (x) and vertical (z) bin sizes
           (e.g., 500 m in x, 0.25 m in z).
      4  — Build vertical histograms of photon counts using selected x, z bin sizes.
      10 — Re-build histograms based on corrected depths (called again after
           refraction correction when apply_post_refraction_refit is enabled).

    Parameters:
        dataset (pandas.DataFrame): Photon table with at least the columns
            ``photon_height`` and ``lat``.
        lat_res (float): Horizontal bin size, in the units of the ``lat`` column.
        vertical_res (float): Vertical bin size, in the units of ``photon_height``.

    Returns:
        pandas.DataFrame: Copy of the photons whose height lies strictly inside
        (-70, 5), re-indexed from 0, with two added categorical columns:
        ``lat_bins`` (labels 0..n-1) and ``height_bins`` (labels are bin heights
        rounded to one decimal). If no photon lies inside the window, an empty
        frame carrying both columns is returned.

    Raises:
        ValueError: If ``lat_res`` or ``vertical_res`` is not positive.
    """
    if lat_res <= 0 or vertical_res <= 0:
        raise ValueError("lat_res and vertical_res must be positive")

    # Photons outside (-70, 5) are treated as noise and dropped.
    valid_range = (-70, 5)
    photon_heights = dataset['photon_height']
    valid_mask = (photon_heights > valid_range[0]) & (photon_heights < valid_range[1])

    # Copy so the added columns never touch the caller's frame.
    filtered_dataset = dataset[valid_mask].copy()

    if filtered_dataset.empty:
        # min()/max() of an empty column are NaN and round(NaN) raises, so
        # return early with the expected columns in place.
        filtered_dataset['lat_bins'] = pd.Categorical([])
        filtered_dataset['height_bins'] = pd.Categorical([])
        return filtered_dataset.reset_index(drop=True)

    height_min = filtered_dataset['photon_height'].min()
    height_max = filtered_dataset['photon_height'].max()
    lat_min = filtered_dataset['lat'].min()
    lat_max = filtered_dataset['lat'].max()

    # Number of bins along each axis; always at least one.
    height_bin_number = max(1, round(abs(height_max - height_min) / vertical_res))
    lat_bin_number = max(1, round(abs(lat_max - lat_min) / lat_res))

    # Horizontal bins are labelled by their running index.
    lat_bins = pd.cut(filtered_dataset['lat'], bins=lat_bin_number,
                      labels=np.arange(lat_bin_number))

    # Vertical bins are labelled with heights spread evenly over the data range.
    height_labels = np.round(np.linspace(height_min, height_max,
                                         num=height_bin_number), decimals=1)
    height_bins = pd.cut(filtered_dataset['photon_height'], bins=height_bin_number,
                         labels=height_labels)

    filtered_dataset['lat_bins'] = lat_bins
    filtered_dataset['height_bins'] = height_bins

    return filtered_dataset.reset_index(drop=True)
+ + Parameters: + binned_data (pandas.DataFrame): Binned photon data with lat, height_bins, photon_height + + Returns: + final_sea_surface_height (list): Detected sea surface heights + sea_surface_height_abnormal_label (array): Labels for abnormal heights + sea_surface_dominated_label (array): Labels for surface-dominated bins + PhotonDFBelowSurface (pandas.DataFrame): Photons below detected surface + """ + + #set flag for the df save + firstTimeIndex = True + + # Create sea height list + sea_surface_height = [] + mean_lat_bins_seq = [] + sea_surface_subsurface_photons_ratio = [] + + + # Group dataset along horizental (latitude bins) + grouped_data = binned_data.groupby(['lat_bins'], group_keys=True, observed=False) + data_groups = dict(list(grouped_data)) + + # Loop through groups to detect sea surface + for k, v in data_groups.items(): + # based on lat_utm + lat_bin_average = v['lat'].mean() + + # Create new dataframe based on occurrence of photons per height bin + new_df = pd.DataFrame(v.groupby(['height_bins'], observed=False).count()) + + # Check if new_df is not empty before finding the bin with the highest photon count + if not new_df.empty: + # Find the vertical bin with the highest photon count + largest_h_bin = new_df['lat'].argmax() + + # Select the index of the bin with the highest count + largest_h_index = new_df.index[largest_h_bin] + + # Calculate the median value of all photon height values within this bin + photons_sea_surface = v.loc[v['height_bins'] == largest_h_index, 'photon_height'] + lat_bin_sea_median = photons_sea_surface.median() + + # Append to sea height list + sea_surface_height.append(lat_bin_sea_median) + mean_lat_bins_seq.append(lat_bin_average) + del new_df + + # Get all photons below sea surface + # to determine segment type of each subsurface water column + # Use calculated sea height to determine photons at 0.5m below peak + photons_sea_surface_up = \ + v.loc[(v['photon_height'] > (lat_bin_sea_median - threshold)) & + 
def _fit_surface_gaussian(heights, center):
    """Two-pass Gaussian fit around *center*; return (mu, sigma) or None if the +/-1 m window holds 10 photons or fewer."""
    half_win = 1.0
    window = heights[(heights > center - half_win) & (heights < center + half_win)]
    if len(window) <= 10:
        return None

    mu, sigma = norm.fit(window)
    # A +/-1 m window truncates broad (wavy) surfaces and underestimates sigma,
    # so refit on a wider window when the first estimate is large.
    if sigma > 0.4:
        half_win2 = min(2.5 * sigma, 3.0)
        wider = heights[(heights > mu - half_win2) & (heights < mu + half_win2)]
        if len(wider) > 10:
            mu, sigma = norm.fit(wider)
    return mu, sigma


def _detect_bin_surface(heights):
    """Return (surface height, fraction of photons outside the surface layer) for one bin, or (nan, nan)."""
    # Too few photons for a meaningful histogram.
    if len(heights) < 20:
        return np.nan, np.nan

    # Fine histogram so the surface return resolves as a distinct peak.
    hist, bin_edges = np.histogram(heights, bins=100, density=True,
                                   range=(heights.min(), heights.max()))
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

    hist_max = np.max(hist)
    peaks, _ = find_peaks(hist,
                          height=hist_max * 0.3,
                          distance=10,
                          prominence=hist_max * 0.1)
    if len(peaks) == 0:
        return np.nan, np.nan

    # The strongest peak is taken as the surface.
    surface_height = bin_centers[peaks[np.argmax(hist[peaks])]]

    fit = _fit_surface_gaussian(heights, surface_height)
    # Fall back to the histogram peak with a nominal sigma when the fit has too little data.
    mu, sigma = fit if fit is not None else (surface_height, 0.2)

    # Surface layer reaches at least 1 m below the peak.
    adaptive_threshold = min(mu - 3.0 * sigma, mu - 1.0)
    in_surface_layer = (heights > adaptive_threshold) & (heights < mu + 3.0 * sigma)
    ratio = in_surface_layer.sum() / len(heights)

    return mu, 1 - ratio


def get_sea_surface_height_adaptive(binned_data):
    """
    Calculate sea surface height and filter subsurface photons with enhanced wave removal.

    Flowchart steps:
      7  — Fit Gaussian curve to identify surface elevation; compute standard
           deviation of Gaussian peak.
      11 — Re-fit Gaussian curve to surface to identify surface peak; compute
           standard deviation and remove histogram data within three standard
           deviations. (This function is called a second time after refraction
           correction and histogram rebuild when apply_post_refraction_refit
           is enabled.)

    For every ``lat_bins`` group the surface is located as the strongest
    histogram peak and refined with a two-pass Gaussian fit. Surface heights
    further than two standard deviations from the mean over all bins are
    replaced by NaN. Photons below ``min(mu - 3*sigma, mu - 1.0)`` of each
    accepted bin are returned as the subsurface set.

    Parameters:
        binned_data (pandas.DataFrame): Binned photon data with the columns
            ``lat_bins`` and ``photon_height``.

    Returns:
        final_sea_surface_height (list): Surface height per bin, NaN where no
            surface was found or the value was rejected as an outlier.
        sea_surface_height_abnormal_label (numpy.ndarray): 1 where the bin has a
            valid surface height, 0 where it is NaN.
        sea_surface_dominated_label (numpy.ndarray): 0 where at least 20 % of the
            bin's photons lie outside the surface layer, otherwise 1
            (bins without a surface also get 1).
        PhotonDFBelowSurface (pandas.DataFrame): Photons below the adaptive
            threshold of every accepted bin; empty frame if there are none.
    """
    groups = [frame for _, frame in
              binned_data.groupby(['lat_bins'], group_keys=True, observed=False)]

    sea_surface_height = []
    sea_surface_subsurface_photons_ratio = []
    for frame in groups:
        bin_height, bin_ratio = _detect_bin_surface(frame['photon_height'])
        sea_surface_height.append(bin_height)
        sea_surface_subsurface_photons_ratio.append(bin_ratio)

    # Outlier filtering: reject bins further than 2 SD from the mean surface height.
    heights_arr = np.asarray(sea_surface_height, dtype=float)
    mean = np.nanmean(heights_arr)
    sd = np.nanstd(heights_arr)
    is_outlier = (heights_arr > mean + 2 * sd) | (heights_arr < mean - 2 * sd)
    final_sea_surface_height = np.where(is_outlier, np.nan, heights_arr).tolist()

    sea_surface_height_abnormal_label = np.where(np.isnan(final_sea_surface_height), 0, 1)
    sea_surface_dominated_label = np.where(
        np.array(sea_surface_subsurface_photons_ratio) >= 0.2, 0, 1)

    # Second pass: keep only the photons below each accepted bin's surface layer.
    below_surface_parts = []
    for frame, bin_height in zip(groups, final_sea_surface_height):
        if np.isnan(bin_height):
            continue

        fit = _fit_surface_gaussian(frame['photon_height'], bin_height)
        if fit is not None:
            mu, sigma = fit
            adaptive_threshold = min(mu - 3.0 * sigma, mu - 1.0)
        else:
            adaptive_threshold = bin_height - 1.0

        below_surface_parts.append(frame[frame['photon_height'] < adaptive_threshold])

    # Concatenate once instead of growing a frame inside the loop.
    if below_surface_parts:
        PhotonDFBelowSurface = pd.concat(below_surface_parts)
    else:
        PhotonDFBelowSurface = pd.DataFrame()

    return (final_sea_surface_height,
            sea_surface_height_abnormal_label,
            sea_surface_dominated_label,
            PhotonDFBelowSurface)
def refraction_correction(WTemp, WSmodel, Wavelength,
                          Photon_ref_elev, Ph_ref_azimuth,
                          PhotonZ, PhotonX, PhotonY, Ph_Conf):
    """
    Correct the position of photons below the water surface for air/water refraction.

    Only photons with ``PhotonZ <= WSmodel`` are processed; all per-photon
    inputs are filtered with that mask before the correction is computed.

    Parameters:
        WTemp (float): Water temperature used in the refractive-index polynomial
            (presumably deg C - confirm against the caller).
        WSmodel (float | array-like): Water-surface height. A scalar, or one
            value per photon (same shape as ``PhotonZ``), in which case it is
            filtered together with the photons.
        Wavelength (float): Laser wavelength used in the refractive-index
            polynomial (presumably nm - confirm against the caller).
        Photon_ref_elev (array): Photon reference elevation angle in radians;
            the incidence angle is ``pi/2 - Photon_ref_elev``.
        Ph_ref_azimuth (array): Photon reference azimuth in radians.
        PhotonZ, PhotonX, PhotonY (array): Raw photon height and horizontal position.
        Ph_Conf (array): Per-photon confidence values, passed through filtered.

    Returns:
        tuple: ``(outX, outY, outZ, Ph_Conf, PhotonX, PhotonY, PhotonZ,
        Ph_ref_azimuth, Photon_ref_elev)`` - the corrected coordinates first,
        followed by the filtered inputs. Photons lying exactly on the surface
        are returned uncorrected (previously they came back as NaN).
    """
    # Build the subsurface mask once and apply it to every per-photon input.
    below_surface = PhotonZ <= WSmodel

    # A per-photon surface model has to be filtered with the photons,
    # otherwise the depth subtraction below mismatches in shape.
    if np.ndim(WSmodel) > 0 and np.shape(WSmodel) == np.shape(PhotonZ):
        WSmodel = np.asarray(WSmodel)[np.asarray(below_surface)]

    PhotonX = PhotonX[below_surface]
    PhotonY = PhotonY[below_surface]
    Photon_ref_elev = Photon_ref_elev[below_surface]
    Ph_ref_azimuth = Ph_ref_azimuth[below_surface]
    Ph_Conf = Ph_Conf[below_surface]
    PhotonZ = PhotonZ[below_surface]

    # Polynomial coefficients for the refractive index of water as a
    # function of temperature and wavelength.
    a = -0.000001501562500
    b = 0.000000107084865
    c = -0.000042759374989
    d = -0.000160475520686
    e = 1.398067112092424

    # Refractive index of air and of water
    n1 = 1.00029
    n2 = (a * WTemp ** 2) + (b * Wavelength ** 2) + (c * WTemp) + (d * Wavelength) + e

    # Incidence angle in air and refracted angle in water (Snell's law)
    theta1 = np.pi / 2 - Photon_ref_elev
    theta2 = np.arcsin((n1 * np.sin(theta1)) / n2)

    # D: raw uncorrected depth; S: uncorrected slant range; R: corrected slant range
    D = WSmodel - PhotonZ
    S = D / np.cos(theta1)
    R = (S * n1) / n2
    Gamma = (np.pi / 2) - theta1

    # Triangle between the raw and the corrected photon position
    phi = theta1 - theta2
    P = np.sqrt(R ** 2 + S ** 2 - 2 * R * S * np.cos(phi))

    # For a photon exactly on the surface P is 0 and the ratio is 0/0;
    # such a photon needs no correction, so use alpha = 0 there.
    with np.errstate(divide='ignore', invalid='ignore'):
        sin_alpha = np.where(P > 0, (R * np.sin(phi)) / P, 0.0)
    alpha = np.arcsin(sin_alpha)
    Beta = Gamma - alpha

    # Along-track (DY) and vertical (DZ) offsets
    DY = P * np.cos(Beta)
    DZ = P * np.sin(Beta)

    # Split the along-track offset into easting / northing
    DE = DY * np.sin(Ph_ref_azimuth)
    DN = DY * np.cos(Ph_ref_azimuth)

    outX = PhotonX + DE
    outY = PhotonY + DN
    outZ = PhotonZ + DZ

    return (outX, outY, outZ, Ph_Conf, PhotonX, PhotonY, PhotonZ, Ph_ref_azimuth,
            Photon_ref_elev)
def plot_photon_height(sea_photon_dataset, hlims=(-25, 10)):
    """
    Show a scatter plot of ATL03 photon height against latitude.

    Parameters:
        sea_photon_dataset (pandas.DataFrame): Needs the columns ``latitude``
            and ``photon_height``.
        hlims (sequence of two floats): Lower and upper y-axis limit. The
            default is an immutable tuple so it cannot be altered between calls.
    """
    fig, ax = plt.subplots()
    ax.scatter(sea_photon_dataset['latitude'], sea_photon_dataset['photon_height'],
               s=1, c='k', alpha=0.15, edgecolors='none', label='ATL03 Photons')
    # The x data is the latitude column, so label it as such.
    ax.set_xlabel('Latitude')
    ax.set_ylabel('Height (h_ph)')
    ax.set_title('Scatter Plot of ATL03 Photons')
    ax.set_ylim(hlims)
    ax.legend()
    plt.show()
+ """ + + # get sea_surface_x_axis_bins + sea_surface_x_axis_bins = np.linspace(filtered_seafloor_subsurface_dataset['relative_AT_dist'].min(), + filtered_seafloor_subsurface_dataset['relative_AT_dist'].max(), + len(sea_surface_height)) + + hlims = [-25, 10] + fig, ax = plt.subplots() + + # Scatter plot of subsurface photons + ax.scatter(sea_photon_dataset['relative_AT_dist'], sea_photon_dataset['photon_height'], + s=1, c='k', alpha=0.15, edgecolors='none', label='Subsurface ATL03 Photons') + + # Overlay the seafloor elevation data as points + ax.plot(filtered_seafloor_subsurface_dataset['relative_AT_dist'], filtered_seafloor_subsurface_dataset['seafloor_elevation'], + linewidth=0.8, c='b', alpha=0.4, label='Seafloor Elevation') + + ax.plot(sea_surface_x_axis_bins, [x - 0.5 for x in sea_surface_height], + linewidth=0.8, color='#DD571C', alpha=0.4, label='0.5 m Below Surface Peak') + + # Set labels and title + ax.set_xlabel('Distance From Start of Track (km)') + ax.set_ylabel('Photon Height (m)') + ax.set_title('Scatter Plot of Filtered Sea Photon Dataset') + ax.set_ylim(hlims) + + # Add a legend + ax.legend() + + # Save the plot + plt.legend(loc='upper right') + plt.savefig(output_path, dpi=400, format='jpeg') + + # Show the plot + plt.show() + + +def plot_convex_hulls(photon_dataset, target_beam_ids, convex_hulls, convex_hull_areas): + """ + Plots ConvexHull polygons and annotated areas for each lat_bin group in the dataset. 
# plot photons coloured by quality_ph flag for diagnostic purposes
def plot_photon_quality_flags(output_path, timestamp, sea_photon_dataset, target_beam_ids):
    """
    Scatter plot of photon height vs. along-track distance, coloured by quality_ph value.

    quality_ph bit meanings (ATL03 v007):
        bit 0 (value 1) = possible afterpulse
        bit 1 (value 2) = possible impulse response effect
        bit 2 (value 4) = possible TEP

    The figure is saved as ``<timestamp>_quality_ph_flags.jpg`` in
    ``output_path`` and shown, then a per-flag count summary is printed.
    Returns without doing anything when no photon belongs to the requested beams.

    Parameters:
        output_path (str): Directory the JPEG is written to.
        timestamp (str): Prefix of the output file name.
        sea_photon_dataset (pandas.DataFrame): Needs the columns ``beam_id``,
            ``quality_ph``, ``relative_AT_dist`` and ``photon_height``.
        target_beam_ids (iterable): Beam ids to include.
    """
    dataset = sea_photon_dataset[sea_photon_dataset['beam_id'].isin(target_beam_ids)].copy()
    if dataset.empty:
        return

    quality_colors = {
        0: ('dimgray', 'nominal (0)'),
        1: ('red', 'afterpulse (1)'),
        2: ('orange', 'impulse response (2)'),
        3: ('darkred', 'afterpulse + impulse (3)'),
        4: ('royalblue', 'TEP (4)'),
        5: ('purple', 'afterpulse + TEP (5)'),
        6: ('cyan', 'impulse + TEP (6)'),
        7: ('black', 'all flags (7)'),
    }

    # Rows without a flag cannot be cast to int; drop them once here so the
    # integer conversion below never sees NaN.
    flagged = dataset.dropna(subset=['quality_ph'])
    int_flags = flagged['quality_ph'].astype(int)
    unique_flags = sorted(int_flags.unique())

    fig, ax = plt.subplots(figsize=(12, 5))
    for flag in unique_flags:
        subset = flagged[int_flags == flag]
        color, label = quality_colors.get(flag, ('magenta', f'unknown ({flag})'))
        ax.scatter(subset['relative_AT_dist'], subset['photon_height'],
                   s=0.5, c=color, alpha=0.3, edgecolors='none', label=label)

    ax.set_xlabel('Relative Along-Track Distance (km)', fontsize=12)
    ax.set_ylabel('Photon Height (m)', fontsize=12)
    ax.set_title('Photon quality_ph flag distribution', fontsize=13)
    ax.set_ylim([-15, 5])
    ax.legend(loc='lower right', markerscale=6, fontsize=9)
    fig.tight_layout()
    save_path = os.path.join(output_path, f'{timestamp}_quality_ph_flags.jpg')
    plt.savefig(save_path, dpi=300, format='jpeg')
    plt.show()
    # Release the figure so repeated diagnostics do not accumulate open figures.
    plt.close(fig)
    print(f"Quality flag plot saved: {save_path}")

    # Print summary counts; percentages are relative to all selected photons,
    # including those without a flag.
    print("\nquality_ph flag summary:")
    for flag in unique_flags:
        count = int((int_flags == flag).sum())
        _, label = quality_colors.get(flag, ('', f'unknown ({flag})'))
        pct = 100.0 * count / len(dataset)
        print(f" {label:35s}: {count:>8,d} ({pct:.1f}%)")
def plot_bin_polygon_data(lat, height, seafloor_height, mask, lat_bins, height_bins, valid_bins, polygons, y_min, y_max):
    """
    Show two side-by-side panels: all photons, and the masked region of interest.

    Both panels colour the photons by height and overlay the seafloor profile
    sorted by latitude. The right panel additionally draws the polygon
    outlines and dashed grid lines at the bin edges.

    Parameters:
        lat, height (array): Photon latitude and height.
        seafloor_height (array): Seafloor height per photon, indexable by an
            integer index array.
        mask (array of bool): Selects the photons shown in the right panel.
        lat_bins, height_bins (iterable): Positions of the vertical / horizontal
            grid lines.
        valid_bins: Not used by this function.
        polygons (iterable): Vertex arrays, one per polygon outline.
        y_min, y_max (float): Y-axis limits of both panels.
    """

    def _overlay_seafloor():
        # Seafloor profile ordered by latitude
        plt.plot(np.sort(lat), seafloor_height[np.argsort(lat)], 'r-', label='Seafloor')

    def _decorate_panel():
        # Colour bar, axis labels, limits and legend shared by both panels
        plt.colorbar(label='Height')
        plt.xlabel('Latitude')
        plt.ylabel('Height')
        plt.ylim(y_min, y_max)
        plt.legend()

    plt.figure(figsize=(12, 6))

    # Left panel: every photon
    plt.subplot(1, 2, 1)
    plt.title("Original Height Data")
    plt.scatter(lat, height, c=height, cmap='viridis', s=10)
    _overlay_seafloor()
    _decorate_panel()

    # Right panel: photons inside the mask plus polygons and bin grid
    plt.subplot(1, 2, 2)
    plt.title("Masked Region (ROI)")
    roi_height = height[mask]
    plt.scatter(lat[mask], roi_height, c=roi_height, cmap='viridis', s=10)
    _overlay_seafloor()

    for outline in polygons:
        plt.gca().add_patch(plt.Polygon(outline, edgecolor='red', facecolor='none', linewidth=1))

    for lat_edge in lat_bins:
        plt.axvline(x=lat_edge, color='gray', linestyle='--', linewidth=0.5)
    for height_edge in height_bins:
        plt.axhline(y=height_edge, color='gray', linestyle='--', linewidth=0.5)

    _decorate_panel()

    plt.tight_layout()
    plt.show()
sea_surface_label_df = pd.DataFrame({'x': sea_surface_x_axis_bins, 'y': sea_surface_label}) + idx_1 = np.where(sea_surface_label_df.y == 1) + idx_0 = np.where(sea_surface_label_df.y == 0) + + # Define figure size + fig = plt.rcParams["figure.figsize"] = (40, 25) + + # Plot raw points + # plt.scatter(x=binned_data.lat, + # y = binned_data.photon_height, marker='o', lw=0, s=1, alpha = 0.8, + # c = 'yellow', label = 'Raw photon height') + plt.scatter(ref_y, ref_z, s=0.5, alpha=0.1, c='black') + plt.scatter(geo_df.lat, geo_df.photon_height, s=0.8, marker = 'o', + alpha=0.1, c='red', label='Classified Photons') + + # plt.scatter(x=geo_df.lat, + # y = geo_df.photon_height, marker='o', lw=0, s=0.8, + # alpha = 0.8, c = 'black', label = 'Corrected photon bin') + + # Plot median values + plt.scatter(bath_median_df.x, bath_median_df.y, + marker='o', c='r', alpha=0.8, s=2, label='Median bathymetry') + + plt.scatter(sea_median_df.x, sea_median_df.y, + marker='o', c='b', alpha=1, s=2, label='Median sea surface') + + plt.scatter(sea_surface_label_df.iloc[idx_1].x, sea_surface_label_df.iloc[idx_1].y, + marker='o', c='pink', alpha=1, s=3, label='solo_sea_surface') + plt.scatter(sea_surface_label_df.iloc[idx_0].x, sea_surface_label_df.iloc[idx_0].y, + marker='o', c='g', alpha=1, s=3, label='non_solo_sea_surface') + + # Insert titles and subtitles + plt.title('Icesat2 Bathymetry\n' + file) + plt.xlabel('Latitude', fontsize=25) + plt.ylabel('Photon Height (m)', fontsize=25) + plt.xticks(fontsize=16) + plt.yticks(fontsize=16) + + plt.legend(loc="upper left", prop={'size': 20}) + + # Limit the x and y axes using parameters + plt.xlim(left=binned_data.lat.min(), right=binned_data.lat.max()) + plt.ylim(top=y_limit_top, bottom=y_limit_bottom) + + timestr = time.strftime("%Y%m%d_%H%M%S") + file = file.replace('.h5', '') + # Define where to save file + plt.tight_layout() + plt.savefig("C:/Workstation/ICESat2_HLS/" + file + '_gt' + + str(beam) + '_' + str(percentile) + + '_EPSG' + 
str(epsg_num) + '_' + timestr + ".pdf") + # plt.show() + # plt.close() + + # convert corrected locations back to wgs84 (useful to contain) + transformer = Transformer.from_crs("EPSG:" + str(epsg_num), + "EPSG:4326", always_xy=True) + print(transformer) + lon_wgs84, lat_wgs84 = transformer.transform( + geo_df.lon.values, geo_df.lat.values) + + geo_df['lon_wgs84'] = lon_wgs84 + geo_df['lat_wgs84'] = lat_wgs84 + + geodf = gpd.GeoDataFrame(geo_df, + geometry=gpd.points_from_xy(geo_df.lon_wgs84, + geo_df.lat_wgs84)) + + geodf.set_crs(epsg=4326, inplace=True) + + # geodf.to_file("C:/Workstation/ICESat2_HLS/" +file + '_gt' + '_' + str(percentile) + '_EPSG' + + # str(epsg_num) + '_' + timestr + ".gpkg", + # driver="GPKG") \ No newline at end of file diff --git a/icesat-2_kdph_py/main.py b/icesat-2_kdph_py/main.py new file mode 100644 index 0000000..bf046f3 --- /dev/null +++ b/icesat-2_kdph_py/main.py @@ -0,0 +1,288 @@ +import glob +import logging +import os +import re +import sys + +import pandas as pd + +from config import get_args +from kd_utils.Kd_analysis import process_kd_calculation +from kd_utils.bathy_processing import process_subsurface_photon_filtering +from kd_utils.data_processing import ( + Extract_sea_photons, + apply_optional_ir_ap_filter, + apply_optional_solar_background_filter, + filter_photon_dataset_by_hull_area, + load_data, +) +from kd_utils.sea_photons_analysis import process_sea_photon_binning +from kd_utils.visualization import plot_convex_hulls, plot_kd_photons, plot_photon_quality_flags + + +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + + +PAIR_ID_MAP = { + 'gt1l': 'gt1_pair', + 'gt1r': 'gt1_pair', + 'gt2l': 'gt2_pair', + 'gt2r': 'gt2_pair', + 'gt3l': 'gt3_pair', + 'gt3r': 'gt3_pair', +} + + +def get_target_beams(all_beams, beam_attrs, target_beams_arg): + strong_beams = [ + gtx for gtx in all_beams + if beam_attrs[gtx]['atlas_beam_type'].decode('utf-8') == 
'strong' + ] + if not target_beams_arg: + return strong_beams[:1] # auto-select first strong beam + + requested = [b.strip() for b in target_beams_arg.split(',') if b.strip()] + return [b for b in requested if b in strong_beams] + + +def optionally_combine_paired_beams_for_kd(dataset, horizontal_res, enabled=False): + """ + Optionally combine left/right beams into pair groups before Kd fitting. + Uses shared along-track bins from relative_AT_dist when available. + + Flowchart step: 15 — Combine data from paired beams if appropriate (not + recommended in optically complex water). + """ + if (not enabled) or dataset.empty: + return dataset + + combined = dataset.copy() + combined['source_beam_id'] = combined['beam_id'] + combined['beam_id'] = combined['beam_id'].map(PAIR_ID_MAP).fillna(combined['beam_id']) + + if 'relative_AT_dist' in combined.columns: + rel_m = pd.to_numeric(combined['relative_AT_dist'], errors='coerce') * 1000.0 + pair_bins = (rel_m / max(horizontal_res, 1)).astype('Int64') + pair_bins = pair_bins.fillna(-1).astype(int) + combined['lat_bins'] = pair_bins + + return combined + + +def run_pipeline(args): + """ + Top-level pipeline orchestrator. Executes all 19 flowchart steps in sequence. + + Flowchart steps directly handled here: + 1 — Load ATL03 data (load_data / Extract_sea_photons). + 2 — Remove afterpulse/impulse-response photons (apply_optional_ir_ap_filter). + 3 — Bin sizes set via args.horizontal_res / args.vertical_res. + 4 — Build histograms (process_sea_photon_binning). + 5 — Solar background filter (apply_optional_solar_background_filter). + 6–14 — Subsurface filtering chain (process_subsurface_photon_filtering). + 15 — Paired beam combine (optionally_combine_paired_beams_for_kd). + 16–17 — Beer's Law Kd fit (process_kd_calculation). + 18 — Save Kd CSV output (subsurface_photon_df_added_kd.to_csv). + 19 — Check data for reasonableness (filter_photon_dataset_by_hull_area + + plot_kd_photons). 
+ """ + atl03_h5_file_path = os.path.join(args.workspace_path, args.atl03_path, args.atl03_file) + shoreline_data_path = os.path.join(args.workspace_path, args.other_data_path, args.shoreline_data) + gebco_full_path = os.path.join(args.workspace_path, args.other_data_path, args.gebco_path) + atl24_file_path = args.atl24_file + if atl24_file_path and (not os.path.isabs(atl24_file_path)): + atl24_file_path = os.path.join(args.workspace_path, atl24_file_path) + output_path = os.path.join(args.workspace_path, args.output_path) + os.makedirs(output_path, exist_ok=True) + + match = re.search(r"_(\d{14})_", atl03_h5_file_path) + timestamp = match.group(1) if match else "unknown" + + version_match = re.search(r"ATL03_\d{14}_\d{8}_(\d{3})_\d{2}", os.path.basename(atl03_h5_file_path)) + atl03_version = int(version_match.group(1)) if version_match else None + logger.info("ATL03 version detected: %s", f"{atl03_version:03d}" if atl03_version else "unknown") + + if args.enable_ir_ap_filter: + if atl03_version is None or atl03_version < 7: + logger.error( + "IR/AP filter requires ATL03 version 007 or later. " + "Detected version: %s. " + "Skipping IR/AP filter for this run. " + "Switch to a version 007 file (e.g., Wax Delta dataset) to enable this step.", + f"{atl03_version:03d}" if atl03_version else "unknown", + ) + args.enable_ir_ap_filter = False + + gebco_pattern = os.path.join(gebco_full_path, "gebco_*.tif") + gebco_file_path_lists = [p for p in glob.glob(gebco_pattern)] + + is2_mds, is2_attrs, is2_beams = load_data(atl03_h5_file_path, False) + target_strong_beams = get_target_beams(is2_beams, is2_attrs, args.target_beams) + if not target_strong_beams: + logger.error("No target strong beams found. 
Check input file and --target_beams.") + sys.exit(1) + logger.info("Strong beams selected: %s", target_strong_beams) + plot_target_beam = [target_strong_beams[0]] + + sea_photon_dataset = Extract_sea_photons(is2_mds, target_strong_beams, shoreline_data_path) + + if args.enable_ir_ap_filter and not args.no_plot: + plot_photon_quality_flags(output_path, timestamp, sea_photon_dataset, plot_target_beam) + + sea_photon_dataset = apply_optional_ir_ap_filter( + sea_photon_dataset, + enabled=args.enable_ir_ap_filter, + quality_max=args.ir_ap_quality_max, + min_signal_conf=args.ir_ap_min_signal_conf, + ) + + # Step 5 — Solar background filter is ON by default. + # It self-gates on solar_elevation so has zero effect on nighttime passes. + # Use --disable_solar_background_filter to turn it off. + solar_bg_enabled = not args.disable_solar_background_filter + if not solar_bg_enabled: + if 'solar_elevation' in sea_photon_dataset.columns: + max_solar_elev = sea_photon_dataset['solar_elevation'].max() + if max_solar_elev > args.solar_elevation_day_threshold: + logger.warning( + "Solar background filter is DISABLED via --disable_solar_background_filter " + "but this granule is a daytime pass " + "(max solar elevation = %.1f°, daytime threshold = %.1f°). 
" + "This filter is recommended for daytime passes.", + max_solar_elev, + args.solar_elevation_day_threshold, + ) + + sea_photon_dataset = apply_optional_solar_background_filter( + sea_photon_dataset, + enabled=solar_bg_enabled, + day_threshold=args.solar_elevation_day_threshold, + median_window_deg=args.solar_bg_median_window_deg, + noise_multiplier=args.solar_bg_noise_multiplier, + min_signal_conf=args.solar_background_min_signal_conf, + ) + + binned_dataset_sea_surface = process_sea_photon_binning( + sea_photon_dataset, horizontal_res=args.horizontal_res, vertical_res=args.vertical_res + ) + + post_refraction_refit_enabled = args.enable_post_refraction_refit + if post_refraction_refit_enabled and (not args.enable_refraction_correction): + logger.warning( + "--enable_post_refraction_refit requested without --enable_refraction_correction. " + "The post-refraction refit step will be skipped." + ) + post_refraction_refit_enabled = False + + sea_surface_height, sea_surface_label, filtered_seafloor_subsurface_photon_dataset = \ + process_subsurface_photon_filtering( + binned_dataset_sea_surface, + gebco_file_path_lists, + args.subsurface_thresh, + args.ignore_subsurface_height_thres, + use_atl24_filter=args.enable_atl24_filter, + atl24_file_path=atl24_file_path, + atl24_max_match_distance_deg=args.atl24_max_match_distance_deg, + use_gebco_filter=args.enable_gebco_filter, + apply_histogram_quality_filter=args.enable_histogram_quality_filter, + histogram_quality_min_ratio=args.histogram_quality_min_ratio, + histogram_quality_depth_min=args.histogram_quality_depth_min, + histogram_quality_depth_max=args.histogram_quality_depth_max, + histogram_quality_ref_depth_min=args.histogram_quality_ref_depth_min, + histogram_quality_ref_depth_max=args.histogram_quality_ref_depth_max, + apply_surface_sigma_filter=args.enable_surface_sigma_filter, + surface_sigma_max=args.surface_sigma_max, + apply_refraction_correction=args.enable_refraction_correction, + 
refraction_water_temp_c=args.refraction_water_temp_c, + refraction_wavelength_nm=args.refraction_wavelength_nm, + apply_post_refraction_refit=post_refraction_refit_enabled, + apply_flattening=args.enable_sea_surface_flattening, + flattening_window_m=args.sea_surface_flattening_window_m, + horizontal_res=args.horizontal_res, + vertical_res=args.vertical_res, + ) + + final_filtered_subsurface_photon_dataset = filtered_seafloor_subsurface_photon_dataset[ + filtered_seafloor_subsurface_photon_dataset['beam_id'].isin(target_strong_beams) + ].copy() + + if args.enable_convex_hull_filter: + final_filtered_subsurface_photon_dataset, convex_hull_areas, convex_hulls = \ + filter_photon_dataset_by_hull_area( + final_filtered_subsurface_photon_dataset, + hull_area_threshold=args.convex_hull_area_threshold + ) + if plot_target_beam and not args.no_plot: + plot_convex_hulls( + final_filtered_subsurface_photon_dataset, + plot_target_beam, + convex_hulls, + convex_hull_areas + ) + + subsurface_output_path = os.path.join( + output_path, + f"{timestamp}_strongBeam_{'_'.join(target_strong_beams)}_subsurface_photons.csv", + ) + final_filtered_subsurface_photon_dataset.to_csv(subsurface_output_path, index=False) + + kd_input_dataset = optionally_combine_paired_beams_for_kd( + final_filtered_subsurface_photon_dataset, + horizontal_res=args.horizontal_res, + enabled=args.enable_paired_beam_combine + ) + wave_mult = args.wave_exclusion_multiplier if args.enable_wave_adaptive_fit else 0.0 + subsurface_photon_df_added_kd = process_kd_calculation( + kd_input_dataset, decay_zone_threshold=args.decay_zone_threshold, + kd_fit_method=args.kd_fit_method, + wave_exclusion_multiplier=wave_mult, + wave_sigma_calm_threshold=args.wave_sigma_calm_threshold, + ) + kd_output_path = os.path.join(output_path, f"{timestamp}_AddedKdDataset_strongBeams_Further.csv") + subsurface_photon_df_added_kd.to_csv(kd_output_path, index=False) + + if not args.no_plot and 'relative_AT_dist' in kd_input_dataset.columns: + 
unique_photon_dataset = kd_input_dataset[ + ['relative_AT_dist', 'lat_bins', 'photon_height'] + ].drop_duplicates() + unique_photon_dataset['relative_AT_dist_center'] = unique_photon_dataset.groupby( + 'lat_bins', observed=False + )['relative_AT_dist'].transform('mean') + + _closest_rows = [] + for _lat_bin, _group in unique_photon_dataset.groupby('lat_bins', observed=False): + if _group.empty: + continue + _center = _group['relative_AT_dist_center'].iloc[0] + _group = _group.copy() + _group['dist_to_center'] = abs(_group['relative_AT_dist'] - _center) + _closest_rows.append(_group.loc[[_group['dist_to_center'].idxmin()]]) + if not _closest_rows: + logger.warning("No along-track bins survived filtering. Skipping plot.") + else: + closest_to_center = pd.concat(_closest_rows).reset_index(drop=True) + kd_df_merged_distance = closest_to_center.merge( + subsurface_photon_df_added_kd, + on='lat_bins', + how='left' + ).drop(columns=['relative_AT_dist_center', 'dist_to_center'], errors='ignore') + + plot_kd_photons( + output_path, + timestamp, + plot_target_beam, + kd_input_dataset, + kd_df_merged_distance, + ) + + logger.info("SUCCESS! 
Kd output: %s", kd_output_path) + + +if __name__ == '__main__': + cli_args = get_args() + try: + run_pipeline(cli_args) + except Exception as e: + logger.error("An error occurred: %s", e) + raise From 782d205813932870a86d9a681468eed2d6bd2184 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 30 Mar 2026 18:10:34 +0000 Subject: [PATCH 05/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .github/dependabot.yml | 2 +- .gitignore | 2 +- .pre-commit-config.yaml | 2 +- CITATION.cff | 2 +- CONTRIBUTORS.md | 2 +- README.md | 3 +- aok/core/kd_utils/Kd_analysis.py | 177 ++-- aok/core/kd_utils/README.md | 1 - aok/core/kd_utils/SeaSurfaceFlattening.py | 593 +++++++----- .../kd_utils/SeafloorFilterByGEBCO_Module.py | 66 +- .../IS2_BG_Rate_Land_Ocean_plot.py | 741 ++++++++++----- .../analyze_nighttime_BG_rate_icesat2.py | 764 +++++++++++----- .../analyze_solarbckgrd_icesat2.py | 715 ++++++++++----- aok/core/kd_utils/SolarBckgrd/config.py | 130 ++- aok/core/kd_utils/__init__.py | 20 +- aok/core/kd_utils/bathy_processing.py | 815 ++++++++++------- aok/core/kd_utils/config.py | 402 ++++++--- aok/core/kd_utils/data_processing.py | 823 +++++++++-------- aok/core/kd_utils/interpolation.py | 8 +- aok/core/kd_utils/kd_utils.py | 847 +++++++++++------- aok/core/kd_utils/main.py | 174 ++-- aok/core/kd_utils/sea_photons_analysis.py | 340 ++++--- aok/core/kd_utils/sliderule_adapter.py | 173 ++-- aok/core/kd_utils/visualization.py | 401 ++++++--- code_of_conduct.md | 2 +- icesat-2_kdph_py/config.py | 506 +++++++---- icesat-2_kdph_py/kd_utils/Kd_analysis.py | 460 ++++++---- .../kd_utils/SeafloorFilterByGEBCO_Module.py | 66 +- icesat-2_kdph_py/kd_utils/__init__.py | 20 +- icesat-2_kdph_py/kd_utils/bathy_processing.py | 582 +++++++----- icesat-2_kdph_py/kd_utils/data_processing.py | 651 ++++++++------ icesat-2_kdph_py/kd_utils/interpolation.py | 8 +- 
icesat-2_kdph_py/kd_utils/kd_utils.py | 847 +++++++++++------- .../kd_utils/sea_photons_analysis.py | 341 ++++--- icesat-2_kdph_py/kd_utils/visualization.py | 462 +++++++--- icesat-2_kdph_py/main.py | 205 +++-- pyproject.toml | 2 +- requirements.txt | 14 +- 38 files changed, 7172 insertions(+), 4197 deletions(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index fdf94da..0b84a24 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -10,4 +10,4 @@ updates: patterns: - "*" # Group all Actions updates into a single larger pull request schedule: - interval: monthly \ No newline at end of file + interval: monthly diff --git a/.gitignore b/.gitignore index 20f1040..c1d6066 100644 --- a/.gitignore +++ b/.gitignore @@ -159,4 +159,4 @@ cython_debug/ .vscode/ # icepyx -.order_restart \ No newline at end of file +.order_restart diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 98d28bc..6557e47 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -40,4 +40,4 @@ repos: ci: autoupdate_schedule: monthly -# you can run `pre-commit autoupdate` to automatically update to the latest version of hooks! \ No newline at end of file +# you can run `pre-commit autoupdate` to automatically update to the latest version of hooks! diff --git a/CITATION.cff b/CITATION.cff index c227693..128a5d7 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -18,4 +18,4 @@ keywords: - Python - open science - NASA -license: BSD-3-Clause \ No newline at end of file +license: BSD-3-Clause diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 45e63a8..4b3f90a 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -22,4 +22,4 @@ Thanks goes to these wonderful people [emoji key](https://allcontributors.org/do - \ No newline at end of file + diff --git a/README.md b/README.md index 812d929..4dcf9cc 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ Coming soon... 
# Contributing -[![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.1-4baaaa.svg)](code_of_conduct.md) +[![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.1-4baaaa.svg)](code_of_conduct.md) Our team follows the [Contributor Covenant Code of Conduct](https://www.contributor-covenant.org), version 2.1, available at [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html). @@ -38,4 +38,3 @@ Our full attribution guidelines are coming soon. We use the [All Contributors Specification](https://allcontributors.org/docs/en/specification) to recognize our incredible [contributor team](CONTRIBUTORS.md). - diff --git a/aok/core/kd_utils/Kd_analysis.py b/aok/core/kd_utils/Kd_analysis.py index 6e28301..d315da2 100644 --- a/aok/core/kd_utils/Kd_analysis.py +++ b/aok/core/kd_utils/Kd_analysis.py @@ -1,61 +1,63 @@ # utils/Kd_analysis.py # updated to perform a linear fit in log-space, just like MATLAB's polyfitn(zdepth, y, 1) for a first-order polynomial. 
+import logging + import numpy as np import pandas as pd -import logging + # from scipy.optimize import curve_fit from sklearn.linear_model import LinearRegression # def log_model(z, kd, e0): # return np.log(e0) - kd * z -## This is wrong because the input is already a bined data, +## This is wrong because the input is already a bined data, ## it is not necessary to do a histogram again # def CalculateKdFromFilteredSubsurfacePhoton(df, vertical_res=0.8): # if df.empty or 'lat_bins' not in df.columns: # return pd.DataFrame({'lat_bins': [np.nan], 'kd': [np.nan], 'e0': [np.nan], 'latitude': [np.nan], 'longitude': [np.nan]}) - + # # Get the latitude bin value # lat_bin_value = df['lat_bins'].iloc[0] if not df['lat_bins'].empty else np.nan # latitude = df['latitude'].mean() if 'latitude' in df.columns else df['lat'].mean() if 'lat' in df.columns else np.nan # longitude = df['longitude'].mean() if 'longitude' in df.columns else df['lon'].mean() if 'lon' in df.columns else np.nan - + # # Calculate photon height range # photon_height_min = df['photon_height'].min() # photon_height_max = df['photon_height'].max() - + # # Check for sufficient data range # if np.isnan(photon_height_min) or np.isnan(photon_height_max) or photon_height_min == photon_height_max: # return pd.DataFrame({'lat_bins': [lat_bin_value], 'kd': [np.nan], 'e0': [np.nan]}) - -# height_bins_range = abs(photon_height_max - photon_height_min) + +# height_bins_range = abs(photon_height_max - photon_height_min) # height_bins_number = round(height_bins_range / vertical_res) - + # # Ensure there are enough bins # if height_bins_number < 5: # return pd.DataFrame({'lat_bins': [lat_bin_value], 'kd': [np.nan], 'e0': [np.nan]}) - + # # Create histogram of photon heights # bin_edges = np.linspace(photon_height_min, photon_height_max, num=height_bins_number) # counts, _ = np.histogram(df['photon_height'], bins=bin_edges) # bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2 - + # # Store histogram data # hist_df = 
pd.DataFrame({'zdepth': bin_centers, 'photon_counts': counts}) - + # # x value for model # # Reverse zdepth to align with the MATLAB approach # hist_df['zdepth'] = hist_df['zdepth'].max() - hist_df['zdepth'] - + # # Log-transform photon counts, replacing zeros with NaN # # y value for model # hist_df['log_photon_counts'] = np.log(hist_df['photon_counts'].replace(0, np.nan)) # hist_df.loc[np.isinf(hist_df['log_photon_counts']), 'log_photon_counts'] = np.nan - + # # Filter out rows with NaNs in either column for regression # valid_data = hist_df.dropna(subset=['zdepth', 'log_photon_counts']) - + # # Check for enough valid data points # # Skip the regression if there are fewer than 5 datapoints # if valid_data['log_photon_counts'].notna().sum() > 3: @@ -66,22 +68,22 @@ # # Perform linear regression # model = LinearRegression() # model.fit(zdepth_valid, log_counts_valid) - + # # Extract kd as the negative of the slope and e0 from the intercept # kd = -model.coef_[0] - + # print('zdepth_valid:',zdepth_valid) # print('photon_counts:',hist_df['photon_counts']) # print('kd:',kd) - + # e0 = np.exp(model.intercept_) - + # # Set kd to NaN if negative # if kd < 0: # kd = np.nan # else: # kd, e0 = np.nan, np.nan - + # return pd.DataFrame({ # 'lat_bins': [lat_bin_value], # 'kd': [kd], @@ -95,47 +97,47 @@ # def CalculateKdFromFilteredSubsurfacePhoton(df, vertical_res=0.25): # if df.empty or 'lat_bins' not in df.columns: # return pd.DataFrame({'lat_bins': [np.nan], 'kd': [np.nan], 'e0': [np.nan], 'latitude': [np.nan], 'longitude': [np.nan]}) - + # # Get the latitude bin value # lat_bin_value = df['lat_bins'].iloc[0] if not df['lat_bins'].empty else np.nan # latitude = df['latitude'].mean() if 'latitude' in df.columns else df['lat'].mean() if 'lat' in df.columns else np.nan # longitude = df['longitude'].mean() if 'longitude' in df.columns else df['lon'].mean() if 'lon' in df.columns else np.nan - + # # Calculate photon height range # photon_height_min = df['photon_height'].min() # 
photon_height_max = df['photon_height'].max() - + # # Check for sufficient data range # if np.isnan(photon_height_min) or np.isnan(photon_height_max) or photon_height_min == photon_height_max: # return pd.DataFrame({'lat_bins': [lat_bin_value], 'kd': [np.nan], 'e0': [np.nan]}) - -# height_bins_range = abs(photon_height_max - photon_height_min) + +# height_bins_range = abs(photon_height_max - photon_height_min) # height_bins_number = round(height_bins_range / vertical_res) - + # # Ensure there are enough bins # if height_bins_number < 5: # return pd.DataFrame({'lat_bins': [lat_bin_value], 'kd': [np.nan], 'e0': [np.nan]}) - + # # Create histogram of photon heights # bin_edges = np.linspace(photon_height_min, photon_height_max, num=height_bins_number) # counts, _ = np.histogram(df['photon_height'], bins=bin_edges) # bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2 - + # # Store histogram data # hist_df = pd.DataFrame({'zdepth': bin_centers, 'photon_counts': counts}) - + # # x value for model # # Reverse zdepth to align with the MATLAB approach # hist_df['zdepth'] = hist_df['zdepth'].max() - hist_df['zdepth'] - + # # Log-transform photon counts, replacing zeros with NaN # # y value for model # hist_df['log_photon_counts'] = np.log(hist_df['photon_counts'].replace(0, np.nan)) # hist_df.loc[np.isinf(hist_df['log_photon_counts']), 'log_photon_counts'] = np.nan - + # # Filter out rows with NaNs in either column for regression # valid_data = hist_df.dropna(subset=['zdepth', 'log_photon_counts']) - + # # Check for enough valid data points # # Skip the regression if there are fewer than 5 datapoints # if valid_data['log_photon_counts'].notna().sum() > 3: @@ -146,22 +148,22 @@ # # Perform linear regression # model = LinearRegression() # model.fit(zdepth_valid, log_counts_valid) - + # # Extract kd as the negative of the slope and e0 from the intercept # kd = -model.coef_[0] - + # print('zdepth_valid:',zdepth_valid) # print('photon_counts:',hist_df['photon_counts']) # 
print('kd:',kd) - + # e0 = np.exp(model.intercept_) - + # # Set kd to NaN if negative # if kd < 0: # kd = np.nan # else: # kd, e0 = np.nan, np.nan - + # return pd.DataFrame({ # 'lat_bins': [lat_bin_value], # 'kd': [kd], @@ -174,68 +176,91 @@ # another solution is to calculate kd without hist def CalculateKdFromFilteredSubsurfacePhoton(df, vertical_res=0.8): # Early exit if DataFrame is empty or missing required column - if df.empty or 'lat_bins' not in df.columns: - return pd.DataFrame({'lat_bins': [np.nan], 'kd': [np.nan], 'e0': [np.nan], 'latitude': [np.nan], 'longitude': [np.nan]}) - + if df.empty or "lat_bins" not in df.columns: + return pd.DataFrame( + { + "lat_bins": [np.nan], + "kd": [np.nan], + "e0": [np.nan], + "latitude": [np.nan], + "longitude": [np.nan], + } + ) + # Retrieve latitude and longitude - lat_bin_value = df['lat_bins'].iloc[0] if not df['lat_bins'].empty else np.nan - latitude = df['latitude'].mean() if 'latitude' in df.columns else df['lat'].mean() if 'lat' in df.columns else np.nan - longitude = df['longitude'].mean() if 'longitude' in df.columns else df['lon'].mean() if 'lon' in df.columns else np.nan - + lat_bin_value = df["lat_bins"].iloc[0] if not df["lat_bins"].empty else np.nan + latitude = ( + df["latitude"].mean() + if "latitude" in df.columns + else df["lat"].mean() + if "lat" in df.columns + else np.nan + ) + longitude = ( + df["longitude"].mean() + if "longitude" in df.columns + else df["lon"].mean() + if "lon" in df.columns + else np.nan + ) + # Use value_counts to get photon counts in each height bin - height_counts = df['height_bins'].value_counts().sort_index() + height_counts = df["height_bins"].value_counts().sort_index() bin_centers = height_counts.index.astype(float) - + # Create a DataFrame for the height bins and counts - hist_df = pd.DataFrame({'zdepth': bin_centers, 'photon_counts': height_counts.values}) - + hist_df = pd.DataFrame( + {"zdepth": bin_centers, "photon_counts": height_counts.values} + ) + # Reverse 
zdepth for model alignment - hist_df['zdepth'] = hist_df['zdepth'].max() - hist_df['zdepth'] - + hist_df["zdepth"] = hist_df["zdepth"].max() - hist_df["zdepth"] + # Log-transform photon counts, replacing zeros with NaN for regression - hist_df['log_photon_counts'] = np.log(hist_df['photon_counts'].replace(0, np.nan)) - hist_df.loc[np.isinf(hist_df['log_photon_counts']), 'log_photon_counts'] = np.nan - + hist_df["log_photon_counts"] = np.log(hist_df["photon_counts"].replace(0, np.nan)) + hist_df.loc[np.isinf(hist_df["log_photon_counts"]), "log_photon_counts"] = np.nan + # Filter for rows without NaNs for regression - valid_data = hist_df.dropna(subset=['zdepth', 'log_photon_counts']) - + valid_data = hist_df.dropna(subset=["zdepth", "log_photon_counts"]) + # Perform regression if there are sufficient valid data points - if valid_data['log_photon_counts'].notna().sum() > 3: - zdepth_valid = valid_data['zdepth'].values.reshape(-1, 1) - log_counts_valid = valid_data['log_photon_counts'].values + if valid_data["log_photon_counts"].notna().sum() > 3: + zdepth_valid = valid_data["zdepth"].values.reshape(-1, 1) + log_counts_valid = valid_data["log_photon_counts"].values # Perform linear regression model = LinearRegression() model.fit(zdepth_valid, log_counts_valid) - + # Calculate kd and e0 kd = -model.coef_[0] e0 = np.exp(model.intercept_) - + # Set kd to NaN if negative if kd < 0: kd = np.nan else: kd, e0 = np.nan, np.nan - - return pd.DataFrame({ - 'lat_bins': [lat_bin_value], - 'kd': [kd], - 'e0': [e0], - 'latitude': [latitude], - 'longitude': [longitude] - }) + + return pd.DataFrame( + { + "lat_bins": [lat_bin_value], + "kd": [kd], + "e0": [e0], + "latitude": [latitude], + "longitude": [longitude], + } + ) # Original kd calculation function remains unchanged def calculate_kd(filtered_seafloor_subsurface_photon_dataset): logging.info("Calculating Kd from filtered subsurface photon dataset") - SubsurfacePhotonDFAddedKd = filtered_seafloor_subsurface_photon_dataset\ - 
.groupby('lat_bins', observed=False)\ - .apply(CalculateKdFromFilteredSubsurfacePhoton, include_groups=True) - + SubsurfacePhotonDFAddedKd = filtered_seafloor_subsurface_photon_dataset.groupby( + "lat_bins", observed=False + ).apply(CalculateKdFromFilteredSubsurfacePhoton, include_groups=True) - # Remove the index without resetting it if 'lat_bins' already exists + # Remove the index without resetting it if 'lat_bins' already exists SubsurfacePhotonDFAddedKd = SubsurfacePhotonDFAddedKd.droplevel(0) return SubsurfacePhotonDFAddedKd @@ -246,14 +271,16 @@ def process_kd_calculation(Final_filtered_subsurface_photon_dataset): kd_beam_datasets = [] # Group by 'beam_id' to process each beam independently - for beam_id, beam_data in Final_filtered_subsurface_photon_dataset.groupby('beam_id'): + for beam_id, beam_data in Final_filtered_subsurface_photon_dataset.groupby( + "beam_id" + ): logging.info(f"Calculating Kd for beam: {beam_id}") # Apply the calculate_kd function to the current beam's dataset SubsurfacePhotonDFAddedKd = calculate_kd(beam_data) # Add a column to track the beam_id in the results - SubsurfacePhotonDFAddedKd['beam_id'] = beam_id + SubsurfacePhotonDFAddedKd["beam_id"] = beam_id # Append the result to the list kd_beam_datasets.append(SubsurfacePhotonDFAddedKd) @@ -261,4 +288,4 @@ def process_kd_calculation(Final_filtered_subsurface_photon_dataset): # Combine results from all beams into a single DataFrame combined_kd_dataset = pd.concat(kd_beam_datasets, ignore_index=True) - return combined_kd_dataset \ No newline at end of file + return combined_kd_dataset diff --git a/aok/core/kd_utils/README.md b/aok/core/kd_utils/README.md index 28f6ee4..8cefa84 100644 --- a/aok/core/kd_utils/README.md +++ b/aok/core/kd_utils/README.md @@ -195,4 +195,3 @@ Eidam, E.F., K. Bisson, C. Wang, C. Walker, and A. Gibbons (2024). ICESat-2 and ### License This project is licensed under the GNU GPLv3 license. 
- diff --git a/aok/core/kd_utils/SeaSurfaceFlattening.py b/aok/core/kd_utils/SeaSurfaceFlattening.py index 6b3d64f..1c02e21 100644 --- a/aok/core/kd_utils/SeaSurfaceFlattening.py +++ b/aok/core/kd_utils/SeaSurfaceFlattening.py @@ -1,36 +1,34 @@ -# -*- coding: utf-8 -*- """ Created on Thu Jul 25 13:25:53 2024 @author: wayne """ -from kd_utils import * import scipy.interpolate +from kd_utils import * -path = 'C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/Dataset/ATL03_ICESat2/' +path = "C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/Dataset/ATL03_ICESat2/" ATL03_h5_file = "processed_ATL03_20220122044818_04721407_006_01.h5" ATL03_h5_file_path = path + ATL03_h5_file # Set the shoreline data path -Current_Path='C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/Dataset/' - -#load the shoreline package -shoreline_data_path = Current_Path+'Shorelines/GeoPkgGlobalShoreline.gpkg' +Current_Path = "C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/Dataset/" +# load the shoreline package +shoreline_data_path = Current_Path + "Shorelines/GeoPkgGlobalShoreline.gpkg" # speed of light c = 299792458.0 # bin data first based on horizontal resolution -horizontal_res=10 +horizontal_res = 10 # vertical resolution # vertical_res=0.1 -vertical_res=0.25 +vertical_res = 0.25 # vertical_res=1 # sea subsurface thresh @@ -44,16 +42,22 @@ ######################################## -#readATL03_github -IS2_atl03_mds,IS2_atl03_attrs,IS2_atl03_beams = read_granule(ATL03_h5_file_path,ATTRIBUTES=True) +# readATL03_github +IS2_atl03_mds, IS2_atl03_attrs, IS2_atl03_beams = read_granule( + ATL03_h5_file_path, ATTRIBUTES=True +) # extract parameters from ICESat-2 ATLAS HDF5 file name -rx = re.compile(r'(processed_)?(ATL\d{2})_(\d{4})(\d{2})(\d{2})(\d{2})' - r'(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})_(\d{3})_(\d{2})(.*?).h5$') -SUB,PRD,YY,MM,DD,HH,MN,SS,TRK,CYCL,GRAN,RL,VERS,AUX = rx.findall(ATL03_h5_file_path).pop() +rx = re.compile( + r"(processed_)?(ATL\d{2})_(\d{4})(\d{2})(\d{2})(\d{2})" + 
r"(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})_(\d{3})_(\d{2})(.*?).h5$" +) +SUB, PRD, YY, MM, DD, HH, MN, SS, TRK, CYCL, GRAN, RL, VERS, AUX = rx.findall( + ATL03_h5_file_path +).pop() -# initialize data storage variables +# initialize data storage variables # variables of interest for generating three types classification Segment_ID = {} Segment_Index_begin = {} @@ -62,7 +66,7 @@ Equator_Segment_Distance = {} Segment_Length = {} -Segment_Is_Land={} +Segment_Is_Land = {} # mean geolocation, height and delta time Segment_Lon = {} @@ -70,286 +74,325 @@ Segment_Elev = {} Segment_Time = {} -Segment_ref_elev= {} -Segment_ref_azimuth= {} +Segment_ref_elev = {} +Segment_ref_azimuth = {} # relative_AT_dist = {} # relative_seg_dist = {} # background photon rate background_rate = {} -background_counts={} - +background_counts = {} ######################################## -#proc02_manually_select_water_forGitHub -#remove water surface based on simple 0.5 m +# proc02_manually_select_water_forGitHub +# remove water surface based on simple 0.5 m -#raw beam-by-beam photon data after cutoff below the max value 0.5 below peak -IS2_atl03Cut50cmBelowPeak={} +# raw beam-by-beam photon data after cutoff below the max value 0.5 below peak +IS2_atl03Cut50cmBelowPeak = {} # for each input beam within the file for gtx in sorted(IS2_atl03_beams[0:1]): -# for gtx in sorted(IS2_atl03_beams): + # for gtx in sorted(IS2_atl03_beams): # data and attributes for beam gtx IS2_val = IS2_atl03_mds[gtx] IS2_attrs = IS2_atl03_attrs[gtx] - + # ATL03 Segment ID - Segment_ID[gtx] = IS2_val['geolocation']['segment_id'] + Segment_ID[gtx] = IS2_val["geolocation"]["segment_id"] # number of valid overlapping ATL03 segments n_seg = len(Segment_ID[gtx]) # number of photon events - n_pe, = IS2_val['heights']['delta_time'].shape - + (n_pe,) = IS2_val["heights"]["delta_time"].shape + # first photon ID (1-based) in each segment (convert to 0-based indexing) - Segment_Index_begin[gtx] = IS2_val['geolocation']['ph_index_beg'] 
- 1 - + Segment_Index_begin[gtx] = IS2_val["geolocation"]["ph_index_beg"] - 1 + # number of photon events in the segment - Segment_PE_count[gtx] = IS2_val['geolocation']['segment_ph_cnt'] - + Segment_PE_count[gtx] = IS2_val["geolocation"]["segment_ph_cnt"] + # along-track distance from the equator crossing to the start of each ATL03 20 meter geolocation segment - Equator_Segment_Distance[gtx] = IS2_val['geolocation']['segment_dist_x'] - + Equator_Segment_Distance[gtx] = IS2_val["geolocation"]["segment_dist_x"] + # along-track length for each ATL03 segment - Segment_Length[gtx] = IS2_val['geolocation']['segment_length'] - + Segment_Length[gtx] = IS2_val["geolocation"]["segment_length"] + # Transmit time of the reference photon - delta_time = IS2_val['geolocation']['delta_time'] - + delta_time = IS2_val["geolocation"]["delta_time"] + # get geolocation lat/lon - segment_lat = IS2_val['geolocation']['reference_photon_lat'][:].copy() - segment_lon= IS2_val['geolocation']['reference_photon_lon'][:].copy() - + segment_lat = IS2_val["geolocation"]["reference_photon_lat"][:].copy() + segment_lon = IS2_val["geolocation"]["reference_photon_lon"][:].copy() + # get parameters ref elev and azimuth for refraction correction - ref_elev = IS2_val['geolocation']['ref_elev'][:].copy() - ref_azimuth = IS2_val['geolocation']['ref_azimuth'][:].copy() - - #get geoid for converting WGS84 ellipsoid to geoid correction - geoid = IS2_val['geophys_corr']['geoid'][:].copy() - + ref_elev = IS2_val["geolocation"]["ref_elev"][:].copy() + ref_azimuth = IS2_val["geolocation"]["ref_azimuth"][:].copy() + + # get geoid for converting WGS84 ellipsoid to geoid correction + geoid = IS2_val["geophys_corr"]["geoid"][:].copy() + # photon event heights - h_ph = IS2_val['heights']['h_ph'][:].copy() - lat_ph = IS2_val['heights']['lat_ph'][:].copy() - lon_ph = IS2_val['heights']['lon_ph'][:].copy() -# dist_ph_along = IS2_val['heights']['dist_ph_along'][:].copy() - signal_conf_photon = 
IS2_val['heights']['signal_conf_ph'][...,0].copy() + h_ph = IS2_val["heights"]["h_ph"][:].copy() + lat_ph = IS2_val["heights"]["lat_ph"][:].copy() + lon_ph = IS2_val["heights"]["lon_ph"][:].copy() + # dist_ph_along = IS2_val['heights']['dist_ph_along'][:].copy() + signal_conf_photon = IS2_val["heights"]["signal_conf_ph"][..., 0].copy() # along-track and across-track distance for photon events - x_atc = IS2_val['heights']['dist_ph_along'][:].copy() - y_atc = IS2_val['heights']['dist_ph_across'][:].copy() - + x_atc = IS2_val["heights"]["dist_ph_along"][:].copy() + y_atc = IS2_val["heights"]["dist_ph_across"][:].copy() + # photon quality - quality_ph=IS2_val['heights']['quality_ph'] - - #calculate along track distance + quality_ph = IS2_val["heights"]["quality_ph"] + + # calculate along track distance for seg_index in range(n_seg): # index for 20m segment j idx = Segment_Index_begin[gtx][seg_index] # number of photons in 20m segment cnt = Segment_PE_count[gtx][seg_index] # add segment distance to along-track coordinates - x_atc[idx:idx+cnt] += Equator_Segment_Distance[gtx][seg_index] - + x_atc[idx : idx + cnt] += Equator_Segment_Distance[gtx][seg_index] + # calculate along track distance relative to the beginning of the cut segment - relative_AT_dist=(x_atc-x_atc[0])/1000 - - - relative_seg_dist=(Equator_Segment_Distance[gtx]-Equator_Segment_Distance[gtx][0])/1000 - - + relative_AT_dist = (x_atc - x_atc[0]) / 1000 + + relative_seg_dist = ( + Equator_Segment_Distance[gtx] - Equator_Segment_Distance[gtx][0] + ) / 1000 + # this function is a significant slowdown without pygeos - Segment_Is_Land['geometry'] = gpd.points_from_xy(segment_lon, segment_lat) - - # create a geo dataframe - ICESat2_GDF = gpd.GeoDataFrame(Segment_Is_Land,crs="EPSG:4326") - - #Step1: isolate water using landmask + Segment_Is_Land["geometry"] = gpd.points_from_xy(segment_lon, segment_lat) + + # create a geo dataframe + ICESat2_GDF = gpd.GeoDataFrame(Segment_Is_Land, crs="EPSG:4326") + + # Step1: 
isolate water using landmask # spatial joint to determine land/sea mask - Segment_Is_Land_Labels=isolate_sea_land_photons(shoreline_data_path,ICESat2_GDF) - -# # set the labels back to ICESat2 geopandas dataframe - ICESat2_GDF.loc[:, 'is_land'] = Segment_Is_Land_Labels - - # interpolate is_land labels based on photon lat - island_interp1d_model = scipy.interpolate.interp1d(segment_lat[:], - Segment_Is_Land_Labels[:], - fill_value="extrapolate") - + Segment_Is_Land_Labels = isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF) + + # # set the labels back to ICESat2 geopandas dataframe + ICESat2_GDF.loc[:, "is_land"] = Segment_Is_Land_Labels + + # interpolate is_land labels based on photon lat + island_interp1d_model = scipy.interpolate.interp1d( + segment_lat[:], Segment_Is_Land_Labels[:], fill_value="extrapolate" + ) + # Apply interpid model is_land_label_interp1d = island_interp1d_model(lat_ph[:]) # Ref_elev on a per photon level (assign seg ref_elev to photons) - ref_elev_interp1d_model = scipy.interpolate.interp1d(segment_lat[:], - ref_elev[:], - fill_value="extrapolate") + ref_elev_interp1d_model = scipy.interpolate.interp1d( + segment_lat[:], ref_elev[:], fill_value="extrapolate" + ) # apply interpid model ph_ref_elev = ref_elev_interp1d_model(lat_ph[:]) # Ref_azimuth on a per photon level (assign seg ref_azimuth to photons) - ref_azimuth_interp1d_model = scipy.interpolate.interp1d(segment_lat[:], - ref_azimuth[:], - fill_value="extrapolate") - + ref_azimuth_interp1d_model = scipy.interpolate.interp1d( + segment_lat[:], ref_azimuth[:], fill_value="extrapolate" + ) + # apply interpid model ph_ref_azimuth = ref_azimuth_interp1d_model(lat_ph[:]) - - - # interpolate geoid based on segment lat - geoid_interp1d_model = scipy.interpolate.interp1d(segment_lat[:], - geoid[:], - fill_value="extrapolate") - + + # interpolate geoid based on segment lat + geoid_interp1d_model = scipy.interpolate.interp1d( + segment_lat[:], geoid[:], fill_value="extrapolate" + ) + # 
Apply interpid model ph_geoid = geoid_interp1d_model(lat_ph[:]) - - - #geoid correct + + # geoid correct h_ph_geoid_cor = h_ph[:] - ph_geoid[:] - + # Find the epsg code epsg_code = convert_wgs_to_utm(lon_ph[0], lat_ph[0]) - epsg_num = int(epsg_code.split(':')[-1]) - + epsg_num = int(epsg_code.split(":")[-1]) + # Orthometrically correct the data using the epsg code lat_utm, lon_utm, h_ph_cor = orthometric_correction(lat_ph, lon_ph, h_ph, epsg_code) # Aggregate data into dataframe - sea_photon_dataset = \ - pd.DataFrame({'latitude': lat_ph, 'longitude': lon_ph, - 'lat': lat_utm, 'lon': lon_utm, - 'photon_height': h_ph_geoid_cor, - 'quality_ph': quality_ph, - 'is_land_label': is_land_label_interp1d, - 'photon_conf': signal_conf_photon, - 'ref_elevation': ph_ref_elev, - 'ref_azimuth': ph_ref_azimuth, - 'relative_AT_dist':relative_AT_dist}, - columns=['latitude', 'longitude', - 'lat', 'lon', - 'photon_height', - 'quality_ph', - 'is_land_label', 'photon_conf', - 'ref_elevation', 'ref_azimuth', - 'relative_AT_dist']) - + sea_photon_dataset = pd.DataFrame( + { + "latitude": lat_ph, + "longitude": lon_ph, + "lat": lat_utm, + "lon": lon_utm, + "photon_height": h_ph_geoid_cor, + "quality_ph": quality_ph, + "is_land_label": is_land_label_interp1d, + "photon_conf": signal_conf_photon, + "ref_elevation": ph_ref_elev, + "ref_azimuth": ph_ref_azimuth, + "relative_AT_dist": relative_AT_dist, + }, + columns=[ + "latitude", + "longitude", + "lat", + "lon", + "photon_height", + "quality_ph", + "is_land_label", + "photon_conf", + "ref_elevation", + "ref_azimuth", + "relative_AT_dist", + ], + ) + # #remove the saturated photons # sea_photon_dataset=sea_photon_dataset[sea_photon_dataset['quality_ph'] < 1] - - #remove the land photons - sea_photon_dataset=sea_photon_dataset[sea_photon_dataset['is_land_label'] != 1] - + + # remove the land photons + sea_photon_dataset = sea_photon_dataset[sea_photon_dataset["is_land_label"] != 1] + # bin the data by 25 cm - 
binned_dataset_sea_surface=horizontal_vertical_bin_dataset(sea_photon_dataset, horizontal_res, vertical_res) - + binned_dataset_sea_surface = horizontal_vertical_bin_dataset( + sea_photon_dataset, horizontal_res, vertical_res + ) + # get sea surface height # threshold to determine how much depth below sea surface will be accounted # Output: # ->final_sea_surface_height # ->sea_surface_height_abnormal_label # ->sea_surface_dominated_label - sea_surface_height, sea_surface_label, solo_sea_surface_label, subsurface_photon_dataset = \ - get_sea_surface_height(binned_dataset_sea_surface, subsurface_thresh) - - -sea_surface_height.reverse() - + ( + sea_surface_height, + sea_surface_label, + solo_sea_surface_label, + subsurface_photon_dataset, + ) = get_sea_surface_height(binned_dataset_sea_surface, subsurface_thresh) +sea_surface_height.reverse() #################################################################### # Tasks: Sea Surface flattening -# I have a list value of sea_surface_height over each 10 m bin, I would like to +# I have a list value of sea_surface_height over each 10 m bin, I would like to # 1) calculate the mean sea_surface_height value along each big bin with 500 m ; -# 2) Within each 10-m along track increment, -# adjust all of the photons (i.e., subsurface_photon_dataset) within that 10-m column either up or down -# based on the deviation of the local sea surface height from the mean sea surface height +# 2) Within each 10-m along track increment, +# adjust all of the photons (i.e., subsurface_photon_dataset) within that 10-m column either up or down +# based on the deviation of the local sea surface height from the mean sea surface height ###################### # Step 1: Creating a DataFrame for sea_surface_height with a corresponding 10 m small bin index # each element in sea_surface_height corresponds to a 10 m bin -df_sea_surface = pd.DataFrame({ - 'sea_surface_height': sea_surface_height, # sea_surface_height is provided as a list - 'small_bins': 
np.arange(len(sea_surface_height)) # Index for each 10 m bin -}) +df_sea_surface = pd.DataFrame( + { + "sea_surface_height": sea_surface_height, # sea_surface_height is provided as a list + "small_bins": np.arange(len(sea_surface_height)), # Index for each 10 m bin + } +) # # Map 'small_bins'(i.e., 'lat_bins') to 500 m 'big_bins' -df_sea_surface['big_bins'] = df_sea_surface['small_bins'] // 50 # 50 10 m bins in each 500 m bin +df_sea_surface["big_bins"] = ( + df_sea_surface["small_bins"] // 50 +) # 50 10 m bins in each 500 m bin ###################### # Step 2: Calculate Mean Sea_surface_height value for Each 500-m Big Bin -mean_sea_surface_elevation_bigBin = df_sea_surface.groupby('big_bins')['sea_surface_height'].mean().reset_index() -mean_sea_surface_elevation_bigBin.rename(columns={'sea_surface_height': 'mean_sea_surface_height_big_bins'}, inplace=True) +mean_sea_surface_elevation_bigBin = ( + df_sea_surface.groupby("big_bins")["sea_surface_height"].mean().reset_index() +) +mean_sea_surface_elevation_bigBin.rename( + columns={"sea_surface_height": "mean_sea_surface_height_big_bins"}, inplace=True +) # Prepare subsurface_photon_dataset for merging -subsurface_photon_dataset['big_bins'] = subsurface_photon_dataset['lat_bins'].astype(int) // 50 +subsurface_photon_dataset["big_bins"] = ( + subsurface_photon_dataset["lat_bins"].astype(int) // 50 +) # Merge the mean elevation data with the subsurface_photon_dataset -subsurface_photon_dataset = subsurface_photon_dataset.merge(mean_sea_surface_elevation_bigBin,\ - on='big_bins', how='left') +subsurface_photon_dataset = subsurface_photon_dataset.merge( + mean_sea_surface_elevation_bigBin, on="big_bins", how="left" +) # Merge sea_surface_height data with subsurface_photon_dataset based on the small_bins -subsurface_photon_dataset = subsurface_photon_dataset.merge(df_sea_surface[['small_bins', 'sea_surface_height']], \ - left_on='lat_bins', right_on='small_bins', how='left') +subsurface_photon_dataset = 
subsurface_photon_dataset.merge( + df_sea_surface[["small_bins", "sea_surface_height"]], + left_on="lat_bins", + right_on="small_bins", + how="left", +) ####################### -# Step 3: Within each 10 m small bin, adjust the photon heights +# Step 3: Within each 10 m small bin, adjust the photon heights # based on the deviation between their sea surface height at each small bin and the mean sea surface height within each 500-m bin -subsurface_photon_dataset['adjusted_photon_height'] = subsurface_photon_dataset['photon_height'] - \ - (subsurface_photon_dataset['mean_sea_surface_height_big_bins']-\ - subsurface_photon_dataset['sea_surface_height']\ - ) - -subsurface_photon_dataset['Dif_photon_height'] = (subsurface_photon_dataset['mean_sea_surface_height_big_bins']-\ - subsurface_photon_dataset['sea_surface_height']\ - ) +subsurface_photon_dataset["adjusted_photon_height"] = subsurface_photon_dataset[ + "photon_height" +] - ( + subsurface_photon_dataset["mean_sea_surface_height_big_bins"] + - subsurface_photon_dataset["sea_surface_height"] +) + +subsurface_photon_dataset["Dif_photon_height"] = ( + subsurface_photon_dataset["mean_sea_surface_height_big_bins"] + - subsurface_photon_dataset["sea_surface_height"] +) ###################################### ################################### ###################################### ################################### -###Visualization +###Visualization -import matplotlib.pyplot as plt -from mpl_toolkits.axes_grid1.inset_locator import inset_axes from matplotlib.patches import Rectangle +import matplotlib.pyplot as plt import matplotlib.transforms as mtransforms +from mpl_toolkits.axes_grid1.inset_locator import inset_axes +sea_surface_x_axis_bins = np.linspace( + relative_seg_dist.min(), relative_seg_dist.max(), len(sea_surface_height) +) -sea_surface_x_axis_bins = np.linspace(relative_seg_dist.min(), - relative_seg_dist.max(), len(sea_surface_height)) - - - -mean_sea_surface_x_axis_bins = 
np.linspace(relative_seg_dist.min(), - relative_seg_dist.max(), len(mean_sea_surface_elevation_bigBin)) +mean_sea_surface_x_axis_bins = np.linspace( + relative_seg_dist.min(), + relative_seg_dist.max(), + len(mean_sea_surface_elevation_bigBin), +) # Adjusting the global font size -plt.rcParams['font.size'] = 18 # You can choose your own font size here +plt.rcParams["font.size"] = 18 # You can choose your own font size here # Define your zoom area limits for the x-axis zoom_xlim = (10.5, 11.5) # for example, from 10 to 20 on your x-axis # Filter the data based on your x-axis limits # Here, we're creating a boolean index that matches your zoom criteria -x_index_within_zoom = (sea_photon_dataset.relative_AT_dist >= zoom_xlim[0]) & (sea_photon_dataset.relative_AT_dist <= zoom_xlim[1]) +x_index_within_zoom = (sea_photon_dataset.relative_AT_dist >= zoom_xlim[0]) & ( + sea_photon_dataset.relative_AT_dist <= zoom_xlim[1] +) # Apply the index to your x and y data zoomed_relative_AT_dist = sea_photon_dataset.relative_AT_dist[x_index_within_zoom] zoomed_photon_height = sea_photon_dataset.photon_height[x_index_within_zoom] -#get the x index for adjusted subsurface_photon -subsurface_x_index_within_zoom = (subsurface_photon_dataset.relative_AT_dist >= zoom_xlim[0]) & (subsurface_photon_dataset.relative_AT_dist <= zoom_xlim[1]) -subsurface_zoomed_relative_AT_dist = subsurface_photon_dataset.relative_AT_dist[subsurface_x_index_within_zoom] -zoomed_photon_height_adjusted = subsurface_photon_dataset.adjusted_photon_height[subsurface_x_index_within_zoom] -zoomed_photon_height_Dif = subsurface_photon_dataset.Dif_photon_height[subsurface_x_index_within_zoom] - +# get the x index for adjusted subsurface_photon +subsurface_x_index_within_zoom = ( + subsurface_photon_dataset.relative_AT_dist >= zoom_xlim[0] +) & (subsurface_photon_dataset.relative_AT_dist <= zoom_xlim[1]) +subsurface_zoomed_relative_AT_dist = subsurface_photon_dataset.relative_AT_dist[ + subsurface_x_index_within_zoom +] 
+zoomed_photon_height_adjusted = subsurface_photon_dataset.adjusted_photon_height[ + subsurface_x_index_within_zoom +] +zoomed_photon_height_Dif = subsurface_photon_dataset.Dif_photon_height[ + subsurface_x_index_within_zoom +] # Now proceed with creating your main plot @@ -357,43 +400,104 @@ sub_hlims = [-10, 3] # height limits - ######################### -fig = plt.figure(figsize=(16, 8), dpi=300, facecolor='w', edgecolor='k') +fig = plt.figure(figsize=(16, 8), dpi=300, facecolor="w", edgecolor="k") ax = fig.add_subplot(111) -ax.set_xlabel('Distance From Start of Track (km)') -ax.set_ylabel('Photon Height (m)') - -ax.scatter(sea_photon_dataset.relative_AT_dist, sea_photon_dataset.photon_height, - s=10, c='k', alpha=0.15, edgecolors='none', label='ATL03 Photons') -ax.plot(sea_surface_x_axis_bins, sea_surface_height, linewidth=0.8, color='b', label='Surface Peak') -ax.plot(sea_surface_x_axis_bins, [x - 0.5 for x in sea_surface_height], linewidth=0.8, color='#DD571C', label='0.5 m Below Surface Peak') -ax.plot(sea_surface_x_axis_bins, [x - 1 for x in sea_surface_height], linewidth=0.8, color='#FDA172', label='1 m Below Surface Peak') -ax.plot(sea_surface_x_axis_bins, [x - 2 for x in sea_surface_height], linewidth=0.8, color='#FCAE1E', label='2 m Below Surface Peak') +ax.set_xlabel("Distance From Start of Track (km)") +ax.set_ylabel("Photon Height (m)") + +ax.scatter( + sea_photon_dataset.relative_AT_dist, + sea_photon_dataset.photon_height, + s=10, + c="k", + alpha=0.15, + edgecolors="none", + label="ATL03 Photons", +) +ax.plot( + sea_surface_x_axis_bins, + sea_surface_height, + linewidth=0.8, + color="b", + label="Surface Peak", +) +ax.plot( + sea_surface_x_axis_bins, + [x - 0.5 for x in sea_surface_height], + linewidth=0.8, + color="#DD571C", + label="0.5 m Below Surface Peak", +) +ax.plot( + sea_surface_x_axis_bins, + [x - 1 for x in sea_surface_height], + linewidth=0.8, + color="#FDA172", + label="1 m Below Surface Peak", +) +ax.plot( + sea_surface_x_axis_bins, 
+ [x - 2 for x in sea_surface_height], + linewidth=0.8, + color="#FCAE1E", + label="2 m Below Surface Peak", +) # Create an inset axis for the zoomed area # axins = inset_axes(ax, width=5, height=3, loc='lower left') # adjust the location as needed # Assuming 'ax' is your main axes trans = mtransforms.blended_transform_factory(ax.figure.transFigure, ax.transAxes) -axins = inset_axes(ax, width=5, height=3, loc=3, - bbox_to_anchor=(0.2, 0.1, 0.4, 0.4), # The numbers are x0, y0, width, and height in percentages of the figure size. - bbox_transform=trans, - borderpad=0) +axins = inset_axes( + ax, + width=5, + height=3, + loc=3, + bbox_to_anchor=( + 0.2, + 0.1, + 0.4, + 0.4, + ), # The numbers are x0, y0, width, and height in percentages of the figure size. + bbox_transform=trans, + borderpad=0, +) # Plot the filtered data on the inset axes -axins.scatter(zoomed_relative_AT_dist, zoomed_photon_height, - s=10, c='k', alpha=0.15, edgecolors='none', label='Zoomed ATL03 Photons') - -axins.scatter(subsurface_zoomed_relative_AT_dist, zoomed_photon_height_adjusted, - s=10, c='r', alpha=0.15, edgecolors='none', label='Zoomed Adjusted ATL03 Photons') - - -axins.scatter(subsurface_zoomed_relative_AT_dist, zoomed_photon_height_Dif, - s=20, c='g', alpha=0.15, edgecolors='none', label='Zoomed Dif ATL03 Photons') +axins.scatter( + zoomed_relative_AT_dist, + zoomed_photon_height, + s=10, + c="k", + alpha=0.15, + edgecolors="none", + label="Zoomed ATL03 Photons", +) + +axins.scatter( + subsurface_zoomed_relative_AT_dist, + zoomed_photon_height_adjusted, + s=10, + c="r", + alpha=0.15, + edgecolors="none", + label="Zoomed Adjusted ATL03 Photons", +) + + +axins.scatter( + subsurface_zoomed_relative_AT_dist, + zoomed_photon_height_Dif, + s=20, + c="g", + alpha=0.15, + edgecolors="none", + label="Zoomed Dif ATL03 Photons", +) # Add horizontal line at y=0 -axins.axhline(y=0, color='blue', alpha=0.15, linestyle='--', linewidth=1) +axins.axhline(y=0, color="blue", alpha=0.15, linestyle="--", 
linewidth=1) # # For the lines, you might need to adjust your data arrays to match the zoomed range # # assuming your sea_surface_x_axis_bins aligns directly with your sea_surface_height data @@ -401,46 +505,97 @@ # zoomed_sea_surface_height = sea_surface_height[:len(zoomed_sea_surface_x)] # adjust based on your data alignment # First, we need to find the indices of the items in sea_surface_x_axis_bins that fall within the zoomed range. -zoomed_indices = [index for index, x in enumerate(sea_surface_x_axis_bins) if zoom_xlim[0] <= x <= zoom_xlim[1]] +zoomed_indices = [ + index + for index, x in enumerate(sea_surface_x_axis_bins) + if zoom_xlim[0] <= x <= zoom_xlim[1] +] # Then, use these indices to select the corresponding items from both sea_surface_x_axis_bins and sea_surface_height. zoomed_sea_surface_x = [sea_surface_x_axis_bins[i] for i in zoomed_indices] zoomed_sea_surface_height = [sea_surface_height[i] for i in zoomed_indices] -#0197f6,#BE5504,#FA8128 -axins.plot(zoomed_sea_surface_x, zoomed_sea_surface_height, linewidth=1, color='b', label='Zoomed Surface Peak') -axins.plot(zoomed_sea_surface_x, [x - 0.5 for x in zoomed_sea_surface_height], linewidth=1, color='#DD571C', label='0.5 m Below Zoomed Surface Peak') -axins.plot(zoomed_sea_surface_x, [x - 1 for x in zoomed_sea_surface_height], linewidth=1, color='#FDA172', label='1 m Below Zoomed Surface Peak') -axins.plot(zoomed_sea_surface_x, [x - 2 for x in zoomed_sea_surface_height],linewidth=1, color='#FCAE1E', label='2 m Below Zoomed Surface Peak') - -#mean sea surface height at 500m bins -axins.plot(mean_sea_surface_x_axis_bins, mean_sea_surface_elevation_bigBin, linewidth=1, color='r', label='Zoomed Surface Peak 500m bin') +# 0197f6,#BE5504,#FA8128 +axins.plot( + zoomed_sea_surface_x, + zoomed_sea_surface_height, + linewidth=1, + color="b", + label="Zoomed Surface Peak", +) +axins.plot( + zoomed_sea_surface_x, + [x - 0.5 for x in zoomed_sea_surface_height], + linewidth=1, + color="#DD571C", + label="0.5 m 
Below Zoomed Surface Peak", +) +axins.plot( + zoomed_sea_surface_x, + [x - 1 for x in zoomed_sea_surface_height], + linewidth=1, + color="#FDA172", + label="1 m Below Zoomed Surface Peak", +) +axins.plot( + zoomed_sea_surface_x, + [x - 2 for x in zoomed_sea_surface_height], + linewidth=1, + color="#FCAE1E", + label="2 m Below Zoomed Surface Peak", +) + +# mean sea surface height at 500m bins +axins.plot( + mean_sea_surface_x_axis_bins, + mean_sea_surface_elevation_bigBin, + linewidth=1, + color="r", + label="Zoomed Surface Peak 500m bin", +) # Set the x limits for the inset axes; y limits remain unchanged axins.set_xlim(zoom_xlim) axins.set_ylim(sub_hlims) -axins.set_title('Zoomed in') +axins.set_title("Zoomed in") # Adding the rectangle indicating the zoomed area # Draw a rectangle on the main plot to indicate the zoomed area # We use the data coordinates for the rectangle, as it needs to correspond to the actual data points on the main axes. -rec_hlims=[-10,3] -rectangle = Rectangle((zoom_xlim[0], rec_hlims[0]), zoom_xlim[1] - zoom_xlim[0], rec_hlims[1] - rec_hlims[0], - linewidth=1, edgecolor='r', facecolor='none', linestyle='--') +rec_hlims = [-10, 3] +rectangle = Rectangle( + (zoom_xlim[0], rec_hlims[0]), + zoom_xlim[1] - zoom_xlim[0], + rec_hlims[1] - rec_hlims[0], + linewidth=1, + edgecolor="r", + facecolor="none", + linestyle="--", +) ax.add_patch(rectangle) # Finalize the main plot adjustments -ax.legend(loc='lower right') +ax.legend(loc="lower right") ax.set_ylim(hlims) -plt.savefig(path+'/'+YY+MM+DD+'Track'+TRK+'Plot'+gtx+'_With_depth_colored.jpg', dpi=500,bbox_inches='tight') +plt.savefig( + path + + "/" + + YY + + MM + + DD + + "Track" + + TRK + + "Plot" + + gtx + + "_With_depth_colored.jpg", + dpi=500, + bbox_inches="tight", +) # Display the plot plt.show() - - - diff --git a/aok/core/kd_utils/SeafloorFilterByGEBCO_Module.py b/aok/core/kd_utils/SeafloorFilterByGEBCO_Module.py index 51a7396..68234ae 100644 --- 
a/aok/core/kd_utils/SeafloorFilterByGEBCO_Module.py +++ b/aok/core/kd_utils/SeafloorFilterByGEBCO_Module.py @@ -1,17 +1,17 @@ -import rasterio import numpy as np -import pandas as pd +import rasterio from rtree import index -from shapely.geometry import box, Point +from shapely.geometry import box + # 1. Function to create an R-tree spatial index for raster bounds def create_spatial_index(gebco_paths): """ Create an R-tree spatial index for raster bounds to quickly find relevant rasters. - + Parameters: gebco_paths (list): List of paths to GEBCO raster files. - + Returns: raster_data_dict (dict): Dictionary containing loaded rasters and their respective data. spatial_index (rtree.index.Index): R-tree spatial index for raster bounds. @@ -19,26 +19,29 @@ def create_spatial_index(gebco_paths): idx = index.Index() raster_data_dict = {} for i, path in enumerate(gebco_paths): - with rasterio.Env(GTIFF_SRS_SOURCE='EPSG'): + with rasterio.Env(GTIFF_SRS_SOURCE="EPSG"): gebco_raster = rasterio.open(path) raster_data = gebco_raster.read(1) raster_data_dict[path] = (gebco_raster, raster_data) bounds = gebco_raster.bounds - idx.insert(i, (bounds.left, bounds.bottom, bounds.right, bounds.top), obj=path) + idx.insert( + i, (bounds.left, bounds.bottom, bounds.right, bounds.top), obj=path + ) return raster_data_dict, idx + # 2. Function to determine which rasters are needed for a given set of coordinates using spatial index in a batch manner def get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index): """ Determine which raster datasets are relevant for the given coordinates using spatial index. This function uses a more efficient approach by performing a bounding box query for batches of points. - + Parameters: lons (numpy.ndarray): Array of longitudes. lats (numpy.ndarray): Array of latitudes. raster_data_dict (dict): Dictionary containing raster datasets and their data. spatial_index (rtree.index.Index): R-tree spatial index for raster bounds. 
- + Returns: relevant_rasters (list): List of relevant raster datasets and their respective data. """ @@ -50,59 +53,70 @@ def get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index # Get all rasters that intersect with the bounding box matches = list(spatial_index.intersection((bounding_box.bounds), objects=True)) relevant_paths = {match.object for match in matches} - relevant_rasters = [(raster_data_dict[path][0], raster_data_dict[path][1]) for path in relevant_paths] + relevant_rasters = [ + (raster_data_dict[path][0], raster_data_dict[path][1]) + for path in relevant_paths + ] return relevant_rasters + # 3. Function to get seafloor elevation from the relevant rasters in a vectorized manner def get_seafloor_elevation(lons, lats, relevant_rasters): """ Get seafloor elevations for a batch of points based on longitude and latitude from relevant rasters. - + Parameters: lons (numpy.ndarray): Array of longitudes. lats (numpy.ndarray): Array of latitudes. relevant_rasters (list): List of relevant raster datasets and their respective data. - + Returns: seafloor_elevations (numpy.ndarray): Array of seafloor elevations. """ points = np.vstack((lons, lats)).T seafloor_elevations = np.full(len(lons), np.nan) - + for gebco_raster, raster_data in relevant_rasters: # Use rasterio.sample to get values for multiple points in a batch values = list(gebco_raster.sample(points)) for idx, value in enumerate(values): if np.isnan(seafloor_elevations[idx]) and value is not None: seafloor_elevations[idx] = value[0] - + return seafloor_elevations + # 4. The main process function that ties everything together def process_seafloor_data(gebco_paths, sea_photon_dataset): """ Main function to process the seafloor data. - + Parameters: gebco_paths (list): List of paths to GEBCO raster files. sea_photon_dataset (pandas.DataFrame): Dataset containing longitude, latitude, and photon height. 
- + Returns: filtered_sea_photon_dataset (pandas.DataFrame): Filtered dataset containing points above the seafloor. """ # Create spatial index and load GEBCO Raster Data raster_data_dict, spatial_index = create_spatial_index(gebco_paths) - + # Get the relevant rasters using spatial index - lons = sea_photon_dataset['longitude'].values - lats = sea_photon_dataset['latitude'].values - relevant_rasters = get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index) - + lons = sea_photon_dataset["longitude"].values + lats = sea_photon_dataset["latitude"].values + relevant_rasters = get_relevant_rasters_using_index( + lons, lats, raster_data_dict, spatial_index + ) + # Get seafloor elevation for all points - sea_photon_dataset['seafloor_elevation'] = get_seafloor_elevation(lons, lats, relevant_rasters) - + sea_photon_dataset["seafloor_elevation"] = get_seafloor_elevation( + lons, lats, relevant_rasters + ) + # Filter out points below the seafloor - filtered_sea_photon_dataset = sea_photon_dataset[sea_photon_dataset['photon_height'] > sea_photon_dataset['seafloor_elevation']] + filtered_sea_photon_dataset = sea_photon_dataset[ + sea_photon_dataset["photon_height"] > sea_photon_dataset["seafloor_elevation"] + ] # filtered_sea_photon_dataset.drop(columns=['seafloor_elevation'], inplace=True) - - return filtered_sea_photon_dataset \ No newline at end of file + + return filtered_sea_photon_dataset diff --git a/aok/core/kd_utils/SolarBckgrd/IS2_BG_Rate_Land_Ocean_plot.py b/aok/core/kd_utils/SolarBckgrd/IS2_BG_Rate_Land_Ocean_plot.py index 4b80bbf..e73f1f0 100644 --- a/aok/core/kd_utils/SolarBckgrd/IS2_BG_Rate_Land_Ocean_plot.py +++ b/aok/core/kd_utils/SolarBckgrd/IS2_BG_Rate_Land_Ocean_plot.py @@ -1,457 +1,744 @@ -import h5py +import logging +from multiprocessing import Pool import os + +from config import get_args +import geopandas as gpd +import h5py +import matplotlib.pyplot as plt +from noise_utils.data_processing import isolate_sea_land_photons import 
numpy as np import pandas as pd -import geopandas as gpd -from shapely.geometry import Point from scipy.interpolate import interp1d from scipy.optimize import curve_fit -import matplotlib.pyplot as plt -from multiprocessing import Pool -from config import get_args -from noise_utils.interpolation import geoid_correction, refraction_correction, interpolate_labels, apply_interpolation -from noise_utils.data_processing import isolate_sea_land_photons -import random # Added for random selection - -import logging # Set up logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) + # Define model functions def power_model(x, a, b, c): return a * np.power(x, b) + c + def exp_model(x, a, b, c): """Exponential model: a * exp(b * x) + c""" return a * np.exp(b * x) + c + def twilight_model(x, a, b, c): return a * np.exp(-b * x**2) + c # Gaussian form for twilight + def process_file(args): """Process a single ATL03 file, skipping files missing required datasets.""" file, beam, shoreline_data_path = args try: - with h5py.File(file, 'r') as f: + with h5py.File(file, "r") as f: # Check for required datasets required_keys = [ - f'{beam}/bckgrd_atlas/bckgrd_rate', - f'{beam}/bckgrd_atlas/delta_time', - f'{beam}/heights/delta_time', - f'{beam}/heights/h_ph', - f'{beam}/geolocation/reference_photon_lat', - f'{beam}/geolocation/reference_photon_lon', - f'{beam}/geolocation/solar_azimuth', - f'{beam}/geolocation/solar_elevation' + f"{beam}/bckgrd_atlas/bckgrd_rate", + f"{beam}/bckgrd_atlas/delta_time", + f"{beam}/heights/delta_time", + f"{beam}/heights/h_ph", + f"{beam}/geolocation/reference_photon_lat", + f"{beam}/geolocation/reference_photon_lon", + f"{beam}/geolocation/solar_azimuth", + f"{beam}/geolocation/solar_elevation", ] for key in required_keys: if key not in f: raise KeyError(f"Missing dataset: {key}") - bckgrd_rate = f[f'{beam}/bckgrd_atlas/bckgrd_rate'][:].astype(np.float64) - bckgrd_time = f[f'{beam}/bckgrd_atlas/delta_time'][:] - heights = 
f[f'{beam}/heights/h_ph'][:] - segment_lat = f[f'{beam}/geolocation/reference_photon_lat'][:] - segment_lon = f[f'{beam}/geolocation/reference_photon_lon'][:] - segment_delta_time = f[f'{beam}/geolocation/delta_time'][:] - - Segment_Is_Land = pd.DataFrame({ - 'latitude': segment_lat, - 'longitude': segment_lon, - 'delta_time': segment_delta_time - }) - + bckgrd_rate = f[f"{beam}/bckgrd_atlas/bckgrd_rate"][:].astype(np.float64) + bckgrd_time = f[f"{beam}/bckgrd_atlas/delta_time"][:] + heights = f[f"{beam}/heights/h_ph"][:] + segment_lat = f[f"{beam}/geolocation/reference_photon_lat"][:] + segment_lon = f[f"{beam}/geolocation/reference_photon_lon"][:] + segment_delta_time = f[f"{beam}/geolocation/delta_time"][:] + + Segment_Is_Land = pd.DataFrame( + { + "latitude": segment_lat, + "longitude": segment_lon, + "delta_time": segment_delta_time, + } + ) + # Create a GeoDataFrame with geometry - Segment_Is_Land['geometry'] = gpd.points_from_xy(Segment_Is_Land['longitude'], Segment_Is_Land['latitude']) + Segment_Is_Land["geometry"] = gpd.points_from_xy( + Segment_Is_Land["longitude"], Segment_Is_Land["latitude"] + ) ICESat2_GDF = gpd.GeoDataFrame(Segment_Is_Land, crs="EPSG:4326") # Determine land/ocean classification - Segment_Is_Land_Labels = isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF) - ICESat2_GDF['is_land'] = Segment_Is_Land_Labels + Segment_Is_Land_Labels = isolate_sea_land_photons( + shoreline_data_path, ICESat2_GDF + ) + ICESat2_GDF["is_land"] = Segment_Is_Land_Labels # Interpolate is_land labels to bckgrd_time interp_is_land = interp1d( - ICESat2_GDF['delta_time'], - ICESat2_GDF['is_land'].astype(int), + ICESat2_GDF["delta_time"], + ICESat2_GDF["is_land"].astype(int), bounds_error=False, - fill_value=(ICESat2_GDF['is_land'].astype(int).iloc[0], ICESat2_GDF['is_land'].astype(int).iloc[-1]) + fill_value=( + ICESat2_GDF["is_land"].astype(int).iloc[0], + ICESat2_GDF["is_land"].astype(int).iloc[-1], + ), ) is_land_interpolated = 
interp_is_land(bckgrd_time).round().astype(int) - + # Compute total background rates for land and ocean land_mask = is_land_interpolated == 1 ocean_mask = is_land_interpolated == 0 - total_background_rate_land = np.mean(bckgrd_rate[land_mask]) if np.sum(land_mask) > 0 else np.nan - total_background_rate_ocean = np.mean(bckgrd_rate[ocean_mask]) if np.sum(ocean_mask) > 0 else np.nan + total_background_rate_land = ( + np.mean(bckgrd_rate[land_mask]) if np.sum(land_mask) > 0 else np.nan + ) + total_background_rate_ocean = ( + np.mean(bckgrd_rate[ocean_mask]) if np.sum(ocean_mask) > 0 else np.nan + ) # Solar elevation handling FILL_VALUE = 3.4028235e38 - solar_elevations = f[f'{beam}/geolocation/solar_elevation'][:].astype(np.float64) - valid_solar = solar_elevations[(solar_elevations != FILL_VALUE) & (np.abs(solar_elevations) < 1e30) & np.isfinite(solar_elevations)] + solar_elevations = f[f"{beam}/geolocation/solar_elevation"][:].astype( + np.float64 + ) + valid_solar = solar_elevations[ + (solar_elevations != FILL_VALUE) + & (np.abs(solar_elevations) < 1e30) + & np.isfinite(solar_elevations) + ] solar_elevation = np.nan if valid_solar.size == 0 else np.mean(valid_solar) mean_photon_height = np.mean(heights) if heights.size > 0 else np.nan result = { - 'file': os.path.basename(file), - 'total_background_rate_land': total_background_rate_land, - 'total_background_rate_ocean': total_background_rate_ocean, - 'solar_elevation': solar_elevation, - 'mean_photon_height': mean_photon_height + "file": os.path.basename(file), + "total_background_rate_land": total_background_rate_land, + "total_background_rate_ocean": total_background_rate_ocean, + "solar_elevation": solar_elevation, + "mean_photon_height": mean_photon_height, } logger.info(f"Processed {file}: {result}") return result except Exception as e: - logger.error(f"Error processing {file}: {str(e)}") - return {'file': os.path.basename(file), 'error': str(e)} - -def fit_surface_model(df, classification, surface_type, 
model_func, p0, bounds, x_col='solar_elevation', y_col='solar_background_rate'): + logger.error(f"Error processing {file}: {e!s}") + return {"file": os.path.basename(file), "error": str(e)} + + +def fit_surface_model( + df, + classification, + surface_type, + model_func, + p0, + bounds, + x_col="solar_elevation", + y_col="solar_background_rate", +): """Fit a model to a specific classification and surface type.""" - subset = df[(df['classification'] == classification) & (df['surface_type'] == surface_type)].copy() + subset = df[ + (df["classification"] == classification) & (df["surface_type"] == surface_type) + ].copy() if len(subset) < 3: print(f"Warning: Insufficient data for {classification} {surface_type} model.") return None, None, None - mask = subset[x_col].notna() & subset[y_col].notna() & np.isfinite(subset[x_col]) & np.isfinite(subset[y_col]) + mask = ( + subset[x_col].notna() + & subset[y_col].notna() + & np.isfinite(subset[x_col]) + & np.isfinite(subset[y_col]) + ) valid_x = subset.loc[mask, x_col] valid_y = subset.loc[mask, y_col] if len(valid_x) < 3: - print(f"Warning: Fewer than 3 valid points for {classification} {surface_type} model.") + print( + f"Warning: Fewer than 3 valid points for {classification} {surface_type} model." 
+ ) return None, None, None try: - popt, _ = curve_fit(model_func, valid_x, valid_y, p0=p0, bounds=bounds, maxfev=20000) + popt, _ = curve_fit( + model_func, valid_x, valid_y, p0=p0, bounds=bounds, maxfev=20000 + ) y_pred = model_func(valid_x, *popt) - r2 = 1 - np.sum((valid_y - y_pred)**2) / np.sum((valid_y - np.mean(valid_y))**2) + r2 = 1 - np.sum((valid_y - y_pred) ** 2) / np.sum( + (valid_y - np.mean(valid_y)) ** 2 + ) return popt, r2, lambda x: model_func(x, *popt) except RuntimeError as e: print(f"Fit failed for {classification} {surface_type}: {e}") return None, None, None + def load_and_process_files(atl03_directory, beam, shoreline_data_path): """Load and process ATL03 files in parallel.""" - atl03_files = [os.path.join(atl03_directory, f) for f in os.listdir(atl03_directory) if f.endswith('.h5') and 'ATL03' in f] + atl03_files = [ + os.path.join(atl03_directory, f) + for f in os.listdir(atl03_directory) + if f.endswith(".h5") and "ATL03" in f + ] print(f"Found {len(atl03_files)} ATL03 files.") - + if not atl03_files: raise ValueError("No ATL03 files found in the directory.") with Pool() as pool: - results = pool.map(process_file, [(f, beam, shoreline_data_path) for f in atl03_files]) - + results = pool.map( + process_file, [(f, beam, shoreline_data_path) for f in atl03_files] + ) + print("Results from process_file:", results) df = pd.DataFrame(results) - - if 'error' in df.columns: - error_files = df[df['error'].notna()]['file'].tolist() + + if "error" in df.columns: + error_files = df[df["error"].notna()]["file"].tolist() print(f"Errors in {len(error_files)} files: {error_files}") - df = df[df['error'].isna()].drop(columns=['error']) - + df = df[df["error"].isna()].drop(columns=["error"]) + if df.empty: - raise ValueError("No files were successfully processed. Check the error messages above.") - + raise ValueError( + "No files were successfully processed. Check the error messages above." 
+ ) + print("DataFrame columns after processing:", df.columns) return df + def assign_surface_type(df): """Assign surface type based on background rates.""" - required_columns = ['total_background_rate_land', 'total_background_rate_ocean'] + required_columns = ["total_background_rate_land", "total_background_rate_ocean"] missing_columns = [col for col in required_columns if col not in df.columns] if missing_columns: raise KeyError(f"Missing required columns in DataFrame: {missing_columns}") - df['surface_type'] = np.where( - df['total_background_rate_land'].notna() & (df['total_background_rate_land'] > 0), - 'Land', - np.where(df['total_background_rate_ocean'].notna() & (df['total_background_rate_ocean'] > 0), 'Ocean', 'Mixed') + df["surface_type"] = np.where( + df["total_background_rate_land"].notna() + & (df["total_background_rate_land"] > 0), + "Land", + np.where( + df["total_background_rate_ocean"].notna() + & (df["total_background_rate_ocean"] > 0), + "Ocean", + "Mixed", + ), ) return df + def classify_data(df, twilight_threshold=6.0): """Classify data into Daytime, Nighttime, Twilight, or Unknown.""" - df['classification'] = np.where( - df['solar_elevation'].isna(), 'Unknown', - np.where(df['solar_elevation'] > twilight_threshold, 'Daytime', - np.where(df['solar_elevation'] < -twilight_threshold, 'Nighttime', 'Twilight')) + df["classification"] = np.where( + df["solar_elevation"].isna(), + "Unknown", + np.where( + df["solar_elevation"] > twilight_threshold, + "Daytime", + np.where( + df["solar_elevation"] < -twilight_threshold, "Nighttime", "Twilight" + ), + ), + ) + daytime_df = df[df["classification"] == "Daytime"].copy() + nighttime_df = df[df["classification"] == "Nighttime"].copy() + twilight_df = df[df["classification"] == "Twilight"].copy() + unknown_df = df[df["classification"] == "Unknown"].copy() + print( + f"Daytime: {len(daytime_df)}, Nighttime: {len(nighttime_df)}, Twilight: {len(twilight_df)}, Unknown: {len(unknown_df)}" ) - daytime_df = 
df[df['classification'] == 'Daytime'].copy() - nighttime_df = df[df['classification'] == 'Nighttime'].copy() - twilight_df = df[df['classification'] == 'Twilight'].copy() - unknown_df = df[df['classification'] == 'Unknown'].copy() - print(f"Daytime: {len(daytime_df)}, Nighttime: {len(nighttime_df)}, Twilight: {len(twilight_df)}, Unknown: {len(unknown_df)}") return df, daytime_df, nighttime_df, twilight_df, unknown_df + def calculate_non_solar_rate(nighttime_df): """Calculate the non-solar background rate from nighttime data.""" - non_solar_rate_land = np.nanmean(nighttime_df['total_background_rate_land']) - non_solar_rate_ocean = np.nanmean(nighttime_df['total_background_rate_ocean']) + non_solar_rate_land = np.nanmean(nighttime_df["total_background_rate_land"]) + non_solar_rate_ocean = np.nanmean(nighttime_df["total_background_rate_ocean"]) non_solar_rate = np.nanmean([non_solar_rate_land, non_solar_rate_ocean]) print(f"Non-solar background rate (land): {non_solar_rate_land:.2e} ph/s") print(f"Non-solar background rate (ocean): {non_solar_rate_ocean:.2e} ph/s") print(f"Non-solar background rate (combined): {non_solar_rate:.2e} ph/s") return non_solar_rate + def compute_solar_background_rates(df, non_solar_rate): """Compute solar background rates for land and ocean.""" - df['solar_background_rate_land'] = (df['total_background_rate_land'] - non_solar_rate).clip(lower=0) - df['solar_background_rate_ocean'] = (df['total_background_rate_ocean'] - non_solar_rate).clip(lower=0) - df['solar_background_rate'] = np.where( - df['surface_type'] == 'Land', df['solar_background_rate_land'], - np.where(df['surface_type'] == 'Ocean', df['solar_background_rate_ocean'], np.nan) + df["solar_background_rate_land"] = ( + df["total_background_rate_land"] - non_solar_rate + ).clip(lower=0) + df["solar_background_rate_ocean"] = ( + df["total_background_rate_ocean"] - non_solar_rate + ).clip(lower=0) + df["solar_background_rate"] = np.where( + df["surface_type"] == "Land", + 
df["solar_background_rate_land"], + np.where( + df["surface_type"] == "Ocean", df["solar_background_rate_ocean"], np.nan + ), ) return df + def fit_daytime_model(df, daytime_df, non_solar_rate): """Fit daytime models for land and ocean.""" daytime_models = {} - for surface_type in ['Land', 'Ocean']: - subset = daytime_df[daytime_df['surface_type'] == surface_type].copy() + for surface_type in ["Land", "Ocean"]: + subset = daytime_df[daytime_df["surface_type"] == surface_type].copy() if len(subset) < 3: print(f"Warning: Insufficient daytime {surface_type} data for modeling.") continue - mask = subset['solar_elevation'].notna() & subset['solar_background_rate'].notna() & \ - np.isfinite(subset['solar_elevation']) & np.isfinite(subset['solar_background_rate']) - valid_x = subset.loc[mask, 'solar_elevation'] - valid_y = subset.loc[mask, 'solar_background_rate'] + mask = ( + subset["solar_elevation"].notna() + & subset["solar_background_rate"].notna() + & np.isfinite(subset["solar_elevation"]) + & np.isfinite(subset["solar_background_rate"]) + ) + valid_x = subset.loc[mask, "solar_elevation"] + valid_y = subset.loc[mask, "solar_background_rate"] if len(valid_x) < 3: - print(f"Warning: Fewer than 3 valid points for daytime {surface_type} model.") + print( + f"Warning: Fewer than 3 valid points for daytime {surface_type} model." 
+ ) continue # Remove outliers log_y = np.log10(valid_y + 1) mean_log_y = np.mean(log_y) std_log_y = np.std(log_y) - outlier_mask = (log_y > mean_log_y - 3 * std_log_y) & (log_y < mean_log_y + 3 * std_log_y) + outlier_mask = (log_y > mean_log_y - 3 * std_log_y) & ( + log_y < mean_log_y + 3 * std_log_y + ) valid_x = valid_x[outlier_mask] valid_y = valid_y[outlier_mask] print(f"Daytime regression ({surface_type}): {len(valid_x)} valid data points") try: p0_exp = [np.max(valid_y) - non_solar_rate, 0.05, non_solar_rate] - bounds_exp = ([0, 0.01, non_solar_rate * 0.9], [1e8, 0.5, non_solar_rate * 1.1]) - popt_exp, _ = curve_fit(exp_model, valid_x, valid_y, p0=p0_exp, bounds=bounds_exp, maxfev=20000) - r2_exp = 1 - np.sum((valid_y - exp_model(valid_x, *popt_exp))**2) / np.sum((valid_y - np.mean(valid_y))**2) + bounds_exp = ( + [0, 0.01, non_solar_rate * 0.9], + [1e8, 0.5, non_solar_rate * 1.1], + ) + popt_exp, _ = curve_fit( + exp_model, valid_x, valid_y, p0=p0_exp, bounds=bounds_exp, maxfev=20000 + ) + r2_exp = 1 - np.sum( + (valid_y - exp_model(valid_x, *popt_exp)) ** 2 + ) / np.sum((valid_y - np.mean(valid_y)) ** 2) x_shifted = valid_x - min(valid_x) + 1 p0_power = [np.max(valid_y) - non_solar_rate, 0.6, non_solar_rate] - bounds_power = ([0, 0.1, non_solar_rate * 0.9], [1e8, 1.0, non_solar_rate * 1.1]) - popt_power, _ = curve_fit(power_model, x_shifted, valid_y, p0=p0_power, bounds=bounds_power, maxfev=20000) - r2_power = 1 - np.sum((valid_y - power_model(x_shifted, *popt_power))**2) / np.sum((valid_y - np.mean(valid_y))**2) + bounds_power = ( + [0, 0.1, non_solar_rate * 0.9], + [1e8, 1.0, non_solar_rate * 1.1], + ) + popt_power, _ = curve_fit( + power_model, + x_shifted, + valid_y, + p0=p0_power, + bounds=bounds_power, + maxfev=20000, + ) + r2_power = 1 - np.sum( + (valid_y - power_model(x_shifted, *popt_power)) ** 2 + ) / np.sum((valid_y - np.mean(valid_y)) ** 2) if r2_power > r2_exp: - print(f"Daytime model ({surface_type}, power law): rate = {popt_power[0]:.2e} * 
solar_elevation^{popt_power[1]:.2f} + {popt_power[2]:.2e}, R² = {r2_power:.3f}") - daytime_models[surface_type] = {'params': popt_power, 'r2': r2_power, 'model': lambda x: power_model(x - min(valid_x) + 1, *popt_power)} + print( + f"Daytime model ({surface_type}, power law): rate = {popt_power[0]:.2e} * solar_elevation^{popt_power[1]:.2f} + {popt_power[2]:.2e}, R² = {r2_power:.3f}" + ) + daytime_models[surface_type] = { + "params": popt_power, + "r2": r2_power, + "model": lambda x: power_model(x - min(valid_x) + 1, *popt_power), + } else: - print(f"Daytime model ({surface_type}, exponential): rate = {popt_exp[0]:.2e} * exp({popt_exp[1]:.2f} * solar_elevation) + {popt_exp[2]:.2e}, R² = {r2_exp:.3f}") - daytime_models[surface_type] = {'params': popt_exp, 'r2': r2_exp, 'model': lambda x: exp_model(x, *popt_exp)} + print( + f"Daytime model ({surface_type}, exponential): rate = {popt_exp[0]:.2e} * exp({popt_exp[1]:.2f} * solar_elevation) + {popt_exp[2]:.2e}, R² = {r2_exp:.3f}" + ) + daytime_models[surface_type] = { + "params": popt_exp, + "r2": r2_exp, + "model": lambda x: exp_model(x, *popt_exp), + } # Apply model - daytime_mask = (df['classification'] == 'Daytime') & (df['surface_type'] == surface_type) - valid_daytime_mask = daytime_mask & df['solar_elevation'].notna() & np.isfinite(df['solar_elevation']) - x_for_model = df.loc[valid_daytime_mask, 'solar_elevation'] + daytime_mask = (df["classification"] == "Daytime") & ( + df["surface_type"] == surface_type + ) + valid_daytime_mask = ( + daytime_mask + & df["solar_elevation"].notna() + & np.isfinite(df["solar_elevation"]) + ) + x_for_model = df.loc[valid_daytime_mask, "solar_elevation"] if r2_power > r2_exp: x_for_model_shifted = x_for_model - min(valid_x) + 1 - df.loc[valid_daytime_mask, 'solar_background_rate'] = daytime_models[surface_type]['model'](x_for_model_shifted).clip(lower=0) + df.loc[valid_daytime_mask, "solar_background_rate"] = daytime_models[ + surface_type + 
]["model"](x_for_model_shifted).clip(lower=0) else: - df.loc[valid_daytime_mask, 'solar_background_rate'] = daytime_models[surface_type]['model'](x_for_model).clip(lower=0) - df.loc[daytime_mask & ~valid_daytime_mask, 'solar_background_rate'] = non_solar_rate + df.loc[valid_daytime_mask, "solar_background_rate"] = daytime_models[ + surface_type + ]["model"](x_for_model).clip(lower=0) + df.loc[daytime_mask & ~valid_daytime_mask, "solar_background_rate"] = ( + non_solar_rate + ) except RuntimeError as e: - print(f"Daytime model fitting failed for {surface_type}: {e}. Using non-solar rate as fallback.") - df.loc[(df['classification'] == 'Daytime') & (df['surface_type'] == surface_type), 'solar_background_rate'] = non_solar_rate + print( + f"Daytime model fitting failed for {surface_type}: {e}. Using non-solar rate as fallback." + ) + df.loc[ + (df["classification"] == "Daytime") + & (df["surface_type"] == surface_type), + "solar_background_rate", + ] = non_solar_rate return df, daytime_models + def fit_twilight_model(df, twilight_df, non_solar_rate): """Fit twilight models for land and ocean.""" twilight_models = {} if len(twilight_df) > 3: - df.loc[df['classification'] == 'Twilight', 'solar_background_rate'] = ( - df['solar_background_rate'].fillna(0).clip(lower=0) + df.loc[df["classification"] == "Twilight", "solar_background_rate"] = ( + df["solar_background_rate"].fillna(0).clip(lower=0) ) - for surface_type in ['Land', 'Ocean']: - subset = twilight_df[twilight_df['surface_type'] == surface_type].copy() + for surface_type in ["Land", "Ocean"]: + subset = twilight_df[twilight_df["surface_type"] == surface_type].copy() if len(subset) < 3: - print(f"Warning: Insufficient twilight {surface_type} data for modeling.") + print( + f"Warning: Insufficient twilight {surface_type} data for modeling." 
+ ) continue - mask = subset['solar_elevation'].notna() & subset['solar_background_rate'].notna() & \ - np.isfinite(subset['solar_elevation']) & np.isfinite(subset['solar_background_rate']) - valid_twilight_x = subset.loc[mask, 'solar_elevation'] - valid_twilight_y = subset.loc[mask, 'solar_background_rate'] + mask = ( + subset["solar_elevation"].notna() + & subset["solar_background_rate"].notna() + & np.isfinite(subset["solar_elevation"]) + & np.isfinite(subset["solar_background_rate"]) + ) + valid_twilight_x = subset.loc[mask, "solar_elevation"] + valid_twilight_y = subset.loc[mask, "solar_background_rate"] if len(valid_twilight_x) < 3: - print(f"Warning: Fewer than 3 valid points for twilight {surface_type} model.") + print( + f"Warning: Fewer than 3 valid points for twilight {surface_type} model." + ) continue - print(f"Twilight regression ({surface_type}): {len(valid_twilight_x)} valid data points") + print( + f"Twilight regression ({surface_type}): {len(valid_twilight_x)} valid data points" + ) try: initial_a = np.max(valid_twilight_y) - non_solar_rate initial_b = 0.01 initial_c = non_solar_rate p0 = [initial_a, initial_b, initial_c] - bounds = ([0, 0.001, non_solar_rate * 0.9], [1e6, 1.0, non_solar_rate * 1.1]) - - popt, _ = curve_fit(twilight_model, valid_twilight_x, valid_twilight_y, p0=p0, bounds=bounds, maxfev=50000) + bounds = ( + [0, 0.001, non_solar_rate * 0.9], + [1e6, 1.0, non_solar_rate * 1.1], + ) + + popt, _ = curve_fit( + twilight_model, + valid_twilight_x, + valid_twilight_y, + p0=p0, + bounds=bounds, + maxfev=50000, + ) y_pred = twilight_model(valid_twilight_x, *popt) - r_squared = 1 - np.sum((valid_twilight_y - y_pred)**2) / np.sum((valid_twilight_y - np.mean(valid_twilight_y))**2) + r_squared = 1 - np.sum((valid_twilight_y - y_pred) ** 2) / np.sum( + (valid_twilight_y - np.mean(valid_twilight_y)) ** 2 + ) if r_squared < 0: mean_rate = np.mean(valid_twilight_y) twilight_models[surface_type] = { - 'params': [mean_rate], - 'r2': 0.0, - 'model': 
lambda x: np.full_like(x, mean_rate) + "params": [mean_rate], + "r2": 0.0, + "model": lambda x: np.full_like(x, mean_rate), } - print(f"Twilight model ({surface_type}) failed with R² = {r_squared:.3f}. Using constant model: {mean_rate:.2e} ph/s") - - twilight_mask = (df['classification'] == 'Twilight') & (df['surface_type'] == surface_type) - valid_twilight_mask = twilight_mask & df['solar_elevation'].notna() & np.isfinite(df['solar_elevation']) - df.loc[valid_twilight_mask, 'solar_background_rate'] = mean_rate - df.loc[twilight_mask & ~valid_twilight_mask, 'solar_background_rate'] = non_solar_rate + print( + f"Twilight model ({surface_type}) failed with R² = {r_squared:.3f}. Using constant model: {mean_rate:.2e} ph/s" + ) + + twilight_mask = (df["classification"] == "Twilight") & ( + df["surface_type"] == surface_type + ) + valid_twilight_mask = ( + twilight_mask + & df["solar_elevation"].notna() + & np.isfinite(df["solar_elevation"]) + ) + df.loc[valid_twilight_mask, "solar_background_rate"] = mean_rate + df.loc[ + twilight_mask & ~valid_twilight_mask, "solar_background_rate" + ] = non_solar_rate else: - print(f"Twilight model ({surface_type}): rate = {popt[0]:.2e} * exp(-{popt[1]:.2f} * solar_elevation^2) + {popt[2]:.2e}, R² = {r_squared:.3f}") - twilight_models[surface_type] = {'params': popt, 'r2': r_squared, 'model': lambda x: twilight_model(x, *popt)} - - twilight_mask = (df['classification'] == 'Twilight') & (df['surface_type'] == surface_type) - valid_twilight_mask = twilight_mask & df['solar_elevation'].notna() & np.isfinite(df['solar_elevation']) - x_for_model = df.loc[valid_twilight_mask, 'solar_elevation'] - df.loc[valid_twilight_mask, 'solar_background_rate'] = twilight_models[surface_type]['model'](x_for_model).clip(lower=0) - df.loc[twilight_mask & ~valid_twilight_mask, 'solar_background_rate'] = non_solar_rate + print( + f"Twilight model ({surface_type}): rate = {popt[0]:.2e} * exp(-{popt[1]:.2f} * solar_elevation^2) + {popt[2]:.2e}, R² = 
{r_squared:.3f}" + ) + twilight_models[surface_type] = { + "params": popt, + "r2": r_squared, + "model": lambda x: twilight_model(x, *popt), + } + + twilight_mask = (df["classification"] == "Twilight") & ( + df["surface_type"] == surface_type + ) + valid_twilight_mask = ( + twilight_mask + & df["solar_elevation"].notna() + & np.isfinite(df["solar_elevation"]) + ) + x_for_model = df.loc[valid_twilight_mask, "solar_elevation"] + df.loc[valid_twilight_mask, "solar_background_rate"] = ( + twilight_models[ + surface_type + ]["model"](x_for_model).clip(lower=0) + ) + df.loc[ + twilight_mask & ~valid_twilight_mask, "solar_background_rate" + ] = non_solar_rate except RuntimeError as e: - print(f"Twilight model fitting failed for {surface_type}: {e}. Setting twilight rates to non-solar rate.") - df.loc[(df['classification'] == 'Twilight') & (df['surface_type'] == surface_type), 'solar_background_rate'] = non_solar_rate + print( + f"Twilight model fitting failed for {surface_type}: {e}. Setting twilight rates to non-solar rate." + ) + df.loc[ + (df["classification"] == "Twilight") + & (df["surface_type"] == surface_type), + "solar_background_rate", + ] = non_solar_rate else: - print("Too few twilight points for modeling. Setting twilight rates to non-solar rate.") - df.loc[df['classification'] == 'Twilight', 'solar_background_rate'] = non_solar_rate + print( + "Too few twilight points for modeling. Setting twilight rates to non-solar rate." + ) + df.loc[df["classification"] == "Twilight", "solar_background_rate"] = ( + non_solar_rate + ) return df, twilight_models -def create_diagnostic_plot(df, daytime_df, twilight_df, daytime_models, twilight_models, non_solar_rate, output_dir): + +def create_diagnostic_plot( + df, + daytime_df, + twilight_df, + daytime_models, + twilight_models, + non_solar_rate, + output_dir, +): """Create diagnostic plot of background rate vs. 
solar elevation.""" plt.figure(figsize=(10, 6)) - plt.scatter(df['solar_elevation'], df['total_background_rate'], c='gray', alpha=0.5, label='Total') - plt.scatter(daytime_df[daytime_df['surface_type'] == 'Land']['solar_elevation'], - daytime_df[daytime_df['surface_type'] == 'Land']['solar_background_rate'], - c='blue', alpha=0.5, label='Daytime Land') - plt.scatter(daytime_df[daytime_df['surface_type'] == 'Ocean']['solar_elevation'], - daytime_df[daytime_df['surface_type'] == 'Ocean']['solar_background_rate'], - c='cyan', alpha=0.5, label='Daytime Ocean') - plt.scatter(twilight_df[twilight_df['surface_type'] == 'Land']['solar_elevation'], - twilight_df[twilight_df['surface_type'] == 'Land']['solar_background_rate'], - c='orange', alpha=0.5, label='Twilight Land') - plt.scatter(twilight_df[twilight_df['surface_type'] == 'Ocean']['solar_elevation'], - twilight_df[twilight_df['surface_type'] == 'Ocean']['solar_background_rate'], - c='yellow', alpha=0.5, label='Twilight Ocean') - plt.axhline(non_solar_rate, color='r', linestyle='--', label=f'Non-Solar: {non_solar_rate:.2e} ph/s') + plt.scatter( + df["solar_elevation"], + df["total_background_rate"], + c="gray", + alpha=0.5, + label="Total", + ) + plt.scatter( + daytime_df[daytime_df["surface_type"] == "Land"]["solar_elevation"], + daytime_df[daytime_df["surface_type"] == "Land"]["solar_background_rate"], + c="blue", + alpha=0.5, + label="Daytime Land", + ) + plt.scatter( + daytime_df[daytime_df["surface_type"] == "Ocean"]["solar_elevation"], + daytime_df[daytime_df["surface_type"] == "Ocean"]["solar_background_rate"], + c="cyan", + alpha=0.5, + label="Daytime Ocean", + ) + plt.scatter( + twilight_df[twilight_df["surface_type"] == "Land"]["solar_elevation"], + twilight_df[twilight_df["surface_type"] == "Land"]["solar_background_rate"], + c="orange", + alpha=0.5, + label="Twilight Land", + ) + plt.scatter( + twilight_df[twilight_df["surface_type"] == "Ocean"]["solar_elevation"], + twilight_df[twilight_df["surface_type"] 
== "Ocean"]["solar_background_rate"], + c="yellow", + alpha=0.5, + label="Twilight Ocean", + ) + plt.axhline( + non_solar_rate, + color="r", + linestyle="--", + label=f"Non-Solar: {non_solar_rate:.2e} ph/s", + ) # Plot daytime fits for surface_type in daytime_models: - if daytime_models[surface_type]['model'] is not None: - x_fit = np.linspace(min(daytime_df['solar_elevation']), max(daytime_df['solar_elevation']), 100) - y_fit = daytime_models[surface_type]['model'](x_fit if 'power' not in daytime_models[surface_type] else x_fit - min(daytime_df['solar_elevation']) + 1) - plt.plot(x_fit, y_fit, label=f'Daytime {surface_type} Fit (R²={daytime_models[surface_type]["r2"]:.3f})', - linestyle='-' if surface_type == 'Land' else '--') + if daytime_models[surface_type]["model"] is not None: + x_fit = np.linspace( + min(daytime_df["solar_elevation"]), + max(daytime_df["solar_elevation"]), + 100, + ) + y_fit = daytime_models[surface_type]["model"]( + x_fit + if "power" not in daytime_models[surface_type] + else x_fit - min(daytime_df["solar_elevation"]) + 1 + ) + plt.plot( + x_fit, + y_fit, + label=f'Daytime {surface_type} Fit (R²={daytime_models[surface_type]["r2"]:.3f})', + linestyle="-" if surface_type == "Land" else "--", + ) # Plot twilight fits for surface_type in twilight_models: - if twilight_models[surface_type]['model'] is not None: - x_twilight = np.linspace(min(twilight_df['solar_elevation']), max(twilight_df['solar_elevation']), 100) - y_twilight = twilight_models[surface_type]['model'](x_twilight) - plt.plot(x_twilight, y_twilight, label=f'Twilight {surface_type} Fit (R²={twilight_models[surface_type]["r2"]:.3f})', - linestyle='-' if surface_type == 'Land' else '--') + if twilight_models[surface_type]["model"] is not None: + x_twilight = np.linspace( + min(twilight_df["solar_elevation"]), + max(twilight_df["solar_elevation"]), + 100, + ) + y_twilight = twilight_models[surface_type]["model"](x_twilight) + plt.plot( + x_twilight, + y_twilight, + label=f'Twilight 
{surface_type} Fit (R²={twilight_models[surface_type]["r2"]:.3f})', + linestyle="-" if surface_type == "Land" else "--", + ) - plt.yscale('log') + plt.yscale("log") plt.ylim(1e3, 1e8) - plt.xlabel('Solar Elevation (degrees)') - plt.ylabel('Background Rate (ph/s)') + plt.xlabel("Solar Elevation (degrees)") + plt.ylabel("Background Rate (ph/s)") plt.legend() - plt.title('Background Rate vs. Solar Elevation (Land vs. Ocean)') + plt.title("Background Rate vs. Solar Elevation (Land vs. Ocean)") plt.savefig(os.path.join(output_dir, "background_vs_elevation_exp_fit_2021.png")) plt.close() + def create_calibration_plot(df, daytime_df, output_dir): """Create a calibration plot using mean photon height as a proxy.""" if not daytime_df.empty: sample_file = daytime_df.iloc[0] - heights = sample_file['mean_photon_height'] + heights = sample_file["mean_photon_height"] if np.isfinite(heights): - solar_rate = sample_file['solar_background_rate'] + solar_rate = sample_file["solar_background_rate"] bin_size = 0.1 - hist, edges = np.histogram(np.array([heights] * 1000), bins=np.arange(min(heights) - 1, max(heights) + 1, bin_size)) - photons_per_bin = solar_rate * (1000 / np.sum(hist)) * bin_size / (max(heights) - min(heights) + 2) + hist, edges = np.histogram( + np.array([heights] * 1000), + bins=np.arange(min(heights) - 1, max(heights) + 1, bin_size), + ) + photons_per_bin = ( + solar_rate + * (1000 / np.sum(hist)) + * bin_size + / (max(heights) - min(heights) + 2) + ) calibrated_counts = np.maximum(hist - photons_per_bin, 0) calibrated_heights = [] for i, count in enumerate(calibrated_counts): if count > 0: - calibrated_heights.extend(np.random.uniform(edges[i], edges[i+1], int(count))) + calibrated_heights.extend( + np.random.uniform(edges[i], edges[i + 1], int(count)) + ) plt.figure(figsize=(8, 6)) - plt.hist([heights] * 1000, bins=100, alpha=0.5, label='Original (Simulated)') - plt.hist(calibrated_heights, bins=100, alpha=0.5, label='Calibrated') - plt.xlabel('Photon Height 
(m)') - plt.ylabel('Count') + plt.hist( + [heights] * 1000, bins=100, alpha=0.5, label="Original (Simulated)" + ) + plt.hist(calibrated_heights, bins=100, alpha=0.5, label="Calibrated") + plt.xlabel("Photon Height (m)") + plt.ylabel("Count") plt.legend() plt.title(f'Calibration Example: {sample_file["file"]}') plt.savefig(os.path.join(output_dir, "calibration_example.png")) plt.close() + def save_results(df, output_dir): """Save the results to a CSV file.""" df.to_csv(os.path.join(output_dir, "atl03_analysis_updated.csv"), index=False) + # New function to calculate averages def calculate_background_averages(daytime_df, nighttime_df): """Calculate average background rates for daytime and nighttime, land and ocean.""" # Calculate averages, ignoring NaN values - daytime_land_avg = np.nanmean(daytime_df['total_background_rate_land']) - daytime_ocean_avg = np.nanmean(daytime_df['total_background_rate_ocean']) - nighttime_land_avg = np.nanmean(nighttime_df['total_background_rate_land']) - nighttime_ocean_avg = np.nanmean(nighttime_df['total_background_rate_ocean']) + daytime_land_avg = np.nanmean(daytime_df["total_background_rate_land"]) + daytime_ocean_avg = np.nanmean(daytime_df["total_background_rate_ocean"]) + nighttime_land_avg = np.nanmean(nighttime_df["total_background_rate_land"]) + nighttime_ocean_avg = np.nanmean(nighttime_df["total_background_rate_ocean"]) # Log the averages logger.info(f"Average Daytime Land Background Rate: {daytime_land_avg:.2e} ph/s") logger.info(f"Average Daytime Ocean Background Rate: {daytime_ocean_avg:.2e} ph/s") - logger.info(f"Average Nighttime Land Background Rate: {nighttime_land_avg:.2e} ph/s") - logger.info(f"Average Nighttime Ocean Background Rate: {nighttime_ocean_avg:.2e} ph/s") + logger.info( + f"Average Nighttime Land Background Rate: {nighttime_land_avg:.2e} ph/s" + ) + logger.info( + f"Average Nighttime Ocean Background Rate: {nighttime_ocean_avg:.2e} ph/s" + ) return { - 'daytime_land': daytime_land_avg, - 
'daytime_ocean': daytime_ocean_avg, - 'nighttime_land': nighttime_land_avg, - 'nighttime_ocean': nighttime_ocean_avg + "daytime_land": daytime_land_avg, + "daytime_ocean": daytime_ocean_avg, + "nighttime_land": nighttime_land_avg, + "nighttime_ocean": nighttime_ocean_avg, } + # New function to plot averages def plot_background_averages(averages, output_dir): """Plot the average background rates for daytime and nighttime, land and ocean.""" - labels = ['Daytime Land', 'Daytime Ocean', 'Nighttime Land', 'Nighttime Ocean'] + labels = ["Daytime Land", "Daytime Ocean", "Nighttime Land", "Nighttime Ocean"] rates = [ - averages['daytime_land'] if not pd.isna(averages['daytime_land']) else 0, - averages['daytime_ocean'] if not pd.isna(averages['daytime_ocean']) else 0, - averages['nighttime_land'] if not pd.isna(averages['nighttime_land']) else 0, - averages['nighttime_ocean'] if not pd.isna(averages['nighttime_ocean']) else 0 + averages["daytime_land"] if not pd.isna(averages["daytime_land"]) else 0, + averages["daytime_ocean"] if not pd.isna(averages["daytime_ocean"]) else 0, + averages["nighttime_land"] if not pd.isna(averages["nighttime_land"]) else 0, + averages["nighttime_ocean"] if not pd.isna(averages["nighttime_ocean"]) else 0, ] plt.figure(figsize=(8, 6)) - bars = plt.bar(labels, rates, color=['blue', 'cyan', 'purple', 'lightgreen']) - - plt.ylabel('Average Background Rate (ph/s)') - plt.title('Average Background Rate Comparison\n(Daytime vs. Nighttime, Land vs. Ocean)') - plt.yscale('log') + bars = plt.bar(labels, rates, color=["blue", "cyan", "purple", "lightgreen"]) + + plt.ylabel("Average Background Rate (ph/s)") + plt.title( + "Average Background Rate Comparison\n(Daytime vs. Nighttime, Land vs. 
Ocean)" + ) + plt.yscale("log") plt.ylim(1e3, 1e8) for bar in bars: yval = bar.get_height() - plt.text(bar.get_x() + bar.get_width()/2, yval, f'{yval:.2e}', ha='center', va='bottom') + plt.text( + bar.get_x() + bar.get_width() / 2, + yval, + f"{yval:.2e}", + ha="center", + va="bottom", + ) plt.savefig(os.path.join(output_dir, "average_background_rate_comparison.png")) plt.close() logger.info("Average background rate comparison plot generated.") -def analyze_icesat2_data(atl03_directory, beam='gt1l', twilight_threshold=6.0, output_dir="output", shoreline_data_path=None): + +def analyze_icesat2_data( + atl03_directory, + beam="gt1l", + twilight_threshold=6.0, + output_dir="output", + shoreline_data_path=None, +): """Main analysis function orchestrating the workflow.""" if not os.path.exists(output_dir): os.makedirs(output_dir) @@ -459,17 +746,22 @@ def analyze_icesat2_data(atl03_directory, beam='gt1l', twilight_threshold=6.0, o df = load_and_process_files(atl03_directory, beam, shoreline_data_path) if df.empty: raise ValueError("No data processed successfully. 
Check logs for errors.") - + df = assign_surface_type(df) - + # Compute total_background_rate based on surface_type - df['total_background_rate'] = np.where( - df['surface_type'] == 'Land', df['total_background_rate_land'], - np.where(df['surface_type'] == 'Ocean', df['total_background_rate_ocean'], np.nan) + df["total_background_rate"] = np.where( + df["surface_type"] == "Land", + df["total_background_rate_land"], + np.where( + df["surface_type"] == "Ocean", df["total_background_rate_ocean"], np.nan + ), + ) + + df, daytime_df, nighttime_df, twilight_df, unknown_df = classify_data( + df, twilight_threshold ) - - df, daytime_df, nighttime_df, twilight_df, unknown_df = classify_data(df, twilight_threshold) - + averages = calculate_background_averages(daytime_df, nighttime_df) plot_background_averages(averages, output_dir) @@ -479,5 +771,10 @@ def analyze_icesat2_data(atl03_directory, beam='gt1l', twilight_threshold=6.0, o atl03_directory = os.path.join(args.workspace_path, args.atl03_path) shoreline_data_path = os.path.join(args.workspace_path, args.shoreline_data) output_dir = os.path.join(args.workspace_path, args.output_path) - analyze_icesat2_data(atl03_directory, args.beam, output_dir=output_dir, shoreline_data_path=shoreline_data_path) - print("Analysis complete.") \ No newline at end of file + analyze_icesat2_data( + atl03_directory, + args.beam, + output_dir=output_dir, + shoreline_data_path=shoreline_data_path, + ) + print("Analysis complete.") diff --git a/aok/core/kd_utils/SolarBckgrd/analyze_nighttime_BG_rate_icesat2.py b/aok/core/kd_utils/SolarBckgrd/analyze_nighttime_BG_rate_icesat2.py index bded772..48dfa66 100644 --- a/aok/core/kd_utils/SolarBckgrd/analyze_nighttime_BG_rate_icesat2.py +++ b/aok/core/kd_utils/SolarBckgrd/analyze_nighttime_BG_rate_icesat2.py @@ -1,84 +1,95 @@ -import h5py +import logging +from multiprocessing import Pool import os + +from config import get_args +import geopandas as gpd +import h5py +import matplotlib.pyplot as plt 
+from noise_utils.data_processing import isolate_sea_land_photons import numpy as np import pandas as pd -import geopandas as gpd -from shapely.geometry import Point from scipy.interpolate import interp1d from scipy.optimize import curve_fit -import matplotlib.pyplot as plt -from multiprocessing import Pool -from config import get_args -from noise_utils.interpolation import geoid_correction, refraction_correction, interpolate_labels, apply_interpolation -from noise_utils.data_processing import isolate_sea_land_photons - -import logging # Set up logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) + # Define model functions def power_model(x, a, b, c): return a * np.power(x, b) + c + def exp_model(x, a, b, c): """Exponential model: a * exp(b * x) + c""" return a * np.exp(b * x) + c + def twilight_model(x, a, b, c): return a * np.exp(-b * x**2) + c # Gaussian form for twilight + def process_file(args): """Process a single ATL03 file, skipping files missing required datasets.""" file, beam, shoreline_data_path = args try: - with h5py.File(file, 'r') as f: + with h5py.File(file, "r") as f: # Check for required datasets required_keys = [ - f'{beam}/bckgrd_atlas/bckgrd_rate', - f'{beam}/bckgrd_atlas/delta_time', - f'{beam}/heights/delta_time', - f'{beam}/heights/h_ph', - f'{beam}/geolocation/reference_photon_lat', - f'{beam}/geolocation/reference_photon_lon', - f'{beam}/geolocation/solar_azimuth', - f'{beam}/geolocation/solar_elevation' + f"{beam}/bckgrd_atlas/bckgrd_rate", + f"{beam}/bckgrd_atlas/delta_time", + f"{beam}/heights/delta_time", + f"{beam}/heights/h_ph", + f"{beam}/geolocation/reference_photon_lat", + f"{beam}/geolocation/reference_photon_lon", + f"{beam}/geolocation/solar_azimuth", + f"{beam}/geolocation/solar_elevation", ] for key in required_keys: if key not in f: raise KeyError(f"Missing dataset: {key}") - bckgrd_rate = f[f'{beam}/bckgrd_atlas/bckgrd_rate'][:].astype(np.float64) - bckgrd_time = 
f[f'{beam}/bckgrd_atlas/delta_time'][:] - heights = f[f'{beam}/heights/h_ph'][:] - segment_lat = f[f'{beam}/geolocation/reference_photon_lat'][:] - segment_lon = f[f'{beam}/geolocation/reference_photon_lon'][:] - segment_delta_time = f[f'{beam}/geolocation/delta_time'][:] - - Segment_Is_Land = pd.DataFrame({ - 'latitude': segment_lat, # Use unique column names to avoid conflicts - 'longitude': segment_lon, - 'delta_time': segment_delta_time - }) - + bckgrd_rate = f[f"{beam}/bckgrd_atlas/bckgrd_rate"][:].astype(np.float64) + bckgrd_time = f[f"{beam}/bckgrd_atlas/delta_time"][:] + heights = f[f"{beam}/heights/h_ph"][:] + segment_lat = f[f"{beam}/geolocation/reference_photon_lat"][:] + segment_lon = f[f"{beam}/geolocation/reference_photon_lon"][:] + segment_delta_time = f[f"{beam}/geolocation/delta_time"][:] + + Segment_Is_Land = pd.DataFrame( + { + "latitude": segment_lat, # Use unique column names to avoid conflicts + "longitude": segment_lon, + "delta_time": segment_delta_time, + } + ) + # Create a GeoDataFrame with geometry - Segment_Is_Land['geometry'] = gpd.points_from_xy(Segment_Is_Land['longitude'], Segment_Is_Land['latitude']) + Segment_Is_Land["geometry"] = gpd.points_from_xy( + Segment_Is_Land["longitude"], Segment_Is_Land["latitude"] + ) ICESat2_GDF = gpd.GeoDataFrame(Segment_Is_Land, crs="EPSG:4326") # Determine land/ocean classification - Segment_Is_Land_Labels = isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF) - ICESat2_GDF['is_land'] = Segment_Is_Land_Labels + Segment_Is_Land_Labels = isolate_sea_land_photons( + shoreline_data_path, ICESat2_GDF + ) + ICESat2_GDF["is_land"] = Segment_Is_Land_Labels # Interpolate is_land labels to bckgrd_time interp_is_land = interp1d( - ICESat2_GDF['delta_time'], - ICESat2_GDF['is_land'].astype(int), + ICESat2_GDF["delta_time"], + ICESat2_GDF["is_land"].astype(int), bounds_error=False, - fill_value=(ICESat2_GDF['is_land'].astype(int).iloc[0], ICESat2_GDF['is_land'].astype(int).iloc[-1]) + fill_value=( + 
ICESat2_GDF["is_land"].astype(int).iloc[0], + ICESat2_GDF["is_land"].astype(int).iloc[-1], + ), ) is_land_interpolated = interp_is_land(bckgrd_time).round().astype(int) - + # ##########Debug################################# # # Interpolate latitude and longitude to bckgrd_time # interp_lat = interp1d( @@ -108,7 +119,7 @@ def process_file(args): # geometry=gpd.points_from_xy(is_land_df['longitude'], is_land_df['latitude']), # crs="EPSG:4326" # ) - + # # Save to a GeoPackage file # file_dir = os.path.dirname(file) # is_land_output_dir = os.path.join(file_dir, "is_land_interpolated") @@ -122,187 +133,311 @@ def process_file(args): # Compute total background rates for land and ocean land_mask = is_land_interpolated == 1 ocean_mask = is_land_interpolated == 0 - total_background_rate_land = np.mean(bckgrd_rate[land_mask]) if np.sum(land_mask) > 0 else np.nan - total_background_rate_ocean = np.mean(bckgrd_rate[ocean_mask]) if np.sum(ocean_mask) > 0 else np.nan + total_background_rate_land = ( + np.mean(bckgrd_rate[land_mask]) if np.sum(land_mask) > 0 else np.nan + ) + total_background_rate_ocean = ( + np.mean(bckgrd_rate[ocean_mask]) if np.sum(ocean_mask) > 0 else np.nan + ) # Solar elevation handling FILL_VALUE = 3.4028235e38 - solar_elevations = f[f'{beam}/geolocation/solar_elevation'][:].astype(np.float64) - valid_solar = solar_elevations[(solar_elevations != FILL_VALUE) & (np.abs(solar_elevations) < 1e30) & np.isfinite(solar_elevations)] + solar_elevations = f[f"{beam}/geolocation/solar_elevation"][:].astype( + np.float64 + ) + valid_solar = solar_elevations[ + (solar_elevations != FILL_VALUE) + & (np.abs(solar_elevations) < 1e30) + & np.isfinite(solar_elevations) + ] solar_elevation = np.nan if valid_solar.size == 0 else np.mean(valid_solar) mean_photon_height = np.mean(heights) if heights.size > 0 else np.nan result = { - 'file': os.path.basename(file), - 'total_background_rate_land': total_background_rate_land, - 'total_background_rate_ocean': 
total_background_rate_ocean, - 'solar_elevation': solar_elevation, - 'mean_photon_height': mean_photon_height + "file": os.path.basename(file), + "total_background_rate_land": total_background_rate_land, + "total_background_rate_ocean": total_background_rate_ocean, + "solar_elevation": solar_elevation, + "mean_photon_height": mean_photon_height, } logger.info(f"Processed {file}: {result}") return result except Exception as e: - logger.error(f"Error processing {file}: {str(e)}") - return {'file': os.path.basename(file), 'error': str(e)} - -def fit_surface_model(df, classification, surface_type, model_func, p0, bounds, x_col='solar_elevation', y_col='solar_background_rate'): + logger.error(f"Error processing {file}: {e!s}") + return {"file": os.path.basename(file), "error": str(e)} + + +def fit_surface_model( + df, + classification, + surface_type, + model_func, + p0, + bounds, + x_col="solar_elevation", + y_col="solar_background_rate", +): """Fit a model to a specific classification and surface type.""" - subset = df[(df['classification'] == classification) & (df['surface_type'] == surface_type)].copy() + subset = df[ + (df["classification"] == classification) & (df["surface_type"] == surface_type) + ].copy() if len(subset) < 3: print(f"Warning: Insufficient data for {classification} {surface_type} model.") return None, None, None - mask = subset[x_col].notna() & subset[y_col].notna() & np.isfinite(subset[x_col]) & np.isfinite(subset[y_col]) + mask = ( + subset[x_col].notna() + & subset[y_col].notna() + & np.isfinite(subset[x_col]) + & np.isfinite(subset[y_col]) + ) valid_x = subset.loc[mask, x_col] valid_y = subset.loc[mask, y_col] if len(valid_x) < 3: - print(f"Warning: Fewer than 3 valid points for {classification} {surface_type} model.") + print( + f"Warning: Fewer than 3 valid points for {classification} {surface_type} model." 
+ ) return None, None, None try: - popt, _ = curve_fit(model_func, valid_x, valid_y, p0=p0, bounds=bounds, maxfev=20000) + popt, _ = curve_fit( + model_func, valid_x, valid_y, p0=p0, bounds=bounds, maxfev=20000 + ) y_pred = model_func(valid_x, *popt) - r2 = 1 - np.sum((valid_y - y_pred)**2) / np.sum((valid_y - np.mean(valid_y))**2) + r2 = 1 - np.sum((valid_y - y_pred) ** 2) / np.sum( + (valid_y - np.mean(valid_y)) ** 2 + ) return popt, r2, lambda x: model_func(x, *popt) except RuntimeError as e: print(f"Fit failed for {classification} {surface_type}: {e}") return None, None, None + def load_and_process_files(atl03_directory, beam, shoreline_data_path): """Load and process ATL03 files in parallel.""" - atl03_files = [os.path.join(atl03_directory, f) for f in os.listdir(atl03_directory) if f.endswith('.h5') and 'ATL03' in f] + atl03_files = [ + os.path.join(atl03_directory, f) + for f in os.listdir(atl03_directory) + if f.endswith(".h5") and "ATL03" in f + ] print(f"Found {len(atl03_files)} ATL03 files.") - + if not atl03_files: raise ValueError("No ATL03 files found in the directory.") with Pool() as pool: - results = pool.map(process_file, [(f, beam, shoreline_data_path) for f in atl03_files]) - + results = pool.map( + process_file, [(f, beam, shoreline_data_path) for f in atl03_files] + ) + print("Results from process_file:", results) # Debug output df = pd.DataFrame(results) - - - if 'error' in df.columns: - error_files = df[df['error'].notna()]['file'].tolist() + + if "error" in df.columns: + error_files = df[df["error"].notna()]["file"].tolist() print(f"Errors in {len(error_files)} files: {error_files}") - df = df[df['error'].isna()].drop(columns=['error']) - + df = df[df["error"].isna()].drop(columns=["error"]) + if df.empty: - raise ValueError("No files were successfully processed. Check the error messages above.") - + raise ValueError( + "No files were successfully processed. Check the error messages above." 
+ ) + print("DataFrame columns after processing:", df.columns) return df + def assign_surface_type(df): """Assign surface type based on background rates.""" - required_columns = ['total_background_rate_land', 'total_background_rate_ocean'] + required_columns = ["total_background_rate_land", "total_background_rate_ocean"] missing_columns = [col for col in required_columns if col not in df.columns] if missing_columns: raise KeyError(f"Missing required columns in DataFrame: {missing_columns}") - df['surface_type'] = np.where( - df['total_background_rate_land'].notna() & (df['total_background_rate_land'] > 0), - 'Land', - np.where(df['total_background_rate_ocean'].notna() & (df['total_background_rate_ocean'] > 0), 'Ocean', 'Mixed') + df["surface_type"] = np.where( + df["total_background_rate_land"].notna() + & (df["total_background_rate_land"] > 0), + "Land", + np.where( + df["total_background_rate_ocean"].notna() + & (df["total_background_rate_ocean"] > 0), + "Ocean", + "Mixed", + ), ) return df + def classify_data(df, twilight_threshold=6.0): """Classify data into Daytime, Nighttime, Twilight, or Unknown.""" - df['classification'] = np.where( - df['solar_elevation'].isna(), 'Unknown', - np.where(df['solar_elevation'] > twilight_threshold, 'Daytime', - np.where(df['solar_elevation'] < -twilight_threshold, 'Nighttime', 'Twilight')) + df["classification"] = np.where( + df["solar_elevation"].isna(), + "Unknown", + np.where( + df["solar_elevation"] > twilight_threshold, + "Daytime", + np.where( + df["solar_elevation"] < -twilight_threshold, "Nighttime", "Twilight" + ), + ), + ) + daytime_df = df[df["classification"] == "Daytime"].copy() + nighttime_df = df[df["classification"] == "Nighttime"].copy() + twilight_df = df[df["classification"] == "Twilight"].copy() + unknown_df = df[df["classification"] == "Unknown"].copy() + print( + f"Daytime: {len(daytime_df)}, Nighttime: {len(nighttime_df)}, Twilight: {len(twilight_df)}, Unknown: {len(unknown_df)}" ) - daytime_df = 
df[df['classification'] == 'Daytime'].copy() - nighttime_df = df[df['classification'] == 'Nighttime'].copy() - twilight_df = df[df['classification'] == 'Twilight'].copy() - unknown_df = df[df['classification'] == 'Unknown'].copy() - print(f"Daytime: {len(daytime_df)}, Nighttime: {len(nighttime_df)}, Twilight: {len(twilight_df)}, Unknown: {len(unknown_df)}") return df, daytime_df, nighttime_df, twilight_df, unknown_df + def calculate_non_solar_rate(nighttime_df): """Calculate the non-solar background rate from nighttime data.""" - non_solar_rate_land = np.nanmean(nighttime_df['total_background_rate_land']) - non_solar_rate_ocean = np.nanmean(nighttime_df['total_background_rate_ocean']) + non_solar_rate_land = np.nanmean(nighttime_df["total_background_rate_land"]) + non_solar_rate_ocean = np.nanmean(nighttime_df["total_background_rate_ocean"]) non_solar_rate = np.nanmean([non_solar_rate_land, non_solar_rate_ocean]) print(f"Non-solar background rate (land): {non_solar_rate_land:.2e} ph/s") print(f"Non-solar background rate (ocean): {non_solar_rate_ocean:.2e} ph/s") print(f"Non-solar background rate (combined): {non_solar_rate:.2e} ph/s") return non_solar_rate + def compute_solar_background_rates(df, non_solar_rate): """Compute solar background rates for land and ocean.""" - df['solar_background_rate_land'] = (df['total_background_rate_land'] - non_solar_rate).clip(lower=0) - df['solar_background_rate_ocean'] = (df['total_background_rate_ocean'] - non_solar_rate).clip(lower=0) - df['solar_background_rate'] = np.where( - df['surface_type'] == 'Land', df['solar_background_rate_land'], - np.where(df['surface_type'] == 'Ocean', df['solar_background_rate_ocean'], np.nan) + df["solar_background_rate_land"] = ( + df["total_background_rate_land"] - non_solar_rate + ).clip(lower=0) + df["solar_background_rate_ocean"] = ( + df["total_background_rate_ocean"] - non_solar_rate + ).clip(lower=0) + df["solar_background_rate"] = np.where( + df["surface_type"] == "Land", + 
df["solar_background_rate_land"], + np.where( + df["surface_type"] == "Ocean", df["solar_background_rate_ocean"], np.nan + ), ) return df + def fit_daytime_model(df, daytime_df, non_solar_rate): """Fit daytime models for land and ocean.""" daytime_models = {} - for surface_type in ['Land', 'Ocean']: - subset = daytime_df[daytime_df['surface_type'] == surface_type].copy() + for surface_type in ["Land", "Ocean"]: + subset = daytime_df[daytime_df["surface_type"] == surface_type].copy() if len(subset) < 3: print(f"Warning: Insufficient daytime {surface_type} data for modeling.") continue - mask = subset['solar_elevation'].notna() & subset['solar_background_rate'].notna() & \ - np.isfinite(subset['solar_elevation']) & np.isfinite(subset['solar_background_rate']) - valid_x = subset.loc[mask, 'solar_elevation'] - valid_y = subset.loc[mask, 'solar_background_rate'] + mask = ( + subset["solar_elevation"].notna() + & subset["solar_background_rate"].notna() + & np.isfinite(subset["solar_elevation"]) + & np.isfinite(subset["solar_background_rate"]) + ) + valid_x = subset.loc[mask, "solar_elevation"] + valid_y = subset.loc[mask, "solar_background_rate"] if len(valid_x) < 3: - print(f"Warning: Fewer than 3 valid points for daytime {surface_type} model.") + print( + f"Warning: Fewer than 3 valid points for daytime {surface_type} model." 
+ ) continue # Remove outliers log_y = np.log10(valid_y + 1) mean_log_y = np.mean(log_y) std_log_y = np.std(log_y) - outlier_mask = (log_y > mean_log_y - 3 * std_log_y) & (log_y < mean_log_y + 3 * std_log_y) + outlier_mask = (log_y > mean_log_y - 3 * std_log_y) & ( + log_y < mean_log_y + 3 * std_log_y + ) valid_x = valid_x[outlier_mask] valid_y = valid_y[outlier_mask] print(f"Daytime regression ({surface_type}): {len(valid_x)} valid data points") try: p0_exp = [np.max(valid_y) - non_solar_rate, 0.05, non_solar_rate] - bounds_exp = ([0, 0.01, non_solar_rate * 0.9], [1e8, 0.5, non_solar_rate * 1.1]) - popt_exp, _ = curve_fit(exp_model, valid_x, valid_y, p0=p0_exp, bounds=bounds_exp, maxfev=20000) - r2_exp = 1 - np.sum((valid_y - exp_model(valid_x, *popt_exp))**2) / np.sum((valid_y - np.mean(valid_y))**2) + bounds_exp = ( + [0, 0.01, non_solar_rate * 0.9], + [1e8, 0.5, non_solar_rate * 1.1], + ) + popt_exp, _ = curve_fit( + exp_model, valid_x, valid_y, p0=p0_exp, bounds=bounds_exp, maxfev=20000 + ) + r2_exp = 1 - np.sum( + (valid_y - exp_model(valid_x, *popt_exp)) ** 2 + ) / np.sum((valid_y - np.mean(valid_y)) ** 2) x_shifted = valid_x - min(valid_x) + 1 p0_power = [np.max(valid_y) - non_solar_rate, 0.6, non_solar_rate] - bounds_power = ([0, 0.1, non_solar_rate * 0.9], [1e8, 1.0, non_solar_rate * 1.1]) - popt_power, _ = curve_fit(power_model, x_shifted, valid_y, p0=p0_power, bounds=bounds_power, maxfev=20000) - r2_power = 1 - np.sum((valid_y - power_model(x_shifted, *popt_power))**2) / np.sum((valid_y - np.mean(valid_y))**2) + bounds_power = ( + [0, 0.1, non_solar_rate * 0.9], + [1e8, 1.0, non_solar_rate * 1.1], + ) + popt_power, _ = curve_fit( + power_model, + x_shifted, + valid_y, + p0=p0_power, + bounds=bounds_power, + maxfev=20000, + ) + r2_power = 1 - np.sum( + (valid_y - power_model(x_shifted, *popt_power)) ** 2 + ) / np.sum((valid_y - np.mean(valid_y)) ** 2) if r2_power > r2_exp: - print(f"Daytime model ({surface_type}, power law): rate = {popt_power[0]:.2e} * 
solar_elevation^{popt_power[1]:.2f} + {popt_power[2]:.2e}, R² = {r2_power:.3f}") - daytime_models[surface_type] = {'params': popt_power, 'r2': r2_power, 'model': lambda x: power_model(x - min(valid_x) + 1, *popt_power)} + print( + f"Daytime model ({surface_type}, power law): rate = {popt_power[0]:.2e} * solar_elevation^{popt_power[1]:.2f} + {popt_power[2]:.2e}, R² = {r2_power:.3f}" + ) + daytime_models[surface_type] = { + "params": popt_power, + "r2": r2_power, + "model": lambda x: power_model(x - min(valid_x) + 1, *popt_power), + } else: - print(f"Daytime model ({surface_type}, exponential): rate = {popt_exp[0]:.2e} * exp({popt_exp[1]:.2f} * solar_elevation) + {popt_exp[2]:.2e}, R² = {r2_exp:.3f}") - daytime_models[surface_type] = {'params': popt_exp, 'r2': r2_exp, 'model': lambda x: exp_model(x, *popt_exp)} + print( + f"Daytime model ({surface_type}, exponential): rate = {popt_exp[0]:.2e} * exp({popt_exp[1]:.2f} * solar_elevation) + {popt_exp[2]:.2e}, R² = {r2_exp:.3f}" + ) + daytime_models[surface_type] = { + "params": popt_exp, + "r2": r2_exp, + "model": lambda x: exp_model(x, *popt_exp), + } # Apply model - daytime_mask = (df['classification'] == 'Daytime') & (df['surface_type'] == surface_type) - valid_daytime_mask = daytime_mask & df['solar_elevation'].notna() & np.isfinite(df['solar_elevation']) - x_for_model = df.loc[valid_daytime_mask, 'solar_elevation'] + daytime_mask = (df["classification"] == "Daytime") & ( + df["surface_type"] == surface_type + ) + valid_daytime_mask = ( + daytime_mask + & df["solar_elevation"].notna() + & np.isfinite(df["solar_elevation"]) + ) + x_for_model = df.loc[valid_daytime_mask, "solar_elevation"] if r2_power > r2_exp: x_for_model_shifted = x_for_model - min(valid_x) + 1 - df.loc[valid_daytime_mask, 'solar_background_rate'] = daytime_models[surface_type]['model'](x_for_model_shifted).clip(lower=0) + df.loc[valid_daytime_mask, "solar_background_rate"] = daytime_models[ + surface_type + 
]["model"](x_for_model_shifted).clip(lower=0) else: - df.loc[valid_daytime_mask, 'solar_background_rate'] = daytime_models[surface_type]['model'](x_for_model).clip(lower=0) - df.loc[daytime_mask & ~valid_daytime_mask, 'solar_background_rate'] = non_solar_rate + df.loc[valid_daytime_mask, "solar_background_rate"] = daytime_models[ + surface_type + ]["model"](x_for_model).clip(lower=0) + df.loc[daytime_mask & ~valid_daytime_mask, "solar_background_rate"] = ( + non_solar_rate + ) except RuntimeError as e: - print(f"Daytime model fitting failed for {surface_type}: {e}. Using non-solar rate as fallback.") - df.loc[(df['classification'] == 'Daytime') & (df['surface_type'] == surface_type), 'solar_background_rate'] = non_solar_rate + print( + f"Daytime model fitting failed for {surface_type}: {e}. Using non-solar rate as fallback." + ) + df.loc[ + (df["classification"] == "Daytime") + & (df["surface_type"] == surface_type), + "solar_background_rate", + ] = non_solar_rate return df, daytime_models @@ -310,161 +445,311 @@ def fit_twilight_model(df, twilight_df, non_solar_rate): """Fit twilight models for land and ocean.""" twilight_models = {} if len(twilight_df) > 3: - df.loc[df['classification'] == 'Twilight', 'solar_background_rate'] = ( - df['solar_background_rate'].fillna(0).clip(lower=0) + df.loc[df["classification"] == "Twilight", "solar_background_rate"] = ( + df["solar_background_rate"].fillna(0).clip(lower=0) ) - for surface_type in ['Land', 'Ocean']: - subset = twilight_df[twilight_df['surface_type'] == surface_type].copy() + for surface_type in ["Land", "Ocean"]: + subset = twilight_df[twilight_df["surface_type"] == surface_type].copy() if len(subset) < 3: - print(f"Warning: Insufficient twilight {surface_type} data for modeling.") + print( + f"Warning: Insufficient twilight {surface_type} data for modeling." 
+ ) continue - mask = subset['solar_elevation'].notna() & subset['solar_background_rate'].notna() & \ - np.isfinite(subset['solar_elevation']) & np.isfinite(subset['solar_background_rate']) - valid_twilight_x = subset.loc[mask, 'solar_elevation'] - valid_twilight_y = subset.loc[mask, 'solar_background_rate'] + mask = ( + subset["solar_elevation"].notna() + & subset["solar_background_rate"].notna() + & np.isfinite(subset["solar_elevation"]) + & np.isfinite(subset["solar_background_rate"]) + ) + valid_twilight_x = subset.loc[mask, "solar_elevation"] + valid_twilight_y = subset.loc[mask, "solar_background_rate"] if len(valid_twilight_x) < 3: - print(f"Warning: Fewer than 3 valid points for twilight {surface_type} model.") + print( + f"Warning: Fewer than 3 valid points for twilight {surface_type} model." + ) continue - print(f"Twilight regression ({surface_type}): {len(valid_twilight_x)} valid data points") + print( + f"Twilight regression ({surface_type}): {len(valid_twilight_x)} valid data points" + ) try: initial_a = np.max(valid_twilight_y) - non_solar_rate initial_b = 0.01 # Smaller b for a flatter Gaussian initial_c = non_solar_rate p0 = [initial_a, initial_b, initial_c] - bounds = ([0, 0.001, non_solar_rate * 0.9], [1e6, 1.0, non_solar_rate * 1.1]) - - popt, _ = curve_fit(twilight_model, valid_twilight_x, valid_twilight_y, p0=p0, bounds=bounds, maxfev=50000) + bounds = ( + [0, 0.001, non_solar_rate * 0.9], + [1e6, 1.0, non_solar_rate * 1.1], + ) + + popt, _ = curve_fit( + twilight_model, + valid_twilight_x, + valid_twilight_y, + p0=p0, + bounds=bounds, + maxfev=50000, + ) y_pred = twilight_model(valid_twilight_x, *popt) - r_squared = 1 - np.sum((valid_twilight_y - y_pred)**2) / np.sum((valid_twilight_y - np.mean(valid_twilight_y))**2) + r_squared = 1 - np.sum((valid_twilight_y - y_pred) ** 2) / np.sum( + (valid_twilight_y - np.mean(valid_twilight_y)) ** 2 + ) if r_squared < 0: # Fallback to a constant model (mean of the data) mean_rate = 
np.mean(valid_twilight_y) twilight_models[surface_type] = { - 'params': [mean_rate], - 'r2': 0.0, # R² of a constant model is 0 by definition - 'model': lambda x: np.full_like(x, mean_rate) + "params": [mean_rate], + "r2": 0.0, # R² of a constant model is 0 by definition + "model": lambda x: np.full_like(x, mean_rate), } - print(f"Twilight model ({surface_type}) failed with R² = {r_squared:.3f}. Using constant model: {mean_rate:.2e} ph/s") - - twilight_mask = (df['classification'] == 'Twilight') & (df['surface_type'] == surface_type) - valid_twilight_mask = twilight_mask & df['solar_elevation'].notna() & np.isfinite(df['solar_elevation']) - df.loc[valid_twilight_mask, 'solar_background_rate'] = mean_rate - df.loc[twilight_mask & ~valid_twilight_mask, 'solar_background_rate'] = non_solar_rate + print( + f"Twilight model ({surface_type}) failed with R² = {r_squared:.3f}. Using constant model: {mean_rate:.2e} ph/s" + ) + + twilight_mask = (df["classification"] == "Twilight") & ( + df["surface_type"] == surface_type + ) + valid_twilight_mask = ( + twilight_mask + & df["solar_elevation"].notna() + & np.isfinite(df["solar_elevation"]) + ) + df.loc[valid_twilight_mask, "solar_background_rate"] = mean_rate + df.loc[ + twilight_mask & ~valid_twilight_mask, "solar_background_rate" + ] = non_solar_rate else: - print(f"Twilight model ({surface_type}): rate = {popt[0]:.2e} * exp(-{popt[1]:.2f} * solar_elevation^2) + {popt[2]:.2e}, R² = {r_squared:.3f}") - twilight_models[surface_type] = {'params': popt, 'r2': r_squared, 'model': lambda x: twilight_model(x, *popt)} - - twilight_mask = (df['classification'] == 'Twilight') & (df['surface_type'] == surface_type) - valid_twilight_mask = twilight_mask & df['solar_elevation'].notna() & np.isfinite(df['solar_elevation']) - x_for_model = df.loc[valid_twilight_mask, 'solar_elevation'] - df.loc[valid_twilight_mask, 'solar_background_rate'] = twilight_models[surface_type]['model'](x_for_model).clip(lower=0) - df.loc[twilight_mask & 
~valid_twilight_mask, 'solar_background_rate'] = non_solar_rate + print( + f"Twilight model ({surface_type}): rate = {popt[0]:.2e} * exp(-{popt[1]:.2f} * solar_elevation^2) + {popt[2]:.2e}, R² = {r_squared:.3f}" + ) + twilight_models[surface_type] = { + "params": popt, + "r2": r_squared, + "model": lambda x: twilight_model(x, *popt), + } + + twilight_mask = (df["classification"] == "Twilight") & ( + df["surface_type"] == surface_type + ) + valid_twilight_mask = ( + twilight_mask + & df["solar_elevation"].notna() + & np.isfinite(df["solar_elevation"]) + ) + x_for_model = df.loc[valid_twilight_mask, "solar_elevation"] + df.loc[valid_twilight_mask, "solar_background_rate"] = ( + twilight_models[ + surface_type + ]["model"](x_for_model).clip(lower=0) + ) + df.loc[ + twilight_mask & ~valid_twilight_mask, "solar_background_rate" + ] = non_solar_rate except RuntimeError as e: - print(f"Twilight model fitting failed for {surface_type}: {e}. Setting twilight rates to non-solar rate.") - df.loc[(df['classification'] == 'Twilight') & (df['surface_type'] == surface_type), 'solar_background_rate'] = non_solar_rate + print( + f"Twilight model fitting failed for {surface_type}: {e}. Setting twilight rates to non-solar rate." + ) + df.loc[ + (df["classification"] == "Twilight") + & (df["surface_type"] == surface_type), + "solar_background_rate", + ] = non_solar_rate else: - print("Too few twilight points for modeling. Setting twilight rates to non-solar rate.") - df.loc[df['classification'] == 'Twilight', 'solar_background_rate'] = non_solar_rate + print( + "Too few twilight points for modeling. Setting twilight rates to non-solar rate." 
+ ) + df.loc[df["classification"] == "Twilight", "solar_background_rate"] = ( + non_solar_rate + ) return df, twilight_models -def create_diagnostic_plot(df, daytime_df, twilight_df, nighttime_df, daytime_models, twilight_models, non_solar_rate, output_dir): + +def create_diagnostic_plot( + df, + daytime_df, + twilight_df, + nighttime_df, + daytime_models, + twilight_models, + non_solar_rate, + output_dir, +): """Create diagnostic plot of background rate vs. solar elevation, including nighttime comparison.""" plt.figure(figsize=(10, 6)) - + # Total background rate (all data) - plt.scatter(df['solar_elevation'], df['total_background_rate'], c='gray', alpha=0.5, label='Total') - + plt.scatter( + df["solar_elevation"], + df["total_background_rate"], + c="gray", + alpha=0.5, + label="Total", + ) + # Daytime data - plt.scatter(daytime_df[daytime_df['surface_type'] == 'Land']['solar_elevation'], - daytime_df[daytime_df['surface_type'] == 'Land']['solar_background_rate'], - c='blue', alpha=0.5, label='Daytime Land') - plt.scatter(daytime_df[daytime_df['surface_type'] == 'Ocean']['solar_elevation'], - daytime_df[daytime_df['surface_type'] == 'Ocean']['solar_background_rate'], - c='cyan', alpha=0.5, label='Daytime Ocean') - + plt.scatter( + daytime_df[daytime_df["surface_type"] == "Land"]["solar_elevation"], + daytime_df[daytime_df["surface_type"] == "Land"]["solar_background_rate"], + c="blue", + alpha=0.5, + label="Daytime Land", + ) + plt.scatter( + daytime_df[daytime_df["surface_type"] == "Ocean"]["solar_elevation"], + daytime_df[daytime_df["surface_type"] == "Ocean"]["solar_background_rate"], + c="cyan", + alpha=0.5, + label="Daytime Ocean", + ) + # Twilight data - plt.scatter(twilight_df[twilight_df['surface_type'] == 'Land']['solar_elevation'], - twilight_df[twilight_df['surface_type'] == 'Land']['solar_background_rate'], - c='orange', alpha=0.5, label='Twilight Land') - plt.scatter(twilight_df[twilight_df['surface_type'] == 'Ocean']['solar_elevation'], - 
twilight_df[twilight_df['surface_type'] == 'Ocean']['solar_background_rate'], - c='yellow', alpha=0.5, label='Twilight Ocean') - + plt.scatter( + twilight_df[twilight_df["surface_type"] == "Land"]["solar_elevation"], + twilight_df[twilight_df["surface_type"] == "Land"]["solar_background_rate"], + c="orange", + alpha=0.5, + label="Twilight Land", + ) + plt.scatter( + twilight_df[twilight_df["surface_type"] == "Ocean"]["solar_elevation"], + twilight_df[twilight_df["surface_type"] == "Ocean"]["solar_background_rate"], + c="yellow", + alpha=0.5, + label="Twilight Ocean", + ) + # Nighttime data (using total_background_rate since solar contribution is minimal) - plt.scatter(nighttime_df[nighttime_df['surface_type'] == 'Land']['solar_elevation'], - nighttime_df[nighttime_df['surface_type'] == 'Land']['total_background_rate'], - c='purple', alpha=0.5, label='Nighttime Land') - plt.scatter(nighttime_df[nighttime_df['surface_type'] == 'Ocean']['solar_elevation'], - nighttime_df[nighttime_df['surface_type'] == 'Ocean']['total_background_rate'], - c='lightgreen', alpha=0.5, label='Nighttime Ocean') - + plt.scatter( + nighttime_df[nighttime_df["surface_type"] == "Land"]["solar_elevation"], + nighttime_df[nighttime_df["surface_type"] == "Land"]["total_background_rate"], + c="purple", + alpha=0.5, + label="Nighttime Land", + ) + plt.scatter( + nighttime_df[nighttime_df["surface_type"] == "Ocean"]["solar_elevation"], + nighttime_df[nighttime_df["surface_type"] == "Ocean"]["total_background_rate"], + c="lightgreen", + alpha=0.5, + label="Nighttime Ocean", + ) + # Non-solar background rate line - plt.axhline(non_solar_rate, color='r', linestyle='--', label=f'Non-Solar: {non_solar_rate:.2e} ph/s') + plt.axhline( + non_solar_rate, + color="r", + linestyle="--", + label=f"Non-Solar: {non_solar_rate:.2e} ph/s", + ) # Plot daytime fits for surface_type in daytime_models: - if daytime_models[surface_type]['model'] is not None: - x_fit = np.linspace(min(daytime_df['solar_elevation']), 
max(daytime_df['solar_elevation']), 100) - y_fit = daytime_models[surface_type]['model'](x_fit if 'power' not in daytime_models[surface_type] else x_fit - min(daytime_df['solar_elevation']) + 1) - plt.plot(x_fit, y_fit, label=f'Daytime {surface_type} Fit (R²={daytime_models[surface_type]["r2"]:.3f})', - linestyle='-' if surface_type == 'Land' else '--') + if daytime_models[surface_type]["model"] is not None: + x_fit = np.linspace( + min(daytime_df["solar_elevation"]), + max(daytime_df["solar_elevation"]), + 100, + ) + y_fit = daytime_models[surface_type]["model"]( + x_fit + if "power" not in daytime_models[surface_type] + else x_fit - min(daytime_df["solar_elevation"]) + 1 + ) + plt.plot( + x_fit, + y_fit, + label=f'Daytime {surface_type} Fit (R²={daytime_models[surface_type]["r2"]:.3f})', + linestyle="-" if surface_type == "Land" else "--", + ) # Plot twilight fits for surface_type in twilight_models: - if twilight_models[surface_type]['model'] is not None: - x_twilight = np.linspace(min(twilight_df['solar_elevation']), max(twilight_df['solar_elevation']), 100) - y_twilight = twilight_models[surface_type]['model'](x_twilight) - plt.plot(x_twilight, y_twilight, label=f'Twilight {surface_type} Fit (R²={twilight_models[surface_type]["r2"]:.3f})', - linestyle='-' if surface_type == 'Land' else '--') + if twilight_models[surface_type]["model"] is not None: + x_twilight = np.linspace( + min(twilight_df["solar_elevation"]), + max(twilight_df["solar_elevation"]), + 100, + ) + y_twilight = twilight_models[surface_type]["model"](x_twilight) + plt.plot( + x_twilight, + y_twilight, + label=f'Twilight {surface_type} Fit (R²={twilight_models[surface_type]["r2"]:.3f})', + linestyle="-" if surface_type == "Land" else "--", + ) # Set plot properties - plt.yscale('log') + plt.yscale("log") plt.ylim(1e3, 1e8) - plt.xlabel('Solar Elevation (degrees)') - plt.ylabel('Background Rate (ph/s)') + plt.xlabel("Solar Elevation (degrees)") + plt.ylabel("Background Rate (ph/s)") plt.legend() - 
plt.title('Background Rate vs. Solar Elevation (Land vs. Ocean) with Nighttime Comparison') - + plt.title( + "Background Rate vs. Solar Elevation (Land vs. Ocean) with Nighttime Comparison" + ) + # Save the plot - plt.savefig(os.path.join(output_dir, "background_vs_elevation_with_nighttime_2021.png")) + plt.savefig( + os.path.join(output_dir, "background_vs_elevation_with_nighttime_2021.png") + ) plt.close() + def create_calibration_plot(df, daytime_df, output_dir): """Create a calibration plot using mean photon height as a proxy.""" if not daytime_df.empty: sample_file = daytime_df.iloc[0] - heights = sample_file['mean_photon_height'] + heights = sample_file["mean_photon_height"] if np.isfinite(heights): - solar_rate = sample_file['solar_background_rate'] + solar_rate = sample_file["solar_background_rate"] bin_size = 0.1 # Simulate histogram with 1000 points - hist, edges = np.histogram(np.array([heights] * 1000), bins=np.arange(min(heights) - 1, max(heights) + 1, bin_size)) - photons_per_bin = solar_rate * (1000 / np.sum(hist)) * bin_size / (max(heights) - min(heights) + 2) + hist, edges = np.histogram( + np.array([heights] * 1000), + bins=np.arange(min(heights) - 1, max(heights) + 1, bin_size), + ) + photons_per_bin = ( + solar_rate + * (1000 / np.sum(hist)) + * bin_size + / (max(heights) - min(heights) + 2) + ) calibrated_counts = np.maximum(hist - photons_per_bin, 0) calibrated_heights = [] for i, count in enumerate(calibrated_counts): if count > 0: - calibrated_heights.extend(np.random.uniform(edges[i], edges[i+1], int(count))) + calibrated_heights.extend( + np.random.uniform(edges[i], edges[i + 1], int(count)) + ) plt.figure(figsize=(8, 6)) - plt.hist([heights] * 1000, bins=100, alpha=0.5, label='Original (Simulated)') - plt.hist(calibrated_heights, bins=100, alpha=0.5, label='Calibrated') - plt.xlabel('Photon Height (m)') - plt.ylabel('Count') + plt.hist( + [heights] * 1000, bins=100, alpha=0.5, label="Original (Simulated)" + ) + 
plt.hist(calibrated_heights, bins=100, alpha=0.5, label="Calibrated") + plt.xlabel("Photon Height (m)") + plt.ylabel("Count") plt.legend() plt.title(f'Calibration Example: {sample_file["file"]}') plt.savefig(os.path.join(output_dir, "calibration_example.png")) plt.close() + def save_results(df, output_dir): """Save the results to a CSV file.""" df.to_csv(os.path.join(output_dir, "atl03_analysis_updated.csv"), index=False) -def analyze_icesat2_data(atl03_directory, beam='gt1l', twilight_threshold=6.0, output_dir="output", shoreline_data_path=None): + +def analyze_icesat2_data( + atl03_directory, + beam="gt1l", + twilight_threshold=6.0, + output_dir="output", + shoreline_data_path=None, +): """Main analysis function orchestrating the workflow.""" if not os.path.exists(output_dir): os.makedirs(output_dir) @@ -472,49 +757,64 @@ def analyze_icesat2_data(atl03_directory, beam='gt1l', twilight_threshold=6.0, o df = load_and_process_files(atl03_directory, beam, shoreline_data_path) if df.empty: raise ValueError("No data processed successfully. 
Check logs for errors.") - + df = assign_surface_type(df) - + # Compute total_background_rate based on surface_type - df['total_background_rate'] = np.where( - df['surface_type'] == 'Land', df['total_background_rate_land'], - np.where(df['surface_type'] == 'Ocean', df['total_background_rate_ocean'], np.nan) + df["total_background_rate"] = np.where( + df["surface_type"] == "Land", + df["total_background_rate_land"], + np.where( + df["surface_type"] == "Ocean", df["total_background_rate_ocean"], np.nan + ), + ) + + df, daytime_df, nighttime_df, twilight_df, unknown_df = classify_data( + df, twilight_threshold ) - - df, daytime_df, nighttime_df, twilight_df, unknown_df = classify_data(df, twilight_threshold) - + non_solar_rate = calculate_non_solar_rate(nighttime_df) df = compute_solar_background_rates(df, non_solar_rate) - + # Recreate filtered DataFrames to include the updated columns from compute_solar_background_rates - daytime_df = df[df['classification'] == 'Daytime'].copy() - nighttime_df = df[df['classification'] == 'Nighttime'].copy() - twilight_df = df[df['classification'] == 'Twilight'].copy() - unknown_df = df[df['classification'] == 'Unknown'].copy() - + daytime_df = df[df["classification"] == "Daytime"].copy() + nighttime_df = df[df["classification"] == "Nighttime"].copy() + twilight_df = df[df["classification"] == "Twilight"].copy() + unknown_df = df[df["classification"] == "Unknown"].copy() + df, daytime_models = fit_daytime_model(df, daytime_df, non_solar_rate) df, twilight_models = fit_twilight_model(df, twilight_df, non_solar_rate) save_results(df, output_dir) - + # Updated call to include nighttime_df - create_diagnostic_plot(df, daytime_df, twilight_df, nighttime_df, daytime_models, twilight_models, non_solar_rate, output_dir) + create_diagnostic_plot( + df, + daytime_df, + twilight_df, + nighttime_df, + daytime_models, + twilight_models, + non_solar_rate, + output_dir, + ) # create_calibration_plot(df, daytime_df, output_dir) - + return df, { - 
'daytime_models': daytime_models, - 'twilight_models': twilight_models, - 'non_solar_rate': non_solar_rate - } + "daytime_models": daytime_models, + "twilight_models": twilight_models, + "non_solar_rate": non_solar_rate, + } + if __name__ == "__main__": args = get_args() atl03_directory = os.path.join(args.workspace_path, args.atl03_path) shoreline_data_path = os.path.join(args.workspace_path, args.shoreline_data) output_dir = os.path.join(args.workspace_path, args.output_path) - df, models = analyze_icesat2_data(atl03_directory, args.beam, output_dir=output_dir, shoreline_data_path=shoreline_data_path) + df, models = analyze_icesat2_data( + atl03_directory, + args.beam, + output_dir=output_dir, + shoreline_data_path=shoreline_data_path, + ) print("Analysis complete.") - - - - - \ No newline at end of file diff --git a/aok/core/kd_utils/SolarBckgrd/analyze_solarbckgrd_icesat2.py b/aok/core/kd_utils/SolarBckgrd/analyze_solarbckgrd_icesat2.py index 3701a3f..1267057 100644 --- a/aok/core/kd_utils/SolarBckgrd/analyze_solarbckgrd_icesat2.py +++ b/aok/core/kd_utils/SolarBckgrd/analyze_solarbckgrd_icesat2.py @@ -1,84 +1,95 @@ -import h5py +import logging +from multiprocessing import Pool import os + +from config import get_args +import geopandas as gpd +import h5py +import matplotlib.pyplot as plt +from noise_utils.data_processing import isolate_sea_land_photons import numpy as np import pandas as pd -import geopandas as gpd -from shapely.geometry import Point from scipy.interpolate import interp1d from scipy.optimize import curve_fit -import matplotlib.pyplot as plt -from multiprocessing import Pool -from config import get_args -from noise_utils.interpolation import geoid_correction, refraction_correction, interpolate_labels, apply_interpolation -from noise_utils.data_processing import isolate_sea_land_photons - -import logging # Set up logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) + # Define model functions def power_model(x, a, b, 
c): return a * np.power(x, b) + c + def exp_model(x, a, b, c): """Exponential model: a * exp(b * x) + c""" return a * np.exp(b * x) + c + def twilight_model(x, a, b, c): return a * np.exp(-b * x**2) + c # Gaussian form for twilight + def process_file(args): """Process a single ATL03 file, skipping files missing required datasets.""" file, beam, shoreline_data_path = args try: - with h5py.File(file, 'r') as f: + with h5py.File(file, "r") as f: # Check for required datasets required_keys = [ - f'{beam}/bckgrd_atlas/bckgrd_rate', - f'{beam}/bckgrd_atlas/delta_time', - f'{beam}/heights/delta_time', - f'{beam}/heights/h_ph', - f'{beam}/geolocation/reference_photon_lat', - f'{beam}/geolocation/reference_photon_lon', - f'{beam}/geolocation/solar_azimuth', - f'{beam}/geolocation/solar_elevation' + f"{beam}/bckgrd_atlas/bckgrd_rate", + f"{beam}/bckgrd_atlas/delta_time", + f"{beam}/heights/delta_time", + f"{beam}/heights/h_ph", + f"{beam}/geolocation/reference_photon_lat", + f"{beam}/geolocation/reference_photon_lon", + f"{beam}/geolocation/solar_azimuth", + f"{beam}/geolocation/solar_elevation", ] for key in required_keys: if key not in f: raise KeyError(f"Missing dataset: {key}") - bckgrd_rate = f[f'{beam}/bckgrd_atlas/bckgrd_rate'][:].astype(np.float64) - bckgrd_time = f[f'{beam}/bckgrd_atlas/delta_time'][:] - heights = f[f'{beam}/heights/h_ph'][:] - segment_lat = f[f'{beam}/geolocation/reference_photon_lat'][:] - segment_lon = f[f'{beam}/geolocation/reference_photon_lon'][:] - segment_delta_time = f[f'{beam}/geolocation/delta_time'][:] - - Segment_Is_Land = pd.DataFrame({ - 'latitude': segment_lat, # Use unique column names to avoid conflicts - 'longitude': segment_lon, - 'delta_time': segment_delta_time - }) - + bckgrd_rate = f[f"{beam}/bckgrd_atlas/bckgrd_rate"][:].astype(np.float64) + bckgrd_time = f[f"{beam}/bckgrd_atlas/delta_time"][:] + heights = f[f"{beam}/heights/h_ph"][:] + segment_lat = f[f"{beam}/geolocation/reference_photon_lat"][:] + segment_lon = 
f[f"{beam}/geolocation/reference_photon_lon"][:] + segment_delta_time = f[f"{beam}/geolocation/delta_time"][:] + + Segment_Is_Land = pd.DataFrame( + { + "latitude": segment_lat, # Use unique column names to avoid conflicts + "longitude": segment_lon, + "delta_time": segment_delta_time, + } + ) + # Create a GeoDataFrame with geometry - Segment_Is_Land['geometry'] = gpd.points_from_xy(Segment_Is_Land['longitude'], Segment_Is_Land['latitude']) + Segment_Is_Land["geometry"] = gpd.points_from_xy( + Segment_Is_Land["longitude"], Segment_Is_Land["latitude"] + ) ICESat2_GDF = gpd.GeoDataFrame(Segment_Is_Land, crs="EPSG:4326") # Determine land/ocean classification - Segment_Is_Land_Labels = isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF) - ICESat2_GDF['is_land'] = Segment_Is_Land_Labels + Segment_Is_Land_Labels = isolate_sea_land_photons( + shoreline_data_path, ICESat2_GDF + ) + ICESat2_GDF["is_land"] = Segment_Is_Land_Labels # Interpolate is_land labels to bckgrd_time interp_is_land = interp1d( - ICESat2_GDF['delta_time'], - ICESat2_GDF['is_land'].astype(int), + ICESat2_GDF["delta_time"], + ICESat2_GDF["is_land"].astype(int), bounds_error=False, - fill_value=(ICESat2_GDF['is_land'].astype(int).iloc[0], ICESat2_GDF['is_land'].astype(int).iloc[-1]) + fill_value=( + ICESat2_GDF["is_land"].astype(int).iloc[0], + ICESat2_GDF["is_land"].astype(int).iloc[-1], + ), ) is_land_interpolated = interp_is_land(bckgrd_time).round().astype(int) - + # ##########Debug################################# # # Interpolate latitude and longitude to bckgrd_time # interp_lat = interp1d( @@ -108,7 +119,7 @@ def process_file(args): # geometry=gpd.points_from_xy(is_land_df['longitude'], is_land_df['latitude']), # crs="EPSG:4326" # ) - + # # Save to a GeoPackage file # file_dir = os.path.dirname(file) # is_land_output_dir = os.path.join(file_dir, "is_land_interpolated") @@ -122,187 +133,311 @@ def process_file(args): # Compute total background rates for land and ocean land_mask = 
is_land_interpolated == 1 ocean_mask = is_land_interpolated == 0 - total_background_rate_land = np.mean(bckgrd_rate[land_mask]) if np.sum(land_mask) > 0 else np.nan - total_background_rate_ocean = np.mean(bckgrd_rate[ocean_mask]) if np.sum(ocean_mask) > 0 else np.nan + total_background_rate_land = ( + np.mean(bckgrd_rate[land_mask]) if np.sum(land_mask) > 0 else np.nan + ) + total_background_rate_ocean = ( + np.mean(bckgrd_rate[ocean_mask]) if np.sum(ocean_mask) > 0 else np.nan + ) # Solar elevation handling FILL_VALUE = 3.4028235e38 - solar_elevations = f[f'{beam}/geolocation/solar_elevation'][:].astype(np.float64) - valid_solar = solar_elevations[(solar_elevations != FILL_VALUE) & (np.abs(solar_elevations) < 1e30) & np.isfinite(solar_elevations)] + solar_elevations = f[f"{beam}/geolocation/solar_elevation"][:].astype( + np.float64 + ) + valid_solar = solar_elevations[ + (solar_elevations != FILL_VALUE) + & (np.abs(solar_elevations) < 1e30) + & np.isfinite(solar_elevations) + ] solar_elevation = np.nan if valid_solar.size == 0 else np.mean(valid_solar) mean_photon_height = np.mean(heights) if heights.size > 0 else np.nan result = { - 'file': os.path.basename(file), - 'total_background_rate_land': total_background_rate_land, - 'total_background_rate_ocean': total_background_rate_ocean, - 'solar_elevation': solar_elevation, - 'mean_photon_height': mean_photon_height + "file": os.path.basename(file), + "total_background_rate_land": total_background_rate_land, + "total_background_rate_ocean": total_background_rate_ocean, + "solar_elevation": solar_elevation, + "mean_photon_height": mean_photon_height, } logger.info(f"Processed {file}: {result}") return result except Exception as e: - logger.error(f"Error processing {file}: {str(e)}") - return {'file': os.path.basename(file), 'error': str(e)} - -def fit_surface_model(df, classification, surface_type, model_func, p0, bounds, x_col='solar_elevation', y_col='solar_background_rate'): + logger.error(f"Error processing 
{file}: {e!s}") + return {"file": os.path.basename(file), "error": str(e)} + + +def fit_surface_model( + df, + classification, + surface_type, + model_func, + p0, + bounds, + x_col="solar_elevation", + y_col="solar_background_rate", +): """Fit a model to a specific classification and surface type.""" - subset = df[(df['classification'] == classification) & (df['surface_type'] == surface_type)].copy() + subset = df[ + (df["classification"] == classification) & (df["surface_type"] == surface_type) + ].copy() if len(subset) < 3: print(f"Warning: Insufficient data for {classification} {surface_type} model.") return None, None, None - mask = subset[x_col].notna() & subset[y_col].notna() & np.isfinite(subset[x_col]) & np.isfinite(subset[y_col]) + mask = ( + subset[x_col].notna() + & subset[y_col].notna() + & np.isfinite(subset[x_col]) + & np.isfinite(subset[y_col]) + ) valid_x = subset.loc[mask, x_col] valid_y = subset.loc[mask, y_col] if len(valid_x) < 3: - print(f"Warning: Fewer than 3 valid points for {classification} {surface_type} model.") + print( + f"Warning: Fewer than 3 valid points for {classification} {surface_type} model." 
+ ) return None, None, None try: - popt, _ = curve_fit(model_func, valid_x, valid_y, p0=p0, bounds=bounds, maxfev=20000) + popt, _ = curve_fit( + model_func, valid_x, valid_y, p0=p0, bounds=bounds, maxfev=20000 + ) y_pred = model_func(valid_x, *popt) - r2 = 1 - np.sum((valid_y - y_pred)**2) / np.sum((valid_y - np.mean(valid_y))**2) + r2 = 1 - np.sum((valid_y - y_pred) ** 2) / np.sum( + (valid_y - np.mean(valid_y)) ** 2 + ) return popt, r2, lambda x: model_func(x, *popt) except RuntimeError as e: print(f"Fit failed for {classification} {surface_type}: {e}") return None, None, None + def load_and_process_files(atl03_directory, beam, shoreline_data_path): """Load and process ATL03 files in parallel.""" - atl03_files = [os.path.join(atl03_directory, f) for f in os.listdir(atl03_directory) if f.endswith('.h5') and 'ATL03' in f] + atl03_files = [ + os.path.join(atl03_directory, f) + for f in os.listdir(atl03_directory) + if f.endswith(".h5") and "ATL03" in f + ] print(f"Found {len(atl03_files)} ATL03 files.") - + if not atl03_files: raise ValueError("No ATL03 files found in the directory.") with Pool() as pool: - results = pool.map(process_file, [(f, beam, shoreline_data_path) for f in atl03_files]) - + results = pool.map( + process_file, [(f, beam, shoreline_data_path) for f in atl03_files] + ) + print("Results from process_file:", results) # Debug output df = pd.DataFrame(results) - - - if 'error' in df.columns: - error_files = df[df['error'].notna()]['file'].tolist() + + if "error" in df.columns: + error_files = df[df["error"].notna()]["file"].tolist() print(f"Errors in {len(error_files)} files: {error_files}") - df = df[df['error'].isna()].drop(columns=['error']) - + df = df[df["error"].isna()].drop(columns=["error"]) + if df.empty: - raise ValueError("No files were successfully processed. Check the error messages above.") - + raise ValueError( + "No files were successfully processed. Check the error messages above." 
+ ) + print("DataFrame columns after processing:", df.columns) return df + def assign_surface_type(df): """Assign surface type based on background rates.""" - required_columns = ['total_background_rate_land', 'total_background_rate_ocean'] + required_columns = ["total_background_rate_land", "total_background_rate_ocean"] missing_columns = [col for col in required_columns if col not in df.columns] if missing_columns: raise KeyError(f"Missing required columns in DataFrame: {missing_columns}") - df['surface_type'] = np.where( - df['total_background_rate_land'].notna() & (df['total_background_rate_land'] > 0), - 'Land', - np.where(df['total_background_rate_ocean'].notna() & (df['total_background_rate_ocean'] > 0), 'Ocean', 'Mixed') + df["surface_type"] = np.where( + df["total_background_rate_land"].notna() + & (df["total_background_rate_land"] > 0), + "Land", + np.where( + df["total_background_rate_ocean"].notna() + & (df["total_background_rate_ocean"] > 0), + "Ocean", + "Mixed", + ), ) return df + def classify_data(df, twilight_threshold=6.0): """Classify data into Daytime, Nighttime, Twilight, or Unknown.""" - df['classification'] = np.where( - df['solar_elevation'].isna(), 'Unknown', - np.where(df['solar_elevation'] > twilight_threshold, 'Daytime', - np.where(df['solar_elevation'] < -twilight_threshold, 'Nighttime', 'Twilight')) + df["classification"] = np.where( + df["solar_elevation"].isna(), + "Unknown", + np.where( + df["solar_elevation"] > twilight_threshold, + "Daytime", + np.where( + df["solar_elevation"] < -twilight_threshold, "Nighttime", "Twilight" + ), + ), + ) + daytime_df = df[df["classification"] == "Daytime"].copy() + nighttime_df = df[df["classification"] == "Nighttime"].copy() + twilight_df = df[df["classification"] == "Twilight"].copy() + unknown_df = df[df["classification"] == "Unknown"].copy() + print( + f"Daytime: {len(daytime_df)}, Nighttime: {len(nighttime_df)}, Twilight: {len(twilight_df)}, Unknown: {len(unknown_df)}" ) - daytime_df = 
df[df['classification'] == 'Daytime'].copy() - nighttime_df = df[df['classification'] == 'Nighttime'].copy() - twilight_df = df[df['classification'] == 'Twilight'].copy() - unknown_df = df[df['classification'] == 'Unknown'].copy() - print(f"Daytime: {len(daytime_df)}, Nighttime: {len(nighttime_df)}, Twilight: {len(twilight_df)}, Unknown: {len(unknown_df)}") return df, daytime_df, nighttime_df, twilight_df, unknown_df + def calculate_non_solar_rate(nighttime_df): """Calculate the non-solar background rate from nighttime data.""" - non_solar_rate_land = np.nanmean(nighttime_df['total_background_rate_land']) - non_solar_rate_ocean = np.nanmean(nighttime_df['total_background_rate_ocean']) + non_solar_rate_land = np.nanmean(nighttime_df["total_background_rate_land"]) + non_solar_rate_ocean = np.nanmean(nighttime_df["total_background_rate_ocean"]) non_solar_rate = np.nanmean([non_solar_rate_land, non_solar_rate_ocean]) print(f"Non-solar background rate (land): {non_solar_rate_land:.2e} ph/s") print(f"Non-solar background rate (ocean): {non_solar_rate_ocean:.2e} ph/s") print(f"Non-solar background rate (combined): {non_solar_rate:.2e} ph/s") return non_solar_rate + def compute_solar_background_rates(df, non_solar_rate): """Compute solar background rates for land and ocean.""" - df['solar_background_rate_land'] = (df['total_background_rate_land'] - non_solar_rate).clip(lower=0) - df['solar_background_rate_ocean'] = (df['total_background_rate_ocean'] - non_solar_rate).clip(lower=0) - df['solar_background_rate'] = np.where( - df['surface_type'] == 'Land', df['solar_background_rate_land'], - np.where(df['surface_type'] == 'Ocean', df['solar_background_rate_ocean'], np.nan) + df["solar_background_rate_land"] = ( + df["total_background_rate_land"] - non_solar_rate + ).clip(lower=0) + df["solar_background_rate_ocean"] = ( + df["total_background_rate_ocean"] - non_solar_rate + ).clip(lower=0) + df["solar_background_rate"] = np.where( + df["surface_type"] == "Land", + 
df["solar_background_rate_land"], + np.where( + df["surface_type"] == "Ocean", df["solar_background_rate_ocean"], np.nan + ), ) return df + def fit_daytime_model(df, daytime_df, non_solar_rate): """Fit daytime models for land and ocean.""" daytime_models = {} - for surface_type in ['Land', 'Ocean']: - subset = daytime_df[daytime_df['surface_type'] == surface_type].copy() + for surface_type in ["Land", "Ocean"]: + subset = daytime_df[daytime_df["surface_type"] == surface_type].copy() if len(subset) < 3: print(f"Warning: Insufficient daytime {surface_type} data for modeling.") continue - mask = subset['solar_elevation'].notna() & subset['solar_background_rate'].notna() & \ - np.isfinite(subset['solar_elevation']) & np.isfinite(subset['solar_background_rate']) - valid_x = subset.loc[mask, 'solar_elevation'] - valid_y = subset.loc[mask, 'solar_background_rate'] + mask = ( + subset["solar_elevation"].notna() + & subset["solar_background_rate"].notna() + & np.isfinite(subset["solar_elevation"]) + & np.isfinite(subset["solar_background_rate"]) + ) + valid_x = subset.loc[mask, "solar_elevation"] + valid_y = subset.loc[mask, "solar_background_rate"] if len(valid_x) < 3: - print(f"Warning: Fewer than 3 valid points for daytime {surface_type} model.") + print( + f"Warning: Fewer than 3 valid points for daytime {surface_type} model." 
+ ) continue # Remove outliers log_y = np.log10(valid_y + 1) mean_log_y = np.mean(log_y) std_log_y = np.std(log_y) - outlier_mask = (log_y > mean_log_y - 3 * std_log_y) & (log_y < mean_log_y + 3 * std_log_y) + outlier_mask = (log_y > mean_log_y - 3 * std_log_y) & ( + log_y < mean_log_y + 3 * std_log_y + ) valid_x = valid_x[outlier_mask] valid_y = valid_y[outlier_mask] print(f"Daytime regression ({surface_type}): {len(valid_x)} valid data points") try: p0_exp = [np.max(valid_y) - non_solar_rate, 0.05, non_solar_rate] - bounds_exp = ([0, 0.01, non_solar_rate * 0.9], [1e8, 0.5, non_solar_rate * 1.1]) - popt_exp, _ = curve_fit(exp_model, valid_x, valid_y, p0=p0_exp, bounds=bounds_exp, maxfev=20000) - r2_exp = 1 - np.sum((valid_y - exp_model(valid_x, *popt_exp))**2) / np.sum((valid_y - np.mean(valid_y))**2) + bounds_exp = ( + [0, 0.01, non_solar_rate * 0.9], + [1e8, 0.5, non_solar_rate * 1.1], + ) + popt_exp, _ = curve_fit( + exp_model, valid_x, valid_y, p0=p0_exp, bounds=bounds_exp, maxfev=20000 + ) + r2_exp = 1 - np.sum( + (valid_y - exp_model(valid_x, *popt_exp)) ** 2 + ) / np.sum((valid_y - np.mean(valid_y)) ** 2) x_shifted = valid_x - min(valid_x) + 1 p0_power = [np.max(valid_y) - non_solar_rate, 0.6, non_solar_rate] - bounds_power = ([0, 0.1, non_solar_rate * 0.9], [1e8, 1.0, non_solar_rate * 1.1]) - popt_power, _ = curve_fit(power_model, x_shifted, valid_y, p0=p0_power, bounds=bounds_power, maxfev=20000) - r2_power = 1 - np.sum((valid_y - power_model(x_shifted, *popt_power))**2) / np.sum((valid_y - np.mean(valid_y))**2) + bounds_power = ( + [0, 0.1, non_solar_rate * 0.9], + [1e8, 1.0, non_solar_rate * 1.1], + ) + popt_power, _ = curve_fit( + power_model, + x_shifted, + valid_y, + p0=p0_power, + bounds=bounds_power, + maxfev=20000, + ) + r2_power = 1 - np.sum( + (valid_y - power_model(x_shifted, *popt_power)) ** 2 + ) / np.sum((valid_y - np.mean(valid_y)) ** 2) if r2_power > r2_exp: - print(f"Daytime model ({surface_type}, power law): rate = {popt_power[0]:.2e} * 
solar_elevation^{popt_power[1]:.2f} + {popt_power[2]:.2e}, R² = {r2_power:.3f}") - daytime_models[surface_type] = {'params': popt_power, 'r2': r2_power, 'model': lambda x: power_model(x - min(valid_x) + 1, *popt_power)} + print( + f"Daytime model ({surface_type}, power law): rate = {popt_power[0]:.2e} * solar_elevation^{popt_power[1]:.2f} + {popt_power[2]:.2e}, R² = {r2_power:.3f}" + ) + daytime_models[surface_type] = { + "params": popt_power, + "r2": r2_power, + "model": lambda x: power_model(x - min(valid_x) + 1, *popt_power), + } else: - print(f"Daytime model ({surface_type}, exponential): rate = {popt_exp[0]:.2e} * exp({popt_exp[1]:.2f} * solar_elevation) + {popt_exp[2]:.2e}, R² = {r2_exp:.3f}") - daytime_models[surface_type] = {'params': popt_exp, 'r2': r2_exp, 'model': lambda x: exp_model(x, *popt_exp)} + print( + f"Daytime model ({surface_type}, exponential): rate = {popt_exp[0]:.2e} * exp({popt_exp[1]:.2f} * solar_elevation) + {popt_exp[2]:.2e}, R² = {r2_exp:.3f}" + ) + daytime_models[surface_type] = { + "params": popt_exp, + "r2": r2_exp, + "model": lambda x: exp_model(x, *popt_exp), + } # Apply model - daytime_mask = (df['classification'] == 'Daytime') & (df['surface_type'] == surface_type) - valid_daytime_mask = daytime_mask & df['solar_elevation'].notna() & np.isfinite(df['solar_elevation']) - x_for_model = df.loc[valid_daytime_mask, 'solar_elevation'] + daytime_mask = (df["classification"] == "Daytime") & ( + df["surface_type"] == surface_type + ) + valid_daytime_mask = ( + daytime_mask + & df["solar_elevation"].notna() + & np.isfinite(df["solar_elevation"]) + ) + x_for_model = df.loc[valid_daytime_mask, "solar_elevation"] if r2_power > r2_exp: x_for_model_shifted = x_for_model - min(valid_x) + 1 - df.loc[valid_daytime_mask, 'solar_background_rate'] = daytime_models[surface_type]['model'](x_for_model_shifted).clip(lower=0) + df.loc[valid_daytime_mask, "solar_background_rate"] = daytime_models[ + surface_type + 
]["model"](x_for_model_shifted).clip(lower=0) else: - df.loc[valid_daytime_mask, 'solar_background_rate'] = daytime_models[surface_type]['model'](x_for_model).clip(lower=0) - df.loc[daytime_mask & ~valid_daytime_mask, 'solar_background_rate'] = non_solar_rate + df.loc[valid_daytime_mask, "solar_background_rate"] = daytime_models[ + surface_type + ]["model"](x_for_model).clip(lower=0) + df.loc[daytime_mask & ~valid_daytime_mask, "solar_background_rate"] = ( + non_solar_rate + ) except RuntimeError as e: - print(f"Daytime model fitting failed for {surface_type}: {e}. Using non-solar rate as fallback.") - df.loc[(df['classification'] == 'Daytime') & (df['surface_type'] == surface_type), 'solar_background_rate'] = non_solar_rate + print( + f"Daytime model fitting failed for {surface_type}: {e}. Using non-solar rate as fallback." + ) + df.loc[ + (df["classification"] == "Daytime") + & (df["surface_type"] == surface_type), + "solar_background_rate", + ] = non_solar_rate return df, daytime_models @@ -310,142 +445,279 @@ def fit_twilight_model(df, twilight_df, non_solar_rate): """Fit twilight models for land and ocean.""" twilight_models = {} if len(twilight_df) > 3: - df.loc[df['classification'] == 'Twilight', 'solar_background_rate'] = ( - df['solar_background_rate'].fillna(0).clip(lower=0) + df.loc[df["classification"] == "Twilight", "solar_background_rate"] = ( + df["solar_background_rate"].fillna(0).clip(lower=0) ) - for surface_type in ['Land', 'Ocean']: - subset = twilight_df[twilight_df['surface_type'] == surface_type].copy() + for surface_type in ["Land", "Ocean"]: + subset = twilight_df[twilight_df["surface_type"] == surface_type].copy() if len(subset) < 3: - print(f"Warning: Insufficient twilight {surface_type} data for modeling.") + print( + f"Warning: Insufficient twilight {surface_type} data for modeling." 
+ ) continue - mask = subset['solar_elevation'].notna() & subset['solar_background_rate'].notna() & \ - np.isfinite(subset['solar_elevation']) & np.isfinite(subset['solar_background_rate']) - valid_twilight_x = subset.loc[mask, 'solar_elevation'] - valid_twilight_y = subset.loc[mask, 'solar_background_rate'] + mask = ( + subset["solar_elevation"].notna() + & subset["solar_background_rate"].notna() + & np.isfinite(subset["solar_elevation"]) + & np.isfinite(subset["solar_background_rate"]) + ) + valid_twilight_x = subset.loc[mask, "solar_elevation"] + valid_twilight_y = subset.loc[mask, "solar_background_rate"] if len(valid_twilight_x) < 3: - print(f"Warning: Fewer than 3 valid points for twilight {surface_type} model.") + print( + f"Warning: Fewer than 3 valid points for twilight {surface_type} model." + ) continue - print(f"Twilight regression ({surface_type}): {len(valid_twilight_x)} valid data points") + print( + f"Twilight regression ({surface_type}): {len(valid_twilight_x)} valid data points" + ) try: initial_a = np.max(valid_twilight_y) - non_solar_rate initial_b = 0.01 # Smaller b for a flatter Gaussian initial_c = non_solar_rate p0 = [initial_a, initial_b, initial_c] - bounds = ([0, 0.001, non_solar_rate * 0.9], [1e6, 1.0, non_solar_rate * 1.1]) - - popt, _ = curve_fit(twilight_model, valid_twilight_x, valid_twilight_y, p0=p0, bounds=bounds, maxfev=50000) + bounds = ( + [0, 0.001, non_solar_rate * 0.9], + [1e6, 1.0, non_solar_rate * 1.1], + ) + + popt, _ = curve_fit( + twilight_model, + valid_twilight_x, + valid_twilight_y, + p0=p0, + bounds=bounds, + maxfev=50000, + ) y_pred = twilight_model(valid_twilight_x, *popt) - r_squared = 1 - np.sum((valid_twilight_y - y_pred)**2) / np.sum((valid_twilight_y - np.mean(valid_twilight_y))**2) + r_squared = 1 - np.sum((valid_twilight_y - y_pred) ** 2) / np.sum( + (valid_twilight_y - np.mean(valid_twilight_y)) ** 2 + ) if r_squared < 0: # Fallback to a constant model (mean of the data) mean_rate = 
np.mean(valid_twilight_y) twilight_models[surface_type] = { - 'params': [mean_rate], - 'r2': 0.0, # R² of a constant model is 0 by definition - 'model': lambda x: np.full_like(x, mean_rate) + "params": [mean_rate], + "r2": 0.0, # R² of a constant model is 0 by definition + "model": lambda x: np.full_like(x, mean_rate), } - print(f"Twilight model ({surface_type}) failed with R² = {r_squared:.3f}. Using constant model: {mean_rate:.2e} ph/s") - - twilight_mask = (df['classification'] == 'Twilight') & (df['surface_type'] == surface_type) - valid_twilight_mask = twilight_mask & df['solar_elevation'].notna() & np.isfinite(df['solar_elevation']) - df.loc[valid_twilight_mask, 'solar_background_rate'] = mean_rate - df.loc[twilight_mask & ~valid_twilight_mask, 'solar_background_rate'] = non_solar_rate + print( + f"Twilight model ({surface_type}) failed with R² = {r_squared:.3f}. Using constant model: {mean_rate:.2e} ph/s" + ) + + twilight_mask = (df["classification"] == "Twilight") & ( + df["surface_type"] == surface_type + ) + valid_twilight_mask = ( + twilight_mask + & df["solar_elevation"].notna() + & np.isfinite(df["solar_elevation"]) + ) + df.loc[valid_twilight_mask, "solar_background_rate"] = mean_rate + df.loc[ + twilight_mask & ~valid_twilight_mask, "solar_background_rate" + ] = non_solar_rate else: - print(f"Twilight model ({surface_type}): rate = {popt[0]:.2e} * exp(-{popt[1]:.2f} * solar_elevation^2) + {popt[2]:.2e}, R² = {r_squared:.3f}") - twilight_models[surface_type] = {'params': popt, 'r2': r_squared, 'model': lambda x: twilight_model(x, *popt)} - - twilight_mask = (df['classification'] == 'Twilight') & (df['surface_type'] == surface_type) - valid_twilight_mask = twilight_mask & df['solar_elevation'].notna() & np.isfinite(df['solar_elevation']) - x_for_model = df.loc[valid_twilight_mask, 'solar_elevation'] - df.loc[valid_twilight_mask, 'solar_background_rate'] = twilight_models[surface_type]['model'](x_for_model).clip(lower=0) - df.loc[twilight_mask & 
~valid_twilight_mask, 'solar_background_rate'] = non_solar_rate + print( + f"Twilight model ({surface_type}): rate = {popt[0]:.2e} * exp(-{popt[1]:.2f} * solar_elevation^2) + {popt[2]:.2e}, R² = {r_squared:.3f}" + ) + twilight_models[surface_type] = { + "params": popt, + "r2": r_squared, + "model": lambda x: twilight_model(x, *popt), + } + + twilight_mask = (df["classification"] == "Twilight") & ( + df["surface_type"] == surface_type + ) + valid_twilight_mask = ( + twilight_mask + & df["solar_elevation"].notna() + & np.isfinite(df["solar_elevation"]) + ) + x_for_model = df.loc[valid_twilight_mask, "solar_elevation"] + df.loc[valid_twilight_mask, "solar_background_rate"] = ( + twilight_models[ + surface_type + ]["model"](x_for_model).clip(lower=0) + ) + df.loc[ + twilight_mask & ~valid_twilight_mask, "solar_background_rate" + ] = non_solar_rate except RuntimeError as e: - print(f"Twilight model fitting failed for {surface_type}: {e}. Setting twilight rates to non-solar rate.") - df.loc[(df['classification'] == 'Twilight') & (df['surface_type'] == surface_type), 'solar_background_rate'] = non_solar_rate + print( + f"Twilight model fitting failed for {surface_type}: {e}. Setting twilight rates to non-solar rate." + ) + df.loc[ + (df["classification"] == "Twilight") + & (df["surface_type"] == surface_type), + "solar_background_rate", + ] = non_solar_rate else: - print("Too few twilight points for modeling. Setting twilight rates to non-solar rate.") - df.loc[df['classification'] == 'Twilight', 'solar_background_rate'] = non_solar_rate + print( + "Too few twilight points for modeling. Setting twilight rates to non-solar rate." 
+ ) + df.loc[df["classification"] == "Twilight", "solar_background_rate"] = ( + non_solar_rate + ) return df, twilight_models -def create_diagnostic_plot(df, daytime_df, twilight_df, daytime_models, twilight_models, non_solar_rate, output_dir): + +def create_diagnostic_plot( + df, + daytime_df, + twilight_df, + daytime_models, + twilight_models, + non_solar_rate, + output_dir, +): """Create diagnostic plot of background rate vs. solar elevation.""" plt.figure(figsize=(10, 6)) - plt.scatter(df['solar_elevation'], df['total_background_rate'], c='gray', alpha=0.5, label='Total') - plt.scatter(daytime_df[daytime_df['surface_type'] == 'Land']['solar_elevation'], - daytime_df[daytime_df['surface_type'] == 'Land']['solar_background_rate'], - c='blue', alpha=0.5, label='Daytime Land') - plt.scatter(daytime_df[daytime_df['surface_type'] == 'Ocean']['solar_elevation'], - daytime_df[daytime_df['surface_type'] == 'Ocean']['solar_background_rate'], - c='cyan', alpha=0.5, label='Daytime Ocean') - plt.scatter(twilight_df[twilight_df['surface_type'] == 'Land']['solar_elevation'], - twilight_df[twilight_df['surface_type'] == 'Land']['solar_background_rate'], - c='orange', alpha=0.5, label='Twilight Land') - plt.scatter(twilight_df[twilight_df['surface_type'] == 'Ocean']['solar_elevation'], - twilight_df[twilight_df['surface_type'] == 'Ocean']['solar_background_rate'], - c='yellow', alpha=0.5, label='Twilight Ocean') - plt.axhline(non_solar_rate, color='r', linestyle='--', label=f'Non-Solar: {non_solar_rate:.2e} ph/s') + plt.scatter( + df["solar_elevation"], + df["total_background_rate"], + c="gray", + alpha=0.5, + label="Total", + ) + plt.scatter( + daytime_df[daytime_df["surface_type"] == "Land"]["solar_elevation"], + daytime_df[daytime_df["surface_type"] == "Land"]["solar_background_rate"], + c="blue", + alpha=0.5, + label="Daytime Land", + ) + plt.scatter( + daytime_df[daytime_df["surface_type"] == "Ocean"]["solar_elevation"], + daytime_df[daytime_df["surface_type"] == 
"Ocean"]["solar_background_rate"], + c="cyan", + alpha=0.5, + label="Daytime Ocean", + ) + plt.scatter( + twilight_df[twilight_df["surface_type"] == "Land"]["solar_elevation"], + twilight_df[twilight_df["surface_type"] == "Land"]["solar_background_rate"], + c="orange", + alpha=0.5, + label="Twilight Land", + ) + plt.scatter( + twilight_df[twilight_df["surface_type"] == "Ocean"]["solar_elevation"], + twilight_df[twilight_df["surface_type"] == "Ocean"]["solar_background_rate"], + c="yellow", + alpha=0.5, + label="Twilight Ocean", + ) + plt.axhline( + non_solar_rate, + color="r", + linestyle="--", + label=f"Non-Solar: {non_solar_rate:.2e} ph/s", + ) # Plot daytime fits for surface_type in daytime_models: - if daytime_models[surface_type]['model'] is not None: - x_fit = np.linspace(min(daytime_df['solar_elevation']), max(daytime_df['solar_elevation']), 100) - y_fit = daytime_models[surface_type]['model'](x_fit if 'power' not in daytime_models[surface_type] else x_fit - min(daytime_df['solar_elevation']) + 1) - plt.plot(x_fit, y_fit, label=f'Daytime {surface_type} Fit (R²={daytime_models[surface_type]["r2"]:.3f})', - linestyle='-' if surface_type == 'Land' else '--') + if daytime_models[surface_type]["model"] is not None: + x_fit = np.linspace( + min(daytime_df["solar_elevation"]), + max(daytime_df["solar_elevation"]), + 100, + ) + y_fit = daytime_models[surface_type]["model"]( + x_fit + if "power" not in daytime_models[surface_type] + else x_fit - min(daytime_df["solar_elevation"]) + 1 + ) + plt.plot( + x_fit, + y_fit, + label=f'Daytime {surface_type} Fit (R²={daytime_models[surface_type]["r2"]:.3f})', + linestyle="-" if surface_type == "Land" else "--", + ) # Plot twilight fits for surface_type in twilight_models: - if twilight_models[surface_type]['model'] is not None: - x_twilight = np.linspace(min(twilight_df['solar_elevation']), max(twilight_df['solar_elevation']), 100) - y_twilight = twilight_models[surface_type]['model'](x_twilight) - plt.plot(x_twilight, 
y_twilight, label=f'Twilight {surface_type} Fit (R²={twilight_models[surface_type]["r2"]:.3f})', - linestyle='-' if surface_type == 'Land' else '--') + if twilight_models[surface_type]["model"] is not None: + x_twilight = np.linspace( + min(twilight_df["solar_elevation"]), + max(twilight_df["solar_elevation"]), + 100, + ) + y_twilight = twilight_models[surface_type]["model"](x_twilight) + plt.plot( + x_twilight, + y_twilight, + label=f'Twilight {surface_type} Fit (R²={twilight_models[surface_type]["r2"]:.3f})', + linestyle="-" if surface_type == "Land" else "--", + ) - plt.yscale('log') + plt.yscale("log") plt.ylim(1e3, 1e8) - plt.xlabel('Solar Elevation (degrees)') - plt.ylabel('Background Rate (ph/s)') + plt.xlabel("Solar Elevation (degrees)") + plt.ylabel("Background Rate (ph/s)") plt.legend() - plt.title('Background Rate vs. Solar Elevation (Land vs. Ocean)') + plt.title("Background Rate vs. Solar Elevation (Land vs. Ocean)") plt.savefig(os.path.join(output_dir, "background_vs_elevation_exp_fit_2021.png")) plt.close() + def create_calibration_plot(df, daytime_df, output_dir): """Create a calibration plot using mean photon height as a proxy.""" if not daytime_df.empty: sample_file = daytime_df.iloc[0] - heights = sample_file['mean_photon_height'] + heights = sample_file["mean_photon_height"] if np.isfinite(heights): - solar_rate = sample_file['solar_background_rate'] + solar_rate = sample_file["solar_background_rate"] bin_size = 0.1 # Simulate histogram with 1000 points - hist, edges = np.histogram(np.array([heights] * 1000), bins=np.arange(min(heights) - 1, max(heights) + 1, bin_size)) - photons_per_bin = solar_rate * (1000 / np.sum(hist)) * bin_size / (max(heights) - min(heights) + 2) + hist, edges = np.histogram( + np.array([heights] * 1000), + bins=np.arange(min(heights) - 1, max(heights) + 1, bin_size), + ) + photons_per_bin = ( + solar_rate + * (1000 / np.sum(hist)) + * bin_size + / (max(heights) - min(heights) + 2) + ) calibrated_counts = np.maximum(hist 
- photons_per_bin, 0) calibrated_heights = [] for i, count in enumerate(calibrated_counts): if count > 0: - calibrated_heights.extend(np.random.uniform(edges[i], edges[i+1], int(count))) + calibrated_heights.extend( + np.random.uniform(edges[i], edges[i + 1], int(count)) + ) plt.figure(figsize=(8, 6)) - plt.hist([heights] * 1000, bins=100, alpha=0.5, label='Original (Simulated)') - plt.hist(calibrated_heights, bins=100, alpha=0.5, label='Calibrated') - plt.xlabel('Photon Height (m)') - plt.ylabel('Count') + plt.hist( + [heights] * 1000, bins=100, alpha=0.5, label="Original (Simulated)" + ) + plt.hist(calibrated_heights, bins=100, alpha=0.5, label="Calibrated") + plt.xlabel("Photon Height (m)") + plt.ylabel("Count") plt.legend() plt.title(f'Calibration Example: {sample_file["file"]}') plt.savefig(os.path.join(output_dir, "calibration_example.png")) plt.close() + def save_results(df, output_dir): """Save the results to a CSV file.""" df.to_csv(os.path.join(output_dir, "atl03_analysis_updated.csv"), index=False) -def analyze_icesat2_data(atl03_directory, beam='gt1l', twilight_threshold=6.0, output_dir="output", shoreline_data_path=None): + +def analyze_icesat2_data( + atl03_directory, + beam="gt1l", + twilight_threshold=6.0, + output_dir="output", + shoreline_data_path=None, +): """Main analysis function orchestrating the workflow.""" if not os.path.exists(output_dir): os.makedirs(output_dir) @@ -453,44 +725,61 @@ def analyze_icesat2_data(atl03_directory, beam='gt1l', twilight_threshold=6.0, o df = load_and_process_files(atl03_directory, beam, shoreline_data_path) if df.empty: raise ValueError("No data processed successfully. 
Check logs for errors.") - + df = assign_surface_type(df) - + # Compute total_background_rate based on surface_type - df['total_background_rate'] = np.where( - df['surface_type'] == 'Land', df['total_background_rate_land'], - np.where(df['surface_type'] == 'Ocean', df['total_background_rate_ocean'], np.nan) + df["total_background_rate"] = np.where( + df["surface_type"] == "Land", + df["total_background_rate_land"], + np.where( + df["surface_type"] == "Ocean", df["total_background_rate_ocean"], np.nan + ), + ) + + df, daytime_df, nighttime_df, twilight_df, unknown_df = classify_data( + df, twilight_threshold ) - - df, daytime_df, nighttime_df, twilight_df, unknown_df = classify_data(df, twilight_threshold) - + non_solar_rate = calculate_non_solar_rate(nighttime_df) df = compute_solar_background_rates(df, non_solar_rate) - + # Recreate filtered DataFrames to include the updated columns from compute_solar_background_rates - daytime_df = df[df['classification'] == 'Daytime'].copy() - nighttime_df = df[df['classification'] == 'Nighttime'].copy() - twilight_df = df[df['classification'] == 'Twilight'].copy() - unknown_df = df[df['classification'] == 'Unknown'].copy() - + daytime_df = df[df["classification"] == "Daytime"].copy() + nighttime_df = df[df["classification"] == "Nighttime"].copy() + twilight_df = df[df["classification"] == "Twilight"].copy() + unknown_df = df[df["classification"] == "Unknown"].copy() + df, daytime_models = fit_daytime_model(df, daytime_df, non_solar_rate) df, twilight_models = fit_twilight_model(df, twilight_df, non_solar_rate) save_results(df, output_dir) - create_diagnostic_plot(df, daytime_df, twilight_df, daytime_models, twilight_models, non_solar_rate, output_dir) + create_diagnostic_plot( + df, + daytime_df, + twilight_df, + daytime_models, + twilight_models, + non_solar_rate, + output_dir, + ) create_calibration_plot(df, daytime_df, output_dir) - return df, { - 'daytime_models': daytime_models, - 'twilight_models': twilight_models, - 
'non_solar_rate': non_solar_rate + "daytime_models": daytime_models, + "twilight_models": twilight_models, + "non_solar_rate": non_solar_rate, } + if __name__ == "__main__": args = get_args() atl03_directory = os.path.join(args.workspace_path, args.atl03_path) shoreline_data_path = os.path.join(args.workspace_path, args.shoreline_data) output_dir = os.path.join(args.workspace_path, args.output_path) - df, models = analyze_icesat2_data(atl03_directory, args.beam, output_dir=output_dir, shoreline_data_path=shoreline_data_path) + df, models = analyze_icesat2_data( + atl03_directory, + args.beam, + output_dir=output_dir, + shoreline_data_path=shoreline_data_path, + ) print("Analysis complete.") - \ No newline at end of file diff --git a/aok/core/kd_utils/SolarBckgrd/config.py b/aok/core/kd_utils/SolarBckgrd/config.py index ebfbcaa..ed34089 100644 --- a/aok/core/kd_utils/SolarBckgrd/config.py +++ b/aok/core/kd_utils/SolarBckgrd/config.py @@ -1,79 +1,119 @@ import argparse + def get_args(): - parser = argparse.ArgumentParser(description="Analyze ICESat-2 ATL03 data for solar background.") + parser = argparse.ArgumentParser( + description="Analyze ICESat-2 ATL03 data for solar background." 
+ ) - #task name - parser.add_argument("--taskID", type=str, default='2022_granules', - help="task name") + # task name + parser.add_argument("--taskID", type=str, default="2022_granules", help="task name") # workspace path - parser.add_argument("--workspace_path", type=str, default='/work/users/w/a/wayne128/ICESat2/IS2_kd_py/', - help="Root path for all data processing.") + parser.add_argument( + "--workspace_path", + type=str, + default="/work/users/w/a/wayne128/ICESat2/IS2_kd_py/", + help="Root path for all data processing.", + ) - # parser.add_argument("--atl03_path", type=str, default='Dataset/ATL03_ICESat2/Extracted_HDF_Files/NC/NCCENTRAL/', # help="Base path for ICESat-2 ATL03 data.") - + # parser.add_argument("--atl03_path", type=str, default='Dataset/ATL03_ICESat2/Extracted_HDF_Files/NC/NCNORTH/', # help="Base path for ICESat-2 ATL03 data.") - + # parser.add_argument("--atl03_path", type=str, default='Dataset/ATL03_ICESat2/Extracted_HDF_Files/NC/NCSOUTH/', # help="Base path for ICESat-2 ATL03 data.") - + # parser.add_argument("--atl03_path", type=str, default='Dataset/ATL03_ICESat2/Extracted_HDF_Files/CD/2018_granules/h5_files/', # help="Base path for ICESat-2 ATL03 data.") - + # parser.add_argument("--atl03_path", type=str, default='Dataset/ATL03_ICESat2/Extracted_HDF_Files/CD/2019_granules/h5_files/', - # help="Base path for ICESat-2 ATL03 data.") - + # help="Base path for ICESat-2 ATL03 data.") + # parser.add_argument("--atl03_path", type=str, default='Dataset/ATL03_ICESat2/Extracted_HDF_Files/CD/2020_granules/h5_files/', # help="Base path for ICESat-2 ATL03 data.") - + # parser.add_argument("--atl03_path", type=str, default='Dataset/ATL03_ICESat2/Extracted_HDF_Files/CD/2021_granules/h5_files/', # help="Base path for ICESat-2 ATL03 data.") - - parser.add_argument("--atl03_path", type=str, default='Dataset/ATL03_ICESat2/Extracted_HDF_Files/CD/2020_granules/h5_files/', - help="Base path for ICESat-2 ATL03 data.") - + + parser.add_argument( + 
"--atl03_path", + type=str, + default="Dataset/ATL03_ICESat2/Extracted_HDF_Files/CD/2020_granules/h5_files/", + help="Base path for ICESat-2 ATL03 data.", + ) + # parser.add_argument("--parallel", action="store_true", help="Process files in parallel") parser.add_argument("--beam", default="gt1l", help="Beam to process (e.g., gt1l)") - - parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20221001234100_01721701_006_01.h5", - help="Name of the ATL03 H5 file to process.")#processed_ATL03_20221001114643_01641707_006_01 + parser.add_argument( + "--atl03_file", + type=str, + default="processed_ATL03_20221001234100_01721701_006_01.h5", + help="Name of the ATL03 H5 file to process.", + ) # processed_ATL03_20221001114643_01641707_006_01 + + parser.add_argument( + "--output_path", + type=str, + default="Dataset/Results/", + help="Directory for saving results.", + ) - parser.add_argument("--output_path", type=str, default='Dataset/Results/', - help="Directory for saving results.") - # Shoreline data - parser.add_argument("--shoreline_data", type=str, - # default='Shorelines/GeoPkgGlobalShoreline.gpkg', - default='Dataset/Shorelines/ne_10m_land/ne_10m_land.shp', - help="Path to the global shoreline dataset.") - + parser.add_argument( + "--shoreline_data", + type=str, + # default='Shorelines/GeoPkgGlobalShoreline.gpkg', + default="Dataset/Shorelines/ne_10m_land/ne_10m_land.shp", + help="Path to the global shoreline dataset.", + ) + # Bathymetry datasets - parser.add_argument("--gebco_path", nargs='+', type=str, - default='Dataset/Bathy/gebco_2024_geotiff/', - help="Path to GEBCO bathymetry datasets.") - + parser.add_argument( + "--gebco_path", + nargs="+", + type=str, + default="Dataset/Bathy/gebco_2024_geotiff/", + help="Path to GEBCO bathymetry datasets.", + ) + # Resolution settings - parser.add_argument("--horizontal_res", type=int, default=500, - help="Horizontal resolution for the analysis (in meters).") - parser.add_argument("--vertical_res", 
type=float, default=0.25, - help="Vertical resolution for the analysis.") - + parser.add_argument( + "--horizontal_res", + type=int, + default=500, + help="Horizontal resolution for the analysis (in meters).", + ) + parser.add_argument( + "--vertical_res", + type=float, + default=0.25, + help="Vertical resolution for the analysis.", + ) + # Analysis parameters - parser.add_argument("--subsurface_thresh", type=float, default=0.5, - help="Threshold for photons below the sea surface.") - parser.add_argument("--ignore_subsurface_height_thres", type=float, default=-1, - help="Maximum depth thres for Kd calculations (in meters).") - + parser.add_argument( + "--subsurface_thresh", + type=float, + default=0.5, + help="Threshold for photons below the sea surface.", + ) + parser.add_argument( + "--ignore_subsurface_height_thres", + type=float, + default=-1, + help="Maximum depth thres for Kd calculations (in meters).", + ) + return parser.parse_args() + if __name__ == "__main__": args = get_args() # Access parameters like this: atl03_file_path = args.workspace_path + args.atl03_file - + print("Processing file:", atl03_file_path) - print("Output path:", args.output_path) \ No newline at end of file + print("Output path:", args.output_path) diff --git a/aok/core/kd_utils/__init__.py b/aok/core/kd_utils/__init__.py index f8f285a..d10baab 100644 --- a/aok/core/kd_utils/__init__.py +++ b/aok/core/kd_utils/__init__.py @@ -1,8 +1,18 @@ # utils/__init__.py -from .data_processing import load_data, extract_file_params, Extract_sea_photons, create_photon_dataframe -from .visualization import plot_photon_height, plot_kd_photons -from .Kd_analysis import CalculateKdFromFilteredSubsurfacePhoton -from .interpolation import interpolate_labels, apply_interpolation, geoid_correction, refraction_correction from .bathy_processing import * -from .sea_photons_analysis import * \ No newline at end of file +from .data_processing import ( + Extract_sea_photons, + create_photon_dataframe, + 
extract_file_params, + load_data, +) +from .interpolation import ( + apply_interpolation, + geoid_correction, + interpolate_labels, + refraction_correction, +) +from .Kd_analysis import CalculateKdFromFilteredSubsurfacePhoton +from .sea_photons_analysis import * +from .visualization import plot_kd_photons, plot_photon_height diff --git a/aok/core/kd_utils/bathy_processing.py b/aok/core/kd_utils/bathy_processing.py index 73b028f..07e1f19 100644 --- a/aok/core/kd_utils/bathy_processing.py +++ b/aok/core/kd_utils/bathy_processing.py @@ -1,17 +1,18 @@ # utils/bathy_processing.py import os + +import numpy as np import pandas as pd import rasterio -import numpy as np +from rtree import index +from scipy.spatial import cKDTree +from scipy.stats import norm +from shapely.geometry import box + from kd_utils.sea_photons_analysis import ( get_sea_surface_height_adaptive, - get_sea_surface_height_static, horizontal_vertical_bin_dataset, ) -from rtree import index -from shapely.geometry import box, Point -from scipy.spatial import cKDTree -from scipy.stats import norm def apply_optional_histogram_quality_filter( @@ -20,7 +21,7 @@ def apply_optional_histogram_quality_filter( sea_surface_height, min_ratio=0.05, depth_min=6.0, - depth_max=7.0 + depth_max=7.0, ): """ Optional histogram quality check: @@ -29,28 +30,33 @@ def apply_optional_histogram_quality_filter( if subsurface_photon_dataset.empty: return subsurface_photon_dataset - grouped = binned_dataset_sea_surface.groupby(['lat_bins'], observed=False) + grouped = binned_dataset_sea_surface.groupby(["lat_bins"], observed=False) lat_bin_keys = list(grouped.groups.keys()) if len(lat_bin_keys) != len(sea_surface_height): return subsurface_photon_dataset - quality_df = pd.DataFrame({ - 'lat_bins': lat_bin_keys, - 'sea_surface_height': sea_surface_height - }).dropna(subset=['sea_surface_height']).copy() + quality_df = ( + pd.DataFrame( + {"lat_bins": lat_bin_keys, "sea_surface_height": sea_surface_height} + ) + 
.dropna(subset=["sea_surface_height"]) + .copy() + ) if quality_df.empty: return subsurface_photon_dataset.iloc[0:0].copy() ratios = [] for _, row in quality_df.iterrows(): - lat_bin = row['lat_bins'] - surface = row['sea_surface_height'] - bin_data = binned_dataset_sea_surface[binned_dataset_sea_surface['lat_bins'] == lat_bin] + lat_bin = row["lat_bins"] + surface = row["sea_surface_height"] + bin_data = binned_dataset_sea_surface[ + binned_dataset_sea_surface["lat_bins"] == lat_bin + ] if bin_data.empty: ratios.append(0.0) continue - depth = surface - bin_data['photon_height'] + depth = surface - bin_data["photon_height"] underwater_mask = depth > 0 total_underwater = int(underwater_mask.sum()) if total_underwater == 0: @@ -60,16 +66,22 @@ def apply_optional_histogram_quality_filter( band_mask = underwater_mask & (depth >= depth_min) & (depth <= depth_max) ratios.append(float(band_mask.sum()) / float(total_underwater)) - quality_df['hist_quality_ratio'] = ratios - valid_bins = set(quality_df.loc[quality_df['hist_quality_ratio'] >= min_ratio, 'lat_bins'].tolist()) - return subsurface_photon_dataset[subsurface_photon_dataset['lat_bins'].isin(valid_bins)].copy() + quality_df["hist_quality_ratio"] = ratios + valid_bins = set( + quality_df.loc[ + quality_df["hist_quality_ratio"] >= min_ratio, "lat_bins" + ].tolist() + ) + return subsurface_photon_dataset[ + subsurface_photon_dataset["lat_bins"].isin(valid_bins) + ].copy() def apply_optional_surface_sigma_filter( binned_dataset_sea_surface, subsurface_photon_dataset, sea_surface_height, - sigma_max=0.5 + sigma_max=0.5, ): """ Optional Gaussian surface sigma filter: @@ -78,24 +90,26 @@ def apply_optional_surface_sigma_filter( if subsurface_photon_dataset.empty: return subsurface_photon_dataset - grouped = binned_dataset_sea_surface.groupby(['lat_bins'], observed=False) + grouped = binned_dataset_sea_surface.groupby(["lat_bins"], observed=False) lat_bin_keys = list(grouped.groups.keys()) if len(lat_bin_keys) != 
len(sea_surface_height): return subsurface_photon_dataset sigma_rows = [] - for lat_bin, surface in zip(lat_bin_keys, sea_surface_height): + for lat_bin, surface in zip(lat_bin_keys, sea_surface_height, strict=False): if pd.isna(surface): continue - bin_data = binned_dataset_sea_surface[binned_dataset_sea_surface['lat_bins'] == lat_bin] + bin_data = binned_dataset_sea_surface[ + binned_dataset_sea_surface["lat_bins"] == lat_bin + ] if bin_data.empty: continue peak_data = bin_data[ - (bin_data['photon_height'] > surface - 1.0) & - (bin_data['photon_height'] < surface + 1.0) + (bin_data["photon_height"] > surface - 1.0) + & (bin_data["photon_height"] < surface + 1.0) ] if len(peak_data) > 10: - _, sigma = norm.fit(peak_data['photon_height']) + _, sigma = norm.fit(peak_data["photon_height"]) else: sigma = np.nan sigma_rows.append((lat_bin, sigma)) @@ -103,9 +117,13 @@ def apply_optional_surface_sigma_filter( if not sigma_rows: return subsurface_photon_dataset - sigma_df = pd.DataFrame(sigma_rows, columns=['lat_bins', 'surface_sigma']) - valid_bins = set(sigma_df.loc[sigma_df['surface_sigma'] <= sigma_max, 'lat_bins'].tolist()) - return subsurface_photon_dataset[subsurface_photon_dataset['lat_bins'].isin(valid_bins)].copy() + sigma_df = pd.DataFrame(sigma_rows, columns=["lat_bins", "surface_sigma"]) + valid_bins = set( + sigma_df.loc[sigma_df["surface_sigma"] <= sigma_max, "lat_bins"].tolist() + ) + return subsurface_photon_dataset[ + subsurface_photon_dataset["lat_bins"].isin(valid_bins) + ].copy() def apply_optional_refraction_correction( @@ -113,29 +131,42 @@ def apply_optional_refraction_correction( subsurface_photon_dataset, sea_surface_height, water_temp_c=20.0, - wavelength_nm=532.0 + wavelength_nm=532.0, ): """ Optional refraction correction for subsurface photons. Updates photon positions/heights using Snell-based geometry. 
""" - required_cols = {'lat_bins', 'photon_height', 'ref_elevation', 'ref_azimuth', 'lon', 'lat'} - if subsurface_photon_dataset.empty or (not required_cols.issubset(set(subsurface_photon_dataset.columns))): + required_cols = { + "lat_bins", + "photon_height", + "ref_elevation", + "ref_azimuth", + "lon", + "lat", + } + if subsurface_photon_dataset.empty or ( + not required_cols.issubset(set(subsurface_photon_dataset.columns)) + ): return subsurface_photon_dataset - grouped = binned_dataset_sea_surface.groupby(['lat_bins'], observed=False) + grouped = binned_dataset_sea_surface.groupby(["lat_bins"], observed=False) lat_bin_keys = list(grouped.groups.keys()) if len(lat_bin_keys) != len(sea_surface_height): return subsurface_photon_dataset - surface_df = pd.DataFrame({'lat_bins': lat_bin_keys, 'sea_surface_height': sea_surface_height}) - corrected = subsurface_photon_dataset.merge(surface_df, on='lat_bins', how='left').copy() - if corrected['sea_surface_height'].isna().all(): + surface_df = pd.DataFrame( + {"lat_bins": lat_bin_keys, "sea_surface_height": sea_surface_height} + ) + corrected = subsurface_photon_dataset.merge( + surface_df, on="lat_bins", how="left" + ).copy() + if corrected["sea_surface_height"].isna().all(): return subsurface_photon_dataset - corrected['photon_height_pre_refraction'] = corrected['photon_height'] - corrected['lon_pre_refraction'] = corrected['lon'] - corrected['lat_pre_refraction'] = corrected['lat'] + corrected["photon_height_pre_refraction"] = corrected["photon_height"] + corrected["lon_pre_refraction"] = corrected["lon"] + corrected["lat_pre_refraction"] = corrected["lat"] # Refraction index parameterization from legacy module. 
a = -0.000001501562500 @@ -144,14 +175,26 @@ def apply_optional_refraction_correction( d = -0.000160475520686 e = 1.398067112092424 n1 = 1.00029 - n2 = (a * water_temp_c ** 2) + (b * wavelength_nm ** 2) + (c * water_temp_c) + (d * wavelength_nm) + e + n2 = ( + (a * water_temp_c**2) + + (b * wavelength_nm**2) + + (c * water_temp_c) + + (d * wavelength_nm) + + e + ) - ref_elev = pd.to_numeric(corrected['ref_elevation'], errors='coerce').to_numpy(dtype=float) - ref_az = pd.to_numeric(corrected['ref_azimuth'], errors='coerce').to_numpy(dtype=float) - z = pd.to_numeric(corrected['photon_height'], errors='coerce').to_numpy(dtype=float) - ws = pd.to_numeric(corrected['sea_surface_height'], errors='coerce').to_numpy(dtype=float) - x = pd.to_numeric(corrected['lon'], errors='coerce').to_numpy(dtype=float) - y = pd.to_numeric(corrected['lat'], errors='coerce').to_numpy(dtype=float) + ref_elev = pd.to_numeric(corrected["ref_elevation"], errors="coerce").to_numpy( + dtype=float + ) + ref_az = pd.to_numeric(corrected["ref_azimuth"], errors="coerce").to_numpy( + dtype=float + ) + z = pd.to_numeric(corrected["photon_height"], errors="coerce").to_numpy(dtype=float) + ws = pd.to_numeric(corrected["sea_surface_height"], errors="coerce").to_numpy( + dtype=float + ) + x = pd.to_numeric(corrected["lon"], errors="coerce").to_numpy(dtype=float) + y = pd.to_numeric(corrected["lat"], errors="coerce").to_numpy(dtype=float) # Convert to radians if values look like degrees. 
if np.nanmax(np.abs(ref_elev)) > (2 * np.pi): @@ -159,7 +202,13 @@ def apply_optional_refraction_correction( if np.nanmax(np.abs(ref_az)) > (2 * np.pi): ref_az = np.deg2rad(ref_az) - valid_mask = np.isfinite(ref_elev) & np.isfinite(ref_az) & np.isfinite(z) & np.isfinite(ws) & (z <= ws) + valid_mask = ( + np.isfinite(ref_elev) + & np.isfinite(ref_az) + & np.isfinite(z) + & np.isfinite(ws) + & (z <= ws) + ) if not np.any(valid_mask): return subsurface_photon_dataset @@ -175,7 +224,7 @@ def apply_optional_refraction_correction( R = (S * n1) / n2 Gamma = (np.pi / 2.0) - theta1 phi = theta1 - theta2 - P_sq = np.maximum(R ** 2 + S ** 2 - 2 * R * S * np.cos(phi), 0.0) + P_sq = np.maximum(R**2 + S**2 - 2 * R * S * np.cos(phi), 0.0) P = np.sqrt(P_sq) alpha_arg = np.divide(R * np.sin(phi), P, out=np.zeros_like(P), where=(P != 0)) alpha_arg = np.clip(alpha_arg, -1.0, 1.0) @@ -191,9 +240,9 @@ def apply_optional_refraction_correction( y_corr = y[valid_mask] + DN z_corr = z[valid_mask] + DZ - corrected.loc[valid_mask, 'lon'] = x_corr - corrected.loc[valid_mask, 'lat'] = y_corr - corrected.loc[valid_mask, 'photon_height'] = z_corr + corrected.loc[valid_mask, "lon"] = x_corr + corrected.loc[valid_mask, "lat"] = y_corr + corrected.loc[valid_mask, "photon_height"] = z_corr return corrected @@ -202,7 +251,7 @@ def apply_sea_surface_flattening( sea_surface_height, lat_bin_keys, horizontal_res, - flattening_window_m=500 + flattening_window_m=500, ): """ Flatten small-bin sea-surface variations to the mean sea level of larger along-track windows. 
@@ -214,157 +263,185 @@ def apply_sea_surface_flattening( if len(sea_surface_height) != len(lat_bin_keys): return subsurface_photon_dataset - surface_df = pd.DataFrame({ - 'lat_bins': lat_bin_keys, - 'sea_surface_height_local': sea_surface_height - }).dropna(subset=['sea_surface_height_local']).copy() + surface_df = ( + pd.DataFrame( + {"lat_bins": lat_bin_keys, "sea_surface_height_local": sea_surface_height} + ) + .dropna(subset=["sea_surface_height_local"]) + .copy() + ) if surface_df.empty: return subsurface_photon_dataset - surface_df['lat_bins_num'] = pd.to_numeric(surface_df['lat_bins'], errors='coerce') - surface_df = surface_df.dropna(subset=['lat_bins_num']) + surface_df["lat_bins_num"] = pd.to_numeric(surface_df["lat_bins"], errors="coerce") + surface_df = surface_df.dropna(subset=["lat_bins_num"]) if surface_df.empty: return subsurface_photon_dataset big_bin_size = max(1, int(round(flattening_window_m / max(horizontal_res, 1)))) - surface_df['big_bin'] = (surface_df['lat_bins_num'].astype(int) // big_bin_size).astype(int) + surface_df["big_bin"] = ( + surface_df["lat_bins_num"].astype(int) // big_bin_size + ).astype(int) mean_surface = ( - surface_df.groupby('big_bin', observed=False)['sea_surface_height_local'] + surface_df.groupby("big_bin", observed=False)["sea_surface_height_local"] .mean() - .rename('sea_surface_height_bigbin_mean') + .rename("sea_surface_height_bigbin_mean") .reset_index() ) - surface_df = surface_df.merge(mean_surface, on='big_bin', how='left') - surface_df['sea_surface_flattening_offset'] = ( - surface_df['sea_surface_height_bigbin_mean'] - surface_df['sea_surface_height_local'] + surface_df = surface_df.merge(mean_surface, on="big_bin", how="left") + surface_df["sea_surface_flattening_offset"] = ( + surface_df["sea_surface_height_bigbin_mean"] + - surface_df["sea_surface_height_local"] ) flattened = subsurface_photon_dataset.copy() - flattened['lat_bins_num'] = pd.to_numeric(flattened['lat_bins'], errors='coerce') - 
flattened['big_bin'] = (flattened['lat_bins_num'].fillna(-1).astype(int) // big_bin_size).astype(int) + flattened["lat_bins_num"] = pd.to_numeric(flattened["lat_bins"], errors="coerce") + flattened["big_bin"] = ( + flattened["lat_bins_num"].fillna(-1).astype(int) // big_bin_size + ).astype(int) flattened = flattened.merge( - surface_df[['lat_bins', 'sea_surface_height_local', 'sea_surface_height_bigbin_mean', 'sea_surface_flattening_offset']], - on='lat_bins', - how='left' + surface_df[ + [ + "lat_bins", + "sea_surface_height_local", + "sea_surface_height_bigbin_mean", + "sea_surface_flattening_offset", + ] + ], + on="lat_bins", + how="left", ) - flattened['photon_height_original'] = flattened['photon_height'] - flattened['photon_height'] = ( - flattened['photon_height'] - flattened['sea_surface_flattening_offset'].fillna(0.0) + flattened["photon_height_original"] = flattened["photon_height"] + flattened["photon_height"] = flattened["photon_height"] - flattened[ + "sea_surface_flattening_offset" + ].fillna(0.0) + return flattened.drop(columns=["lat_bins_num", "big_bin"], errors="ignore") + + +# 1. Function to create an R-tree spatial index for raster bounds +def create_spatial_index(gebco_paths): + """ + Create an R-tree spatial index for raster bounds to quickly find relevant rasters. + + Parameters: + gebco_paths (list): List of paths to GEBCO raster files. + + Returns: + raster_data_dict (dict): Dictionary containing loaded rasters and their respective data. + spatial_index (rtree.index.Index): R-tree spatial index for raster bounds. + """ + idx = index.Index() + raster_data_dict = {} + for i, path in enumerate(gebco_paths): + with rasterio.Env(GTIFF_SRS_SOURCE="EPSG"): + gebco_raster = rasterio.open(path) + raster_data = gebco_raster.read(1) + raster_data_dict[path] = (gebco_raster, raster_data) + bounds = gebco_raster.bounds + idx.insert( + i, (bounds.left, bounds.bottom, bounds.right, bounds.top), obj=path + ) + return raster_data_dict, idx + + +# 2. 
Function to determine which rasters are needed for a given set of coordinates using spatial index in a batch manner +def get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index): + """ + Determine which raster datasets are relevant for the given coordinates using spatial index. + This function uses a more efficient approach by performing a bounding box query for batches of points. + + Parameters: + lons (numpy.ndarray): Array of longitudes. + lats (numpy.ndarray): Array of latitudes. + raster_data_dict (dict): Dictionary containing raster datasets and their data. + spatial_index (rtree.index.Index): R-tree spatial index for raster bounds. + + Returns: + relevant_rasters (list): List of relevant raster datasets and their respective data. + """ + # Create a bounding box that covers all points + min_lon, max_lon = lons.min(), lons.max() + min_lat, max_lat = lats.min(), lats.max() + bounding_box = box(min_lon, min_lat, max_lon, max_lat) + + # Get all rasters that intersect with the bounding box + matches = list(spatial_index.intersection((bounding_box.bounds), objects=True)) + relevant_paths = {match.object for match in matches} + relevant_rasters = [ + (raster_data_dict[path][0], raster_data_dict[path][1]) + for path in relevant_paths + ] + return relevant_rasters + + +# 3. Function to get seafloor elevation from the relevant rasters in a vectorized manner +def get_seafloor_elevation(lons, lats, relevant_rasters): + """ + Get seafloor elevations for a batch of points based on longitude and latitude from relevant rasters. + + Parameters: + lons (numpy.ndarray): Array of longitudes. + lats (numpy.ndarray): Array of latitudes. + relevant_rasters (list): List of relevant raster datasets and their respective data. + + Returns: + seafloor_elevations (numpy.ndarray): Array of seafloor elevations. 
+ """ + points = np.vstack((lons, lats)).T + seafloor_elevations = np.full(len(lons), np.nan) + + for gebco_raster, raster_data in relevant_rasters: + # Use rasterio.sample to get values for multiple points in a batch + values = list(gebco_raster.sample(points)) + for idx, value in enumerate(values): + if np.isnan(seafloor_elevations[idx]) and value is not None: + seafloor_elevations[idx] = value[0] + + return seafloor_elevations + + +# 4. The main process function that ties everything together +def process_seafloor_data( + sea_photon_dataset, gebco_paths, Ignore_Subsurface_Height_Thres +): + """ + Main function to process the seafloor data. + + Parameters: + gebco_paths (list): List of path to GEBCO raster files. + sea_photon_dataset (pandas.DataFrame): Dataset containing longitude, latitude, and photon height. + + Returns: + filtered_sea_photon_dataset (pandas.DataFrame): Filtered dataset containing points above the seafloor. + """ + + # Create spatial index and load GEBCO Raster Data + raster_data_dict, spatial_index = create_spatial_index(gebco_paths) + + # Get the relevant rasters using spatial index + lons = sea_photon_dataset["longitude"].values + lats = sea_photon_dataset["latitude"].values + relevant_rasters = get_relevant_rasters_using_index( + lons, lats, raster_data_dict, spatial_index + ) + + # Get seafloor elevation for all points + sea_photon_dataset["seafloor_elevation"] = get_seafloor_elevation( + lons, lats, relevant_rasters ) - return flattened.drop(columns=['lat_bins_num', 'big_bin'], errors='ignore') - -# 1. Function to create an R-tree spatial index for raster bounds -def create_spatial_index(gebco_paths): - """ - Create an R-tree spatial index for raster bounds to quickly find relevant rasters. - - Parameters: - gebco_paths (list): List of paths to GEBCO raster files. - - Returns: - raster_data_dict (dict): Dictionary containing loaded rasters and their respective data. 
- spatial_index (rtree.index.Index): R-tree spatial index for raster bounds. - """ - idx = index.Index() - raster_data_dict = {} - for i, path in enumerate(gebco_paths): - with rasterio.Env(GTIFF_SRS_SOURCE='EPSG'): - gebco_raster = rasterio.open(path) - raster_data = gebco_raster.read(1) - raster_data_dict[path] = (gebco_raster, raster_data) - bounds = gebco_raster.bounds - idx.insert(i, (bounds.left, bounds.bottom, bounds.right, bounds.top), obj=path) - return raster_data_dict, idx - -# 2. Function to determine which rasters are needed for a given set of coordinates using spatial index in a batch manner -def get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index): - """ - Determine which raster datasets are relevant for the given coordinates using spatial index. - This function uses a more efficient approach by performing a bounding box query for batches of points. - - Parameters: - lons (numpy.ndarray): Array of longitudes. - lats (numpy.ndarray): Array of latitudes. - raster_data_dict (dict): Dictionary containing raster datasets and their data. - spatial_index (rtree.index.Index): R-tree spatial index for raster bounds. - - Returns: - relevant_rasters (list): List of relevant raster datasets and their respective data. - """ - # Create a bounding box that covers all points - min_lon, max_lon = lons.min(), lons.max() - min_lat, max_lat = lats.min(), lats.max() - bounding_box = box(min_lon, min_lat, max_lon, max_lat) - - # Get all rasters that intersect with the bounding box - matches = list(spatial_index.intersection((bounding_box.bounds), objects=True)) - relevant_paths = {match.object for match in matches} - relevant_rasters = [(raster_data_dict[path][0], raster_data_dict[path][1]) for path in relevant_paths] - return relevant_rasters - - -# 3. 
Function to get seafloor elevation from the relevant rasters in a vectorized manner -def get_seafloor_elevation(lons, lats, relevant_rasters): - """ - Get seafloor elevations for a batch of points based on longitude and latitude from relevant rasters. - - Parameters: - lons (numpy.ndarray): Array of longitudes. - lats (numpy.ndarray): Array of latitudes. - relevant_rasters (list): List of relevant raster datasets and their respective data. - - Returns: - seafloor_elevations (numpy.ndarray): Array of seafloor elevations. - """ - points = np.vstack((lons, lats)).T - seafloor_elevations = np.full(len(lons), np.nan) - - for gebco_raster, raster_data in relevant_rasters: - # Use rasterio.sample to get values for multiple points in a batch - values = list(gebco_raster.sample(points)) - for idx, value in enumerate(values): - if np.isnan(seafloor_elevations[idx]) and value is not None: - seafloor_elevations[idx] = value[0] - - return seafloor_elevations - - -# 4. The main process function that ties everything together -def process_seafloor_data(sea_photon_dataset, gebco_paths, Ignore_Subsurface_Height_Thres): - """ - Main function to process the seafloor data. - - Parameters: - gebco_paths (list): List of path to GEBCO raster files. - sea_photon_dataset (pandas.DataFrame): Dataset containing longitude, latitude, and photon height. - - Returns: - filtered_sea_photon_dataset (pandas.DataFrame): Filtered dataset containing points above the seafloor. 
- """ - - # Create spatial index and load GEBCO Raster Data - raster_data_dict, spatial_index = create_spatial_index(gebco_paths) - - # Get the relevant rasters using spatial index - lons = sea_photon_dataset['longitude'].values - lats = sea_photon_dataset['latitude'].values - relevant_rasters = get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index) - - # Get seafloor elevation for all points - sea_photon_dataset['seafloor_elevation'] = get_seafloor_elevation(lons, lats, relevant_rasters) - - # Calculate depth (assuming seafloor_elevation is negative below sea level) - # If your elevation is positive below sea level, remove the negative sign - sea_photon_dataset['depth'] = -sea_photon_dataset['seafloor_elevation'] - - # Filter out points: - # 1. below the seafloor (photon_height > seafloor_elevation) - # 2. where depth is less than 6 meters (depth >= 6) + + # Calculate depth (assuming seafloor_elevation is negative below sea level) + # If your elevation is positive below sea level, remove the negative sign + sea_photon_dataset["depth"] = -sea_photon_dataset["seafloor_elevation"] + + # Filter out points: + # 1. below the seafloor (photon_height > seafloor_elevation) + # 2. where depth is less than 6 meters (depth >= 6) filtered_sea_photon_dataset = sea_photon_dataset[ - (sea_photon_dataset['photon_height'] > sea_photon_dataset['seafloor_elevation']) & - (sea_photon_dataset['depth'] >= Ignore_Subsurface_Height_Thres) + (sea_photon_dataset["photon_height"] > sea_photon_dataset["seafloor_elevation"]) + & (sea_photon_dataset["depth"] >= Ignore_Subsurface_Height_Thres) ] - + return filtered_sea_photon_dataset @@ -373,58 +450,81 @@ def load_atl24_points(atl24_file_path): Load ATL24 point dataset and normalize to columns: longitude, latitude, seafloor_elevation_atl24. 
""" if (not atl24_file_path) or (not os.path.exists(atl24_file_path)): - return pd.DataFrame(columns=['longitude', 'latitude', 'seafloor_elevation_atl24']) + return pd.DataFrame( + columns=["longitude", "latitude", "seafloor_elevation_atl24"] + ) lower_path = atl24_file_path.lower() - if lower_path.endswith(('.csv', '.txt')): + if lower_path.endswith((".csv", ".txt")): atl24_df = pd.read_csv(atl24_file_path) - elif lower_path.endswith('.parquet'): + elif lower_path.endswith(".parquet"): atl24_df = pd.read_parquet(atl24_file_path) - elif lower_path.endswith(('.gpkg', '.shp', '.geojson')): + elif lower_path.endswith((".gpkg", ".shp", ".geojson")): try: import geopandas as gpd + atl24_gdf = gpd.read_file(atl24_file_path) atl24_df = pd.DataFrame(atl24_gdf) - if ('longitude' not in atl24_df.columns or 'latitude' not in atl24_df.columns) and ('geometry' in atl24_gdf.columns): - atl24_df['longitude'] = atl24_gdf.geometry.x - atl24_df['latitude'] = atl24_gdf.geometry.y + if ( + "longitude" not in atl24_df.columns + or "latitude" not in atl24_df.columns + ) and ("geometry" in atl24_gdf.columns): + atl24_df["longitude"] = atl24_gdf.geometry.x + atl24_df["latitude"] = atl24_gdf.geometry.y except Exception: - return pd.DataFrame(columns=['longitude', 'latitude', 'seafloor_elevation_atl24']) + return pd.DataFrame( + columns=["longitude", "latitude", "seafloor_elevation_atl24"] + ) else: - return pd.DataFrame(columns=['longitude', 'latitude', 'seafloor_elevation_atl24']) + return pd.DataFrame( + columns=["longitude", "latitude", "seafloor_elevation_atl24"] + ) - lon_candidates = ['longitude', 'lon', 'x', 'LONGITUDE', 'LON'] - lat_candidates = ['latitude', 'lat', 'y', 'LATITUDE', 'LAT'] + lon_candidates = ["longitude", "lon", "x", "LONGITUDE", "LON"] + lat_candidates = ["latitude", "lat", "y", "LATITUDE", "LAT"] elev_candidates = [ - 'seafloor_elevation', 'bottom_elevation', 'bathymetry', 'elevation', 'z', - 'depth', 'water_depth' + "seafloor_elevation", + "bottom_elevation", + 
"bathymetry", + "elevation", + "z", + "depth", + "water_depth", ] lon_col = next((c for c in lon_candidates if c in atl24_df.columns), None) lat_col = next((c for c in lat_candidates if c in atl24_df.columns), None) elev_col = next((c for c in elev_candidates if c in atl24_df.columns), None) if lon_col is None or lat_col is None or elev_col is None: - return pd.DataFrame(columns=['longitude', 'latitude', 'seafloor_elevation_atl24']) - - out_df = pd.DataFrame({ - 'longitude': pd.to_numeric(atl24_df[lon_col], errors='coerce'), - 'latitude': pd.to_numeric(atl24_df[lat_col], errors='coerce'), - 'atl24_raw_bathy': pd.to_numeric(atl24_df[elev_col], errors='coerce') - }).dropna(subset=['longitude', 'latitude', 'atl24_raw_bathy']).copy() + return pd.DataFrame( + columns=["longitude", "latitude", "seafloor_elevation_atl24"] + ) + + out_df = ( + pd.DataFrame( + { + "longitude": pd.to_numeric(atl24_df[lon_col], errors="coerce"), + "latitude": pd.to_numeric(atl24_df[lat_col], errors="coerce"), + "atl24_raw_bathy": pd.to_numeric(atl24_df[elev_col], errors="coerce"), + } + ) + .dropna(subset=["longitude", "latitude", "atl24_raw_bathy"]) + .copy() + ) - if 'depth' in elev_col.lower() and ('elev' not in elev_col.lower()): - out_df['seafloor_elevation_atl24'] = -out_df['atl24_raw_bathy'].abs() + if "depth" in elev_col.lower() and ("elev" not in elev_col.lower()): + out_df["seafloor_elevation_atl24"] = -out_df["atl24_raw_bathy"].abs() else: - out_df['seafloor_elevation_atl24'] = out_df['atl24_raw_bathy'] + out_df["seafloor_elevation_atl24"] = out_df["atl24_raw_bathy"] - return out_df[['longitude', 'latitude', 'seafloor_elevation_atl24']] + return out_df[["longitude", "latitude", "seafloor_elevation_atl24"]] def process_seafloor_data_atl24( sea_photon_dataset, atl24_file_path, Ignore_Subsurface_Height_Thres, - max_match_distance_deg=0.01 + max_match_distance_deg=0.01, ): """ Match ATL24 bathymetry points to photons, then filter below seafloor and shallow bins. 
@@ -434,39 +534,43 @@ def process_seafloor_data_atl24( if atl24_points.empty: return sea_photon_dataset, 0 - photon_coords = sea_photon_dataset[['longitude', 'latitude']].to_numpy(dtype=float) - atl24_coords = atl24_points[['longitude', 'latitude']].to_numpy(dtype=float) + photon_coords = sea_photon_dataset[["longitude", "latitude"]].to_numpy(dtype=float) + atl24_coords = atl24_points[["longitude", "latitude"]].to_numpy(dtype=float) if len(photon_coords) == 0 or len(atl24_coords) == 0: return sea_photon_dataset, 0 tree = cKDTree(atl24_coords) - distances, indices = tree.query(photon_coords, k=1, distance_upper_bound=max_match_distance_deg) + distances, indices = tree.query( + photon_coords, k=1, distance_upper_bound=max_match_distance_deg + ) valid_match = np.isfinite(distances) & (indices < len(atl24_points)) matched_count = int(valid_match.sum()) if matched_count == 0: return sea_photon_dataset, 0 matched_dataset = sea_photon_dataset.copy() - matched_dataset['seafloor_elevation'] = np.nan - matched_dataset.loc[valid_match, 'seafloor_elevation'] = atl24_points['seafloor_elevation_atl24'].to_numpy()[indices[valid_match]] - matched_dataset = matched_dataset.dropna(subset=['seafloor_elevation']).copy() - matched_dataset['depth'] = -matched_dataset['seafloor_elevation'] + matched_dataset["seafloor_elevation"] = np.nan + matched_dataset.loc[valid_match, "seafloor_elevation"] = atl24_points[ + "seafloor_elevation_atl24" + ].to_numpy()[indices[valid_match]] + matched_dataset = matched_dataset.dropna(subset=["seafloor_elevation"]).copy() + matched_dataset["depth"] = -matched_dataset["seafloor_elevation"] filtered_dataset = matched_dataset[ - (matched_dataset['photon_height'] > matched_dataset['seafloor_elevation']) & - (matched_dataset['depth'] >= Ignore_Subsurface_Height_Thres) + (matched_dataset["photon_height"] > matched_dataset["seafloor_elevation"]) + & (matched_dataset["depth"] >= Ignore_Subsurface_Height_Thres) ] return filtered_dataset, matched_count - - -# 5. 
The function to get the subsurface photon dataset + + +# 5. The function to get the subsurface photon dataset def get_subsurface_photon( binned_dataset_sea_surface, GEBCO_paths, subsurface_thresh, Ignore_Subsurface_Height_Thres, use_atl24_filter=False, - atl24_file_path='', + atl24_file_path="", atl24_max_match_distance_deg=0.01, use_gebco_filter=False, apply_histogram_quality_filter=False, @@ -482,99 +586,113 @@ def get_subsurface_photon( apply_flattening=False, flattening_window_m=500, horizontal_res=500, - vertical_res=0.25 + vertical_res=0.25, ): - # get sea surface height - # threshold to determine how much depth below sea surface will be accounted - # Output: - # ->final_sea_surface_height - # ->sea_surface_height_abnormal_label - # ->sea_surface_dominated_label - # # static threshold - # sea_surface_height, sea_surface_label, solo_sea_surface_label, subsurface_photon_dataset = \ - # get_sea_surface_height_static(binned_dataset_sea_surface, subsurface_thresh) - # adaptive threshold - sea_surface_height, sea_surface_label, solo_sea_surface_label, subsurface_photon_dataset = \ - get_sea_surface_height_adaptive(binned_dataset_sea_surface) - - if apply_histogram_quality_filter: - subsurface_photon_dataset = apply_optional_histogram_quality_filter( + # get sea surface height + # threshold to determine how much depth below sea surface will be accounted + # Output: + # ->final_sea_surface_height + # ->sea_surface_height_abnormal_label + # ->sea_surface_dominated_label + # # static threshold + # sea_surface_height, sea_surface_label, solo_sea_surface_label, subsurface_photon_dataset = \ + # get_sea_surface_height_static(binned_dataset_sea_surface, subsurface_thresh) + # adaptive threshold + ( + sea_surface_height, + sea_surface_label, + solo_sea_surface_label, + subsurface_photon_dataset, + ) = get_sea_surface_height_adaptive(binned_dataset_sea_surface) + + if apply_histogram_quality_filter: + subsurface_photon_dataset = apply_optional_histogram_quality_filter( + 
binned_dataset_sea_surface, + subsurface_photon_dataset, + sea_surface_height, + min_ratio=histogram_quality_min_ratio, + depth_min=histogram_quality_depth_min, + depth_max=histogram_quality_depth_max, + ) + + if apply_surface_sigma_filter: + subsurface_photon_dataset = apply_optional_surface_sigma_filter( + binned_dataset_sea_surface, + subsurface_photon_dataset, + sea_surface_height, + sigma_max=surface_sigma_max, + ) + + if apply_refraction_correction: + subsurface_photon_dataset = apply_optional_refraction_correction( + binned_dataset_sea_surface, + subsurface_photon_dataset, + sea_surface_height, + water_temp_c=refraction_water_temp_c, + wavelength_nm=refraction_wavelength_nm, + ) + if apply_post_refraction_refit: + corrected_full_dataset = apply_optional_refraction_correction( binned_dataset_sea_surface, - subsurface_photon_dataset, + binned_dataset_sea_surface.copy(), sea_surface_height, - min_ratio=histogram_quality_min_ratio, - depth_min=histogram_quality_depth_min, - depth_max=histogram_quality_depth_max + water_temp_c=refraction_water_temp_c, + wavelength_nm=refraction_wavelength_nm, ) - - if apply_surface_sigma_filter: - subsurface_photon_dataset = apply_optional_surface_sigma_filter( - binned_dataset_sea_surface, - subsurface_photon_dataset, - sea_surface_height, - sigma_max=surface_sigma_max + rebinned_corrected = horizontal_vertical_bin_dataset( + corrected_full_dataset, horizontal_res, vertical_res ) - - if apply_refraction_correction: - subsurface_photon_dataset = apply_optional_refraction_correction( - binned_dataset_sea_surface, - subsurface_photon_dataset, + ( sea_surface_height, - water_temp_c=refraction_water_temp_c, - wavelength_nm=refraction_wavelength_nm - ) - if apply_post_refraction_refit: - corrected_full_dataset = apply_optional_refraction_correction( - binned_dataset_sea_surface, - binned_dataset_sea_surface.copy(), - sea_surface_height, - water_temp_c=refraction_water_temp_c, - wavelength_nm=refraction_wavelength_nm - ) - 
rebinned_corrected = horizontal_vertical_bin_dataset( - corrected_full_dataset, - horizontal_res, - vertical_res - ) - sea_surface_height, sea_surface_label, solo_sea_surface_label, subsurface_photon_dataset = \ - get_sea_surface_height_adaptive(rebinned_corrected) - - if use_atl24_filter: - atl24_filtered, atl24_match_count = process_seafloor_data_atl24( + sea_surface_label, + solo_sea_surface_label, subsurface_photon_dataset, - atl24_file_path, - abs(Ignore_Subsurface_Height_Thres), - max_match_distance_deg=atl24_max_match_distance_deg - ) - if atl24_match_count > 0: - filtered_seafloor_subsurface_photon_dataset = atl24_filtered - elif use_gebco_filter: - filtered_seafloor_subsurface_photon_dataset = process_seafloor_data( - subsurface_photon_dataset, GEBCO_paths, abs(Ignore_Subsurface_Height_Thres) - ) - else: - filtered_seafloor_subsurface_photon_dataset = subsurface_photon_dataset + ) = get_sea_surface_height_adaptive(rebinned_corrected) + + if use_atl24_filter: + atl24_filtered, atl24_match_count = process_seafloor_data_atl24( + subsurface_photon_dataset, + atl24_file_path, + abs(Ignore_Subsurface_Height_Thres), + max_match_distance_deg=atl24_max_match_distance_deg, + ) + if atl24_match_count > 0: + filtered_seafloor_subsurface_photon_dataset = atl24_filtered elif use_gebco_filter: filtered_seafloor_subsurface_photon_dataset = process_seafloor_data( - subsurface_photon_dataset, GEBCO_paths, abs(Ignore_Subsurface_Height_Thres) + subsurface_photon_dataset, + GEBCO_paths, + abs(Ignore_Subsurface_Height_Thres), ) else: filtered_seafloor_subsurface_photon_dataset = subsurface_photon_dataset + elif use_gebco_filter: + filtered_seafloor_subsurface_photon_dataset = process_seafloor_data( + subsurface_photon_dataset, GEBCO_paths, abs(Ignore_Subsurface_Height_Thres) + ) + else: + filtered_seafloor_subsurface_photon_dataset = subsurface_photon_dataset + + if apply_flattening: + lat_bin_keys = list( + binned_dataset_sea_surface.groupby( + ["lat_bins"], observed=False + 
).groups.keys() + ) + filtered_seafloor_subsurface_photon_dataset = apply_sea_surface_flattening( + filtered_seafloor_subsurface_photon_dataset, + sea_surface_height, + lat_bin_keys, + horizontal_res=horizontal_res, + flattening_window_m=flattening_window_m, + ) + return ( + sea_surface_height, + sea_surface_label, + filtered_seafloor_subsurface_photon_dataset, + ) - if apply_flattening: - lat_bin_keys = list( - binned_dataset_sea_surface.groupby(['lat_bins'], observed=False).groups.keys() - ) - filtered_seafloor_subsurface_photon_dataset = apply_sea_surface_flattening( - filtered_seafloor_subsurface_photon_dataset, - sea_surface_height, - lat_bin_keys, - horizontal_res=horizontal_res, - flattening_window_m=flattening_window_m - ) - return sea_surface_height, sea_surface_label, filtered_seafloor_subsurface_photon_dataset - # Main processing function to apply the subsurface photon filtering beam-by-beam def process_subsurface_photon_filtering( binned_dataset_sea_surface, @@ -582,7 +700,7 @@ def process_subsurface_photon_filtering( subsurface_thresh, Ignore_Subsurface_Height_Thres, use_atl24_filter=False, - atl24_file_path='', + atl24_file_path="", atl24_max_match_distance_deg=0.01, use_gebco_filter=False, apply_histogram_quality_filter=False, @@ -598,50 +716,53 @@ def process_subsurface_photon_filtering( apply_flattening=False, flattening_window_m=500, horizontal_res=500, - vertical_res=0.25 + vertical_res=0.25, ): - # Initialize lists to store results for each beam - sea_surface_heights = [] - sea_surface_labels = [] - filtered_beam_datasets = [] - - # Group the binned dataset by 'beam_id' and process each group separately - for beam_id, beam_data in binned_dataset_sea_surface.groupby('beam_id'): - print(f'Processing subsurface filtering for beam: {beam_id}') - + # Initialize lists to store results for each beam + sea_surface_heights = [] + sea_surface_labels = [] + filtered_beam_datasets = [] + + # Group the binned dataset by 'beam_id' and process each group 
separately + for beam_id, beam_data in binned_dataset_sea_surface.groupby("beam_id"): + print(f"Processing subsurface filtering for beam: {beam_id}") + # Apply get_subsurface_photon to the current beam's dataset - sea_surface_height, sea_surface_label, filtered_seafloor_subsurface_photon_dataset = \ - get_subsurface_photon( - beam_data, - GEBCO_paths, - subsurface_thresh, - Ignore_Subsurface_Height_Thres, - use_atl24_filter=use_atl24_filter, - atl24_file_path=atl24_file_path, - atl24_max_match_distance_deg=atl24_max_match_distance_deg, - use_gebco_filter=use_gebco_filter, - apply_histogram_quality_filter=apply_histogram_quality_filter, - histogram_quality_min_ratio=histogram_quality_min_ratio, - histogram_quality_depth_min=histogram_quality_depth_min, - histogram_quality_depth_max=histogram_quality_depth_max, - apply_surface_sigma_filter=apply_surface_sigma_filter, - surface_sigma_max=surface_sigma_max, - apply_refraction_correction=apply_refraction_correction, - refraction_water_temp_c=refraction_water_temp_c, - refraction_wavelength_nm=refraction_wavelength_nm, - apply_post_refraction_refit=apply_post_refraction_refit, - apply_flattening=apply_flattening, - flattening_window_m=flattening_window_m, - horizontal_res=horizontal_res, - vertical_res=vertical_res - ) - - # Append each result to the lists - sea_surface_heights.append(sea_surface_height) - sea_surface_labels.append(sea_surface_label) - filtered_beam_datasets.append(filtered_seafloor_subsurface_photon_dataset) - - # Combine all filtered beam datasets into a single DataFrame - combined_filtered_dataset = pd.concat(filtered_beam_datasets, ignore_index=True) - - return sea_surface_heights, sea_surface_labels, combined_filtered_dataset + ( + sea_surface_height, + sea_surface_label, + filtered_seafloor_subsurface_photon_dataset, + ) = get_subsurface_photon( + beam_data, + GEBCO_paths, + subsurface_thresh, + Ignore_Subsurface_Height_Thres, + use_atl24_filter=use_atl24_filter, + atl24_file_path=atl24_file_path, 
+ atl24_max_match_distance_deg=atl24_max_match_distance_deg, + use_gebco_filter=use_gebco_filter, + apply_histogram_quality_filter=apply_histogram_quality_filter, + histogram_quality_min_ratio=histogram_quality_min_ratio, + histogram_quality_depth_min=histogram_quality_depth_min, + histogram_quality_depth_max=histogram_quality_depth_max, + apply_surface_sigma_filter=apply_surface_sigma_filter, + surface_sigma_max=surface_sigma_max, + apply_refraction_correction=apply_refraction_correction, + refraction_water_temp_c=refraction_water_temp_c, + refraction_wavelength_nm=refraction_wavelength_nm, + apply_post_refraction_refit=apply_post_refraction_refit, + apply_flattening=apply_flattening, + flattening_window_m=flattening_window_m, + horizontal_res=horizontal_res, + vertical_res=vertical_res, + ) + + # Append each result to the lists + sea_surface_heights.append(sea_surface_height) + sea_surface_labels.append(sea_surface_label) + filtered_beam_datasets.append(filtered_seafloor_subsurface_photon_dataset) + + # Combine all filtered beam datasets into a single DataFrame + combined_filtered_dataset = pd.concat(filtered_beam_datasets, ignore_index=True) + + return sea_surface_heights, sea_surface_labels, combined_filtered_dataset diff --git a/aok/core/kd_utils/config.py b/aok/core/kd_utils/config.py index a3ce3af..c8b3b28 100644 --- a/aok/core/kd_utils/config.py +++ b/aok/core/kd_utils/config.py @@ -25,7 +25,6 @@ # # ATL03_h5_file = "processed_ATL03_20200331204156_00810707_006_01.h5"# - # #China Bohai # # ATL03_h5_file = "processed_ATL03_20191128115256_09560502_006_01.h5"# # # ATL03_h5_file = "processed_ATL03_20190530203316_09560302_006_02.h5"# @@ -33,7 +32,6 @@ # ATL03_h5_file = "processed_ATL03_20190829161305_09560402_006_02.h5"# - # ATL03_h5_file_path = path + ATL03_h5_file # Current_Path = 'C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/Dataset/' @@ -66,182 +64,316 @@ import argparse + def get_args(): - parser = argparse.ArgumentParser(description="Configure ICESat-2 
HLS analysis script.") - + parser = argparse.ArgumentParser( + description="Configure ICESat-2 HLS analysis script." + ) + # General paths - parser.add_argument("--workspace_path", type=str, default='C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/', - help="Root path for all data processing.") - + parser.add_argument( + "--workspace_path", + type=str, + default="C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/", + help="Root path for all data processing.", + ) + # parser.add_argument("--atl03_path", type=str, default='Dataset/ATL03_ICESat2/New/', # help="Base path for ICESat-2 ATL03 data.") - parser.add_argument("--atl03_path", type=str, default='Dataset/ATL03_ICESat2/', - help="Base path for ICESat-2 ATL03 data.") - - - + parser.add_argument( + "--atl03_path", + type=str, + default="Dataset/ATL03_ICESat2/", + help="Base path for ICESat-2 ATL03 data.", + ) + # #ChesapeakeBay # parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20230825074121_10102002_006_02.h5", - # help="Name of the ATL03 H5 file to process.") - + # help="Name of the ATL03 H5 file to process.") + # parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20221001113813_01641706_006_01.h5", # help="Name of the ATL03 H5 file to process.") - + # # India: processed_ATL03_20200331204156_00810707_006_01.h5 # parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20200331204156_00810707_006_01.h5", # help="Name of the ATL03 H5 file to process.") - + # #ATL03_20210628230109_00811207_006_01_subsetted - + # parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20210628230109_00811207_006_01.h5", # help="Name of the ATL03 H5 file to process.") - - + # # WaxDelta # parser.add_argument("--atl03_file", type=str, default="ATL03_20231103172707_06982106_006_01_subsetted.h5", # help="Name of the ATL03 H5 file to process.") - - + # # Bohai - parser.add_argument("--atl03_file", type=str, default="ATL03_20190829161305_09560402_006_02_subsetted.h5", - 
help="Name of the ATL03 H5 file to process.") - + parser.add_argument( + "--atl03_file", + type=str, + default="ATL03_20190829161305_09560402_006_02_subsetted.h5", + help="Name of the ATL03 H5 file to process.", + ) + # # Cook Inlet # parser.add_argument("--atl03_file", type=str, default="ATL03_20220507112145_06931503_006_01_subsetted.h5", # help="Name of the ATL03 H5 file to process.") - - - + # #Core Sound # parser.add_argument("--atl03_file", type=str, default="105430185_ATL03_20220423191113_04841506_006_02_subsetted.h5", # help="Name of the ATL03 H5 file to process.") - + # # Pamlico Sound # parser.add_argument("--atl03_file", type=str, default="ATL03_20240415083629_04232306_006_01_subsetted.h5", # help="Name of the ATL03 H5 file to process.") - - + # # #Cook Inlet: processed_ATL03_20210801125753_05941205_005_01 # parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20210801125753_05941205_005_01.h5", # help="Name of the ATL03 H5 file to process.") - - parser.add_argument("--other_data_path", type=str, default='Dataset/', - help="Path to the current working directory.") - - parser.add_argument("--output_path", type=str, default='Results/', - help="Directory for saving results.") - + + parser.add_argument( + "--other_data_path", + type=str, + default="Dataset/", + help="Path to the current working directory.", + ) + + parser.add_argument( + "--output_path", + type=str, + default="Results/", + help="Directory for saving results.", + ) + # Shoreline data - parser.add_argument("--shoreline_data", type=str, - # default='Shorelines/GeoPkgGlobalShoreline.gpkg', - default='Shorelines/ne_10m_land/ne_10m_land.shp', - help="Path to the global shoreline dataset.") - + parser.add_argument( + "--shoreline_data", + type=str, + # default='Shorelines/GeoPkgGlobalShoreline.gpkg', + default="Shorelines/ne_10m_land/ne_10m_land.shp", + help="Path to the global shoreline dataset.", + ) + # Bathymetry datasets - parser.add_argument("--gebco_path", nargs='+', 
type=str, - default='Bathy/gebco_2024_geotiff/', - help="Path to GEBCO bathymetry datasets.") - + parser.add_argument( + "--gebco_path", + nargs="+", + type=str, + default="Bathy/gebco_2024_geotiff/", + help="Path to GEBCO bathymetry datasets.", + ) + # Resolution settings - parser.add_argument("--horizontal_res", type=int, default=500, - help="Horizontal resolution for the analysis (in meters).") - - parser.add_argument("--vertical_res", type=float, default=0.25, - help="Vertical resolution for the analysis.") - + parser.add_argument( + "--horizontal_res", + type=int, + default=500, + help="Horizontal resolution for the analysis (in meters).", + ) + + parser.add_argument( + "--vertical_res", + type=float, + default=0.25, + help="Vertical resolution for the analysis.", + ) + # Analysis parameters - parser.add_argument("--subsurface_thresh", type=float, default=1.0, - help="Threshold for photons below the sea surface.") - - parser.add_argument("--ignore_subsurface_height_thres", type=float, default=-6, - help="Maximum depth thres for Kd calculations (in meters).") #5m - - parser.add_argument("--target_beams", type=str, default='', - help="Comma-separated beam IDs to process. 
Empty means auto-select strong beams.") - - parser.add_argument("--enable_solar_background_filter", action="store_true", - help="Optional: apply solar background filtering before binning.") - parser.add_argument("--solar_elevation_day_threshold", type=float, default=6.0, - help="Solar elevation threshold (deg) to define daytime for optional filtering.") - parser.add_argument("--solar_background_quantile", type=float, default=0.8, - help="Quantile threshold of daytime background rate considered high background.") - parser.add_argument("--solar_background_min_signal_conf", type=int, default=2, - help="Minimum photon signal confidence kept in high daytime background.") - - parser.add_argument("--enable_ir_ap_filter", action="store_true", - help="Optional: remove photons using IR/afterpulse proxy flags.") - parser.add_argument("--ir_ap_quality_max", type=int, default=0, - help="Maximum allowed quality_ph value when IR/AP filter is enabled.") - parser.add_argument("--ir_ap_min_signal_conf", type=int, default=0, - help="Minimum allowed photon_conf value when IR/AP filter is enabled.") - - parser.add_argument("--enable_gebco_filter", action="store_true", - help="Optional: remove photons below GEBCO seafloor and shallow bins.") - parser.add_argument("--enable_sea_surface_flattening", action="store_true", - help="Optional: flatten small-bin sea-surface variation using larger along-track windows.") - parser.add_argument("--sea_surface_flattening_window_m", type=int, default=500, - help="Window size in meters for sea-surface flattening mean level.") - - parser.add_argument("--enable_convex_hull_filter", action="store_true", - help="Optional: apply convex hull area filtering before Kd fitting.") - parser.add_argument("--convex_hull_area_threshold", type=float, default=100, - help="Minimum convex hull area to keep a horizontal bin when enabled.") - - parser.add_argument("--enable_histogram_quality_filter", action="store_true", - help="Optional: discard along-track bins with 
weak 6-7 m signal.") - parser.add_argument("--histogram_quality_min_ratio", type=float, default=0.05, - help="Minimum ratio of photons in depth band to keep a bin.") - parser.add_argument("--histogram_quality_depth_min", type=float, default=6.0, - help="Minimum depth (m below surface) of quality-check band.") - parser.add_argument("--histogram_quality_depth_max", type=float, default=7.0, - help="Maximum depth (m below surface) of quality-check band.") - - parser.add_argument("--enable_surface_sigma_filter", action="store_true", - help="Optional: discard bins where Gaussian surface sigma is too large.") - parser.add_argument("--surface_sigma_max", type=float, default=0.5, - help="Maximum allowed Gaussian sigma (m) for sea-surface peak.") - - parser.add_argument("--enable_refraction_correction", action="store_true", - help="Optional: apply refraction correction to subsurface photons.") - parser.add_argument("--refraction_water_temp_c", type=float, default=20.0, - help="Water temperature (deg C) used for refraction correction.") - parser.add_argument("--refraction_wavelength_nm", type=float, default=532.0, - help="Laser wavelength (nm) used for refraction correction.") - - parser.add_argument("--enable_post_refraction_refit", action="store_true", - help="Optional: rebuild histograms and re-fit surface after refraction correction.") - - parser.add_argument("--enable_atl24_filter", action="store_true", - help="Optional: use ATL24 bathymetry matching before GEBCO fallback.") - parser.add_argument("--atl24_file", type=str, default='', - help="Path to ATL24 point dataset (CSV/Parquet/GPKG/SHP) for bathymetry matching.") - parser.add_argument("--atl24_max_match_distance_deg", type=float, default=0.01, - help="Maximum lon/lat degree distance for ATL24 nearest-point matching.") - - parser.add_argument("--enable_paired_beam_combine", action="store_true", - help="Optional: combine paired beams (gt1, gt2, gt3) before Kd fitting.") - + parser.add_argument( + 
"--subsurface_thresh", + type=float, + default=1.0, + help="Threshold for photons below the sea surface.", + ) + + parser.add_argument( + "--ignore_subsurface_height_thres", + type=float, + default=-6, + help="Maximum depth thres for Kd calculations (in meters).", + ) # 5m + + parser.add_argument( + "--target_beams", + type=str, + default="", + help="Comma-separated beam IDs to process. Empty means auto-select strong beams.", + ) + + parser.add_argument( + "--enable_solar_background_filter", + action="store_true", + help="Optional: apply solar background filtering before binning.", + ) + parser.add_argument( + "--solar_elevation_day_threshold", + type=float, + default=6.0, + help="Solar elevation threshold (deg) to define daytime for optional filtering.", + ) + parser.add_argument( + "--solar_background_quantile", + type=float, + default=0.8, + help="Quantile threshold of daytime background rate considered high background.", + ) + parser.add_argument( + "--solar_background_min_signal_conf", + type=int, + default=2, + help="Minimum photon signal confidence kept in high daytime background.", + ) + + parser.add_argument( + "--enable_ir_ap_filter", + action="store_true", + help="Optional: remove photons using IR/afterpulse proxy flags.", + ) + parser.add_argument( + "--ir_ap_quality_max", + type=int, + default=0, + help="Maximum allowed quality_ph value when IR/AP filter is enabled.", + ) + parser.add_argument( + "--ir_ap_min_signal_conf", + type=int, + default=0, + help="Minimum allowed photon_conf value when IR/AP filter is enabled.", + ) + + parser.add_argument( + "--enable_gebco_filter", + action="store_true", + help="Optional: remove photons below GEBCO seafloor and shallow bins.", + ) + parser.add_argument( + "--enable_sea_surface_flattening", + action="store_true", + help="Optional: flatten small-bin sea-surface variation using larger along-track windows.", + ) + parser.add_argument( + "--sea_surface_flattening_window_m", + type=int, + default=500, + 
help="Window size in meters for sea-surface flattening mean level.", + ) + + parser.add_argument( + "--enable_convex_hull_filter", + action="store_true", + help="Optional: apply convex hull area filtering before Kd fitting.", + ) + parser.add_argument( + "--convex_hull_area_threshold", + type=float, + default=100, + help="Minimum convex hull area to keep a horizontal bin when enabled.", + ) + + parser.add_argument( + "--enable_histogram_quality_filter", + action="store_true", + help="Optional: discard along-track bins with weak 6-7 m signal.", + ) + parser.add_argument( + "--histogram_quality_min_ratio", + type=float, + default=0.05, + help="Minimum ratio of photons in depth band to keep a bin.", + ) + parser.add_argument( + "--histogram_quality_depth_min", + type=float, + default=6.0, + help="Minimum depth (m below surface) of quality-check band.", + ) + parser.add_argument( + "--histogram_quality_depth_max", + type=float, + default=7.0, + help="Maximum depth (m below surface) of quality-check band.", + ) + + parser.add_argument( + "--enable_surface_sigma_filter", + action="store_true", + help="Optional: discard bins where Gaussian surface sigma is too large.", + ) + parser.add_argument( + "--surface_sigma_max", + type=float, + default=0.5, + help="Maximum allowed Gaussian sigma (m) for sea-surface peak.", + ) + + parser.add_argument( + "--enable_refraction_correction", + action="store_true", + help="Optional: apply refraction correction to subsurface photons.", + ) + parser.add_argument( + "--refraction_water_temp_c", + type=float, + default=20.0, + help="Water temperature (deg C) used for refraction correction.", + ) + parser.add_argument( + "--refraction_wavelength_nm", + type=float, + default=532.0, + help="Laser wavelength (nm) used for refraction correction.", + ) + + parser.add_argument( + "--enable_post_refraction_refit", + action="store_true", + help="Optional: rebuild histograms and re-fit surface after refraction correction.", + ) + + 
parser.add_argument( + "--enable_atl24_filter", + action="store_true", + help="Optional: use ATL24 bathymetry matching before GEBCO fallback.", + ) + parser.add_argument( + "--atl24_file", + type=str, + default="", + help="Path to ATL24 point dataset (CSV/Parquet/GPKG/SHP) for bathymetry matching.", + ) + parser.add_argument( + "--atl24_max_match_distance_deg", + type=float, + default=0.01, + help="Maximum lon/lat degree distance for ATL24 nearest-point matching.", + ) + + parser.add_argument( + "--enable_paired_beam_combine", + action="store_true", + help="Optional: combine paired beams (gt1, gt2, gt3) before Kd fitting.", + ) + # switch on consider the coastal water or inland water # if Inland water, we should use one mask and for coastal water, it should be another mask - + # consider bathymetry or not manual setting - - - #night vs daytime - - #solar elevation detemine remove or not for solar background - - - # 1-2 hours - - + + # night vs daytime + + # solar elevation detemine remove or not for solar background + + # 1-2 hours + return parser.parse_args() + if __name__ == "__main__": args = get_args() # Access parameters like this: atl03_file_path = args.workspace_path + args.atl03_file - + print("Processing file:", atl03_file_path) print("Output path:", args.output_path) diff --git a/aok/core/kd_utils/data_processing.py b/aok/core/kd_utils/data_processing.py index 42df0a8..06dcb13 100644 --- a/aok/core/kd_utils/data_processing.py +++ b/aok/core/kd_utils/data_processing.py @@ -1,40 +1,26 @@ # utils/data_processing.py -import re -import os import io -import time -import math -import h5py import logging -import netCDF4 -import numpy as np +import math +import os +import re + import geopandas as gpd +import h5py +# from pyproj import Transformer +import numpy as np import pandas as pd -from datetime import datetime -import matplotlib.pyplot as plt -from sklearn.preprocessing import StandardScaler, MinMaxScaler +from pyproj import Proj, Transformer from 
scipy.spatial import ConvexHull -import argparse -import subprocess # import fiona -import utm -import pyproj -from pyproj import Transformer, Proj - -# from pyproj import Transformer -from matplotlib.widgets import LassoSelector -from matplotlib.path import Path - -from sklearn.cluster import DBSCAN from .interpolation import * - -#this function from icesat2_toolkit +# this function from icesat2_toolkit # PURPOSE: read ICESat-2 ATL03 HDF5 data files def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): """ @@ -58,9 +44,9 @@ def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): """ # Open the HDF5 file for reading if isinstance(FILENAME, io.IOBase): - fileID = h5py.File(FILENAME, 'r') + fileID = h5py.File(FILENAME, "r") else: - fileID = h5py.File(os.path.expanduser(FILENAME), 'r') + fileID = h5py.File(os.path.expanduser(FILENAME), "r") # Output HDF5 file information logging.info(fileID.filename) @@ -72,12 +58,12 @@ def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): # read each input beam within the file IS2_atl03_beams = [] - for gtx in [k for k in fileID.keys() if bool(re.match(r'gt\d[lr]',k))]: + for gtx in [k for k in fileID.keys() if bool(re.match(r"gt\d[lr]", k))]: # check if subsetted beam contains data # check in both the geolocation and heights groups try: - fileID[gtx]['geolocation']['segment_id'] - fileID[gtx]['heights']['delta_time'] + fileID[gtx]["geolocation"]["segment_id"] + fileID[gtx]["heights"]["delta_time"] except KeyError: pass else: @@ -85,82 +71,81 @@ def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): # for each included beam for gtx in IS2_atl03_beams: - # ------------------------------------------- # 1. make sure the beam-level dict exists IS2_atl03_attrs.setdefault(gtx, {}) # 2. 
always save the two “must-have” attributes - for key in ('atlas_beam_type', 'atlas_spot_number'): + for key in ("atlas_beam_type", "atlas_spot_number"): IS2_atl03_attrs[gtx][key] = fileID[gtx].attrs[key] - + # get each HDF5 variable IS2_atl03_mds[gtx] = {} - IS2_atl03_mds[gtx]['heights'] = {} - IS2_atl03_mds[gtx]['geolocation'] = {} - IS2_atl03_mds[gtx]['bckgrd_atlas'] = {} - IS2_atl03_mds[gtx]['geophys_corr'] = {} + IS2_atl03_mds[gtx]["heights"] = {} + IS2_atl03_mds[gtx]["geolocation"] = {} + IS2_atl03_mds[gtx]["bckgrd_atlas"] = {} + IS2_atl03_mds[gtx]["geophys_corr"] = {} # ICESat-2 Measurement Group - for key,val in fileID[gtx]['heights'].items(): - IS2_atl03_mds[gtx]['heights'][key] = val[:] + for key, val in fileID[gtx]["heights"].items(): + IS2_atl03_mds[gtx]["heights"][key] = val[:] # ICESat-2 Geolocation Group - for key,val in fileID[gtx]['geolocation'].items(): - IS2_atl03_mds[gtx]['geolocation'][key] = val[:] + for key, val in fileID[gtx]["geolocation"].items(): + IS2_atl03_mds[gtx]["geolocation"][key] = val[:] # ICESat-2 Background Photon Rate Group - for key,val in fileID[gtx]['bckgrd_atlas'].items(): - IS2_atl03_mds[gtx]['bckgrd_atlas'][key] = val[:] + for key, val in fileID[gtx]["bckgrd_atlas"].items(): + IS2_atl03_mds[gtx]["bckgrd_atlas"][key] = val[:] # ICESat-2 Geophysical Corrections Group: Values for tides (ocean, # solid earth, pole, load, and equilibrium), inverted barometer (IB) # effects, and range corrections for tropospheric delays - for key,val in fileID[gtx]['geophys_corr'].items(): - IS2_atl03_mds[gtx]['geophys_corr'][key] = val[:] + for key, val in fileID[gtx]["geophys_corr"].items(): + IS2_atl03_mds[gtx]["geophys_corr"][key] = val[:] # Getting attributes of included variables if ATTRIBUTES: # Getting attributes of IS2_atl03_mds beam variables IS2_atl03_attrs[gtx] = {} - IS2_atl03_attrs[gtx]['heights'] = {} - IS2_atl03_attrs[gtx]['geolocation'] = {} - IS2_atl03_attrs[gtx]['bckgrd_atlas'] = {} - IS2_atl03_attrs[gtx]['geophys_corr'] = {} 
- + IS2_atl03_attrs[gtx]["heights"] = {} + IS2_atl03_attrs[gtx]["geolocation"] = {} + IS2_atl03_attrs[gtx]["bckgrd_atlas"] = {} + IS2_atl03_attrs[gtx]["geophys_corr"] = {} + # Global Group Attributes - for att_name,att_val in fileID[gtx].attrs.items(): + for att_name, att_val in fileID[gtx].attrs.items(): IS2_atl03_attrs[gtx][att_name] = att_val # ICESat-2 Measurement Group - for key,val in fileID[gtx]['heights'].items(): - IS2_atl03_attrs[gtx]['heights'][key] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]['heights'][key][att_name]=att_val + for key, val in fileID[gtx]["heights"].items(): + IS2_atl03_attrs[gtx]["heights"][key] = {} + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]["heights"][key][att_name] = att_val # ICESat-2 Geolocation Group - for key,val in fileID[gtx]['geolocation'].items(): - IS2_atl03_attrs[gtx]['geolocation'][key] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]['geolocation'][key][att_name]=att_val + for key, val in fileID[gtx]["geolocation"].items(): + IS2_atl03_attrs[gtx]["geolocation"][key] = {} + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]["geolocation"][key][att_name] = att_val # ICESat-2 Background Photon Rate Group - for key,val in fileID[gtx]['bckgrd_atlas'].items(): - IS2_atl03_attrs[gtx]['bckgrd_atlas'][key] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]['bckgrd_atlas'][key][att_name]=att_val + for key, val in fileID[gtx]["bckgrd_atlas"].items(): + IS2_atl03_attrs[gtx]["bckgrd_atlas"][key] = {} + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]["bckgrd_atlas"][key][att_name] = att_val # ICESat-2 Geophysical Corrections Group - for key,val in fileID[gtx]['geophys_corr'].items(): - IS2_atl03_attrs[gtx]['geophys_corr'][key] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]['geophys_corr'][key][att_name]=att_val + for key, val in fileID[gtx]["geophys_corr"].items(): + 
IS2_atl03_attrs[gtx]["geophys_corr"][key] = {} + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]["geophys_corr"][key][att_name] = att_val # ICESat-2 spacecraft orientation at time - IS2_atl03_mds['orbit_info'] = {} - IS2_atl03_attrs['orbit_info'] = {} - for key,val in fileID['orbit_info'].items(): - IS2_atl03_mds['orbit_info'][key] = val[:] + IS2_atl03_mds["orbit_info"] = {} + IS2_atl03_attrs["orbit_info"] = {} + for key, val in fileID["orbit_info"].items(): + IS2_atl03_mds["orbit_info"][key] = val[:] # Getting attributes of group and included variables if ATTRIBUTES: # Global Group Attributes - for att_name,att_val in fileID['orbit_info'].attrs.items(): - IS2_atl03_attrs['orbit_info'][att_name] = att_val + for att_name, att_val in fileID["orbit_info"].attrs.items(): + IS2_atl03_attrs["orbit_info"][att_name] = att_val # Variable Attributes - IS2_atl03_attrs['orbit_info'][key] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs['orbit_info'][key][att_name] = att_val + IS2_atl03_attrs["orbit_info"][key] = {} + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs["orbit_info"][key][att_name] = att_val # information ancillary to the data product # number of GPS seconds between the GPS epoch (1980-01-06T00:00:00Z UTC) @@ -168,91 +153,108 @@ def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): # Add this value to delta time parameters to compute full gps_seconds # could alternatively use the Julian day of the ATLAS SDP epoch: 2458119.5 # and add leap seconds since 2018-01-01T00:00:00Z UTC (ATLAS SDP epoch) - IS2_atl03_mds['ancillary_data'] = {} - IS2_atl03_attrs['ancillary_data'] = {} - ancillary_keys = ['atlas_sdp_gps_epoch','data_end_utc','data_start_utc', - 'end_cycle','end_geoseg','end_gpssow','end_gpsweek','end_orbit', - 'end_region','end_rgt','granule_end_utc','granule_start_utc','release', - 'start_cycle','start_geoseg','start_gpssow','start_gpsweek', - 'start_orbit','start_region','start_rgt','version'] + 
IS2_atl03_mds["ancillary_data"] = {} + IS2_atl03_attrs["ancillary_data"] = {} + ancillary_keys = [ + "atlas_sdp_gps_epoch", + "data_end_utc", + "data_start_utc", + "end_cycle", + "end_geoseg", + "end_gpssow", + "end_gpsweek", + "end_orbit", + "end_region", + "end_rgt", + "granule_end_utc", + "granule_start_utc", + "release", + "start_cycle", + "start_geoseg", + "start_gpssow", + "start_gpsweek", + "start_orbit", + "start_region", + "start_rgt", + "version", + ] for key in ancillary_keys: # get each HDF5 variable - IS2_atl03_mds['ancillary_data'][key] = fileID['ancillary_data'][key][:] + IS2_atl03_mds["ancillary_data"][key] = fileID["ancillary_data"][key][:] # Getting attributes of group and included variables if ATTRIBUTES: # Variable Attributes - IS2_atl03_attrs['ancillary_data'][key] = {} - for att_name,att_val in fileID['ancillary_data'][key].attrs.items(): - IS2_atl03_attrs['ancillary_data'][key][att_name] = att_val + IS2_atl03_attrs["ancillary_data"][key] = {} + for att_name, att_val in fileID["ancillary_data"][key].attrs.items(): + IS2_atl03_attrs["ancillary_data"][key][att_name] = att_val # transmit-echo-path (tep) parameters - IS2_atl03_mds['ancillary_data']['tep'] = {} - IS2_atl03_attrs['ancillary_data']['tep'] = {} - for key,val in fileID['ancillary_data']['tep'].items(): + IS2_atl03_mds["ancillary_data"]["tep"] = {} + IS2_atl03_attrs["ancillary_data"]["tep"] = {} + for key, val in fileID["ancillary_data"]["tep"].items(): # get each HDF5 variable - IS2_atl03_mds['ancillary_data']['tep'][key] = val[:] + IS2_atl03_mds["ancillary_data"]["tep"][key] = val[:] # Getting attributes of group and included variables if ATTRIBUTES: # Variable Attributes - IS2_atl03_attrs['ancillary_data']['tep'][key] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs['ancillary_data']['tep'][key][att_name] = att_val + IS2_atl03_attrs["ancillary_data"]["tep"][key] = {} + for att_name, att_val in val.attrs.items(): + 
IS2_atl03_attrs["ancillary_data"]["tep"][key][att_name] = att_val # channel dead time and first photon bias derived from ATLAS calibration - cal1,cal2 = ('ancillary_data','calibrations') - for var in ['dead_time','first_photon_bias']: + cal1, cal2 = ("ancillary_data", "calibrations") + for var in ["dead_time", "first_photon_bias"]: IS2_atl03_mds[cal1][var] = {} IS2_atl03_attrs[cal1][var] = {} - for key,val in fileID[cal1][cal2][var].items(): + for key, val in fileID[cal1][cal2][var].items(): # get each HDF5 variable if isinstance(val, h5py.Dataset): IS2_atl03_mds[cal1][var][key] = val[:] elif isinstance(val, h5py.Group): IS2_atl03_mds[cal1][var][key] = {} - for k,v in val.items(): + for k, v in val.items(): IS2_atl03_mds[cal1][var][key][k] = v[:] # Getting attributes of group and included variables if ATTRIBUTES: # Variable Attributes IS2_atl03_attrs[cal1][var][key] = {} - for att_name,att_val in val.attrs.items(): + for att_name, att_val in val.attrs.items(): IS2_atl03_attrs[cal1][var][key][att_name] = att_val if isinstance(val, h5py.Group): - for k,v in val.items(): + for k, v in val.items(): IS2_atl03_attrs[cal1][var][key][k] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs[cal1][var][key][k][att_name]=att_val + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs[cal1][var][key][k][att_name] = att_val # get ATLAS impulse response variables for the transmitter echo path (TEP) - tep1,tep2 = ('atlas_impulse_response','tep_histogram') + tep1, tep2 = ("atlas_impulse_response", "tep_histogram") IS2_atl03_mds[tep1] = {} IS2_atl03_attrs[tep1] = {} - for pce in ['pce1_spot1','pce2_spot3']: - IS2_atl03_mds[tep1][pce] = {tep2:{}} - IS2_atl03_attrs[tep1][pce] = {tep2:{}} + for pce in ["pce1_spot1", "pce2_spot3"]: + IS2_atl03_mds[tep1][pce] = {tep2: {}} + IS2_atl03_attrs[tep1][pce] = {tep2: {}} # for each TEP variable - for key,val in fileID[tep1][pce][tep2].items(): + for key, val in fileID[tep1][pce][tep2].items(): 
IS2_atl03_mds[tep1][pce][tep2][key] = val[:] # Getting attributes of included variables if ATTRIBUTES: # Global Group Attributes - for att_name,att_val in fileID[tep1][pce][tep2].attrs.items(): + for att_name, att_val in fileID[tep1][pce][tep2].attrs.items(): IS2_atl03_attrs[tep1][pce][tep2][att_name] = att_val # Variable Attributes IS2_atl03_attrs[tep1][pce][tep2][key] = {} - for att_name,att_val in val.attrs.items(): + for att_name, att_val in val.attrs.items(): IS2_atl03_attrs[tep1][pce][tep2][key][att_name] = att_val # Global File Attributes if ATTRIBUTES: - for att_name,att_val in fileID.attrs.items(): + for att_name, att_val in fileID.attrs.items(): IS2_atl03_attrs[att_name] = att_val # Closing the HDF5 file fileID.close() # Return the datasets and variables - return (IS2_atl03_mds,IS2_atl03_attrs,IS2_atl03_beams) - + return (IS2_atl03_mds, IS2_atl03_attrs, IS2_atl03_beams) # convert_wgs_to_utm function, see https://stackoverflow.com/a/40140326/4556479 @@ -260,22 +262,22 @@ def convert_wgs_to_utm(lon: float, lat: float): """Based on lat and lng, return best utm epsg-code""" utm_band = str((math.floor((lon + 180) / 6) % 60) + 1) if len(utm_band) == 1: - utm_band = '0' + utm_band + utm_band = "0" + utm_band if lat >= 0: - epsg_code = 'epsg:326' + utm_band + epsg_code = "epsg:326" + utm_band return epsg_code - epsg_code = 'epsg:327' + utm_band + epsg_code = "epsg:327" + utm_band return epsg_code def orthometric_correction(lat, lon, Z, epsg): # Define the Proj string - #To transform from WGS84 ellipsoidal height + # To transform from WGS84 ellipsoidal height # to EGM2008 orthometric height using PyProj # proj_string = '+proj=latlong +ellps=WGS84 +datum=WGS84 +vunits=m +no_defs +geoidgrids=egm2008-1.gtx' # # Define the Proj string for WGS84 ellipsoidal height # wgs84_proj_string = '+proj=latlong +ellps=WGS84 +datum=WGS84 +no_defs' - + # # Define the Proj string for EGM2008 orthometric height: egm08_25,egm2008-1 # egm2008_proj_string = \ # '+proj=latlong 
+ellps=WGS84 +datum=WGS84 +no_defs ' \ @@ -293,7 +295,6 @@ def orthometric_correction(lat, lon, Z, epsg): return Y_utm, X_utm, Z_egm08 - def load_data(file_path, read_attributes=False): """ Wrapper around read_granule that lets you decide @@ -307,10 +308,8 @@ def load_data(file_path, read_attributes=False): If True, read_granule returns the full IS2_atl03_attrs tree. Defaults to False. """ - - - return read_granule(file_path, ATTRIBUTES = read_attributes) + return read_granule(file_path, ATTRIBUTES=read_attributes) # requires that the input gdf has ranged index values i @@ -321,16 +320,15 @@ def load_data(file_path, read_attributes=False): def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): # try loading the shoreline data try: - ICESat2_GDF.insert(0, 'lat', ICESat2_GDF.geometry.y, False) - ICESat2_GDF.insert(0, 'lon', ICESat2_GDF.geometry.x, False) + ICESat2_GDF.insert(0, "lat", ICESat2_GDF.geometry.y, False) + ICESat2_GDF.insert(0, "lon", ICESat2_GDF.geometry.x, False) # allocation of to be used arrays zero_int_array = np.int64(np.zeros_like(ICESat2_GDF.geometry.x)) # Land flag initialized as -1 # If shorelines downloaded already, will be set to 0 or 1 - ICESat2_GDF.insert(0, 'is_land', - zero_int_array - 1, False) + ICESat2_GDF.insert(0, "is_land", zero_int_array - 1, False) # set the projection ICESat2_GDF.set_crs("EPSG:4326", inplace=True) @@ -341,13 +339,15 @@ def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): # engine str, 'fiona' or 'pyogrio' # somtime it gives error if using fiona # land_polygon_gdf = gpd.read_file(shoreline_data_path, bbox=ICESat2_GDF, engine='fiona') - land_polygon_gdf = gpd.read_file(shoreline_data_path, bbox=ICESat2_GDF, engine='pyogrio') + land_polygon_gdf = gpd.read_file( + shoreline_data_path, bbox=ICESat2_GDF, engine="pyogrio" + ) # continue with getting a new array of 0-or-1 labels for each photon land_point_labels = np.zeros_like(ICESat2_GDF.is_land.values) # update labels for points in the land polygons 
- pts_in_land = gpd.sjoin(ICESat2_GDF, land_polygon_gdf, predicate='within') + pts_in_land = gpd.sjoin(ICESat2_GDF, land_polygon_gdf, predicate="within") # get land or not bool value land_loc = ICESat2_GDF.index.isin(pts_in_land.index) @@ -359,7 +359,6 @@ def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): return land_point_labels except Exception as e: - print(e) print("Error loading shoreline data, returning -1s for is_land flag") @@ -370,143 +369,185 @@ def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): return -np.ones_like(ICESat2_GDF.is_land.values) -def create_photon_dataframe(lat_ph, lon_ph, ref_elev, ref_azimuth, geoid, h_ph, \ - quality_ph, is_land_label_interp1d, signal_conf_photon, x_atc, relative_AT_dist, - solar_elevation=None, background_rate=None): +def create_photon_dataframe( + lat_ph, + lon_ph, + ref_elev, + ref_azimuth, + geoid, + h_ph, + quality_ph, + is_land_label_interp1d, + signal_conf_photon, + x_atc, + relative_AT_dist, + solar_elevation=None, + background_rate=None, +): # Apply geoid correction to the photon heights to convert them from ellipsoidal to orthometric heights h_ph_geoid_cor = h_ph[:] - geoid[:] - - # Determine the EPSG code for the UTM zone based on the first photon's longitude and latitude + + # Determine the EPSG code for the UTM zone based on the first photon's longitude and latitude epsg_code = convert_wgs_to_utm(lon_ph[0], lat_ph[0]) - + # Perform orthometric correction to obtain UTM coordinates and corrected heights lat_utm, lon_utm, h_ph_cor = orthometric_correction(lat_ph, lon_ph, h_ph, epsg_code) - + # Put the data into the dataframe - sea_photon_dataset = pd.DataFrame({ - 'latitude': lat_ph, - 'longitude': lon_ph, - 'lat': lat_utm, - 'lon': lon_utm, - 'photon_height': h_ph_geoid_cor, - 'quality_ph': quality_ph, - 'is_land_label': is_land_label_interp1d, - 'photon_conf': signal_conf_photon, - 'ref_elevation': ref_elev, - 'ref_azimuth': ref_azimuth, - 'relative_AT_dist': relative_AT_dist - }, 
columns=['latitude', 'longitude', 'lat', 'lon', 'photon_height', 'quality_ph', 'is_land_label', 'photon_conf', 'ref_elevation', 'ref_azimuth', 'relative_AT_dist']) - - if solar_elevation is not None: - sea_photon_dataset['solar_elevation'] = solar_elevation - if background_rate is not None: - sea_photon_dataset['background_rate'] = background_rate - - return sea_photon_dataset - - -def interpolate_by_time(source_time, source_values, target_time): - """Linearly interpolate source_values from source_time to target_time.""" - source_time = np.asarray(source_time) - source_values = np.asarray(source_values) - target_time = np.asarray(target_time) - - valid_mask = np.isfinite(source_time) & np.isfinite(source_values) - if valid_mask.sum() < 2: - return np.full_like(target_time, np.nan, dtype=float) - - sorted_idx = np.argsort(source_time[valid_mask]) - sorted_time = source_time[valid_mask][sorted_idx] - sorted_values = source_values[valid_mask][sorted_idx] - - unique_time, unique_idx = np.unique(sorted_time, return_index=True) - unique_values = sorted_values[unique_idx] - if unique_time.size < 2: - return np.full_like(target_time, unique_values[0], dtype=float) - - return np.interp(target_time, unique_time, unique_values, left=unique_values[0], right=unique_values[-1]) - - -def apply_optional_solar_background_filter( - sea_photon_dataset, - enabled=False, - day_threshold=6.0, - background_quantile=0.8, - min_signal_conf=2 -): - """ - Optionally filter photons under strong daytime solar background conditions. - Keeps high-confidence photons in high-background segments. 
- """ - if not enabled: - return sea_photon_dataset - - required_cols = {'solar_elevation', 'background_rate', 'photon_conf'} - if not required_cols.issubset(set(sea_photon_dataset.columns)): - logger.warning("Solar background filter skipped because required columns are missing.") - return sea_photon_dataset - - filtered_dataset = sea_photon_dataset.copy() - daytime_mask = filtered_dataset['solar_elevation'] >= day_threshold - daytime_background = filtered_dataset.loc[daytime_mask, 'background_rate'] - daytime_background = daytime_background[np.isfinite(daytime_background)] - - if daytime_background.empty: - logger.info("Solar background filter enabled, but no valid daytime photons were found.") - return filtered_dataset - - quantile_threshold = daytime_background.quantile(background_quantile) - noisy_daytime_mask = daytime_mask & (filtered_dataset['background_rate'] >= quantile_threshold) - keep_mask = (~noisy_daytime_mask) | (filtered_dataset['photon_conf'] >= min_signal_conf) - - before_count = len(filtered_dataset) - filtered_dataset = filtered_dataset.loc[keep_mask].copy() - after_count = len(filtered_dataset) - logger.info( - "Solar background filter removed %s photons (from %s to %s).", - before_count - after_count, before_count, after_count - ) - return filtered_dataset - - -def apply_optional_ir_ap_filter( - sea_photon_dataset, - enabled=False, - quality_max=0, - min_signal_conf=0 -): - """ - Optionally remove likely flagged photons as an IR/afterpulse proxy. 
- This uses available ATL03 photon fields: - - quality_ph: keep <= quality_max - - photon_conf: keep >= min_signal_conf - """ - if not enabled: - return sea_photon_dataset - - filtered_dataset = sea_photon_dataset.copy() - keep_mask = np.ones(len(filtered_dataset), dtype=bool) - - if 'quality_ph' in filtered_dataset.columns: - quality_values = pd.to_numeric(filtered_dataset['quality_ph'], errors='coerce') - keep_mask &= quality_values <= quality_max - else: - logger.warning("IR/AP filter enabled, but quality_ph is missing.") - - if 'photon_conf' in filtered_dataset.columns: - conf_values = pd.to_numeric(filtered_dataset['photon_conf'], errors='coerce') - keep_mask &= conf_values >= min_signal_conf - else: - logger.warning("IR/AP filter enabled, but photon_conf is missing.") - - before_count = len(filtered_dataset) - filtered_dataset = filtered_dataset.loc[keep_mask].copy() - after_count = len(filtered_dataset) - logger.info( - "IR/AP proxy filter removed %s photons (from %s to %s).", - before_count - after_count, before_count, after_count - ) - return filtered_dataset + sea_photon_dataset = pd.DataFrame( + { + "latitude": lat_ph, + "longitude": lon_ph, + "lat": lat_utm, + "lon": lon_utm, + "photon_height": h_ph_geoid_cor, + "quality_ph": quality_ph, + "is_land_label": is_land_label_interp1d, + "photon_conf": signal_conf_photon, + "ref_elevation": ref_elev, + "ref_azimuth": ref_azimuth, + "relative_AT_dist": relative_AT_dist, + }, + columns=[ + "latitude", + "longitude", + "lat", + "lon", + "photon_height", + "quality_ph", + "is_land_label", + "photon_conf", + "ref_elevation", + "ref_azimuth", + "relative_AT_dist", + ], + ) + + if solar_elevation is not None: + sea_photon_dataset["solar_elevation"] = solar_elevation + if background_rate is not None: + sea_photon_dataset["background_rate"] = background_rate + + return sea_photon_dataset + + +def interpolate_by_time(source_time, source_values, target_time): + """Linearly interpolate source_values from source_time to 
target_time.""" + source_time = np.asarray(source_time) + source_values = np.asarray(source_values) + target_time = np.asarray(target_time) + + valid_mask = np.isfinite(source_time) & np.isfinite(source_values) + if valid_mask.sum() < 2: + return np.full_like(target_time, np.nan, dtype=float) + + sorted_idx = np.argsort(source_time[valid_mask]) + sorted_time = source_time[valid_mask][sorted_idx] + sorted_values = source_values[valid_mask][sorted_idx] + + unique_time, unique_idx = np.unique(sorted_time, return_index=True) + unique_values = sorted_values[unique_idx] + if unique_time.size < 2: + return np.full_like(target_time, unique_values[0], dtype=float) + + return np.interp( + target_time, + unique_time, + unique_values, + left=unique_values[0], + right=unique_values[-1], + ) + + +def apply_optional_solar_background_filter( + sea_photon_dataset, + enabled=False, + day_threshold=6.0, + background_quantile=0.8, + min_signal_conf=2, +): + """ + Optionally filter photons under strong daytime solar background conditions. + Keeps high-confidence photons in high-background segments. + """ + if not enabled: + return sea_photon_dataset + + required_cols = {"solar_elevation", "background_rate", "photon_conf"} + if not required_cols.issubset(set(sea_photon_dataset.columns)): + logger.warning( + "Solar background filter skipped because required columns are missing." + ) + return sea_photon_dataset + + filtered_dataset = sea_photon_dataset.copy() + daytime_mask = filtered_dataset["solar_elevation"] >= day_threshold + daytime_background = filtered_dataset.loc[daytime_mask, "background_rate"] + daytime_background = daytime_background[np.isfinite(daytime_background)] + + if daytime_background.empty: + logger.info( + "Solar background filter enabled, but no valid daytime photons were found." 
+ ) + return filtered_dataset + + quantile_threshold = daytime_background.quantile(background_quantile) + noisy_daytime_mask = daytime_mask & ( + filtered_dataset["background_rate"] >= quantile_threshold + ) + keep_mask = (~noisy_daytime_mask) | ( + filtered_dataset["photon_conf"] >= min_signal_conf + ) + + before_count = len(filtered_dataset) + filtered_dataset = filtered_dataset.loc[keep_mask].copy() + after_count = len(filtered_dataset) + logger.info( + "Solar background filter removed %s photons (from %s to %s).", + before_count - after_count, + before_count, + after_count, + ) + return filtered_dataset + + +def apply_optional_ir_ap_filter( + sea_photon_dataset, enabled=False, quality_max=0, min_signal_conf=0 +): + """ + Optionally remove likely flagged photons as an IR/afterpulse proxy. + This uses available ATL03 photon fields: + - quality_ph: keep <= quality_max + - photon_conf: keep >= min_signal_conf + """ + if not enabled: + return sea_photon_dataset + + filtered_dataset = sea_photon_dataset.copy() + keep_mask = np.ones(len(filtered_dataset), dtype=bool) + + if "quality_ph" in filtered_dataset.columns: + quality_values = pd.to_numeric(filtered_dataset["quality_ph"], errors="coerce") + keep_mask &= quality_values <= quality_max + else: + logger.warning("IR/AP filter enabled, but quality_ph is missing.") + + if "photon_conf" in filtered_dataset.columns: + conf_values = pd.to_numeric(filtered_dataset["photon_conf"], errors="coerce") + keep_mask &= conf_values >= min_signal_conf + else: + logger.warning("IR/AP filter enabled, but photon_conf is missing.") + + before_count = len(filtered_dataset) + filtered_dataset = filtered_dataset.loc[keep_mask].copy() + after_count = len(filtered_dataset) + logger.info( + "IR/AP proxy filter removed %s photons (from %s to %s).", + before_count - after_count, + before_count, + after_count, + ) + return filtered_dataset def Extract_sea_photons(IS2_atl03_mds, target_strong_beams, shoreline_data_path): @@ -524,105 +565,126 @@ 
def Extract_sea_photons(IS2_atl03_mds, target_strong_beams, shoreline_data_path) Segment_ref_azimuth = {} background_rate = {} background_counts = {} - + # Initialize a list to store data for each beam beam_datasets = [] - - + # Loop over each strong beam in target_strong_beams for gtx in target_strong_beams: - print('Processing strong beam ID:', gtx) - + print("Processing strong beam ID:", gtx) + # Access the data for the current beam IS2_val = IS2_atl03_mds[gtx] - + # Initialize dictionaries to store segment data for the beam - Segment_ID[gtx] = IS2_val['geolocation']['segment_id'] + Segment_ID[gtx] = IS2_val["geolocation"]["segment_id"] n_seg = len(Segment_ID[gtx]) - n_pe, = IS2_val['heights']['delta_time'].shape - Segment_Index_begin[gtx] = IS2_val['geolocation']['ph_index_beg'] - 1 - Segment_PE_count[gtx] = IS2_val['geolocation']['segment_ph_cnt'] - Equator_Segment_Distance[gtx] = IS2_val['geolocation']['segment_dist_x'] - Segment_Length[gtx] = IS2_val['geolocation']['segment_length'] - delta_time = IS2_val['geolocation']['delta_time'] - segment_lat = IS2_val['geolocation']['reference_photon_lat'][:].copy() - segment_lon = IS2_val['geolocation']['reference_photon_lon'][:].copy() - ref_elev = IS2_val['geolocation']['ref_elev'][:].copy() - ref_azimuth = IS2_val['geolocation']['ref_azimuth'][:].copy() - geoid = IS2_val['geophys_corr']['geoid'][:].copy() - h_ph = IS2_val['heights']['h_ph'][:].copy() - photon_delta_time = IS2_val['heights']['delta_time'][:].copy() - lat_ph = IS2_val['heights']['lat_ph'][:].copy() - lon_ph = IS2_val['heights']['lon_ph'][:].copy() - signal_conf_photon = IS2_val['heights']['signal_conf_ph'][..., 0].copy() - x_atc = IS2_val['heights']['dist_ph_along'][:].copy() - y_atc = IS2_val['heights']['dist_ph_across'][:].copy() - quality_ph = IS2_val['heights']['quality_ph'] - - # Optional variables for solar background sensitivity testing - photon_solar_elevation = np.full_like(photon_delta_time, np.nan, dtype=float) - photon_background_rate = 
np.full_like(photon_delta_time, np.nan, dtype=float) - if 'solar_elevation' in IS2_val['geolocation']: - segment_solar_elevation = IS2_val['geolocation']['solar_elevation'][:].copy() - photon_solar_elevation = interpolate_by_time( - delta_time, segment_solar_elevation, photon_delta_time - ) - if 'bckgrd_atlas' in IS2_val and 'bckgrd_rate' in IS2_val['bckgrd_atlas'] and 'delta_time' in IS2_val['bckgrd_atlas']: - bckgrd_rate = IS2_val['bckgrd_atlas']['bckgrd_rate'][:].copy() - bckgrd_time = IS2_val['bckgrd_atlas']['delta_time'][:].copy() - photon_background_rate = interpolate_by_time( - bckgrd_time, bckgrd_rate, photon_delta_time - ) - - # Adjust x_atc based on segment distances + (n_pe,) = IS2_val["heights"]["delta_time"].shape + Segment_Index_begin[gtx] = IS2_val["geolocation"]["ph_index_beg"] - 1 + Segment_PE_count[gtx] = IS2_val["geolocation"]["segment_ph_cnt"] + Equator_Segment_Distance[gtx] = IS2_val["geolocation"]["segment_dist_x"] + Segment_Length[gtx] = IS2_val["geolocation"]["segment_length"] + delta_time = IS2_val["geolocation"]["delta_time"] + segment_lat = IS2_val["geolocation"]["reference_photon_lat"][:].copy() + segment_lon = IS2_val["geolocation"]["reference_photon_lon"][:].copy() + ref_elev = IS2_val["geolocation"]["ref_elev"][:].copy() + ref_azimuth = IS2_val["geolocation"]["ref_azimuth"][:].copy() + geoid = IS2_val["geophys_corr"]["geoid"][:].copy() + h_ph = IS2_val["heights"]["h_ph"][:].copy() + photon_delta_time = IS2_val["heights"]["delta_time"][:].copy() + lat_ph = IS2_val["heights"]["lat_ph"][:].copy() + lon_ph = IS2_val["heights"]["lon_ph"][:].copy() + signal_conf_photon = IS2_val["heights"]["signal_conf_ph"][..., 0].copy() + x_atc = IS2_val["heights"]["dist_ph_along"][:].copy() + y_atc = IS2_val["heights"]["dist_ph_across"][:].copy() + quality_ph = IS2_val["heights"]["quality_ph"] + + # Optional variables for solar background sensitivity testing + photon_solar_elevation = np.full_like(photon_delta_time, np.nan, dtype=float) + 
photon_background_rate = np.full_like(photon_delta_time, np.nan, dtype=float) + if "solar_elevation" in IS2_val["geolocation"]: + segment_solar_elevation = IS2_val["geolocation"]["solar_elevation"][ + : + ].copy() + photon_solar_elevation = interpolate_by_time( + delta_time, segment_solar_elevation, photon_delta_time + ) + if ( + "bckgrd_atlas" in IS2_val + and "bckgrd_rate" in IS2_val["bckgrd_atlas"] + and "delta_time" in IS2_val["bckgrd_atlas"] + ): + bckgrd_rate = IS2_val["bckgrd_atlas"]["bckgrd_rate"][:].copy() + bckgrd_time = IS2_val["bckgrd_atlas"]["delta_time"][:].copy() + photon_background_rate = interpolate_by_time( + bckgrd_time, bckgrd_rate, photon_delta_time + ) + + # Adjust x_atc based on segment distances for seg_index in range(n_seg): idx = Segment_Index_begin[gtx][seg_index] cnt = Segment_PE_count[gtx][seg_index] - x_atc[idx:idx + cnt] += Equator_Segment_Distance[gtx][seg_index] + x_atc[idx : idx + cnt] += Equator_Segment_Distance[gtx][seg_index] - # Calculate relative distances + # Calculate relative distances relative_AT_dist = (x_atc - x_atc[0]) / 1000 - relative_seg_dist = (Equator_Segment_Distance[gtx] - Equator_Segment_Distance[gtx][0]) / 1000 - + relative_seg_dist = ( + Equator_Segment_Distance[gtx] - Equator_Segment_Distance[gtx][0] + ) / 1000 + # Create a GeoDataFrame to hold segment data for shoreline check - Segment_Is_Land['geometry'] = gpd.points_from_xy(segment_lon, segment_lat) + Segment_Is_Land["geometry"] = gpd.points_from_xy(segment_lon, segment_lat) ICESat2_GDF = gpd.GeoDataFrame(Segment_Is_Land, crs="EPSG:4326") - + # Determine if it is land by the land/sea mask - Segment_Is_Land_Labels = isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF) - ICESat2_GDF.loc[:, 'is_land'] = Segment_Is_Land_Labels + Segment_Is_Land_Labels = isolate_sea_land_photons( + shoreline_data_path, ICESat2_GDF + ) + ICESat2_GDF.loc[:, "is_land"] = Segment_Is_Land_Labels # Apply interpolations for required data - is_land_label_interp1d = 
apply_interpolation(interpolate_labels(segment_lat, Segment_Is_Land_Labels), lat_ph) - ph_ref_elev = apply_interpolation(interpolate_labels(segment_lat, ref_elev), lat_ph) - ph_ref_azimuth = apply_interpolation(interpolate_labels(segment_lat, ref_azimuth), lat_ph) + is_land_label_interp1d = apply_interpolation( + interpolate_labels(segment_lat, Segment_Is_Land_Labels), lat_ph + ) + ph_ref_elev = apply_interpolation( + interpolate_labels(segment_lat, ref_elev), lat_ph + ) + ph_ref_azimuth = apply_interpolation( + interpolate_labels(segment_lat, ref_azimuth), lat_ph + ) ph_geoid = apply_interpolation(interpolate_labels(segment_lat, geoid), lat_ph) - - # Create photon DataFrame for the current beam - sea_photon_dataset = create_photon_dataframe(lat_ph=lat_ph, lon_ph=lon_ph, - ref_elev=ph_ref_elev,ref_azimuth=ph_ref_azimuth, - geoid=ph_geoid,h_ph=h_ph, - quality_ph=quality_ph, - is_land_label_interp1d=is_land_label_interp1d, - signal_conf_photon=signal_conf_photon, - x_atc=x_atc, # Note: x_atc is not used in the function. Remove if unnecessary. - relative_AT_dist=relative_AT_dist, - solar_elevation=photon_solar_elevation, - background_rate=photon_background_rate) + # Create photon DataFrame for the current beam + sea_photon_dataset = create_photon_dataframe( + lat_ph=lat_ph, + lon_ph=lon_ph, + ref_elev=ph_ref_elev, + ref_azimuth=ph_ref_azimuth, + geoid=ph_geoid, + h_ph=h_ph, + quality_ph=quality_ph, + is_land_label_interp1d=is_land_label_interp1d, + signal_conf_photon=signal_conf_photon, + x_atc=x_atc, # Note: x_atc is not used in the function. Remove if unnecessary. 
+ relative_AT_dist=relative_AT_dist, + solar_elevation=photon_solar_elevation, + background_rate=photon_background_rate, + ) # Filter out land photons - sea_photon_dataset = sea_photon_dataset[sea_photon_dataset['is_land_label'] != 1] - + sea_photon_dataset = sea_photon_dataset[ + sea_photon_dataset["is_land_label"] != 1 + ] + # Add a new column to indicate the beam ID - sea_photon_dataset['beam_id'] = gtx - + sea_photon_dataset["beam_id"] = gtx + # Append the processed dataset for the current beam to the list beam_datasets.append(sea_photon_dataset) - + # Concatenate all beam data into a single DataFrame all_beams_dataset = pd.concat(beam_datasets, ignore_index=True) - + return all_beams_dataset @@ -631,103 +693,116 @@ def filter_photon_dataset_by_hull_area(photon_dataset, hull_area_threshold=3000) Filters photon dataset based on ConvexHull area threshold and returns the filtered dataset, convex hull areas, and convex hull points. """ - lat_bins_grouped = photon_dataset.groupby('lat_bins', observed=False) + lat_bins_grouped = photon_dataset.groupby("lat_bins", observed=False) filtered_dataset = photon_dataset.copy() - + convex_hulls = {} convex_hull_areas = {} - + for lat_bin, bin_data in lat_bins_grouped: if len(bin_data) >= 3: - points = bin_data[['lat', 'photon_height']].to_numpy() + points = bin_data[["lat", "photon_height"]].to_numpy() hull = ConvexHull(points) area = hull.volume convex_hull_areas[lat_bin] = area - + # Store the ConvexHull points if area meets the threshold if area >= hull_area_threshold: convex_hulls[lat_bin] = points[hull.vertices] else: # Remove bins with hull area below the threshold - filtered_dataset = filtered_dataset[filtered_dataset['lat_bins'] != lat_bin] - - return filtered_dataset, convex_hull_areas, convex_hulls - - - - - - + filtered_dataset = filtered_dataset[ + filtered_dataset["lat_bins"] != lat_bin + ] + return filtered_dataset, convex_hull_areas, convex_hulls ########################## -##Discard Functions Below +##Discard 
Functions Below ########################## -# Extracts key information from filenames, +# Extracts key information from filenames, # which could include processed status, product ID, timestamps, identifiers, or metadata. def extract_file_params(file_path): - ''' + """ Example: input "processed_ATL06_20231115120000_20231115_001_12_data.h5" - Output: ('processed_', 'ATL06', '2023', '11', '15', '12', '00', '00', + Output: ('processed_', 'ATL06', '2023', '11', '15', '12', '00', '00', '2023', '11', '15', '001', '12', '_data') - ''' + """ # Defines a regex pattern to match strings in the file path - rx = re.compile(r'(processed_)?(ATL\d{2})_(\d{4})(\d{2})(\d{2})(\d{2})' - r'(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})_(\d{3})_(\d{2})(.*?).h5$') + rx = re.compile( + r"(processed_)?(ATL\d{2})_(\d{4})(\d{2})(\d{2})(\d{2})" + r"(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})_(\d{3})_(\d{2})(.*?).h5$" + ) # Searches for all matches of the regex pattern in the given params = rx.findall(file_path).pop() return params def create_mask(binned_data, sea_surface_height, seafloor_height, threshold_height): - ''' + """ create a mask for filtering sea surface, sea floor, and threshold_height - ''' - mask = (binned_data['height'] <= sea_surface_height) & \ - (binned_data['height'] >= seafloor_height) & \ - (binned_data['height'] >= threshold_height) + """ + mask = ( + (binned_data["height"] <= sea_surface_height) + & (binned_data["height"] >= seafloor_height) + & (binned_data["height"] >= threshold_height) + ) return mask -def generate_polygons_from_binned_data(lat, height, mask, lat_interval, height_interval): - ''' +def generate_polygons_from_binned_data( + lat, height, mask, lat_interval, height_interval +): + """ horizontal_vertical_bin_dataset Generate polygons for each bin after masking within a rectangle formed by height and latitude intervals - ''' + """ # Create latitude bins based on the specified interval lat_bins = np.arange(lat.min(), lat.max() + lat_interval, lat_interval) - + # Create 
height bins within the range [-12, 2] based on the specified interval height_bins = np.arange(-12, 2 + height_interval, height_interval) - + # Identify which bin each masked latitude and height value belongs to lat_bin_indices = np.digitize(lat[mask], lat_bins) - + # Determine which height bin each masked point belongs to height_bin_indices = np.digitize(height[mask], height_bins) - + # Combine latitude and height bin indices for each point - combined_bins = list(zip(lat_bin_indices, height_bin_indices)) - - # Find unique bin combinations and count the number of points in each bin + combined_bins = list(zip(lat_bin_indices, height_bin_indices, strict=False)) + + # Find unique bin combinations and count the number of points in each bin unique_bins, counts = np.unique(combined_bins, axis=0, return_counts=True) - - # Filter bins to include only those within valid index ranges - valid_bins = [(bin[0], bin[1]) for bin in unique_bins if 1 <= bin[0] < len(lat_bins) and 1 <= bin[1] < len(height_bins)] + + # Filter bins to include only those within valid index ranges + valid_bins = [ + (bin[0], bin[1]) + for bin in unique_bins + if 1 <= bin[0] < len(lat_bins) and 1 <= bin[1] < len(height_bins) + ] polygons = [] for bin_lat, bin_height in valid_bins: - if bin_lat > 0 and bin_lat < len(lat_bins) and bin_height > 0 and bin_height < len(height_bins): - bin_points = np.array([(lat[mask][i], height[mask][i]) for i in range(sum(mask)) - if lat_bin_indices[i] == bin_lat and height_bin_indices[i] == bin_height]) + if ( + bin_lat > 0 + and bin_lat < len(lat_bins) + and bin_height > 0 + and bin_height < len(height_bins) + ): + bin_points = np.array( + [ + (lat[mask][i], height[mask][i]) + for i in range(sum(mask)) + if lat_bin_indices[i] == bin_lat + and height_bin_indices[i] == bin_height + ] + ) if len(bin_points) > 2: hull = ConvexHull(bin_points) polygons.append(bin_points[hull.vertices]) return lat_bins, height_bins, valid_bins, polygons - - - diff --git 
a/aok/core/kd_utils/interpolation.py b/aok/core/kd_utils/interpolation.py index 59b04b4..a205f2d 100644 --- a/aok/core/kd_utils/interpolation.py +++ b/aok/core/kd_utils/interpolation.py @@ -2,18 +2,24 @@ import scipy.interpolate + def interpolate_labels(segment_lat, labels): model = scipy.interpolate.interp1d(segment_lat, labels, fill_value="extrapolate") return model + def apply_interpolation(model, lat_ph): return model(lat_ph) + def geoid_correction(lat_ph, segment_lat, geoid): model = interpolate_labels(segment_lat, geoid) return apply_interpolation(model, lat_ph) + def refraction_correction(lat_ph, segment_lat, ref_elev, ref_azimuth): elev_model = interpolate_labels(segment_lat, ref_elev) azimuth_model = interpolate_labels(segment_lat, ref_azimuth) - return apply_interpolation(elev_model, lat_ph), apply_interpolation(azimuth_model, lat_ph) + return apply_interpolation(elev_model, lat_ph), apply_interpolation( + azimuth_model, lat_ph + ) diff --git a/aok/core/kd_utils/kd_utils.py b/aok/core/kd_utils/kd_utils.py index 39c3687..5aa45ca 100644 --- a/aok/core/kd_utils/kd_utils.py +++ b/aok/core/kd_utils/kd_utils.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 ##### # This group of functions processes ICESat-2 data and creates a bathymetric model. @@ -14,40 +13,32 @@ # 8. Calculate bathymetric height (get_bath_height()) # 9. 
Produce figures (produce_figures()) ##### -import os -# os.environ["PROJ_LIB"] = r"C:\Users\wayne\anaconda3\Library\share\proj" +from datetime import datetime +# os.environ["PROJ_LIB"] = r"C:\Users\wayne\anaconda3\Library\share\proj" import io +import logging +import math import os import re import time -import math + +import geopandas as gpd import h5py -import logging +import hdbscan + +# from pyproj import Transformer +import matplotlib.pyplot as plt import netCDF4 import numpy as np -import geopandas as gpd - import pandas as pd -from datetime import datetime -import matplotlib.pyplot as plt -import hdbscan -from sklearn.preprocessing import StandardScaler, MinMaxScaler -import argparse -import subprocess # import fiona -import utm -import pyproj -from pyproj import Transformer, Proj - -# from pyproj import Transformer -from matplotlib.widgets import LassoSelector -from matplotlib.path import Path +from pyproj import Proj, Transformer +from sklearn.preprocessing import MinMaxScaler -from sklearn.cluster import DBSCAN -#this function from icesat2_toolkit +# this function from icesat2_toolkit # PURPOSE: read ICESat-2 ATL03 HDF5 data files def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): """ @@ -71,9 +62,9 @@ def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): """ # Open the HDF5 file for reading if isinstance(FILENAME, io.IOBase): - fileID = h5py.File(FILENAME, 'r') + fileID = h5py.File(FILENAME, "r") else: - fileID = h5py.File(os.path.expanduser(FILENAME), 'r') + fileID = h5py.File(os.path.expanduser(FILENAME), "r") # Output HDF5 file information logging.info(fileID.filename) @@ -85,12 +76,12 @@ def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): # read each input beam within the file IS2_atl03_beams = [] - for gtx in [k for k in fileID.keys() if bool(re.match(r'gt\d[lr]',k))]: + for gtx in [k for k in fileID.keys() if bool(re.match(r"gt\d[lr]", k))]: # check if subsetted beam contains data # check in both the geolocation and heights groups try: - 
fileID[gtx]['geolocation']['segment_id'] - fileID[gtx]['heights']['delta_time'] + fileID[gtx]["geolocation"]["segment_id"] + fileID[gtx]["heights"]["delta_time"] except KeyError: pass else: @@ -100,71 +91,71 @@ def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): for gtx in IS2_atl03_beams: # get each HDF5 variable IS2_atl03_mds[gtx] = {} - IS2_atl03_mds[gtx]['heights'] = {} - IS2_atl03_mds[gtx]['geolocation'] = {} - IS2_atl03_mds[gtx]['bckgrd_atlas'] = {} - IS2_atl03_mds[gtx]['geophys_corr'] = {} + IS2_atl03_mds[gtx]["heights"] = {} + IS2_atl03_mds[gtx]["geolocation"] = {} + IS2_atl03_mds[gtx]["bckgrd_atlas"] = {} + IS2_atl03_mds[gtx]["geophys_corr"] = {} # ICESat-2 Measurement Group - for key,val in fileID[gtx]['heights'].items(): - IS2_atl03_mds[gtx]['heights'][key] = val[:] + for key, val in fileID[gtx]["heights"].items(): + IS2_atl03_mds[gtx]["heights"][key] = val[:] # ICESat-2 Geolocation Group - for key,val in fileID[gtx]['geolocation'].items(): - IS2_atl03_mds[gtx]['geolocation'][key] = val[:] + for key, val in fileID[gtx]["geolocation"].items(): + IS2_atl03_mds[gtx]["geolocation"][key] = val[:] # ICESat-2 Background Photon Rate Group - for key,val in fileID[gtx]['bckgrd_atlas'].items(): - IS2_atl03_mds[gtx]['bckgrd_atlas'][key] = val[:] + for key, val in fileID[gtx]["bckgrd_atlas"].items(): + IS2_atl03_mds[gtx]["bckgrd_atlas"][key] = val[:] # ICESat-2 Geophysical Corrections Group: Values for tides (ocean, # solid earth, pole, load, and equilibrium), inverted barometer (IB) # effects, and range corrections for tropospheric delays - for key,val in fileID[gtx]['geophys_corr'].items(): - IS2_atl03_mds[gtx]['geophys_corr'][key] = val[:] + for key, val in fileID[gtx]["geophys_corr"].items(): + IS2_atl03_mds[gtx]["geophys_corr"][key] = val[:] # Getting attributes of included variables if ATTRIBUTES: # Getting attributes of IS2_atl03_mds beam variables IS2_atl03_attrs[gtx] = {} - IS2_atl03_attrs[gtx]['heights'] = {} - IS2_atl03_attrs[gtx]['geolocation'] = {} - 
IS2_atl03_attrs[gtx]['bckgrd_atlas'] = {} - IS2_atl03_attrs[gtx]['geophys_corr'] = {} + IS2_atl03_attrs[gtx]["heights"] = {} + IS2_atl03_attrs[gtx]["geolocation"] = {} + IS2_atl03_attrs[gtx]["bckgrd_atlas"] = {} + IS2_atl03_attrs[gtx]["geophys_corr"] = {} # Global Group Attributes - for att_name,att_val in fileID[gtx].attrs.items(): + for att_name, att_val in fileID[gtx].attrs.items(): IS2_atl03_attrs[gtx][att_name] = att_val # ICESat-2 Measurement Group - for key,val in fileID[gtx]['heights'].items(): - IS2_atl03_attrs[gtx]['heights'][key] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]['heights'][key][att_name]=att_val + for key, val in fileID[gtx]["heights"].items(): + IS2_atl03_attrs[gtx]["heights"][key] = {} + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]["heights"][key][att_name] = att_val # ICESat-2 Geolocation Group - for key,val in fileID[gtx]['geolocation'].items(): - IS2_atl03_attrs[gtx]['geolocation'][key] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]['geolocation'][key][att_name]=att_val + for key, val in fileID[gtx]["geolocation"].items(): + IS2_atl03_attrs[gtx]["geolocation"][key] = {} + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]["geolocation"][key][att_name] = att_val # ICESat-2 Background Photon Rate Group - for key,val in fileID[gtx]['bckgrd_atlas'].items(): - IS2_atl03_attrs[gtx]['bckgrd_atlas'][key] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]['bckgrd_atlas'][key][att_name]=att_val + for key, val in fileID[gtx]["bckgrd_atlas"].items(): + IS2_atl03_attrs[gtx]["bckgrd_atlas"][key] = {} + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]["bckgrd_atlas"][key][att_name] = att_val # ICESat-2 Geophysical Corrections Group - for key,val in fileID[gtx]['geophys_corr'].items(): - IS2_atl03_attrs[gtx]['geophys_corr'][key] = {} - for att_name,att_val in val.attrs.items(): - 
IS2_atl03_attrs[gtx]['geophys_corr'][key][att_name]=att_val + for key, val in fileID[gtx]["geophys_corr"].items(): + IS2_atl03_attrs[gtx]["geophys_corr"][key] = {} + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]["geophys_corr"][key][att_name] = att_val # ICESat-2 spacecraft orientation at time - IS2_atl03_mds['orbit_info'] = {} - IS2_atl03_attrs['orbit_info'] = {} - for key,val in fileID['orbit_info'].items(): - IS2_atl03_mds['orbit_info'][key] = val[:] + IS2_atl03_mds["orbit_info"] = {} + IS2_atl03_attrs["orbit_info"] = {} + for key, val in fileID["orbit_info"].items(): + IS2_atl03_mds["orbit_info"][key] = val[:] # Getting attributes of group and included variables if ATTRIBUTES: # Global Group Attributes - for att_name,att_val in fileID['orbit_info'].attrs.items(): - IS2_atl03_attrs['orbit_info'][att_name] = att_val + for att_name, att_val in fileID["orbit_info"].attrs.items(): + IS2_atl03_attrs["orbit_info"][att_name] = att_val # Variable Attributes - IS2_atl03_attrs['orbit_info'][key] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs['orbit_info'][key][att_name] = att_val + IS2_atl03_attrs["orbit_info"][key] = {} + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs["orbit_info"][key][att_name] = att_val # information ancillary to the data product # number of GPS seconds between the GPS epoch (1980-01-06T00:00:00Z UTC) @@ -172,112 +163,131 @@ def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): # Add this value to delta time parameters to compute full gps_seconds # could alternatively use the Julian day of the ATLAS SDP epoch: 2458119.5 # and add leap seconds since 2018-01-01T00:00:00Z UTC (ATLAS SDP epoch) - IS2_atl03_mds['ancillary_data'] = {} - IS2_atl03_attrs['ancillary_data'] = {} - ancillary_keys = ['atlas_sdp_gps_epoch','data_end_utc','data_start_utc', - 'end_cycle','end_geoseg','end_gpssow','end_gpsweek','end_orbit', - 'end_region','end_rgt','granule_end_utc','granule_start_utc','release', - 
'start_cycle','start_geoseg','start_gpssow','start_gpsweek', - 'start_orbit','start_region','start_rgt','version'] + IS2_atl03_mds["ancillary_data"] = {} + IS2_atl03_attrs["ancillary_data"] = {} + ancillary_keys = [ + "atlas_sdp_gps_epoch", + "data_end_utc", + "data_start_utc", + "end_cycle", + "end_geoseg", + "end_gpssow", + "end_gpsweek", + "end_orbit", + "end_region", + "end_rgt", + "granule_end_utc", + "granule_start_utc", + "release", + "start_cycle", + "start_geoseg", + "start_gpssow", + "start_gpsweek", + "start_orbit", + "start_region", + "start_rgt", + "version", + ] for key in ancillary_keys: # get each HDF5 variable - IS2_atl03_mds['ancillary_data'][key] = fileID['ancillary_data'][key][:] + IS2_atl03_mds["ancillary_data"][key] = fileID["ancillary_data"][key][:] # Getting attributes of group and included variables if ATTRIBUTES: # Variable Attributes - IS2_atl03_attrs['ancillary_data'][key] = {} - for att_name,att_val in fileID['ancillary_data'][key].attrs.items(): - IS2_atl03_attrs['ancillary_data'][key][att_name] = att_val + IS2_atl03_attrs["ancillary_data"][key] = {} + for att_name, att_val in fileID["ancillary_data"][key].attrs.items(): + IS2_atl03_attrs["ancillary_data"][key][att_name] = att_val # transmit-echo-path (tep) parameters - IS2_atl03_mds['ancillary_data']['tep'] = {} - IS2_atl03_attrs['ancillary_data']['tep'] = {} - for key,val in fileID['ancillary_data']['tep'].items(): + IS2_atl03_mds["ancillary_data"]["tep"] = {} + IS2_atl03_attrs["ancillary_data"]["tep"] = {} + for key, val in fileID["ancillary_data"]["tep"].items(): # get each HDF5 variable - IS2_atl03_mds['ancillary_data']['tep'][key] = val[:] + IS2_atl03_mds["ancillary_data"]["tep"][key] = val[:] # Getting attributes of group and included variables if ATTRIBUTES: # Variable Attributes - IS2_atl03_attrs['ancillary_data']['tep'][key] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs['ancillary_data']['tep'][key][att_name] = att_val + 
IS2_atl03_attrs["ancillary_data"]["tep"][key] = {} + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs["ancillary_data"]["tep"][key][att_name] = att_val # channel dead time and first photon bias derived from ATLAS calibration - cal1,cal2 = ('ancillary_data','calibrations') - for var in ['dead_time','first_photon_bias']: + cal1, cal2 = ("ancillary_data", "calibrations") + for var in ["dead_time", "first_photon_bias"]: IS2_atl03_mds[cal1][var] = {} IS2_atl03_attrs[cal1][var] = {} - for key,val in fileID[cal1][cal2][var].items(): + for key, val in fileID[cal1][cal2][var].items(): # get each HDF5 variable if isinstance(val, h5py.Dataset): IS2_atl03_mds[cal1][var][key] = val[:] elif isinstance(val, h5py.Group): IS2_atl03_mds[cal1][var][key] = {} - for k,v in val.items(): + for k, v in val.items(): IS2_atl03_mds[cal1][var][key][k] = v[:] # Getting attributes of group and included variables if ATTRIBUTES: # Variable Attributes IS2_atl03_attrs[cal1][var][key] = {} - for att_name,att_val in val.attrs.items(): + for att_name, att_val in val.attrs.items(): IS2_atl03_attrs[cal1][var][key][att_name] = att_val if isinstance(val, h5py.Group): - for k,v in val.items(): + for k, v in val.items(): IS2_atl03_attrs[cal1][var][key][k] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs[cal1][var][key][k][att_name]=att_val + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs[cal1][var][key][k][att_name] = att_val # get ATLAS impulse response variables for the transmitter echo path (TEP) - tep1,tep2 = ('atlas_impulse_response','tep_histogram') + tep1, tep2 = ("atlas_impulse_response", "tep_histogram") IS2_atl03_mds[tep1] = {} IS2_atl03_attrs[tep1] = {} - for pce in ['pce1_spot1','pce2_spot3']: - IS2_atl03_mds[tep1][pce] = {tep2:{}} - IS2_atl03_attrs[tep1][pce] = {tep2:{}} + for pce in ["pce1_spot1", "pce2_spot3"]: + IS2_atl03_mds[tep1][pce] = {tep2: {}} + IS2_atl03_attrs[tep1][pce] = {tep2: {}} # for each TEP variable - for key,val in 
fileID[tep1][pce][tep2].items(): + for key, val in fileID[tep1][pce][tep2].items(): IS2_atl03_mds[tep1][pce][tep2][key] = val[:] # Getting attributes of included variables if ATTRIBUTES: # Global Group Attributes - for att_name,att_val in fileID[tep1][pce][tep2].attrs.items(): + for att_name, att_val in fileID[tep1][pce][tep2].attrs.items(): IS2_atl03_attrs[tep1][pce][tep2][att_name] = att_val # Variable Attributes IS2_atl03_attrs[tep1][pce][tep2][key] = {} - for att_name,att_val in val.attrs.items(): + for att_name, att_val in val.attrs.items(): IS2_atl03_attrs[tep1][pce][tep2][key][att_name] = att_val # Global File Attributes if ATTRIBUTES: - for att_name,att_val in fileID.attrs.items(): + for att_name, att_val in fileID.attrs.items(): IS2_atl03_attrs[att_name] = att_val # Closing the HDF5 file fileID.close() # Return the datasets and variables - return (IS2_atl03_mds,IS2_atl03_attrs,IS2_atl03_beams) + return (IS2_atl03_mds, IS2_atl03_attrs, IS2_atl03_beams) + # convert_wgs_to_utm function, see https://stackoverflow.com/a/40140326/4556479 def convert_wgs_to_utm(lon: float, lat: float): """Based on lat and lng, return best utm epsg-code""" utm_band = str((math.floor((lon + 180) / 6) % 60) + 1) if len(utm_band) == 1: - utm_band = '0' + utm_band + utm_band = "0" + utm_band if lat >= 0: - epsg_code = 'epsg:326' + utm_band + epsg_code = "epsg:326" + utm_band return epsg_code - epsg_code = 'epsg:327' + utm_band + epsg_code = "epsg:327" + utm_band return epsg_code def orthometric_correction(lat, lon, Z, epsg): # Define the Proj string - #To transform from WGS84 ellipsoidal height + # To transform from WGS84 ellipsoidal height # to EGM2008 orthometric height using PyProj # proj_string = '+proj=latlong +ellps=WGS84 +datum=WGS84 +vunits=m +no_defs +geoidgrids=egm2008-1.gtx' # # Define the Proj string for WGS84 ellipsoidal height # wgs84_proj_string = '+proj=latlong +ellps=WGS84 +datum=WGS84 +no_defs' - + # # Define the Proj string for EGM2008 orthometric height: 
egm08_25,egm2008-1 # egm2008_proj_string = \ # '+proj=latlong +ellps=WGS84 +datum=WGS84 +no_defs ' \ @@ -333,8 +343,10 @@ def get_atl03_seg_id(atl03_ph_index_beg, atl03_segment_id, atl03_heights_len): # Iterate through ph_index_beg, from the first to second to last number # and set the photons between ph_index_beg i to ph_index_beg i + 1 to # segment id i - for i in range(0, len(atl03_ph_index_beg) - 1): - ph_segment_id[atl03_ph_index_beg[i]:atl03_ph_index_beg[i + 1]] = atl03_segment_id[i] + for i in range(len(atl03_ph_index_beg) - 1): + ph_segment_id[atl03_ph_index_beg[i] : atl03_ph_index_beg[i + 1]] = ( + atl03_segment_id[i] + ) # Return list of segment_id at the photon level return ph_segment_id @@ -388,34 +400,51 @@ def ref_linear_interp(x, y): def horizontal_vertical_bin_dataset(dataset, lat_res, vertical_res): """Bin data along vertical and horizontal scales for later segmentation""" - + # Filter values within the range (-50, 10), because photons elevation outside this range will be real noise valid_range = (-50, 10) - valid_mask = (dataset['photon_height'] > valid_range[0]) & (dataset['photon_height'] < valid_range[1]) + valid_mask = (dataset["photon_height"] > valid_range[0]) & ( + dataset["photon_height"] < valid_range[1] + ) # Apply the valid_mask to filter unwanted values filtered_dataset = dataset[valid_mask] # Calculate the number of height bins - height_range = abs(filtered_dataset['photon_height'].max() - filtered_dataset['photon_height'].min()) - height_bin_number = max(1, round(height_range / vertical_res)) # Ensure at least one bin + height_range = abs( + filtered_dataset["photon_height"].max() + - filtered_dataset["photon_height"].min() + ) + height_bin_number = max( + 1, round(height_range / vertical_res) + ) # Ensure at least one bin # Calculate the number of latitude bins - lat_range = abs(filtered_dataset['lat'].max() - filtered_dataset['lat'].min()) + lat_range = abs(filtered_dataset["lat"].max() - filtered_dataset["lat"].min()) 
lat_bin_number = max(1, round(lat_range / lat_res)) # Ensure at least one bin # Create bins for latitude - lat_bins = pd.cut(filtered_dataset['lat'], bins=lat_bin_number, labels=np.arange(lat_bin_number)) + lat_bins = pd.cut( + filtered_dataset["lat"], bins=lat_bin_number, labels=np.arange(lat_bin_number) + ) # Create bins for height - height_bins = pd.cut(filtered_dataset['photon_height'], bins=height_bin_number, - labels=np.round(np.linspace(filtered_dataset['photon_height'].min(), - filtered_dataset['photon_height'].max(), - num=height_bin_number), decimals=1)) + height_bins = pd.cut( + filtered_dataset["photon_height"], + bins=height_bin_number, + labels=np.round( + np.linspace( + filtered_dataset["photon_height"].min(), + filtered_dataset["photon_height"].max(), + num=height_bin_number, + ), + decimals=1, + ), + ) # Add bins to dataframe using .loc to avoid SettingWithCopyWarning - filtered_dataset.loc[:, 'lat_bins'] = lat_bins - filtered_dataset.loc[:, 'height_bins'] = height_bins + filtered_dataset.loc[:, "lat_bins"] = lat_bins + filtered_dataset.loc[:, "height_bins"] = height_bins filtered_dataset = filtered_dataset.reset_index(drop=True) return filtered_dataset @@ -423,22 +452,29 @@ def horizontal_vertical_bin_dataset(dataset, lat_res, vertical_res): # thinking about grid searching to detect bathymetric directly # rather than bin and then search -def horizontal_vertical_grid_density_cal(dataset, lat_res, vertical_res, density_threshold): +def horizontal_vertical_grid_density_cal( + dataset, lat_res, vertical_res, density_threshold +): """Bin data along vertical and horizontal scales - and calculate high-density points using a grid method""" + and calculate high-density points using a grid method""" # Calculate the number of bins required both vertically - lat_bin_number = round(abs(dataset['lat'].min() - dataset['lat'].max()) / lat_res) + lat_bin_number = round(abs(dataset["lat"].min() - dataset["lat"].max()) / lat_res) # and horizontally based on 
resolution size - height_bin_number = round(abs(dataset['photon_height'].min() - dataset['photon_height'].max()) / vertical_res) + height_bin_number = round( + abs(dataset["photon_height"].min() - dataset["photon_height"].max()) + / vertical_res + ) # Create the grid grid = np.zeros((lat_bin_number, height_bin_number)) # Iterate over the dataset and assign points to cells for _, row in dataset.iterrows(): - lat_index = int((row['lat'] - dataset['lat'].min()) / lat_res) - height_index = int((row['photon_height'] - dataset['photon_height'].min()) / vertical_res) + lat_index = int((row["lat"] - dataset["lat"].min()) / lat_res) + height_index = int( + (row["photon_height"] - dataset["photon_height"].min()) / vertical_res + ) grid[lat_index, height_index] += 1 # Identify high-density cells @@ -448,10 +484,14 @@ def horizontal_vertical_grid_density_cal(dataset, lat_res, vertical_res, density dataset_copy = dataset.copy() # Assign the cell indices as bins to the dataset - dataset_copy['lat_bins'] = pd.cut(dataset['lat'], bins=lat_bin_number, - labels=np.arange(lat_bin_number)) - dataset_copy['height_bins'] = pd.cut(dataset['photon_height'], bins=height_bin_number, - labels=np.arange(height_bin_number)) + dataset_copy["lat_bins"] = pd.cut( + dataset["lat"], bins=lat_bin_number, labels=np.arange(lat_bin_number) + ) + dataset_copy["height_bins"] = pd.cut( + dataset["photon_height"], + bins=height_bin_number, + labels=np.arange(height_bin_number), + ) # Reset the index of the copied dataset dataset_copy = dataset_copy.reset_index(drop=True) @@ -465,17 +505,21 @@ def horizontal_bin_dataset(dataset, lat_res): for later segmentation""" # Calculate the number of bins required horizontally based on resolution size - lat_bin_number = round(abs(dataset['lat_utm'].min() - dataset['lat_utm'].max()) / lat_res) + lat_bin_number = round( + abs(dataset["lat_utm"].min() - dataset["lat_utm"].max()) / lat_res + ) # Cut lat bins # lat_bins = pd.cut(dataset['lat'], bins=lat_bin_number, 
labels=np.array(range(lat_bin_number))) - lat_bins = pd.cut(dataset['lat_utm'], bins=lat_bin_number, labels=np.array(range(lat_bin_number))) + lat_bins = pd.cut( + dataset["lat_utm"], bins=lat_bin_number, labels=np.array(range(lat_bin_number)) + ) # Create a copy of the dataset dataset_copy = dataset.copy() # Add lat bins to the dataframe - dataset_copy['lat_bins'] = lat_bins + dataset_copy["lat_bins"] = lat_bins # Reset the index of the copied dataset dataset_copy = dataset_copy.reset_index(drop=True) @@ -492,33 +536,34 @@ def get_rm_sea_surface_bin(binned_dataset): flag = 1 # group dataset by lat bins - grouped_data = binned_dataset.groupby(['lat_bins'], group_keys=True) + grouped_data = binned_dataset.groupby(["lat_bins"], group_keys=True) data_groups = dict(list(grouped_data)) # Loop through groups and return average sea height for k, v in data_groups.items(): - - lat_bin_average = v['lat'].mean() + lat_bin_average = v["lat"].mean() # Create new dataframe based on occurrence of photons per height bin - new_df = pd.DataFrame(v.groupby('height_bins',observed=False).count()) + new_df = pd.DataFrame(v.groupby("height_bins", observed=False).count()) # Return the bin with the highest count - largest_h_bin = new_df['lat'].argmax() + largest_h_bin = new_df["lat"].argmax() # Select the index of the bin with the highest count largest_h_index = new_df.index[largest_h_bin] # get all values below this bin # Use boolean indexing to select only the values below the peak bin - new_photon_array_without_peak_bin = v.loc[v['height_bins'] < largest_h_index] + new_photon_array_without_peak_bin = v.loc[v["height_bins"] < largest_h_index] if flag == 1: photon_array_without_peak_bin = new_photon_array_without_peak_bin flag = 2 else: - photon_array_without_peak_bin = photon_array_without_peak_bin.append(new_photon_array_without_peak_bin) + photon_array_without_peak_bin = photon_array_without_peak_bin.append( + new_photon_array_without_peak_bin + ) del new_df @@ -527,36 +572,37 @@ def 
get_rm_sea_surface_bin(binned_dataset): def get_sea_surface_height(binned_data, threshold): """Calculate mean sea height for easier calculation of depth and cleaner figures""" - - #set flag for the df save + + # set flag for the df save firstTimeIndex = True - + # Create sea height list sea_surface_height = [] mean_lat_bins_seq = [] sea_surface_subsurface_photons_ratio = [] - # Group dataset by latitude bins - grouped_data = binned_data.groupby(['lat_bins'], group_keys=True, observed=False) + grouped_data = binned_data.groupby(["lat_bins"], group_keys=True, observed=False) data_groups = dict(list(grouped_data)) # Loop through groups and return average sea height for k, v in data_groups.items(): # based on lat_utm - lat_bin_average = v['lat'].mean() + lat_bin_average = v["lat"].mean() # Create new dataframe based on occurrence of photons per height bin - new_df = pd.DataFrame(v.groupby(['height_bins'], observed=False).count()) + new_df = pd.DataFrame(v.groupby(["height_bins"], observed=False).count()) # Return the bin with the highest count - largest_h_bin = new_df['lat'].argmax() + largest_h_bin = new_df["lat"].argmax() # Select the index of the bin with the highest count largest_h_index = new_df.index[largest_h_bin] # Calculate the median value of all photon height values within this bin - photons_sea_surface = v.loc[v['height_bins'] == largest_h_index, 'photon_height'] + photons_sea_surface = v.loc[ + v["height_bins"] == largest_h_index, "photon_height" + ] lat_bin_sea_median = photons_sea_surface.median() # Append to sea height list @@ -567,119 +613,131 @@ def get_sea_surface_height(binned_data, threshold): # Get all photons below sea surface # to determine segment type of each subsurface water column # Use calculated sea height to determine photons at 0.5m below peak - photons_sea_surface_up = \ - v.loc[(v['photon_height'] > (lat_bin_sea_median - threshold)) & - (v['photon_height'] < (lat_bin_sea_median + 2*threshold))] + photons_sea_surface_up = v.loc[ + 
(v["photon_height"] > (lat_bin_sea_median - threshold)) + & (v["photon_height"] < (lat_bin_sea_median + 2 * threshold)) + ] # Calculate the photon ratio between surface and whole photons - if v['photon_height'].shape[0] > 0: - new_photons_ratio_sea_surface = \ - photons_sea_surface_up.shape[0] / v['photon_height'].shape[0] + if v["photon_height"].shape[0] > 0: + new_photons_ratio_sea_surface = ( + photons_sea_surface_up.shape[0] / v["photon_height"].shape[0] + ) else: new_photons_ratio_sea_surface = np.nan - sea_surface_subsurface_photons_ratio.append((1 - new_photons_ratio_sea_surface)) + sea_surface_subsurface_photons_ratio.append(1 - new_photons_ratio_sea_surface) # Filter out sea height bin values outside 2 SD of mean. mean = np.nanmean(sea_surface_height, axis=0) sd = np.nanstd(sea_surface_height, axis=0) - final_sea_surface_height = np.where((sea_surface_height > (mean + 2 * sd)) | - (sea_surface_height < (mean - 2 * sd)), - np.nan, - sea_surface_height).tolist() - - sea_surface_height_abnormal_label = np.where(np.isnan(final_sea_surface_height), 0, 1) + final_sea_surface_height = np.where( + (sea_surface_height > (mean + 2 * sd)) | (sea_surface_height < (mean - 2 * sd)), + np.nan, + sea_surface_height, + ).tolist() + + sea_surface_height_abnormal_label = np.where( + np.isnan(final_sea_surface_height), 0, 1 + ) # Determine label based on ratio of sea surface photons and subsurface photons - sea_surface_dominated_label = \ - np.where(np.array(sea_surface_subsurface_photons_ratio) >= 0.2, 0, 1) - + sea_surface_dominated_label = np.where( + np.array(sea_surface_subsurface_photons_ratio) >= 0.2, 0, 1 + ) + # Loop through groups again and return photons below 0.5m of sea height PhotonDFBelowThresholdPeak = pd.DataFrame() for i, (k, v) in enumerate(data_groups.items()): # Get all values below this bin - NewPhotonDFBelowThresholdPeak = \ - v.loc[v['photon_height'] < (final_sea_surface_height[i] - threshold)] - + NewPhotonDFBelowThresholdPeak = v.loc[ + 
v["photon_height"] < (final_sea_surface_height[i] - threshold) + ] + if firstTimeIndex: PhotonDFBelowThresholdPeak = NewPhotonDFBelowThresholdPeak - firstTimeIndex=False - else: - PhotonDFBelowThresholdPeak=\ - pd.concat([PhotonDFBelowThresholdPeak, NewPhotonDFBelowThresholdPeak]) - - - return final_sea_surface_height, \ - sea_surface_height_abnormal_label, \ - sea_surface_dominated_label,\ - PhotonDFBelowThresholdPeak + firstTimeIndex = False + else: + PhotonDFBelowThresholdPeak = pd.concat( + [PhotonDFBelowThresholdPeak, NewPhotonDFBelowThresholdPeak] + ) + + return ( + final_sea_surface_height, + sea_surface_height_abnormal_label, + sea_surface_dominated_label, + PhotonDFBelowThresholdPeak, + ) # -#Arbitrary cutoff below the max value - 0.5 m below peak +# Arbitrary cutoff below the max value - 0.5 m below peak # (we may also use 1 m but we can add that later) (apply to raw photon data, each of 6 beams) -def get_photon_below_sea_surface(binned_data,threshold): - '''Calculate mean sea height for easier calculation of depth and cleaner figures''' - - #set flag for the df save - firstTimeIndex=1 - +def get_photon_below_sea_surface(binned_data, threshold): + """Calculate mean sea height for easier calculation of depth and cleaner figures""" + + # set flag for the df save + firstTimeIndex = 1 + # Create sea height list sea_surface_height = [] -# mean_lat_bins_seq=[] - - grouped_data = binned_data.groupby(['lat_bins'], group_keys=True, observed=False) + # mean_lat_bins_seq=[] + + grouped_data = binned_data.groupby(["lat_bins"], group_keys=True, observed=False) data_groups = dict(list(grouped_data)) - + # Loop through groups and return average sea height - for k,v in data_groups.items(): - - lat_bin_average=v['lat_utm'].mean() - + for k, v in data_groups.items(): + lat_bin_average = v["lat_utm"].mean() + # Create new dataframe based on occurance of photons per height bin - new_df = pd.DataFrame(v.groupby(['height_bins'], observed=False).count()) - + new_df = 
pd.DataFrame(v.groupby(["height_bins"], observed=False).count()) + # Return the bin with the highest count - largest_h_bin = new_df['lat_utm'].argmax() - + largest_h_bin = new_df["lat_utm"].argmax() + # Select the index of the bin with the highest count largest_h = new_df.index[largest_h_bin] - + # Calculate the median value of all values within this bin - lat_bin_sea_median = v.loc[v['height_bins']==largest_h, 'photon_height'].median() - + lat_bin_sea_median = v.loc[ + v["height_bins"] == largest_h, "photon_height" + ].median() + # Append to sea height list sea_surface_height.append(lat_bin_sea_median) -# mean_lat_bins_seq.append(lat_bin_average) + # mean_lat_bins_seq.append(lat_bin_average) del new_df - + # Filter out sea height bin values outside 2 SD of mean. mean = np.nanmean(sea_surface_height, axis=0) sd = np.nanstd(sea_surface_height, axis=0) - Final_sea_height = np.where((sea_surface_height > (mean + 2*sd)) | (sea_surface_height < (mean - 2*sd)), np.nan, - sea_surface_height).tolist() - Abnormal_sea_height_label=np.where(np.isnan(Final_sea_height), 1, 0) - + Final_sea_height = np.where( + (sea_surface_height > (mean + 2 * sd)) | (sea_surface_height < (mean - 2 * sd)), + np.nan, + sea_surface_height, + ).tolist() + Abnormal_sea_height_label = np.where(np.isnan(Final_sea_height), 1, 0) # Loop through groups again and return photons below 0.5m of sea height - for k,v in data_groups.items(): - + for k, v in data_groups.items(): # get all values below this bin # Use calculated sea height to determine photons at 0.5m below peak - NewPhotonArrayBelowThresholdPeak= \ - v.loc[v['photon_height']<(sea_surface_height[k]-threshold)] - - if firstTimeIndex ==1: - PhotonArrayBelowThresholdPeak=NewPhotonArrayBelowThresholdPeak - firstTimeIndex=2 - - else: - PhotonArrayBelowThresholdPeak=PhotonArrayBelowThresholdPeak.append(NewPhotonArrayBelowThresholdPeak) - + NewPhotonArrayBelowThresholdPeak = v.loc[ + v["photon_height"] < (sea_surface_height[k] - threshold) + ] - return 
PhotonArrayBelowThresholdPeak + if firstTimeIndex == 1: + PhotonArrayBelowThresholdPeak = NewPhotonArrayBelowThresholdPeak + firstTimeIndex = 2 + else: + PhotonArrayBelowThresholdPeak = PhotonArrayBelowThresholdPeak.append( + NewPhotonArrayBelowThresholdPeak + ) + + return PhotonArrayBelowThresholdPeak # Function to get elevation for multiple points @@ -689,12 +747,15 @@ def get_seafloor_bathy_GEBCO_batch(lons, lats, raster, raster_data): rows, cols = np.array(rows), np.array(cols) # Ensure the indices are within bounds - valid_mask = (rows >= 0) & (rows < raster.height) & (cols >= 0) & (cols < raster.width) + valid_mask = ( + (rows >= 0) & (rows < raster.height) & (cols >= 0) & (cols < raster.width) + ) elevations = np.full(lons.shape, np.nan) elevations[valid_mask] = raster_data[rows[valid_mask], cols[valid_mask]] return elevations + def get_water_temp(date_year, date_month, date_day, latitude, longitude): """ Pull down surface water temperature along the track from the JPL GHRSST opendap website. 
@@ -712,7 +773,7 @@ def get_water_temp(date_year, date_month, date_day, latitude, longitude): month = date_month # date[6:8] day = date_day - day_of_year = str(datetime.strptime(date, '%Y%m%d').timetuple().tm_yday) + day_of_year = str(datetime.strptime(date, "%Y%m%d").timetuple().tm_yday) # Add zero in front of day of year string zero_day_of_year = day_of_year.zfill(3) @@ -723,8 +784,11 @@ def get_water_temp(date_year, date_month, date_day, latitude, longitude): new_lat_min = 0 new_lat_max = 17998 - new_lat = round(((old_lat - old_lat_min) / (old_lat_max - old_lat_min)) * - (new_lat_max - new_lat_min) + new_lat_min) + new_lat = round( + ((old_lat - old_lat_min) / (old_lat_max - old_lat_min)) + * (new_lat_max - new_lat_min) + + new_lat_min + ) # Calculate ratio of longitude from mid-point of IS2 track old_lon = longitude.mean() @@ -733,24 +797,41 @@ def get_water_temp(date_year, date_month, date_day, latitude, longitude): new_lon_min = 0 new_lon_max = 35999 - new_lon = round(((old_lon - old_lon_min) / (old_lon_max - old_lon_min)) * - (new_lon_max - new_lon_min) + new_lon_min) + new_lon = round( + ((old_lon - old_lon_min) / (old_lon_max - old_lon_min)) + * (new_lon_max - new_lon_min) + + new_lon_min + ) # Access the SST data using the JPL OpenDap interface - url = 'https://opendap.jpl.nasa.gov/opendap/OceanTemperature/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/' \ - + str(year) + '/' + str(zero_day_of_year) + '/' + str(date) \ - + '090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc' + url = ( + "https://opendap.jpl.nasa.gov/opendap/OceanTemperature/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/" + + str(year) + + "/" + + str(zero_day_of_year) + + "/" + + str(date) + + "090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc" + ) dataset = netCDF4.Dataset(url) # Access the data and convert the temperature from K to C - water_temp = dataset['analysed_sst'][0, new_lat, new_lon] - 273.15 + water_temp = dataset["analysed_sst"][0, new_lat, new_lon] - 273.15 return water_temp -def 
refraction_correction(WTemp, WSmodel, Wavelength, - Photon_ref_elev, Ph_ref_azimuth, - PhotonZ, PhotonX, PhotonY, Ph_Conf): +def refraction_correction( + WTemp, + WSmodel, + Wavelength, + Photon_ref_elev, + Ph_ref_azimuth, + PhotonZ, + PhotonX, + PhotonY, + Ph_Conf, +): """ WTemp; there is python library that pulls water temp data WSmodel is the value surface height @@ -780,18 +861,18 @@ def refraction_correction(WTemp, WSmodel, Wavelength, n1 = 1.00029 # refractive index of water - n2 = (a * WaterTemp ** 2) + (b * wl ** 2) + (c * WaterTemp) + (d * wl) + e + n2 = (a * WaterTemp**2) + (b * wl**2) + (c * WaterTemp) + (d * wl) + e # assumption is 0.25416 # This example is refractionCoef = 0.25449 # 1.00029 is refraction of air constant - correction_coef = (1 - (n1 / n2)) + correction_coef = 1 - (n1 / n2) # read photon ref_elev to get theta1 theta1 = np.pi / 2 - Photon_ref_elev # eq 1. Theta2 - theta2 = np.arcsin(((n1 * np.sin(theta1)) / n2)) + theta2 = np.arcsin((n1 * np.sin(theta1)) / n2) # eq 3. 
S # Approximate water Surface = 1.5 @@ -810,7 +891,7 @@ def refraction_correction(WTemp, WSmodel, Wavelength, phi = theta1 - theta2 # P is the difference between raw and corrected YZ location - P = np.sqrt(R ** 2 + S ** 2 - 2 * R * S * np.cos(phi)) + P = np.sqrt(R**2 + S**2 - 2 * R * S * np.cos(phi)) # alpha is an angle needed alpha = np.arcsin((R * np.sin(phi)) / P) @@ -834,25 +915,36 @@ def refraction_correction(WTemp, WSmodel, Wavelength, outY = PhotonY + DN outZ = PhotonZ + DZ - ''' + """ print('For selected Bathy photon:') print('lat = ', PhotonY[9000]) print('long = ', PhotonX[9000]) print('Raw Depth = ', PhotonZ[9000]) print('D = ', D[9000]) - + print('ref_elev = ', Photon_ref_elev[9000]) - + print('Delta East = ', DE[9000]) print('Delta North = ', DN[9000]) print('Delta Z = ', DZ[9000]) - ''' - return (outX, outY, outZ, Ph_Conf, PhotonX, PhotonY, PhotonZ, Ph_ref_azimuth, - Photon_ref_elev) # We are most interested in out-x, out-y, out-z - - -def get_bath_height_percentile_thresh(binned_data, percentile_thresh, sea_surface_height, vertical_res): - """ Detect bathymetric level per bin based on percentile_thresh """ + """ + return ( + outX, + outY, + outZ, + Ph_Conf, + PhotonX, + PhotonY, + PhotonZ, + Ph_ref_azimuth, + Photon_ref_elev, + ) # We are most interested in out-x, out-y, out-z + + +def get_bath_height_percentile_thresh( + binned_data, percentile_thresh, sea_surface_height, vertical_res +): + """Detect bathymetric level per bin based on percentile_thresh""" # Create sea height list bath_height = [] @@ -862,34 +954,42 @@ def get_bath_height_percentile_thresh(binned_data, percentile_thresh, sea_surfac # Group data by latitude # Filter out surface data that are two bins below median surface value calculated above - binned_data_bath = binned_data[(binned_data['photon_height'] < - sea_surface_height - (vertical_res * 2))] - grouped_data = binned_data_bath.groupby(['lat_bins'], group_keys=True) + binned_data_bath = binned_data[ + 
(binned_data["photon_height"] < sea_surface_height - (vertical_res * 2)) + ] + grouped_data = binned_data_bath.groupby(["lat_bins"], group_keys=True) data_groups = dict(list(grouped_data)) # Create a percentile threshold of photon counts in each grid, # grouped by both x and y axes. count_threshold = np.percentile( - binned_data.groupby(['lat_bins', 'height_bins']).size().reset_index().groupby('lat_bins')[[0]].max(), - percentile_thresh) + binned_data.groupby(["lat_bins", "height_bins"]) + .size() + .reset_index() + .groupby("lat_bins")[[0]] + .max(), + percentile_thresh, + ) # Loop through groups and return average bathy height for k, v in data_groups.items(): - new_df = pd.DataFrame(v.groupby('height_bins').count()) - bath_bin = new_df['lat'].argmax() + new_df = pd.DataFrame(v.groupby("height_bins").count()) + bath_bin = new_df["lat"].argmax() bath_bin_h = new_df.index[bath_bin] # Set threshold of photon counts per bin # here this script determines whether there is bathymetry signals by # the photon counts per bin below sea surface height - if new_df.iloc[bath_bin]['lat'] >= count_threshold: - - geo_photon_height.append(v.loc[v['height_bins'] == - bath_bin_h, 'cor_photon_height'].values) - geo_longitude.append(v.loc[v['height_bins'] == bath_bin_h, 'lon'].values) - geo_latitude.append(v.loc[v['height_bins'] == bath_bin_h, 'lat'].values) - - bath_bin_median = v.loc[v['height_bins'] == bath_bin_h, 'cor_photon_height'].median() + if new_df.iloc[bath_bin]["lat"] >= count_threshold: + geo_photon_height.append( + v.loc[v["height_bins"] == bath_bin_h, "cor_photon_height"].values + ) + geo_longitude.append(v.loc[v["height_bins"] == bath_bin_h, "lon"].values) + geo_latitude.append(v.loc[v["height_bins"] == bath_bin_h, "lat"].values) + + bath_bin_median = v.loc[ + v["height_bins"] == bath_bin_h, "cor_photon_height" + ].median() bath_height.append(bath_bin_median) del new_df @@ -902,17 +1002,23 @@ def get_bath_height_percentile_thresh(binned_data, percentile_thresh, 
sea_surfac geo_photon_list = np.concatenate(geo_photon_height).ravel().tolist() geo_depth = sea_surface_height - geo_photon_list geo_df = pd.DataFrame( - {'lon': geo_longitude_list, 'lat': geo_latitude_list, - 'photon_height': geo_photon_list, - 'depth': geo_depth}) + { + "lon": geo_longitude_list, + "lat": geo_latitude_list, + "photon_height": geo_photon_list, + "depth": geo_depth, + } + ) del geo_longitude_list, geo_latitude_list, geo_photon_list return bath_height, geo_df -def get_bath_height_HDBSCAN(lat_binned_data, percentile_thresh, sea_surface_height, vertical_res): - """ Calculate bathymetric level per lat bin based on horizontal resolution """ +def get_bath_height_HDBSCAN( + lat_binned_data, percentile_thresh, sea_surface_height, vertical_res +): + """Calculate bathymetric level per lat bin based on horizontal resolution""" # Create sea height list bath_height = [] geo_photon_height = [] @@ -922,15 +1028,15 @@ def get_bath_height_HDBSCAN(lat_binned_data, percentile_thresh, sea_surface_heig # Group data by latitude # Filter out surface data that are two bins (2 times height resolution) # below median sea surface value calculated above - binned_data_bath = lat_binned_data[(lat_binned_data['photon_height'] < - sea_surface_height - (vertical_res * 2))] + binned_data_bath = lat_binned_data[ + (lat_binned_data["photon_height"] < sea_surface_height - (vertical_res * 2)) + ] - grouped_data = binned_data_bath.groupby(['lat_bins'], group_keys=True) + grouped_data = binned_data_bath.groupby(["lat_bins"], group_keys=True) data_groups = dict(list(grouped_data)) # Loop through groups and return average bathymetric height for k, v in data_groups.items(): - # assign each group of dataset to a new dataframe new_df = pd.DataFrame(v) @@ -944,7 +1050,9 @@ def get_bath_height_HDBSCAN(lat_binned_data, percentile_thresh, sea_surface_heig del new_df continue - lat_height_pairs = list(zip(new_df['lat_utm'], new_df['cor_photon_height'])) + lat_height_pairs = list( + 
zip(new_df["lat_utm"], new_df["cor_photon_height"], strict=False) + ) # Convert to a numpy array for sklearn lat_height_pairs_array = np.array(lat_height_pairs) @@ -968,9 +1076,13 @@ def get_bath_height_HDBSCAN(lat_binned_data, percentile_thresh, sea_surface_heig continue # cluster = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, leaf_size=20) - cluster = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, - min_samples=3, cluster_selection_epsilon=20, - leaf_size=25, core_dist_n_jobs=-1) + cluster = hdbscan.HDBSCAN( + min_cluster_size=min_cluster_size, + min_samples=3, + cluster_selection_epsilon=20, + leaf_size=25, + core_dist_n_jobs=-1, + ) cluster.fit(data_scaled) # Get the labels assigned to each point by the HDBSCAN model @@ -980,7 +1092,7 @@ def get_bath_height_HDBSCAN(lat_binned_data, percentile_thresh, sea_surface_heig unique, counts = np.unique(labels, return_counts=True) # Create a dictionary that maps each cluster label to the count of points assigned to it - clusters_dict = dict(zip(unique, counts)) + clusters_dict = dict(zip(unique, counts, strict=False)) # Print the dictionary to see the size of each cluster print(clusters_dict) @@ -994,20 +1106,20 @@ def get_bath_height_HDBSCAN(lat_binned_data, percentile_thresh, sea_surface_heig else: # Add labels to the DataFrame new_df_copy = new_df.copy() - new_df_copy['cluster'] = labels + new_df_copy["cluster"] = labels # Subset the DataFrame to get only the data points in the largest cluster - bath_cluster_data = new_df[new_df_copy['cluster'] == max_cluster_label] + bath_cluster_data = new_df[new_df_copy["cluster"] == max_cluster_label] # calculate the median height value as the average bathymetric height - bath_bin_median = bath_cluster_data['cor_photon_height'].median() + bath_bin_median = bath_cluster_data["cor_photon_height"].median() bath_height.append(bath_bin_median) # Extract the longitude, latitude, and photon height for each lat bin # and add it to respective lists - 
geo_photon_height.append(bath_cluster_data['cor_photon_height'].values) - geo_longitude.append(bath_cluster_data['lon_utm'].values) - geo_latitude.append(bath_cluster_data['lat_utm'].values) + geo_photon_height.append(bath_cluster_data["cor_photon_height"].values) + geo_longitude.append(bath_cluster_data["lon_utm"].values) + geo_latitude.append(bath_cluster_data["lat_utm"].values) del new_df @@ -1021,12 +1133,17 @@ def get_bath_height_HDBSCAN(lat_binned_data, percentile_thresh, sea_surface_heig # Create a DataFrame geo_df = pd.DataFrame( - {'lon': geo_longitude_list, 'lat': geo_latitude_list, - 'photon_height': geo_photon_list, - 'depth': geo_depth.tolist()}) + { + "lon": geo_longitude_list, + "lat": geo_latitude_list, + "photon_height": geo_photon_list, + "depth": geo_depth.tolist(), + } + ) return bath_height, geo_df + # requires that the input gdf has ranged index values i # will need to change if index is changed to time or something # this currently checks point in polygon for EVERY point @@ -1035,16 +1152,15 @@ def get_bath_height_HDBSCAN(lat_binned_data, percentile_thresh, sea_surface_heig def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): # try loading the shoreline data try: - ICESat2_GDF.insert(0, 'lat', ICESat2_GDF.geometry.y, False) - ICESat2_GDF.insert(0, 'lon', ICESat2_GDF.geometry.x, False) + ICESat2_GDF.insert(0, "lat", ICESat2_GDF.geometry.y, False) + ICESat2_GDF.insert(0, "lon", ICESat2_GDF.geometry.x, False) # allocation of to be used arrays zero_int_array = np.int64(np.zeros_like(ICESat2_GDF.geometry.x)) # Land flag initialized as -1 # If shorelines downloaded already, will be set to 0 or 1 - ICESat2_GDF.insert(0, 'is_land', - zero_int_array - 1, False) + ICESat2_GDF.insert(0, "is_land", zero_int_array - 1, False) # set the projection ICESat2_GDF.set_crs("EPSG:4326", inplace=True) @@ -1055,13 +1171,15 @@ def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): # engine str, 'fiona' or 'pyogrio' # somtime it gives error if 
using fiona # land_polygon_gdf = gpd.read_file(shoreline_data_path, bbox=ICESat2_GDF, engine='fiona') - land_polygon_gdf = gpd.read_file(shoreline_data_path, bbox=ICESat2_GDF, engine='pyogrio') + land_polygon_gdf = gpd.read_file( + shoreline_data_path, bbox=ICESat2_GDF, engine="pyogrio" + ) # continue with getting a new array of 0-or-1 labels for each photon land_point_labels = np.zeros_like(ICESat2_GDF.is_land.values) # update labels for points in the land polygons - pts_in_land = gpd.sjoin(ICESat2_GDF, land_polygon_gdf, predicate='within') + pts_in_land = gpd.sjoin(ICESat2_GDF, land_polygon_gdf, predicate="within") # get land or not bool value land_loc = ICESat2_GDF.index.isin(pts_in_land.index) @@ -1073,7 +1191,6 @@ def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): return land_point_labels except Exception as e: - print(e) print("Error loading shoreline data, returning -1s for is_land flag") @@ -1084,29 +1201,44 @@ def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): return -np.ones_like(ICESat2_GDF.is_land.values) -# -def produce_figures(binned_data, bath_height, sea_height, solo_sea_surface_label, - y_limit_top, y_limit_bottom, percentile, file, geo_df, - ref_y, ref_z, beam, epsg_num): +def produce_figures( + binned_data, + bath_height, + sea_height, + solo_sea_surface_label, + y_limit_top, + y_limit_bottom, + percentile, + file, + geo_df, + ref_y, + ref_z, + beam, + epsg_num, +): """Create figures""" # Create bins for latitude - bath_x_axis_bins = np.linspace(binned_data.lat.min(), - binned_data.lat.max(), len(bath_height))+20 + bath_x_axis_bins = ( + np.linspace(binned_data.lat.min(), binned_data.lat.max(), len(bath_height)) + 20 + ) - sea_surface_x_axis_bins = np.linspace(binned_data.lat.min(), - binned_data.lat.max(), len(sea_height))+10 + sea_surface_x_axis_bins = ( + np.linspace(binned_data.lat.min(), binned_data.lat.max(), len(sea_height)) + 10 + ) # Create new dataframes for median values - bath_median_df = pd.DataFrame({'x': 
bath_x_axis_bins, 'y': bath_height}) + bath_median_df = pd.DataFrame({"x": bath_x_axis_bins, "y": bath_height}) # Create uniform sea surface based on median sea surface values and filter out surface breaching sea_height1 = [np.nanmedian(sea_height) if i == i else np.nan for i in sea_height] - sea_median_df = pd.DataFrame({'x': sea_surface_x_axis_bins, 'y': sea_height1}) + sea_median_df = pd.DataFrame({"x": sea_surface_x_axis_bins, "y": sea_height1}) # Create uniform solo sea surface label sea_surface_label = solo_sea_surface_label - sea_surface_label_df = pd.DataFrame({'x': sea_surface_x_axis_bins, 'y': sea_surface_label}) + sea_surface_label_df = pd.DataFrame( + {"x": sea_surface_x_axis_bins, "y": sea_surface_label} + ) idx_1 = np.where(sea_surface_label_df.y == 1) idx_0 = np.where(sea_surface_label_df.y == 0) @@ -1117,62 +1249,107 @@ def produce_figures(binned_data, bath_height, sea_height, solo_sea_surface_label # plt.scatter(x=binned_data.lat, # y = binned_data.photon_height, marker='o', lw=0, s=1, alpha = 0.8, # c = 'yellow', label = 'Raw photon height') - plt.scatter(ref_y, ref_z, s=0.5, alpha=0.1, c='black') - plt.scatter(geo_df.lat, geo_df.photon_height, s=0.8, marker = 'o', - alpha=0.1, c='red', label='Classified Photons') + plt.scatter(ref_y, ref_z, s=0.5, alpha=0.1, c="black") + plt.scatter( + geo_df.lat, + geo_df.photon_height, + s=0.8, + marker="o", + alpha=0.1, + c="red", + label="Classified Photons", + ) # plt.scatter(x=geo_df.lat, # y = geo_df.photon_height, marker='o', lw=0, s=0.8, # alpha = 0.8, c = 'black', label = 'Corrected photon bin') # Plot median values - plt.scatter(bath_median_df.x, bath_median_df.y, - marker='o', c='r', alpha=0.8, s=2, label='Median bathymetry') - - plt.scatter(sea_median_df.x, sea_median_df.y, - marker='o', c='b', alpha=1, s=2, label='Median sea surface') - - plt.scatter(sea_surface_label_df.iloc[idx_1].x, sea_surface_label_df.iloc[idx_1].y, - marker='o', c='pink', alpha=1, s=3, label='solo_sea_surface') - 
plt.scatter(sea_surface_label_df.iloc[idx_0].x, sea_surface_label_df.iloc[idx_0].y, - marker='o', c='g', alpha=1, s=3, label='non_solo_sea_surface') + plt.scatter( + bath_median_df.x, + bath_median_df.y, + marker="o", + c="r", + alpha=0.8, + s=2, + label="Median bathymetry", + ) + + plt.scatter( + sea_median_df.x, + sea_median_df.y, + marker="o", + c="b", + alpha=1, + s=2, + label="Median sea surface", + ) + + plt.scatter( + sea_surface_label_df.iloc[idx_1].x, + sea_surface_label_df.iloc[idx_1].y, + marker="o", + c="pink", + alpha=1, + s=3, + label="solo_sea_surface", + ) + plt.scatter( + sea_surface_label_df.iloc[idx_0].x, + sea_surface_label_df.iloc[idx_0].y, + marker="o", + c="g", + alpha=1, + s=3, + label="non_solo_sea_surface", + ) # Insert titles and subtitles - plt.title('Icesat2 Bathymetry\n' + file) - plt.xlabel('Latitude', fontsize=25) - plt.ylabel('Photon Height (m)', fontsize=25) + plt.title("Icesat2 Bathymetry\n" + file) + plt.xlabel("Latitude", fontsize=25) + plt.ylabel("Photon Height (m)", fontsize=25) plt.xticks(fontsize=16) plt.yticks(fontsize=16) - plt.legend(loc="upper left", prop={'size': 20}) + plt.legend(loc="upper left", prop={"size": 20}) # Limit the x and y axes using parameters plt.xlim(left=binned_data.lat.min(), right=binned_data.lat.max()) plt.ylim(top=y_limit_top, bottom=y_limit_bottom) timestr = time.strftime("%Y%m%d_%H%M%S") - file = file.replace('.h5', '') + file = file.replace(".h5", "") # Define where to save file plt.tight_layout() - plt.savefig("C:/Workstation/ICESat2_HLS/" + file + '_gt' + - str(beam) + '_' + str(percentile) + - '_EPSG' + str(epsg_num) + '_' + timestr + ".pdf") + plt.savefig( + "C:/Workstation/ICESat2_HLS/" + + file + + "_gt" + + str(beam) + + "_" + + str(percentile) + + "_EPSG" + + str(epsg_num) + + "_" + + timestr + + ".pdf" + ) # plt.show() # plt.close() # convert corrected locations back to wgs84 (useful to contain) - transformer = Transformer.from_crs("EPSG:" + str(epsg_num), - "EPSG:4326", always_xy=True) 
+ transformer = Transformer.from_crs( + "EPSG:" + str(epsg_num), "EPSG:4326", always_xy=True + ) print(transformer) - lon_wgs84, lat_wgs84 = transformer.transform( - geo_df.lon.values, geo_df.lat.values) + lon_wgs84, lat_wgs84 = transformer.transform(geo_df.lon.values, geo_df.lat.values) - geo_df['lon_wgs84'] = lon_wgs84 - geo_df['lat_wgs84'] = lat_wgs84 + geo_df["lon_wgs84"] = lon_wgs84 + geo_df["lat_wgs84"] = lat_wgs84 - geodf = gpd.GeoDataFrame(geo_df, - geometry=gpd.points_from_xy(geo_df.lon_wgs84, - geo_df.lat_wgs84)) + geodf = gpd.GeoDataFrame( + geo_df, geometry=gpd.points_from_xy(geo_df.lon_wgs84, geo_df.lat_wgs84) + ) geodf.set_crs(epsg=4326, inplace=True) diff --git a/aok/core/kd_utils/main.py b/aok/core/kd_utils/main.py index e38729f..d5e9a7f 100644 --- a/aok/core/kd_utils/main.py +++ b/aok/core/kd_utils/main.py @@ -4,10 +4,9 @@ import re import sys +from config import get_args import pandas as pd -from config import get_args -from kd_utils.Kd_analysis import process_kd_calculation from kd_utils.bathy_processing import process_subsurface_photon_filtering from kd_utils.data_processing import ( Extract_sea_photons, @@ -16,33 +15,36 @@ filter_photon_dataset_by_hull_area, load_data, ) +from kd_utils.Kd_analysis import process_kd_calculation from kd_utils.sea_photons_analysis import process_sea_photon_binning from kd_utils.visualization import plot_convex_hulls, plot_kd_photons - -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) logger = logging.getLogger(__name__) PAIR_ID_MAP = { - 'gt1l': 'gt1_pair', - 'gt1r': 'gt1_pair', - 'gt2l': 'gt2_pair', - 'gt2r': 'gt2_pair', - 'gt3l': 'gt3_pair', - 'gt3r': 'gt3_pair', + "gt1l": "gt1_pair", + "gt1r": "gt1_pair", + "gt2l": "gt2_pair", + "gt2r": "gt2_pair", + "gt3l": "gt3_pair", + "gt3r": "gt3_pair", } def get_target_beams(all_beams, beam_attrs, target_beams_arg): strong_beams = [ 
- gtx for gtx in all_beams - if beam_attrs[gtx]['atlas_beam_type'].decode('utf-8') == 'strong' + gtx + for gtx in all_beams + if beam_attrs[gtx]["atlas_beam_type"].decode("utf-8") == "strong" ] if not target_beams_arg: return strong_beams - requested = [b.strip() for b in target_beams_arg.split(',') if b.strip()] + requested = [b.strip() for b in target_beams_arg.split(",") if b.strip()] return [b for b in requested if b in strong_beams] @@ -55,22 +57,30 @@ def optionally_combine_paired_beams_for_kd(dataset, horizontal_res, enabled=Fals return dataset combined = dataset.copy() - combined['source_beam_id'] = combined['beam_id'] - combined['beam_id'] = combined['beam_id'].map(PAIR_ID_MAP).fillna(combined['beam_id']) + combined["source_beam_id"] = combined["beam_id"] + combined["beam_id"] = ( + combined["beam_id"].map(PAIR_ID_MAP).fillna(combined["beam_id"]) + ) - if 'relative_AT_dist' in combined.columns: - rel_m = pd.to_numeric(combined['relative_AT_dist'], errors='coerce') * 1000.0 - pair_bins = (rel_m / max(horizontal_res, 1)).astype('Int64') + if "relative_AT_dist" in combined.columns: + rel_m = pd.to_numeric(combined["relative_AT_dist"], errors="coerce") * 1000.0 + pair_bins = (rel_m / max(horizontal_res, 1)).astype("Int64") pair_bins = pair_bins.fillna(-1).astype(int) - combined['lat_bins'] = pair_bins + combined["lat_bins"] = pair_bins return combined def run_pipeline(args): - atl03_h5_file_path = os.path.join(args.workspace_path, args.atl03_path, args.atl03_file) - shoreline_data_path = os.path.join(args.workspace_path, args.other_data_path, args.shoreline_data) - gebco_full_path = os.path.join(args.workspace_path, args.other_data_path, args.gebco_path) + atl03_h5_file_path = os.path.join( + args.workspace_path, args.atl03_path, args.atl03_file + ) + shoreline_data_path = os.path.join( + args.workspace_path, args.other_data_path, args.shoreline_data + ) + gebco_full_path = os.path.join( + args.workspace_path, args.other_data_path, args.gebco_path + ) 
atl24_file_path = args.atl24_file if atl24_file_path and (not os.path.isabs(atl24_file_path)): atl24_file_path = os.path.join(args.workspace_path, atl24_file_path) @@ -86,12 +96,16 @@ def run_pipeline(args): is2_mds, is2_attrs, is2_beams = load_data(atl03_h5_file_path, False) target_strong_beams = get_target_beams(is2_beams, is2_attrs, args.target_beams) if not target_strong_beams: - logger.error("No target strong beams found. Check input file and --target_beams.") + logger.error( + "No target strong beams found. Check input file and --target_beams." + ) sys.exit(1) logger.info("Strong beams selected: %s", target_strong_beams) plot_target_beam = [target_strong_beams[0]] - sea_photon_dataset = Extract_sea_photons(is2_mds, target_strong_beams, shoreline_data_path) + sea_photon_dataset = Extract_sea_photons( + is2_mds, target_strong_beams, shoreline_data_path + ) sea_photon_dataset = apply_optional_ir_ap_filter( sea_photon_dataset, enabled=args.enable_ir_ap_filter, @@ -107,7 +121,9 @@ def run_pipeline(args): ) binned_dataset_sea_surface = process_sea_photon_binning( - sea_photon_dataset, horizontal_res=args.horizontal_res, vertical_res=args.vertical_res + sea_photon_dataset, + horizontal_res=args.horizontal_res, + vertical_res=args.vertical_res, ) post_refraction_refit_enabled = args.enable_post_refraction_refit @@ -118,48 +134,56 @@ def run_pipeline(args): ) post_refraction_refit_enabled = False - sea_surface_height, sea_surface_label, filtered_seafloor_subsurface_photon_dataset = \ - process_subsurface_photon_filtering( - binned_dataset_sea_surface, - gebco_file_path_lists, - args.subsurface_thresh, - args.ignore_subsurface_height_thres, - use_atl24_filter=args.enable_atl24_filter, - atl24_file_path=atl24_file_path, - atl24_max_match_distance_deg=args.atl24_max_match_distance_deg, - use_gebco_filter=args.enable_gebco_filter, - apply_histogram_quality_filter=args.enable_histogram_quality_filter, - histogram_quality_min_ratio=args.histogram_quality_min_ratio, - 
histogram_quality_depth_min=args.histogram_quality_depth_min, - histogram_quality_depth_max=args.histogram_quality_depth_max, - apply_surface_sigma_filter=args.enable_surface_sigma_filter, - surface_sigma_max=args.surface_sigma_max, - apply_refraction_correction=args.enable_refraction_correction, - refraction_water_temp_c=args.refraction_water_temp_c, - refraction_wavelength_nm=args.refraction_wavelength_nm, - apply_post_refraction_refit=post_refraction_refit_enabled, - apply_flattening=args.enable_sea_surface_flattening, - flattening_window_m=args.sea_surface_flattening_window_m, - horizontal_res=args.horizontal_res, - vertical_res=args.vertical_res, - ) + ( + sea_surface_height, + sea_surface_label, + filtered_seafloor_subsurface_photon_dataset, + ) = process_subsurface_photon_filtering( + binned_dataset_sea_surface, + gebco_file_path_lists, + args.subsurface_thresh, + args.ignore_subsurface_height_thres, + use_atl24_filter=args.enable_atl24_filter, + atl24_file_path=atl24_file_path, + atl24_max_match_distance_deg=args.atl24_max_match_distance_deg, + use_gebco_filter=args.enable_gebco_filter, + apply_histogram_quality_filter=args.enable_histogram_quality_filter, + histogram_quality_min_ratio=args.histogram_quality_min_ratio, + histogram_quality_depth_min=args.histogram_quality_depth_min, + histogram_quality_depth_max=args.histogram_quality_depth_max, + apply_surface_sigma_filter=args.enable_surface_sigma_filter, + surface_sigma_max=args.surface_sigma_max, + apply_refraction_correction=args.enable_refraction_correction, + refraction_water_temp_c=args.refraction_water_temp_c, + refraction_wavelength_nm=args.refraction_wavelength_nm, + apply_post_refraction_refit=post_refraction_refit_enabled, + apply_flattening=args.enable_sea_surface_flattening, + flattening_window_m=args.sea_surface_flattening_window_m, + horizontal_res=args.horizontal_res, + vertical_res=args.vertical_res, + ) - final_filtered_subsurface_photon_dataset = 
filtered_seafloor_subsurface_photon_dataset[ - filtered_seafloor_subsurface_photon_dataset['beam_id'].isin(target_strong_beams) - ].copy() + final_filtered_subsurface_photon_dataset = ( + filtered_seafloor_subsurface_photon_dataset[ + filtered_seafloor_subsurface_photon_dataset["beam_id"].isin( + target_strong_beams + ) + ].copy() + ) if args.enable_convex_hull_filter: - final_filtered_subsurface_photon_dataset, convex_hull_areas, convex_hulls = \ + final_filtered_subsurface_photon_dataset, convex_hull_areas, convex_hulls = ( filter_photon_dataset_by_hull_area( final_filtered_subsurface_photon_dataset, - hull_area_threshold=args.convex_hull_area_threshold + hull_area_threshold=args.convex_hull_area_threshold, ) + ) if plot_target_beam: plot_convex_hulls( final_filtered_subsurface_photon_dataset, plot_target_beam, convex_hulls, - convex_hull_areas + convex_hull_areas, ) subsurface_output_path = os.path.join( @@ -171,35 +195,37 @@ def run_pipeline(args): kd_input_dataset = optionally_combine_paired_beams_for_kd( final_filtered_subsurface_photon_dataset, horizontal_res=args.horizontal_res, - enabled=args.enable_paired_beam_combine + enabled=args.enable_paired_beam_combine, ) subsurface_photon_df_added_kd = process_kd_calculation(kd_input_dataset) - kd_output_path = os.path.join(output_path, f"{timestamp}_AddedKdDataset_strongBeams_Further.csv") + kd_output_path = os.path.join( + output_path, f"{timestamp}_AddedKdDataset_strongBeams_Further.csv" + ) subsurface_photon_df_added_kd.to_csv(kd_output_path, index=False) - if 'relative_AT_dist' in kd_input_dataset.columns: + if "relative_AT_dist" in kd_input_dataset.columns: unique_photon_dataset = kd_input_dataset[ - ['relative_AT_dist', 'lat_bins', 'photon_height'] + ["relative_AT_dist", "lat_bins", "photon_height"] ].drop_duplicates() - unique_photon_dataset['relative_AT_dist_center'] = unique_photon_dataset.groupby( - 'lat_bins', observed=False - )['relative_AT_dist'].transform('mean') + 
unique_photon_dataset["relative_AT_dist_center"] = ( + unique_photon_dataset.groupby( + "lat_bins", observed=False + )["relative_AT_dist"].transform("mean") + ) def find_closest(group): - center = group['relative_AT_dist_center'].iloc[0] + center = group["relative_AT_dist_center"].iloc[0] group = group.copy() - group['dist_to_center'] = abs(group['relative_AT_dist'] - center) - return group.loc[[group['dist_to_center'].idxmin()]] + group["dist_to_center"] = abs(group["relative_AT_dist"] - center) + return group.loc[[group["dist_to_center"].idxmin()]] - closest_to_center = unique_photon_dataset.groupby('lat_bins', observed=False).apply( - find_closest, include_groups=True - ) + closest_to_center = unique_photon_dataset.groupby( + "lat_bins", observed=False + ).apply(find_closest, include_groups=True) closest_to_center.index = closest_to_center.index.droplevel(0) kd_df_merged_distance = closest_to_center.merge( - subsurface_photon_df_added_kd, - on='lat_bins', - how='left' - ).drop(columns=['relative_AT_dist_center', 'dist_to_center'], errors='ignore') + subsurface_photon_df_added_kd, on="lat_bins", how="left" + ).drop(columns=["relative_AT_dist_center", "dist_to_center"], errors="ignore") plot_kd_photons( output_path, @@ -212,7 +238,7 @@ def find_closest(group): logger.info("SUCCESS! 
Kd output: %s", kd_output_path) -if __name__ == '__main__': +if __name__ == "__main__": cli_args = get_args() try: run_pipeline(cli_args) diff --git a/aok/core/kd_utils/sea_photons_analysis.py b/aok/core/kd_utils/sea_photons_analysis.py index 90a2510..8d06e49 100644 --- a/aok/core/kd_utils/sea_photons_analysis.py +++ b/aok/core/kd_utils/sea_photons_analysis.py @@ -1,31 +1,32 @@ - -import numpy as np -import geopandas as gpd from datetime import datetime -import pandas as pd + import netCDF4 +import numpy as np import pandas as pd from scipy.signal import find_peaks from scipy.stats import norm + # Function to apply binning beam-by-beam by calling the function of horizontal_vertical_bin_dataset def process_sea_photon_binning(sea_photon_dataset, horizontal_res, vertical_res): # Initialize list to store results from each beam binned_beam_datasets = [] # Group the dataset by 'beam_id' and process each group separately - for beam_id, beam_data in sea_photon_dataset.groupby('beam_id'): - print(f'Processing binning for beam: {beam_id}') + for beam_id, beam_data in sea_photon_dataset.groupby("beam_id"): + print(f"Processing binning for beam: {beam_id}") # Apply binning to the current beam dataset - binned_beam_data = horizontal_vertical_bin_dataset(beam_data, horizontal_res, vertical_res) + binned_beam_data = horizontal_vertical_bin_dataset( + beam_data, horizontal_res, vertical_res + ) # Append the binned data for the current beam to the list binned_beam_datasets.append(binned_beam_data) # Combine all binned beam datasets into a single DataFrame binned_dataset_sea_surface = pd.concat(binned_beam_datasets, ignore_index=True) - + return binned_dataset_sea_surface @@ -33,35 +34,52 @@ def process_sea_photon_binning(sea_photon_dataset, horizontal_res, vertical_res) def horizontal_vertical_bin_dataset(dataset, lat_res, vertical_res): """Bin data along vertical and horizontal scales for later segmentation""" - + # Filter values within the range (-50, 10), because photons 
elevation outside this range will be real noise valid_range = (-70, 5) - valid_mask = (dataset['photon_height'] > valid_range[0]) & (dataset['photon_height'] < valid_range[1]) + valid_mask = (dataset["photon_height"] > valid_range[0]) & ( + dataset["photon_height"] < valid_range[1] + ) # Apply the valid_mask to filter unwanted values # and create a copy to avoid SettingWithCopyWarning filtered_dataset = dataset[valid_mask].copy() # Calculate the number of height bins - height_range = abs(filtered_dataset['photon_height'].max() - filtered_dataset['photon_height'].min()) - height_bin_number = max(1, round(height_range / vertical_res)) # Ensure at least one bin + height_range = abs( + filtered_dataset["photon_height"].max() + - filtered_dataset["photon_height"].min() + ) + height_bin_number = max( + 1, round(height_range / vertical_res) + ) # Ensure at least one bin # Calculate the number of latitude bins - lat_range = abs(filtered_dataset['lat'].max() - filtered_dataset['lat'].min()) + lat_range = abs(filtered_dataset["lat"].max() - filtered_dataset["lat"].min()) lat_bin_number = max(1, round(lat_range / lat_res)) # Ensure at least one bin # Create bins for latitude - lat_bins = pd.cut(filtered_dataset['lat'], bins=lat_bin_number, labels=np.arange(lat_bin_number)) + lat_bins = pd.cut( + filtered_dataset["lat"], bins=lat_bin_number, labels=np.arange(lat_bin_number) + ) # Create bins for height - height_bins = pd.cut(filtered_dataset['photon_height'], bins=height_bin_number, - labels=np.round(np.linspace(filtered_dataset['photon_height'].min(), - filtered_dataset['photon_height'].max(), - num=height_bin_number), decimals=1)) + height_bins = pd.cut( + filtered_dataset["photon_height"], + bins=height_bin_number, + labels=np.round( + np.linspace( + filtered_dataset["photon_height"].min(), + filtered_dataset["photon_height"].max(), + num=height_bin_number, + ), + decimals=1, + ), + ) # Add bins to dataframe using .loc to avoid SettingWithCopyWarning - 
filtered_dataset.loc[:, 'lat_bins'] = lat_bins - filtered_dataset.loc[:, 'height_bins'] = height_bins + filtered_dataset.loc[:, "lat_bins"] = lat_bins + filtered_dataset.loc[:, "height_bins"] = height_bins filtered_dataset = filtered_dataset.reset_index(drop=True) return filtered_dataset @@ -70,71 +88,76 @@ def horizontal_vertical_bin_dataset(dataset, lat_res, vertical_res): def get_sea_surface_height_static(binned_data, threshold): """ Calculate sea surface height and filter subsurface photons using adaptive detection. - + Parameters: binned_data (pandas.DataFrame): Binned photon data with lat, height_bins, photon_height - + Returns: final_sea_surface_height (list): Detected sea surface heights sea_surface_height_abnormal_label (array): Labels for abnormal heights sea_surface_dominated_label (array): Labels for surface-dominated bins PhotonDFBelowSurface (pandas.DataFrame): Photons below detected surface """ - - #set flag for the df save + + # set flag for the df save firstTimeIndex = True - + # Create sea height list sea_surface_height = [] mean_lat_bins_seq = [] sea_surface_subsurface_photons_ratio = [] - # Group dataset along horizental (latitude bins) - grouped_data = binned_data.groupby(['lat_bins'], group_keys=True, observed=False) + grouped_data = binned_data.groupby(["lat_bins"], group_keys=True, observed=False) data_groups = dict(list(grouped_data)) # Loop through groups to detect sea surface for k, v in data_groups.items(): # based on lat_utm - lat_bin_average = v['lat'].mean() + lat_bin_average = v["lat"].mean() # Create new dataframe based on occurrence of photons per height bin - new_df = pd.DataFrame(v.groupby(['height_bins'], observed=False).count()) + new_df = pd.DataFrame(v.groupby(["height_bins"], observed=False).count()) # Check if new_df is not empty before finding the bin with the highest photon count - if not new_df.empty: + if not new_df.empty: # Find the vertical bin with the highest photon count - largest_h_bin = new_df['lat'].argmax() - + 
largest_h_bin = new_df["lat"].argmax() + # Select the index of the bin with the highest count largest_h_index = new_df.index[largest_h_bin] - + # Calculate the median value of all photon height values within this bin - photons_sea_surface = v.loc[v['height_bins'] == largest_h_index, 'photon_height'] + photons_sea_surface = v.loc[ + v["height_bins"] == largest_h_index, "photon_height" + ] lat_bin_sea_median = photons_sea_surface.median() - + # Append to sea height list sea_surface_height.append(lat_bin_sea_median) mean_lat_bins_seq.append(lat_bin_average) del new_df - + # Get all photons below sea surface # to determine segment type of each subsurface water column # Use calculated sea height to determine photons at 0.5m below peak - photons_sea_surface_up = \ - v.loc[(v['photon_height'] > (lat_bin_sea_median - threshold)) & - (v['photon_height'] < (lat_bin_sea_median + 2*threshold))] - + photons_sea_surface_up = v.loc[ + (v["photon_height"] > (lat_bin_sea_median - threshold)) + & (v["photon_height"] < (lat_bin_sea_median + 2 * threshold)) + ] + # Calculate the photon ratio between surface and whole photons - if v['photon_height'].shape[0] > 0: - new_photons_ratio_sea_surface = \ - photons_sea_surface_up.shape[0] / v['photon_height'].shape[0] + if v["photon_height"].shape[0] > 0: + new_photons_ratio_sea_surface = ( + photons_sea_surface_up.shape[0] / v["photon_height"].shape[0] + ) else: new_photons_ratio_sea_surface = np.nan - - sea_surface_subsurface_photons_ratio.append((1 - new_photons_ratio_sea_surface)) - + + sea_surface_subsurface_photons_ratio.append( + 1 - new_photons_ratio_sea_surface + ) + else: # Append NaNs if the group is empty sea_surface_height.append(np.nan) @@ -145,65 +168,71 @@ def get_sea_surface_height_static(binned_data, threshold): mean = np.nanmean(sea_surface_height, axis=0) sd = np.nanstd(sea_surface_height, axis=0) - final_sea_surface_height = np.where((sea_surface_height > (mean + 2 * sd)) | - (sea_surface_height < (mean - 2 * sd)), - 
np.nan, - sea_surface_height).tolist() - - sea_surface_height_abnormal_label = np.where(np.isnan(final_sea_surface_height), 0, 1) + final_sea_surface_height = np.where( + (sea_surface_height > (mean + 2 * sd)) | (sea_surface_height < (mean - 2 * sd)), + np.nan, + sea_surface_height, + ).tolist() + + sea_surface_height_abnormal_label = np.where( + np.isnan(final_sea_surface_height), 0, 1 + ) # Determine label based on ratio of sea surface photons and subsurface photons - sea_surface_dominated_label = \ - np.where(np.array(sea_surface_subsurface_photons_ratio) >= 0.2, 0, 1) - + sea_surface_dominated_label = np.where( + np.array(sea_surface_subsurface_photons_ratio) >= 0.2, 0, 1 + ) + # Loop through groups again and return photons below 0.5m of sea height PhotonDFBelowThresholdPeak = pd.DataFrame() for i, (k, v) in enumerate(data_groups.items()): - # Get all values below this bin if not np.isnan(final_sea_surface_height[i]): - NewPhotonDFBelowThresholdPeak = \ - v.loc[v['photon_height'] < (final_sea_surface_height[i] - threshold)] - + NewPhotonDFBelowThresholdPeak = v.loc[ + v["photon_height"] < (final_sea_surface_height[i] - threshold) + ] + if firstTimeIndex: PhotonDFBelowThresholdPeak = NewPhotonDFBelowThresholdPeak - firstTimeIndex=False - else: - PhotonDFBelowThresholdPeak=\ - pd.concat([PhotonDFBelowThresholdPeak, NewPhotonDFBelowThresholdPeak]) - - - return final_sea_surface_height, \ - sea_surface_height_abnormal_label, \ - sea_surface_dominated_label,\ - PhotonDFBelowThresholdPeak + firstTimeIndex = False + else: + PhotonDFBelowThresholdPeak = pd.concat( + [PhotonDFBelowThresholdPeak, NewPhotonDFBelowThresholdPeak] + ) + + return ( + final_sea_surface_height, + sea_surface_height_abnormal_label, + sea_surface_dominated_label, + PhotonDFBelowThresholdPeak, + ) def get_sea_surface_height_adaptive(binned_data): """ Calculate sea surface height and filter subsurface photons with enhanced wave removal. 
- + Parameters: binned_data (pandas.DataFrame): Binned photon data with lat, height_bins, photon_height - + Returns: final_sea_surface_height (list): Detected sea surface heights sea_surface_height_abnormal_label (array): Labels for abnormal heights sea_surface_dominated_label (array): Labels for surface-dominated bins PhotonDFBelowSurface (pandas.DataFrame): Photons below detected surface with waves removed """ - + firstTimeIndex = True sea_surface_height = [] mean_lat_bins_seq = [] sea_surface_subsurface_photons_ratio = [] - grouped_data = binned_data.groupby(['lat_bins'], group_keys=True, observed=False) + grouped_data = binned_data.groupby(["lat_bins"], group_keys=True, observed=False) data_groups = dict(list(grouped_data)) for k, v in data_groups.items(): - lat_bin_average = v['lat'].mean() - + lat_bin_average = v["lat"].mean() + if len(v) < 20: # Increase minimum photon count for robustness sea_surface_height.append(np.nan) mean_lat_bins_seq.append(np.nan) @@ -211,18 +240,22 @@ def get_sea_surface_height_adaptive(binned_data): continue # Finer histogram for better peak resolution - hist, bin_edges = np.histogram(v['photon_height'], - bins=100, # Finer bins - density=True, - range=(v['photon_height'].min(), v['photon_height'].max())) + hist, bin_edges = np.histogram( + v["photon_height"], + bins=100, # Finer bins + density=True, + range=(v["photon_height"].min(), v["photon_height"].max()), + ) bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2 # Enhanced peak detection - peaks, properties = find_peaks(hist, - height=np.max(hist)*0.3, # Stricter peak height - distance=10, # Wider separation - prominence=np.max(hist)*0.1) # Require prominent peaks - + peaks, properties = find_peaks( + hist, + height=np.max(hist) * 0.3, # Stricter peak height + distance=10, # Wider separation + prominence=np.max(hist) * 0.1, + ) # Require prominent peaks + if len(peaks) == 0: sea_surface_height.append(np.nan) mean_lat_bins_seq.append(np.nan) @@ -234,21 +267,25 @@ def 
get_sea_surface_height_adaptive(binned_data): surface_height = bin_centers[surface_peak_idx] # Wider window for Gaussian fit to capture wave effects - peak_data = v[(v['photon_height'] > surface_height - 1.0) & - (v['photon_height'] < surface_height + 1.0)] + peak_data = v[ + (v["photon_height"] > surface_height - 1.0) + & (v["photon_height"] < surface_height + 1.0) + ] if len(peak_data) > 10: - mu, sigma = norm.fit(peak_data['photon_height']) + mu, sigma = norm.fit(peak_data["photon_height"]) else: mu, sigma = surface_height, 0.2 # More conservative default # Stricter threshold: 3σ below mean, plus minimum depth - # Waves can extend beyond 2.5σ, especially in rough conditions. + # Waves can extend beyond 2.5σ, especially in rough conditions. # 3σ is a common statistical cutoff for excluding outliers adaptive_threshold = min(mu - 3.0 * sigma, mu - 1.0) # Ensure at least 1m below # Extended surface layer to remove wave effects - surface_photons = v[(v['photon_height'] > adaptive_threshold) & - (v['photon_height'] < mu + 3.0 * sigma)] # Wider upper bound + surface_photons = v[ + (v["photon_height"] > adaptive_threshold) + & (v["photon_height"] < mu + 3.0 * sigma) + ] # Wider upper bound ratio = len(surface_photons) / len(v) if len(v) > 0 else np.nan sea_surface_height.append(mu) @@ -258,38 +295,51 @@ def get_sea_surface_height_adaptive(binned_data): # Outlier filtering mean = np.nanmean(sea_surface_height) sd = np.nanstd(sea_surface_height) - final_sea_surface_height = np.where((sea_surface_height > mean + 2*sd) | - (sea_surface_height < mean - 2*sd), - np.nan, - sea_surface_height).tolist() - - sea_surface_height_abnormal_label = np.where(np.isnan(final_sea_surface_height), 0, 1) - sea_surface_dominated_label = np.where(np.array(sea_surface_subsurface_photons_ratio) >= 0.2, 0, 1) + final_sea_surface_height = np.where( + (sea_surface_height > mean + 2 * sd) | (sea_surface_height < mean - 2 * sd), + np.nan, + sea_surface_height, + ).tolist() + + 
sea_surface_height_abnormal_label = np.where( + np.isnan(final_sea_surface_height), 0, 1 + ) + sea_surface_dominated_label = np.where( + np.array(sea_surface_subsurface_photons_ratio) >= 0.2, 0, 1 + ) # Filter subsurface photons with diagnostics PhotonDFBelowSurface = pd.DataFrame() for i, (k, v) in enumerate(data_groups.items()): if not np.isnan(final_sea_surface_height[i]): - peak_data = v[(v['photon_height'] > final_sea_surface_height[i] - 1.0) & - (v['photon_height'] < final_sea_surface_height[i] + 1.0)] + peak_data = v[ + (v["photon_height"] > final_sea_surface_height[i] - 1.0) + & (v["photon_height"] < final_sea_surface_height[i] + 1.0) + ] if len(peak_data) > 10: - mu, sigma = norm.fit(peak_data['photon_height']) + mu, sigma = norm.fit(peak_data["photon_height"]) adaptive_threshold = min(mu - 3.0 * sigma, mu - 1.0) else: - adaptive_threshold = final_sea_surface_height[i] - 1.0 # Stricter default + adaptive_threshold = ( + final_sea_surface_height[i] - 1.0 + ) # Stricter default + + NewPhotonDFBelowSurface = v[v["photon_height"] < adaptive_threshold] - NewPhotonDFBelowSurface = v[v['photon_height'] < adaptive_threshold] - if firstTimeIndex: PhotonDFBelowSurface = NewPhotonDFBelowSurface firstTimeIndex = False else: - PhotonDFBelowSurface = pd.concat([PhotonDFBelowSurface, NewPhotonDFBelowSurface]) + PhotonDFBelowSurface = pd.concat( + [PhotonDFBelowSurface, NewPhotonDFBelowSurface] + ) - return (final_sea_surface_height, - sea_surface_height_abnormal_label, - sea_surface_dominated_label, - PhotonDFBelowSurface) + return ( + final_sea_surface_height, + sea_surface_height_abnormal_label, + sea_surface_dominated_label, + PhotonDFBelowSurface, + ) def get_water_temp(date_year, date_month, date_day, latitude, longitude): @@ -309,7 +359,7 @@ def get_water_temp(date_year, date_month, date_day, latitude, longitude): month = date_month # date[6:8] day = date_day - day_of_year = str(datetime.strptime(date, '%Y%m%d').timetuple().tm_yday) + day_of_year = 
str(datetime.strptime(date, "%Y%m%d").timetuple().tm_yday) # Add zero in front of day of year string zero_day_of_year = day_of_year.zfill(3) @@ -320,8 +370,11 @@ def get_water_temp(date_year, date_month, date_day, latitude, longitude): new_lat_min = 0 new_lat_max = 17998 - new_lat = round(((old_lat - old_lat_min) / (old_lat_max - old_lat_min)) * - (new_lat_max - new_lat_min) + new_lat_min) + new_lat = round( + ((old_lat - old_lat_min) / (old_lat_max - old_lat_min)) + * (new_lat_max - new_lat_min) + + new_lat_min + ) # Calculate ratio of longitude from mid-point of IS2 track old_lon = longitude.mean() @@ -330,24 +383,41 @@ def get_water_temp(date_year, date_month, date_day, latitude, longitude): new_lon_min = 0 new_lon_max = 35999 - new_lon = round(((old_lon - old_lon_min) / (old_lon_max - old_lon_min)) * - (new_lon_max - new_lon_min) + new_lon_min) + new_lon = round( + ((old_lon - old_lon_min) / (old_lon_max - old_lon_min)) + * (new_lon_max - new_lon_min) + + new_lon_min + ) # Access the SST data using the JPL OpenDap interface - url = 'https://opendap.jpl.nasa.gov/opendap/OceanTemperature/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/' \ - + str(year) + '/' + str(zero_day_of_year) + '/' + str(date) \ - + '090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc' + url = ( + "https://opendap.jpl.nasa.gov/opendap/OceanTemperature/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/" + + str(year) + + "/" + + str(zero_day_of_year) + + "/" + + str(date) + + "090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc" + ) dataset = netCDF4.Dataset(url) # Access the data and convert the temperature from K to C - water_temp = dataset['analysed_sst'][0, new_lat, new_lon] - 273.15 + water_temp = dataset["analysed_sst"][0, new_lat, new_lon] - 273.15 return water_temp -def refraction_correction(WTemp, WSmodel, Wavelength, - Photon_ref_elev, Ph_ref_azimuth, - PhotonZ, PhotonX, PhotonY, Ph_Conf): +def refraction_correction( + WTemp, + WSmodel, + Wavelength, + Photon_ref_elev, + Ph_ref_azimuth, + PhotonZ, + 
PhotonX, + PhotonY, + Ph_Conf, +): """ WTemp; there is python library that pulls water temp data WSmodel is the value surface height @@ -377,18 +447,18 @@ def refraction_correction(WTemp, WSmodel, Wavelength, n1 = 1.00029 # refractive index of water - n2 = (a * WaterTemp ** 2) + (b * wl ** 2) + (c * WaterTemp) + (d * wl) + e + n2 = (a * WaterTemp**2) + (b * wl**2) + (c * WaterTemp) + (d * wl) + e # assumption is 0.25416 # This example is refractionCoef = 0.25449 # 1.00029 is refraction of air constant - correction_coef = (1 - (n1 / n2)) + correction_coef = 1 - (n1 / n2) # read photon ref_elev to get theta1 theta1 = np.pi / 2 - Photon_ref_elev # eq 1. Theta2 - theta2 = np.arcsin(((n1 * np.sin(theta1)) / n2)) + theta2 = np.arcsin((n1 * np.sin(theta1)) / n2) # eq 3. S # Approximate water Surface = 1.5 @@ -407,7 +477,7 @@ def refraction_correction(WTemp, WSmodel, Wavelength, phi = theta1 - theta2 # P is the difference between raw and corrected YZ location - P = np.sqrt(R ** 2 + S ** 2 - 2 * R * S * np.cos(phi)) + P = np.sqrt(R**2 + S**2 - 2 * R * S * np.cos(phi)) # alpha is an angle needed alpha = np.arcsin((R * np.sin(phi)) / P) @@ -431,19 +501,27 @@ def refraction_correction(WTemp, WSmodel, Wavelength, outY = PhotonY + DN outZ = PhotonZ + DZ - ''' + """ print('For selected Bathy photon:') print('lat = ', PhotonY[9000]) print('long = ', PhotonX[9000]) print('Raw Depth = ', PhotonZ[9000]) print('D = ', D[9000]) - + print('ref_elev = ', Photon_ref_elev[9000]) - + print('Delta East = ', DE[9000]) print('Delta North = ', DN[9000]) print('Delta Z = ', DZ[9000]) - ''' - return (outX, outY, outZ, Ph_Conf, PhotonX, PhotonY, PhotonZ, Ph_ref_azimuth, - Photon_ref_elev) # We are most interested in out-x, out-y, out-z - + """ + return ( + outX, + outY, + outZ, + Ph_Conf, + PhotonX, + PhotonY, + PhotonZ, + Ph_ref_azimuth, + Photon_ref_elev, + ) # We are most interested in out-x, out-y, out-z diff --git a/aok/core/kd_utils/sliderule_adapter.py 
b/aok/core/kd_utils/sliderule_adapter.py index 47c6e35..a706b0e 100644 --- a/aok/core/kd_utils/sliderule_adapter.py +++ b/aok/core/kd_utils/sliderule_adapter.py @@ -1,5 +1,4 @@ import logging -from typing import Dict, List, Optional import numpy as np import pandas as pd @@ -11,22 +10,22 @@ def _get_beam_id_from_track_pair(df: pd.DataFrame) -> pd.Series: - if 'gt' in df.columns: - return df['gt'].astype(str) - if 'track' in df.columns and 'pair' in df.columns: - suffix = df['pair'].map({0: 'l', 1: 'r'}).fillna('x') - return 'gt' + df['track'].astype(int).astype(str) + suffix - return pd.Series(['unknown'] * len(df), index=df.index, dtype='object') + if "gt" in df.columns: + return df["gt"].astype(str) + if "track" in df.columns and "pair" in df.columns: + suffix = df["pair"].map({0: "l", 1: "r"}).fillna("x") + return "gt" + df["track"].astype(int).astype(str) + suffix + return pd.Series(["unknown"] * len(df), index=df.index, dtype="object") -def _infer_strong_beam_suffix(df: pd.DataFrame) -> Optional[str]: - if 'sc_orient' not in df.columns or df.empty: +def _infer_strong_beam_suffix(df: pd.DataFrame) -> str | None: + if "sc_orient" not in df.columns or df.empty: return None - orient = df['sc_orient'].dropna() + orient = df["sc_orient"].dropna() if orient.empty: return None # Same logic as your Florida script. 
- return 'l' if int(orient.iloc[0]) == 0 else 'r' + return "l" if int(orient.iloc[0]) == 0 else "r" def _normalize_signal_conf(values: pd.Series) -> pd.Series: @@ -39,23 +38,24 @@ def to_scalar(v): return float(v) except Exception: return np.nan + return values.apply(to_scalar) def _relative_distance_km_per_beam(df: pd.DataFrame) -> pd.Series: out = pd.Series(np.nan, index=df.index, dtype=float) - for beam_id, beam_df in df.groupby('beam_id'): - beam_df = beam_df.sort_values('delta_time') - if 'segment_dist' in beam_df.columns: - rel_km = (beam_df['segment_dist'] - beam_df['segment_dist'].min()) / 1000.0 + for beam_id, beam_df in df.groupby("beam_id"): + beam_df = beam_df.sort_values("delta_time") + if "segment_dist" in beam_df.columns: + rel_km = (beam_df["segment_dist"] - beam_df["segment_dist"].min()) / 1000.0 out.loc[beam_df.index] = rel_km.values continue - lons = beam_df['longitude'].to_numpy(dtype=float) - lats = beam_df['latitude'].to_numpy(dtype=float) + lons = beam_df["longitude"].to_numpy(dtype=float) + lats = beam_df["latitude"].to_numpy(dtype=float) if len(lons) == 0: continue - geod = Geod(ellps='WGS84') + geod = Geod(ellps="WGS84") cumulative_m = np.zeros(len(lons), dtype=float) for i in range(1, len(lons)): _, _, dist_m = geod.inv(lons[i - 1], lats[i - 1], lons[i], lats[i]) @@ -66,18 +66,21 @@ def _relative_distance_km_per_beam(df: pd.DataFrame) -> pd.Series: def _project_to_utm(df: pd.DataFrame) -> pd.DataFrame: projected = df.copy() - projected['lon'] = np.nan - projected['lat'] = np.nan - for beam_id, beam_df in projected.groupby('beam_id'): + projected["lon"] = np.nan + projected["lat"] = np.nan + for beam_id, beam_df in projected.groupby("beam_id"): if beam_df.empty: continue - lon0 = float(beam_df['longitude'].iloc[0]) - lat0 = float(beam_df['latitude'].iloc[0]) + lon0 = float(beam_df["longitude"].iloc[0]) + lat0 = float(beam_df["latitude"].iloc[0]) epsg = convert_wgs_to_utm(lon0, lat0) proj = Proj(epsg) - x, y = 
proj(beam_df['longitude'].to_numpy(dtype=float), beam_df['latitude'].to_numpy(dtype=float)) - projected.loc[beam_df.index, 'lon'] = x - projected.loc[beam_df.index, 'lat'] = y + x, y = proj( + beam_df["longitude"].to_numpy(dtype=float), + beam_df["latitude"].to_numpy(dtype=float), + ) + projected.loc[beam_df.index, "lon"] = x + projected.loc[beam_df.index, "lat"] = y return projected @@ -89,52 +92,77 @@ def sliderule_to_framework_dataset( Convert SlideRule ATL03 photon GeoDataFrame to the framework's sea_photon_dataset schema. """ if atl03_gdf is None or len(atl03_gdf) == 0: - return pd.DataFrame(columns=[ - 'beam_id', 'latitude', 'longitude', 'lat', 'lon', 'photon_height', - 'quality_ph', 'photon_conf', 'ref_elevation', 'ref_azimuth', - 'relative_AT_dist', 'solar_elevation', 'background_rate', 'is_land_label' - ]) + return pd.DataFrame( + columns=[ + "beam_id", + "latitude", + "longitude", + "lat", + "lon", + "photon_height", + "quality_ph", + "photon_conf", + "ref_elevation", + "ref_azimuth", + "relative_AT_dist", + "solar_elevation", + "background_rate", + "is_land_label", + ] + ) src = pd.DataFrame(atl03_gdf).copy() - src['beam_id'] = _get_beam_id_from_track_pair(src) + src["beam_id"] = _get_beam_id_from_track_pair(src) if strong_beams_only: strong_suffix = _infer_strong_beam_suffix(src) - if strong_suffix in ('l', 'r'): - src = src[src['beam_id'].str.endswith(strong_suffix)] - - out = pd.DataFrame({ - 'beam_id': src['beam_id'].astype(str), - 'latitude': pd.to_numeric(src.get('lat_ph', np.nan), errors='coerce'), - 'longitude': pd.to_numeric(src.get('lon_ph', np.nan), errors='coerce'), - 'photon_height': pd.to_numeric(src.get('h_ph', np.nan), errors='coerce'), - 'quality_ph': pd.to_numeric(src.get('quality_ph', np.nan), errors='coerce'), - 'ref_elevation': pd.to_numeric(src.get('ref_elev', np.nan), errors='coerce'), - 'ref_azimuth': pd.to_numeric(src.get('ref_azimuth', np.nan), errors='coerce'), - 'solar_elevation': pd.to_numeric(src.get('solar_elevation', 
np.nan), errors='coerce'), - 'background_rate': pd.to_numeric(src.get('bckgrd_rate', np.nan), errors='coerce'), - 'delta_time': pd.to_numeric(src.get('delta_time', np.nan), errors='coerce'), - 'segment_dist': pd.to_numeric(src.get('segment_dist', np.nan), errors='coerce'), - }) - - if 'signal_conf_ph' in src.columns: - out['photon_conf'] = _normalize_signal_conf(src['signal_conf_ph']) + if strong_suffix in ("l", "r"): + src = src[src["beam_id"].str.endswith(strong_suffix)] + + out = pd.DataFrame( + { + "beam_id": src["beam_id"].astype(str), + "latitude": pd.to_numeric(src.get("lat_ph", np.nan), errors="coerce"), + "longitude": pd.to_numeric(src.get("lon_ph", np.nan), errors="coerce"), + "photon_height": pd.to_numeric(src.get("h_ph", np.nan), errors="coerce"), + "quality_ph": pd.to_numeric(src.get("quality_ph", np.nan), errors="coerce"), + "ref_elevation": pd.to_numeric( + src.get("ref_elev", np.nan), errors="coerce" + ), + "ref_azimuth": pd.to_numeric( + src.get("ref_azimuth", np.nan), errors="coerce" + ), + "solar_elevation": pd.to_numeric( + src.get("solar_elevation", np.nan), errors="coerce" + ), + "background_rate": pd.to_numeric( + src.get("bckgrd_rate", np.nan), errors="coerce" + ), + "delta_time": pd.to_numeric(src.get("delta_time", np.nan), errors="coerce"), + "segment_dist": pd.to_numeric( + src.get("segment_dist", np.nan), errors="coerce" + ), + } + ) + + if "signal_conf_ph" in src.columns: + out["photon_conf"] = _normalize_signal_conf(src["signal_conf_ph"]) else: - out['photon_conf'] = np.nan + out["photon_conf"] = np.nan - out = out.dropna(subset=['latitude', 'longitude', 'photon_height']).copy() + out = out.dropna(subset=["latitude", "longitude", "photon_height"]).copy() out = _project_to_utm(out) - out['relative_AT_dist'] = _relative_distance_km_per_beam(out) - out['is_land_label'] = 0 + out["relative_AT_dist"] = _relative_distance_km_per_beam(out) + out["is_land_label"] = 0 - return out.drop(columns=['delta_time', 'segment_dist'], errors='ignore') + 
return out.drop(columns=["delta_time", "segment_dist"], errors="ignore") def fetch_sliderule_atl03( - region: List[Dict[str, float]], + region: list[dict[str, float]], t0: str, t1: str, - rgt: Optional[int] = None, - ph_fields: Optional[List[str]] = None, + rgt: int | None = None, + ph_fields: list[str] | None = None, sliderule_url: str = "slideruleearth.io", verbose: bool = False, ): @@ -142,14 +170,25 @@ def fetch_sliderule_atl03( Fetch ATL03 photons from SlideRule. """ try: - from sliderule import sliderule, icesat2 + from sliderule import icesat2, sliderule except Exception as e: - raise ImportError("sliderule package is required for SlideRule ingestion.") from e + raise ImportError( + "sliderule package is required for SlideRule ingestion." + ) from e sliderule.init(sliderule_url, verbose=verbose) fields = ph_fields or [ - "delta_time", "lat_ph", "lon_ph", "h_ph", "quality_ph", "signal_conf_ph", - "segment_dist", "ref_elev", "ref_azimuth", "solar_elevation", "bckgrd_rate", + "delta_time", + "lat_ph", + "lon_ph", + "h_ph", + "quality_ph", + "signal_conf_ph", + "segment_dist", + "ref_elev", + "ref_azimuth", + "solar_elevation", + "bckgrd_rate", ] params = {"poly": region, "t0": t0, "t1": t1, "atl03_ph_fields": fields} if rgt is not None: @@ -158,10 +197,10 @@ def fetch_sliderule_atl03( def fetch_and_prepare_sliderule_dataset( - region: List[Dict[str, float]], + region: list[dict[str, float]], t0: str, t1: str, - rgt: Optional[int] = None, + rgt: int | None = None, strong_beams_only: bool = True, sliderule_url: str = "slideruleearth.io", verbose: bool = False, @@ -178,4 +217,6 @@ def fetch_and_prepare_sliderule_dataset( sliderule_url=sliderule_url, verbose=verbose, ) - return sliderule_to_framework_dataset(atl03_gdf, strong_beams_only=strong_beams_only) + return sliderule_to_framework_dataset( + atl03_gdf, strong_beams_only=strong_beams_only + ) diff --git a/aok/core/kd_utils/visualization.py b/aok/core/kd_utils/visualization.py index 64034d4..29c39b0 100644 --- 
a/aok/core/kd_utils/visualization.py +++ b/aok/core/kd_utils/visualization.py @@ -1,66 +1,103 @@ # utils/visualization.py -import numpy as np import os -import pandas as pd import time + import geopandas as gpd -from pyproj import Transformer, Proj import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from pyproj import Transformer from scipy.spatial import ConvexHull + def plot_photon_height(sea_photon_dataset, hlims=[-25, 10]): fig, ax = plt.subplots() - ax.scatter(sea_photon_dataset['latitude'], sea_photon_dataset['photon_height'], s=1, c='k', alpha=0.15, edgecolors='none', label='ATL03 Photons') - ax.set_xlabel('Relative AT Distance') - ax.set_ylabel('Height (h_ph)') - ax.set_title('Scatter Plot of ATL03 Photons') + ax.scatter( + sea_photon_dataset["latitude"], + sea_photon_dataset["photon_height"], + s=1, + c="k", + alpha=0.15, + edgecolors="none", + label="ATL03 Photons", + ) + ax.set_xlabel("Relative AT Distance") + ax.set_ylabel("Height (h_ph)") + ax.set_title("Scatter Plot of ATL03 Photons") ax.set_ylim(hlims) ax.legend() plt.show() -def plot_filtered_seafloor_photons(filtered_seafloor_subsurface_dataset, sea_photon_dataset, sea_surface_height, output_path): + +def plot_filtered_seafloor_photons( + filtered_seafloor_subsurface_dataset, + sea_photon_dataset, + sea_surface_height, + output_path, +): """ Plots the filtered seafloor photon data along with sea surface and seafloor elevation data. Parameters: - filtered_seafloor_subsurface_dataset: DataFrame containing the filtered subsurface photon data. - - sea_photon_dataset: DataFrame containing the original sea photon data. + - sea_photon_dataset: DataFrame containing the original sea photon data. - sea_surface_height: Array of sea surface height values. - output_path: Path to save the output plot. 
""" - + # get sea_surface_x_axis_bins - sea_surface_x_axis_bins = np.linspace(filtered_seafloor_subsurface_dataset['relative_AT_dist'].min(), - filtered_seafloor_subsurface_dataset['relative_AT_dist'].max(), - len(sea_surface_height)) - + sea_surface_x_axis_bins = np.linspace( + filtered_seafloor_subsurface_dataset["relative_AT_dist"].min(), + filtered_seafloor_subsurface_dataset["relative_AT_dist"].max(), + len(sea_surface_height), + ) + hlims = [-25, 10] fig, ax = plt.subplots() # Scatter plot of subsurface photons - ax.scatter(sea_photon_dataset['relative_AT_dist'], sea_photon_dataset['photon_height'], - s=1, c='k', alpha=0.15, edgecolors='none', label='Subsurface ATL03 Photons') + ax.scatter( + sea_photon_dataset["relative_AT_dist"], + sea_photon_dataset["photon_height"], + s=1, + c="k", + alpha=0.15, + edgecolors="none", + label="Subsurface ATL03 Photons", + ) # Overlay the seafloor elevation data as points - ax.plot(filtered_seafloor_subsurface_dataset['relative_AT_dist'], filtered_seafloor_subsurface_dataset['seafloor_elevation'], - linewidth=0.8, c='b', alpha=0.4, label='Seafloor Elevation') - - ax.plot(sea_surface_x_axis_bins, [x - 0.5 for x in sea_surface_height], - linewidth=0.8, color='#DD571C', alpha=0.4, label='0.5 m Below Surface Peak') + ax.plot( + filtered_seafloor_subsurface_dataset["relative_AT_dist"], + filtered_seafloor_subsurface_dataset["seafloor_elevation"], + linewidth=0.8, + c="b", + alpha=0.4, + label="Seafloor Elevation", + ) + + ax.plot( + sea_surface_x_axis_bins, + [x - 0.5 for x in sea_surface_height], + linewidth=0.8, + color="#DD571C", + alpha=0.4, + label="0.5 m Below Surface Peak", + ) # Set labels and title - ax.set_xlabel('Distance From Start of Track (km)') - ax.set_ylabel('Photon Height (m)') - ax.set_title('Scatter Plot of Filtered Sea Photon Dataset') + ax.set_xlabel("Distance From Start of Track (km)") + ax.set_ylabel("Photon Height (m)") + ax.set_title("Scatter Plot of Filtered Sea Photon Dataset") ax.set_ylim(hlims) # Add 
a legend ax.legend() # Save the plot - plt.legend(loc='upper right') - plt.savefig(output_path, dpi=400, format='jpeg') + plt.legend(loc="upper right") + plt.savefig(output_path, dpi=400, format="jpeg") # Show the plot plt.show() @@ -70,125 +107,210 @@ def plot_convex_hulls(photon_dataset, target_beam_ids, convex_hulls, convex_hull """ Plots ConvexHull polygons and annotated areas for each lat_bin group in the dataset. """ - #filter beam type + # filter beam type # photon_dataset - photon_dataset = photon_dataset[photon_dataset['beam_id'].isin(target_beam_ids)] - + photon_dataset = photon_dataset[photon_dataset["beam_id"].isin(target_beam_ids)] + fig, ax = plt.subplots(figsize=(10, 6)) hlims = [-10, 2] - + for lat_bin, hull_points in convex_hulls.items(): hull = ConvexHull(hull_points) - ax.fill(hull_points[hull.vertices, 0], hull_points[hull.vertices, 1], alpha=0.3, label=f'Bin {lat_bin}') - + ax.fill( + hull_points[hull.vertices, 0], + hull_points[hull.vertices, 1], + alpha=0.3, + label=f"Bin {lat_bin}", + ) + # Calculate centroid to place the label centroid = np.mean(hull_points[hull.vertices], axis=0) - ax.text(centroid[0], centroid[1], f'{convex_hull_areas[lat_bin]:.2f}', - horizontalalignment='center', verticalalignment='center', fontsize=10, color='black') - - ax.scatter(photon_dataset['lat'], photon_dataset['photon_height'], - s=1, c='k', alpha=0.15, edgecolors='none', label='Subsurface ATL03 Photons') + ax.text( + centroid[0], + centroid[1], + f"{convex_hull_areas[lat_bin]:.2f}", + horizontalalignment="center", + verticalalignment="center", + fontsize=10, + color="black", + ) + + ax.scatter( + photon_dataset["lat"], + photon_dataset["photon_height"], + s=1, + c="k", + alpha=0.15, + edgecolors="none", + label="Subsurface ATL03 Photons", + ) ax.set_ylim(hlims) - ax.set_xlabel('Latitude') - ax.set_ylabel('Photon Height') - ax.set_title('ConvexHull of Photons within each Lat Bin') + ax.set_xlabel("Latitude") + ax.set_ylabel("Photon Height") + 
ax.set_title("ConvexHull of Photons within each Lat Bin") plt.show() + # plot the kd and photon -def plot_kd_photons(OutputPath, timestamp, target_beam_ids, subsurface_photon_dataset, Kd_DF_MergedDistance): - #filter beam type +def plot_kd_photons( + OutputPath, + timestamp, + target_beam_ids, + subsurface_photon_dataset, + Kd_DF_MergedDistance, +): + # filter beam type # photon_dataset - subsurface_photon_dataset = subsurface_photon_dataset[subsurface_photon_dataset['beam_id'].isin(target_beam_ids)] - Kd_DF_MergedDistance = Kd_DF_MergedDistance[Kd_DF_MergedDistance['beam_id'].isin(target_beam_ids)] - + subsurface_photon_dataset = subsurface_photon_dataset[ + subsurface_photon_dataset["beam_id"].isin(target_beam_ids) + ] + Kd_DF_MergedDistance = Kd_DF_MergedDistance[ + Kd_DF_MergedDistance["beam_id"].isin(target_beam_ids) + ] + hlims = [-45, 5] fig, ax1 = plt.subplots(figsize=(10, 6)) - ax1.scatter(subsurface_photon_dataset['relative_AT_dist'], subsurface_photon_dataset['photon_height'], - s=1.5, c='k', alpha=0.2, edgecolors='none', label='Subsurface ATL03 Photon Height') - ax1.set_xlabel('Relative Along-Track Distance', fontsize=18) - ax1.tick_params(axis='x', labelsize=18) # Added line for x-tick label fontsize - ax1.set_ylabel('Photon Height', color='b', fontsize=18) - ax1.tick_params(axis='y', labelcolor='b', labelsize=18) - ax1.plot(subsurface_photon_dataset['relative_AT_dist'], subsurface_photon_dataset['seafloor_elevation'], - linewidth=0.8, c='b', alpha=0.4, label='Seafloor Elevation') + ax1.scatter( + subsurface_photon_dataset["relative_AT_dist"], + subsurface_photon_dataset["photon_height"], + s=1.5, + c="k", + alpha=0.2, + edgecolors="none", + label="Subsurface ATL03 Photon Height", + ) + ax1.set_xlabel("Relative Along-Track Distance", fontsize=18) + ax1.tick_params(axis="x", labelsize=18) # Added line for x-tick label fontsize + ax1.set_ylabel("Photon Height", color="b", fontsize=18) + ax1.tick_params(axis="y", labelcolor="b", labelsize=18) + ax1.plot( 
+ subsurface_photon_dataset["relative_AT_dist"], + subsurface_photon_dataset["seafloor_elevation"], + linewidth=0.8, + c="b", + alpha=0.4, + label="Seafloor Elevation", + ) # ax1.axhline(y=-6, color='blue', linestyle='--', label='y=-6 m') # ax1.set_xlim([1800, 2100]) # Set x-axis limits # ax1.set_ylim(hlims) - + ax2 = ax1.twinx() - ax2.scatter(Kd_DF_MergedDistance['relative_AT_dist'], Kd_DF_MergedDistance['kd'], - label='Kd values', color='r', alpha=0.6) - ax2.set_ylabel('Kd Value', color='r', fontsize=18) - ax2.tick_params(axis='y', labelcolor='r',labelsize=18) + ax2.scatter( + Kd_DF_MergedDistance["relative_AT_dist"], + Kd_DF_MergedDistance["kd"], + label="Kd values", + color="r", + alpha=0.6, + ) + ax2.set_ylabel("Kd Value", color="r", fontsize=18) + ax2.tick_params(axis="y", labelcolor="r", labelsize=18) # fig.suptitle('Photon Height and Kd Values along Relative Along-Track Distance') handles1, labels1 = ax1.get_legend_handles_labels() handles2, labels2 = ax2.get_legend_handles_labels() - fig.legend(handles1 + handles2, labels1 + labels2, loc='upper right', bbox_to_anchor=(0.85, 0.85)) + fig.legend( + handles1 + handles2, + labels1 + labels2, + loc="upper right", + bbox_to_anchor=(0.85, 0.85), + ) # fig.legend(handles1 + handles2, labels1 + labels2, loc='lower left', bbox_to_anchor=(0.08, 0.08)) fig.tight_layout() - plt.savefig(os.path.join(OutputPath, f'{timestamp}_IS2_subsurface_kd_2.jpg'), dpi=400, format='jpeg') + plt.savefig( + os.path.join(OutputPath, f"{timestamp}_IS2_subsurface_kd_2.jpg"), + dpi=400, + format="jpeg", + ) plt.show() - -def plot_bin_polygon_data(lat, height, seafloor_height, mask, lat_bins, height_bins, valid_bins, polygons, y_min, y_max): +def plot_bin_polygon_data( + lat, + height, + seafloor_height, + mask, + lat_bins, + height_bins, + valid_bins, + polygons, + y_min, + y_max, +): plt.figure(figsize=(12, 6)) plt.subplot(1, 2, 1) plt.title("Original Height Data") - plt.scatter(lat, height, c=height, cmap='viridis', s=10) - 
plt.plot(np.sort(lat), seafloor_height[np.argsort(lat)], 'r-', label='Seafloor') - plt.colorbar(label='Height') - plt.xlabel('Latitude') - plt.ylabel('Height') + plt.scatter(lat, height, c=height, cmap="viridis", s=10) + plt.plot(np.sort(lat), seafloor_height[np.argsort(lat)], "r-", label="Seafloor") + plt.colorbar(label="Height") + plt.xlabel("Latitude") + plt.ylabel("Height") plt.ylim(y_min, y_max) plt.legend() plt.subplot(1, 2, 2) plt.title("Masked Region (ROI)") - plt.scatter(lat[mask], height[mask], c=height[mask], cmap='viridis', s=10) - plt.plot(np.sort(lat), seafloor_height[np.argsort(lat)], 'r-', label='Seafloor') + plt.scatter(lat[mask], height[mask], c=height[mask], cmap="viridis", s=10) + plt.plot(np.sort(lat), seafloor_height[np.argsort(lat)], "r-", label="Seafloor") for polygon in polygons: - plt.gca().add_patch(plt.Polygon(polygon, edgecolor='red', facecolor='none', linewidth=1)) + plt.gca().add_patch( + plt.Polygon(polygon, edgecolor="red", facecolor="none", linewidth=1) + ) for lb in lat_bins: - plt.axvline(x=lb, color='gray', linestyle='--', linewidth=0.5) + plt.axvline(x=lb, color="gray", linestyle="--", linewidth=0.5) for hb in height_bins: - plt.axhline(y=hb, color='gray', linestyle='--', linewidth=0.5) + plt.axhline(y=hb, color="gray", linestyle="--", linewidth=0.5) - plt.colorbar(label='Height') - plt.xlabel('Latitude') - plt.ylabel('Height') + plt.colorbar(label="Height") + plt.xlabel("Latitude") + plt.ylabel("Height") plt.ylim(y_min, y_max) plt.legend() plt.tight_layout() plt.show() -# -def produce_figures(binned_data, bath_height, sea_height, solo_sea_surface_label, - y_limit_top, y_limit_bottom, percentile, file, geo_df, - ref_y, ref_z, beam, epsg_num): + +def produce_figures( + binned_data, + bath_height, + sea_height, + solo_sea_surface_label, + y_limit_top, + y_limit_bottom, + percentile, + file, + geo_df, + ref_y, + ref_z, + beam, + epsg_num, +): """Create figures""" # Create bins for latitude - bath_x_axis_bins = 
np.linspace(binned_data.lat.min(), - binned_data.lat.max(), len(bath_height))+20 + bath_x_axis_bins = ( + np.linspace(binned_data.lat.min(), binned_data.lat.max(), len(bath_height)) + 20 + ) - sea_surface_x_axis_bins = np.linspace(binned_data.lat.min(), - binned_data.lat.max(), len(sea_height))+10 + sea_surface_x_axis_bins = ( + np.linspace(binned_data.lat.min(), binned_data.lat.max(), len(sea_height)) + 10 + ) # Create new dataframes for median values - bath_median_df = pd.DataFrame({'x': bath_x_axis_bins, 'y': bath_height}) + bath_median_df = pd.DataFrame({"x": bath_x_axis_bins, "y": bath_height}) # Create uniform sea surface based on median sea surface values and filter out surface breaching sea_height1 = [np.nanmedian(sea_height) if i == i else np.nan for i in sea_height] - sea_median_df = pd.DataFrame({'x': sea_surface_x_axis_bins, 'y': sea_height1}) + sea_median_df = pd.DataFrame({"x": sea_surface_x_axis_bins, "y": sea_height1}) # Create uniform solo sea surface label sea_surface_label = solo_sea_surface_label - sea_surface_label_df = pd.DataFrame({'x': sea_surface_x_axis_bins, 'y': sea_surface_label}) + sea_surface_label_df = pd.DataFrame( + {"x": sea_surface_x_axis_bins, "y": sea_surface_label} + ) idx_1 = np.where(sea_surface_label_df.y == 1) idx_0 = np.where(sea_surface_label_df.y == 0) @@ -199,65 +321,110 @@ def produce_figures(binned_data, bath_height, sea_height, solo_sea_surface_label # plt.scatter(x=binned_data.lat, # y = binned_data.photon_height, marker='o', lw=0, s=1, alpha = 0.8, # c = 'yellow', label = 'Raw photon height') - plt.scatter(ref_y, ref_z, s=0.5, alpha=0.1, c='black') - plt.scatter(geo_df.lat, geo_df.photon_height, s=0.8, marker = 'o', - alpha=0.1, c='red', label='Classified Photons') + plt.scatter(ref_y, ref_z, s=0.5, alpha=0.1, c="black") + plt.scatter( + geo_df.lat, + geo_df.photon_height, + s=0.8, + marker="o", + alpha=0.1, + c="red", + label="Classified Photons", + ) # plt.scatter(x=geo_df.lat, # y = geo_df.photon_height, 
marker='o', lw=0, s=0.8, # alpha = 0.8, c = 'black', label = 'Corrected photon bin') # Plot median values - plt.scatter(bath_median_df.x, bath_median_df.y, - marker='o', c='r', alpha=0.8, s=2, label='Median bathymetry') - - plt.scatter(sea_median_df.x, sea_median_df.y, - marker='o', c='b', alpha=1, s=2, label='Median sea surface') - - plt.scatter(sea_surface_label_df.iloc[idx_1].x, sea_surface_label_df.iloc[idx_1].y, - marker='o', c='pink', alpha=1, s=3, label='solo_sea_surface') - plt.scatter(sea_surface_label_df.iloc[idx_0].x, sea_surface_label_df.iloc[idx_0].y, - marker='o', c='g', alpha=1, s=3, label='non_solo_sea_surface') + plt.scatter( + bath_median_df.x, + bath_median_df.y, + marker="o", + c="r", + alpha=0.8, + s=2, + label="Median bathymetry", + ) + + plt.scatter( + sea_median_df.x, + sea_median_df.y, + marker="o", + c="b", + alpha=1, + s=2, + label="Median sea surface", + ) + + plt.scatter( + sea_surface_label_df.iloc[idx_1].x, + sea_surface_label_df.iloc[idx_1].y, + marker="o", + c="pink", + alpha=1, + s=3, + label="solo_sea_surface", + ) + plt.scatter( + sea_surface_label_df.iloc[idx_0].x, + sea_surface_label_df.iloc[idx_0].y, + marker="o", + c="g", + alpha=1, + s=3, + label="non_solo_sea_surface", + ) # Insert titles and subtitles - plt.title('Icesat2 Bathymetry\n' + file) - plt.xlabel('Latitude', fontsize=25) - plt.ylabel('Photon Height (m)', fontsize=25) + plt.title("Icesat2 Bathymetry\n" + file) + plt.xlabel("Latitude", fontsize=25) + plt.ylabel("Photon Height (m)", fontsize=25) plt.xticks(fontsize=16) plt.yticks(fontsize=16) - plt.legend(loc="upper left", prop={'size': 20}) + plt.legend(loc="upper left", prop={"size": 20}) # Limit the x and y axes using parameters plt.xlim(left=binned_data.lat.min(), right=binned_data.lat.max()) plt.ylim(top=y_limit_top, bottom=y_limit_bottom) timestr = time.strftime("%Y%m%d_%H%M%S") - file = file.replace('.h5', '') + file = file.replace(".h5", "") # Define where to save file plt.tight_layout() - 
plt.savefig("C:/Workstation/ICESat2_HLS/" + file + '_gt' + - str(beam) + '_' + str(percentile) + - '_EPSG' + str(epsg_num) + '_' + timestr + ".pdf") + plt.savefig( + "C:/Workstation/ICESat2_HLS/" + + file + + "_gt" + + str(beam) + + "_" + + str(percentile) + + "_EPSG" + + str(epsg_num) + + "_" + + timestr + + ".pdf" + ) # plt.show() # plt.close() # convert corrected locations back to wgs84 (useful to contain) - transformer = Transformer.from_crs("EPSG:" + str(epsg_num), - "EPSG:4326", always_xy=True) + transformer = Transformer.from_crs( + "EPSG:" + str(epsg_num), "EPSG:4326", always_xy=True + ) print(transformer) - lon_wgs84, lat_wgs84 = transformer.transform( - geo_df.lon.values, geo_df.lat.values) + lon_wgs84, lat_wgs84 = transformer.transform(geo_df.lon.values, geo_df.lat.values) - geo_df['lon_wgs84'] = lon_wgs84 - geo_df['lat_wgs84'] = lat_wgs84 + geo_df["lon_wgs84"] = lon_wgs84 + geo_df["lat_wgs84"] = lat_wgs84 - geodf = gpd.GeoDataFrame(geo_df, - geometry=gpd.points_from_xy(geo_df.lon_wgs84, - geo_df.lat_wgs84)) + geodf = gpd.GeoDataFrame( + geo_df, geometry=gpd.points_from_xy(geo_df.lon_wgs84, geo_df.lat_wgs84) + ) geodf.set_crs(epsg=4326, inplace=True) # geodf.to_file("C:/Workstation/ICESat2_HLS/" +file + '_gt' + '_' + str(percentile) + '_EPSG' + # str(epsg_num) + '_' + timestr + ".gpkg", - # driver="GPKG") \ No newline at end of file + # driver="GPKG") diff --git a/code_of_conduct.md b/code_of_conduct.md index 4e142be..28d02ee 100644 --- a/code_of_conduct.md +++ b/code_of_conduct.md @@ -130,4 +130,4 @@ For answers to common questions about this code of conduct, see the FAQ at [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html [Mozilla CoC]: https://github.com/mozilla/diversity [FAQ]: https://www.contributor-covenant.org/faq -[translations]: https://www.contributor-covenant.org/translations \ No newline at end of file +[translations]: https://www.contributor-covenant.org/translations diff --git a/icesat-2_kdph_py/config.py 
b/icesat-2_kdph_py/config.py index 863cc84..e8dc43b 100644 --- a/icesat-2_kdph_py/config.py +++ b/icesat-2_kdph_py/config.py @@ -25,7 +25,6 @@ # # ATL03_h5_file = "processed_ATL03_20200331204156_00810707_006_01.h5"# - # #China Bohai # # ATL03_h5_file = "processed_ATL03_20191128115256_09560502_006_01.h5"# # # ATL03_h5_file = "processed_ATL03_20190530203316_09560302_006_02.h5"# @@ -33,7 +32,6 @@ # ATL03_h5_file = "processed_ATL03_20190829161305_09560402_006_02.h5"# - # ATL03_h5_file_path = path + ATL03_h5_file # Current_Path = 'C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/Dataset/' @@ -66,13 +64,20 @@ import argparse + def get_args(): - parser = argparse.ArgumentParser(description="Configure ICESat-2 HLS analysis script.") - + parser = argparse.ArgumentParser( + description="Configure ICESat-2 HLS analysis script." + ) + # General paths - parser.add_argument("--workspace_path", type=str, default='C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/', - help="Root path for all data processing.") - + parser.add_argument( + "--workspace_path", + type=str, + default="C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/", + help="Root path for all data processing.", + ) + # parser.add_argument("--atl03_path", type=str, default='Dataset/ATL03_ICESat2/New/', # help="Base path for ICESat-2 ATL03 data.") @@ -81,218 +86,379 @@ def get_args(): # help="Base path for ICESat-2 ATL03 data.") # Wax Delta (v007) — required for IR/AP filter sensitivity test - parser.add_argument("--atl03_path", type=str, default='Dataset/ATL03_ICESat2/Wax_Delta/', - help="Base path for ICESat-2 ATL03 data.") - - - + parser.add_argument( + "--atl03_path", + type=str, + default="Dataset/ATL03_ICESat2/Wax_Delta/", + help="Base path for ICESat-2 ATL03 data.", + ) + # #ChesapeakeBay # parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20230825074121_10102002_006_02.h5", - # help="Name of the ATL03 H5 file to process.") - + # help="Name of the ATL03 H5 file to process.") + # 
parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20221001113813_01641706_006_01.h5", # help="Name of the ATL03 H5 file to process.") - + # # India: processed_ATL03_20200331204156_00810707_006_01.h5 # parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20200331204156_00810707_006_01.h5", # help="Name of the ATL03 H5 file to process.") - + # #ATL03_20210628230109_00811207_006_01_subsetted - + # parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20210628230109_00811207_006_01.h5", # help="Name of the ATL03 H5 file to process.") - - + # # WaxDelta # parser.add_argument("--atl03_file", type=str, default="ATL03_20231103172707_06982106_006_01_subsetted.h5", # help="Name of the ATL03 H5 file to process.") - - + # # Bohai (v006) — baseline sensitivity test site # parser.add_argument("--atl03_file", type=str, default="ATL03_20190829161305_09560402_006_02_subsetted.h5", # help="Name of the ATL03 H5 file to process.") # Wax Delta (v007) — required for IR/AP filter; use with atl03_path = Dataset/ATL03_ICESat2/Wax_Delta/ - parser.add_argument("--atl03_file", type=str, default="ATL03_20231103172707_06982106_007_01_subsetted.h5", - help="Name of the ATL03 H5 file to process.") + parser.add_argument( + "--atl03_file", + type=str, + default="ATL03_20231103172707_06982106_007_01_subsetted.h5", + help="Name of the ATL03 H5 file to process.", + ) # # Cook Inlet # parser.add_argument("--atl03_file", type=str, default="ATL03_20220507112145_06931503_006_01_subsetted.h5", # help="Name of the ATL03 H5 file to process.") - - - + # #Core Sound # parser.add_argument("--atl03_file", type=str, default="105430185_ATL03_20220423191113_04841506_006_02_subsetted.h5", # help="Name of the ATL03 H5 file to process.") - + # # Pamlico Sound # parser.add_argument("--atl03_file", type=str, default="ATL03_20240415083629_04232306_006_01_subsetted.h5", # help="Name of the ATL03 H5 file to process.") - - + # # #Cook Inlet: 
processed_ATL03_20210801125753_05941205_005_01 # parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20210801125753_05941205_005_01.h5", # help="Name of the ATL03 H5 file to process.") - - parser.add_argument("--other_data_path", type=str, default='Dataset/', - help="Path to the current working directory.") - - parser.add_argument("--output_path", type=str, default='Results/', - help="Directory for saving results.") - + + parser.add_argument( + "--other_data_path", + type=str, + default="Dataset/", + help="Path to the current working directory.", + ) + + parser.add_argument( + "--output_path", + type=str, + default="Results/", + help="Directory for saving results.", + ) + # Shoreline data - parser.add_argument("--shoreline_data", type=str, - # default='Shorelines/GeoPkgGlobalShoreline.gpkg', - default='Shorelines/ne_10m_land/ne_10m_land.shp', - help="Path to the global shoreline dataset.") - + parser.add_argument( + "--shoreline_data", + type=str, + # default='Shorelines/GeoPkgGlobalShoreline.gpkg', + default="Shorelines/ne_10m_land/ne_10m_land.shp", + help="Path to the global shoreline dataset.", + ) + # Bathymetry datasets - parser.add_argument("--gebco_path", nargs='+', type=str, - default='Bathy/gebco_2024_geotiff/', - help="Path to GEBCO bathymetry datasets.") - + parser.add_argument( + "--gebco_path", + nargs="+", + type=str, + default="Bathy/gebco_2024_geotiff/", + help="Path to GEBCO bathymetry datasets.", + ) + # Resolution settings - parser.add_argument("--horizontal_res", type=int, default=500, - help="Horizontal resolution for the analysis (in meters).") - - parser.add_argument("--vertical_res", type=float, default=0.25, - help="Vertical resolution for the analysis.") - + parser.add_argument( + "--horizontal_res", + type=int, + default=500, + help="Horizontal resolution for the analysis (in meters).", + ) + + parser.add_argument( + "--vertical_res", + type=float, + default=0.25, + help="Vertical resolution for the analysis.", + ) + # 
Analysis parameters - parser.add_argument("--subsurface_thresh", type=float, default=1.0, - help="Threshold for photons below the sea surface.") - - parser.add_argument("--ignore_subsurface_height_thres", type=float, default=-6, - help="Maximum depth thres for Kd calculations (in meters).") #5m + parser.add_argument( + "--subsurface_thresh", + type=float, + default=1.0, + help="Threshold for photons below the sea surface.", + ) + + parser.add_argument( + "--ignore_subsurface_height_thres", + type=float, + default=-6, + help="Maximum depth thres for Kd calculations (in meters).", + ) # 5m # Empty string '' auto-selects the first strong beam. Pass comma-separated IDs to override. # Valid IDs: gt1l, gt1r, gt2l, gt2r, gt3l, gt3r — only strong beams for the granule are kept. # To process all strong beams, use --target_beams "gt1r,gt2r,gt3r". - parser.add_argument("--target_beams", type=str, default='', - help="Comma-separated beam IDs to process. Empty means auto-select first strong beam.") - - parser.add_argument("--disable_solar_background_filter", action="store_true", - help="Disable solar background filtering (enabled by default; " - "self-gates on solar elevation so has no effect on nighttime passes).") - parser.add_argument("--solar_elevation_day_threshold", type=float, default=6.0, - help="Solar elevation threshold (deg) to define daytime for optional filtering.") - parser.add_argument("--solar_bg_median_window_deg", type=float, default=10.0, - help="Rolling median window width (degrees of solar elevation) for " - "the solar-elevation-aware background filter.") - parser.add_argument("--solar_bg_noise_multiplier", type=float, default=1.5, - help="Ratio of actual/expected background rate above which a photon " - "is flagged as noisy by the solar background filter.") - parser.add_argument("--solar_background_min_signal_conf", type=int, default=2, - help="Minimum photon signal confidence kept in high daytime background.") - - 
parser.add_argument("--enable_ir_ap_filter", action="store_true", - help="Optional: remove photons using IR/afterpulse proxy flags.") - parser.add_argument("--ir_ap_quality_max", type=int, default=0, - help="Maximum allowed quality_ph value when IR/AP filter is enabled.") - parser.add_argument("--ir_ap_min_signal_conf", type=int, default=0, - help="Minimum allowed photon_conf value when IR/AP filter is enabled.") - - parser.add_argument("--enable_gebco_filter", action="store_true", - help="Optional: remove photons below GEBCO seafloor and shallow bins.") - parser.add_argument("--enable_sea_surface_flattening", action="store_true", - help="Optional: flatten small-bin sea-surface variation using larger along-track windows.") - parser.add_argument("--sea_surface_flattening_window_m", type=int, default=5000, - help="Window size in meters for sea-surface flattening mean level. " - "Must be larger than horizontal_res to have any effect.") - - parser.add_argument("--enable_convex_hull_filter", action="store_true", - help="Optional: apply convex hull area filtering before Kd fitting.") - parser.add_argument("--convex_hull_area_threshold", type=float, default=100, - help="Minimum convex hull area to keep a horizontal bin when enabled.") - - parser.add_argument("--enable_histogram_quality_filter", action="store_true", - help="Optional: discard along-track bins with weak 6-7 m signal.") - parser.add_argument("--histogram_quality_min_ratio", type=float, default=0.05, - help="Minimum ratio of photons in depth band to keep a bin.") - parser.add_argument("--histogram_quality_depth_min", type=float, default=6.0, - help="Minimum depth (m below surface) of quality-check band.") - parser.add_argument("--histogram_quality_depth_max", type=float, default=7.0, - help="Maximum depth (m below surface) of quality-check band.") - parser.add_argument("--histogram_quality_ref_depth_min", type=float, default=0.0, - help="Min depth (m) of near-surface reference band for decay ratio.") - 
parser.add_argument("--histogram_quality_ref_depth_max", type=float, default=1.0, - help="Max depth (m) of near-surface reference band for decay ratio.") - - parser.add_argument("--enable_surface_sigma_filter", action="store_true", - help="Optional: discard bins where Gaussian surface sigma is too large.") - parser.add_argument("--surface_sigma_max", type=float, default=0.5, - help="Maximum allowed Gaussian sigma (m) for sea-surface peak.") - - parser.add_argument("--enable_refraction_correction", action="store_true", - help="Optional: apply refraction correction to subsurface photons.") - parser.add_argument("--refraction_water_temp_c", type=float, default=20.0, - help="Water temperature (deg C) used for refraction correction.") - parser.add_argument("--refraction_wavelength_nm", type=float, default=532.0, - help="Laser wavelength (nm) used for refraction correction.") - - parser.add_argument("--enable_post_refraction_refit", action="store_true", - help="Optional: rebuild histograms and re-fit surface after refraction correction.") - - parser.add_argument("--enable_atl24_filter", action="store_true", - help="Optional: use ATL24 bathymetry matching before GEBCO fallback.") - parser.add_argument("--atl24_file", type=str, default='', - help="Path to ATL24 point dataset (CSV/Parquet/GPKG/SHP) for bathymetry matching.") - parser.add_argument("--atl24_max_match_distance_deg", type=float, default=0.01, - help="Maximum lon/lat degree distance for ATL24 nearest-point matching.") - - parser.add_argument("--decay_zone_threshold", type=float, default=0.0, - help="Fraction of peak photon count below which depth bins are excluded " - "in Step 16 decay zone detection. 0.0 = original behaviour (only " - "zero-count bins removed). E.g. 0.01 removes bins with < 1%% of peak.") - - parser.add_argument("--kd_fit_method", type=str, default="log_linear", - choices=["log_linear", "bg_subtract", "breakpoint", "nonlinear", "hybrid"], - help="Beer's Law fitting strategy for Kd calculation. 
" - "log_linear = original log-space linear regression; " - "bg_subtract = subtract noise floor then log-linear; " - "breakpoint = segmented regression with breakpoint detection; " - "nonlinear = fit C(z) = A*exp(-Kd*z) + N directly.") - - parser.add_argument("--enable_wave_adaptive_fit", action="store_true", - help="Optional: trim shallow depth bins by wave amplitude before Beer's Law fit. " - "Uses per-bin surface sigma to skip the top k*sigma metres in wavy bins.") - parser.add_argument("--wave_exclusion_multiplier", type=float, default=4.0, - help="Number of surface sigmas to skip from the actual water surface when " - "wave-adaptive fit is enabled. The adaptive surface detection already " - "removes 3*sigma; this adds (k-3)*sigma additional margin for bubble " - "injection zone. Default 4.0 adds 1*sigma beyond the 3-sigma cutoff.") - parser.add_argument("--wave_sigma_calm_threshold", type=float, default=0.1, - help="Surface sigma (m) below which a bin is considered calm and no " - "wave trimming is applied. Default 0.1 m.") - - parser.add_argument("--enable_paired_beam_combine", action="store_true", - help="Optional: combine paired beams (gt1, gt2, gt3) before Kd fitting.") - - parser.add_argument("--no_plot", action="store_true", - help="Suppress all plot generation (useful for batch runs).") + parser.add_argument( + "--target_beams", + type=str, + default="", + help="Comma-separated beam IDs to process. 
Empty means auto-select first strong beam.", + ) + + parser.add_argument( + "--disable_solar_background_filter", + action="store_true", + help="Disable solar background filtering (enabled by default; " + "self-gates on solar elevation so has no effect on nighttime passes).", + ) + parser.add_argument( + "--solar_elevation_day_threshold", + type=float, + default=6.0, + help="Solar elevation threshold (deg) to define daytime for optional filtering.", + ) + parser.add_argument( + "--solar_bg_median_window_deg", + type=float, + default=10.0, + help="Rolling median window width (degrees of solar elevation) for " + "the solar-elevation-aware background filter.", + ) + parser.add_argument( + "--solar_bg_noise_multiplier", + type=float, + default=1.5, + help="Ratio of actual/expected background rate above which a photon " + "is flagged as noisy by the solar background filter.", + ) + parser.add_argument( + "--solar_background_min_signal_conf", + type=int, + default=2, + help="Minimum photon signal confidence kept in high daytime background.", + ) + + parser.add_argument( + "--enable_ir_ap_filter", + action="store_true", + help="Optional: remove photons using IR/afterpulse proxy flags.", + ) + parser.add_argument( + "--ir_ap_quality_max", + type=int, + default=0, + help="Maximum allowed quality_ph value when IR/AP filter is enabled.", + ) + parser.add_argument( + "--ir_ap_min_signal_conf", + type=int, + default=0, + help="Minimum allowed photon_conf value when IR/AP filter is enabled.", + ) + + parser.add_argument( + "--enable_gebco_filter", + action="store_true", + help="Optional: remove photons below GEBCO seafloor and shallow bins.", + ) + parser.add_argument( + "--enable_sea_surface_flattening", + action="store_true", + help="Optional: flatten small-bin sea-surface variation using larger along-track windows.", + ) + parser.add_argument( + "--sea_surface_flattening_window_m", + type=int, + default=5000, + help="Window size in meters for sea-surface flattening mean level. 
" + "Must be larger than horizontal_res to have any effect.", + ) + + parser.add_argument( + "--enable_convex_hull_filter", + action="store_true", + help="Optional: apply convex hull area filtering before Kd fitting.", + ) + parser.add_argument( + "--convex_hull_area_threshold", + type=float, + default=100, + help="Minimum convex hull area to keep a horizontal bin when enabled.", + ) + + parser.add_argument( + "--enable_histogram_quality_filter", + action="store_true", + help="Optional: discard along-track bins with weak 6-7 m signal.", + ) + parser.add_argument( + "--histogram_quality_min_ratio", + type=float, + default=0.05, + help="Minimum ratio of photons in depth band to keep a bin.", + ) + parser.add_argument( + "--histogram_quality_depth_min", + type=float, + default=6.0, + help="Minimum depth (m below surface) of quality-check band.", + ) + parser.add_argument( + "--histogram_quality_depth_max", + type=float, + default=7.0, + help="Maximum depth (m below surface) of quality-check band.", + ) + parser.add_argument( + "--histogram_quality_ref_depth_min", + type=float, + default=0.0, + help="Min depth (m) of near-surface reference band for decay ratio.", + ) + parser.add_argument( + "--histogram_quality_ref_depth_max", + type=float, + default=1.0, + help="Max depth (m) of near-surface reference band for decay ratio.", + ) + + parser.add_argument( + "--enable_surface_sigma_filter", + action="store_true", + help="Optional: discard bins where Gaussian surface sigma is too large.", + ) + parser.add_argument( + "--surface_sigma_max", + type=float, + default=0.5, + help="Maximum allowed Gaussian sigma (m) for sea-surface peak.", + ) + + parser.add_argument( + "--enable_refraction_correction", + action="store_true", + help="Optional: apply refraction correction to subsurface photons.", + ) + parser.add_argument( + "--refraction_water_temp_c", + type=float, + default=20.0, + help="Water temperature (deg C) used for refraction correction.", + ) + parser.add_argument( + 
"--refraction_wavelength_nm", + type=float, + default=532.0, + help="Laser wavelength (nm) used for refraction correction.", + ) + + parser.add_argument( + "--enable_post_refraction_refit", + action="store_true", + help="Optional: rebuild histograms and re-fit surface after refraction correction.", + ) + + parser.add_argument( + "--enable_atl24_filter", + action="store_true", + help="Optional: use ATL24 bathymetry matching before GEBCO fallback.", + ) + parser.add_argument( + "--atl24_file", + type=str, + default="", + help="Path to ATL24 point dataset (CSV/Parquet/GPKG/SHP) for bathymetry matching.", + ) + parser.add_argument( + "--atl24_max_match_distance_deg", + type=float, + default=0.01, + help="Maximum lon/lat degree distance for ATL24 nearest-point matching.", + ) + + parser.add_argument( + "--decay_zone_threshold", + type=float, + default=0.0, + help="Fraction of peak photon count below which depth bins are excluded " + "in Step 16 decay zone detection. 0.0 = original behaviour (only " + "zero-count bins removed). E.g. 0.01 removes bins with < 1%% of peak.", + ) + + parser.add_argument( + "--kd_fit_method", + type=str, + default="log_linear", + choices=["log_linear", "bg_subtract", "breakpoint", "nonlinear", "hybrid"], + help="Beer's Law fitting strategy for Kd calculation. " + "log_linear = original log-space linear regression; " + "bg_subtract = subtract noise floor then log-linear; " + "breakpoint = segmented regression with breakpoint detection; " + "nonlinear = fit C(z) = A*exp(-Kd*z) + N directly.", + ) + + parser.add_argument( + "--enable_wave_adaptive_fit", + action="store_true", + help="Optional: trim shallow depth bins by wave amplitude before Beer's Law fit. " + "Uses per-bin surface sigma to skip the top k*sigma metres in wavy bins.", + ) + parser.add_argument( + "--wave_exclusion_multiplier", + type=float, + default=4.0, + help="Number of surface sigmas to skip from the actual water surface when " + "wave-adaptive fit is enabled. 
The adaptive surface detection already " + "removes 3*sigma; this adds (k-3)*sigma additional margin for bubble " + "injection zone. Default 4.0 adds 1*sigma beyond the 3-sigma cutoff.", + ) + parser.add_argument( + "--wave_sigma_calm_threshold", + type=float, + default=0.1, + help="Surface sigma (m) below which a bin is considered calm and no " + "wave trimming is applied. Default 0.1 m.", + ) + + parser.add_argument( + "--enable_paired_beam_combine", + action="store_true", + help="Optional: combine paired beams (gt1, gt2, gt3) before Kd fitting.", + ) + + parser.add_argument( + "--no_plot", + action="store_true", + help="Suppress all plot generation (useful for batch runs).", + ) # switch on consider the coastal water or inland water # if Inland water, we should use one mask and for coastal water, it should be another mask - + # consider bathymetry or not manual setting - - - #night vs daytime - - #solar elevation detemine remove or not for solar background - - - # 1-2 hours - - + + # night vs daytime + + # solar elevation detemine remove or not for solar background + + # 1-2 hours + return parser.parse_args() + if __name__ == "__main__": args = get_args() # Access parameters like this: atl03_file_path = args.workspace_path + args.atl03_file - + print("Processing file:", atl03_file_path) print("Output path:", args.output_path) diff --git a/icesat-2_kdph_py/kd_utils/Kd_analysis.py b/icesat-2_kdph_py/kd_utils/Kd_analysis.py index e706d23..9cf2210 100644 --- a/icesat-2_kdph_py/kd_utils/Kd_analysis.py +++ b/icesat-2_kdph_py/kd_utils/Kd_analysis.py @@ -1,61 +1,62 @@ # utils/Kd_analysis.py # updated to perform a linear fit in log-space, just like MATLAB's polyfitn(zdepth, y, 1) for a first-order polynomial. 
+import logging + import numpy as np import pandas as pd -import logging from scipy.optimize import curve_fit from sklearn.linear_model import LinearRegression # def log_model(z, kd, e0): # return np.log(e0) - kd * z -## This is wrong because the input is already a bined data, +## This is wrong because the input is already a bined data, ## it is not necessary to do a histogram again # def CalculateKdFromFilteredSubsurfacePhoton(df, vertical_res=0.8): # if df.empty or 'lat_bins' not in df.columns: # return pd.DataFrame({'lat_bins': [np.nan], 'kd': [np.nan], 'e0': [np.nan], 'latitude': [np.nan], 'longitude': [np.nan]}) - + # # Get the latitude bin value # lat_bin_value = df['lat_bins'].iloc[0] if not df['lat_bins'].empty else np.nan # latitude = df['latitude'].mean() if 'latitude' in df.columns else df['lat'].mean() if 'lat' in df.columns else np.nan # longitude = df['longitude'].mean() if 'longitude' in df.columns else df['lon'].mean() if 'lon' in df.columns else np.nan - + # # Calculate photon height range # photon_height_min = df['photon_height'].min() # photon_height_max = df['photon_height'].max() - + # # Check for sufficient data range # if np.isnan(photon_height_min) or np.isnan(photon_height_max) or photon_height_min == photon_height_max: # return pd.DataFrame({'lat_bins': [lat_bin_value], 'kd': [np.nan], 'e0': [np.nan]}) - -# height_bins_range = abs(photon_height_max - photon_height_min) + +# height_bins_range = abs(photon_height_max - photon_height_min) # height_bins_number = round(height_bins_range / vertical_res) - + # # Ensure there are enough bins # if height_bins_number < 5: # return pd.DataFrame({'lat_bins': [lat_bin_value], 'kd': [np.nan], 'e0': [np.nan]}) - + # # Create histogram of photon heights # bin_edges = np.linspace(photon_height_min, photon_height_max, num=height_bins_number) # counts, _ = np.histogram(df['photon_height'], bins=bin_edges) # bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2 - + # # Store histogram data # hist_df = 
pd.DataFrame({'zdepth': bin_centers, 'photon_counts': counts}) - + # # x value for model # # Reverse zdepth to align with the MATLAB approach # hist_df['zdepth'] = hist_df['zdepth'].max() - hist_df['zdepth'] - + # # Log-transform photon counts, replacing zeros with NaN # # y value for model # hist_df['log_photon_counts'] = np.log(hist_df['photon_counts'].replace(0, np.nan)) # hist_df.loc[np.isinf(hist_df['log_photon_counts']), 'log_photon_counts'] = np.nan - + # # Filter out rows with NaNs in either column for regression # valid_data = hist_df.dropna(subset=['zdepth', 'log_photon_counts']) - + # # Check for enough valid data points # # Skip the regression if there are fewer than 5 datapoints # if valid_data['log_photon_counts'].notna().sum() > 3: @@ -66,22 +67,22 @@ # # Perform linear regression # model = LinearRegression() # model.fit(zdepth_valid, log_counts_valid) - + # # Extract kd as the negative of the slope and e0 from the intercept # kd = -model.coef_[0] - + # print('zdepth_valid:',zdepth_valid) # print('photon_counts:',hist_df['photon_counts']) # print('kd:',kd) - + # e0 = np.exp(model.intercept_) - + # # Set kd to NaN if negative # if kd < 0: # kd = np.nan # else: # kd, e0 = np.nan, np.nan - + # return pd.DataFrame({ # 'lat_bins': [lat_bin_value], # 'kd': [kd], @@ -95,47 +96,47 @@ # def CalculateKdFromFilteredSubsurfacePhoton(df, vertical_res=0.25): # if df.empty or 'lat_bins' not in df.columns: # return pd.DataFrame({'lat_bins': [np.nan], 'kd': [np.nan], 'e0': [np.nan], 'latitude': [np.nan], 'longitude': [np.nan]}) - + # # Get the latitude bin value # lat_bin_value = df['lat_bins'].iloc[0] if not df['lat_bins'].empty else np.nan # latitude = df['latitude'].mean() if 'latitude' in df.columns else df['lat'].mean() if 'lat' in df.columns else np.nan # longitude = df['longitude'].mean() if 'longitude' in df.columns else df['lon'].mean() if 'lon' in df.columns else np.nan - + # # Calculate photon height range # photon_height_min = df['photon_height'].min() # 
photon_height_max = df['photon_height'].max() - + # # Check for sufficient data range # if np.isnan(photon_height_min) or np.isnan(photon_height_max) or photon_height_min == photon_height_max: # return pd.DataFrame({'lat_bins': [lat_bin_value], 'kd': [np.nan], 'e0': [np.nan]}) - -# height_bins_range = abs(photon_height_max - photon_height_min) + +# height_bins_range = abs(photon_height_max - photon_height_min) # height_bins_number = round(height_bins_range / vertical_res) - + # # Ensure there are enough bins # if height_bins_number < 5: # return pd.DataFrame({'lat_bins': [lat_bin_value], 'kd': [np.nan], 'e0': [np.nan]}) - + # # Create histogram of photon heights # bin_edges = np.linspace(photon_height_min, photon_height_max, num=height_bins_number) # counts, _ = np.histogram(df['photon_height'], bins=bin_edges) # bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2 - + # # Store histogram data # hist_df = pd.DataFrame({'zdepth': bin_centers, 'photon_counts': counts}) - + # # x value for model # # Reverse zdepth to align with the MATLAB approach # hist_df['zdepth'] = hist_df['zdepth'].max() - hist_df['zdepth'] - + # # Log-transform photon counts, replacing zeros with NaN # # y value for model # hist_df['log_photon_counts'] = np.log(hist_df['photon_counts'].replace(0, np.nan)) # hist_df.loc[np.isinf(hist_df['log_photon_counts']), 'log_photon_counts'] = np.nan - + # # Filter out rows with NaNs in either column for regression # valid_data = hist_df.dropna(subset=['zdepth', 'log_photon_counts']) - + # # Check for enough valid data points # # Skip the regression if there are fewer than 5 datapoints # if valid_data['log_photon_counts'].notna().sum() > 3: @@ -146,22 +147,22 @@ # # Perform linear regression # model = LinearRegression() # model.fit(zdepth_valid, log_counts_valid) - + # # Extract kd as the negative of the slope and e0 from the intercept # kd = -model.coef_[0] - + # print('zdepth_valid:',zdepth_valid) # print('photon_counts:',hist_df['photon_counts']) # 
print('kd:',kd) - + # e0 = np.exp(model.intercept_) - + # # Set kd to NaN if negative # if kd < 0: # kd = np.nan # else: # kd, e0 = np.nan, np.nan - + # return pd.DataFrame({ # 'lat_bins': [lat_bin_value], # 'kd': [kd], @@ -199,13 +200,15 @@ def find_exponential_decay_zone(hist_df, decay_threshold=0.0): # Apply decay threshold: exclude bins below threshold fraction of peak signal if decay_threshold > 0.0: - peak_count = df['photon_counts'].max() + peak_count = df["photon_counts"].max() if peak_count > 0: - df.loc[df['photon_counts'] < decay_threshold * peak_count, 'photon_counts'] = 0 + df.loc[ + df["photon_counts"] < decay_threshold * peak_count, "photon_counts" + ] = 0 - df['log_photon_counts'] = np.log(df['photon_counts'].replace(0, np.nan)) - df.loc[np.isinf(df['log_photon_counts']), 'log_photon_counts'] = np.nan - return df.dropna(subset=['zdepth', 'log_photon_counts']) + df["log_photon_counts"] = np.log(df["photon_counts"].replace(0, np.nan)) + df.loc[np.isinf(df["log_photon_counts"]), "log_photon_counts"] = np.nan + return df.dropna(subset=["zdepth", "log_photon_counts"]) def fit_beers_law(valid_data): @@ -227,11 +230,11 @@ def fit_beers_law(valid_data): tuple[float, float] (kd, e0). kd is set to np.nan if negative or if fewer than 4 points. """ - if valid_data['log_photon_counts'].notna().sum() <= 3: + if valid_data["log_photon_counts"].notna().sum() <= 3: return np.nan, np.nan - zdepth_valid = valid_data['zdepth'].values.reshape(-1, 1) - log_counts_valid = valid_data['log_photon_counts'].values + zdepth_valid = valid_data["zdepth"].values.reshape(-1, 1) + log_counts_valid = valid_data["log_photon_counts"].values model = LinearRegression() model.fit(zdepth_valid, log_counts_valid) @@ -264,25 +267,25 @@ def fit_beers_law_bg_subtract(hist_df, bg_fraction=0.2): tuple[float, float, float] (kd, e0, noise_floor). 
""" - df = hist_df.copy().sort_values('zdepth') + df = hist_df.copy().sort_values("zdepth") n_bins = len(df) if n_bins < 5: return np.nan, np.nan, np.nan # Estimate noise floor from the deepest bg_fraction of bins n_bg = max(int(n_bins * bg_fraction), 2) - noise_floor = float(df.tail(n_bg)['photon_counts'].median()) + noise_floor = float(df.tail(n_bg)["photon_counts"].median()) # Subtract noise and keep only positive residuals - df['signal'] = df['photon_counts'] - noise_floor - df = df[df['signal'] > 0].copy() + df["signal"] = df["photon_counts"] - noise_floor + df = df[df["signal"] > 0].copy() if len(df) < 4: return np.nan, np.nan, noise_floor - df['log_signal'] = np.log(df['signal']) + df["log_signal"] = np.log(df["signal"]) - zdepth = df['zdepth'].values.reshape(-1, 1) - log_signal = df['log_signal'].values + zdepth = df["zdepth"].values.reshape(-1, 1) + log_signal = df["log_signal"].values model = LinearRegression() model.fit(zdepth, log_signal) @@ -317,14 +320,14 @@ def fit_beers_law_breakpoint(hist_df): tuple[float, float, float, float] (kd, e0, breakpoint_depth, noise_floor). """ - df = hist_df.copy().sort_values('zdepth') - df = df[df['photon_counts'] > 0].copy() + df = hist_df.copy().sort_values("zdepth") + df = df[df["photon_counts"] > 0].copy() if len(df) < 5: return np.nan, np.nan, np.nan, np.nan - df['log_counts'] = np.log(df['photon_counts']) - depths = df['zdepth'].values - log_counts = df['log_counts'].values + df["log_counts"] = np.log(df["photon_counts"]) + depths = df["zdepth"].values + log_counts = df["log_counts"].values n = len(depths) best_rss = np.inf @@ -392,16 +395,16 @@ def fit_beers_law_nonlinear(hist_df): tuple[float, float, float] (kd, e0, noise_floor). 
""" - df = hist_df.copy().sort_values('zdepth') - depths = df['zdepth'].values - counts = df['photon_counts'].values.astype(float) + df = hist_df.copy().sort_values("zdepth") + depths = df["zdepth"].values + counts = df["photon_counts"].values.astype(float) if len(depths) < 5: return np.nan, np.nan, np.nan # Initial guesses A0 = float(counts.max()) - N0 = float(np.median(counts[len(counts) * 3 // 4:])) # deepest 25% + N0 = float(np.median(counts[len(counts) * 3 // 4 :])) # deepest 25% # Quick log-linear Kd estimate for initial guess (ignore noise) pos = counts > 0 if pos.sum() >= 2: @@ -413,10 +416,12 @@ def fit_beers_law_nonlinear(hist_df): try: popt, _ = curve_fit( - _beer_plus_noise, depths, counts, + _beer_plus_noise, + depths, + counts, p0=[A0, kd0, N0], bounds=([0, 0, 0], [np.inf, np.inf, np.inf]), - maxfev=5000 + maxfev=5000, ) A_fit, kd_fit, N_fit = popt if kd_fit <= 0: @@ -429,9 +434,12 @@ def fit_beers_law_nonlinear(hist_df): # --------------------------------------------------------------------------- # Hybrid: Stabilised breakpoint zone detection + log-linear fit on decay only # --------------------------------------------------------------------------- -def fit_beers_law_hybrid(hist_df, min_breakpoint_depth=2.0, - expected_noise_floor=None, - noise_floor_tolerance=3.0): +def fit_beers_law_hybrid( + hist_df, + min_breakpoint_depth=2.0, + expected_noise_floor=None, + noise_floor_tolerance=3.0, +): """ Two-stage approach matching the flowchart intent: Step 16 — Find where exponential decay transitions to noise floor @@ -474,22 +482,24 @@ def fit_beers_law_hybrid(hist_df, min_breakpoint_depth=2.0, (kd, e0, breakpoint_depth, noise_floor). breakpoint_depth and noise_floor are np.nan if fallback to full fit. 
""" - df = hist_df.copy().sort_values('zdepth') - df = df[df['photon_counts'] > 0].copy() + df = hist_df.copy().sort_values("zdepth") + df = df[df["photon_counts"] > 0].copy() n = len(df) if n < 6: return np.nan, np.nan, np.nan, np.nan - df['log_counts'] = np.log(df['photon_counts']) - depths = df['zdepth'].values - log_counts = df['log_counts'].values + df["log_counts"] = np.log(df["photon_counts"]) + depths = df["zdepth"].values + log_counts = df["log_counts"].values # ------------------------------------------------------------------ # Reference: BIC for a single-line model (no breakpoint) # ------------------------------------------------------------------ model_full = LinearRegression() model_full.fit(depths.reshape(-1, 1), log_counts) - rss_full = float(np.sum((model_full.predict(depths.reshape(-1, 1)) - log_counts) ** 2)) + rss_full = float( + np.sum((model_full.predict(depths.reshape(-1, 1)) - log_counts) ** 2) + ) k_full = 2 # slope + intercept bic_full = n * np.log(rss_full / n + 1e-10) + k_full * np.log(n) @@ -567,11 +577,12 @@ def fit_beers_law_hybrid(hist_df, min_breakpoint_depth=2.0, # --------------------------------------------------------------------------- # Dispatcher: select fitting strategy by name # --------------------------------------------------------------------------- -KD_FIT_METHODS = ('log_linear', 'bg_subtract', 'breakpoint', 'nonlinear', 'hybrid') +KD_FIT_METHODS = ("log_linear", "bg_subtract", "breakpoint", "nonlinear", "hybrid") -def _fit_kd_with_method(hist_df, method='log_linear', decay_threshold=0.0, - expected_noise_floor=None): +def _fit_kd_with_method( + hist_df, method="log_linear", decay_threshold=0.0, expected_noise_floor=None +): """ Run the requested fitting strategy on a single histogram. @@ -580,38 +591,43 @@ def _fit_kd_with_method(hist_df, method='log_linear', decay_threshold=0.0, tuple[float, float, float] (kd, e0, noise_floor). noise_floor is np.nan for log_linear. 
""" - if method == 'log_linear': + if method == "log_linear": valid = find_exponential_decay_zone(hist_df, decay_threshold=decay_threshold) kd, e0 = fit_beers_law(valid) return kd, e0, np.nan - elif method == 'bg_subtract': + if method == "bg_subtract": return fit_beers_law_bg_subtract(hist_df) - elif method == 'breakpoint': + if method == "breakpoint": kd, e0, bp, nf = fit_beers_law_breakpoint(hist_df) return kd, e0, nf - elif method == 'nonlinear': + if method == "nonlinear": return fit_beers_law_nonlinear(hist_df) - elif method == 'hybrid': + if method == "hybrid": kd, e0, bp, nf = fit_beers_law_hybrid( hist_df, expected_noise_floor=expected_noise_floor ) return kd, e0, nf - else: - raise ValueError(f"Unknown kd_fit_method: {method!r}. " - f"Choose from {KD_FIT_METHODS}") + raise ValueError( + f"Unknown kd_fit_method: {method!r}. " f"Choose from {KD_FIT_METHODS}" + ) # another solution is to calculate kd without hist -def CalculateKdFromFilteredSubsurfacePhoton(df, vertical_res=0.8, decay_zone_threshold=0.0, - kd_fit_method='log_linear', expected_noise_floor=None, - surface_sigma=None, - wave_exclusion_multiplier=0.0, - wave_sigma_calm_threshold=0.1): +def CalculateKdFromFilteredSubsurfacePhoton( + df, + vertical_res=0.8, + decay_zone_threshold=0.0, + kd_fit_method="log_linear", + expected_noise_floor=None, + surface_sigma=None, + wave_exclusion_multiplier=0.0, + wave_sigma_calm_threshold=0.1, +): """ Calculate Kd for a single along-track bin by fitting Beer's Law in log-space. @@ -623,25 +639,47 @@ def CalculateKdFromFilteredSubsurfacePhoton(df, vertical_res=0.8, decay_zone_thr 17 — fit_beers_law: fit Beer's Law curve; kd = -slope, e0 = exp(intercept). 
""" # Early exit if DataFrame is empty or missing required column - if df.empty or 'lat_bins' not in df.columns: - return pd.DataFrame({'lat_bins': [np.nan], 'kd': [np.nan], 'e0': [np.nan], - 'noise_floor': [np.nan], 'surface_sigma': [np.nan], - 'latitude': [np.nan], 'longitude': [np.nan]}) + if df.empty or "lat_bins" not in df.columns: + return pd.DataFrame( + { + "lat_bins": [np.nan], + "kd": [np.nan], + "e0": [np.nan], + "noise_floor": [np.nan], + "surface_sigma": [np.nan], + "latitude": [np.nan], + "longitude": [np.nan], + } + ) # Retrieve latitude and longitude - lat_bin_value = df['lat_bins'].iloc[0] if not df['lat_bins'].empty else np.nan - latitude = df['latitude'].mean() if 'latitude' in df.columns else df['lat'].mean() if 'lat' in df.columns else np.nan - longitude = df['longitude'].mean() if 'longitude' in df.columns else df['lon'].mean() if 'lon' in df.columns else np.nan + lat_bin_value = df["lat_bins"].iloc[0] if not df["lat_bins"].empty else np.nan + latitude = ( + df["latitude"].mean() + if "latitude" in df.columns + else df["lat"].mean() + if "lat" in df.columns + else np.nan + ) + longitude = ( + df["longitude"].mean() + if "longitude" in df.columns + else df["lon"].mean() + if "lon" in df.columns + else np.nan + ) # Use value_counts to get photon counts in each height bin - height_counts = df['height_bins'].value_counts().sort_index() + height_counts = df["height_bins"].value_counts().sort_index() bin_centers = height_counts.index.astype(float) # Create a DataFrame for the height bins and counts - hist_df = pd.DataFrame({'zdepth': bin_centers, 'photon_counts': height_counts.values}) + hist_df = pd.DataFrame( + {"zdepth": bin_centers, "photon_counts": height_counts.values} + ) # Reverse zdepth for model alignment - hist_df['zdepth'] = hist_df['zdepth'].max() - hist_df['zdepth'] + hist_df["zdepth"] = hist_df["zdepth"].max() - hist_df["zdepth"] # Wave-adaptive fit: skip shallow bins contaminated by wave smearing/bubbles. 
# zdepth=0 in the histogram corresponds to the adaptive surface detection @@ -653,35 +691,51 @@ def CalculateKdFromFilteredSubsurfacePhoton(df, vertical_res=0.8, decay_zone_thr if surface_sigma is not None and wave_exclusion_multiplier > 0: if not np.isnan(surface_sigma) and surface_sigma > wave_sigma_calm_threshold: adaptive_offset = max(3.0 * surface_sigma, 1.0) - wave_skip = max(0.0, wave_exclusion_multiplier * surface_sigma - adaptive_offset) - trimmed = hist_df[hist_df['zdepth'] >= wave_skip] + wave_skip = max( + 0.0, wave_exclusion_multiplier * surface_sigma - adaptive_offset + ) + trimmed = hist_df[hist_df["zdepth"] >= wave_skip] if len(trimmed) >= 4: - logging.info("Wave-adaptive trim: sigma=%.3f, adaptive_offset=%.2f m, " - "skip=%.2f m, %d->%d bins", - surface_sigma, adaptive_offset, wave_skip, - len(hist_df), len(trimmed)) + logging.info( + "Wave-adaptive trim: sigma=%.3f, adaptive_offset=%.2f m, " + "skip=%.2f m, %d->%d bins", + surface_sigma, + adaptive_offset, + wave_skip, + len(hist_df), + len(trimmed), + ) hist_df = trimmed # else: keep original hist_df (too few bins after trim) kd, e0, noise_floor = _fit_kd_with_method( - hist_df, method=kd_fit_method, decay_threshold=decay_zone_threshold, - expected_noise_floor=expected_noise_floor + hist_df, + method=kd_fit_method, + decay_threshold=decay_zone_threshold, + expected_noise_floor=expected_noise_floor, ) - return pd.DataFrame({ - 'lat_bins': [lat_bin_value], - 'kd': [kd], - 'e0': [e0], - 'noise_floor': [noise_floor], - 'surface_sigma': [surface_sigma if surface_sigma is not None else np.nan], - 'latitude': [latitude], - 'longitude': [longitude] - }) + return pd.DataFrame( + { + "lat_bins": [lat_bin_value], + "kd": [kd], + "e0": [e0], + "noise_floor": [noise_floor], + "surface_sigma": [surface_sigma if surface_sigma is not None else np.nan], + "latitude": [latitude], + "longitude": [longitude], + } + ) # Original kd calculation function remains unchanged -def 
calculate_kd(filtered_seafloor_subsurface_photon_dataset, decay_zone_threshold=0.0, kd_fit_method='log_linear', - wave_exclusion_multiplier=0.0, wave_sigma_calm_threshold=0.1): +def calculate_kd( + filtered_seafloor_subsurface_photon_dataset, + decay_zone_threshold=0.0, + kd_fit_method="log_linear", + wave_exclusion_multiplier=0.0, + wave_sigma_calm_threshold=0.1, +): """ Loop over along-track bins and call CalculateKdFromFilteredSubsurfacePhoton for each bin. @@ -701,29 +755,51 @@ def calculate_kd(filtered_seafloor_subsurface_photon_dataset, decay_zone_thresho 18 — Save K_dph attenuation coefficient value and statistical terms (kd, e0 columns in the returned DataFrame). """ - logging.info("Calculating Kd from filtered subsurface photon dataset (method=%s)", kd_fit_method) + logging.info( + "Calculating Kd from filtered subsurface photon dataset (method=%s)", + kd_fit_method, + ) - empty_df = pd.DataFrame({'lat_bins': [], 'kd': [], 'e0': [], 'noise_floor': [], 'surface_sigma': [], 'latitude': [], 'longitude': []}) + empty_df = pd.DataFrame( + { + "lat_bins": [], + "kd": [], + "e0": [], + "noise_floor": [], + "surface_sigma": [], + "latitude": [], + "longitude": [], + } + ) - groups = list(filtered_seafloor_subsurface_photon_dataset.groupby('lat_bins', observed=False)) + groups = list( + filtered_seafloor_subsurface_photon_dataset.groupby("lat_bins", observed=False) + ) if not groups: return empty_df - if kd_fit_method == 'hybrid': + if kd_fit_method == "hybrid": # ------------------------------------------------------------------ # Pass 1: estimate per-bin noise floors (no expected_noise_floor) # ------------------------------------------------------------------ logging.info("Hybrid pass 1: estimating per-bin noise floors") pass1_results = [] for _, group in groups: - sigma = float(group['surface_sigma'].iloc[0]) if ('surface_sigma' in group.columns and len(group) > 0) else None - pass1_results.append(CalculateKdFromFilteredSubsurfacePhoton( - group, 
decay_zone_threshold=decay_zone_threshold, - kd_fit_method=kd_fit_method, - surface_sigma=sigma, - wave_exclusion_multiplier=wave_exclusion_multiplier, - wave_sigma_calm_threshold=wave_sigma_calm_threshold, - )) + sigma = ( + float(group["surface_sigma"].iloc[0]) + if ("surface_sigma" in group.columns and len(group) > 0) + else None + ) + pass1_results.append( + CalculateKdFromFilteredSubsurfacePhoton( + group, + decay_zone_threshold=decay_zone_threshold, + kd_fit_method=kd_fit_method, + surface_sigma=sigma, + wave_exclusion_multiplier=wave_exclusion_multiplier, + wave_sigma_calm_threshold=wave_sigma_calm_threshold, + ) + ) if not pass1_results: return empty_df pass1_df = pd.concat(pass1_results, ignore_index=True) @@ -736,15 +812,17 @@ def calculate_kd(filtered_seafloor_subsurface_photon_dataset, decay_zone_thresho # within ±half_window bins. Falls back to beam-level median when # the local window has < 3 valid estimates. HALF_WINDOW = 10 # ±10 bins = ±5 km at 500 m horizontal_res - MIN_LOCAL = 3 # minimum valid noise floors to use local median + MIN_LOCAL = 3 # minimum valid noise floors to use local median - nf_array = pass1_df['noise_floor'].values.copy() + nf_array = pass1_df["noise_floor"].values.copy() n_bins = len(nf_array) # Beam-level fallback - valid_nf_all = pass1_df['noise_floor'].dropna() + valid_nf_all = pass1_df["noise_floor"].dropna() valid_nf_all = valid_nf_all[valid_nf_all > 0] - beam_median_nf = float(valid_nf_all.median()) if len(valid_nf_all) >= MIN_LOCAL else None + beam_median_nf = ( + float(valid_nf_all.median()) if len(valid_nf_all) >= MIN_LOCAL else None + ) if beam_median_nf is not None: # Build per-bin expected noise floor via sliding window @@ -759,16 +837,21 @@ def calculate_kd(filtered_seafloor_subsurface_photon_dataset, decay_zone_thresho else: expected_nf_per_bin[i] = beam_median_nf # fallback - logging.info("Hybrid pass 2: sliding window noise floor " - "(half_window=%d bins, beam_median=%.3f, " - "local range=%.3f-%.3f)", - 
HALF_WINDOW, beam_median_nf, - float(np.nanmin(expected_nf_per_bin)), - float(np.nanmax(expected_nf_per_bin))) + logging.info( + "Hybrid pass 2: sliding window noise floor " + "(half_window=%d bins, beam_median=%.3f, " + "local range=%.3f-%.3f)", + HALF_WINDOW, + beam_median_nf, + float(np.nanmin(expected_nf_per_bin)), + float(np.nanmax(expected_nf_per_bin)), + ) else: expected_nf_per_bin = None - logging.info("Hybrid: insufficient noise floor estimates (%d), " - "skipping pass 2", len(valid_nf_all)) + logging.info( + "Hybrid: insufficient noise floor estimates (%d), " "skipping pass 2", + len(valid_nf_all), + ) # ------------------------------------------------------------------ # Pass 2: re-fit with per-bin expected noise floor constraint @@ -776,15 +859,22 @@ def calculate_kd(filtered_seafloor_subsurface_photon_dataset, decay_zone_thresho if expected_nf_per_bin is not None: results = [] for idx, (_, group) in enumerate(groups): - sigma = float(group['surface_sigma'].iloc[0]) if ('surface_sigma' in group.columns and len(group) > 0) else None - results.append(CalculateKdFromFilteredSubsurfacePhoton( - group, decay_zone_threshold=decay_zone_threshold, - kd_fit_method=kd_fit_method, - expected_noise_floor=float(expected_nf_per_bin[idx]), - surface_sigma=sigma, - wave_exclusion_multiplier=wave_exclusion_multiplier, - wave_sigma_calm_threshold=wave_sigma_calm_threshold - )) + sigma = ( + float(group["surface_sigma"].iloc[0]) + if ("surface_sigma" in group.columns and len(group) > 0) + else None + ) + results.append( + CalculateKdFromFilteredSubsurfacePhoton( + group, + decay_zone_threshold=decay_zone_threshold, + kd_fit_method=kd_fit_method, + expected_noise_floor=float(expected_nf_per_bin[idx]), + surface_sigma=sigma, + wave_exclusion_multiplier=wave_exclusion_multiplier, + wave_sigma_calm_threshold=wave_sigma_calm_threshold, + ) + ) SubsurfacePhotonDFAddedKd = pd.concat(results, ignore_index=True) else: SubsurfacePhotonDFAddedKd = pass1_df @@ -794,7 +884,7 @@ def 
calculate_kd(filtered_seafloor_subsurface_photon_dataset, decay_zone_thresho # Uses 3×IQR to accommodate real spatial variation (e.g. turbidity # gradients near river mouths) while removing fitting artefacts. # ------------------------------------------------------------------ - kd_vals = SubsurfacePhotonDFAddedKd['kd'].dropna() + kd_vals = SubsurfacePhotonDFAddedKd["kd"].dropna() if len(kd_vals) >= 5: q1 = float(kd_vals.quantile(0.25)) q3 = float(kd_vals.quantile(0.75)) @@ -803,12 +893,18 @@ def calculate_kd(filtered_seafloor_subsurface_photon_dataset, decay_zone_thresho upper = q3 + 3.0 * iqr # Also apply physical cap: Kd > 5.0 m⁻¹ is unrealistic upper = min(upper, 5.0) - outlier_mask = (SubsurfacePhotonDFAddedKd['kd'] < lower) | (SubsurfacePhotonDFAddedKd['kd'] > upper) + outlier_mask = (SubsurfacePhotonDFAddedKd["kd"] < lower) | ( + SubsurfacePhotonDFAddedKd["kd"] > upper + ) n_outliers = int(outlier_mask.sum()) if n_outliers > 0: - logging.info("Hybrid IQR filter: removed %d outliers (bounds: [%.4f, %.4f])", - n_outliers, lower, upper) - SubsurfacePhotonDFAddedKd.loc[outlier_mask, 'kd'] = np.nan + logging.info( + "Hybrid IQR filter: removed %d outliers (bounds: [%.4f, %.4f])", + n_outliers, + lower, + upper, + ) + SubsurfacePhotonDFAddedKd.loc[outlier_mask, "kd"] = np.nan else: # ------------------------------------------------------------------ @@ -816,13 +912,21 @@ def calculate_kd(filtered_seafloor_subsurface_photon_dataset, decay_zone_thresho # ------------------------------------------------------------------ results = [] for _, group in groups: - sigma = float(group['surface_sigma'].iloc[0]) if ('surface_sigma' in group.columns and len(group) > 0) else None - results.append(CalculateKdFromFilteredSubsurfacePhoton( - group, decay_zone_threshold=decay_zone_threshold, kd_fit_method=kd_fit_method, - surface_sigma=sigma, - wave_exclusion_multiplier=wave_exclusion_multiplier, - wave_sigma_calm_threshold=wave_sigma_calm_threshold, - )) + sigma = ( + 
float(group["surface_sigma"].iloc[0]) + if ("surface_sigma" in group.columns and len(group) > 0) + else None + ) + results.append( + CalculateKdFromFilteredSubsurfacePhoton( + group, + decay_zone_threshold=decay_zone_threshold, + kd_fit_method=kd_fit_method, + surface_sigma=sigma, + wave_exclusion_multiplier=wave_exclusion_multiplier, + wave_sigma_calm_threshold=wave_sigma_calm_threshold, + ) + ) if not results: return empty_df SubsurfacePhotonDFAddedKd = pd.concat(results, ignore_index=True) @@ -831,8 +935,13 @@ def calculate_kd(filtered_seafloor_subsurface_photon_dataset, decay_zone_thresho # Updated function to apply kd calculation beam-by-beam -def process_kd_calculation(Final_filtered_subsurface_photon_dataset, decay_zone_threshold=0.0, kd_fit_method='log_linear', - wave_exclusion_multiplier=0.0, wave_sigma_calm_threshold=0.1): +def process_kd_calculation( + Final_filtered_subsurface_photon_dataset, + decay_zone_threshold=0.0, + kd_fit_method="log_linear", + wave_exclusion_multiplier=0.0, + wave_sigma_calm_threshold=0.1, +): """ Beam-by-beam wrapper that calls calculate_kd for every beam and concatenates the results into a single Kd output DataFrame. 
@@ -849,18 +958,22 @@ def process_kd_calculation(Final_filtered_subsurface_photon_dataset, decay_zone_ kd_beam_datasets = [] # Group by 'beam_id' to process each beam independently - for beam_id, beam_data in Final_filtered_subsurface_photon_dataset.groupby('beam_id'): + for beam_id, beam_data in Final_filtered_subsurface_photon_dataset.groupby( + "beam_id" + ): logging.info(f"Calculating Kd for beam: {beam_id}") # Apply the calculate_kd function to the current beam's dataset SubsurfacePhotonDFAddedKd = calculate_kd( - beam_data, decay_zone_threshold=decay_zone_threshold, kd_fit_method=kd_fit_method, + beam_data, + decay_zone_threshold=decay_zone_threshold, + kd_fit_method=kd_fit_method, wave_exclusion_multiplier=wave_exclusion_multiplier, wave_sigma_calm_threshold=wave_sigma_calm_threshold, ) # Add a column to track the beam_id in the results - SubsurfacePhotonDFAddedKd['beam_id'] = beam_id + SubsurfacePhotonDFAddedKd["beam_id"] = beam_id # Append the result to the list kd_beam_datasets.append(SubsurfacePhotonDFAddedKd) @@ -871,9 +984,20 @@ def process_kd_calculation(Final_filtered_subsurface_photon_dataset, decay_zone_ "All bins may have been discarded by an upstream filter (e.g. histogram quality). " "Returning empty DataFrame." 
) - return pd.DataFrame(columns=['lat_bins', 'kd', 'e0', 'noise_floor', 'surface_sigma', 'latitude', 'longitude', 'beam_id']) + return pd.DataFrame( + columns=[ + "lat_bins", + "kd", + "e0", + "noise_floor", + "surface_sigma", + "latitude", + "longitude", + "beam_id", + ] + ) # Combine results from all beams into a single DataFrame combined_kd_dataset = pd.concat(kd_beam_datasets, ignore_index=True) - return combined_kd_dataset \ No newline at end of file + return combined_kd_dataset diff --git a/icesat-2_kdph_py/kd_utils/SeafloorFilterByGEBCO_Module.py b/icesat-2_kdph_py/kd_utils/SeafloorFilterByGEBCO_Module.py index 46f6a58..a1ea0df 100644 --- a/icesat-2_kdph_py/kd_utils/SeafloorFilterByGEBCO_Module.py +++ b/icesat-2_kdph_py/kd_utils/SeafloorFilterByGEBCO_Module.py @@ -1,17 +1,17 @@ -import rasterio import numpy as np -import pandas as pd +import rasterio from rtree import index -from shapely.geometry import box, Point +from shapely.geometry import box + # 1. Function to create an R-tree spatial index for raster bounds def create_spatial_index(gebco_paths): """ Create an R-tree spatial index for raster bounds to quickly find relevant rasters. - + Parameters: gebco_paths (list): List of paths to GEBCO raster files. - + Returns: raster_data_dict (dict): Dictionary containing loaded rasters and their respective data. spatial_index (rtree.index.Index): R-tree spatial index for raster bounds. 
@@ -19,26 +19,29 @@ def create_spatial_index(gebco_paths): idx = index.Index() raster_data_dict = {} for i, path in enumerate(gebco_paths): - with rasterio.Env(GTIFF_SRS_SOURCE='EPSG'): + with rasterio.Env(GTIFF_SRS_SOURCE="EPSG"): gebco_raster = rasterio.open(path) raster_data = gebco_raster.read(1) raster_data_dict[path] = (gebco_raster, raster_data) bounds = gebco_raster.bounds - idx.insert(i, (bounds.left, bounds.bottom, bounds.right, bounds.top), obj=path) + idx.insert( + i, (bounds.left, bounds.bottom, bounds.right, bounds.top), obj=path + ) return raster_data_dict, idx + # 2. Function to determine which rasters are needed for a given set of coordinates using spatial index in a batch manner def get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index): """ Determine which raster datasets are relevant for the given coordinates using spatial index. This function uses a more efficient approach by performing a bounding box query for batches of points. - + Parameters: lons (numpy.ndarray): Array of longitudes. lats (numpy.ndarray): Array of latitudes. raster_data_dict (dict): Dictionary containing raster datasets and their data. spatial_index (rtree.index.Index): R-tree spatial index for raster bounds. - + Returns: relevant_rasters (list): List of relevant raster datasets and their respective data. """ @@ -50,59 +53,70 @@ def get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index # Get all rasters that intersect with the bounding box matches = list(spatial_index.intersection((bounding_box.bounds), objects=True)) relevant_paths = {match.object for match in matches} - relevant_rasters = [(raster_data_dict[path][0], raster_data_dict[path][1]) for path in relevant_paths] + relevant_rasters = [ + (raster_data_dict[path][0], raster_data_dict[path][1]) + for path in relevant_paths + ] return relevant_rasters + # 3. 
Function to get seafloor elevation from the relevant rasters in a vectorized manner def get_seafloor_elevation(lons, lats, relevant_rasters): """ Get seafloor elevations for a batch of points based on longitude and latitude from relevant rasters. - + Parameters: lons (numpy.ndarray): Array of longitudes. lats (numpy.ndarray): Array of latitudes. relevant_rasters (list): List of relevant raster datasets and their respective data. - + Returns: seafloor_elevations (numpy.ndarray): Array of seafloor elevations. """ points = np.vstack((lons, lats)).T seafloor_elevations = np.full(len(lons), np.nan) - + for gebco_raster, raster_data in relevant_rasters: # Use rasterio.sample to get values for multiple points in a batch values = list(gebco_raster.sample(points)) for idx, value in enumerate(values): if np.isnan(seafloor_elevations[idx]) and value is not None: seafloor_elevations[idx] = value[0] - + return seafloor_elevations + # 4. The main process function that ties everything together def process_seafloor_data(gebco_paths, sea_photon_dataset): """ Main function to process the seafloor data. - + Parameters: gebco_paths (list): List of paths to GEBCO raster files. sea_photon_dataset (pandas.DataFrame): Dataset containing longitude, latitude, and photon height. - + Returns: filtered_sea_photon_dataset (pandas.DataFrame): Filtered dataset containing points above the seafloor. 
""" # Create spatial index and load GEBCO Raster Data raster_data_dict, spatial_index = create_spatial_index(gebco_paths) - + # Get the relevant rasters using spatial index - lons = sea_photon_dataset['longitude'].values - lats = sea_photon_dataset['latitude'].values - relevant_rasters = get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index) - + lons = sea_photon_dataset["longitude"].values + lats = sea_photon_dataset["latitude"].values + relevant_rasters = get_relevant_rasters_using_index( + lons, lats, raster_data_dict, spatial_index + ) + # Get seafloor elevation for all points - sea_photon_dataset['seafloor_elevation'] = get_seafloor_elevation(lons, lats, relevant_rasters) - + sea_photon_dataset["seafloor_elevation"] = get_seafloor_elevation( + lons, lats, relevant_rasters + ) + # Filter out points below the seafloor - filtered_sea_photon_dataset = sea_photon_dataset[sea_photon_dataset['photon_height'] > sea_photon_dataset['seafloor_elevation']] + filtered_sea_photon_dataset = sea_photon_dataset[ + sea_photon_dataset["photon_height"] > sea_photon_dataset["seafloor_elevation"] + ] # filtered_sea_photon_dataset.drop(columns=['seafloor_elevation'], inplace=True) - - return filtered_sea_photon_dataset \ No newline at end of file + + return filtered_sea_photon_dataset diff --git a/icesat-2_kdph_py/kd_utils/__init__.py b/icesat-2_kdph_py/kd_utils/__init__.py index 0c3915c..8976647 100644 --- a/icesat-2_kdph_py/kd_utils/__init__.py +++ b/icesat-2_kdph_py/kd_utils/__init__.py @@ -1,8 +1,18 @@ # utils/__init__.py -from .data_processing import load_data, extract_file_params, Extract_sea_photons, create_photon_dataframe -from .visualization import plot_photon_height, plot_kd_photons -from .Kd_analysis import CalculateKdFromFilteredSubsurfacePhoton -from .interpolation import interpolate_labels, apply_interpolation, geoid_correction, refraction_correction from .bathy_processing import * -from .sea_photons_analysis import * \ No newline at end of file 
+from .data_processing import ( + Extract_sea_photons, + create_photon_dataframe, + extract_file_params, + load_data, +) +from .interpolation import ( + apply_interpolation, + geoid_correction, + interpolate_labels, + refraction_correction, +) +from .Kd_analysis import CalculateKdFromFilteredSubsurfacePhoton +from .sea_photons_analysis import * +from .visualization import plot_kd_photons, plot_photon_height diff --git a/icesat-2_kdph_py/kd_utils/bathy_processing.py b/icesat-2_kdph_py/kd_utils/bathy_processing.py index 772f5cd..f29418b 100644 --- a/icesat-2_kdph_py/kd_utils/bathy_processing.py +++ b/icesat-2_kdph_py/kd_utils/bathy_processing.py @@ -1,18 +1,19 @@ # utils/bathy_processing.py -import os import logging +import os + +import numpy as np import pandas as pd import rasterio -import numpy as np +from rtree import index +from scipy.spatial import cKDTree +from scipy.stats import norm +from shapely.geometry import box + from kd_utils.sea_photons_analysis import ( get_sea_surface_height_adaptive, - get_sea_surface_height_static, horizontal_vertical_bin_dataset, ) -from rtree import index -from shapely.geometry import box, Point -from scipy.spatial import cKDTree -from scipy.stats import norm def apply_optional_histogram_quality_filter( @@ -23,7 +24,7 @@ def apply_optional_histogram_quality_filter( depth_min=6.0, depth_max=7.0, reference_depth_min=0.0, - reference_depth_max=1.0 + reference_depth_max=1.0, ): """ Optional histogram quality check: keep along-track bins only if enough @@ -62,32 +63,41 @@ def apply_optional_histogram_quality_filter( if subsurface_photon_dataset.empty: return subsurface_photon_dataset - grouped = binned_dataset_sea_surface.groupby(['lat_bins'], observed=False) + grouped = binned_dataset_sea_surface.groupby(["lat_bins"], observed=False) lat_bin_keys = list(grouped.groups.keys()) if len(lat_bin_keys) != len(sea_surface_height): return subsurface_photon_dataset - quality_df = pd.DataFrame({ - 'lat_bins': lat_bin_keys, - 
'sea_surface_height': sea_surface_height - }).dropna(subset=['sea_surface_height']).copy() + quality_df = ( + pd.DataFrame( + {"lat_bins": lat_bin_keys, "sea_surface_height": sea_surface_height} + ) + .dropna(subset=["sea_surface_height"]) + .copy() + ) if quality_df.empty: return subsurface_photon_dataset.iloc[0:0].copy() ratios = [] for _, row in quality_df.iterrows(): - lat_bin = row['lat_bins'] - surface = row['sea_surface_height'] - bin_data = binned_dataset_sea_surface[binned_dataset_sea_surface['lat_bins'] == lat_bin] + lat_bin = row["lat_bins"] + surface = row["sea_surface_height"] + bin_data = binned_dataset_sea_surface[ + binned_dataset_sea_surface["lat_bins"] == lat_bin + ] if bin_data.empty: ratios.append(0.0) continue - depth = surface - bin_data['photon_height'] + depth = surface - bin_data["photon_height"] underwater_mask = depth > 0 # Reference band: near-surface photon count (incoming signal) - ref_mask = underwater_mask & (depth >= reference_depth_min) & (depth <= reference_depth_max) + ref_mask = ( + underwater_mask + & (depth >= reference_depth_min) + & (depth <= reference_depth_max) + ) ref_count = float(ref_mask.sum()) if ref_count == 0: ratios.append(0.0) @@ -104,9 +114,15 @@ def apply_optional_histogram_quality_filter( ratios.append(decay_ratio) - quality_df['hist_quality_ratio'] = ratios - valid_bins = set(quality_df.loc[quality_df['hist_quality_ratio'] >= min_ratio, 'lat_bins'].tolist()) - return subsurface_photon_dataset[subsurface_photon_dataset['lat_bins'].isin(valid_bins)].copy() + quality_df["hist_quality_ratio"] = ratios + valid_bins = set( + quality_df.loc[ + quality_df["hist_quality_ratio"] >= min_ratio, "lat_bins" + ].tolist() + ) + return subsurface_photon_dataset[ + subsurface_photon_dataset["lat_bins"].isin(valid_bins) + ].copy() def compute_surface_sigma(binned_dataset_sea_surface, sea_surface_height): @@ -117,32 +133,34 @@ def compute_surface_sigma(binned_dataset_sea_surface, sea_surface_height): distribution). 
Returns a dict mapping lat_bin -> sigma (float or NaN). Bins with fewer than 10 surface photons get NaN. """ - grouped = binned_dataset_sea_surface.groupby(['lat_bins'], observed=False) + grouped = binned_dataset_sea_surface.groupby(["lat_bins"], observed=False) lat_bin_keys = list(grouped.groups.keys()) if len(lat_bin_keys) != len(sea_surface_height): return {} sigma_map = {} - for lat_bin, surface in zip(lat_bin_keys, sea_surface_height): + for lat_bin, surface in zip(lat_bin_keys, sea_surface_height, strict=False): if pd.isna(surface): continue - bin_data = binned_dataset_sea_surface[binned_dataset_sea_surface['lat_bins'] == lat_bin] + bin_data = binned_dataset_sea_surface[ + binned_dataset_sea_surface["lat_bins"] == lat_bin + ] if bin_data.empty: continue peak_data = bin_data[ - (bin_data['photon_height'] > surface - 1.0) & - (bin_data['photon_height'] < surface + 1.0) + (bin_data["photon_height"] > surface - 1.0) + & (bin_data["photon_height"] < surface + 1.0) ] if len(peak_data) > 10: - mu, sigma = norm.fit(peak_data['photon_height']) + mu, sigma = norm.fit(peak_data["photon_height"]) if sigma > 0.4: half_win2 = min(2.5 * sigma, 3.0) peak_data2 = bin_data[ - (bin_data['photon_height'] > mu - half_win2) & - (bin_data['photon_height'] < mu + half_win2) + (bin_data["photon_height"] > mu - half_win2) + & (bin_data["photon_height"] < mu + half_win2) ] if len(peak_data2) > 10: - _, sigma = norm.fit(peak_data2['photon_height']) + _, sigma = norm.fit(peak_data2["photon_height"]) else: sigma = np.nan sigma_map[lat_bin] = sigma @@ -154,7 +172,7 @@ def apply_optional_surface_sigma_filter( binned_dataset_sea_surface, subsurface_photon_dataset, sea_surface_height, - sigma_max=0.5 + sigma_max=0.5, ): """ Optional Gaussian surface sigma filter: discard along-track bins if the @@ -172,10 +190,14 @@ def apply_optional_surface_sigma_filter( return subsurface_photon_dataset sigma_df = pd.DataFrame( - list(sigma_map.items()), columns=['lat_bins', 'surface_sigma'] + 
list(sigma_map.items()), columns=["lat_bins", "surface_sigma"] + ) + valid_bins = set( + sigma_df.loc[sigma_df["surface_sigma"] <= sigma_max, "lat_bins"].tolist() ) - valid_bins = set(sigma_df.loc[sigma_df['surface_sigma'] <= sigma_max, 'lat_bins'].tolist()) - return subsurface_photon_dataset[subsurface_photon_dataset['lat_bins'].isin(valid_bins)].copy() + return subsurface_photon_dataset[ + subsurface_photon_dataset["lat_bins"].isin(valid_bins) + ].copy() def apply_optional_refraction_correction( @@ -183,7 +205,7 @@ def apply_optional_refraction_correction( subsurface_photon_dataset, sea_surface_height, water_temp_c=20.0, - wavelength_nm=532.0 + wavelength_nm=532.0, ): """ Optional refraction correction for subsurface photons. @@ -192,23 +214,36 @@ def apply_optional_refraction_correction( Flowchart step: 9 — Compute water depths (distance below surface) and correct depths for refraction. """ - required_cols = {'lat_bins', 'photon_height', 'ref_elevation', 'ref_azimuth', 'lon', 'lat'} - if subsurface_photon_dataset.empty or (not required_cols.issubset(set(subsurface_photon_dataset.columns))): + required_cols = { + "lat_bins", + "photon_height", + "ref_elevation", + "ref_azimuth", + "lon", + "lat", + } + if subsurface_photon_dataset.empty or ( + not required_cols.issubset(set(subsurface_photon_dataset.columns)) + ): return subsurface_photon_dataset - grouped = binned_dataset_sea_surface.groupby(['lat_bins'], observed=False) + grouped = binned_dataset_sea_surface.groupby(["lat_bins"], observed=False) lat_bin_keys = list(grouped.groups.keys()) if len(lat_bin_keys) != len(sea_surface_height): return subsurface_photon_dataset - surface_df = pd.DataFrame({'lat_bins': lat_bin_keys, 'sea_surface_height': sea_surface_height}) - corrected = subsurface_photon_dataset.merge(surface_df, on='lat_bins', how='left').copy() - if corrected['sea_surface_height'].isna().all(): + surface_df = pd.DataFrame( + {"lat_bins": lat_bin_keys, "sea_surface_height": sea_surface_height} + ) + 
corrected = subsurface_photon_dataset.merge( + surface_df, on="lat_bins", how="left" + ).copy() + if corrected["sea_surface_height"].isna().all(): return subsurface_photon_dataset - corrected['photon_height_pre_refraction'] = corrected['photon_height'] - corrected['lon_pre_refraction'] = corrected['lon'] - corrected['lat_pre_refraction'] = corrected['lat'] + corrected["photon_height_pre_refraction"] = corrected["photon_height"] + corrected["lon_pre_refraction"] = corrected["lon"] + corrected["lat_pre_refraction"] = corrected["lat"] # Refraction index parameterization from legacy module. a = -0.000001501562500 @@ -217,14 +252,26 @@ def apply_optional_refraction_correction( d = -0.000160475520686 e = 1.398067112092424 n1 = 1.00029 - n2 = (a * water_temp_c ** 2) + (b * wavelength_nm ** 2) + (c * water_temp_c) + (d * wavelength_nm) + e + n2 = ( + (a * water_temp_c**2) + + (b * wavelength_nm**2) + + (c * water_temp_c) + + (d * wavelength_nm) + + e + ) - ref_elev = pd.to_numeric(corrected['ref_elevation'], errors='coerce').to_numpy(dtype=float) - ref_az = pd.to_numeric(corrected['ref_azimuth'], errors='coerce').to_numpy(dtype=float) - z = pd.to_numeric(corrected['photon_height'], errors='coerce').to_numpy(dtype=float) - ws = pd.to_numeric(corrected['sea_surface_height'], errors='coerce').to_numpy(dtype=float) - x = pd.to_numeric(corrected['lon'], errors='coerce').to_numpy(dtype=float) - y = pd.to_numeric(corrected['lat'], errors='coerce').to_numpy(dtype=float) + ref_elev = pd.to_numeric(corrected["ref_elevation"], errors="coerce").to_numpy( + dtype=float + ) + ref_az = pd.to_numeric(corrected["ref_azimuth"], errors="coerce").to_numpy( + dtype=float + ) + z = pd.to_numeric(corrected["photon_height"], errors="coerce").to_numpy(dtype=float) + ws = pd.to_numeric(corrected["sea_surface_height"], errors="coerce").to_numpy( + dtype=float + ) + x = pd.to_numeric(corrected["lon"], errors="coerce").to_numpy(dtype=float) + y = pd.to_numeric(corrected["lat"], 
errors="coerce").to_numpy(dtype=float) # Convert to radians if values look like degrees. if np.nanmax(np.abs(ref_elev)) > (2 * np.pi): @@ -232,7 +279,13 @@ def apply_optional_refraction_correction( if np.nanmax(np.abs(ref_az)) > (2 * np.pi): ref_az = np.deg2rad(ref_az) - valid_mask = np.isfinite(ref_elev) & np.isfinite(ref_az) & np.isfinite(z) & np.isfinite(ws) & (z <= ws) + valid_mask = ( + np.isfinite(ref_elev) + & np.isfinite(ref_az) + & np.isfinite(z) + & np.isfinite(ws) + & (z <= ws) + ) if not np.any(valid_mask): return subsurface_photon_dataset @@ -248,7 +301,7 @@ def apply_optional_refraction_correction( R = (S * n1) / n2 Gamma = (np.pi / 2.0) - theta1 phi = theta1 - theta2 - P_sq = np.maximum(R ** 2 + S ** 2 - 2 * R * S * np.cos(phi), 0.0) + P_sq = np.maximum(R**2 + S**2 - 2 * R * S * np.cos(phi), 0.0) P = np.sqrt(P_sq) alpha_arg = np.divide(R * np.sin(phi), P, out=np.zeros_like(P), where=(P != 0)) alpha_arg = np.clip(alpha_arg, -1.0, 1.0) @@ -264,9 +317,9 @@ def apply_optional_refraction_correction( y_corr = y[valid_mask] + DN z_corr = z[valid_mask] + DZ - corrected.loc[valid_mask, 'lon'] = x_corr - corrected.loc[valid_mask, 'lat'] = y_corr - corrected.loc[valid_mask, 'photon_height'] = z_corr + corrected.loc[valid_mask, "lon"] = x_corr + corrected.loc[valid_mask, "lat"] = y_corr + corrected.loc[valid_mask, "photon_height"] = z_corr return corrected @@ -275,7 +328,7 @@ def apply_sea_surface_flattening( sea_surface_height, lat_bin_keys, horizontal_res, - flattening_window_m=500 + flattening_window_m=500, ): """ Flatten small-bin sea-surface variations to the mean sea level of larger along-track windows. 
@@ -287,15 +340,18 @@ def apply_sea_surface_flattening( if len(sea_surface_height) != len(lat_bin_keys): return subsurface_photon_dataset - surface_df = pd.DataFrame({ - 'lat_bins': lat_bin_keys, - 'sea_surface_height_local': sea_surface_height - }).dropna(subset=['sea_surface_height_local']).copy() + surface_df = ( + pd.DataFrame( + {"lat_bins": lat_bin_keys, "sea_surface_height_local": sea_surface_height} + ) + .dropna(subset=["sea_surface_height_local"]) + .copy() + ) if surface_df.empty: return subsurface_photon_dataset - surface_df['lat_bins_num'] = pd.to_numeric(surface_df['lat_bins'], errors='coerce') - surface_df = surface_df.dropna(subset=['lat_bins_num']) + surface_df["lat_bins_num"] = pd.to_numeric(surface_df["lat_bins"], errors="coerce") + surface_df = surface_df.dropna(subset=["lat_bins_num"]) if surface_df.empty: return subsurface_photon_dataset @@ -305,65 +361,83 @@ def apply_sea_surface_flattening( "Sea surface flattening window (%d m) <= horizontal_res (%d m) → " "big_bin_size=1, flattening has no effect. 
" "Use --sea_surface_flattening_window_m > --horizontal_res.", - flattening_window_m, horizontal_res + flattening_window_m, + horizontal_res, ) - surface_df['big_bin'] = (surface_df['lat_bins_num'].astype(int) // big_bin_size).astype(int) + surface_df["big_bin"] = ( + surface_df["lat_bins_num"].astype(int) // big_bin_size + ).astype(int) mean_surface = ( - surface_df.groupby('big_bin', observed=False)['sea_surface_height_local'] + surface_df.groupby("big_bin", observed=False)["sea_surface_height_local"] .mean() - .rename('sea_surface_height_bigbin_mean') + .rename("sea_surface_height_bigbin_mean") .reset_index() ) - surface_df = surface_df.merge(mean_surface, on='big_bin', how='left') - surface_df['sea_surface_flattening_offset'] = ( - surface_df['sea_surface_height_bigbin_mean'] - surface_df['sea_surface_height_local'] + surface_df = surface_df.merge(mean_surface, on="big_bin", how="left") + surface_df["sea_surface_flattening_offset"] = ( + surface_df["sea_surface_height_bigbin_mean"] + - surface_df["sea_surface_height_local"] ) flattened = subsurface_photon_dataset.copy() - flattened['lat_bins_num'] = pd.to_numeric(flattened['lat_bins'], errors='coerce') - flattened['big_bin'] = (flattened['lat_bins_num'].fillna(-1).astype(int) // big_bin_size).astype(int) + flattened["lat_bins_num"] = pd.to_numeric(flattened["lat_bins"], errors="coerce") + flattened["big_bin"] = ( + flattened["lat_bins_num"].fillna(-1).astype(int) // big_bin_size + ).astype(int) flattened = flattened.merge( - surface_df[['lat_bins', 'sea_surface_height_local', 'sea_surface_height_bigbin_mean', 'sea_surface_flattening_offset']], - on='lat_bins', - how='left' - ) - flattened['photon_height_original'] = flattened['photon_height'] - flattened['photon_height'] = ( - flattened['photon_height'] - flattened['sea_surface_flattening_offset'].fillna(0.0) + surface_df[ + [ + "lat_bins", + "sea_surface_height_local", + "sea_surface_height_bigbin_mean", + "sea_surface_flattening_offset", + ] + ], + 
on="lat_bins", + how="left", ) + flattened["photon_height_original"] = flattened["photon_height"] + flattened["photon_height"] = flattened["photon_height"] - flattened[ + "sea_surface_flattening_offset" + ].fillna(0.0) # Re-bin height_bins from flattened photon_height so that downstream # Kd calculation (which reads height_bins) uses the corrected depths. - if 'height_bins' in flattened.columns: + if "height_bins" in flattened.columns: vertical_res_actual = horizontal_res # not used; infer from existing bins # Infer the vertical resolution from the original height_bins spacing - orig_bins = pd.to_numeric(flattened['height_bins'], errors='coerce').dropna().unique() + orig_bins = ( + pd.to_numeric(flattened["height_bins"], errors="coerce").dropna().unique() + ) if len(orig_bins) >= 2: sorted_bins = np.sort(orig_bins) diffs = np.diff(sorted_bins) v_res = float(np.median(diffs[diffs > 0])) if np.any(diffs > 0) else 0.25 else: v_res = 0.25 - h_min = flattened['photon_height'].min() - h_max = flattened['photon_height'].max() + h_min = flattened["photon_height"].min() + h_max = flattened["photon_height"].max() n_hbins = max(1, int(round((h_max - h_min) / v_res))) bin_edges = np.linspace(h_min, h_max, n_hbins + 1) bin_labels = np.round((bin_edges[:-1] + bin_edges[1:]) / 2, decimals=1) - flattened['height_bins'] = pd.cut( - flattened['photon_height'], bins=bin_edges, labels=bin_labels, - include_lowest=True + flattened["height_bins"] = pd.cut( + flattened["photon_height"], + bins=bin_edges, + labels=bin_labels, + include_lowest=True, ) - return flattened.drop(columns=['lat_bins_num', 'big_bin'], errors='ignore') + return flattened.drop(columns=["lat_bins_num", "big_bin"], errors="ignore") + # 1. Function to create an R-tree spatial index for raster bounds def create_spatial_index(gebco_paths): """ Create an R-tree spatial index for raster bounds to quickly find relevant rasters. - + Parameters: gebco_paths (list): List of paths to GEBCO raster files. 
- + Returns: raster_data_dict (dict): Dictionary containing loaded rasters and their respective data. spatial_index (rtree.index.Index): R-tree spatial index for raster bounds. @@ -371,26 +445,29 @@ def create_spatial_index(gebco_paths): idx = index.Index() raster_data_dict = {} for i, path in enumerate(gebco_paths): - with rasterio.Env(GTIFF_SRS_SOURCE='EPSG'): + with rasterio.Env(GTIFF_SRS_SOURCE="EPSG"): gebco_raster = rasterio.open(path) raster_data = gebco_raster.read(1) raster_data_dict[path] = (gebco_raster, raster_data) bounds = gebco_raster.bounds - idx.insert(i, (bounds.left, bounds.bottom, bounds.right, bounds.top), obj=path) + idx.insert( + i, (bounds.left, bounds.bottom, bounds.right, bounds.top), obj=path + ) return raster_data_dict, idx + # 2. Function to determine which rasters are needed for a given set of coordinates using spatial index in a batch manner def get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index): """ Determine which raster datasets are relevant for the given coordinates using spatial index. This function uses a more efficient approach by performing a bounding box query for batches of points. - + Parameters: lons (numpy.ndarray): Array of longitudes. lats (numpy.ndarray): Array of latitudes. raster_data_dict (dict): Dictionary containing raster datasets and their data. spatial_index (rtree.index.Index): R-tree spatial index for raster bounds. - + Returns: relevant_rasters (list): List of relevant raster datasets and their respective data. 
""" @@ -402,7 +479,10 @@ def get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index # Get all rasters that intersect with the bounding box matches = list(spatial_index.intersection((bounding_box.bounds), objects=True)) relevant_paths = {match.object for match in matches} - relevant_rasters = [(raster_data_dict[path][0], raster_data_dict[path][1]) for path in relevant_paths] + relevant_rasters = [ + (raster_data_dict[path][0], raster_data_dict[path][1]) + for path in relevant_paths + ] return relevant_rasters @@ -410,25 +490,25 @@ def get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index def get_seafloor_elevation(lons, lats, relevant_rasters): """ Get seafloor elevations for a batch of points based on longitude and latitude from relevant rasters. - + Parameters: lons (numpy.ndarray): Array of longitudes. lats (numpy.ndarray): Array of latitudes. relevant_rasters (list): List of relevant raster datasets and their respective data. - + Returns: seafloor_elevations (numpy.ndarray): Array of seafloor elevations. 
""" points = np.vstack((lons, lats)).T seafloor_elevations = np.full(len(lons), np.nan) - + for gebco_raster, raster_data in relevant_rasters: # Use rasterio.sample to get values for multiple points in a batch values = list(gebco_raster.sample(points)) for idx, value in enumerate(values): if np.isnan(seafloor_elevations[idx]) and value is not None: seafloor_elevations[idx] = value[0] - + return seafloor_elevations @@ -456,10 +536,12 @@ def query_gebco_seafloor_elevation(sea_photon_dataset, gebco_paths): """ dataset = sea_photon_dataset.copy() raster_data_dict, spatial_index = create_spatial_index(gebco_paths) - lons = dataset['longitude'].values - lats = dataset['latitude'].values - relevant_rasters = get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index) - dataset['seafloor_elevation'] = get_seafloor_elevation(lons, lats, relevant_rasters) + lons = dataset["longitude"].values + lats = dataset["latitude"].values + relevant_rasters = get_relevant_rasters_using_index( + lons, lats, raster_data_dict, spatial_index + ) + dataset["seafloor_elevation"] = get_seafloor_elevation(lons, lats, relevant_rasters) return dataset @@ -481,7 +563,7 @@ def remove_photons_below_seafloor(sea_photon_dataset): Only rows where photon_height > seafloor_elevation. """ return sea_photon_dataset[ - sea_photon_dataset['photon_height'] > sea_photon_dataset['seafloor_elevation'] + sea_photon_dataset["photon_height"] > sea_photon_dataset["seafloor_elevation"] ].copy() @@ -508,13 +590,13 @@ def discard_shallow_seabed_bins(sea_photon_dataset, min_depth_m): pd.DataFrame Only rows where depth >= min_depth_m. """ - return sea_photon_dataset[ - sea_photon_dataset['depth'] >= min_depth_m - ].copy() + return sea_photon_dataset[sea_photon_dataset["depth"] >= min_depth_m].copy() # 4. 
The main process function that ties everything together -def process_seafloor_data(sea_photon_dataset, gebco_paths, Ignore_Subsurface_Height_Thres): +def process_seafloor_data( + sea_photon_dataset, gebco_paths, Ignore_Subsurface_Height_Thres +): """ Query GEBCO bathymetry then apply seabed and shallow-water filters. @@ -533,9 +615,11 @@ def process_seafloor_data(sea_photon_dataset, gebco_paths, Ignore_Subsurface_Hei containing points above the seafloor in water deeper than threshold. """ dataset = query_gebco_seafloor_elevation(sea_photon_dataset, gebco_paths) # step 12 - dataset['depth'] = -dataset['seafloor_elevation'] - dataset = remove_photons_below_seafloor(dataset) # step 13 - return discard_shallow_seabed_bins(dataset, abs(Ignore_Subsurface_Height_Thres)) # step 14 + dataset["depth"] = -dataset["seafloor_elevation"] + dataset = remove_photons_below_seafloor(dataset) # step 13 + return discard_shallow_seabed_bins( + dataset, abs(Ignore_Subsurface_Height_Thres) + ) # step 14 def load_atl24_points(atl24_file_path): @@ -547,87 +631,123 @@ def load_atl24_points(atl24_file_path): retrieve GEBCO data (or other regional raster bathymetry data) for region. 
""" if (not atl24_file_path) or (not os.path.exists(atl24_file_path)): - return pd.DataFrame(columns=['longitude', 'latitude', 'seafloor_elevation_atl24']) + return pd.DataFrame( + columns=["longitude", "latitude", "seafloor_elevation_atl24"] + ) lower_path = atl24_file_path.lower() - if lower_path.endswith(('.h5', '.hdf5')): + if lower_path.endswith((".h5", ".hdf5")): try: import h5py - beams = ['gt1l', 'gt1r', 'gt2l', 'gt2r', 'gt3l', 'gt3r'] + + beams = ["gt1l", "gt1r", "gt2l", "gt2r", "gt3l", "gt3r"] frames = [] - with h5py.File(atl24_file_path, 'r') as f: + with h5py.File(atl24_file_path, "r") as f: for beam in beams: if beam not in f: continue grp = f[beam] - if not all(k in grp for k in ('lon_ph', 'lat_ph', 'ortho_h', 'class_ph')): + if not all( + k in grp for k in ("lon_ph", "lat_ph", "ortho_h", "class_ph") + ): continue - class_ph = grp['class_ph'][:] + class_ph = grp["class_ph"][:] bathy_mask = class_ph == 40 # ATL24 bathymetry class # Apply low_confidence_flag filter if available - if 'low_confidence_flag' in grp: - bathy_mask = bathy_mask & (grp['low_confidence_flag'][:] == 0) + if "low_confidence_flag" in grp: + bathy_mask = bathy_mask & (grp["low_confidence_flag"][:] == 0) if bathy_mask.sum() == 0: continue - frames.append(pd.DataFrame({ - 'longitude': grp['lon_ph'][:][bathy_mask].astype(float), - 'latitude': grp['lat_ph'][:][bathy_mask].astype(float), - 'seafloor_elevation_atl24': grp['ortho_h'][:][bathy_mask].astype(float), - })) + frames.append( + pd.DataFrame( + { + "longitude": grp["lon_ph"][:][bathy_mask].astype(float), + "latitude": grp["lat_ph"][:][bathy_mask].astype(float), + "seafloor_elevation_atl24": grp["ortho_h"][:][ + bathy_mask + ].astype(float), + } + ) + ) if not frames: - return pd.DataFrame(columns=['longitude', 'latitude', 'seafloor_elevation_atl24']) + return pd.DataFrame( + columns=["longitude", "latitude", "seafloor_elevation_atl24"] + ) return pd.concat(frames, ignore_index=True).dropna() except Exception: - return 
pd.DataFrame(columns=['longitude', 'latitude', 'seafloor_elevation_atl24']) - elif lower_path.endswith(('.csv', '.txt')): + return pd.DataFrame( + columns=["longitude", "latitude", "seafloor_elevation_atl24"] + ) + elif lower_path.endswith((".csv", ".txt")): atl24_df = pd.read_csv(atl24_file_path) - elif lower_path.endswith('.parquet'): + elif lower_path.endswith(".parquet"): atl24_df = pd.read_parquet(atl24_file_path) - elif lower_path.endswith(('.gpkg', '.shp', '.geojson')): + elif lower_path.endswith((".gpkg", ".shp", ".geojson")): try: import geopandas as gpd + atl24_gdf = gpd.read_file(atl24_file_path) atl24_df = pd.DataFrame(atl24_gdf) - if ('longitude' not in atl24_df.columns or 'latitude' not in atl24_df.columns) and ('geometry' in atl24_gdf.columns): - atl24_df['longitude'] = atl24_gdf.geometry.x - atl24_df['latitude'] = atl24_gdf.geometry.y + if ( + "longitude" not in atl24_df.columns + or "latitude" not in atl24_df.columns + ) and ("geometry" in atl24_gdf.columns): + atl24_df["longitude"] = atl24_gdf.geometry.x + atl24_df["latitude"] = atl24_gdf.geometry.y except Exception: - return pd.DataFrame(columns=['longitude', 'latitude', 'seafloor_elevation_atl24']) + return pd.DataFrame( + columns=["longitude", "latitude", "seafloor_elevation_atl24"] + ) else: - return pd.DataFrame(columns=['longitude', 'latitude', 'seafloor_elevation_atl24']) + return pd.DataFrame( + columns=["longitude", "latitude", "seafloor_elevation_atl24"] + ) - lon_candidates = ['longitude', 'lon', 'x', 'LONGITUDE', 'LON'] - lat_candidates = ['latitude', 'lat', 'y', 'LATITUDE', 'LAT'] + lon_candidates = ["longitude", "lon", "x", "LONGITUDE", "LON"] + lat_candidates = ["latitude", "lat", "y", "LATITUDE", "LAT"] elev_candidates = [ - 'seafloor_elevation', 'bottom_elevation', 'bathymetry', 'elevation', 'z', - 'depth', 'water_depth' + "seafloor_elevation", + "bottom_elevation", + "bathymetry", + "elevation", + "z", + "depth", + "water_depth", ] lon_col = next((c for c in lon_candidates if c 
in atl24_df.columns), None) lat_col = next((c for c in lat_candidates if c in atl24_df.columns), None) elev_col = next((c for c in elev_candidates if c in atl24_df.columns), None) if lon_col is None or lat_col is None or elev_col is None: - return pd.DataFrame(columns=['longitude', 'latitude', 'seafloor_elevation_atl24']) + return pd.DataFrame( + columns=["longitude", "latitude", "seafloor_elevation_atl24"] + ) - out_df = pd.DataFrame({ - 'longitude': pd.to_numeric(atl24_df[lon_col], errors='coerce'), - 'latitude': pd.to_numeric(atl24_df[lat_col], errors='coerce'), - 'atl24_raw_bathy': pd.to_numeric(atl24_df[elev_col], errors='coerce') - }).dropna(subset=['longitude', 'latitude', 'atl24_raw_bathy']).copy() + out_df = ( + pd.DataFrame( + { + "longitude": pd.to_numeric(atl24_df[lon_col], errors="coerce"), + "latitude": pd.to_numeric(atl24_df[lat_col], errors="coerce"), + "atl24_raw_bathy": pd.to_numeric(atl24_df[elev_col], errors="coerce"), + } + ) + .dropna(subset=["longitude", "latitude", "atl24_raw_bathy"]) + .copy() + ) - if 'depth' in elev_col.lower() and ('elev' not in elev_col.lower()): - out_df['seafloor_elevation_atl24'] = -out_df['atl24_raw_bathy'].abs() + if "depth" in elev_col.lower() and ("elev" not in elev_col.lower()): + out_df["seafloor_elevation_atl24"] = -out_df["atl24_raw_bathy"].abs() else: - out_df['seafloor_elevation_atl24'] = out_df['atl24_raw_bathy'] + out_df["seafloor_elevation_atl24"] = out_df["atl24_raw_bathy"] - return out_df[['longitude', 'latitude', 'seafloor_elevation_atl24']] + return out_df[["longitude", "latitude", "seafloor_elevation_atl24"]] def process_seafloor_data_atl24( sea_photon_dataset, atl24_file_path, Ignore_Subsurface_Height_Thres, - max_match_distance_deg=0.01 + max_match_distance_deg=0.01, ): """ Match ATL24 bathymetry points to photons, then filter below seafloor and @@ -644,26 +764,32 @@ def process_seafloor_data_atl24( if atl24_points.empty: return sea_photon_dataset, 0 - photon_coords = 
sea_photon_dataset[['longitude', 'latitude']].to_numpy(dtype=float) - atl24_coords = atl24_points[['longitude', 'latitude']].to_numpy(dtype=float) + photon_coords = sea_photon_dataset[["longitude", "latitude"]].to_numpy(dtype=float) + atl24_coords = atl24_points[["longitude", "latitude"]].to_numpy(dtype=float) if len(photon_coords) == 0 or len(atl24_coords) == 0: return sea_photon_dataset, 0 tree = cKDTree(atl24_coords) - distances, indices = tree.query(photon_coords, k=1, distance_upper_bound=max_match_distance_deg) + distances, indices = tree.query( + photon_coords, k=1, distance_upper_bound=max_match_distance_deg + ) valid_match = np.isfinite(distances) & (indices < len(atl24_points)) matched_count = int(valid_match.sum()) if matched_count == 0: return sea_photon_dataset, 0 matched_dataset = sea_photon_dataset.copy() - matched_dataset['seafloor_elevation'] = np.nan - matched_dataset.loc[valid_match, 'seafloor_elevation'] = atl24_points['seafloor_elevation_atl24'].to_numpy()[indices[valid_match]] - matched_dataset = matched_dataset.dropna(subset=['seafloor_elevation']).copy() - matched_dataset['depth'] = -matched_dataset['seafloor_elevation'] - - above_floor = remove_photons_below_seafloor(matched_dataset) # step 13 - filtered_dataset = discard_shallow_seabed_bins(above_floor, abs(Ignore_Subsurface_Height_Thres)) # step 14 + matched_dataset["seafloor_elevation"] = np.nan + matched_dataset.loc[valid_match, "seafloor_elevation"] = atl24_points[ + "seafloor_elevation_atl24" + ].to_numpy()[indices[valid_match]] + matched_dataset = matched_dataset.dropna(subset=["seafloor_elevation"]).copy() + matched_dataset["depth"] = -matched_dataset["seafloor_elevation"] + + above_floor = remove_photons_below_seafloor(matched_dataset) # step 13 + filtered_dataset = discard_shallow_seabed_bins( + above_floor, abs(Ignore_Subsurface_Height_Thres) + ) # step 14 return filtered_dataset, matched_count @@ -715,7 +841,7 @@ def rebuild_and_refit_surface_after_refraction( 
water_temp_c=water_temp_c, wavelength_nm=wavelength_nm, ) - rebinned_corrected = horizontal_vertical_bin_dataset( # step 10 + rebinned_corrected = horizontal_vertical_bin_dataset( # step 10 corrected_full_dataset, horizontal_res, vertical_res ) return get_sea_surface_height_adaptive(rebinned_corrected) # step 11 @@ -728,7 +854,7 @@ def get_subsurface_photon( subsurface_thresh, Ignore_Subsurface_Height_Thres, use_atl24_filter=False, - atl24_file_path='', + atl24_file_path="", atl24_max_match_distance_deg=0.01, use_gebco_filter=False, apply_histogram_quality_filter=False, @@ -746,7 +872,7 @@ def get_subsurface_photon( apply_flattening=False, flattening_window_m=500, horizontal_res=500, - vertical_res=0.25 + vertical_res=0.25, ): """ Orchestrate per-beam subsurface photon extraction and all optional quality filters. @@ -772,8 +898,12 @@ def get_subsurface_photon( (applied unconditionally via Ignore_Subsurface_Height_Thres). """ # adaptive threshold - sea_surface_height, sea_surface_label, solo_sea_surface_label, subsurface_photon_dataset = \ - get_sea_surface_height_adaptive(binned_dataset_sea_surface) + ( + sea_surface_height, + sea_surface_label, + solo_sea_surface_label, + subsurface_photon_dataset, + ) = get_sea_surface_height_adaptive(binned_dataset_sea_surface) # Always compute per-bin surface sigma for quality flagging and wave-adaptive fit sigma_map = compute_surface_sigma(binned_dataset_sea_surface, sea_surface_height) @@ -787,7 +917,7 @@ def get_subsurface_photon( depth_min=histogram_quality_depth_min, depth_max=histogram_quality_depth_max, reference_depth_min=histogram_quality_ref_depth_min, - reference_depth_max=histogram_quality_ref_depth_max + reference_depth_max=histogram_quality_ref_depth_max, ) if apply_surface_sigma_filter: @@ -795,7 +925,7 @@ def get_subsurface_photon( binned_dataset_sea_surface, subsurface_photon_dataset, sea_surface_height, - sigma_max=surface_sigma_max + sigma_max=surface_sigma_max, ) if apply_refraction_correction: @@ -804,25 
+934,29 @@ def get_subsurface_photon( subsurface_photon_dataset, sea_surface_height, water_temp_c=refraction_water_temp_c, - wavelength_nm=refraction_wavelength_nm + wavelength_nm=refraction_wavelength_nm, ) if apply_post_refraction_refit: - sea_surface_height, sea_surface_label, solo_sea_surface_label, subsurface_photon_dataset = \ - rebuild_and_refit_surface_after_refraction( # steps 10 & 11 - binned_dataset_sea_surface, - sea_surface_height, - water_temp_c=refraction_water_temp_c, - wavelength_nm=refraction_wavelength_nm, - horizontal_res=horizontal_res, - vertical_res=vertical_res, - ) + ( + sea_surface_height, + sea_surface_label, + solo_sea_surface_label, + subsurface_photon_dataset, + ) = rebuild_and_refit_surface_after_refraction( # steps 10 & 11 + binned_dataset_sea_surface, + sea_surface_height, + water_temp_c=refraction_water_temp_c, + wavelength_nm=refraction_wavelength_nm, + horizontal_res=horizontal_res, + vertical_res=vertical_res, + ) # Apply minimum depth threshold unconditionally — keeps photons at height # >= Ignore_Subsurface_Height_Thres regardless of whether GEBCO/ATL24 is on. # Without this, deep noise photons (down to -70 m) contaminate the Beer's Law fit # and produce near-zero Kd values, especially at shallow turbid sites. 
subsurface_photon_dataset = subsurface_photon_dataset[ - subsurface_photon_dataset['photon_height'] >= Ignore_Subsurface_Height_Thres + subsurface_photon_dataset["photon_height"] >= Ignore_Subsurface_Height_Thres ].copy() if use_atl24_filter: @@ -830,13 +964,15 @@ def get_subsurface_photon( subsurface_photon_dataset, atl24_file_path, abs(Ignore_Subsurface_Height_Thres), - max_match_distance_deg=atl24_max_match_distance_deg + max_match_distance_deg=atl24_max_match_distance_deg, ) if atl24_match_count > 0: filtered_seafloor_subsurface_photon_dataset = atl24_filtered elif use_gebco_filter: filtered_seafloor_subsurface_photon_dataset = process_seafloor_data( - subsurface_photon_dataset, GEBCO_paths, abs(Ignore_Subsurface_Height_Thres) + subsurface_photon_dataset, + GEBCO_paths, + abs(Ignore_Subsurface_Height_Thres), ) else: filtered_seafloor_subsurface_photon_dataset = subsurface_photon_dataset @@ -849,24 +985,33 @@ def get_subsurface_photon( if apply_flattening: lat_bin_keys = list( - binned_dataset_sea_surface.groupby(['lat_bins'], observed=False).groups.keys() + binned_dataset_sea_surface.groupby( + ["lat_bins"], observed=False + ).groups.keys() ) filtered_seafloor_subsurface_photon_dataset = apply_sea_surface_flattening( filtered_seafloor_subsurface_photon_dataset, sea_surface_height, lat_bin_keys, horizontal_res=horizontal_res, - flattening_window_m=flattening_window_m + flattening_window_m=flattening_window_m, ) # Attach per-bin surface_sigma as a quality flag column - filtered_seafloor_subsurface_photon_dataset = filtered_seafloor_subsurface_photon_dataset.copy() + filtered_seafloor_subsurface_photon_dataset = ( + filtered_seafloor_subsurface_photon_dataset.copy() + ) if sigma_map: - filtered_seafloor_subsurface_photon_dataset['surface_sigma'] = \ - filtered_seafloor_subsurface_photon_dataset['lat_bins'].map(sigma_map) + filtered_seafloor_subsurface_photon_dataset["surface_sigma"] = ( + filtered_seafloor_subsurface_photon_dataset["lat_bins"].map(sigma_map) + ) 
else: - filtered_seafloor_subsurface_photon_dataset['surface_sigma'] = np.nan + filtered_seafloor_subsurface_photon_dataset["surface_sigma"] = np.nan - return sea_surface_height, sea_surface_label, filtered_seafloor_subsurface_photon_dataset + return ( + sea_surface_height, + sea_surface_label, + filtered_seafloor_subsurface_photon_dataset, + ) # Main processing function to apply the subsurface photon filtering beam-by-beam @@ -876,7 +1021,7 @@ def process_subsurface_photon_filtering( subsurface_thresh, Ignore_Subsurface_Height_Thres, use_atl24_filter=False, - atl24_file_path='', + atl24_file_path="", atl24_max_match_distance_deg=0.01, use_gebco_filter=False, apply_histogram_quality_filter=False, @@ -894,7 +1039,7 @@ def process_subsurface_photon_filtering( apply_flattening=False, flattening_window_m=500, horizontal_res=500, - vertical_res=0.25 + vertical_res=0.25, ): """ Beam-by-beam wrapper that calls get_subsurface_photon for every beam and @@ -910,37 +1055,40 @@ def process_subsurface_photon_filtering( filtered_beam_datasets = [] # Group the binned dataset by 'beam_id' and process each group separately - for beam_id, beam_data in binned_dataset_sea_surface.groupby('beam_id'): - print(f'Processing subsurface filtering for beam: {beam_id}') + for beam_id, beam_data in binned_dataset_sea_surface.groupby("beam_id"): + print(f"Processing subsurface filtering for beam: {beam_id}") # Apply get_subsurface_photon to the current beam's dataset - sea_surface_height, sea_surface_label, filtered_seafloor_subsurface_photon_dataset = \ - get_subsurface_photon( - beam_data, - GEBCO_paths, - subsurface_thresh, - Ignore_Subsurface_Height_Thres, - use_atl24_filter=use_atl24_filter, - atl24_file_path=atl24_file_path, - atl24_max_match_distance_deg=atl24_max_match_distance_deg, - use_gebco_filter=use_gebco_filter, - apply_histogram_quality_filter=apply_histogram_quality_filter, - histogram_quality_min_ratio=histogram_quality_min_ratio, - 
histogram_quality_depth_min=histogram_quality_depth_min, - histogram_quality_depth_max=histogram_quality_depth_max, - histogram_quality_ref_depth_min=histogram_quality_ref_depth_min, - histogram_quality_ref_depth_max=histogram_quality_ref_depth_max, - apply_surface_sigma_filter=apply_surface_sigma_filter, - surface_sigma_max=surface_sigma_max, - apply_refraction_correction=apply_refraction_correction, - refraction_water_temp_c=refraction_water_temp_c, - refraction_wavelength_nm=refraction_wavelength_nm, - apply_post_refraction_refit=apply_post_refraction_refit, - apply_flattening=apply_flattening, - flattening_window_m=flattening_window_m, - horizontal_res=horizontal_res, - vertical_res=vertical_res - ) + ( + sea_surface_height, + sea_surface_label, + filtered_seafloor_subsurface_photon_dataset, + ) = get_subsurface_photon( + beam_data, + GEBCO_paths, + subsurface_thresh, + Ignore_Subsurface_Height_Thres, + use_atl24_filter=use_atl24_filter, + atl24_file_path=atl24_file_path, + atl24_max_match_distance_deg=atl24_max_match_distance_deg, + use_gebco_filter=use_gebco_filter, + apply_histogram_quality_filter=apply_histogram_quality_filter, + histogram_quality_min_ratio=histogram_quality_min_ratio, + histogram_quality_depth_min=histogram_quality_depth_min, + histogram_quality_depth_max=histogram_quality_depth_max, + histogram_quality_ref_depth_min=histogram_quality_ref_depth_min, + histogram_quality_ref_depth_max=histogram_quality_ref_depth_max, + apply_surface_sigma_filter=apply_surface_sigma_filter, + surface_sigma_max=surface_sigma_max, + apply_refraction_correction=apply_refraction_correction, + refraction_water_temp_c=refraction_water_temp_c, + refraction_wavelength_nm=refraction_wavelength_nm, + apply_post_refraction_refit=apply_post_refraction_refit, + apply_flattening=apply_flattening, + flattening_window_m=flattening_window_m, + horizontal_res=horizontal_res, + vertical_res=vertical_res, + ) # Append each result to the lists 
sea_surface_heights.append(sea_surface_height) @@ -949,5 +1097,5 @@ def process_subsurface_photon_filtering( # Combine all filtered beam datasets into a single DataFrame combined_filtered_dataset = pd.concat(filtered_beam_datasets, ignore_index=True) - + return sea_surface_heights, sea_surface_labels, combined_filtered_dataset diff --git a/icesat-2_kdph_py/kd_utils/data_processing.py b/icesat-2_kdph_py/kd_utils/data_processing.py index 388ce31..f21217a 100644 --- a/icesat-2_kdph_py/kd_utils/data_processing.py +++ b/icesat-2_kdph_py/kd_utils/data_processing.py @@ -1,42 +1,28 @@ # utils/data_processing.py -import re -import os import io -import time -import math -import h5py import logging -import netCDF4 -import numpy as np +import math +import os +import re + import geopandas as gpd +import h5py +# from pyproj import Transformer +import numpy as np import pandas as pd -from datetime import datetime -import matplotlib.pyplot as plt -from sklearn.preprocessing import StandardScaler, MinMaxScaler +from pyproj import Proj, Transformer from scipy.spatial import ConvexHull -import argparse -import subprocess # import fiona -import utm -import pyproj -from pyproj import Transformer, Proj - -# from pyproj import Transformer -from matplotlib.widgets import LassoSelector -from matplotlib.path import Path - -from sklearn.cluster import DBSCAN from .interpolation import * - logger = logging.getLogger(__name__) -#this function from icesat2_toolkit +# this function from icesat2_toolkit # PURPOSE: read ICESat-2 ATL03 HDF5 data files def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): """ @@ -62,9 +48,9 @@ def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): """ # Open the HDF5 file for reading if isinstance(FILENAME, io.IOBase): - fileID = h5py.File(FILENAME, 'r') + fileID = h5py.File(FILENAME, "r") else: - fileID = h5py.File(os.path.expanduser(FILENAME), 'r') + fileID = h5py.File(os.path.expanduser(FILENAME), "r") # Output HDF5 file information 
logging.info(fileID.filename) @@ -76,12 +62,12 @@ def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): # read each input beam within the file IS2_atl03_beams = [] - for gtx in [k for k in fileID.keys() if bool(re.match(r'gt\d[lr]',k))]: + for gtx in [k for k in fileID.keys() if bool(re.match(r"gt\d[lr]", k))]: # check if subsetted beam contains data # check in both the geolocation and heights groups try: - fileID[gtx]['geolocation']['segment_id'] - fileID[gtx]['heights']['delta_time'] + fileID[gtx]["geolocation"]["segment_id"] + fileID[gtx]["heights"]["delta_time"] except KeyError: pass else: @@ -89,82 +75,81 @@ def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): # for each included beam for gtx in IS2_atl03_beams: - # ------------------------------------------- # 1. make sure the beam-level dict exists IS2_atl03_attrs.setdefault(gtx, {}) # 2. always save the two “must-have” attributes - for key in ('atlas_beam_type', 'atlas_spot_number'): + for key in ("atlas_beam_type", "atlas_spot_number"): IS2_atl03_attrs[gtx][key] = fileID[gtx].attrs[key] - + # get each HDF5 variable IS2_atl03_mds[gtx] = {} - IS2_atl03_mds[gtx]['heights'] = {} - IS2_atl03_mds[gtx]['geolocation'] = {} - IS2_atl03_mds[gtx]['bckgrd_atlas'] = {} - IS2_atl03_mds[gtx]['geophys_corr'] = {} + IS2_atl03_mds[gtx]["heights"] = {} + IS2_atl03_mds[gtx]["geolocation"] = {} + IS2_atl03_mds[gtx]["bckgrd_atlas"] = {} + IS2_atl03_mds[gtx]["geophys_corr"] = {} # ICESat-2 Measurement Group - for key,val in fileID[gtx]['heights'].items(): - IS2_atl03_mds[gtx]['heights'][key] = val[:] + for key, val in fileID[gtx]["heights"].items(): + IS2_atl03_mds[gtx]["heights"][key] = val[:] # ICESat-2 Geolocation Group - for key,val in fileID[gtx]['geolocation'].items(): - IS2_atl03_mds[gtx]['geolocation'][key] = val[:] + for key, val in fileID[gtx]["geolocation"].items(): + IS2_atl03_mds[gtx]["geolocation"][key] = val[:] # ICESat-2 Background Photon Rate Group - for key,val in fileID[gtx]['bckgrd_atlas'].items(): - 
IS2_atl03_mds[gtx]['bckgrd_atlas'][key] = val[:] + for key, val in fileID[gtx]["bckgrd_atlas"].items(): + IS2_atl03_mds[gtx]["bckgrd_atlas"][key] = val[:] # ICESat-2 Geophysical Corrections Group: Values for tides (ocean, # solid earth, pole, load, and equilibrium), inverted barometer (IB) # effects, and range corrections for tropospheric delays - for key,val in fileID[gtx]['geophys_corr'].items(): - IS2_atl03_mds[gtx]['geophys_corr'][key] = val[:] + for key, val in fileID[gtx]["geophys_corr"].items(): + IS2_atl03_mds[gtx]["geophys_corr"][key] = val[:] # Getting attributes of included variables if ATTRIBUTES: # Getting attributes of IS2_atl03_mds beam variables IS2_atl03_attrs[gtx] = {} - IS2_atl03_attrs[gtx]['heights'] = {} - IS2_atl03_attrs[gtx]['geolocation'] = {} - IS2_atl03_attrs[gtx]['bckgrd_atlas'] = {} - IS2_atl03_attrs[gtx]['geophys_corr'] = {} - + IS2_atl03_attrs[gtx]["heights"] = {} + IS2_atl03_attrs[gtx]["geolocation"] = {} + IS2_atl03_attrs[gtx]["bckgrd_atlas"] = {} + IS2_atl03_attrs[gtx]["geophys_corr"] = {} + # Global Group Attributes - for att_name,att_val in fileID[gtx].attrs.items(): + for att_name, att_val in fileID[gtx].attrs.items(): IS2_atl03_attrs[gtx][att_name] = att_val # ICESat-2 Measurement Group - for key,val in fileID[gtx]['heights'].items(): - IS2_atl03_attrs[gtx]['heights'][key] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]['heights'][key][att_name]=att_val + for key, val in fileID[gtx]["heights"].items(): + IS2_atl03_attrs[gtx]["heights"][key] = {} + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]["heights"][key][att_name] = att_val # ICESat-2 Geolocation Group - for key,val in fileID[gtx]['geolocation'].items(): - IS2_atl03_attrs[gtx]['geolocation'][key] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]['geolocation'][key][att_name]=att_val + for key, val in fileID[gtx]["geolocation"].items(): + IS2_atl03_attrs[gtx]["geolocation"][key] = {} + for att_name, att_val 
in val.attrs.items(): + IS2_atl03_attrs[gtx]["geolocation"][key][att_name] = att_val # ICESat-2 Background Photon Rate Group - for key,val in fileID[gtx]['bckgrd_atlas'].items(): - IS2_atl03_attrs[gtx]['bckgrd_atlas'][key] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]['bckgrd_atlas'][key][att_name]=att_val + for key, val in fileID[gtx]["bckgrd_atlas"].items(): + IS2_atl03_attrs[gtx]["bckgrd_atlas"][key] = {} + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]["bckgrd_atlas"][key][att_name] = att_val # ICESat-2 Geophysical Corrections Group - for key,val in fileID[gtx]['geophys_corr'].items(): - IS2_atl03_attrs[gtx]['geophys_corr'][key] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]['geophys_corr'][key][att_name]=att_val + for key, val in fileID[gtx]["geophys_corr"].items(): + IS2_atl03_attrs[gtx]["geophys_corr"][key] = {} + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]["geophys_corr"][key][att_name] = att_val # ICESat-2 spacecraft orientation at time - IS2_atl03_mds['orbit_info'] = {} - IS2_atl03_attrs['orbit_info'] = {} - for key,val in fileID['orbit_info'].items(): - IS2_atl03_mds['orbit_info'][key] = val[:] + IS2_atl03_mds["orbit_info"] = {} + IS2_atl03_attrs["orbit_info"] = {} + for key, val in fileID["orbit_info"].items(): + IS2_atl03_mds["orbit_info"][key] = val[:] # Getting attributes of group and included variables if ATTRIBUTES: # Global Group Attributes - for att_name,att_val in fileID['orbit_info'].attrs.items(): - IS2_atl03_attrs['orbit_info'][att_name] = att_val + for att_name, att_val in fileID["orbit_info"].attrs.items(): + IS2_atl03_attrs["orbit_info"][att_name] = att_val # Variable Attributes - IS2_atl03_attrs['orbit_info'][key] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs['orbit_info'][key][att_name] = att_val + IS2_atl03_attrs["orbit_info"][key] = {} + for att_name, att_val in val.attrs.items(): + 
IS2_atl03_attrs["orbit_info"][key][att_name] = att_val # information ancillary to the data product # number of GPS seconds between the GPS epoch (1980-01-06T00:00:00Z UTC) @@ -172,91 +157,108 @@ def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): # Add this value to delta time parameters to compute full gps_seconds # could alternatively use the Julian day of the ATLAS SDP epoch: 2458119.5 # and add leap seconds since 2018-01-01T00:00:00Z UTC (ATLAS SDP epoch) - IS2_atl03_mds['ancillary_data'] = {} - IS2_atl03_attrs['ancillary_data'] = {} - ancillary_keys = ['atlas_sdp_gps_epoch','data_end_utc','data_start_utc', - 'end_cycle','end_geoseg','end_gpssow','end_gpsweek','end_orbit', - 'end_region','end_rgt','granule_end_utc','granule_start_utc','release', - 'start_cycle','start_geoseg','start_gpssow','start_gpsweek', - 'start_orbit','start_region','start_rgt','version'] + IS2_atl03_mds["ancillary_data"] = {} + IS2_atl03_attrs["ancillary_data"] = {} + ancillary_keys = [ + "atlas_sdp_gps_epoch", + "data_end_utc", + "data_start_utc", + "end_cycle", + "end_geoseg", + "end_gpssow", + "end_gpsweek", + "end_orbit", + "end_region", + "end_rgt", + "granule_end_utc", + "granule_start_utc", + "release", + "start_cycle", + "start_geoseg", + "start_gpssow", + "start_gpsweek", + "start_orbit", + "start_region", + "start_rgt", + "version", + ] for key in ancillary_keys: # get each HDF5 variable - IS2_atl03_mds['ancillary_data'][key] = fileID['ancillary_data'][key][:] + IS2_atl03_mds["ancillary_data"][key] = fileID["ancillary_data"][key][:] # Getting attributes of group and included variables if ATTRIBUTES: # Variable Attributes - IS2_atl03_attrs['ancillary_data'][key] = {} - for att_name,att_val in fileID['ancillary_data'][key].attrs.items(): - IS2_atl03_attrs['ancillary_data'][key][att_name] = att_val + IS2_atl03_attrs["ancillary_data"][key] = {} + for att_name, att_val in fileID["ancillary_data"][key].attrs.items(): + IS2_atl03_attrs["ancillary_data"][key][att_name] = att_val # 
transmit-echo-path (tep) parameters - IS2_atl03_mds['ancillary_data']['tep'] = {} - IS2_atl03_attrs['ancillary_data']['tep'] = {} - for key,val in fileID['ancillary_data']['tep'].items(): + IS2_atl03_mds["ancillary_data"]["tep"] = {} + IS2_atl03_attrs["ancillary_data"]["tep"] = {} + for key, val in fileID["ancillary_data"]["tep"].items(): # get each HDF5 variable - IS2_atl03_mds['ancillary_data']['tep'][key] = val[:] + IS2_atl03_mds["ancillary_data"]["tep"][key] = val[:] # Getting attributes of group and included variables if ATTRIBUTES: # Variable Attributes - IS2_atl03_attrs['ancillary_data']['tep'][key] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs['ancillary_data']['tep'][key][att_name] = att_val + IS2_atl03_attrs["ancillary_data"]["tep"][key] = {} + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs["ancillary_data"]["tep"][key][att_name] = att_val # channel dead time and first photon bias derived from ATLAS calibration - cal1,cal2 = ('ancillary_data','calibrations') - for var in ['dead_time','first_photon_bias']: + cal1, cal2 = ("ancillary_data", "calibrations") + for var in ["dead_time", "first_photon_bias"]: IS2_atl03_mds[cal1][var] = {} IS2_atl03_attrs[cal1][var] = {} - for key,val in fileID[cal1][cal2][var].items(): + for key, val in fileID[cal1][cal2][var].items(): # get each HDF5 variable if isinstance(val, h5py.Dataset): IS2_atl03_mds[cal1][var][key] = val[:] elif isinstance(val, h5py.Group): IS2_atl03_mds[cal1][var][key] = {} - for k,v in val.items(): + for k, v in val.items(): IS2_atl03_mds[cal1][var][key][k] = v[:] # Getting attributes of group and included variables if ATTRIBUTES: # Variable Attributes IS2_atl03_attrs[cal1][var][key] = {} - for att_name,att_val in val.attrs.items(): + for att_name, att_val in val.attrs.items(): IS2_atl03_attrs[cal1][var][key][att_name] = att_val if isinstance(val, h5py.Group): - for k,v in val.items(): + for k, v in val.items(): IS2_atl03_attrs[cal1][var][key][k] = {} - for 
att_name,att_val in val.attrs.items(): - IS2_atl03_attrs[cal1][var][key][k][att_name]=att_val + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs[cal1][var][key][k][att_name] = att_val # get ATLAS impulse response variables for the transmitter echo path (TEP) - tep1,tep2 = ('atlas_impulse_response','tep_histogram') + tep1, tep2 = ("atlas_impulse_response", "tep_histogram") IS2_atl03_mds[tep1] = {} IS2_atl03_attrs[tep1] = {} - for pce in ['pce1_spot1','pce2_spot3']: - IS2_atl03_mds[tep1][pce] = {tep2:{}} - IS2_atl03_attrs[tep1][pce] = {tep2:{}} + for pce in ["pce1_spot1", "pce2_spot3"]: + IS2_atl03_mds[tep1][pce] = {tep2: {}} + IS2_atl03_attrs[tep1][pce] = {tep2: {}} # for each TEP variable - for key,val in fileID[tep1][pce][tep2].items(): + for key, val in fileID[tep1][pce][tep2].items(): IS2_atl03_mds[tep1][pce][tep2][key] = val[:] # Getting attributes of included variables if ATTRIBUTES: # Global Group Attributes - for att_name,att_val in fileID[tep1][pce][tep2].attrs.items(): + for att_name, att_val in fileID[tep1][pce][tep2].attrs.items(): IS2_atl03_attrs[tep1][pce][tep2][att_name] = att_val # Variable Attributes IS2_atl03_attrs[tep1][pce][tep2][key] = {} - for att_name,att_val in val.attrs.items(): + for att_name, att_val in val.attrs.items(): IS2_atl03_attrs[tep1][pce][tep2][key][att_name] = att_val # Global File Attributes if ATTRIBUTES: - for att_name,att_val in fileID.attrs.items(): + for att_name, att_val in fileID.attrs.items(): IS2_atl03_attrs[att_name] = att_val # Closing the HDF5 file fileID.close() # Return the datasets and variables - return (IS2_atl03_mds,IS2_atl03_attrs,IS2_atl03_beams) - + return (IS2_atl03_mds, IS2_atl03_attrs, IS2_atl03_beams) # convert_wgs_to_utm function, see https://stackoverflow.com/a/40140326/4556479 @@ -264,22 +266,22 @@ def convert_wgs_to_utm(lon: float, lat: float): """Based on lat and lng, return best utm epsg-code""" utm_band = str((math.floor((lon + 180) / 6) % 60) + 1) if len(utm_band) == 1: - utm_band = 
'0' + utm_band + utm_band = "0" + utm_band if lat >= 0: - epsg_code = 'epsg:326' + utm_band + epsg_code = "epsg:326" + utm_band return epsg_code - epsg_code = 'epsg:327' + utm_band + epsg_code = "epsg:327" + utm_band return epsg_code def orthometric_correction(lat, lon, Z, epsg): # Define the Proj string - #To transform from WGS84 ellipsoidal height + # To transform from WGS84 ellipsoidal height # to EGM2008 orthometric height using PyProj # proj_string = '+proj=latlong +ellps=WGS84 +datum=WGS84 +vunits=m +no_defs +geoidgrids=egm2008-1.gtx' # # Define the Proj string for WGS84 ellipsoidal height # wgs84_proj_string = '+proj=latlong +ellps=WGS84 +datum=WGS84 +no_defs' - + # # Define the Proj string for EGM2008 orthometric height: egm08_25,egm2008-1 # egm2008_proj_string = \ # '+proj=latlong +ellps=WGS84 +datum=WGS84 +no_defs ' \ @@ -297,7 +299,6 @@ def orthometric_correction(lat, lon, Z, epsg): return Y_utm, X_utm, Z_egm08 - def load_data(file_path, read_attributes=False): """ Wrapper around read_granule that lets you decide @@ -313,10 +314,8 @@ def load_data(file_path, read_attributes=False): If True, read_granule returns the full IS2_atl03_attrs tree. Defaults to False. 
""" - - - return read_granule(file_path, ATTRIBUTES = read_attributes) + return read_granule(file_path, ATTRIBUTES=read_attributes) # requires that the input gdf has ranged index values i @@ -327,16 +326,15 @@ def load_data(file_path, read_attributes=False): def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): # try loading the shoreline data try: - ICESat2_GDF.insert(0, 'lat', ICESat2_GDF.geometry.y, False) - ICESat2_GDF.insert(0, 'lon', ICESat2_GDF.geometry.x, False) + ICESat2_GDF.insert(0, "lat", ICESat2_GDF.geometry.y, False) + ICESat2_GDF.insert(0, "lon", ICESat2_GDF.geometry.x, False) # allocation of to be used arrays zero_int_array = np.int64(np.zeros_like(ICESat2_GDF.geometry.x)) # Land flag initialized as -1 # If shorelines downloaded already, will be set to 0 or 1 - ICESat2_GDF.insert(0, 'is_land', - zero_int_array - 1, False) + ICESat2_GDF.insert(0, "is_land", zero_int_array - 1, False) # set the projection ICESat2_GDF.set_crs("EPSG:4326", inplace=True) @@ -347,13 +345,15 @@ def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): # engine str, 'fiona' or 'pyogrio' # somtime it gives error if using fiona # land_polygon_gdf = gpd.read_file(shoreline_data_path, bbox=ICESat2_GDF, engine='fiona') - land_polygon_gdf = gpd.read_file(shoreline_data_path, bbox=ICESat2_GDF, engine='pyogrio') + land_polygon_gdf = gpd.read_file( + shoreline_data_path, bbox=ICESat2_GDF, engine="pyogrio" + ) # continue with getting a new array of 0-or-1 labels for each photon land_point_labels = np.zeros_like(ICESat2_GDF.is_land.values) # update labels for points in the land polygons - pts_in_land = gpd.sjoin(ICESat2_GDF, land_polygon_gdf, predicate='within') + pts_in_land = gpd.sjoin(ICESat2_GDF, land_polygon_gdf, predicate="within") # get land or not bool value land_loc = ICESat2_GDF.index.isin(pts_in_land.index) @@ -365,7 +365,6 @@ def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): return land_point_labels except Exception as e: - print(e) 
print("Error loading shoreline data, returning -1s for is_land flag") @@ -376,37 +375,64 @@ def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): return -np.ones_like(ICESat2_GDF.is_land.values) -def create_photon_dataframe(lat_ph, lon_ph, ref_elev, ref_azimuth, geoid, h_ph, \ - quality_ph, is_land_label_interp1d, signal_conf_photon, x_atc, relative_AT_dist, - solar_elevation=None, background_rate=None): +def create_photon_dataframe( + lat_ph, + lon_ph, + ref_elev, + ref_azimuth, + geoid, + h_ph, + quality_ph, + is_land_label_interp1d, + signal_conf_photon, + x_atc, + relative_AT_dist, + solar_elevation=None, + background_rate=None, +): # Apply geoid correction to the photon heights to convert them from ellipsoidal to orthometric heights h_ph_geoid_cor = h_ph[:] - geoid[:] - - # Determine the EPSG code for the UTM zone based on the first photon's longitude and latitude + + # Determine the EPSG code for the UTM zone based on the first photon's longitude and latitude epsg_code = convert_wgs_to_utm(lon_ph[0], lat_ph[0]) - + # Perform orthometric correction to obtain UTM coordinates and corrected heights lat_utm, lon_utm, h_ph_cor = orthometric_correction(lat_ph, lon_ph, h_ph, epsg_code) - + # Put the data into the dataframe - sea_photon_dataset = pd.DataFrame({ - 'latitude': lat_ph, - 'longitude': lon_ph, - 'lat': lat_utm, - 'lon': lon_utm, - 'photon_height': h_ph_geoid_cor, - 'quality_ph': quality_ph, - 'is_land_label': is_land_label_interp1d, - 'photon_conf': signal_conf_photon, - 'ref_elevation': ref_elev, - 'ref_azimuth': ref_azimuth, - 'relative_AT_dist': relative_AT_dist - }, columns=['latitude', 'longitude', 'lat', 'lon', 'photon_height', 'quality_ph', 'is_land_label', 'photon_conf', 'ref_elevation', 'ref_azimuth', 'relative_AT_dist']) + sea_photon_dataset = pd.DataFrame( + { + "latitude": lat_ph, + "longitude": lon_ph, + "lat": lat_utm, + "lon": lon_utm, + "photon_height": h_ph_geoid_cor, + "quality_ph": quality_ph, + "is_land_label": 
is_land_label_interp1d, + "photon_conf": signal_conf_photon, + "ref_elevation": ref_elev, + "ref_azimuth": ref_azimuth, + "relative_AT_dist": relative_AT_dist, + }, + columns=[ + "latitude", + "longitude", + "lat", + "lon", + "photon_height", + "quality_ph", + "is_land_label", + "photon_conf", + "ref_elevation", + "ref_azimuth", + "relative_AT_dist", + ], + ) if solar_elevation is not None: - sea_photon_dataset['solar_elevation'] = solar_elevation + sea_photon_dataset["solar_elevation"] = solar_elevation if background_rate is not None: - sea_photon_dataset['background_rate'] = background_rate + sea_photon_dataset["background_rate"] = background_rate return sea_photon_dataset @@ -430,7 +456,13 @@ def interpolate_by_time(source_time, source_values, target_time): if unique_time.size < 2: return np.full_like(target_time, unique_values[0], dtype=float) - return np.interp(target_time, unique_time, unique_values, left=unique_values[0], right=unique_values[-1]) + return np.interp( + target_time, + unique_time, + unique_values, + left=unique_values[0], + right=unique_values[-1], + ) def apply_optional_solar_background_filter( @@ -439,7 +471,7 @@ def apply_optional_solar_background_filter( day_threshold=6.0, median_window_deg=10.0, noise_multiplier=1.5, - min_signal_conf=2 + min_signal_conf=2, ): """ Optionally filter photons under strong daytime solar background conditions. @@ -456,17 +488,21 @@ def apply_optional_solar_background_filter( if not enabled: return sea_photon_dataset - required_cols = {'solar_elevation', 'background_rate', 'photon_conf'} + required_cols = {"solar_elevation", "background_rate", "photon_conf"} if not required_cols.issubset(set(sea_photon_dataset.columns)): - logger.warning("Solar background filter skipped because required columns are missing.") + logger.warning( + "Solar background filter skipped because required columns are missing." 
+ ) return sea_photon_dataset filtered_dataset = sea_photon_dataset.copy() - daytime_mask = filtered_dataset['solar_elevation'] >= day_threshold + daytime_mask = filtered_dataset["solar_elevation"] >= day_threshold # Select valid daytime photons for rolling median computation daytime = filtered_dataset.loc[daytime_mask].copy() - valid = np.isfinite(daytime['background_rate']) & np.isfinite(daytime['solar_elevation']) + valid = np.isfinite(daytime["background_rate"]) & np.isfinite( + daytime["solar_elevation"] + ) daytime = daytime.loc[valid] if len(daytime) < 10: @@ -479,65 +515,66 @@ def apply_optional_solar_background_filter( # Clamp solar elevations to physical range [DAY_THRESHOLD, 90] to avoid # HDF5 fill values (e.g. 3.4e+38) blowing up the bin array. daytime = daytime.copy() - daytime['solar_elevation'] = daytime['solar_elevation'].clip( + daytime["solar_elevation"] = daytime["solar_elevation"].clip( lower=day_threshold, upper=90.0 ) # Vectorized degree-based rolling median: # Bin elevations into fine steps, compute one median per bin using # np.searchsorted on the sorted array, then map back to photons. 
- daytime = daytime.sort_values('solar_elevation') - elevations = daytime['solar_elevation'].values - bg_rates = daytime['background_rate'].values + daytime = daytime.sort_values("solar_elevation") + elevations = daytime["solar_elevation"].values + bg_rates = daytime["background_rate"].values half_window = median_window_deg / 2.0 bin_step = 0.1 # degrees — fine enough for smooth curve - bin_centers = np.arange( - elevations[0], elevations[-1] + bin_step, bin_step - ) + bin_centers = np.arange(elevations[0], elevations[-1] + bin_step, bin_step) bin_medians = np.empty(len(bin_centers)) for i in range(len(bin_centers)): - lo = np.searchsorted(elevations, bin_centers[i] - half_window, side='left') - hi = np.searchsorted(elevations, bin_centers[i] + half_window, side='right') + lo = np.searchsorted(elevations, bin_centers[i] - half_window, side="left") + hi = np.searchsorted(elevations, bin_centers[i] + half_window, side="right") if lo < hi: bin_medians[i] = np.median(bg_rates[lo:hi]) else: bin_medians[i] = np.nan # Map bin medians back to each photon via nearest-bin lookup (vectorized) - bin_indices = np.searchsorted(bin_centers, elevations, side='right') - 1 + bin_indices = np.searchsorted(bin_centers, elevations, side="right") - 1 bin_indices = np.clip(bin_indices, 0, len(bin_centers) - 1) expected_bg = bin_medians[bin_indices] # Map expected background onto the full DataFrame. Non-daytime and # non-finite daytime rows stay NaN, so they are never flagged noisy. 
- filtered_dataset['expected_bg'] = np.nan - filtered_dataset.loc[daytime.index, 'expected_bg'] = expected_bg + filtered_dataset["expected_bg"] = np.nan + filtered_dataset.loc[daytime.index, "expected_bg"] = expected_bg # Flag noisy daytime photons noisy_daytime_mask = daytime_mask & ( - filtered_dataset['background_rate'] >= filtered_dataset['expected_bg'] * noise_multiplier + filtered_dataset["background_rate"] + >= filtered_dataset["expected_bg"] * noise_multiplier + ) + keep_mask = (~noisy_daytime_mask) | ( + filtered_dataset["photon_conf"] >= min_signal_conf ) - keep_mask = (~noisy_daytime_mask) | (filtered_dataset['photon_conf'] >= min_signal_conf) # Clean up temporary column - filtered_dataset.drop(columns=['expected_bg'], inplace=True, errors='ignore') + filtered_dataset.drop(columns=["expected_bg"], inplace=True, errors="ignore") before_count = len(filtered_dataset) filtered_dataset = filtered_dataset.loc[keep_mask].copy() after_count = len(filtered_dataset) logger.info( "Solar background filter (multiplier=%.1f) removed %s photons (from %s to %s).", - noise_multiplier, before_count - after_count, before_count, after_count + noise_multiplier, + before_count - after_count, + before_count, + after_count, ) return filtered_dataset def apply_optional_ir_ap_filter( - sea_photon_dataset, - enabled=False, - quality_max=0, - min_signal_conf=0 + sea_photon_dataset, enabled=False, quality_max=0, min_signal_conf=0 ): """ Optionally remove photons flagged as afterpulse using the quality_ph bitmask. 
@@ -559,19 +596,23 @@ def apply_optional_ir_ap_filter( filtered_dataset = sea_photon_dataset.copy() keep_mask = np.ones(len(filtered_dataset), dtype=bool) - if 'quality_ph' in filtered_dataset.columns: - quality_values = pd.to_numeric(filtered_dataset['quality_ph'], errors='coerce').fillna(0).astype(int) - afterpulse_mask = (quality_values & 1) == 1 # bit 0 set = afterpulse + if "quality_ph" in filtered_dataset.columns: + quality_values = ( + pd.to_numeric(filtered_dataset["quality_ph"], errors="coerce") + .fillna(0) + .astype(int) + ) + afterpulse_mask = (quality_values & 1) == 1 # bit 0 set = afterpulse keep_mask &= ~afterpulse_mask logger.info( "IR/AP filter: %d photons flagged as afterpulse (quality_ph bit 0) will be removed.", - int(afterpulse_mask.sum()) + int(afterpulse_mask.sum()), ) else: logger.warning("IR/AP filter enabled, but quality_ph is missing.") - if 'photon_conf' in filtered_dataset.columns: - conf_values = pd.to_numeric(filtered_dataset['photon_conf'], errors='coerce') + if "photon_conf" in filtered_dataset.columns: + conf_values = pd.to_numeric(filtered_dataset["photon_conf"], errors="coerce") keep_mask &= conf_values >= min_signal_conf else: logger.warning("IR/AP filter enabled, but photon_conf is missing.") @@ -581,7 +622,9 @@ def apply_optional_ir_ap_filter( after_count = len(filtered_dataset) logger.info( "IR/AP proxy filter removed %s photons (from %s to %s).", - before_count - after_count, before_count, after_count + before_count - after_count, + before_count, + after_count, ) return filtered_dataset @@ -601,105 +644,126 @@ def Extract_sea_photons(IS2_atl03_mds, target_strong_beams, shoreline_data_path) Segment_ref_azimuth = {} background_rate = {} background_counts = {} - + # Initialize a list to store data for each beam beam_datasets = [] - - + # Loop over each strong beam in target_strong_beams for gtx in target_strong_beams: - print('Processing strong beam ID:', gtx) - + print("Processing strong beam ID:", gtx) + # Access the data for 
the current beam IS2_val = IS2_atl03_mds[gtx] - + # Initialize dictionaries to store segment data for the beam - Segment_ID[gtx] = IS2_val['geolocation']['segment_id'] + Segment_ID[gtx] = IS2_val["geolocation"]["segment_id"] n_seg = len(Segment_ID[gtx]) - n_pe, = IS2_val['heights']['delta_time'].shape - Segment_Index_begin[gtx] = IS2_val['geolocation']['ph_index_beg'] - 1 - Segment_PE_count[gtx] = IS2_val['geolocation']['segment_ph_cnt'] - Equator_Segment_Distance[gtx] = IS2_val['geolocation']['segment_dist_x'] - Segment_Length[gtx] = IS2_val['geolocation']['segment_length'] - delta_time = IS2_val['geolocation']['delta_time'] - segment_lat = IS2_val['geolocation']['reference_photon_lat'][:].copy() - segment_lon = IS2_val['geolocation']['reference_photon_lon'][:].copy() - ref_elev = IS2_val['geolocation']['ref_elev'][:].copy() - ref_azimuth = IS2_val['geolocation']['ref_azimuth'][:].copy() - geoid = IS2_val['geophys_corr']['geoid'][:].copy() - h_ph = IS2_val['heights']['h_ph'][:].copy() - photon_delta_time = IS2_val['heights']['delta_time'][:].copy() - lat_ph = IS2_val['heights']['lat_ph'][:].copy() - lon_ph = IS2_val['heights']['lon_ph'][:].copy() - signal_conf_photon = IS2_val['heights']['signal_conf_ph'][..., 0].copy() - x_atc = IS2_val['heights']['dist_ph_along'][:].copy() - y_atc = IS2_val['heights']['dist_ph_across'][:].copy() - quality_ph = IS2_val['heights']['quality_ph'] + (n_pe,) = IS2_val["heights"]["delta_time"].shape + Segment_Index_begin[gtx] = IS2_val["geolocation"]["ph_index_beg"] - 1 + Segment_PE_count[gtx] = IS2_val["geolocation"]["segment_ph_cnt"] + Equator_Segment_Distance[gtx] = IS2_val["geolocation"]["segment_dist_x"] + Segment_Length[gtx] = IS2_val["geolocation"]["segment_length"] + delta_time = IS2_val["geolocation"]["delta_time"] + segment_lat = IS2_val["geolocation"]["reference_photon_lat"][:].copy() + segment_lon = IS2_val["geolocation"]["reference_photon_lon"][:].copy() + ref_elev = IS2_val["geolocation"]["ref_elev"][:].copy() + 
ref_azimuth = IS2_val["geolocation"]["ref_azimuth"][:].copy() + geoid = IS2_val["geophys_corr"]["geoid"][:].copy() + h_ph = IS2_val["heights"]["h_ph"][:].copy() + photon_delta_time = IS2_val["heights"]["delta_time"][:].copy() + lat_ph = IS2_val["heights"]["lat_ph"][:].copy() + lon_ph = IS2_val["heights"]["lon_ph"][:].copy() + signal_conf_photon = IS2_val["heights"]["signal_conf_ph"][..., 0].copy() + x_atc = IS2_val["heights"]["dist_ph_along"][:].copy() + y_atc = IS2_val["heights"]["dist_ph_across"][:].copy() + quality_ph = IS2_val["heights"]["quality_ph"] # Optional variables for solar background sensitivity testing photon_solar_elevation = np.full_like(photon_delta_time, np.nan, dtype=float) photon_background_rate = np.full_like(photon_delta_time, np.nan, dtype=float) - if 'solar_elevation' in IS2_val['geolocation']: - segment_solar_elevation = IS2_val['geolocation']['solar_elevation'][:].copy() + if "solar_elevation" in IS2_val["geolocation"]: + segment_solar_elevation = IS2_val["geolocation"]["solar_elevation"][ + : + ].copy() photon_solar_elevation = interpolate_by_time( delta_time, segment_solar_elevation, photon_delta_time ) - if 'bckgrd_atlas' in IS2_val and 'bckgrd_rate' in IS2_val['bckgrd_atlas'] and 'delta_time' in IS2_val['bckgrd_atlas']: - bckgrd_rate = IS2_val['bckgrd_atlas']['bckgrd_rate'][:].copy() - bckgrd_time = IS2_val['bckgrd_atlas']['delta_time'][:].copy() + if ( + "bckgrd_atlas" in IS2_val + and "bckgrd_rate" in IS2_val["bckgrd_atlas"] + and "delta_time" in IS2_val["bckgrd_atlas"] + ): + bckgrd_rate = IS2_val["bckgrd_atlas"]["bckgrd_rate"][:].copy() + bckgrd_time = IS2_val["bckgrd_atlas"]["delta_time"][:].copy() photon_background_rate = interpolate_by_time( bckgrd_time, bckgrd_rate, photon_delta_time ) - # Adjust x_atc based on segment distances + # Adjust x_atc based on segment distances for seg_index in range(n_seg): idx = Segment_Index_begin[gtx][seg_index] cnt = Segment_PE_count[gtx][seg_index] - x_atc[idx:idx + cnt] += 
Equator_Segment_Distance[gtx][seg_index] + x_atc[idx : idx + cnt] += Equator_Segment_Distance[gtx][seg_index] - # Calculate relative distances + # Calculate relative distances relative_AT_dist = (x_atc - x_atc[0]) / 1000 - relative_seg_dist = (Equator_Segment_Distance[gtx] - Equator_Segment_Distance[gtx][0]) / 1000 - + relative_seg_dist = ( + Equator_Segment_Distance[gtx] - Equator_Segment_Distance[gtx][0] + ) / 1000 + # Create a GeoDataFrame to hold segment data for shoreline check - Segment_Is_Land['geometry'] = gpd.points_from_xy(segment_lon, segment_lat) + Segment_Is_Land["geometry"] = gpd.points_from_xy(segment_lon, segment_lat) ICESat2_GDF = gpd.GeoDataFrame(Segment_Is_Land, crs="EPSG:4326") - + # Determine if it is land by the land/sea mask - Segment_Is_Land_Labels = isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF) - ICESat2_GDF.loc[:, 'is_land'] = Segment_Is_Land_Labels + Segment_Is_Land_Labels = isolate_sea_land_photons( + shoreline_data_path, ICESat2_GDF + ) + ICESat2_GDF.loc[:, "is_land"] = Segment_Is_Land_Labels # Apply interpolations for required data - is_land_label_interp1d = apply_interpolation(interpolate_labels(segment_lat, Segment_Is_Land_Labels), lat_ph) - ph_ref_elev = apply_interpolation(interpolate_labels(segment_lat, ref_elev), lat_ph) - ph_ref_azimuth = apply_interpolation(interpolate_labels(segment_lat, ref_azimuth), lat_ph) + is_land_label_interp1d = apply_interpolation( + interpolate_labels(segment_lat, Segment_Is_Land_Labels), lat_ph + ) + ph_ref_elev = apply_interpolation( + interpolate_labels(segment_lat, ref_elev), lat_ph + ) + ph_ref_azimuth = apply_interpolation( + interpolate_labels(segment_lat, ref_azimuth), lat_ph + ) ph_geoid = apply_interpolation(interpolate_labels(segment_lat, geoid), lat_ph) - - # Create photon DataFrame for the current beam - sea_photon_dataset = create_photon_dataframe(lat_ph=lat_ph, lon_ph=lon_ph, - ref_elev=ph_ref_elev,ref_azimuth=ph_ref_azimuth, - geoid=ph_geoid,h_ph=h_ph, - 
quality_ph=quality_ph, - is_land_label_interp1d=is_land_label_interp1d, - signal_conf_photon=signal_conf_photon, - x_atc=x_atc, # Note: x_atc is not used in the function. Remove if unnecessary. - relative_AT_dist=relative_AT_dist, - solar_elevation=photon_solar_elevation, - background_rate=photon_background_rate) + # Create photon DataFrame for the current beam + sea_photon_dataset = create_photon_dataframe( + lat_ph=lat_ph, + lon_ph=lon_ph, + ref_elev=ph_ref_elev, + ref_azimuth=ph_ref_azimuth, + geoid=ph_geoid, + h_ph=h_ph, + quality_ph=quality_ph, + is_land_label_interp1d=is_land_label_interp1d, + signal_conf_photon=signal_conf_photon, + x_atc=x_atc, # Note: x_atc is not used in the function. Remove if unnecessary. + relative_AT_dist=relative_AT_dist, + solar_elevation=photon_solar_elevation, + background_rate=photon_background_rate, + ) # Filter out land photons - sea_photon_dataset = sea_photon_dataset[sea_photon_dataset['is_land_label'] != 1] - + sea_photon_dataset = sea_photon_dataset[ + sea_photon_dataset["is_land_label"] != 1 + ] + # Add a new column to indicate the beam ID - sea_photon_dataset['beam_id'] = gtx - + sea_photon_dataset["beam_id"] = gtx + # Append the processed dataset for the current beam to the list beam_datasets.append(sea_photon_dataset) - + # Concatenate all beam data into a single DataFrame all_beams_dataset = pd.concat(beam_datasets, ignore_index=True) - + return all_beams_dataset @@ -708,103 +772,116 @@ def filter_photon_dataset_by_hull_area(photon_dataset, hull_area_threshold=3000) Filters photon dataset based on ConvexHull area threshold and returns the filtered dataset, convex hull areas, and convex hull points. 
""" - lat_bins_grouped = photon_dataset.groupby('lat_bins', observed=False) + lat_bins_grouped = photon_dataset.groupby("lat_bins", observed=False) filtered_dataset = photon_dataset.copy() - + convex_hulls = {} convex_hull_areas = {} - + for lat_bin, bin_data in lat_bins_grouped: if len(bin_data) >= 3: - points = bin_data[['lat', 'photon_height']].to_numpy() + points = bin_data[["lat", "photon_height"]].to_numpy() hull = ConvexHull(points) area = hull.volume convex_hull_areas[lat_bin] = area - + # Store the ConvexHull points if area meets the threshold if area >= hull_area_threshold: convex_hulls[lat_bin] = points[hull.vertices] else: # Remove bins with hull area below the threshold - filtered_dataset = filtered_dataset[filtered_dataset['lat_bins'] != lat_bin] - - return filtered_dataset, convex_hull_areas, convex_hulls - - - - - - + filtered_dataset = filtered_dataset[ + filtered_dataset["lat_bins"] != lat_bin + ] + return filtered_dataset, convex_hull_areas, convex_hulls ########################## -##Discard Functions Below +##Discard Functions Below ########################## -# Extracts key information from filenames, +# Extracts key information from filenames, # which could include processed status, product ID, timestamps, identifiers, or metadata. 
def extract_file_params(file_path): - ''' + """ Example: input "processed_ATL06_20231115120000_20231115_001_12_data.h5" - Output: ('processed_', 'ATL06', '2023', '11', '15', '12', '00', '00', + Output: ('processed_', 'ATL06', '2023', '11', '15', '12', '00', '00', '2023', '11', '15', '001', '12', '_data') - ''' + """ # Defines a regex pattern to match strings in the file path - rx = re.compile(r'(processed_)?(ATL\d{2})_(\d{4})(\d{2})(\d{2})(\d{2})' - r'(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})_(\d{3})_(\d{2})(.*?).h5$') + rx = re.compile( + r"(processed_)?(ATL\d{2})_(\d{4})(\d{2})(\d{2})(\d{2})" + r"(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})_(\d{3})_(\d{2})(.*?).h5$" + ) # Searches for all matches of the regex pattern in the given params = rx.findall(file_path).pop() return params def create_mask(binned_data, sea_surface_height, seafloor_height, threshold_height): - ''' + """ create a mask for filtering sea surface, sea floor, and threshold_height - ''' - mask = (binned_data['height'] <= sea_surface_height) & \ - (binned_data['height'] >= seafloor_height) & \ - (binned_data['height'] >= threshold_height) + """ + mask = ( + (binned_data["height"] <= sea_surface_height) + & (binned_data["height"] >= seafloor_height) + & (binned_data["height"] >= threshold_height) + ) return mask -def generate_polygons_from_binned_data(lat, height, mask, lat_interval, height_interval): - ''' +def generate_polygons_from_binned_data( + lat, height, mask, lat_interval, height_interval +): + """ horizontal_vertical_bin_dataset Generate polygons for each bin after masking within a rectangle formed by height and latitude intervals - ''' + """ # Create latitude bins based on the specified interval lat_bins = np.arange(lat.min(), lat.max() + lat_interval, lat_interval) - + # Create height bins within the range [-12, 2] based on the specified interval height_bins = np.arange(-12, 2 + height_interval, height_interval) - + # Identify which bin each masked latitude and height value belongs to lat_bin_indices 
= np.digitize(lat[mask], lat_bins) - + # Determine which height bin each masked point belongs to height_bin_indices = np.digitize(height[mask], height_bins) - + # Combine latitude and height bin indices for each point - combined_bins = list(zip(lat_bin_indices, height_bin_indices)) - - # Find unique bin combinations and count the number of points in each bin + combined_bins = list(zip(lat_bin_indices, height_bin_indices, strict=False)) + + # Find unique bin combinations and count the number of points in each bin unique_bins, counts = np.unique(combined_bins, axis=0, return_counts=True) - - # Filter bins to include only those within valid index ranges - valid_bins = [(bin[0], bin[1]) for bin in unique_bins if 1 <= bin[0] < len(lat_bins) and 1 <= bin[1] < len(height_bins)] + + # Filter bins to include only those within valid index ranges + valid_bins = [ + (bin[0], bin[1]) + for bin in unique_bins + if 1 <= bin[0] < len(lat_bins) and 1 <= bin[1] < len(height_bins) + ] polygons = [] for bin_lat, bin_height in valid_bins: - if bin_lat > 0 and bin_lat < len(lat_bins) and bin_height > 0 and bin_height < len(height_bins): - bin_points = np.array([(lat[mask][i], height[mask][i]) for i in range(sum(mask)) - if lat_bin_indices[i] == bin_lat and height_bin_indices[i] == bin_height]) + if ( + bin_lat > 0 + and bin_lat < len(lat_bins) + and bin_height > 0 + and bin_height < len(height_bins) + ): + bin_points = np.array( + [ + (lat[mask][i], height[mask][i]) + for i in range(sum(mask)) + if lat_bin_indices[i] == bin_lat + and height_bin_indices[i] == bin_height + ] + ) if len(bin_points) > 2: hull = ConvexHull(bin_points) polygons.append(bin_points[hull.vertices]) return lat_bins, height_bins, valid_bins, polygons - - - diff --git a/icesat-2_kdph_py/kd_utils/interpolation.py b/icesat-2_kdph_py/kd_utils/interpolation.py index 7670c89..23a4c24 100644 --- a/icesat-2_kdph_py/kd_utils/interpolation.py +++ b/icesat-2_kdph_py/kd_utils/interpolation.py @@ -2,18 +2,24 @@ import 
scipy.interpolate + def interpolate_labels(segment_lat, labels): model = scipy.interpolate.interp1d(segment_lat, labels, fill_value="extrapolate") return model + def apply_interpolation(model, lat_ph): return model(lat_ph) + def geoid_correction(lat_ph, segment_lat, geoid): model = interpolate_labels(segment_lat, geoid) return apply_interpolation(model, lat_ph) + def refraction_correction(lat_ph, segment_lat, ref_elev, ref_azimuth): elev_model = interpolate_labels(segment_lat, ref_elev) azimuth_model = interpolate_labels(segment_lat, ref_azimuth) - return apply_interpolation(elev_model, lat_ph), apply_interpolation(azimuth_model, lat_ph) + return apply_interpolation(elev_model, lat_ph), apply_interpolation( + azimuth_model, lat_ph + ) diff --git a/icesat-2_kdph_py/kd_utils/kd_utils.py b/icesat-2_kdph_py/kd_utils/kd_utils.py index 39c3687..5aa45ca 100644 --- a/icesat-2_kdph_py/kd_utils/kd_utils.py +++ b/icesat-2_kdph_py/kd_utils/kd_utils.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 ##### # This group of functions processes ICESat-2 data and creates a bathymetric model. @@ -14,40 +13,32 @@ # 8. Calculate bathymetric height (get_bath_height()) # 9. 
Produce figures (produce_figures()) ##### -import os -# os.environ["PROJ_LIB"] = r"C:\Users\wayne\anaconda3\Library\share\proj" +from datetime import datetime +# os.environ["PROJ_LIB"] = r"C:\Users\wayne\anaconda3\Library\share\proj" import io +import logging +import math import os import re import time -import math + +import geopandas as gpd import h5py -import logging +import hdbscan + +# from pyproj import Transformer +import matplotlib.pyplot as plt import netCDF4 import numpy as np -import geopandas as gpd - import pandas as pd -from datetime import datetime -import matplotlib.pyplot as plt -import hdbscan -from sklearn.preprocessing import StandardScaler, MinMaxScaler -import argparse -import subprocess # import fiona -import utm -import pyproj -from pyproj import Transformer, Proj - -# from pyproj import Transformer -from matplotlib.widgets import LassoSelector -from matplotlib.path import Path +from pyproj import Proj, Transformer +from sklearn.preprocessing import MinMaxScaler -from sklearn.cluster import DBSCAN -#this function from icesat2_toolkit +# this function from icesat2_toolkit # PURPOSE: read ICESat-2 ATL03 HDF5 data files def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): """ @@ -71,9 +62,9 @@ def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): """ # Open the HDF5 file for reading if isinstance(FILENAME, io.IOBase): - fileID = h5py.File(FILENAME, 'r') + fileID = h5py.File(FILENAME, "r") else: - fileID = h5py.File(os.path.expanduser(FILENAME), 'r') + fileID = h5py.File(os.path.expanduser(FILENAME), "r") # Output HDF5 file information logging.info(fileID.filename) @@ -85,12 +76,12 @@ def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): # read each input beam within the file IS2_atl03_beams = [] - for gtx in [k for k in fileID.keys() if bool(re.match(r'gt\d[lr]',k))]: + for gtx in [k for k in fileID.keys() if bool(re.match(r"gt\d[lr]", k))]: # check if subsetted beam contains data # check in both the geolocation and heights groups try: - 
fileID[gtx]['geolocation']['segment_id'] - fileID[gtx]['heights']['delta_time'] + fileID[gtx]["geolocation"]["segment_id"] + fileID[gtx]["heights"]["delta_time"] except KeyError: pass else: @@ -100,71 +91,71 @@ def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): for gtx in IS2_atl03_beams: # get each HDF5 variable IS2_atl03_mds[gtx] = {} - IS2_atl03_mds[gtx]['heights'] = {} - IS2_atl03_mds[gtx]['geolocation'] = {} - IS2_atl03_mds[gtx]['bckgrd_atlas'] = {} - IS2_atl03_mds[gtx]['geophys_corr'] = {} + IS2_atl03_mds[gtx]["heights"] = {} + IS2_atl03_mds[gtx]["geolocation"] = {} + IS2_atl03_mds[gtx]["bckgrd_atlas"] = {} + IS2_atl03_mds[gtx]["geophys_corr"] = {} # ICESat-2 Measurement Group - for key,val in fileID[gtx]['heights'].items(): - IS2_atl03_mds[gtx]['heights'][key] = val[:] + for key, val in fileID[gtx]["heights"].items(): + IS2_atl03_mds[gtx]["heights"][key] = val[:] # ICESat-2 Geolocation Group - for key,val in fileID[gtx]['geolocation'].items(): - IS2_atl03_mds[gtx]['geolocation'][key] = val[:] + for key, val in fileID[gtx]["geolocation"].items(): + IS2_atl03_mds[gtx]["geolocation"][key] = val[:] # ICESat-2 Background Photon Rate Group - for key,val in fileID[gtx]['bckgrd_atlas'].items(): - IS2_atl03_mds[gtx]['bckgrd_atlas'][key] = val[:] + for key, val in fileID[gtx]["bckgrd_atlas"].items(): + IS2_atl03_mds[gtx]["bckgrd_atlas"][key] = val[:] # ICESat-2 Geophysical Corrections Group: Values for tides (ocean, # solid earth, pole, load, and equilibrium), inverted barometer (IB) # effects, and range corrections for tropospheric delays - for key,val in fileID[gtx]['geophys_corr'].items(): - IS2_atl03_mds[gtx]['geophys_corr'][key] = val[:] + for key, val in fileID[gtx]["geophys_corr"].items(): + IS2_atl03_mds[gtx]["geophys_corr"][key] = val[:] # Getting attributes of included variables if ATTRIBUTES: # Getting attributes of IS2_atl03_mds beam variables IS2_atl03_attrs[gtx] = {} - IS2_atl03_attrs[gtx]['heights'] = {} - IS2_atl03_attrs[gtx]['geolocation'] = {} - 
IS2_atl03_attrs[gtx]['bckgrd_atlas'] = {} - IS2_atl03_attrs[gtx]['geophys_corr'] = {} + IS2_atl03_attrs[gtx]["heights"] = {} + IS2_atl03_attrs[gtx]["geolocation"] = {} + IS2_atl03_attrs[gtx]["bckgrd_atlas"] = {} + IS2_atl03_attrs[gtx]["geophys_corr"] = {} # Global Group Attributes - for att_name,att_val in fileID[gtx].attrs.items(): + for att_name, att_val in fileID[gtx].attrs.items(): IS2_atl03_attrs[gtx][att_name] = att_val # ICESat-2 Measurement Group - for key,val in fileID[gtx]['heights'].items(): - IS2_atl03_attrs[gtx]['heights'][key] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]['heights'][key][att_name]=att_val + for key, val in fileID[gtx]["heights"].items(): + IS2_atl03_attrs[gtx]["heights"][key] = {} + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]["heights"][key][att_name] = att_val # ICESat-2 Geolocation Group - for key,val in fileID[gtx]['geolocation'].items(): - IS2_atl03_attrs[gtx]['geolocation'][key] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]['geolocation'][key][att_name]=att_val + for key, val in fileID[gtx]["geolocation"].items(): + IS2_atl03_attrs[gtx]["geolocation"][key] = {} + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]["geolocation"][key][att_name] = att_val # ICESat-2 Background Photon Rate Group - for key,val in fileID[gtx]['bckgrd_atlas'].items(): - IS2_atl03_attrs[gtx]['bckgrd_atlas'][key] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]['bckgrd_atlas'][key][att_name]=att_val + for key, val in fileID[gtx]["bckgrd_atlas"].items(): + IS2_atl03_attrs[gtx]["bckgrd_atlas"][key] = {} + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]["bckgrd_atlas"][key][att_name] = att_val # ICESat-2 Geophysical Corrections Group - for key,val in fileID[gtx]['geophys_corr'].items(): - IS2_atl03_attrs[gtx]['geophys_corr'][key] = {} - for att_name,att_val in val.attrs.items(): - 
IS2_atl03_attrs[gtx]['geophys_corr'][key][att_name]=att_val + for key, val in fileID[gtx]["geophys_corr"].items(): + IS2_atl03_attrs[gtx]["geophys_corr"][key] = {} + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]["geophys_corr"][key][att_name] = att_val # ICESat-2 spacecraft orientation at time - IS2_atl03_mds['orbit_info'] = {} - IS2_atl03_attrs['orbit_info'] = {} - for key,val in fileID['orbit_info'].items(): - IS2_atl03_mds['orbit_info'][key] = val[:] + IS2_atl03_mds["orbit_info"] = {} + IS2_atl03_attrs["orbit_info"] = {} + for key, val in fileID["orbit_info"].items(): + IS2_atl03_mds["orbit_info"][key] = val[:] # Getting attributes of group and included variables if ATTRIBUTES: # Global Group Attributes - for att_name,att_val in fileID['orbit_info'].attrs.items(): - IS2_atl03_attrs['orbit_info'][att_name] = att_val + for att_name, att_val in fileID["orbit_info"].attrs.items(): + IS2_atl03_attrs["orbit_info"][att_name] = att_val # Variable Attributes - IS2_atl03_attrs['orbit_info'][key] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs['orbit_info'][key][att_name] = att_val + IS2_atl03_attrs["orbit_info"][key] = {} + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs["orbit_info"][key][att_name] = att_val # information ancillary to the data product # number of GPS seconds between the GPS epoch (1980-01-06T00:00:00Z UTC) @@ -172,112 +163,131 @@ def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): # Add this value to delta time parameters to compute full gps_seconds # could alternatively use the Julian day of the ATLAS SDP epoch: 2458119.5 # and add leap seconds since 2018-01-01T00:00:00Z UTC (ATLAS SDP epoch) - IS2_atl03_mds['ancillary_data'] = {} - IS2_atl03_attrs['ancillary_data'] = {} - ancillary_keys = ['atlas_sdp_gps_epoch','data_end_utc','data_start_utc', - 'end_cycle','end_geoseg','end_gpssow','end_gpsweek','end_orbit', - 'end_region','end_rgt','granule_end_utc','granule_start_utc','release', - 
'start_cycle','start_geoseg','start_gpssow','start_gpsweek', - 'start_orbit','start_region','start_rgt','version'] + IS2_atl03_mds["ancillary_data"] = {} + IS2_atl03_attrs["ancillary_data"] = {} + ancillary_keys = [ + "atlas_sdp_gps_epoch", + "data_end_utc", + "data_start_utc", + "end_cycle", + "end_geoseg", + "end_gpssow", + "end_gpsweek", + "end_orbit", + "end_region", + "end_rgt", + "granule_end_utc", + "granule_start_utc", + "release", + "start_cycle", + "start_geoseg", + "start_gpssow", + "start_gpsweek", + "start_orbit", + "start_region", + "start_rgt", + "version", + ] for key in ancillary_keys: # get each HDF5 variable - IS2_atl03_mds['ancillary_data'][key] = fileID['ancillary_data'][key][:] + IS2_atl03_mds["ancillary_data"][key] = fileID["ancillary_data"][key][:] # Getting attributes of group and included variables if ATTRIBUTES: # Variable Attributes - IS2_atl03_attrs['ancillary_data'][key] = {} - for att_name,att_val in fileID['ancillary_data'][key].attrs.items(): - IS2_atl03_attrs['ancillary_data'][key][att_name] = att_val + IS2_atl03_attrs["ancillary_data"][key] = {} + for att_name, att_val in fileID["ancillary_data"][key].attrs.items(): + IS2_atl03_attrs["ancillary_data"][key][att_name] = att_val # transmit-echo-path (tep) parameters - IS2_atl03_mds['ancillary_data']['tep'] = {} - IS2_atl03_attrs['ancillary_data']['tep'] = {} - for key,val in fileID['ancillary_data']['tep'].items(): + IS2_atl03_mds["ancillary_data"]["tep"] = {} + IS2_atl03_attrs["ancillary_data"]["tep"] = {} + for key, val in fileID["ancillary_data"]["tep"].items(): # get each HDF5 variable - IS2_atl03_mds['ancillary_data']['tep'][key] = val[:] + IS2_atl03_mds["ancillary_data"]["tep"][key] = val[:] # Getting attributes of group and included variables if ATTRIBUTES: # Variable Attributes - IS2_atl03_attrs['ancillary_data']['tep'][key] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs['ancillary_data']['tep'][key][att_name] = att_val + 
IS2_atl03_attrs["ancillary_data"]["tep"][key] = {} + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs["ancillary_data"]["tep"][key][att_name] = att_val # channel dead time and first photon bias derived from ATLAS calibration - cal1,cal2 = ('ancillary_data','calibrations') - for var in ['dead_time','first_photon_bias']: + cal1, cal2 = ("ancillary_data", "calibrations") + for var in ["dead_time", "first_photon_bias"]: IS2_atl03_mds[cal1][var] = {} IS2_atl03_attrs[cal1][var] = {} - for key,val in fileID[cal1][cal2][var].items(): + for key, val in fileID[cal1][cal2][var].items(): # get each HDF5 variable if isinstance(val, h5py.Dataset): IS2_atl03_mds[cal1][var][key] = val[:] elif isinstance(val, h5py.Group): IS2_atl03_mds[cal1][var][key] = {} - for k,v in val.items(): + for k, v in val.items(): IS2_atl03_mds[cal1][var][key][k] = v[:] # Getting attributes of group and included variables if ATTRIBUTES: # Variable Attributes IS2_atl03_attrs[cal1][var][key] = {} - for att_name,att_val in val.attrs.items(): + for att_name, att_val in val.attrs.items(): IS2_atl03_attrs[cal1][var][key][att_name] = att_val if isinstance(val, h5py.Group): - for k,v in val.items(): + for k, v in val.items(): IS2_atl03_attrs[cal1][var][key][k] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs[cal1][var][key][k][att_name]=att_val + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs[cal1][var][key][k][att_name] = att_val # get ATLAS impulse response variables for the transmitter echo path (TEP) - tep1,tep2 = ('atlas_impulse_response','tep_histogram') + tep1, tep2 = ("atlas_impulse_response", "tep_histogram") IS2_atl03_mds[tep1] = {} IS2_atl03_attrs[tep1] = {} - for pce in ['pce1_spot1','pce2_spot3']: - IS2_atl03_mds[tep1][pce] = {tep2:{}} - IS2_atl03_attrs[tep1][pce] = {tep2:{}} + for pce in ["pce1_spot1", "pce2_spot3"]: + IS2_atl03_mds[tep1][pce] = {tep2: {}} + IS2_atl03_attrs[tep1][pce] = {tep2: {}} # for each TEP variable - for key,val in 
fileID[tep1][pce][tep2].items(): + for key, val in fileID[tep1][pce][tep2].items(): IS2_atl03_mds[tep1][pce][tep2][key] = val[:] # Getting attributes of included variables if ATTRIBUTES: # Global Group Attributes - for att_name,att_val in fileID[tep1][pce][tep2].attrs.items(): + for att_name, att_val in fileID[tep1][pce][tep2].attrs.items(): IS2_atl03_attrs[tep1][pce][tep2][att_name] = att_val # Variable Attributes IS2_atl03_attrs[tep1][pce][tep2][key] = {} - for att_name,att_val in val.attrs.items(): + for att_name, att_val in val.attrs.items(): IS2_atl03_attrs[tep1][pce][tep2][key][att_name] = att_val # Global File Attributes if ATTRIBUTES: - for att_name,att_val in fileID.attrs.items(): + for att_name, att_val in fileID.attrs.items(): IS2_atl03_attrs[att_name] = att_val # Closing the HDF5 file fileID.close() # Return the datasets and variables - return (IS2_atl03_mds,IS2_atl03_attrs,IS2_atl03_beams) + return (IS2_atl03_mds, IS2_atl03_attrs, IS2_atl03_beams) + # convert_wgs_to_utm function, see https://stackoverflow.com/a/40140326/4556479 def convert_wgs_to_utm(lon: float, lat: float): """Based on lat and lng, return best utm epsg-code""" utm_band = str((math.floor((lon + 180) / 6) % 60) + 1) if len(utm_band) == 1: - utm_band = '0' + utm_band + utm_band = "0" + utm_band if lat >= 0: - epsg_code = 'epsg:326' + utm_band + epsg_code = "epsg:326" + utm_band return epsg_code - epsg_code = 'epsg:327' + utm_band + epsg_code = "epsg:327" + utm_band return epsg_code def orthometric_correction(lat, lon, Z, epsg): # Define the Proj string - #To transform from WGS84 ellipsoidal height + # To transform from WGS84 ellipsoidal height # to EGM2008 orthometric height using PyProj # proj_string = '+proj=latlong +ellps=WGS84 +datum=WGS84 +vunits=m +no_defs +geoidgrids=egm2008-1.gtx' # # Define the Proj string for WGS84 ellipsoidal height # wgs84_proj_string = '+proj=latlong +ellps=WGS84 +datum=WGS84 +no_defs' - + # # Define the Proj string for EGM2008 orthometric height: 
egm08_25,egm2008-1 # egm2008_proj_string = \ # '+proj=latlong +ellps=WGS84 +datum=WGS84 +no_defs ' \ @@ -333,8 +343,10 @@ def get_atl03_seg_id(atl03_ph_index_beg, atl03_segment_id, atl03_heights_len): # Iterate through ph_index_beg, from the first to second to last number # and set the photons between ph_index_beg i to ph_index_beg i + 1 to # segment id i - for i in range(0, len(atl03_ph_index_beg) - 1): - ph_segment_id[atl03_ph_index_beg[i]:atl03_ph_index_beg[i + 1]] = atl03_segment_id[i] + for i in range(len(atl03_ph_index_beg) - 1): + ph_segment_id[atl03_ph_index_beg[i] : atl03_ph_index_beg[i + 1]] = ( + atl03_segment_id[i] + ) # Return list of segment_id at the photon level return ph_segment_id @@ -388,34 +400,51 @@ def ref_linear_interp(x, y): def horizontal_vertical_bin_dataset(dataset, lat_res, vertical_res): """Bin data along vertical and horizontal scales for later segmentation""" - + # Filter values within the range (-50, 10), because photons elevation outside this range will be real noise valid_range = (-50, 10) - valid_mask = (dataset['photon_height'] > valid_range[0]) & (dataset['photon_height'] < valid_range[1]) + valid_mask = (dataset["photon_height"] > valid_range[0]) & ( + dataset["photon_height"] < valid_range[1] + ) # Apply the valid_mask to filter unwanted values filtered_dataset = dataset[valid_mask] # Calculate the number of height bins - height_range = abs(filtered_dataset['photon_height'].max() - filtered_dataset['photon_height'].min()) - height_bin_number = max(1, round(height_range / vertical_res)) # Ensure at least one bin + height_range = abs( + filtered_dataset["photon_height"].max() + - filtered_dataset["photon_height"].min() + ) + height_bin_number = max( + 1, round(height_range / vertical_res) + ) # Ensure at least one bin # Calculate the number of latitude bins - lat_range = abs(filtered_dataset['lat'].max() - filtered_dataset['lat'].min()) + lat_range = abs(filtered_dataset["lat"].max() - filtered_dataset["lat"].min()) 
lat_bin_number = max(1, round(lat_range / lat_res)) # Ensure at least one bin # Create bins for latitude - lat_bins = pd.cut(filtered_dataset['lat'], bins=lat_bin_number, labels=np.arange(lat_bin_number)) + lat_bins = pd.cut( + filtered_dataset["lat"], bins=lat_bin_number, labels=np.arange(lat_bin_number) + ) # Create bins for height - height_bins = pd.cut(filtered_dataset['photon_height'], bins=height_bin_number, - labels=np.round(np.linspace(filtered_dataset['photon_height'].min(), - filtered_dataset['photon_height'].max(), - num=height_bin_number), decimals=1)) + height_bins = pd.cut( + filtered_dataset["photon_height"], + bins=height_bin_number, + labels=np.round( + np.linspace( + filtered_dataset["photon_height"].min(), + filtered_dataset["photon_height"].max(), + num=height_bin_number, + ), + decimals=1, + ), + ) # Add bins to dataframe using .loc to avoid SettingWithCopyWarning - filtered_dataset.loc[:, 'lat_bins'] = lat_bins - filtered_dataset.loc[:, 'height_bins'] = height_bins + filtered_dataset.loc[:, "lat_bins"] = lat_bins + filtered_dataset.loc[:, "height_bins"] = height_bins filtered_dataset = filtered_dataset.reset_index(drop=True) return filtered_dataset @@ -423,22 +452,29 @@ def horizontal_vertical_bin_dataset(dataset, lat_res, vertical_res): # thinking about grid searching to detect bathymetric directly # rather than bin and then search -def horizontal_vertical_grid_density_cal(dataset, lat_res, vertical_res, density_threshold): +def horizontal_vertical_grid_density_cal( + dataset, lat_res, vertical_res, density_threshold +): """Bin data along vertical and horizontal scales - and calculate high-density points using a grid method""" + and calculate high-density points using a grid method""" # Calculate the number of bins required both vertically - lat_bin_number = round(abs(dataset['lat'].min() - dataset['lat'].max()) / lat_res) + lat_bin_number = round(abs(dataset["lat"].min() - dataset["lat"].max()) / lat_res) # and horizontally based on 
resolution size - height_bin_number = round(abs(dataset['photon_height'].min() - dataset['photon_height'].max()) / vertical_res) + height_bin_number = round( + abs(dataset["photon_height"].min() - dataset["photon_height"].max()) + / vertical_res + ) # Create the grid grid = np.zeros((lat_bin_number, height_bin_number)) # Iterate over the dataset and assign points to cells for _, row in dataset.iterrows(): - lat_index = int((row['lat'] - dataset['lat'].min()) / lat_res) - height_index = int((row['photon_height'] - dataset['photon_height'].min()) / vertical_res) + lat_index = int((row["lat"] - dataset["lat"].min()) / lat_res) + height_index = int( + (row["photon_height"] - dataset["photon_height"].min()) / vertical_res + ) grid[lat_index, height_index] += 1 # Identify high-density cells @@ -448,10 +484,14 @@ def horizontal_vertical_grid_density_cal(dataset, lat_res, vertical_res, density dataset_copy = dataset.copy() # Assign the cell indices as bins to the dataset - dataset_copy['lat_bins'] = pd.cut(dataset['lat'], bins=lat_bin_number, - labels=np.arange(lat_bin_number)) - dataset_copy['height_bins'] = pd.cut(dataset['photon_height'], bins=height_bin_number, - labels=np.arange(height_bin_number)) + dataset_copy["lat_bins"] = pd.cut( + dataset["lat"], bins=lat_bin_number, labels=np.arange(lat_bin_number) + ) + dataset_copy["height_bins"] = pd.cut( + dataset["photon_height"], + bins=height_bin_number, + labels=np.arange(height_bin_number), + ) # Reset the index of the copied dataset dataset_copy = dataset_copy.reset_index(drop=True) @@ -465,17 +505,21 @@ def horizontal_bin_dataset(dataset, lat_res): for later segmentation""" # Calculate the number of bins required horizontally based on resolution size - lat_bin_number = round(abs(dataset['lat_utm'].min() - dataset['lat_utm'].max()) / lat_res) + lat_bin_number = round( + abs(dataset["lat_utm"].min() - dataset["lat_utm"].max()) / lat_res + ) # Cut lat bins # lat_bins = pd.cut(dataset['lat'], bins=lat_bin_number, 
labels=np.array(range(lat_bin_number))) - lat_bins = pd.cut(dataset['lat_utm'], bins=lat_bin_number, labels=np.array(range(lat_bin_number))) + lat_bins = pd.cut( + dataset["lat_utm"], bins=lat_bin_number, labels=np.array(range(lat_bin_number)) + ) # Create a copy of the dataset dataset_copy = dataset.copy() # Add lat bins to the dataframe - dataset_copy['lat_bins'] = lat_bins + dataset_copy["lat_bins"] = lat_bins # Reset the index of the copied dataset dataset_copy = dataset_copy.reset_index(drop=True) @@ -492,33 +536,34 @@ def get_rm_sea_surface_bin(binned_dataset): flag = 1 # group dataset by lat bins - grouped_data = binned_dataset.groupby(['lat_bins'], group_keys=True) + grouped_data = binned_dataset.groupby(["lat_bins"], group_keys=True) data_groups = dict(list(grouped_data)) # Loop through groups and return average sea height for k, v in data_groups.items(): - - lat_bin_average = v['lat'].mean() + lat_bin_average = v["lat"].mean() # Create new dataframe based on occurrence of photons per height bin - new_df = pd.DataFrame(v.groupby('height_bins',observed=False).count()) + new_df = pd.DataFrame(v.groupby("height_bins", observed=False).count()) # Return the bin with the highest count - largest_h_bin = new_df['lat'].argmax() + largest_h_bin = new_df["lat"].argmax() # Select the index of the bin with the highest count largest_h_index = new_df.index[largest_h_bin] # get all values below this bin # Use boolean indexing to select only the values below the peak bin - new_photon_array_without_peak_bin = v.loc[v['height_bins'] < largest_h_index] + new_photon_array_without_peak_bin = v.loc[v["height_bins"] < largest_h_index] if flag == 1: photon_array_without_peak_bin = new_photon_array_without_peak_bin flag = 2 else: - photon_array_without_peak_bin = photon_array_without_peak_bin.append(new_photon_array_without_peak_bin) + photon_array_without_peak_bin = photon_array_without_peak_bin.append( + new_photon_array_without_peak_bin + ) del new_df @@ -527,36 +572,37 @@ def 
get_rm_sea_surface_bin(binned_dataset): def get_sea_surface_height(binned_data, threshold): """Calculate mean sea height for easier calculation of depth and cleaner figures""" - - #set flag for the df save + + # set flag for the df save firstTimeIndex = True - + # Create sea height list sea_surface_height = [] mean_lat_bins_seq = [] sea_surface_subsurface_photons_ratio = [] - # Group dataset by latitude bins - grouped_data = binned_data.groupby(['lat_bins'], group_keys=True, observed=False) + grouped_data = binned_data.groupby(["lat_bins"], group_keys=True, observed=False) data_groups = dict(list(grouped_data)) # Loop through groups and return average sea height for k, v in data_groups.items(): # based on lat_utm - lat_bin_average = v['lat'].mean() + lat_bin_average = v["lat"].mean() # Create new dataframe based on occurrence of photons per height bin - new_df = pd.DataFrame(v.groupby(['height_bins'], observed=False).count()) + new_df = pd.DataFrame(v.groupby(["height_bins"], observed=False).count()) # Return the bin with the highest count - largest_h_bin = new_df['lat'].argmax() + largest_h_bin = new_df["lat"].argmax() # Select the index of the bin with the highest count largest_h_index = new_df.index[largest_h_bin] # Calculate the median value of all photon height values within this bin - photons_sea_surface = v.loc[v['height_bins'] == largest_h_index, 'photon_height'] + photons_sea_surface = v.loc[ + v["height_bins"] == largest_h_index, "photon_height" + ] lat_bin_sea_median = photons_sea_surface.median() # Append to sea height list @@ -567,119 +613,131 @@ def get_sea_surface_height(binned_data, threshold): # Get all photons below sea surface # to determine segment type of each subsurface water column # Use calculated sea height to determine photons at 0.5m below peak - photons_sea_surface_up = \ - v.loc[(v['photon_height'] > (lat_bin_sea_median - threshold)) & - (v['photon_height'] < (lat_bin_sea_median + 2*threshold))] + photons_sea_surface_up = v.loc[ + 
(v["photon_height"] > (lat_bin_sea_median - threshold)) + & (v["photon_height"] < (lat_bin_sea_median + 2 * threshold)) + ] # Calculate the photon ratio between surface and whole photons - if v['photon_height'].shape[0] > 0: - new_photons_ratio_sea_surface = \ - photons_sea_surface_up.shape[0] / v['photon_height'].shape[0] + if v["photon_height"].shape[0] > 0: + new_photons_ratio_sea_surface = ( + photons_sea_surface_up.shape[0] / v["photon_height"].shape[0] + ) else: new_photons_ratio_sea_surface = np.nan - sea_surface_subsurface_photons_ratio.append((1 - new_photons_ratio_sea_surface)) + sea_surface_subsurface_photons_ratio.append(1 - new_photons_ratio_sea_surface) # Filter out sea height bin values outside 2 SD of mean. mean = np.nanmean(sea_surface_height, axis=0) sd = np.nanstd(sea_surface_height, axis=0) - final_sea_surface_height = np.where((sea_surface_height > (mean + 2 * sd)) | - (sea_surface_height < (mean - 2 * sd)), - np.nan, - sea_surface_height).tolist() - - sea_surface_height_abnormal_label = np.where(np.isnan(final_sea_surface_height), 0, 1) + final_sea_surface_height = np.where( + (sea_surface_height > (mean + 2 * sd)) | (sea_surface_height < (mean - 2 * sd)), + np.nan, + sea_surface_height, + ).tolist() + + sea_surface_height_abnormal_label = np.where( + np.isnan(final_sea_surface_height), 0, 1 + ) # Determine label based on ratio of sea surface photons and subsurface photons - sea_surface_dominated_label = \ - np.where(np.array(sea_surface_subsurface_photons_ratio) >= 0.2, 0, 1) - + sea_surface_dominated_label = np.where( + np.array(sea_surface_subsurface_photons_ratio) >= 0.2, 0, 1 + ) + # Loop through groups again and return photons below 0.5m of sea height PhotonDFBelowThresholdPeak = pd.DataFrame() for i, (k, v) in enumerate(data_groups.items()): # Get all values below this bin - NewPhotonDFBelowThresholdPeak = \ - v.loc[v['photon_height'] < (final_sea_surface_height[i] - threshold)] - + NewPhotonDFBelowThresholdPeak = v.loc[ + 
v["photon_height"] < (final_sea_surface_height[i] - threshold) + ] + if firstTimeIndex: PhotonDFBelowThresholdPeak = NewPhotonDFBelowThresholdPeak - firstTimeIndex=False - else: - PhotonDFBelowThresholdPeak=\ - pd.concat([PhotonDFBelowThresholdPeak, NewPhotonDFBelowThresholdPeak]) - - - return final_sea_surface_height, \ - sea_surface_height_abnormal_label, \ - sea_surface_dominated_label,\ - PhotonDFBelowThresholdPeak + firstTimeIndex = False + else: + PhotonDFBelowThresholdPeak = pd.concat( + [PhotonDFBelowThresholdPeak, NewPhotonDFBelowThresholdPeak] + ) + + return ( + final_sea_surface_height, + sea_surface_height_abnormal_label, + sea_surface_dominated_label, + PhotonDFBelowThresholdPeak, + ) # -#Arbitrary cutoff below the max value - 0.5 m below peak +# Arbitrary cutoff below the max value - 0.5 m below peak # (we may also use 1 m but we can add that later) (apply to raw photon data, each of 6 beams) -def get_photon_below_sea_surface(binned_data,threshold): - '''Calculate mean sea height for easier calculation of depth and cleaner figures''' - - #set flag for the df save - firstTimeIndex=1 - +def get_photon_below_sea_surface(binned_data, threshold): + """Calculate mean sea height for easier calculation of depth and cleaner figures""" + + # set flag for the df save + firstTimeIndex = 1 + # Create sea height list sea_surface_height = [] -# mean_lat_bins_seq=[] - - grouped_data = binned_data.groupby(['lat_bins'], group_keys=True, observed=False) + # mean_lat_bins_seq=[] + + grouped_data = binned_data.groupby(["lat_bins"], group_keys=True, observed=False) data_groups = dict(list(grouped_data)) - + # Loop through groups and return average sea height - for k,v in data_groups.items(): - - lat_bin_average=v['lat_utm'].mean() - + for k, v in data_groups.items(): + lat_bin_average = v["lat_utm"].mean() + # Create new dataframe based on occurance of photons per height bin - new_df = pd.DataFrame(v.groupby(['height_bins'], observed=False).count()) - + new_df = 
pd.DataFrame(v.groupby(["height_bins"], observed=False).count()) + # Return the bin with the highest count - largest_h_bin = new_df['lat_utm'].argmax() - + largest_h_bin = new_df["lat_utm"].argmax() + # Select the index of the bin with the highest count largest_h = new_df.index[largest_h_bin] - + # Calculate the median value of all values within this bin - lat_bin_sea_median = v.loc[v['height_bins']==largest_h, 'photon_height'].median() - + lat_bin_sea_median = v.loc[ + v["height_bins"] == largest_h, "photon_height" + ].median() + # Append to sea height list sea_surface_height.append(lat_bin_sea_median) -# mean_lat_bins_seq.append(lat_bin_average) + # mean_lat_bins_seq.append(lat_bin_average) del new_df - + # Filter out sea height bin values outside 2 SD of mean. mean = np.nanmean(sea_surface_height, axis=0) sd = np.nanstd(sea_surface_height, axis=0) - Final_sea_height = np.where((sea_surface_height > (mean + 2*sd)) | (sea_surface_height < (mean - 2*sd)), np.nan, - sea_surface_height).tolist() - Abnormal_sea_height_label=np.where(np.isnan(Final_sea_height), 1, 0) - + Final_sea_height = np.where( + (sea_surface_height > (mean + 2 * sd)) | (sea_surface_height < (mean - 2 * sd)), + np.nan, + sea_surface_height, + ).tolist() + Abnormal_sea_height_label = np.where(np.isnan(Final_sea_height), 1, 0) # Loop through groups again and return photons below 0.5m of sea height - for k,v in data_groups.items(): - + for k, v in data_groups.items(): # get all values below this bin # Use calculated sea height to determine photons at 0.5m below peak - NewPhotonArrayBelowThresholdPeak= \ - v.loc[v['photon_height']<(sea_surface_height[k]-threshold)] - - if firstTimeIndex ==1: - PhotonArrayBelowThresholdPeak=NewPhotonArrayBelowThresholdPeak - firstTimeIndex=2 - - else: - PhotonArrayBelowThresholdPeak=PhotonArrayBelowThresholdPeak.append(NewPhotonArrayBelowThresholdPeak) - + NewPhotonArrayBelowThresholdPeak = v.loc[ + v["photon_height"] < (sea_surface_height[k] - threshold) + ] - return 
PhotonArrayBelowThresholdPeak + if firstTimeIndex == 1: + PhotonArrayBelowThresholdPeak = NewPhotonArrayBelowThresholdPeak + firstTimeIndex = 2 + else: + PhotonArrayBelowThresholdPeak = PhotonArrayBelowThresholdPeak.append( + NewPhotonArrayBelowThresholdPeak + ) + + return PhotonArrayBelowThresholdPeak # Function to get elevation for multiple points @@ -689,12 +747,15 @@ def get_seafloor_bathy_GEBCO_batch(lons, lats, raster, raster_data): rows, cols = np.array(rows), np.array(cols) # Ensure the indices are within bounds - valid_mask = (rows >= 0) & (rows < raster.height) & (cols >= 0) & (cols < raster.width) + valid_mask = ( + (rows >= 0) & (rows < raster.height) & (cols >= 0) & (cols < raster.width) + ) elevations = np.full(lons.shape, np.nan) elevations[valid_mask] = raster_data[rows[valid_mask], cols[valid_mask]] return elevations + def get_water_temp(date_year, date_month, date_day, latitude, longitude): """ Pull down surface water temperature along the track from the JPL GHRSST opendap website. 
@@ -712,7 +773,7 @@ def get_water_temp(date_year, date_month, date_day, latitude, longitude): month = date_month # date[6:8] day = date_day - day_of_year = str(datetime.strptime(date, '%Y%m%d').timetuple().tm_yday) + day_of_year = str(datetime.strptime(date, "%Y%m%d").timetuple().tm_yday) # Add zero in front of day of year string zero_day_of_year = day_of_year.zfill(3) @@ -723,8 +784,11 @@ def get_water_temp(date_year, date_month, date_day, latitude, longitude): new_lat_min = 0 new_lat_max = 17998 - new_lat = round(((old_lat - old_lat_min) / (old_lat_max - old_lat_min)) * - (new_lat_max - new_lat_min) + new_lat_min) + new_lat = round( + ((old_lat - old_lat_min) / (old_lat_max - old_lat_min)) + * (new_lat_max - new_lat_min) + + new_lat_min + ) # Calculate ratio of longitude from mid-point of IS2 track old_lon = longitude.mean() @@ -733,24 +797,41 @@ def get_water_temp(date_year, date_month, date_day, latitude, longitude): new_lon_min = 0 new_lon_max = 35999 - new_lon = round(((old_lon - old_lon_min) / (old_lon_max - old_lon_min)) * - (new_lon_max - new_lon_min) + new_lon_min) + new_lon = round( + ((old_lon - old_lon_min) / (old_lon_max - old_lon_min)) + * (new_lon_max - new_lon_min) + + new_lon_min + ) # Access the SST data using the JPL OpenDap interface - url = 'https://opendap.jpl.nasa.gov/opendap/OceanTemperature/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/' \ - + str(year) + '/' + str(zero_day_of_year) + '/' + str(date) \ - + '090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc' + url = ( + "https://opendap.jpl.nasa.gov/opendap/OceanTemperature/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/" + + str(year) + + "/" + + str(zero_day_of_year) + + "/" + + str(date) + + "090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc" + ) dataset = netCDF4.Dataset(url) # Access the data and convert the temperature from K to C - water_temp = dataset['analysed_sst'][0, new_lat, new_lon] - 273.15 + water_temp = dataset["analysed_sst"][0, new_lat, new_lon] - 273.15 return water_temp -def 
refraction_correction(WTemp, WSmodel, Wavelength, - Photon_ref_elev, Ph_ref_azimuth, - PhotonZ, PhotonX, PhotonY, Ph_Conf): +def refraction_correction( + WTemp, + WSmodel, + Wavelength, + Photon_ref_elev, + Ph_ref_azimuth, + PhotonZ, + PhotonX, + PhotonY, + Ph_Conf, +): """ WTemp; there is python library that pulls water temp data WSmodel is the value surface height @@ -780,18 +861,18 @@ def refraction_correction(WTemp, WSmodel, Wavelength, n1 = 1.00029 # refractive index of water - n2 = (a * WaterTemp ** 2) + (b * wl ** 2) + (c * WaterTemp) + (d * wl) + e + n2 = (a * WaterTemp**2) + (b * wl**2) + (c * WaterTemp) + (d * wl) + e # assumption is 0.25416 # This example is refractionCoef = 0.25449 # 1.00029 is refraction of air constant - correction_coef = (1 - (n1 / n2)) + correction_coef = 1 - (n1 / n2) # read photon ref_elev to get theta1 theta1 = np.pi / 2 - Photon_ref_elev # eq 1. Theta2 - theta2 = np.arcsin(((n1 * np.sin(theta1)) / n2)) + theta2 = np.arcsin((n1 * np.sin(theta1)) / n2) # eq 3. 
S # Approximate water Surface = 1.5 @@ -810,7 +891,7 @@ def refraction_correction(WTemp, WSmodel, Wavelength, phi = theta1 - theta2 # P is the difference between raw and corrected YZ location - P = np.sqrt(R ** 2 + S ** 2 - 2 * R * S * np.cos(phi)) + P = np.sqrt(R**2 + S**2 - 2 * R * S * np.cos(phi)) # alpha is an angle needed alpha = np.arcsin((R * np.sin(phi)) / P) @@ -834,25 +915,36 @@ def refraction_correction(WTemp, WSmodel, Wavelength, outY = PhotonY + DN outZ = PhotonZ + DZ - ''' + """ print('For selected Bathy photon:') print('lat = ', PhotonY[9000]) print('long = ', PhotonX[9000]) print('Raw Depth = ', PhotonZ[9000]) print('D = ', D[9000]) - + print('ref_elev = ', Photon_ref_elev[9000]) - + print('Delta East = ', DE[9000]) print('Delta North = ', DN[9000]) print('Delta Z = ', DZ[9000]) - ''' - return (outX, outY, outZ, Ph_Conf, PhotonX, PhotonY, PhotonZ, Ph_ref_azimuth, - Photon_ref_elev) # We are most interested in out-x, out-y, out-z - - -def get_bath_height_percentile_thresh(binned_data, percentile_thresh, sea_surface_height, vertical_res): - """ Detect bathymetric level per bin based on percentile_thresh """ + """ + return ( + outX, + outY, + outZ, + Ph_Conf, + PhotonX, + PhotonY, + PhotonZ, + Ph_ref_azimuth, + Photon_ref_elev, + ) # We are most interested in out-x, out-y, out-z + + +def get_bath_height_percentile_thresh( + binned_data, percentile_thresh, sea_surface_height, vertical_res +): + """Detect bathymetric level per bin based on percentile_thresh""" # Create sea height list bath_height = [] @@ -862,34 +954,42 @@ def get_bath_height_percentile_thresh(binned_data, percentile_thresh, sea_surfac # Group data by latitude # Filter out surface data that are two bins below median surface value calculated above - binned_data_bath = binned_data[(binned_data['photon_height'] < - sea_surface_height - (vertical_res * 2))] - grouped_data = binned_data_bath.groupby(['lat_bins'], group_keys=True) + binned_data_bath = binned_data[ + 
(binned_data["photon_height"] < sea_surface_height - (vertical_res * 2)) + ] + grouped_data = binned_data_bath.groupby(["lat_bins"], group_keys=True) data_groups = dict(list(grouped_data)) # Create a percentile threshold of photon counts in each grid, # grouped by both x and y axes. count_threshold = np.percentile( - binned_data.groupby(['lat_bins', 'height_bins']).size().reset_index().groupby('lat_bins')[[0]].max(), - percentile_thresh) + binned_data.groupby(["lat_bins", "height_bins"]) + .size() + .reset_index() + .groupby("lat_bins")[[0]] + .max(), + percentile_thresh, + ) # Loop through groups and return average bathy height for k, v in data_groups.items(): - new_df = pd.DataFrame(v.groupby('height_bins').count()) - bath_bin = new_df['lat'].argmax() + new_df = pd.DataFrame(v.groupby("height_bins").count()) + bath_bin = new_df["lat"].argmax() bath_bin_h = new_df.index[bath_bin] # Set threshold of photon counts per bin # here this script determines whether there is bathymetry signals by # the photon counts per bin below sea surface height - if new_df.iloc[bath_bin]['lat'] >= count_threshold: - - geo_photon_height.append(v.loc[v['height_bins'] == - bath_bin_h, 'cor_photon_height'].values) - geo_longitude.append(v.loc[v['height_bins'] == bath_bin_h, 'lon'].values) - geo_latitude.append(v.loc[v['height_bins'] == bath_bin_h, 'lat'].values) - - bath_bin_median = v.loc[v['height_bins'] == bath_bin_h, 'cor_photon_height'].median() + if new_df.iloc[bath_bin]["lat"] >= count_threshold: + geo_photon_height.append( + v.loc[v["height_bins"] == bath_bin_h, "cor_photon_height"].values + ) + geo_longitude.append(v.loc[v["height_bins"] == bath_bin_h, "lon"].values) + geo_latitude.append(v.loc[v["height_bins"] == bath_bin_h, "lat"].values) + + bath_bin_median = v.loc[ + v["height_bins"] == bath_bin_h, "cor_photon_height" + ].median() bath_height.append(bath_bin_median) del new_df @@ -902,17 +1002,23 @@ def get_bath_height_percentile_thresh(binned_data, percentile_thresh, 
sea_surfac geo_photon_list = np.concatenate(geo_photon_height).ravel().tolist() geo_depth = sea_surface_height - geo_photon_list geo_df = pd.DataFrame( - {'lon': geo_longitude_list, 'lat': geo_latitude_list, - 'photon_height': geo_photon_list, - 'depth': geo_depth}) + { + "lon": geo_longitude_list, + "lat": geo_latitude_list, + "photon_height": geo_photon_list, + "depth": geo_depth, + } + ) del geo_longitude_list, geo_latitude_list, geo_photon_list return bath_height, geo_df -def get_bath_height_HDBSCAN(lat_binned_data, percentile_thresh, sea_surface_height, vertical_res): - """ Calculate bathymetric level per lat bin based on horizontal resolution """ +def get_bath_height_HDBSCAN( + lat_binned_data, percentile_thresh, sea_surface_height, vertical_res +): + """Calculate bathymetric level per lat bin based on horizontal resolution""" # Create sea height list bath_height = [] geo_photon_height = [] @@ -922,15 +1028,15 @@ def get_bath_height_HDBSCAN(lat_binned_data, percentile_thresh, sea_surface_heig # Group data by latitude # Filter out surface data that are two bins (2 times height resolution) # below median sea surface value calculated above - binned_data_bath = lat_binned_data[(lat_binned_data['photon_height'] < - sea_surface_height - (vertical_res * 2))] + binned_data_bath = lat_binned_data[ + (lat_binned_data["photon_height"] < sea_surface_height - (vertical_res * 2)) + ] - grouped_data = binned_data_bath.groupby(['lat_bins'], group_keys=True) + grouped_data = binned_data_bath.groupby(["lat_bins"], group_keys=True) data_groups = dict(list(grouped_data)) # Loop through groups and return average bathymetric height for k, v in data_groups.items(): - # assign each group of dataset to a new dataframe new_df = pd.DataFrame(v) @@ -944,7 +1050,9 @@ def get_bath_height_HDBSCAN(lat_binned_data, percentile_thresh, sea_surface_heig del new_df continue - lat_height_pairs = list(zip(new_df['lat_utm'], new_df['cor_photon_height'])) + lat_height_pairs = list( + 
zip(new_df["lat_utm"], new_df["cor_photon_height"], strict=False) + ) # Convert to a numpy array for sklearn lat_height_pairs_array = np.array(lat_height_pairs) @@ -968,9 +1076,13 @@ def get_bath_height_HDBSCAN(lat_binned_data, percentile_thresh, sea_surface_heig continue # cluster = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, leaf_size=20) - cluster = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, - min_samples=3, cluster_selection_epsilon=20, - leaf_size=25, core_dist_n_jobs=-1) + cluster = hdbscan.HDBSCAN( + min_cluster_size=min_cluster_size, + min_samples=3, + cluster_selection_epsilon=20, + leaf_size=25, + core_dist_n_jobs=-1, + ) cluster.fit(data_scaled) # Get the labels assigned to each point by the HDBSCAN model @@ -980,7 +1092,7 @@ def get_bath_height_HDBSCAN(lat_binned_data, percentile_thresh, sea_surface_heig unique, counts = np.unique(labels, return_counts=True) # Create a dictionary that maps each cluster label to the count of points assigned to it - clusters_dict = dict(zip(unique, counts)) + clusters_dict = dict(zip(unique, counts, strict=False)) # Print the dictionary to see the size of each cluster print(clusters_dict) @@ -994,20 +1106,20 @@ def get_bath_height_HDBSCAN(lat_binned_data, percentile_thresh, sea_surface_heig else: # Add labels to the DataFrame new_df_copy = new_df.copy() - new_df_copy['cluster'] = labels + new_df_copy["cluster"] = labels # Subset the DataFrame to get only the data points in the largest cluster - bath_cluster_data = new_df[new_df_copy['cluster'] == max_cluster_label] + bath_cluster_data = new_df[new_df_copy["cluster"] == max_cluster_label] # calculate the median height value as the average bathymetric height - bath_bin_median = bath_cluster_data['cor_photon_height'].median() + bath_bin_median = bath_cluster_data["cor_photon_height"].median() bath_height.append(bath_bin_median) # Extract the longitude, latitude, and photon height for each lat bin # and add it to respective lists - 
geo_photon_height.append(bath_cluster_data['cor_photon_height'].values) - geo_longitude.append(bath_cluster_data['lon_utm'].values) - geo_latitude.append(bath_cluster_data['lat_utm'].values) + geo_photon_height.append(bath_cluster_data["cor_photon_height"].values) + geo_longitude.append(bath_cluster_data["lon_utm"].values) + geo_latitude.append(bath_cluster_data["lat_utm"].values) del new_df @@ -1021,12 +1133,17 @@ def get_bath_height_HDBSCAN(lat_binned_data, percentile_thresh, sea_surface_heig # Create a DataFrame geo_df = pd.DataFrame( - {'lon': geo_longitude_list, 'lat': geo_latitude_list, - 'photon_height': geo_photon_list, - 'depth': geo_depth.tolist()}) + { + "lon": geo_longitude_list, + "lat": geo_latitude_list, + "photon_height": geo_photon_list, + "depth": geo_depth.tolist(), + } + ) return bath_height, geo_df + # requires that the input gdf has ranged index values i # will need to change if index is changed to time or something # this currently checks point in polygon for EVERY point @@ -1035,16 +1152,15 @@ def get_bath_height_HDBSCAN(lat_binned_data, percentile_thresh, sea_surface_heig def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): # try loading the shoreline data try: - ICESat2_GDF.insert(0, 'lat', ICESat2_GDF.geometry.y, False) - ICESat2_GDF.insert(0, 'lon', ICESat2_GDF.geometry.x, False) + ICESat2_GDF.insert(0, "lat", ICESat2_GDF.geometry.y, False) + ICESat2_GDF.insert(0, "lon", ICESat2_GDF.geometry.x, False) # allocation of to be used arrays zero_int_array = np.int64(np.zeros_like(ICESat2_GDF.geometry.x)) # Land flag initialized as -1 # If shorelines downloaded already, will be set to 0 or 1 - ICESat2_GDF.insert(0, 'is_land', - zero_int_array - 1, False) + ICESat2_GDF.insert(0, "is_land", zero_int_array - 1, False) # set the projection ICESat2_GDF.set_crs("EPSG:4326", inplace=True) @@ -1055,13 +1171,15 @@ def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): # engine str, 'fiona' or 'pyogrio' # somtime it gives error if 
using fiona # land_polygon_gdf = gpd.read_file(shoreline_data_path, bbox=ICESat2_GDF, engine='fiona') - land_polygon_gdf = gpd.read_file(shoreline_data_path, bbox=ICESat2_GDF, engine='pyogrio') + land_polygon_gdf = gpd.read_file( + shoreline_data_path, bbox=ICESat2_GDF, engine="pyogrio" + ) # continue with getting a new array of 0-or-1 labels for each photon land_point_labels = np.zeros_like(ICESat2_GDF.is_land.values) # update labels for points in the land polygons - pts_in_land = gpd.sjoin(ICESat2_GDF, land_polygon_gdf, predicate='within') + pts_in_land = gpd.sjoin(ICESat2_GDF, land_polygon_gdf, predicate="within") # get land or not bool value land_loc = ICESat2_GDF.index.isin(pts_in_land.index) @@ -1073,7 +1191,6 @@ def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): return land_point_labels except Exception as e: - print(e) print("Error loading shoreline data, returning -1s for is_land flag") @@ -1084,29 +1201,44 @@ def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): return -np.ones_like(ICESat2_GDF.is_land.values) -# -def produce_figures(binned_data, bath_height, sea_height, solo_sea_surface_label, - y_limit_top, y_limit_bottom, percentile, file, geo_df, - ref_y, ref_z, beam, epsg_num): +def produce_figures( + binned_data, + bath_height, + sea_height, + solo_sea_surface_label, + y_limit_top, + y_limit_bottom, + percentile, + file, + geo_df, + ref_y, + ref_z, + beam, + epsg_num, +): """Create figures""" # Create bins for latitude - bath_x_axis_bins = np.linspace(binned_data.lat.min(), - binned_data.lat.max(), len(bath_height))+20 + bath_x_axis_bins = ( + np.linspace(binned_data.lat.min(), binned_data.lat.max(), len(bath_height)) + 20 + ) - sea_surface_x_axis_bins = np.linspace(binned_data.lat.min(), - binned_data.lat.max(), len(sea_height))+10 + sea_surface_x_axis_bins = ( + np.linspace(binned_data.lat.min(), binned_data.lat.max(), len(sea_height)) + 10 + ) # Create new dataframes for median values - bath_median_df = pd.DataFrame({'x': 
bath_x_axis_bins, 'y': bath_height}) + bath_median_df = pd.DataFrame({"x": bath_x_axis_bins, "y": bath_height}) # Create uniform sea surface based on median sea surface values and filter out surface breaching sea_height1 = [np.nanmedian(sea_height) if i == i else np.nan for i in sea_height] - sea_median_df = pd.DataFrame({'x': sea_surface_x_axis_bins, 'y': sea_height1}) + sea_median_df = pd.DataFrame({"x": sea_surface_x_axis_bins, "y": sea_height1}) # Create uniform solo sea surface label sea_surface_label = solo_sea_surface_label - sea_surface_label_df = pd.DataFrame({'x': sea_surface_x_axis_bins, 'y': sea_surface_label}) + sea_surface_label_df = pd.DataFrame( + {"x": sea_surface_x_axis_bins, "y": sea_surface_label} + ) idx_1 = np.where(sea_surface_label_df.y == 1) idx_0 = np.where(sea_surface_label_df.y == 0) @@ -1117,62 +1249,107 @@ def produce_figures(binned_data, bath_height, sea_height, solo_sea_surface_label # plt.scatter(x=binned_data.lat, # y = binned_data.photon_height, marker='o', lw=0, s=1, alpha = 0.8, # c = 'yellow', label = 'Raw photon height') - plt.scatter(ref_y, ref_z, s=0.5, alpha=0.1, c='black') - plt.scatter(geo_df.lat, geo_df.photon_height, s=0.8, marker = 'o', - alpha=0.1, c='red', label='Classified Photons') + plt.scatter(ref_y, ref_z, s=0.5, alpha=0.1, c="black") + plt.scatter( + geo_df.lat, + geo_df.photon_height, + s=0.8, + marker="o", + alpha=0.1, + c="red", + label="Classified Photons", + ) # plt.scatter(x=geo_df.lat, # y = geo_df.photon_height, marker='o', lw=0, s=0.8, # alpha = 0.8, c = 'black', label = 'Corrected photon bin') # Plot median values - plt.scatter(bath_median_df.x, bath_median_df.y, - marker='o', c='r', alpha=0.8, s=2, label='Median bathymetry') - - plt.scatter(sea_median_df.x, sea_median_df.y, - marker='o', c='b', alpha=1, s=2, label='Median sea surface') - - plt.scatter(sea_surface_label_df.iloc[idx_1].x, sea_surface_label_df.iloc[idx_1].y, - marker='o', c='pink', alpha=1, s=3, label='solo_sea_surface') - 
plt.scatter(sea_surface_label_df.iloc[idx_0].x, sea_surface_label_df.iloc[idx_0].y, - marker='o', c='g', alpha=1, s=3, label='non_solo_sea_surface') + plt.scatter( + bath_median_df.x, + bath_median_df.y, + marker="o", + c="r", + alpha=0.8, + s=2, + label="Median bathymetry", + ) + + plt.scatter( + sea_median_df.x, + sea_median_df.y, + marker="o", + c="b", + alpha=1, + s=2, + label="Median sea surface", + ) + + plt.scatter( + sea_surface_label_df.iloc[idx_1].x, + sea_surface_label_df.iloc[idx_1].y, + marker="o", + c="pink", + alpha=1, + s=3, + label="solo_sea_surface", + ) + plt.scatter( + sea_surface_label_df.iloc[idx_0].x, + sea_surface_label_df.iloc[idx_0].y, + marker="o", + c="g", + alpha=1, + s=3, + label="non_solo_sea_surface", + ) # Insert titles and subtitles - plt.title('Icesat2 Bathymetry\n' + file) - plt.xlabel('Latitude', fontsize=25) - plt.ylabel('Photon Height (m)', fontsize=25) + plt.title("Icesat2 Bathymetry\n" + file) + plt.xlabel("Latitude", fontsize=25) + plt.ylabel("Photon Height (m)", fontsize=25) plt.xticks(fontsize=16) plt.yticks(fontsize=16) - plt.legend(loc="upper left", prop={'size': 20}) + plt.legend(loc="upper left", prop={"size": 20}) # Limit the x and y axes using parameters plt.xlim(left=binned_data.lat.min(), right=binned_data.lat.max()) plt.ylim(top=y_limit_top, bottom=y_limit_bottom) timestr = time.strftime("%Y%m%d_%H%M%S") - file = file.replace('.h5', '') + file = file.replace(".h5", "") # Define where to save file plt.tight_layout() - plt.savefig("C:/Workstation/ICESat2_HLS/" + file + '_gt' + - str(beam) + '_' + str(percentile) + - '_EPSG' + str(epsg_num) + '_' + timestr + ".pdf") + plt.savefig( + "C:/Workstation/ICESat2_HLS/" + + file + + "_gt" + + str(beam) + + "_" + + str(percentile) + + "_EPSG" + + str(epsg_num) + + "_" + + timestr + + ".pdf" + ) # plt.show() # plt.close() # convert corrected locations back to wgs84 (useful to contain) - transformer = Transformer.from_crs("EPSG:" + str(epsg_num), - "EPSG:4326", always_xy=True) 
+ transformer = Transformer.from_crs( + "EPSG:" + str(epsg_num), "EPSG:4326", always_xy=True + ) print(transformer) - lon_wgs84, lat_wgs84 = transformer.transform( - geo_df.lon.values, geo_df.lat.values) + lon_wgs84, lat_wgs84 = transformer.transform(geo_df.lon.values, geo_df.lat.values) - geo_df['lon_wgs84'] = lon_wgs84 - geo_df['lat_wgs84'] = lat_wgs84 + geo_df["lon_wgs84"] = lon_wgs84 + geo_df["lat_wgs84"] = lat_wgs84 - geodf = gpd.GeoDataFrame(geo_df, - geometry=gpd.points_from_xy(geo_df.lon_wgs84, - geo_df.lat_wgs84)) + geodf = gpd.GeoDataFrame( + geo_df, geometry=gpd.points_from_xy(geo_df.lon_wgs84, geo_df.lat_wgs84) + ) geodf.set_crs(epsg=4326, inplace=True) diff --git a/icesat-2_kdph_py/kd_utils/sea_photons_analysis.py b/icesat-2_kdph_py/kd_utils/sea_photons_analysis.py index 9df2766..b0c30ec 100644 --- a/icesat-2_kdph_py/kd_utils/sea_photons_analysis.py +++ b/icesat-2_kdph_py/kd_utils/sea_photons_analysis.py @@ -1,12 +1,12 @@ +from datetime import datetime +import netCDF4 import numpy as np -import geopandas as gpd -from datetime import datetime import pandas as pd -import netCDF4 from scipy.signal import find_peaks from scipy.stats import norm + # Function to apply binning beam-by-beam by calling the function of horizontal_vertical_bin_dataset def process_sea_photon_binning(sea_photon_dataset, horizontal_res, vertical_res): """ @@ -21,18 +21,20 @@ def process_sea_photon_binning(sea_photon_dataset, horizontal_res, vertical_res) binned_beam_datasets = [] # Group the dataset by 'beam_id' and process each group separately - for beam_id, beam_data in sea_photon_dataset.groupby('beam_id'): - print(f'Processing binning for beam: {beam_id}') + for beam_id, beam_data in sea_photon_dataset.groupby("beam_id"): + print(f"Processing binning for beam: {beam_id}") # Apply binning to the current beam dataset - binned_beam_data = horizontal_vertical_bin_dataset(beam_data, horizontal_res, vertical_res) + binned_beam_data = horizontal_vertical_bin_dataset( + beam_data, 
horizontal_res, vertical_res + ) # Append the binned data for the current beam to the list binned_beam_datasets.append(binned_beam_data) # Combine all binned beam datasets into a single DataFrame binned_dataset_sea_surface = pd.concat(binned_beam_datasets, ignore_index=True) - + return binned_dataset_sea_surface @@ -48,35 +50,52 @@ def horizontal_vertical_bin_dataset(dataset, lat_res, vertical_res): 10 — Re-build histograms based on corrected depths (called again after refraction correction when apply_post_refraction_refit is enabled). """ - + # Filter values within the range (-50, 10), because photons elevation outside this range will be real noise valid_range = (-70, 5) - valid_mask = (dataset['photon_height'] > valid_range[0]) & (dataset['photon_height'] < valid_range[1]) + valid_mask = (dataset["photon_height"] > valid_range[0]) & ( + dataset["photon_height"] < valid_range[1] + ) # Apply the valid_mask to filter unwanted values # and create a copy to avoid SettingWithCopyWarning filtered_dataset = dataset[valid_mask].copy() # Calculate the number of height bins - height_range = abs(filtered_dataset['photon_height'].max() - filtered_dataset['photon_height'].min()) - height_bin_number = max(1, round(height_range / vertical_res)) # Ensure at least one bin + height_range = abs( + filtered_dataset["photon_height"].max() + - filtered_dataset["photon_height"].min() + ) + height_bin_number = max( + 1, round(height_range / vertical_res) + ) # Ensure at least one bin # Calculate the number of latitude bins - lat_range = abs(filtered_dataset['lat'].max() - filtered_dataset['lat'].min()) + lat_range = abs(filtered_dataset["lat"].max() - filtered_dataset["lat"].min()) lat_bin_number = max(1, round(lat_range / lat_res)) # Ensure at least one bin # Create bins for latitude - lat_bins = pd.cut(filtered_dataset['lat'], bins=lat_bin_number, labels=np.arange(lat_bin_number)) + lat_bins = pd.cut( + filtered_dataset["lat"], bins=lat_bin_number, labels=np.arange(lat_bin_number) + ) 
# Create bins for height - height_bins = pd.cut(filtered_dataset['photon_height'], bins=height_bin_number, - labels=np.round(np.linspace(filtered_dataset['photon_height'].min(), - filtered_dataset['photon_height'].max(), - num=height_bin_number), decimals=1)) + height_bins = pd.cut( + filtered_dataset["photon_height"], + bins=height_bin_number, + labels=np.round( + np.linspace( + filtered_dataset["photon_height"].min(), + filtered_dataset["photon_height"].max(), + num=height_bin_number, + ), + decimals=1, + ), + ) # Add bins to dataframe using .loc to avoid SettingWithCopyWarning - filtered_dataset.loc[:, 'lat_bins'] = lat_bins - filtered_dataset.loc[:, 'height_bins'] = height_bins + filtered_dataset.loc[:, "lat_bins"] = lat_bins + filtered_dataset.loc[:, "height_bins"] = height_bins filtered_dataset = filtered_dataset.reset_index(drop=True) return filtered_dataset @@ -98,61 +117,66 @@ def get_sea_surface_height_static(binned_data, threshold): sea_surface_dominated_label (array): Labels for surface-dominated bins PhotonDFBelowSurface (pandas.DataFrame): Photons below detected surface """ - - #set flag for the df save + + # set flag for the df save firstTimeIndex = True - + # Create sea height list sea_surface_height = [] mean_lat_bins_seq = [] sea_surface_subsurface_photons_ratio = [] - # Group dataset along horizental (latitude bins) - grouped_data = binned_data.groupby(['lat_bins'], group_keys=True, observed=False) + grouped_data = binned_data.groupby(["lat_bins"], group_keys=True, observed=False) data_groups = dict(list(grouped_data)) # Loop through groups to detect sea surface for k, v in data_groups.items(): # based on lat_utm - lat_bin_average = v['lat'].mean() + lat_bin_average = v["lat"].mean() # Create new dataframe based on occurrence of photons per height bin - new_df = pd.DataFrame(v.groupby(['height_bins'], observed=False).count()) + new_df = pd.DataFrame(v.groupby(["height_bins"], observed=False).count()) # Check if new_df is not empty before finding 
the bin with the highest photon count - if not new_df.empty: + if not new_df.empty: # Find the vertical bin with the highest photon count - largest_h_bin = new_df['lat'].argmax() - + largest_h_bin = new_df["lat"].argmax() + # Select the index of the bin with the highest count largest_h_index = new_df.index[largest_h_bin] - + # Calculate the median value of all photon height values within this bin - photons_sea_surface = v.loc[v['height_bins'] == largest_h_index, 'photon_height'] + photons_sea_surface = v.loc[ + v["height_bins"] == largest_h_index, "photon_height" + ] lat_bin_sea_median = photons_sea_surface.median() - + # Append to sea height list sea_surface_height.append(lat_bin_sea_median) mean_lat_bins_seq.append(lat_bin_average) del new_df - + # Get all photons below sea surface # to determine segment type of each subsurface water column # Use calculated sea height to determine photons at 0.5m below peak - photons_sea_surface_up = \ - v.loc[(v['photon_height'] > (lat_bin_sea_median - threshold)) & - (v['photon_height'] < (lat_bin_sea_median + 2*threshold))] - + photons_sea_surface_up = v.loc[ + (v["photon_height"] > (lat_bin_sea_median - threshold)) + & (v["photon_height"] < (lat_bin_sea_median + 2 * threshold)) + ] + # Calculate the photon ratio between surface and whole photons - if v['photon_height'].shape[0] > 0: - new_photons_ratio_sea_surface = \ - photons_sea_surface_up.shape[0] / v['photon_height'].shape[0] + if v["photon_height"].shape[0] > 0: + new_photons_ratio_sea_surface = ( + photons_sea_surface_up.shape[0] / v["photon_height"].shape[0] + ) else: new_photons_ratio_sea_surface = np.nan - - sea_surface_subsurface_photons_ratio.append((1 - new_photons_ratio_sea_surface)) - + + sea_surface_subsurface_photons_ratio.append( + 1 - new_photons_ratio_sea_surface + ) + else: # Append NaNs if the group is empty sea_surface_height.append(np.nan) @@ -163,38 +187,44 @@ def get_sea_surface_height_static(binned_data, threshold): mean = 
np.nanmean(sea_surface_height, axis=0) sd = np.nanstd(sea_surface_height, axis=0) - final_sea_surface_height = np.where((sea_surface_height > (mean + 2 * sd)) | - (sea_surface_height < (mean - 2 * sd)), - np.nan, - sea_surface_height).tolist() - - sea_surface_height_abnormal_label = np.where(np.isnan(final_sea_surface_height), 0, 1) + final_sea_surface_height = np.where( + (sea_surface_height > (mean + 2 * sd)) | (sea_surface_height < (mean - 2 * sd)), + np.nan, + sea_surface_height, + ).tolist() + + sea_surface_height_abnormal_label = np.where( + np.isnan(final_sea_surface_height), 0, 1 + ) # Determine label based on ratio of sea surface photons and subsurface photons - sea_surface_dominated_label = \ - np.where(np.array(sea_surface_subsurface_photons_ratio) >= 0.2, 0, 1) - + sea_surface_dominated_label = np.where( + np.array(sea_surface_subsurface_photons_ratio) >= 0.2, 0, 1 + ) + # Loop through groups again and return photons below 0.5m of sea height PhotonDFBelowThresholdPeak = pd.DataFrame() for i, (k, v) in enumerate(data_groups.items()): - # Get all values below this bin if not np.isnan(final_sea_surface_height[i]): - NewPhotonDFBelowThresholdPeak = \ - v.loc[v['photon_height'] < (final_sea_surface_height[i] - threshold)] - + NewPhotonDFBelowThresholdPeak = v.loc[ + v["photon_height"] < (final_sea_surface_height[i] - threshold) + ] + if firstTimeIndex: PhotonDFBelowThresholdPeak = NewPhotonDFBelowThresholdPeak - firstTimeIndex=False - else: - PhotonDFBelowThresholdPeak=\ - pd.concat([PhotonDFBelowThresholdPeak, NewPhotonDFBelowThresholdPeak]) - - - return final_sea_surface_height, \ - sea_surface_height_abnormal_label, \ - sea_surface_dominated_label,\ - PhotonDFBelowThresholdPeak + firstTimeIndex = False + else: + PhotonDFBelowThresholdPeak = pd.concat( + [PhotonDFBelowThresholdPeak, NewPhotonDFBelowThresholdPeak] + ) + + return ( + final_sea_surface_height, + sea_surface_height_abnormal_label, + sea_surface_dominated_label, + PhotonDFBelowThresholdPeak, + 
) def get_sea_surface_height_adaptive(binned_data): @@ -219,18 +249,18 @@ def get_sea_surface_height_adaptive(binned_data): sea_surface_dominated_label (array): Labels for surface-dominated bins PhotonDFBelowSurface (pandas.DataFrame): Photons below detected surface with waves removed """ - + firstTimeIndex = True sea_surface_height = [] mean_lat_bins_seq = [] sea_surface_subsurface_photons_ratio = [] - grouped_data = binned_data.groupby(['lat_bins'], group_keys=True, observed=False) + grouped_data = binned_data.groupby(["lat_bins"], group_keys=True, observed=False) data_groups = dict(list(grouped_data)) for k, v in data_groups.items(): - lat_bin_average = v['lat'].mean() - + lat_bin_average = v["lat"].mean() + if len(v) < 20: # Increase minimum photon count for robustness sea_surface_height.append(np.nan) mean_lat_bins_seq.append(np.nan) @@ -238,18 +268,22 @@ def get_sea_surface_height_adaptive(binned_data): continue # Finer histogram for better peak resolution - hist, bin_edges = np.histogram(v['photon_height'], - bins=100, # Finer bins - density=True, - range=(v['photon_height'].min(), v['photon_height'].max())) + hist, bin_edges = np.histogram( + v["photon_height"], + bins=100, # Finer bins + density=True, + range=(v["photon_height"].min(), v["photon_height"].max()), + ) bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2 # Enhanced peak detection - peaks, properties = find_peaks(hist, - height=np.max(hist)*0.3, # Stricter peak height - distance=10, # Wider separation - prominence=np.max(hist)*0.1) # Require prominent peaks - + peaks, properties = find_peaks( + hist, + height=np.max(hist) * 0.3, # Stricter peak height + distance=10, # Wider separation + prominence=np.max(hist) * 0.1, + ) # Require prominent peaks + if len(peaks) == 0: sea_surface_height.append(np.nan) mean_lat_bins_seq.append(np.nan) @@ -263,25 +297,31 @@ def get_sea_surface_height_adaptive(binned_data): # Two-pass Gaussian fit: start with ±1 m, widen if sigma is large # (the ±1 m window 
underestimates sigma for SWH > 2 m) half_win = 1.0 - peak_data = v[(v['photon_height'] > surface_height - half_win) & - (v['photon_height'] < surface_height + half_win)] + peak_data = v[ + (v["photon_height"] > surface_height - half_win) + & (v["photon_height"] < surface_height + half_win) + ] if len(peak_data) > 10: - mu, sigma = norm.fit(peak_data['photon_height']) + mu, sigma = norm.fit(peak_data["photon_height"]) # Refit with wider window if sigma suggests truncation if sigma > 0.4: half_win2 = min(2.5 * sigma, 3.0) - peak_data2 = v[(v['photon_height'] > mu - half_win2) & - (v['photon_height'] < mu + half_win2)] + peak_data2 = v[ + (v["photon_height"] > mu - half_win2) + & (v["photon_height"] < mu + half_win2) + ] if len(peak_data2) > 10: - mu, sigma = norm.fit(peak_data2['photon_height']) + mu, sigma = norm.fit(peak_data2["photon_height"]) else: mu, sigma = surface_height, 0.2 # More conservative default adaptive_threshold = min(mu - 3.0 * sigma, mu - 1.0) # at least 1 m below # Extended surface layer to remove wave effects - surface_photons = v[(v['photon_height'] > adaptive_threshold) & - (v['photon_height'] < mu + 3.0 * sigma)] # Wider upper bound + surface_photons = v[ + (v["photon_height"] > adaptive_threshold) + & (v["photon_height"] < mu + 3.0 * sigma) + ] # Wider upper bound ratio = len(surface_photons) / len(v) if len(v) > 0 else np.nan sea_surface_height.append(mu) @@ -291,45 +331,58 @@ def get_sea_surface_height_adaptive(binned_data): # Outlier filtering mean = np.nanmean(sea_surface_height) sd = np.nanstd(sea_surface_height) - final_sea_surface_height = np.where((sea_surface_height > mean + 2*sd) | - (sea_surface_height < mean - 2*sd), - np.nan, - sea_surface_height).tolist() - - sea_surface_height_abnormal_label = np.where(np.isnan(final_sea_surface_height), 0, 1) - sea_surface_dominated_label = np.where(np.array(sea_surface_subsurface_photons_ratio) >= 0.2, 0, 1) + final_sea_surface_height = np.where( + (sea_surface_height > mean + 2 * sd) | 
(sea_surface_height < mean - 2 * sd), + np.nan, + sea_surface_height, + ).tolist() + + sea_surface_height_abnormal_label = np.where( + np.isnan(final_sea_surface_height), 0, 1 + ) + sea_surface_dominated_label = np.where( + np.array(sea_surface_subsurface_photons_ratio) >= 0.2, 0, 1 + ) # Filter subsurface photons with diagnostics PhotonDFBelowSurface = pd.DataFrame() for i, (k, v) in enumerate(data_groups.items()): if not np.isnan(final_sea_surface_height[i]): half_win = 1.0 - peak_data = v[(v['photon_height'] > final_sea_surface_height[i] - half_win) & - (v['photon_height'] < final_sea_surface_height[i] + half_win)] + peak_data = v[ + (v["photon_height"] > final_sea_surface_height[i] - half_win) + & (v["photon_height"] < final_sea_surface_height[i] + half_win) + ] if len(peak_data) > 10: - mu, sigma = norm.fit(peak_data['photon_height']) + mu, sigma = norm.fit(peak_data["photon_height"]) if sigma > 0.4: half_win2 = min(2.5 * sigma, 3.0) - peak_data2 = v[(v['photon_height'] > mu - half_win2) & - (v['photon_height'] < mu + half_win2)] + peak_data2 = v[ + (v["photon_height"] > mu - half_win2) + & (v["photon_height"] < mu + half_win2) + ] if len(peak_data2) > 10: - mu, sigma = norm.fit(peak_data2['photon_height']) + mu, sigma = norm.fit(peak_data2["photon_height"]) adaptive_threshold = min(mu - 3.0 * sigma, mu - 1.0) else: adaptive_threshold = final_sea_surface_height[i] - 1.0 - NewPhotonDFBelowSurface = v[v['photon_height'] < adaptive_threshold] - + NewPhotonDFBelowSurface = v[v["photon_height"] < adaptive_threshold] + if firstTimeIndex: PhotonDFBelowSurface = NewPhotonDFBelowSurface firstTimeIndex = False else: - PhotonDFBelowSurface = pd.concat([PhotonDFBelowSurface, NewPhotonDFBelowSurface]) + PhotonDFBelowSurface = pd.concat( + [PhotonDFBelowSurface, NewPhotonDFBelowSurface] + ) - return (final_sea_surface_height, - sea_surface_height_abnormal_label, - sea_surface_dominated_label, - PhotonDFBelowSurface) + return ( + final_sea_surface_height, + 
sea_surface_height_abnormal_label, + sea_surface_dominated_label, + PhotonDFBelowSurface, + ) def get_water_temp(date_year, date_month, date_day, latitude, longitude): @@ -349,7 +402,7 @@ def get_water_temp(date_year, date_month, date_day, latitude, longitude): month = date_month # date[6:8] day = date_day - day_of_year = str(datetime.strptime(date, '%Y%m%d').timetuple().tm_yday) + day_of_year = str(datetime.strptime(date, "%Y%m%d").timetuple().tm_yday) # Add zero in front of day of year string zero_day_of_year = day_of_year.zfill(3) @@ -360,8 +413,11 @@ def get_water_temp(date_year, date_month, date_day, latitude, longitude): new_lat_min = 0 new_lat_max = 17998 - new_lat = round(((old_lat - old_lat_min) / (old_lat_max - old_lat_min)) * - (new_lat_max - new_lat_min) + new_lat_min) + new_lat = round( + ((old_lat - old_lat_min) / (old_lat_max - old_lat_min)) + * (new_lat_max - new_lat_min) + + new_lat_min + ) # Calculate ratio of longitude from mid-point of IS2 track old_lon = longitude.mean() @@ -370,24 +426,41 @@ def get_water_temp(date_year, date_month, date_day, latitude, longitude): new_lon_min = 0 new_lon_max = 35999 - new_lon = round(((old_lon - old_lon_min) / (old_lon_max - old_lon_min)) * - (new_lon_max - new_lon_min) + new_lon_min) + new_lon = round( + ((old_lon - old_lon_min) / (old_lon_max - old_lon_min)) + * (new_lon_max - new_lon_min) + + new_lon_min + ) # Access the SST data using the JPL OpenDap interface - url = 'https://opendap.jpl.nasa.gov/opendap/OceanTemperature/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/' \ - + str(year) + '/' + str(zero_day_of_year) + '/' + str(date) \ - + '090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc' + url = ( + "https://opendap.jpl.nasa.gov/opendap/OceanTemperature/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/" + + str(year) + + "/" + + str(zero_day_of_year) + + "/" + + str(date) + + "090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc" + ) dataset = netCDF4.Dataset(url) # Access the data and convert the temperature from K to C 
- water_temp = dataset['analysed_sst'][0, new_lat, new_lon] - 273.15 + water_temp = dataset["analysed_sst"][0, new_lat, new_lon] - 273.15 return water_temp -def refraction_correction(WTemp, WSmodel, Wavelength, - Photon_ref_elev, Ph_ref_azimuth, - PhotonZ, PhotonX, PhotonY, Ph_Conf): +def refraction_correction( + WTemp, + WSmodel, + Wavelength, + Photon_ref_elev, + Ph_ref_azimuth, + PhotonZ, + PhotonX, + PhotonY, + Ph_Conf, +): """ WTemp; there is python library that pulls water temp data WSmodel is the value surface height @@ -417,18 +490,18 @@ def refraction_correction(WTemp, WSmodel, Wavelength, n1 = 1.00029 # refractive index of water - n2 = (a * WaterTemp ** 2) + (b * wl ** 2) + (c * WaterTemp) + (d * wl) + e + n2 = (a * WaterTemp**2) + (b * wl**2) + (c * WaterTemp) + (d * wl) + e # assumption is 0.25416 # This example is refractionCoef = 0.25449 # 1.00029 is refraction of air constant - correction_coef = (1 - (n1 / n2)) + correction_coef = 1 - (n1 / n2) # read photon ref_elev to get theta1 theta1 = np.pi / 2 - Photon_ref_elev # eq 1. Theta2 - theta2 = np.arcsin(((n1 * np.sin(theta1)) / n2)) + theta2 = np.arcsin((n1 * np.sin(theta1)) / n2) # eq 3. 
S # Approximate water Surface = 1.5 @@ -447,7 +520,7 @@ def refraction_correction(WTemp, WSmodel, Wavelength, phi = theta1 - theta2 # P is the difference between raw and corrected YZ location - P = np.sqrt(R ** 2 + S ** 2 - 2 * R * S * np.cos(phi)) + P = np.sqrt(R**2 + S**2 - 2 * R * S * np.cos(phi)) # alpha is an angle needed alpha = np.arcsin((R * np.sin(phi)) / P) @@ -471,19 +544,27 @@ def refraction_correction(WTemp, WSmodel, Wavelength, outY = PhotonY + DN outZ = PhotonZ + DZ - ''' + """ print('For selected Bathy photon:') print('lat = ', PhotonY[9000]) print('long = ', PhotonX[9000]) print('Raw Depth = ', PhotonZ[9000]) print('D = ', D[9000]) - + print('ref_elev = ', Photon_ref_elev[9000]) - + print('Delta East = ', DE[9000]) print('Delta North = ', DN[9000]) print('Delta Z = ', DZ[9000]) - ''' - return (outX, outY, outZ, Ph_Conf, PhotonX, PhotonY, PhotonZ, Ph_ref_azimuth, - Photon_ref_elev) # We are most interested in out-x, out-y, out-z - + """ + return ( + outX, + outY, + outZ, + Ph_Conf, + PhotonX, + PhotonY, + PhotonZ, + Ph_ref_azimuth, + Photon_ref_elev, + ) # We are most interested in out-x, out-y, out-z diff --git a/icesat-2_kdph_py/kd_utils/visualization.py b/icesat-2_kdph_py/kd_utils/visualization.py index 67c45ba..d9ed46f 100644 --- a/icesat-2_kdph_py/kd_utils/visualization.py +++ b/icesat-2_kdph_py/kd_utils/visualization.py @@ -1,66 +1,103 @@ # utils/visualization.py -import numpy as np import os -import pandas as pd import time + import geopandas as gpd -from pyproj import Transformer, Proj import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from pyproj import Transformer from scipy.spatial import ConvexHull + def plot_photon_height(sea_photon_dataset, hlims=[-25, 10]): fig, ax = plt.subplots() - ax.scatter(sea_photon_dataset['latitude'], sea_photon_dataset['photon_height'], s=1, c='k', alpha=0.15, edgecolors='none', label='ATL03 Photons') - ax.set_xlabel('Relative AT Distance') - ax.set_ylabel('Height (h_ph)') - 
ax.set_title('Scatter Plot of ATL03 Photons') + ax.scatter( + sea_photon_dataset["latitude"], + sea_photon_dataset["photon_height"], + s=1, + c="k", + alpha=0.15, + edgecolors="none", + label="ATL03 Photons", + ) + ax.set_xlabel("Relative AT Distance") + ax.set_ylabel("Height (h_ph)") + ax.set_title("Scatter Plot of ATL03 Photons") ax.set_ylim(hlims) ax.legend() plt.show() -def plot_filtered_seafloor_photons(filtered_seafloor_subsurface_dataset, sea_photon_dataset, sea_surface_height, output_path): + +def plot_filtered_seafloor_photons( + filtered_seafloor_subsurface_dataset, + sea_photon_dataset, + sea_surface_height, + output_path, +): """ Plots the filtered seafloor photon data along with sea surface and seafloor elevation data. Parameters: - filtered_seafloor_subsurface_dataset: DataFrame containing the filtered subsurface photon data. - - sea_photon_dataset: DataFrame containing the original sea photon data. + - sea_photon_dataset: DataFrame containing the original sea photon data. - sea_surface_height: Array of sea surface height values. - output_path: Path to save the output plot. 
""" - + # get sea_surface_x_axis_bins - sea_surface_x_axis_bins = np.linspace(filtered_seafloor_subsurface_dataset['relative_AT_dist'].min(), - filtered_seafloor_subsurface_dataset['relative_AT_dist'].max(), - len(sea_surface_height)) - + sea_surface_x_axis_bins = np.linspace( + filtered_seafloor_subsurface_dataset["relative_AT_dist"].min(), + filtered_seafloor_subsurface_dataset["relative_AT_dist"].max(), + len(sea_surface_height), + ) + hlims = [-25, 10] fig, ax = plt.subplots() # Scatter plot of subsurface photons - ax.scatter(sea_photon_dataset['relative_AT_dist'], sea_photon_dataset['photon_height'], - s=1, c='k', alpha=0.15, edgecolors='none', label='Subsurface ATL03 Photons') + ax.scatter( + sea_photon_dataset["relative_AT_dist"], + sea_photon_dataset["photon_height"], + s=1, + c="k", + alpha=0.15, + edgecolors="none", + label="Subsurface ATL03 Photons", + ) # Overlay the seafloor elevation data as points - ax.plot(filtered_seafloor_subsurface_dataset['relative_AT_dist'], filtered_seafloor_subsurface_dataset['seafloor_elevation'], - linewidth=0.8, c='b', alpha=0.4, label='Seafloor Elevation') - - ax.plot(sea_surface_x_axis_bins, [x - 0.5 for x in sea_surface_height], - linewidth=0.8, color='#DD571C', alpha=0.4, label='0.5 m Below Surface Peak') + ax.plot( + filtered_seafloor_subsurface_dataset["relative_AT_dist"], + filtered_seafloor_subsurface_dataset["seafloor_elevation"], + linewidth=0.8, + c="b", + alpha=0.4, + label="Seafloor Elevation", + ) + + ax.plot( + sea_surface_x_axis_bins, + [x - 0.5 for x in sea_surface_height], + linewidth=0.8, + color="#DD571C", + alpha=0.4, + label="0.5 m Below Surface Peak", + ) # Set labels and title - ax.set_xlabel('Distance From Start of Track (km)') - ax.set_ylabel('Photon Height (m)') - ax.set_title('Scatter Plot of Filtered Sea Photon Dataset') + ax.set_xlabel("Distance From Start of Track (km)") + ax.set_ylabel("Photon Height (m)") + ax.set_title("Scatter Plot of Filtered Sea Photon Dataset") ax.set_ylim(hlims) # Add 
a legend ax.legend() # Save the plot - plt.legend(loc='upper right') - plt.savefig(output_path, dpi=400, format='jpeg') + plt.legend(loc="upper right") + plt.savefig(output_path, dpi=400, format="jpeg") # Show the plot plt.show() @@ -70,32 +107,54 @@ def plot_convex_hulls(photon_dataset, target_beam_ids, convex_hulls, convex_hull """ Plots ConvexHull polygons and annotated areas for each lat_bin group in the dataset. """ - #filter beam type + # filter beam type # photon_dataset - photon_dataset = photon_dataset[photon_dataset['beam_id'].isin(target_beam_ids)] - + photon_dataset = photon_dataset[photon_dataset["beam_id"].isin(target_beam_ids)] + fig, ax = plt.subplots(figsize=(10, 6)) hlims = [-10, 2] - + for lat_bin, hull_points in convex_hulls.items(): hull = ConvexHull(hull_points) - ax.fill(hull_points[hull.vertices, 0], hull_points[hull.vertices, 1], alpha=0.3, label=f'Bin {lat_bin}') - + ax.fill( + hull_points[hull.vertices, 0], + hull_points[hull.vertices, 1], + alpha=0.3, + label=f"Bin {lat_bin}", + ) + # Calculate centroid to place the label centroid = np.mean(hull_points[hull.vertices], axis=0) - ax.text(centroid[0], centroid[1], f'{convex_hull_areas[lat_bin]:.2f}', - horizontalalignment='center', verticalalignment='center', fontsize=10, color='black') - - ax.scatter(photon_dataset['lat'], photon_dataset['photon_height'], - s=1, c='k', alpha=0.15, edgecolors='none', label='Subsurface ATL03 Photons') + ax.text( + centroid[0], + centroid[1], + f"{convex_hull_areas[lat_bin]:.2f}", + horizontalalignment="center", + verticalalignment="center", + fontsize=10, + color="black", + ) + + ax.scatter( + photon_dataset["lat"], + photon_dataset["photon_height"], + s=1, + c="k", + alpha=0.15, + edgecolors="none", + label="Subsurface ATL03 Photons", + ) ax.set_ylim(hlims) - ax.set_xlabel('Latitude') - ax.set_ylabel('Photon Height') - ax.set_title('ConvexHull of Photons within each Lat Bin') + ax.set_xlabel("Latitude") + ax.set_ylabel("Photon Height") + 
ax.set_title("ConvexHull of Photons within each Lat Bin") plt.show() + # plot photons coloured by quality_ph flag for diagnostic purposes -def plot_photon_quality_flags(output_path, timestamp, sea_photon_dataset, target_beam_ids): +def plot_photon_quality_flags( + output_path, timestamp, sea_photon_dataset, target_beam_ids +): """ Scatter plot of photon height vs. along-track distance, coloured by quality_ph value. @@ -104,145 +163,219 @@ def plot_photon_quality_flags(output_path, timestamp, sea_photon_dataset, target bit 1 (value 2) = possible impulse response effect bit 2 (value 4) = possible TEP """ - dataset = sea_photon_dataset[sea_photon_dataset['beam_id'].isin(target_beam_ids)].copy() + dataset = sea_photon_dataset[ + sea_photon_dataset["beam_id"].isin(target_beam_ids) + ].copy() if dataset.empty: return quality_colors = { - 0: ('dimgray', 'nominal (0)'), - 1: ('red', 'afterpulse (1)'), - 2: ('orange', 'impulse response (2)'), - 3: ('darkred', 'afterpulse + impulse (3)'), - 4: ('royalblue','TEP (4)'), - 5: ('purple', 'afterpulse + TEP (5)'), - 6: ('cyan', 'impulse + TEP (6)'), - 7: ('black', 'all flags (7)'), + 0: ("dimgray", "nominal (0)"), + 1: ("red", "afterpulse (1)"), + 2: ("orange", "impulse response (2)"), + 3: ("darkred", "afterpulse + impulse (3)"), + 4: ("royalblue", "TEP (4)"), + 5: ("purple", "afterpulse + TEP (5)"), + 6: ("cyan", "impulse + TEP (6)"), + 7: ("black", "all flags (7)"), } fig, ax = plt.subplots(figsize=(12, 5)) - unique_flags = sorted(dataset['quality_ph'].dropna().astype(int).unique()) + unique_flags = sorted(dataset["quality_ph"].dropna().astype(int).unique()) for flag in unique_flags: - subset = dataset[dataset['quality_ph'].astype(int) == flag] - color, label = quality_colors.get(flag, ('magenta', f'unknown ({flag})')) - ax.scatter(subset['relative_AT_dist'], subset['photon_height'], - s=0.5, c=color, alpha=0.3, edgecolors='none', label=label) - - ax.set_xlabel('Relative Along-Track Distance (km)', fontsize=12) - 
ax.set_ylabel('Photon Height (m)', fontsize=12) - ax.set_title('Photon quality_ph flag distribution', fontsize=13) + subset = dataset[dataset["quality_ph"].astype(int) == flag] + color, label = quality_colors.get(flag, ("magenta", f"unknown ({flag})")) + ax.scatter( + subset["relative_AT_dist"], + subset["photon_height"], + s=0.5, + c=color, + alpha=0.3, + edgecolors="none", + label=label, + ) + + ax.set_xlabel("Relative Along-Track Distance (km)", fontsize=12) + ax.set_ylabel("Photon Height (m)", fontsize=12) + ax.set_title("Photon quality_ph flag distribution", fontsize=13) ax.set_ylim([-15, 5]) - ax.legend(loc='lower right', markerscale=6, fontsize=9) + ax.legend(loc="lower right", markerscale=6, fontsize=9) fig.tight_layout() - save_path = os.path.join(output_path, f'{timestamp}_quality_ph_flags.jpg') - plt.savefig(save_path, dpi=300, format='jpeg') + save_path = os.path.join(output_path, f"{timestamp}_quality_ph_flags.jpg") + plt.savefig(save_path, dpi=300, format="jpeg") plt.show() print(f"Quality flag plot saved: {save_path}") # Print summary counts print("\nquality_ph flag summary:") for flag in unique_flags: - count = (dataset['quality_ph'].astype(int) == flag).sum() - _, label = quality_colors.get(flag, ('', f'unknown ({flag})')) + count = (dataset["quality_ph"].astype(int) == flag).sum() + _, label = quality_colors.get(flag, ("", f"unknown ({flag})")) pct = 100.0 * count / len(dataset) print(f" {label:35s}: {count:>8,d} ({pct:.1f}%)") # plot the kd and photon -def plot_kd_photons(OutputPath, timestamp, target_beam_ids, subsurface_photon_dataset, Kd_DF_MergedDistance): - #filter beam type +def plot_kd_photons( + OutputPath, + timestamp, + target_beam_ids, + subsurface_photon_dataset, + Kd_DF_MergedDistance, +): + # filter beam type # photon_dataset - subsurface_photon_dataset = subsurface_photon_dataset[subsurface_photon_dataset['beam_id'].isin(target_beam_ids)] - Kd_DF_MergedDistance = 
Kd_DF_MergedDistance[Kd_DF_MergedDistance['beam_id'].isin(target_beam_ids)] - + subsurface_photon_dataset = subsurface_photon_dataset[ + subsurface_photon_dataset["beam_id"].isin(target_beam_ids) + ] + Kd_DF_MergedDistance = Kd_DF_MergedDistance[ + Kd_DF_MergedDistance["beam_id"].isin(target_beam_ids) + ] + hlims = [-45, 5] fig, ax1 = plt.subplots(figsize=(10, 6)) - ax1.scatter(subsurface_photon_dataset['relative_AT_dist'], subsurface_photon_dataset['photon_height'], - s=1.5, c='k', alpha=0.2, edgecolors='none', label='Subsurface ATL03 Photon Height') - ax1.set_xlabel('Relative Along-Track Distance', fontsize=18) - ax1.tick_params(axis='x', labelsize=18) # Added line for x-tick label fontsize - ax1.set_ylabel('Photon Height', color='b', fontsize=18) - ax1.tick_params(axis='y', labelcolor='b', labelsize=18) - if 'seafloor_elevation' in subsurface_photon_dataset.columns: - ax1.plot(subsurface_photon_dataset['relative_AT_dist'], subsurface_photon_dataset['seafloor_elevation'], - linewidth=0.8, c='b', alpha=0.4, label='Seafloor Elevation') + ax1.scatter( + subsurface_photon_dataset["relative_AT_dist"], + subsurface_photon_dataset["photon_height"], + s=1.5, + c="k", + alpha=0.2, + edgecolors="none", + label="Subsurface ATL03 Photon Height", + ) + ax1.set_xlabel("Relative Along-Track Distance", fontsize=18) + ax1.tick_params(axis="x", labelsize=18) # Added line for x-tick label fontsize + ax1.set_ylabel("Photon Height", color="b", fontsize=18) + ax1.tick_params(axis="y", labelcolor="b", labelsize=18) + if "seafloor_elevation" in subsurface_photon_dataset.columns: + ax1.plot( + subsurface_photon_dataset["relative_AT_dist"], + subsurface_photon_dataset["seafloor_elevation"], + linewidth=0.8, + c="b", + alpha=0.4, + label="Seafloor Elevation", + ) # ax1.axhline(y=-6, color='blue', linestyle='--', label='y=-6 m') # ax1.set_xlim([1800, 2100]) # Set x-axis limits ax1.set_ylim(hlims) - + ax2 = ax1.twinx() - ax2.scatter(Kd_DF_MergedDistance['relative_AT_dist'], 
Kd_DF_MergedDistance['kd'], - label='Kd values', color='r', alpha=0.6) - ax2.set_ylabel('Kd Value', color='r', fontsize=18) - ax2.tick_params(axis='y', labelcolor='r',labelsize=18) + ax2.scatter( + Kd_DF_MergedDistance["relative_AT_dist"], + Kd_DF_MergedDistance["kd"], + label="Kd values", + color="r", + alpha=0.6, + ) + ax2.set_ylabel("Kd Value", color="r", fontsize=18) + ax2.tick_params(axis="y", labelcolor="r", labelsize=18) # fig.suptitle('Photon Height and Kd Values along Relative Along-Track Distance') handles1, labels1 = ax1.get_legend_handles_labels() handles2, labels2 = ax2.get_legend_handles_labels() - fig.legend(handles1 + handles2, labels1 + labels2, loc='upper right', bbox_to_anchor=(0.85, 0.85)) + fig.legend( + handles1 + handles2, + labels1 + labels2, + loc="upper right", + bbox_to_anchor=(0.85, 0.85), + ) # fig.legend(handles1 + handles2, labels1 + labels2, loc='lower left', bbox_to_anchor=(0.08, 0.08)) fig.tight_layout() - plt.savefig(os.path.join(OutputPath, f'{timestamp}_IS2_subsurface_kd_2.jpg'), dpi=400, format='jpeg') + plt.savefig( + os.path.join(OutputPath, f"{timestamp}_IS2_subsurface_kd_2.jpg"), + dpi=400, + format="jpeg", + ) plt.show() - -def plot_bin_polygon_data(lat, height, seafloor_height, mask, lat_bins, height_bins, valid_bins, polygons, y_min, y_max): +def plot_bin_polygon_data( + lat, + height, + seafloor_height, + mask, + lat_bins, + height_bins, + valid_bins, + polygons, + y_min, + y_max, +): plt.figure(figsize=(12, 6)) plt.subplot(1, 2, 1) plt.title("Original Height Data") - plt.scatter(lat, height, c=height, cmap='viridis', s=10) - plt.plot(np.sort(lat), seafloor_height[np.argsort(lat)], 'r-', label='Seafloor') - plt.colorbar(label='Height') - plt.xlabel('Latitude') - plt.ylabel('Height') + plt.scatter(lat, height, c=height, cmap="viridis", s=10) + plt.plot(np.sort(lat), seafloor_height[np.argsort(lat)], "r-", label="Seafloor") + plt.colorbar(label="Height") + plt.xlabel("Latitude") + plt.ylabel("Height") plt.ylim(y_min, 
y_max) plt.legend() plt.subplot(1, 2, 2) plt.title("Masked Region (ROI)") - plt.scatter(lat[mask], height[mask], c=height[mask], cmap='viridis', s=10) - plt.plot(np.sort(lat), seafloor_height[np.argsort(lat)], 'r-', label='Seafloor') + plt.scatter(lat[mask], height[mask], c=height[mask], cmap="viridis", s=10) + plt.plot(np.sort(lat), seafloor_height[np.argsort(lat)], "r-", label="Seafloor") for polygon in polygons: - plt.gca().add_patch(plt.Polygon(polygon, edgecolor='red', facecolor='none', linewidth=1)) + plt.gca().add_patch( + plt.Polygon(polygon, edgecolor="red", facecolor="none", linewidth=1) + ) for lb in lat_bins: - plt.axvline(x=lb, color='gray', linestyle='--', linewidth=0.5) + plt.axvline(x=lb, color="gray", linestyle="--", linewidth=0.5) for hb in height_bins: - plt.axhline(y=hb, color='gray', linestyle='--', linewidth=0.5) + plt.axhline(y=hb, color="gray", linestyle="--", linewidth=0.5) - plt.colorbar(label='Height') - plt.xlabel('Latitude') - plt.ylabel('Height') + plt.colorbar(label="Height") + plt.xlabel("Latitude") + plt.ylabel("Height") plt.ylim(y_min, y_max) plt.legend() plt.tight_layout() plt.show() -# -def produce_figures(binned_data, bath_height, sea_height, solo_sea_surface_label, - y_limit_top, y_limit_bottom, percentile, file, geo_df, - ref_y, ref_z, beam, epsg_num): + +def produce_figures( + binned_data, + bath_height, + sea_height, + solo_sea_surface_label, + y_limit_top, + y_limit_bottom, + percentile, + file, + geo_df, + ref_y, + ref_z, + beam, + epsg_num, +): """Create figures""" # Create bins for latitude - bath_x_axis_bins = np.linspace(binned_data.lat.min(), - binned_data.lat.max(), len(bath_height))+20 + bath_x_axis_bins = ( + np.linspace(binned_data.lat.min(), binned_data.lat.max(), len(bath_height)) + 20 + ) - sea_surface_x_axis_bins = np.linspace(binned_data.lat.min(), - binned_data.lat.max(), len(sea_height))+10 + sea_surface_x_axis_bins = ( + np.linspace(binned_data.lat.min(), binned_data.lat.max(), len(sea_height)) + 10 + ) # 
Create new dataframes for median values - bath_median_df = pd.DataFrame({'x': bath_x_axis_bins, 'y': bath_height}) + bath_median_df = pd.DataFrame({"x": bath_x_axis_bins, "y": bath_height}) # Create uniform sea surface based on median sea surface values and filter out surface breaching sea_height1 = [np.nanmedian(sea_height) if i == i else np.nan for i in sea_height] - sea_median_df = pd.DataFrame({'x': sea_surface_x_axis_bins, 'y': sea_height1}) + sea_median_df = pd.DataFrame({"x": sea_surface_x_axis_bins, "y": sea_height1}) # Create uniform solo sea surface label sea_surface_label = solo_sea_surface_label - sea_surface_label_df = pd.DataFrame({'x': sea_surface_x_axis_bins, 'y': sea_surface_label}) + sea_surface_label_df = pd.DataFrame( + {"x": sea_surface_x_axis_bins, "y": sea_surface_label} + ) idx_1 = np.where(sea_surface_label_df.y == 1) idx_0 = np.where(sea_surface_label_df.y == 0) @@ -253,65 +386,110 @@ def produce_figures(binned_data, bath_height, sea_height, solo_sea_surface_label # plt.scatter(x=binned_data.lat, # y = binned_data.photon_height, marker='o', lw=0, s=1, alpha = 0.8, # c = 'yellow', label = 'Raw photon height') - plt.scatter(ref_y, ref_z, s=0.5, alpha=0.1, c='black') - plt.scatter(geo_df.lat, geo_df.photon_height, s=0.8, marker = 'o', - alpha=0.1, c='red', label='Classified Photons') + plt.scatter(ref_y, ref_z, s=0.5, alpha=0.1, c="black") + plt.scatter( + geo_df.lat, + geo_df.photon_height, + s=0.8, + marker="o", + alpha=0.1, + c="red", + label="Classified Photons", + ) # plt.scatter(x=geo_df.lat, # y = geo_df.photon_height, marker='o', lw=0, s=0.8, # alpha = 0.8, c = 'black', label = 'Corrected photon bin') # Plot median values - plt.scatter(bath_median_df.x, bath_median_df.y, - marker='o', c='r', alpha=0.8, s=2, label='Median bathymetry') - - plt.scatter(sea_median_df.x, sea_median_df.y, - marker='o', c='b', alpha=1, s=2, label='Median sea surface') - - plt.scatter(sea_surface_label_df.iloc[idx_1].x, sea_surface_label_df.iloc[idx_1].y, - 
marker='o', c='pink', alpha=1, s=3, label='solo_sea_surface') - plt.scatter(sea_surface_label_df.iloc[idx_0].x, sea_surface_label_df.iloc[idx_0].y, - marker='o', c='g', alpha=1, s=3, label='non_solo_sea_surface') + plt.scatter( + bath_median_df.x, + bath_median_df.y, + marker="o", + c="r", + alpha=0.8, + s=2, + label="Median bathymetry", + ) + + plt.scatter( + sea_median_df.x, + sea_median_df.y, + marker="o", + c="b", + alpha=1, + s=2, + label="Median sea surface", + ) + + plt.scatter( + sea_surface_label_df.iloc[idx_1].x, + sea_surface_label_df.iloc[idx_1].y, + marker="o", + c="pink", + alpha=1, + s=3, + label="solo_sea_surface", + ) + plt.scatter( + sea_surface_label_df.iloc[idx_0].x, + sea_surface_label_df.iloc[idx_0].y, + marker="o", + c="g", + alpha=1, + s=3, + label="non_solo_sea_surface", + ) # Insert titles and subtitles - plt.title('Icesat2 Bathymetry\n' + file) - plt.xlabel('Latitude', fontsize=25) - plt.ylabel('Photon Height (m)', fontsize=25) + plt.title("Icesat2 Bathymetry\n" + file) + plt.xlabel("Latitude", fontsize=25) + plt.ylabel("Photon Height (m)", fontsize=25) plt.xticks(fontsize=16) plt.yticks(fontsize=16) - plt.legend(loc="upper left", prop={'size': 20}) + plt.legend(loc="upper left", prop={"size": 20}) # Limit the x and y axes using parameters plt.xlim(left=binned_data.lat.min(), right=binned_data.lat.max()) plt.ylim(top=y_limit_top, bottom=y_limit_bottom) timestr = time.strftime("%Y%m%d_%H%M%S") - file = file.replace('.h5', '') + file = file.replace(".h5", "") # Define where to save file plt.tight_layout() - plt.savefig("C:/Workstation/ICESat2_HLS/" + file + '_gt' + - str(beam) + '_' + str(percentile) + - '_EPSG' + str(epsg_num) + '_' + timestr + ".pdf") + plt.savefig( + "C:/Workstation/ICESat2_HLS/" + + file + + "_gt" + + str(beam) + + "_" + + str(percentile) + + "_EPSG" + + str(epsg_num) + + "_" + + timestr + + ".pdf" + ) # plt.show() # plt.close() # convert corrected locations back to wgs84 (useful to contain) - transformer = 
Transformer.from_crs("EPSG:" + str(epsg_num), - "EPSG:4326", always_xy=True) + transformer = Transformer.from_crs( + "EPSG:" + str(epsg_num), "EPSG:4326", always_xy=True + ) print(transformer) - lon_wgs84, lat_wgs84 = transformer.transform( - geo_df.lon.values, geo_df.lat.values) + lon_wgs84, lat_wgs84 = transformer.transform(geo_df.lon.values, geo_df.lat.values) - geo_df['lon_wgs84'] = lon_wgs84 - geo_df['lat_wgs84'] = lat_wgs84 + geo_df["lon_wgs84"] = lon_wgs84 + geo_df["lat_wgs84"] = lat_wgs84 - geodf = gpd.GeoDataFrame(geo_df, - geometry=gpd.points_from_xy(geo_df.lon_wgs84, - geo_df.lat_wgs84)) + geodf = gpd.GeoDataFrame( + geo_df, geometry=gpd.points_from_xy(geo_df.lon_wgs84, geo_df.lat_wgs84) + ) geodf.set_crs(epsg=4326, inplace=True) # geodf.to_file("C:/Workstation/ICESat2_HLS/" +file + '_gt' + '_' + str(percentile) + '_EPSG' + # str(epsg_num) + '_' + timestr + ".gpkg", - # driver="GPKG") \ No newline at end of file + # driver="GPKG") diff --git a/icesat-2_kdph_py/main.py b/icesat-2_kdph_py/main.py index bf046f3..9168ef0 100644 --- a/icesat-2_kdph_py/main.py +++ b/icesat-2_kdph_py/main.py @@ -4,10 +4,7 @@ import re import sys -import pandas as pd - from config import get_args -from kd_utils.Kd_analysis import process_kd_calculation from kd_utils.bathy_processing import process_subsurface_photon_filtering from kd_utils.data_processing import ( Extract_sea_photons, @@ -16,33 +13,41 @@ filter_photon_dataset_by_hull_area, load_data, ) +from kd_utils.Kd_analysis import process_kd_calculation from kd_utils.sea_photons_analysis import process_sea_photon_binning -from kd_utils.visualization import plot_convex_hulls, plot_kd_photons, plot_photon_quality_flags - +from kd_utils.visualization import ( + plot_convex_hulls, + plot_kd_photons, + plot_photon_quality_flags, +) +import pandas as pd -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - 
%(message)s" +) logger = logging.getLogger(__name__) PAIR_ID_MAP = { - 'gt1l': 'gt1_pair', - 'gt1r': 'gt1_pair', - 'gt2l': 'gt2_pair', - 'gt2r': 'gt2_pair', - 'gt3l': 'gt3_pair', - 'gt3r': 'gt3_pair', + "gt1l": "gt1_pair", + "gt1r": "gt1_pair", + "gt2l": "gt2_pair", + "gt2r": "gt2_pair", + "gt3l": "gt3_pair", + "gt3r": "gt3_pair", } def get_target_beams(all_beams, beam_attrs, target_beams_arg): strong_beams = [ - gtx for gtx in all_beams - if beam_attrs[gtx]['atlas_beam_type'].decode('utf-8') == 'strong' + gtx + for gtx in all_beams + if beam_attrs[gtx]["atlas_beam_type"].decode("utf-8") == "strong" ] if not target_beams_arg: return strong_beams[:1] # auto-select first strong beam - requested = [b.strip() for b in target_beams_arg.split(',') if b.strip()] + requested = [b.strip() for b in target_beams_arg.split(",") if b.strip()] return [b for b in requested if b in strong_beams] @@ -58,14 +63,16 @@ def optionally_combine_paired_beams_for_kd(dataset, horizontal_res, enabled=Fals return dataset combined = dataset.copy() - combined['source_beam_id'] = combined['beam_id'] - combined['beam_id'] = combined['beam_id'].map(PAIR_ID_MAP).fillna(combined['beam_id']) + combined["source_beam_id"] = combined["beam_id"] + combined["beam_id"] = ( + combined["beam_id"].map(PAIR_ID_MAP).fillna(combined["beam_id"]) + ) - if 'relative_AT_dist' in combined.columns: - rel_m = pd.to_numeric(combined['relative_AT_dist'], errors='coerce') * 1000.0 - pair_bins = (rel_m / max(horizontal_res, 1)).astype('Int64') + if "relative_AT_dist" in combined.columns: + rel_m = pd.to_numeric(combined["relative_AT_dist"], errors="coerce") * 1000.0 + pair_bins = (rel_m / max(horizontal_res, 1)).astype("Int64") pair_bins = pair_bins.fillna(-1).astype(int) - combined['lat_bins'] = pair_bins + combined["lat_bins"] = pair_bins return combined @@ -87,9 +94,15 @@ def run_pipeline(args): 19 — Check data for reasonableness (filter_photon_dataset_by_hull_area + plot_kd_photons). 
""" - atl03_h5_file_path = os.path.join(args.workspace_path, args.atl03_path, args.atl03_file) - shoreline_data_path = os.path.join(args.workspace_path, args.other_data_path, args.shoreline_data) - gebco_full_path = os.path.join(args.workspace_path, args.other_data_path, args.gebco_path) + atl03_h5_file_path = os.path.join( + args.workspace_path, args.atl03_path, args.atl03_file + ) + shoreline_data_path = os.path.join( + args.workspace_path, args.other_data_path, args.shoreline_data + ) + gebco_full_path = os.path.join( + args.workspace_path, args.other_data_path, args.gebco_path + ) atl24_file_path = args.atl24_file if atl24_file_path and (not os.path.isabs(atl24_file_path)): atl24_file_path = os.path.join(args.workspace_path, atl24_file_path) @@ -99,9 +112,14 @@ def run_pipeline(args): match = re.search(r"_(\d{14})_", atl03_h5_file_path) timestamp = match.group(1) if match else "unknown" - version_match = re.search(r"ATL03_\d{14}_\d{8}_(\d{3})_\d{2}", os.path.basename(atl03_h5_file_path)) + version_match = re.search( + r"ATL03_\d{14}_\d{8}_(\d{3})_\d{2}", os.path.basename(atl03_h5_file_path) + ) atl03_version = int(version_match.group(1)) if version_match else None - logger.info("ATL03 version detected: %s", f"{atl03_version:03d}" if atl03_version else "unknown") + logger.info( + "ATL03 version detected: %s", + f"{atl03_version:03d}" if atl03_version else "unknown", + ) if args.enable_ir_ap_filter: if atl03_version is None or atl03_version < 7: @@ -120,15 +138,21 @@ def run_pipeline(args): is2_mds, is2_attrs, is2_beams = load_data(atl03_h5_file_path, False) target_strong_beams = get_target_beams(is2_beams, is2_attrs, args.target_beams) if not target_strong_beams: - logger.error("No target strong beams found. Check input file and --target_beams.") + logger.error( + "No target strong beams found. Check input file and --target_beams." 
+ ) sys.exit(1) logger.info("Strong beams selected: %s", target_strong_beams) plot_target_beam = [target_strong_beams[0]] - sea_photon_dataset = Extract_sea_photons(is2_mds, target_strong_beams, shoreline_data_path) + sea_photon_dataset = Extract_sea_photons( + is2_mds, target_strong_beams, shoreline_data_path + ) if args.enable_ir_ap_filter and not args.no_plot: - plot_photon_quality_flags(output_path, timestamp, sea_photon_dataset, plot_target_beam) + plot_photon_quality_flags( + output_path, timestamp, sea_photon_dataset, plot_target_beam + ) sea_photon_dataset = apply_optional_ir_ap_filter( sea_photon_dataset, @@ -142,8 +166,8 @@ def run_pipeline(args): # Use --disable_solar_background_filter to turn it off. solar_bg_enabled = not args.disable_solar_background_filter if not solar_bg_enabled: - if 'solar_elevation' in sea_photon_dataset.columns: - max_solar_elev = sea_photon_dataset['solar_elevation'].max() + if "solar_elevation" in sea_photon_dataset.columns: + max_solar_elev = sea_photon_dataset["solar_elevation"].max() if max_solar_elev > args.solar_elevation_day_threshold: logger.warning( "Solar background filter is DISABLED via --disable_solar_background_filter " @@ -164,7 +188,9 @@ def run_pipeline(args): ) binned_dataset_sea_surface = process_sea_photon_binning( - sea_photon_dataset, horizontal_res=args.horizontal_res, vertical_res=args.vertical_res + sea_photon_dataset, + horizontal_res=args.horizontal_res, + vertical_res=args.vertical_res, ) post_refraction_refit_enabled = args.enable_post_refraction_refit @@ -175,50 +201,58 @@ def run_pipeline(args): ) post_refraction_refit_enabled = False - sea_surface_height, sea_surface_label, filtered_seafloor_subsurface_photon_dataset = \ - process_subsurface_photon_filtering( - binned_dataset_sea_surface, - gebco_file_path_lists, - args.subsurface_thresh, - args.ignore_subsurface_height_thres, - use_atl24_filter=args.enable_atl24_filter, - atl24_file_path=atl24_file_path, - 
atl24_max_match_distance_deg=args.atl24_max_match_distance_deg, - use_gebco_filter=args.enable_gebco_filter, - apply_histogram_quality_filter=args.enable_histogram_quality_filter, - histogram_quality_min_ratio=args.histogram_quality_min_ratio, - histogram_quality_depth_min=args.histogram_quality_depth_min, - histogram_quality_depth_max=args.histogram_quality_depth_max, - histogram_quality_ref_depth_min=args.histogram_quality_ref_depth_min, - histogram_quality_ref_depth_max=args.histogram_quality_ref_depth_max, - apply_surface_sigma_filter=args.enable_surface_sigma_filter, - surface_sigma_max=args.surface_sigma_max, - apply_refraction_correction=args.enable_refraction_correction, - refraction_water_temp_c=args.refraction_water_temp_c, - refraction_wavelength_nm=args.refraction_wavelength_nm, - apply_post_refraction_refit=post_refraction_refit_enabled, - apply_flattening=args.enable_sea_surface_flattening, - flattening_window_m=args.sea_surface_flattening_window_m, - horizontal_res=args.horizontal_res, - vertical_res=args.vertical_res, - ) + ( + sea_surface_height, + sea_surface_label, + filtered_seafloor_subsurface_photon_dataset, + ) = process_subsurface_photon_filtering( + binned_dataset_sea_surface, + gebco_file_path_lists, + args.subsurface_thresh, + args.ignore_subsurface_height_thres, + use_atl24_filter=args.enable_atl24_filter, + atl24_file_path=atl24_file_path, + atl24_max_match_distance_deg=args.atl24_max_match_distance_deg, + use_gebco_filter=args.enable_gebco_filter, + apply_histogram_quality_filter=args.enable_histogram_quality_filter, + histogram_quality_min_ratio=args.histogram_quality_min_ratio, + histogram_quality_depth_min=args.histogram_quality_depth_min, + histogram_quality_depth_max=args.histogram_quality_depth_max, + histogram_quality_ref_depth_min=args.histogram_quality_ref_depth_min, + histogram_quality_ref_depth_max=args.histogram_quality_ref_depth_max, + apply_surface_sigma_filter=args.enable_surface_sigma_filter, + 
surface_sigma_max=args.surface_sigma_max, + apply_refraction_correction=args.enable_refraction_correction, + refraction_water_temp_c=args.refraction_water_temp_c, + refraction_wavelength_nm=args.refraction_wavelength_nm, + apply_post_refraction_refit=post_refraction_refit_enabled, + apply_flattening=args.enable_sea_surface_flattening, + flattening_window_m=args.sea_surface_flattening_window_m, + horizontal_res=args.horizontal_res, + vertical_res=args.vertical_res, + ) - final_filtered_subsurface_photon_dataset = filtered_seafloor_subsurface_photon_dataset[ - filtered_seafloor_subsurface_photon_dataset['beam_id'].isin(target_strong_beams) - ].copy() + final_filtered_subsurface_photon_dataset = ( + filtered_seafloor_subsurface_photon_dataset[ + filtered_seafloor_subsurface_photon_dataset["beam_id"].isin( + target_strong_beams + ) + ].copy() + ) if args.enable_convex_hull_filter: - final_filtered_subsurface_photon_dataset, convex_hull_areas, convex_hulls = \ + final_filtered_subsurface_photon_dataset, convex_hull_areas, convex_hulls = ( filter_photon_dataset_by_hull_area( final_filtered_subsurface_photon_dataset, - hull_area_threshold=args.convex_hull_area_threshold + hull_area_threshold=args.convex_hull_area_threshold, ) + ) if plot_target_beam and not args.no_plot: plot_convex_hulls( final_filtered_subsurface_photon_dataset, plot_target_beam, convex_hulls, - convex_hull_areas + convex_hull_areas, ) subsurface_output_path = os.path.join( @@ -230,43 +264,50 @@ def run_pipeline(args): kd_input_dataset = optionally_combine_paired_beams_for_kd( final_filtered_subsurface_photon_dataset, horizontal_res=args.horizontal_res, - enabled=args.enable_paired_beam_combine + enabled=args.enable_paired_beam_combine, ) wave_mult = args.wave_exclusion_multiplier if args.enable_wave_adaptive_fit else 0.0 subsurface_photon_df_added_kd = process_kd_calculation( - kd_input_dataset, decay_zone_threshold=args.decay_zone_threshold, + kd_input_dataset, + 
decay_zone_threshold=args.decay_zone_threshold, kd_fit_method=args.kd_fit_method, wave_exclusion_multiplier=wave_mult, wave_sigma_calm_threshold=args.wave_sigma_calm_threshold, ) - kd_output_path = os.path.join(output_path, f"{timestamp}_AddedKdDataset_strongBeams_Further.csv") + kd_output_path = os.path.join( + output_path, f"{timestamp}_AddedKdDataset_strongBeams_Further.csv" + ) subsurface_photon_df_added_kd.to_csv(kd_output_path, index=False) - if not args.no_plot and 'relative_AT_dist' in kd_input_dataset.columns: + if not args.no_plot and "relative_AT_dist" in kd_input_dataset.columns: unique_photon_dataset = kd_input_dataset[ - ['relative_AT_dist', 'lat_bins', 'photon_height'] + ["relative_AT_dist", "lat_bins", "photon_height"] ].drop_duplicates() - unique_photon_dataset['relative_AT_dist_center'] = unique_photon_dataset.groupby( - 'lat_bins', observed=False - )['relative_AT_dist'].transform('mean') + unique_photon_dataset["relative_AT_dist_center"] = ( + unique_photon_dataset.groupby( + "lat_bins", observed=False + )["relative_AT_dist"].transform("mean") + ) _closest_rows = [] - for _lat_bin, _group in unique_photon_dataset.groupby('lat_bins', observed=False): + for _lat_bin, _group in unique_photon_dataset.groupby( + "lat_bins", observed=False + ): if _group.empty: continue - _center = _group['relative_AT_dist_center'].iloc[0] + _center = _group["relative_AT_dist_center"].iloc[0] _group = _group.copy() - _group['dist_to_center'] = abs(_group['relative_AT_dist'] - _center) - _closest_rows.append(_group.loc[[_group['dist_to_center'].idxmin()]]) + _group["dist_to_center"] = abs(_group["relative_AT_dist"] - _center) + _closest_rows.append(_group.loc[[_group["dist_to_center"].idxmin()]]) if not _closest_rows: logger.warning("No along-track bins survived filtering. 
Skipping plot.") else: closest_to_center = pd.concat(_closest_rows).reset_index(drop=True) kd_df_merged_distance = closest_to_center.merge( - subsurface_photon_df_added_kd, - on='lat_bins', - how='left' - ).drop(columns=['relative_AT_dist_center', 'dist_to_center'], errors='ignore') + subsurface_photon_df_added_kd, on="lat_bins", how="left" + ).drop( + columns=["relative_AT_dist_center", "dist_to_center"], errors="ignore" + ) plot_kd_photons( output_path, @@ -279,7 +320,7 @@ def run_pipeline(args): logger.info("SUCCESS! Kd output: %s", kd_output_path) -if __name__ == '__main__': +if __name__ == "__main__": cli_args = get_args() try: run_pipeline(cli_args) diff --git a/pyproject.toml b/pyproject.toml index 2e859fd..390fe22 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -149,4 +149,4 @@ ignore = [ # "*/tests/*" = ["E266"] [tool.ruff.lint.isort] -force-sort-within-sections = true \ No newline at end of file +force-sort-within-sections = true diff --git a/requirements.txt b/requirements.txt index 9ee58dd..d359c9a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,12 @@ -icepyx - -# Need to confirm which of these remain dependencies -hdbscan -netcdf4 -utm geopandas h5py + +# Need to confirm which of these remain dependencies +hdbscan +icepyx matplotlib +netcdf4 numpy pandas pyproj @@ -15,4 +14,5 @@ rasterio Rtree scikit_learn scipy -Shapely \ No newline at end of file +Shapely +utm From de78ee92b86fd03d50fe19a930dc2dedb8fd7fcf Mon Sep 17 00:00:00 2001 From: Chao Date: Mon, 30 Mar 2026 15:57:04 -0400 Subject: [PATCH 06/11] move updated code into aok/core/kd_utils and remove duplicate icesat-2_kdph_py directory Co-Authored-By: Claude Opus 4.6 --- aok/core/kd_utils/Kd_analysis.py | 824 ++++++++-- aok/core/kd_utils/bathy_processing.py | 891 ++++++----- aok/core/kd_utils/config.py | 447 +++--- aok/core/kd_utils/data_processing.py | 714 ++++----- aok/core/kd_utils/main.py | 279 ++-- aok/core/kd_utils/sea_photons_analysis.py | 386 +++-- 
aok/core/kd_utils/visualization.py | 455 +++--- icesat-2_kdph_py/config.py | 464 ------ icesat-2_kdph_py/kd_utils/Kd_analysis.py | 1003 ------------ .../kd_utils/SeafloorFilterByGEBCO_Module.py | 122 -- icesat-2_kdph_py/kd_utils/__init__.py | 18 - icesat-2_kdph_py/kd_utils/bathy_processing.py | 1101 ------------- icesat-2_kdph_py/kd_utils/data_processing.py | 887 ----------- icesat-2_kdph_py/kd_utils/interpolation.py | 25 - icesat-2_kdph_py/kd_utils/kd_utils.py | 1358 ----------------- .../kd_utils/sea_photons_analysis.py | 570 ------- icesat-2_kdph_py/kd_utils/visualization.py | 495 ------ icesat-2_kdph_py/main.py | 329 ---- 18 files changed, 2290 insertions(+), 8078 deletions(-) delete mode 100644 icesat-2_kdph_py/config.py delete mode 100644 icesat-2_kdph_py/kd_utils/Kd_analysis.py delete mode 100644 icesat-2_kdph_py/kd_utils/SeafloorFilterByGEBCO_Module.py delete mode 100644 icesat-2_kdph_py/kd_utils/__init__.py delete mode 100644 icesat-2_kdph_py/kd_utils/bathy_processing.py delete mode 100644 icesat-2_kdph_py/kd_utils/data_processing.py delete mode 100644 icesat-2_kdph_py/kd_utils/interpolation.py delete mode 100644 icesat-2_kdph_py/kd_utils/kd_utils.py delete mode 100644 icesat-2_kdph_py/kd_utils/sea_photons_analysis.py delete mode 100644 icesat-2_kdph_py/kd_utils/visualization.py delete mode 100644 icesat-2_kdph_py/main.py diff --git a/aok/core/kd_utils/Kd_analysis.py b/aok/core/kd_utils/Kd_analysis.py index d315da2..4b03867 100644 --- a/aok/core/kd_utils/Kd_analysis.py +++ b/aok/core/kd_utils/Kd_analysis.py @@ -1,63 +1,61 @@ # utils/Kd_analysis.py # updated to perform a linear fit in log-space, just like MATLAB's polyfitn(zdepth, y, 1) for a first-order polynomial. 
-import logging - import numpy as np import pandas as pd - -# from scipy.optimize import curve_fit +import logging +from scipy.optimize import curve_fit from sklearn.linear_model import LinearRegression # def log_model(z, kd, e0): # return np.log(e0) - kd * z -## This is wrong because the input is already a bined data, +## This is wrong because the input is already a bined data, ## it is not necessary to do a histogram again # def CalculateKdFromFilteredSubsurfacePhoton(df, vertical_res=0.8): # if df.empty or 'lat_bins' not in df.columns: # return pd.DataFrame({'lat_bins': [np.nan], 'kd': [np.nan], 'e0': [np.nan], 'latitude': [np.nan], 'longitude': [np.nan]}) - + # # Get the latitude bin value # lat_bin_value = df['lat_bins'].iloc[0] if not df['lat_bins'].empty else np.nan # latitude = df['latitude'].mean() if 'latitude' in df.columns else df['lat'].mean() if 'lat' in df.columns else np.nan # longitude = df['longitude'].mean() if 'longitude' in df.columns else df['lon'].mean() if 'lon' in df.columns else np.nan - + # # Calculate photon height range # photon_height_min = df['photon_height'].min() # photon_height_max = df['photon_height'].max() - + # # Check for sufficient data range # if np.isnan(photon_height_min) or np.isnan(photon_height_max) or photon_height_min == photon_height_max: # return pd.DataFrame({'lat_bins': [lat_bin_value], 'kd': [np.nan], 'e0': [np.nan]}) - -# height_bins_range = abs(photon_height_max - photon_height_min) + +# height_bins_range = abs(photon_height_max - photon_height_min) # height_bins_number = round(height_bins_range / vertical_res) - + # # Ensure there are enough bins # if height_bins_number < 5: # return pd.DataFrame({'lat_bins': [lat_bin_value], 'kd': [np.nan], 'e0': [np.nan]}) - + # # Create histogram of photon heights # bin_edges = np.linspace(photon_height_min, photon_height_max, num=height_bins_number) # counts, _ = np.histogram(df['photon_height'], bins=bin_edges) # bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2 - + # # 
Store histogram data # hist_df = pd.DataFrame({'zdepth': bin_centers, 'photon_counts': counts}) - + # # x value for model # # Reverse zdepth to align with the MATLAB approach # hist_df['zdepth'] = hist_df['zdepth'].max() - hist_df['zdepth'] - + # # Log-transform photon counts, replacing zeros with NaN # # y value for model # hist_df['log_photon_counts'] = np.log(hist_df['photon_counts'].replace(0, np.nan)) # hist_df.loc[np.isinf(hist_df['log_photon_counts']), 'log_photon_counts'] = np.nan - + # # Filter out rows with NaNs in either column for regression # valid_data = hist_df.dropna(subset=['zdepth', 'log_photon_counts']) - + # # Check for enough valid data points # # Skip the regression if there are fewer than 5 datapoints # if valid_data['log_photon_counts'].notna().sum() > 3: @@ -68,22 +66,22 @@ # # Perform linear regression # model = LinearRegression() # model.fit(zdepth_valid, log_counts_valid) - + # # Extract kd as the negative of the slope and e0 from the intercept # kd = -model.coef_[0] - + # print('zdepth_valid:',zdepth_valid) # print('photon_counts:',hist_df['photon_counts']) # print('kd:',kd) - + # e0 = np.exp(model.intercept_) - + # # Set kd to NaN if negative # if kd < 0: # kd = np.nan # else: # kd, e0 = np.nan, np.nan - + # return pd.DataFrame({ # 'lat_bins': [lat_bin_value], # 'kd': [kd], @@ -97,47 +95,47 @@ # def CalculateKdFromFilteredSubsurfacePhoton(df, vertical_res=0.25): # if df.empty or 'lat_bins' not in df.columns: # return pd.DataFrame({'lat_bins': [np.nan], 'kd': [np.nan], 'e0': [np.nan], 'latitude': [np.nan], 'longitude': [np.nan]}) - + # # Get the latitude bin value # lat_bin_value = df['lat_bins'].iloc[0] if not df['lat_bins'].empty else np.nan # latitude = df['latitude'].mean() if 'latitude' in df.columns else df['lat'].mean() if 'lat' in df.columns else np.nan # longitude = df['longitude'].mean() if 'longitude' in df.columns else df['lon'].mean() if 'lon' in df.columns else np.nan - + # # Calculate photon height range # 
photon_height_min = df['photon_height'].min() # photon_height_max = df['photon_height'].max() - + # # Check for sufficient data range # if np.isnan(photon_height_min) or np.isnan(photon_height_max) or photon_height_min == photon_height_max: # return pd.DataFrame({'lat_bins': [lat_bin_value], 'kd': [np.nan], 'e0': [np.nan]}) - -# height_bins_range = abs(photon_height_max - photon_height_min) + +# height_bins_range = abs(photon_height_max - photon_height_min) # height_bins_number = round(height_bins_range / vertical_res) - + # # Ensure there are enough bins # if height_bins_number < 5: # return pd.DataFrame({'lat_bins': [lat_bin_value], 'kd': [np.nan], 'e0': [np.nan]}) - + # # Create histogram of photon heights # bin_edges = np.linspace(photon_height_min, photon_height_max, num=height_bins_number) # counts, _ = np.histogram(df['photon_height'], bins=bin_edges) # bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2 - + # # Store histogram data # hist_df = pd.DataFrame({'zdepth': bin_centers, 'photon_counts': counts}) - + # # x value for model # # Reverse zdepth to align with the MATLAB approach # hist_df['zdepth'] = hist_df['zdepth'].max() - hist_df['zdepth'] - + # # Log-transform photon counts, replacing zeros with NaN # # y value for model # hist_df['log_photon_counts'] = np.log(hist_df['photon_counts'].replace(0, np.nan)) # hist_df.loc[np.isinf(hist_df['log_photon_counts']), 'log_photon_counts'] = np.nan - + # # Filter out rows with NaNs in either column for regression # valid_data = hist_df.dropna(subset=['zdepth', 'log_photon_counts']) - + # # Check for enough valid data points # # Skip the regression if there are fewer than 5 datapoints # if valid_data['log_photon_counts'].notna().sum() > 3: @@ -148,22 +146,22 @@ # # Perform linear regression # model = LinearRegression() # model.fit(zdepth_valid, log_counts_valid) - + # # Extract kd as the negative of the slope and e0 from the intercept # kd = -model.coef_[0] - + # print('zdepth_valid:',zdepth_valid) # 
print('photon_counts:',hist_df['photon_counts']) # print('kd:',kd) - + # e0 = np.exp(model.intercept_) - + # # Set kd to NaN if negative # if kd < 0: # kd = np.nan # else: # kd, e0 = np.nan, np.nan - + # return pd.DataFrame({ # 'lat_bins': [lat_bin_value], # 'kd': [kd], @@ -173,119 +171,709 @@ # }) +def find_exponential_decay_zone(hist_df, decay_threshold=0.0): + """ + Identify the depth bins that lie within the exponential decay zone. + + Flowchart step: 16 — Determine depth where signal decays to 1% of incoming + signal or where photon decay is no longer exponential. Bins with zero photon + counts (log undefined) are excluded; the remaining contiguous non-zero bins + define the exponential decay zone. + + Parameters + ---------- + hist_df : pd.DataFrame + Columns: 'zdepth' (depth from surface, ascending), 'photon_counts'. + decay_threshold : float, optional + Fraction of peak photon count below which bins are excluded. + 0.0 (default) preserves original behaviour (only zero-count bins removed). + E.g. 0.01 removes bins with < 1% of the peak signal. + + Returns + ------- + pd.DataFrame + Filtered rows with a 'log_photon_counts' column added; only bins inside + the exponential decay zone are kept (photon_counts > 0 and log finite). + """ + df = hist_df.copy() + + # Apply decay threshold: exclude bins below threshold fraction of peak signal + if decay_threshold > 0.0: + peak_count = df['photon_counts'].max() + if peak_count > 0: + df.loc[df['photon_counts'] < decay_threshold * peak_count, 'photon_counts'] = 0 + + df['log_photon_counts'] = np.log(df['photon_counts'].replace(0, np.nan)) + df.loc[np.isinf(df['log_photon_counts']), 'log_photon_counts'] = np.nan + return df.dropna(subset=['zdepth', 'log_photon_counts']) + + +def fit_beers_law(valid_data): + """ + Fit Beer's Law exponential decay curve to the identified decay zone. + + Flowchart step: 17 — Fit exponential decay curve (Beer's Law) to data + within the zone of exponential decay. 
Implemented as linear regression on + log(photon_counts) vs depth; kd = -slope, e0 = exp(intercept). + + Parameters + ---------- + valid_data : pd.DataFrame + Output of find_exponential_decay_zone; columns 'zdepth' and + 'log_photon_counts'. + + Returns + ------- + tuple[float, float] + (kd, e0). kd is set to np.nan if negative or if fewer than 4 points. + """ + if valid_data['log_photon_counts'].notna().sum() <= 3: + return np.nan, np.nan + + zdepth_valid = valid_data['zdepth'].values.reshape(-1, 1) + log_counts_valid = valid_data['log_photon_counts'].values + + model = LinearRegression() + model.fit(zdepth_valid, log_counts_valid) + + kd = -model.coef_[0] + e0 = np.exp(model.intercept_) + + if kd < 0: + kd = np.nan + return kd, e0 + + +# --------------------------------------------------------------------------- +# Strategy A: Background subtraction + log-linear fit +# --------------------------------------------------------------------------- +def fit_beers_law_bg_subtract(hist_df, bg_fraction=0.2): + """ + Estimate the noise floor from the deepest bins, subtract it, then fit + Beer's Law in log-space on the noise-subtracted counts. + + Parameters + ---------- + hist_df : pd.DataFrame + Columns: 'zdepth' (ascending from surface), 'photon_counts'. + bg_fraction : float + Fraction of the deepest bins used to estimate the background level. + + Returns + ------- + tuple[float, float, float] + (kd, e0, noise_floor). 
+ """ + df = hist_df.copy().sort_values('zdepth') + n_bins = len(df) + if n_bins < 5: + return np.nan, np.nan, np.nan + + # Estimate noise floor from the deepest bg_fraction of bins + n_bg = max(int(n_bins * bg_fraction), 2) + noise_floor = float(df.tail(n_bg)['photon_counts'].median()) + + # Subtract noise and keep only positive residuals + df['signal'] = df['photon_counts'] - noise_floor + df = df[df['signal'] > 0].copy() + if len(df) < 4: + return np.nan, np.nan, noise_floor + + df['log_signal'] = np.log(df['signal']) + + zdepth = df['zdepth'].values.reshape(-1, 1) + log_signal = df['log_signal'].values + + model = LinearRegression() + model.fit(zdepth, log_signal) + + kd = -model.coef_[0] + e0 = np.exp(model.intercept_) + + if kd < 0: + kd = np.nan + return kd, e0, noise_floor + + +# --------------------------------------------------------------------------- +# Strategy B: Breakpoint / segmented regression +# --------------------------------------------------------------------------- +def fit_beers_law_breakpoint(hist_df): + """ + Find the optimal breakpoint between exponential decay and noise floor, + then fit Beer's Law only to the decay segment. + + The breakpoint is chosen by testing every candidate position and selecting + the one that minimises total residual sum of squares of a two-segment + model: linear slope above the breakpoint, flat constant below. + + Parameters + ---------- + hist_df : pd.DataFrame + Columns: 'zdepth' (ascending from surface), 'photon_counts'. + + Returns + ------- + tuple[float, float, float, float] + (kd, e0, breakpoint_depth, noise_floor). 
+ """ + df = hist_df.copy().sort_values('zdepth') + df = df[df['photon_counts'] > 0].copy() + if len(df) < 5: + return np.nan, np.nan, np.nan, np.nan + + df['log_counts'] = np.log(df['photon_counts']) + depths = df['zdepth'].values + log_counts = df['log_counts'].values + n = len(depths) + + best_rss = np.inf + best_bp_idx = None + + # Try each candidate breakpoint (need >= 4 points in decay segment, + # >= 1 in noise segment) + for bp_idx in range(4, n - 1): + # Decay segment: linear fit on bins 0..bp_idx-1 + z_decay = depths[:bp_idx].reshape(-1, 1) + lc_decay = log_counts[:bp_idx] + model = LinearRegression() + model.fit(z_decay, lc_decay) + rss_decay = float(np.sum((model.predict(z_decay) - lc_decay) ** 2)) + + # Noise segment: flat at the mean of bins bp_idx..end + lc_noise = log_counts[bp_idx:] + noise_mean = lc_noise.mean() + rss_noise = float(np.sum((lc_noise - noise_mean) ** 2)) + + total_rss = rss_decay + rss_noise + if total_rss < best_rss: + best_rss = total_rss + best_bp_idx = bp_idx + + if best_bp_idx is None: + return np.nan, np.nan, np.nan, np.nan + + # Final fit on the decay segment + z_decay = depths[:best_bp_idx].reshape(-1, 1) + lc_decay = log_counts[:best_bp_idx] + model = LinearRegression() + model.fit(z_decay, lc_decay) + + kd = -model.coef_[0] + e0 = np.exp(model.intercept_) + breakpoint_depth = float(depths[best_bp_idx]) + noise_floor = float(np.exp(log_counts[best_bp_idx:].mean())) + + if kd < 0: + kd = np.nan + return kd, e0, breakpoint_depth, noise_floor + + +# --------------------------------------------------------------------------- +# Strategy C: Nonlinear fit C(z) = A * exp(-Kd * z) + N +# --------------------------------------------------------------------------- +def _beer_plus_noise(z, A, kd, N): + """Model: signal = A * exp(-kd * z) + N.""" + return A * np.exp(-kd * z) + N + + +def fit_beers_law_nonlinear(hist_df): + """ + Fit the physically correct model C(z) = A·exp(-Kd·z) + N directly + to raw photon counts using nonlinear 
least-squares (no log transform). + + Parameters + ---------- + hist_df : pd.DataFrame + Columns: 'zdepth' (ascending from surface), 'photon_counts'. + + Returns + ------- + tuple[float, float, float] + (kd, e0, noise_floor). + """ + df = hist_df.copy().sort_values('zdepth') + depths = df['zdepth'].values + counts = df['photon_counts'].values.astype(float) + + if len(depths) < 5: + return np.nan, np.nan, np.nan + + # Initial guesses + A0 = float(counts.max()) + N0 = float(np.median(counts[len(counts) * 3 // 4:])) # deepest 25% + # Quick log-linear Kd estimate for initial guess (ignore noise) + pos = counts > 0 + if pos.sum() >= 2: + log_c = np.log(counts[pos]) + z_pos = depths[pos] + kd0 = max(float(-(log_c[-1] - log_c[0]) / (z_pos[-1] - z_pos[0] + 1e-9)), 0.01) + else: + kd0 = 0.1 + + try: + popt, _ = curve_fit( + _beer_plus_noise, depths, counts, + p0=[A0, kd0, N0], + bounds=([0, 0, 0], [np.inf, np.inf, np.inf]), + maxfev=5000 + ) + A_fit, kd_fit, N_fit = popt + if kd_fit <= 0: + kd_fit = np.nan + return float(kd_fit), float(A_fit), float(N_fit) + except (RuntimeError, ValueError): + return np.nan, np.nan, np.nan + + +# --------------------------------------------------------------------------- +# Hybrid: Stabilised breakpoint zone detection + log-linear fit on decay only +# --------------------------------------------------------------------------- +def fit_beers_law_hybrid(hist_df, min_breakpoint_depth=2.0, + expected_noise_floor=None, + noise_floor_tolerance=3.0): + """ + Two-stage approach matching the flowchart intent: + Step 16 — Find where exponential decay transitions to noise floor + (stabilised breakpoint with BIC selection). + Step 17 — Fit Beer's Law log-linear ONLY on the decay segment. + + Stabilisation constraints: + * Minimum breakpoint depth prevents shallow breakpoints that yield + unstable regression from too few decay bins. + * Decay segment slope must be negative (Kd > 0). 
+ * BIC model selection penalises over-fitting, preventing the breakpoint + from drifting too deep (where the decay segment gets long and noisy). + * When expected_noise_floor is provided (from beam-level median), + candidate breakpoints whose noise segment deviates too far from the + expected value are penalised. + * Falls back to full log-linear if no valid breakpoint improves BIC + over a single-line model. + + Parameters + ---------- + hist_df : pd.DataFrame + Columns: 'zdepth' (ascending from surface), 'photon_counts'. + min_breakpoint_depth : float + Minimum depth (m) below surface for a valid breakpoint. Prevents + breakpoints landing in the first few bins where regression is + unstable. Default 2.0 m. + expected_noise_floor : float or None + If provided, the beam-level median noise floor (photon counts). + Candidate breakpoints whose noise segment mean deviates by more + than ``noise_floor_tolerance`` times from the expected value + receive a BIC penalty. + noise_floor_tolerance : float + Factor controlling how far the candidate noise segment mean may + deviate from ``expected_noise_floor`` before a penalty is applied. + Default 3.0 (allow 3x variation). + + Returns + ------- + tuple[float, float, float, float] + (kd, e0, breakpoint_depth, noise_floor). + breakpoint_depth and noise_floor are np.nan if fallback to full fit. 
+ """ + df = hist_df.copy().sort_values('zdepth') + df = df[df['photon_counts'] > 0].copy() + n = len(df) + if n < 6: + return np.nan, np.nan, np.nan, np.nan + + df['log_counts'] = np.log(df['photon_counts']) + depths = df['zdepth'].values + log_counts = df['log_counts'].values + + # ------------------------------------------------------------------ + # Reference: BIC for a single-line model (no breakpoint) + # ------------------------------------------------------------------ + model_full = LinearRegression() + model_full.fit(depths.reshape(-1, 1), log_counts) + rss_full = float(np.sum((model_full.predict(depths.reshape(-1, 1)) - log_counts) ** 2)) + k_full = 2 # slope + intercept + bic_full = n * np.log(rss_full / n + 1e-10) + k_full * np.log(n) + + # ------------------------------------------------------------------ + # Search: best breakpoint using BIC + # ------------------------------------------------------------------ + best_bic = bic_full # must beat the single-line model + best_bp_idx = None + best_model = None + + for bp_idx in range(4, n - 1): + # -- Minimum breakpoint depth constraint -- + if depths[bp_idx] < min_breakpoint_depth: + continue + + # -- Decay segment (surface to breakpoint) -- + z_decay = depths[:bp_idx].reshape(-1, 1) + lc_decay = log_counts[:bp_idx] + model = LinearRegression() + model.fit(z_decay, lc_decay) + + # Physical constraint: slope must be negative + if model.coef_[0] >= 0: + continue + + pred_decay = model.predict(z_decay) + rss_decay = float(np.sum((pred_decay - lc_decay) ** 2)) + + # -- Noise segment (breakpoint to bottom) -- + lc_noise = log_counts[bp_idx:] + noise_mean = float(lc_noise.mean()) + rss_noise = float(np.sum((lc_noise - noise_mean) ** 2)) + + # -- BIC for piecewise model (3 params: slope, intercept, noise level) + rss_total = rss_decay + rss_noise + k_bp = 3 + bic_bp = n * np.log(rss_total / n + 1e-10) + k_bp * np.log(n) + + # -- Noise floor consistency penalty -- + # When we have a beam-level expected noise 
floor, penalise + # candidates whose noise segment deviates substantially. + if expected_noise_floor is not None and expected_noise_floor > 0: + candidate_nf = float(np.exp(noise_mean)) + ratio = candidate_nf / expected_noise_floor + if ratio > noise_floor_tolerance or ratio < 1.0 / noise_floor_tolerance: + # Add a penalty proportional to log-deviation + bic_bp += n * abs(np.log(ratio)) + + if bic_bp < best_bic: + best_bic = bic_bp + best_bp_idx = bp_idx + best_model = model + + # ------------------------------------------------------------------ + # Result + # ------------------------------------------------------------------ + if best_bp_idx is None: + # No breakpoint beats the single-line model — fall back + kd = -model_full.coef_[0] + e0 = np.exp(model_full.intercept_) + if kd < 0: + kd = np.nan + return kd, e0, np.nan, np.nan + + kd = -best_model.coef_[0] + e0 = np.exp(best_model.intercept_) + breakpoint_depth = float(depths[best_bp_idx]) + noise_floor = float(np.exp(log_counts[best_bp_idx:].mean())) + + if kd < 0: + kd = np.nan + return kd, e0, breakpoint_depth, noise_floor + + +# --------------------------------------------------------------------------- +# Dispatcher: select fitting strategy by name +# --------------------------------------------------------------------------- +KD_FIT_METHODS = ('log_linear', 'bg_subtract', 'breakpoint', 'nonlinear', 'hybrid') + + +def _fit_kd_with_method(hist_df, method='log_linear', decay_threshold=0.0, + expected_noise_floor=None): + """ + Run the requested fitting strategy on a single histogram. + + Returns + ------- + tuple[float, float, float] + (kd, e0, noise_floor). noise_floor is np.nan for log_linear. 
+ """ + if method == 'log_linear': + valid = find_exponential_decay_zone(hist_df, decay_threshold=decay_threshold) + kd, e0 = fit_beers_law(valid) + return kd, e0, np.nan + + elif method == 'bg_subtract': + return fit_beers_law_bg_subtract(hist_df) + + elif method == 'breakpoint': + kd, e0, bp, nf = fit_beers_law_breakpoint(hist_df) + return kd, e0, nf + + elif method == 'nonlinear': + return fit_beers_law_nonlinear(hist_df) + + elif method == 'hybrid': + kd, e0, bp, nf = fit_beers_law_hybrid( + hist_df, expected_noise_floor=expected_noise_floor + ) + return kd, e0, nf + + else: + raise ValueError(f"Unknown kd_fit_method: {method!r}. " + f"Choose from {KD_FIT_METHODS}") + + # another solution is to calculate kd without hist -def CalculateKdFromFilteredSubsurfacePhoton(df, vertical_res=0.8): +def CalculateKdFromFilteredSubsurfacePhoton(df, vertical_res=0.8, decay_zone_threshold=0.0, + kd_fit_method='log_linear', expected_noise_floor=None, + surface_sigma=None, + wave_exclusion_multiplier=0.0, + wave_sigma_calm_threshold=0.1): + """ + Calculate Kd for a single along-track bin by fitting Beer's Law in log-space. + + Orchestrates steps 16 and 17 by calling find_exponential_decay_zone and + fit_beers_law in sequence. + + Flowchart steps: + 16 — find_exponential_decay_zone: determine depth range of exponential decay. + 17 — fit_beers_law: fit Beer's Law curve; kd = -slope, e0 = exp(intercept). 
+ """ # Early exit if DataFrame is empty or missing required column - if df.empty or "lat_bins" not in df.columns: - return pd.DataFrame( - { - "lat_bins": [np.nan], - "kd": [np.nan], - "e0": [np.nan], - "latitude": [np.nan], - "longitude": [np.nan], - } - ) + if df.empty or 'lat_bins' not in df.columns: + return pd.DataFrame({'lat_bins': [np.nan], 'kd': [np.nan], 'e0': [np.nan], + 'noise_floor': [np.nan], 'surface_sigma': [np.nan], + 'latitude': [np.nan], 'longitude': [np.nan]}) # Retrieve latitude and longitude - lat_bin_value = df["lat_bins"].iloc[0] if not df["lat_bins"].empty else np.nan - latitude = ( - df["latitude"].mean() - if "latitude" in df.columns - else df["lat"].mean() - if "lat" in df.columns - else np.nan - ) - longitude = ( - df["longitude"].mean() - if "longitude" in df.columns - else df["lon"].mean() - if "lon" in df.columns - else np.nan - ) + lat_bin_value = df['lat_bins'].iloc[0] if not df['lat_bins'].empty else np.nan + latitude = df['latitude'].mean() if 'latitude' in df.columns else df['lat'].mean() if 'lat' in df.columns else np.nan + longitude = df['longitude'].mean() if 'longitude' in df.columns else df['lon'].mean() if 'lon' in df.columns else np.nan # Use value_counts to get photon counts in each height bin - height_counts = df["height_bins"].value_counts().sort_index() + height_counts = df['height_bins'].value_counts().sort_index() bin_centers = height_counts.index.astype(float) # Create a DataFrame for the height bins and counts - hist_df = pd.DataFrame( - {"zdepth": bin_centers, "photon_counts": height_counts.values} - ) + hist_df = pd.DataFrame({'zdepth': bin_centers, 'photon_counts': height_counts.values}) # Reverse zdepth for model alignment - hist_df["zdepth"] = hist_df["zdepth"].max() - hist_df["zdepth"] - - # Log-transform photon counts, replacing zeros with NaN for regression - hist_df["log_photon_counts"] = np.log(hist_df["photon_counts"].replace(0, np.nan)) - hist_df.loc[np.isinf(hist_df["log_photon_counts"]), 
"log_photon_counts"] = np.nan - - # Filter for rows without NaNs for regression - valid_data = hist_df.dropna(subset=["zdepth", "log_photon_counts"]) + hist_df['zdepth'] = hist_df['zdepth'].max() - hist_df['zdepth'] + + # Wave-adaptive fit: skip shallow bins contaminated by wave smearing/bubbles. + # zdepth=0 in the histogram corresponds to the adaptive surface detection + # threshold: max(3*sigma, 1.0 m) below the Gaussian surface peak + # (set by get_sea_surface_height_adaptive). The wave-adaptive trim adds + # an additional margin of (k - 3)*sigma beyond the 3-sigma cutoff to + # cover bubble injection beneath wave troughs and compensate for sigma + # underestimation from the truncated ±1 m Gaussian fit window. + if surface_sigma is not None and wave_exclusion_multiplier > 0: + if not np.isnan(surface_sigma) and surface_sigma > wave_sigma_calm_threshold: + adaptive_offset = max(3.0 * surface_sigma, 1.0) + wave_skip = max(0.0, wave_exclusion_multiplier * surface_sigma - adaptive_offset) + trimmed = hist_df[hist_df['zdepth'] >= wave_skip] + if len(trimmed) >= 4: + logging.info("Wave-adaptive trim: sigma=%.3f, adaptive_offset=%.2f m, " + "skip=%.2f m, %d->%d bins", + surface_sigma, adaptive_offset, wave_skip, + len(hist_df), len(trimmed)) + hist_df = trimmed + # else: keep original hist_df (too few bins after trim) + + kd, e0, noise_floor = _fit_kd_with_method( + hist_df, method=kd_fit_method, decay_threshold=decay_zone_threshold, + expected_noise_floor=expected_noise_floor + ) - # Perform regression if there are sufficient valid data points - if valid_data["log_photon_counts"].notna().sum() > 3: - zdepth_valid = valid_data["zdepth"].values.reshape(-1, 1) - log_counts_valid = valid_data["log_photon_counts"].values + return pd.DataFrame({ + 'lat_bins': [lat_bin_value], + 'kd': [kd], + 'e0': [e0], + 'noise_floor': [noise_floor], + 'surface_sigma': [surface_sigma if surface_sigma is not None else np.nan], + 'latitude': [latitude], + 'longitude': [longitude] + }) - # 
Perform linear regression - model = LinearRegression() - model.fit(zdepth_valid, log_counts_valid) - # Calculate kd and e0 - kd = -model.coef_[0] - e0 = np.exp(model.intercept_) +# Original kd calculation function remains unchanged +def calculate_kd(filtered_seafloor_subsurface_photon_dataset, decay_zone_threshold=0.0, kd_fit_method='log_linear', + wave_exclusion_multiplier=0.0, wave_sigma_calm_threshold=0.1): + """ + Loop over along-track bins and call CalculateKdFromFilteredSubsurfacePhoton + for each bin. + + For the 'hybrid' method, a two-pass approach is used: + Pass 1 — Fit each bin independently to estimate per-bin noise floors. + Pass 2 — Compute the beam-level median noise floor, then re-fit each + bin with the expected noise floor as a stabilisation constraint. + + After fitting, IQR-based outlier filtering removes extreme Kd values + (for hybrid method only). + + Flowchart steps: + 16 — Determine depth of exponential decay zone (per bin, inside + CalculateKdFromFilteredSubsurfacePhoton). + 17 — Fit Beer's Law exponential decay curve (per bin). + 18 — Save K_dph attenuation coefficient value and statistical terms + (kd, e0 columns in the returned DataFrame). 
+ """ + logging.info("Calculating Kd from filtered subsurface photon dataset (method=%s)", kd_fit_method) + + empty_df = pd.DataFrame({'lat_bins': [], 'kd': [], 'e0': [], 'noise_floor': [], 'surface_sigma': [], 'latitude': [], 'longitude': []}) + + groups = list(filtered_seafloor_subsurface_photon_dataset.groupby('lat_bins', observed=False)) + if not groups: + return empty_df + + if kd_fit_method == 'hybrid': + # ------------------------------------------------------------------ + # Pass 1: estimate per-bin noise floors (no expected_noise_floor) + # ------------------------------------------------------------------ + logging.info("Hybrid pass 1: estimating per-bin noise floors") + pass1_results = [] + for _, group in groups: + sigma = float(group['surface_sigma'].iloc[0]) if ('surface_sigma' in group.columns and len(group) > 0) else None + pass1_results.append(CalculateKdFromFilteredSubsurfacePhoton( + group, decay_zone_threshold=decay_zone_threshold, + kd_fit_method=kd_fit_method, + surface_sigma=sigma, + wave_exclusion_multiplier=wave_exclusion_multiplier, + wave_sigma_calm_threshold=wave_sigma_calm_threshold, + )) + if not pass1_results: + return empty_df + pass1_df = pd.concat(pass1_results, ignore_index=True) + + # ------------------------------------------------------------------ + # Compute sliding-window noise floor for pass 2 + # ------------------------------------------------------------------ + # 10 km window = ±10 bins at 500 m resolution (20 bins total). + # Each bin's expected noise floor = median of valid noise floors + # within ±half_window bins. Falls back to beam-level median when + # the local window has < 3 valid estimates. 
+ HALF_WINDOW = 10 # ±10 bins = ±5 km at 500 m horizontal_res + MIN_LOCAL = 3 # minimum valid noise floors to use local median + + nf_array = pass1_df['noise_floor'].values.copy() + n_bins = len(nf_array) + + # Beam-level fallback + valid_nf_all = pass1_df['noise_floor'].dropna() + valid_nf_all = valid_nf_all[valid_nf_all > 0] + beam_median_nf = float(valid_nf_all.median()) if len(valid_nf_all) >= MIN_LOCAL else None + + if beam_median_nf is not None: + # Build per-bin expected noise floor via sliding window + expected_nf_per_bin = np.full(n_bins, np.nan) + for i in range(n_bins): + lo = max(0, i - HALF_WINDOW) + hi = min(n_bins, i + HALF_WINDOW + 1) + window_nf = nf_array[lo:hi] + valid = window_nf[~np.isnan(window_nf) & (window_nf > 0)] + if len(valid) >= MIN_LOCAL: + expected_nf_per_bin[i] = float(np.median(valid)) + else: + expected_nf_per_bin[i] = beam_median_nf # fallback + + logging.info("Hybrid pass 2: sliding window noise floor " + "(half_window=%d bins, beam_median=%.3f, " + "local range=%.3f-%.3f)", + HALF_WINDOW, beam_median_nf, + float(np.nanmin(expected_nf_per_bin)), + float(np.nanmax(expected_nf_per_bin))) + else: + expected_nf_per_bin = None + logging.info("Hybrid: insufficient noise floor estimates (%d), " + "skipping pass 2", len(valid_nf_all)) + + # ------------------------------------------------------------------ + # Pass 2: re-fit with per-bin expected noise floor constraint + # ------------------------------------------------------------------ + if expected_nf_per_bin is not None: + results = [] + for idx, (_, group) in enumerate(groups): + sigma = float(group['surface_sigma'].iloc[0]) if ('surface_sigma' in group.columns and len(group) > 0) else None + results.append(CalculateKdFromFilteredSubsurfacePhoton( + group, decay_zone_threshold=decay_zone_threshold, + kd_fit_method=kd_fit_method, + expected_noise_floor=float(expected_nf_per_bin[idx]), + surface_sigma=sigma, + wave_exclusion_multiplier=wave_exclusion_multiplier, + 
wave_sigma_calm_threshold=wave_sigma_calm_threshold + )) + SubsurfacePhotonDFAddedKd = pd.concat(results, ignore_index=True) + else: + SubsurfacePhotonDFAddedKd = pass1_df + + # ------------------------------------------------------------------ + # IQR-based outlier filtering (hybrid only) + # Uses 3×IQR to accommodate real spatial variation (e.g. turbidity + # gradients near river mouths) while removing fitting artefacts. + # ------------------------------------------------------------------ + kd_vals = SubsurfacePhotonDFAddedKd['kd'].dropna() + if len(kd_vals) >= 5: + q1 = float(kd_vals.quantile(0.25)) + q3 = float(kd_vals.quantile(0.75)) + iqr = q3 - q1 + lower = q1 - 3.0 * iqr + upper = q3 + 3.0 * iqr + # Also apply physical cap: Kd > 5.0 m⁻¹ is unrealistic + upper = min(upper, 5.0) + outlier_mask = (SubsurfacePhotonDFAddedKd['kd'] < lower) | (SubsurfacePhotonDFAddedKd['kd'] > upper) + n_outliers = int(outlier_mask.sum()) + if n_outliers > 0: + logging.info("Hybrid IQR filter: removed %d outliers (bounds: [%.4f, %.4f])", + n_outliers, lower, upper) + SubsurfacePhotonDFAddedKd.loc[outlier_mask, 'kd'] = np.nan - # Set kd to NaN if negative - if kd < 0: - kd = np.nan else: - kd, e0 = np.nan, np.nan - - return pd.DataFrame( - { - "lat_bins": [lat_bin_value], - "kd": [kd], - "e0": [e0], - "latitude": [latitude], - "longitude": [longitude], - } - ) - + # ------------------------------------------------------------------ + # Non-hybrid methods: single pass, no IQR filtering + # ------------------------------------------------------------------ + results = [] + for _, group in groups: + sigma = float(group['surface_sigma'].iloc[0]) if ('surface_sigma' in group.columns and len(group) > 0) else None + results.append(CalculateKdFromFilteredSubsurfacePhoton( + group, decay_zone_threshold=decay_zone_threshold, kd_fit_method=kd_fit_method, + surface_sigma=sigma, + wave_exclusion_multiplier=wave_exclusion_multiplier, + wave_sigma_calm_threshold=wave_sigma_calm_threshold, + )) 
+ if not results: + return empty_df + SubsurfacePhotonDFAddedKd = pd.concat(results, ignore_index=True) -# Original kd calculation function remains unchanged -def calculate_kd(filtered_seafloor_subsurface_photon_dataset): - logging.info("Calculating Kd from filtered subsurface photon dataset") - SubsurfacePhotonDFAddedKd = filtered_seafloor_subsurface_photon_dataset.groupby( - "lat_bins", observed=False - ).apply(CalculateKdFromFilteredSubsurfacePhoton, include_groups=True) - - # Remove the index without resetting it if 'lat_bins' already exists - SubsurfacePhotonDFAddedKd = SubsurfacePhotonDFAddedKd.droplevel(0) return SubsurfacePhotonDFAddedKd # Updated function to apply kd calculation beam-by-beam -def process_kd_calculation(Final_filtered_subsurface_photon_dataset): +def process_kd_calculation(Final_filtered_subsurface_photon_dataset, decay_zone_threshold=0.0, kd_fit_method='log_linear', + wave_exclusion_multiplier=0.0, wave_sigma_calm_threshold=0.1): + """ + Beam-by-beam wrapper that calls calculate_kd for every beam and concatenates + the results into a single Kd output DataFrame. + + Flowchart steps: + 16 — Determine depth of exponential decay zone (delegated to + CalculateKdFromFilteredSubsurfacePhoton). + 17 — Fit Beer's Law exponential decay curve (delegated). + 18 — Save K_dph attenuation coefficient value (slope of the Beer's Law + curve) and statistical terms (kd, e0) — the returned DataFrame is + written to CSV by run_pipeline. 
+ """ # Initialize list to store results for each beam kd_beam_datasets = [] # Group by 'beam_id' to process each beam independently - for beam_id, beam_data in Final_filtered_subsurface_photon_dataset.groupby( - "beam_id" - ): + for beam_id, beam_data in Final_filtered_subsurface_photon_dataset.groupby('beam_id'): logging.info(f"Calculating Kd for beam: {beam_id}") # Apply the calculate_kd function to the current beam's dataset - SubsurfacePhotonDFAddedKd = calculate_kd(beam_data) + SubsurfacePhotonDFAddedKd = calculate_kd( + beam_data, decay_zone_threshold=decay_zone_threshold, kd_fit_method=kd_fit_method, + wave_exclusion_multiplier=wave_exclusion_multiplier, + wave_sigma_calm_threshold=wave_sigma_calm_threshold, + ) # Add a column to track the beam_id in the results - SubsurfacePhotonDFAddedKd["beam_id"] = beam_id + SubsurfacePhotonDFAddedKd['beam_id'] = beam_id # Append the result to the list kd_beam_datasets.append(SubsurfacePhotonDFAddedKd) + if not kd_beam_datasets: + logging.warning( + "process_kd_calculation: no beam data remained after filtering. " + "All bins may have been discarded by an upstream filter (e.g. histogram quality). " + "Returning empty DataFrame." 
+ ) + return pd.DataFrame(columns=['lat_bins', 'kd', 'e0', 'noise_floor', 'surface_sigma', 'latitude', 'longitude', 'beam_id']) + # Combine results from all beams into a single DataFrame combined_kd_dataset = pd.concat(kd_beam_datasets, ignore_index=True) - return combined_kd_dataset + return combined_kd_dataset \ No newline at end of file diff --git a/aok/core/kd_utils/bathy_processing.py b/aok/core/kd_utils/bathy_processing.py index 07e1f19..772f5cd 100644 --- a/aok/core/kd_utils/bathy_processing.py +++ b/aok/core/kd_utils/bathy_processing.py @@ -1,18 +1,18 @@ # utils/bathy_processing.py import os - -import numpy as np +import logging import pandas as pd import rasterio -from rtree import index -from scipy.spatial import cKDTree -from scipy.stats import norm -from shapely.geometry import box - +import numpy as np from kd_utils.sea_photons_analysis import ( get_sea_surface_height_adaptive, + get_sea_surface_height_static, horizontal_vertical_bin_dataset, ) +from rtree import index +from shapely.geometry import box, Point +from scipy.spatial import cKDTree +from scipy.stats import norm def apply_optional_histogram_quality_filter( @@ -22,108 +22,160 @@ def apply_optional_histogram_quality_filter( min_ratio=0.05, depth_min=6.0, depth_max=7.0, + reference_depth_min=0.0, + reference_depth_max=1.0 ): """ - Optional histogram quality check: - keep along-track bins only if the target depth-band photon ratio is >= min_ratio. + Optional histogram quality check: keep along-track bins only if enough + signal remains at the target depth band relative to near-surface signal. + + Flowchart step: 6 — Review histograms for quality: if <5% of signal remains + at an uncorrected water depth of 6-7 m below the surface, quality-flag this + along-track bin as low confidence and exclude it from further analyses + (Possible discard). 
+ + The decay ratio is computed as: + photon_count_in_depth_band / photon_count_in_reference_band + + This measures how much signal has decayed at depth compared to near the + surface, consistent with Beer's Law attenuation. + + Parameters + ---------- + binned_dataset_sea_surface : pd.DataFrame + Full binned photon dataset with 'lat_bins' and 'photon_height'. + subsurface_photon_dataset : pd.DataFrame + Subsurface photon dataset to filter. + sea_surface_height : list + Sea surface height per along-track bin. + min_ratio : float + Minimum decay ratio to keep a bin (default 0.05 = 5%). + depth_min : float + Minimum depth of the quality-check band (default 6.0 m). + depth_max : float + Maximum depth of the quality-check band (default 7.0 m). + reference_depth_min : float + Minimum depth of the near-surface reference band (default 0.0 m). + reference_depth_max : float + Maximum depth of the near-surface reference band (default 1.0 m). """ if subsurface_photon_dataset.empty: return subsurface_photon_dataset - grouped = binned_dataset_sea_surface.groupby(["lat_bins"], observed=False) + grouped = binned_dataset_sea_surface.groupby(['lat_bins'], observed=False) lat_bin_keys = list(grouped.groups.keys()) if len(lat_bin_keys) != len(sea_surface_height): return subsurface_photon_dataset - quality_df = ( - pd.DataFrame( - {"lat_bins": lat_bin_keys, "sea_surface_height": sea_surface_height} - ) - .dropna(subset=["sea_surface_height"]) - .copy() - ) + quality_df = pd.DataFrame({ + 'lat_bins': lat_bin_keys, + 'sea_surface_height': sea_surface_height + }).dropna(subset=['sea_surface_height']).copy() if quality_df.empty: return subsurface_photon_dataset.iloc[0:0].copy() ratios = [] for _, row in quality_df.iterrows(): - lat_bin = row["lat_bins"] - surface = row["sea_surface_height"] - bin_data = binned_dataset_sea_surface[ - binned_dataset_sea_surface["lat_bins"] == lat_bin - ] + lat_bin = row['lat_bins'] + surface = row['sea_surface_height'] + bin_data = 
binned_dataset_sea_surface[binned_dataset_sea_surface['lat_bins'] == lat_bin] if bin_data.empty: ratios.append(0.0) continue - depth = surface - bin_data["photon_height"] + depth = surface - bin_data['photon_height'] underwater_mask = depth > 0 - total_underwater = int(underwater_mask.sum()) - if total_underwater == 0: + + # Reference band: near-surface photon count (incoming signal) + ref_mask = underwater_mask & (depth >= reference_depth_min) & (depth <= reference_depth_max) + ref_count = float(ref_mask.sum()) + if ref_count == 0: ratios.append(0.0) continue + # Target band: photon count at the check depth band_mask = underwater_mask & (depth >= depth_min) & (depth <= depth_max) - ratios.append(float(band_mask.sum()) / float(total_underwater)) + band_count = float(band_mask.sum()) - quality_df["hist_quality_ratio"] = ratios - valid_bins = set( - quality_df.loc[ - quality_df["hist_quality_ratio"] >= min_ratio, "lat_bins" - ].tolist() - ) - return subsurface_photon_dataset[ - subsurface_photon_dataset["lat_bins"].isin(valid_bins) - ].copy() + # Normalize by band width so bands of different sizes are comparable + ref_width = max(reference_depth_max - reference_depth_min, 0.01) + band_width = max(depth_max - depth_min, 0.01) + decay_ratio = (band_count / band_width) / (ref_count / ref_width) + ratios.append(decay_ratio) -def apply_optional_surface_sigma_filter( - binned_dataset_sea_surface, - subsurface_photon_dataset, - sea_surface_height, - sigma_max=0.5, -): - """ - Optional Gaussian surface sigma filter: - discard along-track bins if fitted surface sigma exceeds sigma_max. 
- """ - if subsurface_photon_dataset.empty: - return subsurface_photon_dataset + quality_df['hist_quality_ratio'] = ratios + valid_bins = set(quality_df.loc[quality_df['hist_quality_ratio'] >= min_ratio, 'lat_bins'].tolist()) + return subsurface_photon_dataset[subsurface_photon_dataset['lat_bins'].isin(valid_bins)].copy() + + +def compute_surface_sigma(binned_dataset_sea_surface, sea_surface_height): + """Compute per-bin Gaussian surface sigma from photons near the surface peak. - grouped = binned_dataset_sea_surface.groupby(["lat_bins"], observed=False) + Uses a two-pass fit: initial ±1 m window, then expands to ±min(2.5σ, 3 m) + if the initial sigma > 0.4 m (indicating the ±1 m window truncates the + distribution). Returns a dict mapping lat_bin -> sigma (float or NaN). + Bins with fewer than 10 surface photons get NaN. + """ + grouped = binned_dataset_sea_surface.groupby(['lat_bins'], observed=False) lat_bin_keys = list(grouped.groups.keys()) if len(lat_bin_keys) != len(sea_surface_height): - return subsurface_photon_dataset + return {} - sigma_rows = [] - for lat_bin, surface in zip(lat_bin_keys, sea_surface_height, strict=False): + sigma_map = {} + for lat_bin, surface in zip(lat_bin_keys, sea_surface_height): if pd.isna(surface): continue - bin_data = binned_dataset_sea_surface[ - binned_dataset_sea_surface["lat_bins"] == lat_bin - ] + bin_data = binned_dataset_sea_surface[binned_dataset_sea_surface['lat_bins'] == lat_bin] if bin_data.empty: continue peak_data = bin_data[ - (bin_data["photon_height"] > surface - 1.0) - & (bin_data["photon_height"] < surface + 1.0) + (bin_data['photon_height'] > surface - 1.0) & + (bin_data['photon_height'] < surface + 1.0) ] if len(peak_data) > 10: - _, sigma = norm.fit(peak_data["photon_height"]) + mu, sigma = norm.fit(peak_data['photon_height']) + if sigma > 0.4: + half_win2 = min(2.5 * sigma, 3.0) + peak_data2 = bin_data[ + (bin_data['photon_height'] > mu - half_win2) & + (bin_data['photon_height'] < mu + half_win2) + ] + 
if len(peak_data2) > 10: + _, sigma = norm.fit(peak_data2['photon_height']) else: sigma = np.nan - sigma_rows.append((lat_bin, sigma)) + sigma_map[lat_bin] = sigma + + return sigma_map + - if not sigma_rows: +def apply_optional_surface_sigma_filter( + binned_dataset_sea_surface, + subsurface_photon_dataset, + sea_surface_height, + sigma_max=0.5 +): + """ + Optional Gaussian surface sigma filter: discard along-track bins if the + fitted surface Gaussian sigma exceeds sigma_max. + + Flowchart step: 8 — If standard deviation of Gaussian peak is >X m, + quality-flag this along-track bin as low confidence and discard it + (addresses surface waves) (Possible discard). + """ + if subsurface_photon_dataset.empty: return subsurface_photon_dataset - sigma_df = pd.DataFrame(sigma_rows, columns=["lat_bins", "surface_sigma"]) - valid_bins = set( - sigma_df.loc[sigma_df["surface_sigma"] <= sigma_max, "lat_bins"].tolist() + sigma_map = compute_surface_sigma(binned_dataset_sea_surface, sea_surface_height) + if not sigma_map: + return subsurface_photon_dataset + + sigma_df = pd.DataFrame( + list(sigma_map.items()), columns=['lat_bins', 'surface_sigma'] ) - return subsurface_photon_dataset[ - subsurface_photon_dataset["lat_bins"].isin(valid_bins) - ].copy() + valid_bins = set(sigma_df.loc[sigma_df['surface_sigma'] <= sigma_max, 'lat_bins'].tolist()) + return subsurface_photon_dataset[subsurface_photon_dataset['lat_bins'].isin(valid_bins)].copy() def apply_optional_refraction_correction( @@ -131,42 +183,32 @@ def apply_optional_refraction_correction( subsurface_photon_dataset, sea_surface_height, water_temp_c=20.0, - wavelength_nm=532.0, + wavelength_nm=532.0 ): """ Optional refraction correction for subsurface photons. Updates photon positions/heights using Snell-based geometry. + + Flowchart step: 9 — Compute water depths (distance below surface) and + correct depths for refraction. 
""" - required_cols = { - "lat_bins", - "photon_height", - "ref_elevation", - "ref_azimuth", - "lon", - "lat", - } - if subsurface_photon_dataset.empty or ( - not required_cols.issubset(set(subsurface_photon_dataset.columns)) - ): + required_cols = {'lat_bins', 'photon_height', 'ref_elevation', 'ref_azimuth', 'lon', 'lat'} + if subsurface_photon_dataset.empty or (not required_cols.issubset(set(subsurface_photon_dataset.columns))): return subsurface_photon_dataset - grouped = binned_dataset_sea_surface.groupby(["lat_bins"], observed=False) + grouped = binned_dataset_sea_surface.groupby(['lat_bins'], observed=False) lat_bin_keys = list(grouped.groups.keys()) if len(lat_bin_keys) != len(sea_surface_height): return subsurface_photon_dataset - surface_df = pd.DataFrame( - {"lat_bins": lat_bin_keys, "sea_surface_height": sea_surface_height} - ) - corrected = subsurface_photon_dataset.merge( - surface_df, on="lat_bins", how="left" - ).copy() - if corrected["sea_surface_height"].isna().all(): + surface_df = pd.DataFrame({'lat_bins': lat_bin_keys, 'sea_surface_height': sea_surface_height}) + corrected = subsurface_photon_dataset.merge(surface_df, on='lat_bins', how='left').copy() + if corrected['sea_surface_height'].isna().all(): return subsurface_photon_dataset - corrected["photon_height_pre_refraction"] = corrected["photon_height"] - corrected["lon_pre_refraction"] = corrected["lon"] - corrected["lat_pre_refraction"] = corrected["lat"] + corrected['photon_height_pre_refraction'] = corrected['photon_height'] + corrected['lon_pre_refraction'] = corrected['lon'] + corrected['lat_pre_refraction'] = corrected['lat'] # Refraction index parameterization from legacy module. 
a = -0.000001501562500 @@ -175,26 +217,14 @@ def apply_optional_refraction_correction( d = -0.000160475520686 e = 1.398067112092424 n1 = 1.00029 - n2 = ( - (a * water_temp_c**2) - + (b * wavelength_nm**2) - + (c * water_temp_c) - + (d * wavelength_nm) - + e - ) + n2 = (a * water_temp_c ** 2) + (b * wavelength_nm ** 2) + (c * water_temp_c) + (d * wavelength_nm) + e - ref_elev = pd.to_numeric(corrected["ref_elevation"], errors="coerce").to_numpy( - dtype=float - ) - ref_az = pd.to_numeric(corrected["ref_azimuth"], errors="coerce").to_numpy( - dtype=float - ) - z = pd.to_numeric(corrected["photon_height"], errors="coerce").to_numpy(dtype=float) - ws = pd.to_numeric(corrected["sea_surface_height"], errors="coerce").to_numpy( - dtype=float - ) - x = pd.to_numeric(corrected["lon"], errors="coerce").to_numpy(dtype=float) - y = pd.to_numeric(corrected["lat"], errors="coerce").to_numpy(dtype=float) + ref_elev = pd.to_numeric(corrected['ref_elevation'], errors='coerce').to_numpy(dtype=float) + ref_az = pd.to_numeric(corrected['ref_azimuth'], errors='coerce').to_numpy(dtype=float) + z = pd.to_numeric(corrected['photon_height'], errors='coerce').to_numpy(dtype=float) + ws = pd.to_numeric(corrected['sea_surface_height'], errors='coerce').to_numpy(dtype=float) + x = pd.to_numeric(corrected['lon'], errors='coerce').to_numpy(dtype=float) + y = pd.to_numeric(corrected['lat'], errors='coerce').to_numpy(dtype=float) # Convert to radians if values look like degrees. 
if np.nanmax(np.abs(ref_elev)) > (2 * np.pi): @@ -202,13 +232,7 @@ def apply_optional_refraction_correction( if np.nanmax(np.abs(ref_az)) > (2 * np.pi): ref_az = np.deg2rad(ref_az) - valid_mask = ( - np.isfinite(ref_elev) - & np.isfinite(ref_az) - & np.isfinite(z) - & np.isfinite(ws) - & (z <= ws) - ) + valid_mask = np.isfinite(ref_elev) & np.isfinite(ref_az) & np.isfinite(z) & np.isfinite(ws) & (z <= ws) if not np.any(valid_mask): return subsurface_photon_dataset @@ -224,7 +248,7 @@ def apply_optional_refraction_correction( R = (S * n1) / n2 Gamma = (np.pi / 2.0) - theta1 phi = theta1 - theta2 - P_sq = np.maximum(R**2 + S**2 - 2 * R * S * np.cos(phi), 0.0) + P_sq = np.maximum(R ** 2 + S ** 2 - 2 * R * S * np.cos(phi), 0.0) P = np.sqrt(P_sq) alpha_arg = np.divide(R * np.sin(phi), P, out=np.zeros_like(P), where=(P != 0)) alpha_arg = np.clip(alpha_arg, -1.0, 1.0) @@ -240,9 +264,9 @@ def apply_optional_refraction_correction( y_corr = y[valid_mask] + DN z_corr = z[valid_mask] + DZ - corrected.loc[valid_mask, "lon"] = x_corr - corrected.loc[valid_mask, "lat"] = y_corr - corrected.loc[valid_mask, "photon_height"] = z_corr + corrected.loc[valid_mask, 'lon'] = x_corr + corrected.loc[valid_mask, 'lat'] = y_corr + corrected.loc[valid_mask, 'photon_height'] = z_corr return corrected @@ -251,7 +275,7 @@ def apply_sea_surface_flattening( sea_surface_height, lat_bin_keys, horizontal_res, - flattening_window_m=500, + flattening_window_m=500 ): """ Flatten small-bin sea-surface variations to the mean sea level of larger along-track windows. 
@@ -263,69 +287,83 @@ def apply_sea_surface_flattening( if len(sea_surface_height) != len(lat_bin_keys): return subsurface_photon_dataset - surface_df = ( - pd.DataFrame( - {"lat_bins": lat_bin_keys, "sea_surface_height_local": sea_surface_height} - ) - .dropna(subset=["sea_surface_height_local"]) - .copy() - ) + surface_df = pd.DataFrame({ + 'lat_bins': lat_bin_keys, + 'sea_surface_height_local': sea_surface_height + }).dropna(subset=['sea_surface_height_local']).copy() if surface_df.empty: return subsurface_photon_dataset - surface_df["lat_bins_num"] = pd.to_numeric(surface_df["lat_bins"], errors="coerce") - surface_df = surface_df.dropna(subset=["lat_bins_num"]) + surface_df['lat_bins_num'] = pd.to_numeric(surface_df['lat_bins'], errors='coerce') + surface_df = surface_df.dropna(subset=['lat_bins_num']) if surface_df.empty: return subsurface_photon_dataset big_bin_size = max(1, int(round(flattening_window_m / max(horizontal_res, 1)))) - surface_df["big_bin"] = ( - surface_df["lat_bins_num"].astype(int) // big_bin_size - ).astype(int) + if big_bin_size <= 1: + logging.warning( + "Sea surface flattening window (%d m) <= horizontal_res (%d m) → " + "big_bin_size=1, flattening has no effect. 
" + "Use --sea_surface_flattening_window_m > --horizontal_res.", + flattening_window_m, horizontal_res + ) + surface_df['big_bin'] = (surface_df['lat_bins_num'].astype(int) // big_bin_size).astype(int) mean_surface = ( - surface_df.groupby("big_bin", observed=False)["sea_surface_height_local"] + surface_df.groupby('big_bin', observed=False)['sea_surface_height_local'] .mean() - .rename("sea_surface_height_bigbin_mean") + .rename('sea_surface_height_bigbin_mean') .reset_index() ) - surface_df = surface_df.merge(mean_surface, on="big_bin", how="left") - surface_df["sea_surface_flattening_offset"] = ( - surface_df["sea_surface_height_bigbin_mean"] - - surface_df["sea_surface_height_local"] + surface_df = surface_df.merge(mean_surface, on='big_bin', how='left') + surface_df['sea_surface_flattening_offset'] = ( + surface_df['sea_surface_height_bigbin_mean'] - surface_df['sea_surface_height_local'] ) flattened = subsurface_photon_dataset.copy() - flattened["lat_bins_num"] = pd.to_numeric(flattened["lat_bins"], errors="coerce") - flattened["big_bin"] = ( - flattened["lat_bins_num"].fillna(-1).astype(int) // big_bin_size - ).astype(int) + flattened['lat_bins_num'] = pd.to_numeric(flattened['lat_bins'], errors='coerce') + flattened['big_bin'] = (flattened['lat_bins_num'].fillna(-1).astype(int) // big_bin_size).astype(int) flattened = flattened.merge( - surface_df[ - [ - "lat_bins", - "sea_surface_height_local", - "sea_surface_height_bigbin_mean", - "sea_surface_flattening_offset", - ] - ], - on="lat_bins", - how="left", + surface_df[['lat_bins', 'sea_surface_height_local', 'sea_surface_height_bigbin_mean', 'sea_surface_flattening_offset']], + on='lat_bins', + how='left' + ) + flattened['photon_height_original'] = flattened['photon_height'] + flattened['photon_height'] = ( + flattened['photon_height'] - flattened['sea_surface_flattening_offset'].fillna(0.0) ) - flattened["photon_height_original"] = flattened["photon_height"] - flattened["photon_height"] = 
flattened["photon_height"] - flattened[ - "sea_surface_flattening_offset" - ].fillna(0.0) - return flattened.drop(columns=["lat_bins_num", "big_bin"], errors="ignore") + # Re-bin height_bins from flattened photon_height so that downstream + # Kd calculation (which reads height_bins) uses the corrected depths. + if 'height_bins' in flattened.columns: + vertical_res_actual = horizontal_res # not used; infer from existing bins + # Infer the vertical resolution from the original height_bins spacing + orig_bins = pd.to_numeric(flattened['height_bins'], errors='coerce').dropna().unique() + if len(orig_bins) >= 2: + sorted_bins = np.sort(orig_bins) + diffs = np.diff(sorted_bins) + v_res = float(np.median(diffs[diffs > 0])) if np.any(diffs > 0) else 0.25 + else: + v_res = 0.25 + h_min = flattened['photon_height'].min() + h_max = flattened['photon_height'].max() + n_hbins = max(1, int(round((h_max - h_min) / v_res))) + bin_edges = np.linspace(h_min, h_max, n_hbins + 1) + bin_labels = np.round((bin_edges[:-1] + bin_edges[1:]) / 2, decimals=1) + flattened['height_bins'] = pd.cut( + flattened['photon_height'], bins=bin_edges, labels=bin_labels, + include_lowest=True + ) + + return flattened.drop(columns=['lat_bins_num', 'big_bin'], errors='ignore') # 1. Function to create an R-tree spatial index for raster bounds def create_spatial_index(gebco_paths): """ Create an R-tree spatial index for raster bounds to quickly find relevant rasters. - + Parameters: gebco_paths (list): List of paths to GEBCO raster files. - + Returns: raster_data_dict (dict): Dictionary containing loaded rasters and their respective data. spatial_index (rtree.index.Index): R-tree spatial index for raster bounds. 
@@ -333,29 +371,26 @@ def create_spatial_index(gebco_paths): idx = index.Index() raster_data_dict = {} for i, path in enumerate(gebco_paths): - with rasterio.Env(GTIFF_SRS_SOURCE="EPSG"): + with rasterio.Env(GTIFF_SRS_SOURCE='EPSG'): gebco_raster = rasterio.open(path) raster_data = gebco_raster.read(1) raster_data_dict[path] = (gebco_raster, raster_data) bounds = gebco_raster.bounds - idx.insert( - i, (bounds.left, bounds.bottom, bounds.right, bounds.top), obj=path - ) + idx.insert(i, (bounds.left, bounds.bottom, bounds.right, bounds.top), obj=path) return raster_data_dict, idx - # 2. Function to determine which rasters are needed for a given set of coordinates using spatial index in a batch manner def get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index): """ Determine which raster datasets are relevant for the given coordinates using spatial index. This function uses a more efficient approach by performing a bounding box query for batches of points. - + Parameters: lons (numpy.ndarray): Array of longitudes. lats (numpy.ndarray): Array of latitudes. raster_data_dict (dict): Dictionary containing raster datasets and their data. spatial_index (rtree.index.Index): R-tree spatial index for raster bounds. - + Returns: relevant_rasters (list): List of relevant raster datasets and their respective data. 
""" @@ -367,10 +402,7 @@ def get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index # Get all rasters that intersect with the bounding box matches = list(spatial_index.intersection((bounding_box.bounds), objects=True)) relevant_paths = {match.object for match in matches} - relevant_rasters = [ - (raster_data_dict[path][0], raster_data_dict[path][1]) - for path in relevant_paths - ] + relevant_rasters = [(raster_data_dict[path][0], raster_data_dict[path][1]) for path in relevant_paths] return relevant_rasters @@ -378,191 +410,317 @@ def get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index def get_seafloor_elevation(lons, lats, relevant_rasters): """ Get seafloor elevations for a batch of points based on longitude and latitude from relevant rasters. - + Parameters: lons (numpy.ndarray): Array of longitudes. lats (numpy.ndarray): Array of latitudes. relevant_rasters (list): List of relevant raster datasets and their respective data. - + Returns: seafloor_elevations (numpy.ndarray): Array of seafloor elevations. """ points = np.vstack((lons, lats)).T seafloor_elevations = np.full(len(lons), np.nan) - + for gebco_raster, raster_data in relevant_rasters: # Use rasterio.sample to get values for multiple points in a batch values = list(gebco_raster.sample(points)) for idx, value in enumerate(values): if np.isnan(seafloor_elevations[idx]) and value is not None: seafloor_elevations[idx] = value[0] - + return seafloor_elevations -# 4. The main process function that ties everything together -def process_seafloor_data( - sea_photon_dataset, gebco_paths, Ignore_Subsurface_Height_Thres -): +def query_gebco_seafloor_elevation(sea_photon_dataset, gebco_paths): + """ + Query GEBCO raster tiles and attach a 'seafloor_elevation' column to the + photon dataset. Does not remove any rows. 
+ + Flowchart step: 12 — Retrieve GEBCO data (or other regional raster + bathymetry data) for the region when ATL24 is unavailable or returns no + matching points. + + Parameters + ---------- + sea_photon_dataset : pd.DataFrame + Must contain 'longitude' and 'latitude' columns. + gebco_paths : list + Paths to GEBCO GeoTIFF raster files. + + Returns + ------- + pd.DataFrame + Input dataset with 'seafloor_elevation' column added (negative = below + sea level). """ - Main function to process the seafloor data. + dataset = sea_photon_dataset.copy() + raster_data_dict, spatial_index = create_spatial_index(gebco_paths) + lons = dataset['longitude'].values + lats = dataset['latitude'].values + relevant_rasters = get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index) + dataset['seafloor_elevation'] = get_seafloor_elevation(lons, lats, relevant_rasters) + return dataset - Parameters: - gebco_paths (list): List of path to GEBCO raster files. - sea_photon_dataset (pandas.DataFrame): Dataset containing longitude, latitude, and photon height. - Returns: - filtered_sea_photon_dataset (pandas.DataFrame): Filtered dataset containing points above the seafloor. +def remove_photons_below_seafloor(sea_photon_dataset): """ + Remove photons whose height is at or below the seabed elevation. - # Create spatial index and load GEBCO Raster Data - raster_data_dict, spatial_index = create_spatial_index(gebco_paths) + Flowchart step: 13 — Remove data below seabed (seabed defined by ATL24 or + GEBCO). - # Get the relevant rasters using spatial index - lons = sea_photon_dataset["longitude"].values - lats = sea_photon_dataset["latitude"].values - relevant_rasters = get_relevant_rasters_using_index( - lons, lats, raster_data_dict, spatial_index - ) + Parameters + ---------- + sea_photon_dataset : pd.DataFrame + Must contain 'photon_height' and 'seafloor_elevation' columns. 
- # Get seafloor elevation for all points - sea_photon_dataset["seafloor_elevation"] = get_seafloor_elevation( - lons, lats, relevant_rasters - ) + Returns + ------- + pd.DataFrame + Only rows where photon_height > seafloor_elevation. + """ + return sea_photon_dataset[ + sea_photon_dataset['photon_height'] > sea_photon_dataset['seafloor_elevation'] + ].copy() - # Calculate depth (assuming seafloor_elevation is negative below sea level) - # If your elevation is positive below sea level, remove the negative sign - sea_photon_dataset["depth"] = -sea_photon_dataset["seafloor_elevation"] - # Filter out points: - # 1. below the seafloor (photon_height > seafloor_elevation) - # 2. where depth is less than 6 meters (depth >= 6) - filtered_sea_photon_dataset = sea_photon_dataset[ - (sea_photon_dataset["photon_height"] > sea_photon_dataset["seafloor_elevation"]) - & (sea_photon_dataset["depth"] >= Ignore_Subsurface_Height_Thres) - ] +def discard_shallow_seabed_bins(sea_photon_dataset, min_depth_m): + """ + Discard all along-track bins where the seabed is shallower than the minimum + depth threshold. Kd is not calculated for these bins; they are labelled as + low confidence by exclusion. + + Flowchart step: 14 — Discard all photon data in horizontal bins where the + seabed is <5 m deep; Kd is not calculated for these bins — label discarded + bins as low confidence. + + Parameters + ---------- + sea_photon_dataset : pd.DataFrame + Must contain a 'depth' column (= -seafloor_elevation, positive downward). + min_depth_m : float + Minimum seabed depth to retain (e.g., 5.0 m). Bins shallower than this + are discarded. + + Returns + ------- + pd.DataFrame + Only rows where depth >= min_depth_m. + """ + return sea_photon_dataset[ + sea_photon_dataset['depth'] >= min_depth_m + ].copy() + + +# 4. 
The main process function that ties everything together +def process_seafloor_data(sea_photon_dataset, gebco_paths, Ignore_Subsurface_Height_Thres): + """ + Query GEBCO bathymetry then apply seabed and shallow-water filters. - return filtered_sea_photon_dataset + Delegates to three single-step helpers: + query_gebco_seafloor_elevation → step 12 + remove_photons_below_seafloor → step 13 + discard_shallow_seabed_bins → step 14 + + Parameters: + gebco_paths (list): List of path to GEBCO raster files. + sea_photon_dataset (pandas.DataFrame): Dataset containing longitude, + latitude, and photon height. + + Returns: + filtered_sea_photon_dataset (pandas.DataFrame): Filtered dataset + containing points above the seafloor in water deeper than threshold. + """ + dataset = query_gebco_seafloor_elevation(sea_photon_dataset, gebco_paths) # step 12 + dataset['depth'] = -dataset['seafloor_elevation'] + dataset = remove_photons_below_seafloor(dataset) # step 13 + return discard_shallow_seabed_bins(dataset, abs(Ignore_Subsurface_Height_Thres)) # step 14 def load_atl24_points(atl24_file_path): """ - Load ATL24 point dataset and normalize to columns: longitude, latitude, seafloor_elevation_atl24. + Load ATL24 point dataset and normalize to columns: longitude, latitude, + seafloor_elevation_atl24. + + Flowchart step: 12 — Query ATL24 data for bathymetry match; if no match, + retrieve GEBCO data (or other regional raster bathymetry data) for region. 
""" if (not atl24_file_path) or (not os.path.exists(atl24_file_path)): - return pd.DataFrame( - columns=["longitude", "latitude", "seafloor_elevation_atl24"] - ) + return pd.DataFrame(columns=['longitude', 'latitude', 'seafloor_elevation_atl24']) lower_path = atl24_file_path.lower() - if lower_path.endswith((".csv", ".txt")): + if lower_path.endswith(('.h5', '.hdf5')): + try: + import h5py + beams = ['gt1l', 'gt1r', 'gt2l', 'gt2r', 'gt3l', 'gt3r'] + frames = [] + with h5py.File(atl24_file_path, 'r') as f: + for beam in beams: + if beam not in f: + continue + grp = f[beam] + if not all(k in grp for k in ('lon_ph', 'lat_ph', 'ortho_h', 'class_ph')): + continue + class_ph = grp['class_ph'][:] + bathy_mask = class_ph == 40 # ATL24 bathymetry class + # Apply low_confidence_flag filter if available + if 'low_confidence_flag' in grp: + bathy_mask = bathy_mask & (grp['low_confidence_flag'][:] == 0) + if bathy_mask.sum() == 0: + continue + frames.append(pd.DataFrame({ + 'longitude': grp['lon_ph'][:][bathy_mask].astype(float), + 'latitude': grp['lat_ph'][:][bathy_mask].astype(float), + 'seafloor_elevation_atl24': grp['ortho_h'][:][bathy_mask].astype(float), + })) + if not frames: + return pd.DataFrame(columns=['longitude', 'latitude', 'seafloor_elevation_atl24']) + return pd.concat(frames, ignore_index=True).dropna() + except Exception: + return pd.DataFrame(columns=['longitude', 'latitude', 'seafloor_elevation_atl24']) + elif lower_path.endswith(('.csv', '.txt')): atl24_df = pd.read_csv(atl24_file_path) - elif lower_path.endswith(".parquet"): + elif lower_path.endswith('.parquet'): atl24_df = pd.read_parquet(atl24_file_path) - elif lower_path.endswith((".gpkg", ".shp", ".geojson")): + elif lower_path.endswith(('.gpkg', '.shp', '.geojson')): try: import geopandas as gpd - atl24_gdf = gpd.read_file(atl24_file_path) atl24_df = pd.DataFrame(atl24_gdf) - if ( - "longitude" not in atl24_df.columns - or "latitude" not in atl24_df.columns - ) and ("geometry" in atl24_gdf.columns): 
- atl24_df["longitude"] = atl24_gdf.geometry.x - atl24_df["latitude"] = atl24_gdf.geometry.y + if ('longitude' not in atl24_df.columns or 'latitude' not in atl24_df.columns) and ('geometry' in atl24_gdf.columns): + atl24_df['longitude'] = atl24_gdf.geometry.x + atl24_df['latitude'] = atl24_gdf.geometry.y except Exception: - return pd.DataFrame( - columns=["longitude", "latitude", "seafloor_elevation_atl24"] - ) + return pd.DataFrame(columns=['longitude', 'latitude', 'seafloor_elevation_atl24']) else: - return pd.DataFrame( - columns=["longitude", "latitude", "seafloor_elevation_atl24"] - ) + return pd.DataFrame(columns=['longitude', 'latitude', 'seafloor_elevation_atl24']) - lon_candidates = ["longitude", "lon", "x", "LONGITUDE", "LON"] - lat_candidates = ["latitude", "lat", "y", "LATITUDE", "LAT"] + lon_candidates = ['longitude', 'lon', 'x', 'LONGITUDE', 'LON'] + lat_candidates = ['latitude', 'lat', 'y', 'LATITUDE', 'LAT'] elev_candidates = [ - "seafloor_elevation", - "bottom_elevation", - "bathymetry", - "elevation", - "z", - "depth", - "water_depth", + 'seafloor_elevation', 'bottom_elevation', 'bathymetry', 'elevation', 'z', + 'depth', 'water_depth' ] lon_col = next((c for c in lon_candidates if c in atl24_df.columns), None) lat_col = next((c for c in lat_candidates if c in atl24_df.columns), None) elev_col = next((c for c in elev_candidates if c in atl24_df.columns), None) if lon_col is None or lat_col is None or elev_col is None: - return pd.DataFrame( - columns=["longitude", "latitude", "seafloor_elevation_atl24"] - ) + return pd.DataFrame(columns=['longitude', 'latitude', 'seafloor_elevation_atl24']) - out_df = ( - pd.DataFrame( - { - "longitude": pd.to_numeric(atl24_df[lon_col], errors="coerce"), - "latitude": pd.to_numeric(atl24_df[lat_col], errors="coerce"), - "atl24_raw_bathy": pd.to_numeric(atl24_df[elev_col], errors="coerce"), - } - ) - .dropna(subset=["longitude", "latitude", "atl24_raw_bathy"]) - .copy() - ) + out_df = pd.DataFrame({ + 'longitude': 
pd.to_numeric(atl24_df[lon_col], errors='coerce'), + 'latitude': pd.to_numeric(atl24_df[lat_col], errors='coerce'), + 'atl24_raw_bathy': pd.to_numeric(atl24_df[elev_col], errors='coerce') + }).dropna(subset=['longitude', 'latitude', 'atl24_raw_bathy']).copy() - if "depth" in elev_col.lower() and ("elev" not in elev_col.lower()): - out_df["seafloor_elevation_atl24"] = -out_df["atl24_raw_bathy"].abs() + if 'depth' in elev_col.lower() and ('elev' not in elev_col.lower()): + out_df['seafloor_elevation_atl24'] = -out_df['atl24_raw_bathy'].abs() else: - out_df["seafloor_elevation_atl24"] = out_df["atl24_raw_bathy"] + out_df['seafloor_elevation_atl24'] = out_df['atl24_raw_bathy'] - return out_df[["longitude", "latitude", "seafloor_elevation_atl24"]] + return out_df[['longitude', 'latitude', 'seafloor_elevation_atl24']] def process_seafloor_data_atl24( sea_photon_dataset, atl24_file_path, Ignore_Subsurface_Height_Thres, - max_match_distance_deg=0.01, + max_match_distance_deg=0.01 ): """ - Match ATL24 bathymetry points to photons, then filter below seafloor and shallow bins. - Returns filtered dataset and number of matched photons. + Match ATL24 bathymetry points to photons, then filter below seafloor and + shallow bins. Returns filtered dataset and number of matched photons. + + Flowchart steps: + 12 — Query ATL24 data for bathymetry match; if no match, retrieve GEBCO data. + 13 — Remove data below seabed (seabed defined by ATL24 or GEBCO). + 14 — Discard all photon data in horizontal bins where the seabed is <5 m + deep; Kd is not calculated for these bins — label discarded bins as + low confidence. 
""" atl24_points = load_atl24_points(atl24_file_path) if atl24_points.empty: return sea_photon_dataset, 0 - photon_coords = sea_photon_dataset[["longitude", "latitude"]].to_numpy(dtype=float) - atl24_coords = atl24_points[["longitude", "latitude"]].to_numpy(dtype=float) + photon_coords = sea_photon_dataset[['longitude', 'latitude']].to_numpy(dtype=float) + atl24_coords = atl24_points[['longitude', 'latitude']].to_numpy(dtype=float) if len(photon_coords) == 0 or len(atl24_coords) == 0: return sea_photon_dataset, 0 tree = cKDTree(atl24_coords) - distances, indices = tree.query( - photon_coords, k=1, distance_upper_bound=max_match_distance_deg - ) + distances, indices = tree.query(photon_coords, k=1, distance_upper_bound=max_match_distance_deg) valid_match = np.isfinite(distances) & (indices < len(atl24_points)) matched_count = int(valid_match.sum()) if matched_count == 0: return sea_photon_dataset, 0 matched_dataset = sea_photon_dataset.copy() - matched_dataset["seafloor_elevation"] = np.nan - matched_dataset.loc[valid_match, "seafloor_elevation"] = atl24_points[ - "seafloor_elevation_atl24" - ].to_numpy()[indices[valid_match]] - matched_dataset = matched_dataset.dropna(subset=["seafloor_elevation"]).copy() - matched_dataset["depth"] = -matched_dataset["seafloor_elevation"] - - filtered_dataset = matched_dataset[ - (matched_dataset["photon_height"] > matched_dataset["seafloor_elevation"]) - & (matched_dataset["depth"] >= Ignore_Subsurface_Height_Thres) - ] + matched_dataset['seafloor_elevation'] = np.nan + matched_dataset.loc[valid_match, 'seafloor_elevation'] = atl24_points['seafloor_elevation_atl24'].to_numpy()[indices[valid_match]] + matched_dataset = matched_dataset.dropna(subset=['seafloor_elevation']).copy() + matched_dataset['depth'] = -matched_dataset['seafloor_elevation'] + + above_floor = remove_photons_below_seafloor(matched_dataset) # step 13 + filtered_dataset = discard_shallow_seabed_bins(above_floor, abs(Ignore_Subsurface_Height_Thres)) # step 14 
return filtered_dataset, matched_count +def rebuild_and_refit_surface_after_refraction( + binned_dataset_sea_surface, + sea_surface_height, + water_temp_c, + wavelength_nm, + horizontal_res, + vertical_res, +): + """ + Apply refraction correction to the full binned dataset, rebuild histograms + on the corrected depths, and re-fit the Gaussian surface peak. + + Flowchart steps: + 10 — Re-build histograms based on corrected depths + (apply_optional_refraction_correction → horizontal_vertical_bin_dataset). + 11 — Re-fit Gaussian curve to surface to identify surface peak; compute + standard deviation and remove histogram data within three standard + deviations (get_sea_surface_height_adaptive on the rebinned data). + + Parameters + ---------- + binned_dataset_sea_surface : pd.DataFrame + Full binned photon dataset before refraction correction. + sea_surface_height : list + Per-bin surface heights from step 7. + water_temp_c : float + Water temperature for the refraction index calculation. + wavelength_nm : float + Laser wavelength (nm) for the refraction index calculation. + horizontal_res : float + Along-track bin size (m) for histogram rebuild. + vertical_res : float + Vertical bin size (m) for histogram rebuild. + + Returns + ------- + tuple + (sea_surface_height, sea_surface_height_abnormal_label, + solo_sea_surface_label, subsurface_photon_dataset) — same 4-tuple as + get_sea_surface_height_adaptive. + """ + corrected_full_dataset = apply_optional_refraction_correction( + binned_dataset_sea_surface, + binned_dataset_sea_surface.copy(), + sea_surface_height, + water_temp_c=water_temp_c, + wavelength_nm=wavelength_nm, + ) + rebinned_corrected = horizontal_vertical_bin_dataset( # step 10 + corrected_full_dataset, horizontal_res, vertical_res + ) + return get_sea_surface_height_adaptive(rebinned_corrected) # step 11 + + # 5. 
The function to get the subsurface photon dataset def get_subsurface_photon( binned_dataset_sea_surface, @@ -570,13 +728,15 @@ def get_subsurface_photon( subsurface_thresh, Ignore_Subsurface_Height_Thres, use_atl24_filter=False, - atl24_file_path="", + atl24_file_path='', atl24_max_match_distance_deg=0.01, use_gebco_filter=False, apply_histogram_quality_filter=False, histogram_quality_min_ratio=0.05, histogram_quality_depth_min=6.0, histogram_quality_depth_max=7.0, + histogram_quality_ref_depth_min=0.0, + histogram_quality_ref_depth_max=1.0, apply_surface_sigma_filter=False, surface_sigma_max=0.5, apply_refraction_correction=False, @@ -586,24 +746,37 @@ def get_subsurface_photon( apply_flattening=False, flattening_window_m=500, horizontal_res=500, - vertical_res=0.25, + vertical_res=0.25 ): - # get sea surface height - # threshold to determine how much depth below sea surface will be accounted - # Output: - # ->final_sea_surface_height - # ->sea_surface_height_abnormal_label - # ->sea_surface_dominated_label - # # static threshold - # sea_surface_height, sea_surface_label, solo_sea_surface_label, subsurface_photon_dataset = \ - # get_sea_surface_height_static(binned_dataset_sea_surface, subsurface_thresh) + """ + Orchestrate per-beam subsurface photon extraction and all optional quality filters. + + Flowchart steps executed in order: + 7 — Fit Gaussian curve to identify surface elevation; compute std dev of + Gaussian peak (via get_sea_surface_height_adaptive). + 6 — Review histograms for quality; quality-flag low-confidence bins and + exclude (optional, apply_histogram_quality_filter). + 8 — If Gaussian std dev > X m, quality-flag and discard bin (optional, + apply_surface_sigma_filter). + 9 — Compute water depths and correct for refraction (optional, + apply_refraction_correction). + 10 — Re-build histograms based on corrected depths (optional, + apply_post_refraction_refit → rebuild_and_refit_surface_after_refraction). 
+ 11 — Re-fit Gaussian curve to identify surface peak; remove histogram data + within three standard deviations (optional, + rebuild_and_refit_surface_after_refraction). + 12 — Query ATL24 / retrieve GEBCO bathymetry data (optional, + use_atl24_filter / use_gebco_filter). + 13 — Remove data below seabed (optional). + 14 — Discard bins where seabed < 5 m deep; label as low confidence + (applied unconditionally via Ignore_Subsurface_Height_Thres). + """ # adaptive threshold - ( - sea_surface_height, - sea_surface_label, - solo_sea_surface_label, - subsurface_photon_dataset, - ) = get_sea_surface_height_adaptive(binned_dataset_sea_surface) + sea_surface_height, sea_surface_label, solo_sea_surface_label, subsurface_photon_dataset = \ + get_sea_surface_height_adaptive(binned_dataset_sea_surface) + + # Always compute per-bin surface sigma for quality flagging and wave-adaptive fit + sigma_map = compute_surface_sigma(binned_dataset_sea_surface, sea_surface_height) if apply_histogram_quality_filter: subsurface_photon_dataset = apply_optional_histogram_quality_filter( @@ -613,6 +786,8 @@ def get_subsurface_photon( min_ratio=histogram_quality_min_ratio, depth_min=histogram_quality_depth_min, depth_max=histogram_quality_depth_max, + reference_depth_min=histogram_quality_ref_depth_min, + reference_depth_max=histogram_quality_ref_depth_max ) if apply_surface_sigma_filter: @@ -620,7 +795,7 @@ def get_subsurface_photon( binned_dataset_sea_surface, subsurface_photon_dataset, sea_surface_height, - sigma_max=surface_sigma_max, + sigma_max=surface_sigma_max ) if apply_refraction_correction: @@ -629,40 +804,39 @@ def get_subsurface_photon( subsurface_photon_dataset, sea_surface_height, water_temp_c=refraction_water_temp_c, - wavelength_nm=refraction_wavelength_nm, + wavelength_nm=refraction_wavelength_nm ) if apply_post_refraction_refit: - corrected_full_dataset = apply_optional_refraction_correction( - binned_dataset_sea_surface, - binned_dataset_sea_surface.copy(), - 
sea_surface_height, - water_temp_c=refraction_water_temp_c, - wavelength_nm=refraction_wavelength_nm, - ) - rebinned_corrected = horizontal_vertical_bin_dataset( - corrected_full_dataset, horizontal_res, vertical_res - ) - ( - sea_surface_height, - sea_surface_label, - solo_sea_surface_label, - subsurface_photon_dataset, - ) = get_sea_surface_height_adaptive(rebinned_corrected) + sea_surface_height, sea_surface_label, solo_sea_surface_label, subsurface_photon_dataset = \ + rebuild_and_refit_surface_after_refraction( # steps 10 & 11 + binned_dataset_sea_surface, + sea_surface_height, + water_temp_c=refraction_water_temp_c, + wavelength_nm=refraction_wavelength_nm, + horizontal_res=horizontal_res, + vertical_res=vertical_res, + ) + + # Apply minimum depth threshold unconditionally — keeps photons at height + # >= Ignore_Subsurface_Height_Thres regardless of whether GEBCO/ATL24 is on. + # Without this, deep noise photons (down to -70 m) contaminate the Beer's Law fit + # and produce near-zero Kd values, especially at shallow turbid sites. 
+ subsurface_photon_dataset = subsurface_photon_dataset[ + subsurface_photon_dataset['photon_height'] >= Ignore_Subsurface_Height_Thres + ].copy() if use_atl24_filter: atl24_filtered, atl24_match_count = process_seafloor_data_atl24( subsurface_photon_dataset, atl24_file_path, abs(Ignore_Subsurface_Height_Thres), - max_match_distance_deg=atl24_max_match_distance_deg, + max_match_distance_deg=atl24_max_match_distance_deg ) if atl24_match_count > 0: filtered_seafloor_subsurface_photon_dataset = atl24_filtered elif use_gebco_filter: filtered_seafloor_subsurface_photon_dataset = process_seafloor_data( - subsurface_photon_dataset, - GEBCO_paths, - abs(Ignore_Subsurface_Height_Thres), + subsurface_photon_dataset, GEBCO_paths, abs(Ignore_Subsurface_Height_Thres) ) else: filtered_seafloor_subsurface_photon_dataset = subsurface_photon_dataset @@ -675,22 +849,24 @@ def get_subsurface_photon( if apply_flattening: lat_bin_keys = list( - binned_dataset_sea_surface.groupby( - ["lat_bins"], observed=False - ).groups.keys() + binned_dataset_sea_surface.groupby(['lat_bins'], observed=False).groups.keys() ) filtered_seafloor_subsurface_photon_dataset = apply_sea_surface_flattening( filtered_seafloor_subsurface_photon_dataset, sea_surface_height, lat_bin_keys, horizontal_res=horizontal_res, - flattening_window_m=flattening_window_m, + flattening_window_m=flattening_window_m ) - return ( - sea_surface_height, - sea_surface_label, - filtered_seafloor_subsurface_photon_dataset, - ) + # Attach per-bin surface_sigma as a quality flag column + filtered_seafloor_subsurface_photon_dataset = filtered_seafloor_subsurface_photon_dataset.copy() + if sigma_map: + filtered_seafloor_subsurface_photon_dataset['surface_sigma'] = \ + filtered_seafloor_subsurface_photon_dataset['lat_bins'].map(sigma_map) + else: + filtered_seafloor_subsurface_photon_dataset['surface_sigma'] = np.nan + + return sea_surface_height, sea_surface_label, filtered_seafloor_subsurface_photon_dataset # Main processing function 
to apply the subsurface photon filtering beam-by-beam @@ -700,13 +876,15 @@ def process_subsurface_photon_filtering( subsurface_thresh, Ignore_Subsurface_Height_Thres, use_atl24_filter=False, - atl24_file_path="", + atl24_file_path='', atl24_max_match_distance_deg=0.01, use_gebco_filter=False, apply_histogram_quality_filter=False, histogram_quality_min_ratio=0.05, histogram_quality_depth_min=6.0, histogram_quality_depth_max=7.0, + histogram_quality_ref_depth_min=0.0, + histogram_quality_ref_depth_max=1.0, apply_surface_sigma_filter=False, surface_sigma_max=0.5, apply_refraction_correction=False, @@ -716,46 +894,53 @@ def process_subsurface_photon_filtering( apply_flattening=False, flattening_window_m=500, horizontal_res=500, - vertical_res=0.25, + vertical_res=0.25 ): + """ + Beam-by-beam wrapper that calls get_subsurface_photon for every beam and + concatenates the results. + + Flowchart steps 6–14 are all executed inside this function (delegated to + get_subsurface_photon per beam). See get_subsurface_photon for the + step-by-step breakdown. 
+ """ # Initialize lists to store results for each beam sea_surface_heights = [] sea_surface_labels = [] filtered_beam_datasets = [] # Group the binned dataset by 'beam_id' and process each group separately - for beam_id, beam_data in binned_dataset_sea_surface.groupby("beam_id"): - print(f"Processing subsurface filtering for beam: {beam_id}") + for beam_id, beam_data in binned_dataset_sea_surface.groupby('beam_id'): + print(f'Processing subsurface filtering for beam: {beam_id}') # Apply get_subsurface_photon to the current beam's dataset - ( - sea_surface_height, - sea_surface_label, - filtered_seafloor_subsurface_photon_dataset, - ) = get_subsurface_photon( - beam_data, - GEBCO_paths, - subsurface_thresh, - Ignore_Subsurface_Height_Thres, - use_atl24_filter=use_atl24_filter, - atl24_file_path=atl24_file_path, - atl24_max_match_distance_deg=atl24_max_match_distance_deg, - use_gebco_filter=use_gebco_filter, - apply_histogram_quality_filter=apply_histogram_quality_filter, - histogram_quality_min_ratio=histogram_quality_min_ratio, - histogram_quality_depth_min=histogram_quality_depth_min, - histogram_quality_depth_max=histogram_quality_depth_max, - apply_surface_sigma_filter=apply_surface_sigma_filter, - surface_sigma_max=surface_sigma_max, - apply_refraction_correction=apply_refraction_correction, - refraction_water_temp_c=refraction_water_temp_c, - refraction_wavelength_nm=refraction_wavelength_nm, - apply_post_refraction_refit=apply_post_refraction_refit, - apply_flattening=apply_flattening, - flattening_window_m=flattening_window_m, - horizontal_res=horizontal_res, - vertical_res=vertical_res, - ) + sea_surface_height, sea_surface_label, filtered_seafloor_subsurface_photon_dataset = \ + get_subsurface_photon( + beam_data, + GEBCO_paths, + subsurface_thresh, + Ignore_Subsurface_Height_Thres, + use_atl24_filter=use_atl24_filter, + atl24_file_path=atl24_file_path, + atl24_max_match_distance_deg=atl24_max_match_distance_deg, + use_gebco_filter=use_gebco_filter, + 
apply_histogram_quality_filter=apply_histogram_quality_filter, + histogram_quality_min_ratio=histogram_quality_min_ratio, + histogram_quality_depth_min=histogram_quality_depth_min, + histogram_quality_depth_max=histogram_quality_depth_max, + histogram_quality_ref_depth_min=histogram_quality_ref_depth_min, + histogram_quality_ref_depth_max=histogram_quality_ref_depth_max, + apply_surface_sigma_filter=apply_surface_sigma_filter, + surface_sigma_max=surface_sigma_max, + apply_refraction_correction=apply_refraction_correction, + refraction_water_temp_c=refraction_water_temp_c, + refraction_wavelength_nm=refraction_wavelength_nm, + apply_post_refraction_refit=apply_post_refraction_refit, + apply_flattening=apply_flattening, + flattening_window_m=flattening_window_m, + horizontal_res=horizontal_res, + vertical_res=vertical_res + ) # Append each result to the lists sea_surface_heights.append(sea_surface_height) @@ -764,5 +949,5 @@ def process_subsurface_photon_filtering( # Combine all filtered beam datasets into a single DataFrame combined_filtered_dataset = pd.concat(filtered_beam_datasets, ignore_index=True) - + return sea_surface_heights, sea_surface_labels, combined_filtered_dataset diff --git a/aok/core/kd_utils/config.py b/aok/core/kd_utils/config.py index c8b3b28..199a779 100644 --- a/aok/core/kd_utils/config.py +++ b/aok/core/kd_utils/config.py @@ -25,6 +25,7 @@ # # ATL03_h5_file = "processed_ATL03_20200331204156_00810707_006_01.h5"# + # #China Bohai # # ATL03_h5_file = "processed_ATL03_20191128115256_09560502_006_01.h5"# # # ATL03_h5_file = "processed_ATL03_20190530203316_09560302_006_02.h5"# @@ -32,6 +33,7 @@ # ATL03_h5_file = "processed_ATL03_20190829161305_09560402_006_02.h5"# + # ATL03_h5_file_path = path + ATL03_h5_file # Current_Path = 'C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/Dataset/' @@ -64,316 +66,233 @@ import argparse - def get_args(): - parser = argparse.ArgumentParser( - description="Configure ICESat-2 HLS analysis script." 
- ) - + parser = argparse.ArgumentParser(description="Configure ICESat-2 HLS analysis script.") + # General paths - parser.add_argument( - "--workspace_path", - type=str, - default="C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/", - help="Root path for all data processing.", - ) - + parser.add_argument("--workspace_path", type=str, default='C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/', + help="Root path for all data processing.") + # parser.add_argument("--atl03_path", type=str, default='Dataset/ATL03_ICESat2/New/', # help="Base path for ICESat-2 ATL03 data.") - parser.add_argument( - "--atl03_path", - type=str, - default="Dataset/ATL03_ICESat2/", - help="Base path for ICESat-2 ATL03 data.", - ) + # # Bohai Sea (v006) + # parser.add_argument("--atl03_path", type=str, default='Dataset/ATL03_ICESat2/', + # help="Base path for ICESat-2 ATL03 data.") + + # Wax Delta (v007) — required for IR/AP filter sensitivity test + parser.add_argument("--atl03_path", type=str, default='Dataset/ATL03_ICESat2/Wax_Delta/', + help="Base path for ICESat-2 ATL03 data.") + + + # #ChesapeakeBay # parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20230825074121_10102002_006_02.h5", - # help="Name of the ATL03 H5 file to process.") - + # help="Name of the ATL03 H5 file to process.") + # parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20221001113813_01641706_006_01.h5", # help="Name of the ATL03 H5 file to process.") - + # # India: processed_ATL03_20200331204156_00810707_006_01.h5 # parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20200331204156_00810707_006_01.h5", # help="Name of the ATL03 H5 file to process.") - + # #ATL03_20210628230109_00811207_006_01_subsetted - + # parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20210628230109_00811207_006_01.h5", # help="Name of the ATL03 H5 file to process.") - + + # # WaxDelta # parser.add_argument("--atl03_file", type=str, 
default="ATL03_20231103172707_06982106_006_01_subsetted.h5", # help="Name of the ATL03 H5 file to process.") + + + # # Bohai (v006) — baseline sensitivity test site + # parser.add_argument("--atl03_file", type=str, default="ATL03_20190829161305_09560402_006_02_subsetted.h5", + # help="Name of the ATL03 H5 file to process.") - # # Bohai - parser.add_argument( - "--atl03_file", - type=str, - default="ATL03_20190829161305_09560402_006_02_subsetted.h5", - help="Name of the ATL03 H5 file to process.", - ) + # Wax Delta (v007) — required for IR/AP filter; use with atl03_path = Dataset/ATL03_ICESat2/Wax_Delta/ + parser.add_argument("--atl03_file", type=str, default="ATL03_20231103172707_06982106_007_01_subsetted.h5", + help="Name of the ATL03 H5 file to process.") # # Cook Inlet # parser.add_argument("--atl03_file", type=str, default="ATL03_20220507112145_06931503_006_01_subsetted.h5", # help="Name of the ATL03 H5 file to process.") - + + + # #Core Sound # parser.add_argument("--atl03_file", type=str, default="105430185_ATL03_20220423191113_04841506_006_02_subsetted.h5", # help="Name of the ATL03 H5 file to process.") - + # # Pamlico Sound # parser.add_argument("--atl03_file", type=str, default="ATL03_20240415083629_04232306_006_01_subsetted.h5", # help="Name of the ATL03 H5 file to process.") - + + # # #Cook Inlet: processed_ATL03_20210801125753_05941205_005_01 # parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20210801125753_05941205_005_01.h5", # help="Name of the ATL03 H5 file to process.") - - parser.add_argument( - "--other_data_path", - type=str, - default="Dataset/", - help="Path to the current working directory.", - ) - - parser.add_argument( - "--output_path", - type=str, - default="Results/", - help="Directory for saving results.", - ) - + + parser.add_argument("--other_data_path", type=str, default='Dataset/', + help="Path to the current working directory.") + + parser.add_argument("--output_path", type=str, default='Results/', + 
help="Directory for saving results.") + # Shoreline data - parser.add_argument( - "--shoreline_data", - type=str, - # default='Shorelines/GeoPkgGlobalShoreline.gpkg', - default="Shorelines/ne_10m_land/ne_10m_land.shp", - help="Path to the global shoreline dataset.", - ) - + parser.add_argument("--shoreline_data", type=str, + # default='Shorelines/GeoPkgGlobalShoreline.gpkg', + default='Shorelines/ne_10m_land/ne_10m_land.shp', + help="Path to the global shoreline dataset.") + # Bathymetry datasets - parser.add_argument( - "--gebco_path", - nargs="+", - type=str, - default="Bathy/gebco_2024_geotiff/", - help="Path to GEBCO bathymetry datasets.", - ) - + parser.add_argument("--gebco_path", nargs='+', type=str, + default='Bathy/gebco_2024_geotiff/', + help="Path to GEBCO bathymetry datasets.") + # Resolution settings - parser.add_argument( - "--horizontal_res", - type=int, - default=500, - help="Horizontal resolution for the analysis (in meters).", - ) - - parser.add_argument( - "--vertical_res", - type=float, - default=0.25, - help="Vertical resolution for the analysis.", - ) - + parser.add_argument("--horizontal_res", type=int, default=500, + help="Horizontal resolution for the analysis (in meters).") + + parser.add_argument("--vertical_res", type=float, default=0.25, + help="Vertical resolution for the analysis.") + # Analysis parameters - parser.add_argument( - "--subsurface_thresh", - type=float, - default=1.0, - help="Threshold for photons below the sea surface.", - ) - - parser.add_argument( - "--ignore_subsurface_height_thres", - type=float, - default=-6, - help="Maximum depth thres for Kd calculations (in meters).", - ) # 5m - - parser.add_argument( - "--target_beams", - type=str, - default="", - help="Comma-separated beam IDs to process. 
Empty means auto-select strong beams.", - ) - - parser.add_argument( - "--enable_solar_background_filter", - action="store_true", - help="Optional: apply solar background filtering before binning.", - ) - parser.add_argument( - "--solar_elevation_day_threshold", - type=float, - default=6.0, - help="Solar elevation threshold (deg) to define daytime for optional filtering.", - ) - parser.add_argument( - "--solar_background_quantile", - type=float, - default=0.8, - help="Quantile threshold of daytime background rate considered high background.", - ) - parser.add_argument( - "--solar_background_min_signal_conf", - type=int, - default=2, - help="Minimum photon signal confidence kept in high daytime background.", - ) - - parser.add_argument( - "--enable_ir_ap_filter", - action="store_true", - help="Optional: remove photons using IR/afterpulse proxy flags.", - ) - parser.add_argument( - "--ir_ap_quality_max", - type=int, - default=0, - help="Maximum allowed quality_ph value when IR/AP filter is enabled.", - ) - parser.add_argument( - "--ir_ap_min_signal_conf", - type=int, - default=0, - help="Minimum allowed photon_conf value when IR/AP filter is enabled.", - ) - - parser.add_argument( - "--enable_gebco_filter", - action="store_true", - help="Optional: remove photons below GEBCO seafloor and shallow bins.", - ) - parser.add_argument( - "--enable_sea_surface_flattening", - action="store_true", - help="Optional: flatten small-bin sea-surface variation using larger along-track windows.", - ) - parser.add_argument( - "--sea_surface_flattening_window_m", - type=int, - default=500, - help="Window size in meters for sea-surface flattening mean level.", - ) - - parser.add_argument( - "--enable_convex_hull_filter", - action="store_true", - help="Optional: apply convex hull area filtering before Kd fitting.", - ) - parser.add_argument( - "--convex_hull_area_threshold", - type=float, - default=100, - help="Minimum convex hull area to keep a horizontal bin when enabled.", - ) - - 
parser.add_argument( - "--enable_histogram_quality_filter", - action="store_true", - help="Optional: discard along-track bins with weak 6-7 m signal.", - ) - parser.add_argument( - "--histogram_quality_min_ratio", - type=float, - default=0.05, - help="Minimum ratio of photons in depth band to keep a bin.", - ) - parser.add_argument( - "--histogram_quality_depth_min", - type=float, - default=6.0, - help="Minimum depth (m below surface) of quality-check band.", - ) - parser.add_argument( - "--histogram_quality_depth_max", - type=float, - default=7.0, - help="Maximum depth (m below surface) of quality-check band.", - ) - - parser.add_argument( - "--enable_surface_sigma_filter", - action="store_true", - help="Optional: discard bins where Gaussian surface sigma is too large.", - ) - parser.add_argument( - "--surface_sigma_max", - type=float, - default=0.5, - help="Maximum allowed Gaussian sigma (m) for sea-surface peak.", - ) - - parser.add_argument( - "--enable_refraction_correction", - action="store_true", - help="Optional: apply refraction correction to subsurface photons.", - ) - parser.add_argument( - "--refraction_water_temp_c", - type=float, - default=20.0, - help="Water temperature (deg C) used for refraction correction.", - ) - parser.add_argument( - "--refraction_wavelength_nm", - type=float, - default=532.0, - help="Laser wavelength (nm) used for refraction correction.", - ) - - parser.add_argument( - "--enable_post_refraction_refit", - action="store_true", - help="Optional: rebuild histograms and re-fit surface after refraction correction.", - ) - - parser.add_argument( - "--enable_atl24_filter", - action="store_true", - help="Optional: use ATL24 bathymetry matching before GEBCO fallback.", - ) - parser.add_argument( - "--atl24_file", - type=str, - default="", - help="Path to ATL24 point dataset (CSV/Parquet/GPKG/SHP) for bathymetry matching.", - ) - parser.add_argument( - "--atl24_max_match_distance_deg", - type=float, - default=0.01, - help="Maximum 
lon/lat degree distance for ATL24 nearest-point matching.", - ) - - parser.add_argument( - "--enable_paired_beam_combine", - action="store_true", - help="Optional: combine paired beams (gt1, gt2, gt3) before Kd fitting.", - ) + parser.add_argument("--subsurface_thresh", type=float, default=1.0, + help="Threshold for photons below the sea surface.") + + parser.add_argument("--ignore_subsurface_height_thres", type=float, default=-6, + help="Maximum depth thres for Kd calculations (in meters).") #5m + + # Empty string '' auto-selects the first strong beam. Pass comma-separated IDs to override. + # Valid IDs: gt1l, gt1r, gt2l, gt2r, gt3l, gt3r — only strong beams for the granule are kept. + # To process all strong beams, use --target_beams "gt1r,gt2r,gt3r". + parser.add_argument("--target_beams", type=str, default='', + help="Comma-separated beam IDs to process. Empty means auto-select first strong beam.") + + parser.add_argument("--disable_solar_background_filter", action="store_true", + help="Disable solar background filtering (enabled by default; " + "self-gates on solar elevation so has no effect on nighttime passes).") + parser.add_argument("--solar_elevation_day_threshold", type=float, default=6.0, + help="Solar elevation threshold (deg) to define daytime for optional filtering.") + parser.add_argument("--solar_bg_median_window_deg", type=float, default=10.0, + help="Rolling median window width (degrees of solar elevation) for " + "the solar-elevation-aware background filter.") + parser.add_argument("--solar_bg_noise_multiplier", type=float, default=1.5, + help="Ratio of actual/expected background rate above which a photon " + "is flagged as noisy by the solar background filter.") + parser.add_argument("--solar_background_min_signal_conf", type=int, default=2, + help="Minimum photon signal confidence kept in high daytime background.") + + parser.add_argument("--enable_ir_ap_filter", action="store_true", + help="Optional: remove photons using IR/afterpulse proxy 
flags.") + parser.add_argument("--ir_ap_quality_max", type=int, default=0, + help="Maximum allowed quality_ph value when IR/AP filter is enabled.") + parser.add_argument("--ir_ap_min_signal_conf", type=int, default=0, + help="Minimum allowed photon_conf value when IR/AP filter is enabled.") + + parser.add_argument("--enable_gebco_filter", action="store_true", + help="Optional: remove photons below GEBCO seafloor and shallow bins.") + parser.add_argument("--enable_sea_surface_flattening", action="store_true", + help="Optional: flatten small-bin sea-surface variation using larger along-track windows.") + parser.add_argument("--sea_surface_flattening_window_m", type=int, default=5000, + help="Window size in meters for sea-surface flattening mean level. " + "Must be larger than horizontal_res to have any effect.") + + parser.add_argument("--enable_convex_hull_filter", action="store_true", + help="Optional: apply convex hull area filtering before Kd fitting.") + parser.add_argument("--convex_hull_area_threshold", type=float, default=100, + help="Minimum convex hull area to keep a horizontal bin when enabled.") + + parser.add_argument("--enable_histogram_quality_filter", action="store_true", + help="Optional: discard along-track bins with weak 6-7 m signal.") + parser.add_argument("--histogram_quality_min_ratio", type=float, default=0.05, + help="Minimum ratio of photons in depth band to keep a bin.") + parser.add_argument("--histogram_quality_depth_min", type=float, default=6.0, + help="Minimum depth (m below surface) of quality-check band.") + parser.add_argument("--histogram_quality_depth_max", type=float, default=7.0, + help="Maximum depth (m below surface) of quality-check band.") + parser.add_argument("--histogram_quality_ref_depth_min", type=float, default=0.0, + help="Min depth (m) of near-surface reference band for decay ratio.") + parser.add_argument("--histogram_quality_ref_depth_max", type=float, default=1.0, + help="Max depth (m) of near-surface reference 
band for decay ratio.") + + parser.add_argument("--enable_surface_sigma_filter", action="store_true", + help="Optional: discard bins where Gaussian surface sigma is too large.") + parser.add_argument("--surface_sigma_max", type=float, default=0.5, + help="Maximum allowed Gaussian sigma (m) for sea-surface peak.") + + parser.add_argument("--enable_refraction_correction", action="store_true", + help="Optional: apply refraction correction to subsurface photons.") + parser.add_argument("--refraction_water_temp_c", type=float, default=20.0, + help="Water temperature (deg C) used for refraction correction.") + parser.add_argument("--refraction_wavelength_nm", type=float, default=532.0, + help="Laser wavelength (nm) used for refraction correction.") + + parser.add_argument("--enable_post_refraction_refit", action="store_true", + help="Optional: rebuild histograms and re-fit surface after refraction correction.") + + parser.add_argument("--enable_atl24_filter", action="store_true", + help="Optional: use ATL24 bathymetry matching before GEBCO fallback.") + parser.add_argument("--atl24_file", type=str, default='', + help="Path to ATL24 point dataset (CSV/Parquet/GPKG/SHP) for bathymetry matching.") + parser.add_argument("--atl24_max_match_distance_deg", type=float, default=0.01, + help="Maximum lon/lat degree distance for ATL24 nearest-point matching.") + + parser.add_argument("--decay_zone_threshold", type=float, default=0.0, + help="Fraction of peak photon count below which depth bins are excluded " + "in Step 16 decay zone detection. 0.0 = original behaviour (only " + "zero-count bins removed). E.g. 0.01 removes bins with < 1%% of peak.") + + parser.add_argument("--kd_fit_method", type=str, default="log_linear", + choices=["log_linear", "bg_subtract", "breakpoint", "nonlinear", "hybrid"], + help="Beer's Law fitting strategy for Kd calculation. 
" + "log_linear = original log-space linear regression; " + "bg_subtract = subtract noise floor then log-linear; " + "breakpoint = segmented regression with breakpoint detection; " + "nonlinear = fit C(z) = A*exp(-Kd*z) + N directly.") + + parser.add_argument("--enable_wave_adaptive_fit", action="store_true", + help="Optional: trim shallow depth bins by wave amplitude before Beer's Law fit. " + "Uses per-bin surface sigma to skip the top k*sigma metres in wavy bins.") + parser.add_argument("--wave_exclusion_multiplier", type=float, default=4.0, + help="Number of surface sigmas to skip from the actual water surface when " + "wave-adaptive fit is enabled. The adaptive surface detection already " + "removes 3*sigma; this adds (k-3)*sigma additional margin for bubble " + "injection zone. Default 4.0 adds 1*sigma beyond the 3-sigma cutoff.") + parser.add_argument("--wave_sigma_calm_threshold", type=float, default=0.1, + help="Surface sigma (m) below which a bin is considered calm and no " + "wave trimming is applied. 
Default 0.1 m.") + + parser.add_argument("--enable_paired_beam_combine", action="store_true", + help="Optional: combine paired beams (gt1, gt2, gt3) before Kd fitting.") + + parser.add_argument("--no_plot", action="store_true", + help="Suppress all plot generation (useful for batch runs).") # switch on consider the coastal water or inland water # if Inland water, we should use one mask and for coastal water, it should be another mask - + # consider bathymetry or not manual setting - - # night vs daytime - - # solar elevation detemine remove or not for solar background - - # 1-2 hours - + + + #night vs daytime + + #solar elevation detemine remove or not for solar background + + + # 1-2 hours + + return parser.parse_args() - if __name__ == "__main__": args = get_args() # Access parameters like this: atl03_file_path = args.workspace_path + args.atl03_file - + print("Processing file:", atl03_file_path) print("Output path:", args.output_path) diff --git a/aok/core/kd_utils/data_processing.py b/aok/core/kd_utils/data_processing.py index 06dcb13..6d5b28e 100644 --- a/aok/core/kd_utils/data_processing.py +++ b/aok/core/kd_utils/data_processing.py @@ -1,30 +1,48 @@ # utils/data_processing.py +import re +import os import io -import logging +import time import math -import os -import re - -import geopandas as gpd import h5py - -# from pyproj import Transformer +import logging +import netCDF4 import numpy as np +import geopandas as gpd + import pandas as pd -from pyproj import Proj, Transformer +from datetime import datetime +import matplotlib.pyplot as plt +from sklearn.preprocessing import StandardScaler, MinMaxScaler from scipy.spatial import ConvexHull +import argparse +import subprocess # import fiona +import utm +import pyproj +from pyproj import Transformer, Proj + +# from pyproj import Transformer +from matplotlib.widgets import LassoSelector +from matplotlib.path import Path + +from sklearn.cluster import DBSCAN from .interpolation import * -# this function from 
icesat2_toolkit +logger = logging.getLogger(__name__) + + +#this function from icesat2_toolkit # PURPOSE: read ICESat-2 ATL03 HDF5 data files def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): """ - Reads ICESat-2 ATL03 Global Geolocated Photons data files + Reads ICESat-2 ATL03 Global Geolocated Photons data files. + + Flowchart step: 1 — Acquire ATL03 data (download or use cloud services). Parameters ---------- @@ -44,9 +62,9 @@ def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): """ # Open the HDF5 file for reading if isinstance(FILENAME, io.IOBase): - fileID = h5py.File(FILENAME, "r") + fileID = h5py.File(FILENAME, 'r') else: - fileID = h5py.File(os.path.expanduser(FILENAME), "r") + fileID = h5py.File(os.path.expanduser(FILENAME), 'r') # Output HDF5 file information logging.info(fileID.filename) @@ -58,12 +76,12 @@ def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): # read each input beam within the file IS2_atl03_beams = [] - for gtx in [k for k in fileID.keys() if bool(re.match(r"gt\d[lr]", k))]: + for gtx in [k for k in fileID.keys() if bool(re.match(r'gt\d[lr]',k))]: # check if subsetted beam contains data # check in both the geolocation and heights groups try: - fileID[gtx]["geolocation"]["segment_id"] - fileID[gtx]["heights"]["delta_time"] + fileID[gtx]['geolocation']['segment_id'] + fileID[gtx]['heights']['delta_time'] except KeyError: pass else: @@ -71,81 +89,82 @@ def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): # for each included beam for gtx in IS2_atl03_beams: + # ------------------------------------------- # 1. make sure the beam-level dict exists IS2_atl03_attrs.setdefault(gtx, {}) # 2. 
always save the two “must-have” attributes - for key in ("atlas_beam_type", "atlas_spot_number"): + for key in ('atlas_beam_type', 'atlas_spot_number'): IS2_atl03_attrs[gtx][key] = fileID[gtx].attrs[key] - + # get each HDF5 variable IS2_atl03_mds[gtx] = {} - IS2_atl03_mds[gtx]["heights"] = {} - IS2_atl03_mds[gtx]["geolocation"] = {} - IS2_atl03_mds[gtx]["bckgrd_atlas"] = {} - IS2_atl03_mds[gtx]["geophys_corr"] = {} + IS2_atl03_mds[gtx]['heights'] = {} + IS2_atl03_mds[gtx]['geolocation'] = {} + IS2_atl03_mds[gtx]['bckgrd_atlas'] = {} + IS2_atl03_mds[gtx]['geophys_corr'] = {} # ICESat-2 Measurement Group - for key, val in fileID[gtx]["heights"].items(): - IS2_atl03_mds[gtx]["heights"][key] = val[:] + for key,val in fileID[gtx]['heights'].items(): + IS2_atl03_mds[gtx]['heights'][key] = val[:] # ICESat-2 Geolocation Group - for key, val in fileID[gtx]["geolocation"].items(): - IS2_atl03_mds[gtx]["geolocation"][key] = val[:] + for key,val in fileID[gtx]['geolocation'].items(): + IS2_atl03_mds[gtx]['geolocation'][key] = val[:] # ICESat-2 Background Photon Rate Group - for key, val in fileID[gtx]["bckgrd_atlas"].items(): - IS2_atl03_mds[gtx]["bckgrd_atlas"][key] = val[:] + for key,val in fileID[gtx]['bckgrd_atlas'].items(): + IS2_atl03_mds[gtx]['bckgrd_atlas'][key] = val[:] # ICESat-2 Geophysical Corrections Group: Values for tides (ocean, # solid earth, pole, load, and equilibrium), inverted barometer (IB) # effects, and range corrections for tropospheric delays - for key, val in fileID[gtx]["geophys_corr"].items(): - IS2_atl03_mds[gtx]["geophys_corr"][key] = val[:] + for key,val in fileID[gtx]['geophys_corr'].items(): + IS2_atl03_mds[gtx]['geophys_corr'][key] = val[:] # Getting attributes of included variables if ATTRIBUTES: # Getting attributes of IS2_atl03_mds beam variables IS2_atl03_attrs[gtx] = {} - IS2_atl03_attrs[gtx]["heights"] = {} - IS2_atl03_attrs[gtx]["geolocation"] = {} - IS2_atl03_attrs[gtx]["bckgrd_atlas"] = {} - IS2_atl03_attrs[gtx]["geophys_corr"] = {} 
- + IS2_atl03_attrs[gtx]['heights'] = {} + IS2_atl03_attrs[gtx]['geolocation'] = {} + IS2_atl03_attrs[gtx]['bckgrd_atlas'] = {} + IS2_atl03_attrs[gtx]['geophys_corr'] = {} + # Global Group Attributes - for att_name, att_val in fileID[gtx].attrs.items(): + for att_name,att_val in fileID[gtx].attrs.items(): IS2_atl03_attrs[gtx][att_name] = att_val # ICESat-2 Measurement Group - for key, val in fileID[gtx]["heights"].items(): - IS2_atl03_attrs[gtx]["heights"][key] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]["heights"][key][att_name] = att_val + for key,val in fileID[gtx]['heights'].items(): + IS2_atl03_attrs[gtx]['heights'][key] = {} + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]['heights'][key][att_name]=att_val # ICESat-2 Geolocation Group - for key, val in fileID[gtx]["geolocation"].items(): - IS2_atl03_attrs[gtx]["geolocation"][key] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]["geolocation"][key][att_name] = att_val + for key,val in fileID[gtx]['geolocation'].items(): + IS2_atl03_attrs[gtx]['geolocation'][key] = {} + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]['geolocation'][key][att_name]=att_val # ICESat-2 Background Photon Rate Group - for key, val in fileID[gtx]["bckgrd_atlas"].items(): - IS2_atl03_attrs[gtx]["bckgrd_atlas"][key] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]["bckgrd_atlas"][key][att_name] = att_val + for key,val in fileID[gtx]['bckgrd_atlas'].items(): + IS2_atl03_attrs[gtx]['bckgrd_atlas'][key] = {} + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]['bckgrd_atlas'][key][att_name]=att_val # ICESat-2 Geophysical Corrections Group - for key, val in fileID[gtx]["geophys_corr"].items(): - IS2_atl03_attrs[gtx]["geophys_corr"][key] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]["geophys_corr"][key][att_name] = att_val + for key,val in fileID[gtx]['geophys_corr'].items(): + 
IS2_atl03_attrs[gtx]['geophys_corr'][key] = {} + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]['geophys_corr'][key][att_name]=att_val # ICESat-2 spacecraft orientation at time - IS2_atl03_mds["orbit_info"] = {} - IS2_atl03_attrs["orbit_info"] = {} - for key, val in fileID["orbit_info"].items(): - IS2_atl03_mds["orbit_info"][key] = val[:] + IS2_atl03_mds['orbit_info'] = {} + IS2_atl03_attrs['orbit_info'] = {} + for key,val in fileID['orbit_info'].items(): + IS2_atl03_mds['orbit_info'][key] = val[:] # Getting attributes of group and included variables if ATTRIBUTES: # Global Group Attributes - for att_name, att_val in fileID["orbit_info"].attrs.items(): - IS2_atl03_attrs["orbit_info"][att_name] = att_val + for att_name,att_val in fileID['orbit_info'].attrs.items(): + IS2_atl03_attrs['orbit_info'][att_name] = att_val # Variable Attributes - IS2_atl03_attrs["orbit_info"][key] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs["orbit_info"][key][att_name] = att_val + IS2_atl03_attrs['orbit_info'][key] = {} + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs['orbit_info'][key][att_name] = att_val # information ancillary to the data product # number of GPS seconds between the GPS epoch (1980-01-06T00:00:00Z UTC) @@ -153,108 +172,91 @@ def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): # Add this value to delta time parameters to compute full gps_seconds # could alternatively use the Julian day of the ATLAS SDP epoch: 2458119.5 # and add leap seconds since 2018-01-01T00:00:00Z UTC (ATLAS SDP epoch) - IS2_atl03_mds["ancillary_data"] = {} - IS2_atl03_attrs["ancillary_data"] = {} - ancillary_keys = [ - "atlas_sdp_gps_epoch", - "data_end_utc", - "data_start_utc", - "end_cycle", - "end_geoseg", - "end_gpssow", - "end_gpsweek", - "end_orbit", - "end_region", - "end_rgt", - "granule_end_utc", - "granule_start_utc", - "release", - "start_cycle", - "start_geoseg", - "start_gpssow", - "start_gpsweek", - "start_orbit", - 
"start_region", - "start_rgt", - "version", - ] + IS2_atl03_mds['ancillary_data'] = {} + IS2_atl03_attrs['ancillary_data'] = {} + ancillary_keys = ['atlas_sdp_gps_epoch','data_end_utc','data_start_utc', + 'end_cycle','end_geoseg','end_gpssow','end_gpsweek','end_orbit', + 'end_region','end_rgt','granule_end_utc','granule_start_utc','release', + 'start_cycle','start_geoseg','start_gpssow','start_gpsweek', + 'start_orbit','start_region','start_rgt','version'] for key in ancillary_keys: # get each HDF5 variable - IS2_atl03_mds["ancillary_data"][key] = fileID["ancillary_data"][key][:] + IS2_atl03_mds['ancillary_data'][key] = fileID['ancillary_data'][key][:] # Getting attributes of group and included variables if ATTRIBUTES: # Variable Attributes - IS2_atl03_attrs["ancillary_data"][key] = {} - for att_name, att_val in fileID["ancillary_data"][key].attrs.items(): - IS2_atl03_attrs["ancillary_data"][key][att_name] = att_val + IS2_atl03_attrs['ancillary_data'][key] = {} + for att_name,att_val in fileID['ancillary_data'][key].attrs.items(): + IS2_atl03_attrs['ancillary_data'][key][att_name] = att_val # transmit-echo-path (tep) parameters - IS2_atl03_mds["ancillary_data"]["tep"] = {} - IS2_atl03_attrs["ancillary_data"]["tep"] = {} - for key, val in fileID["ancillary_data"]["tep"].items(): + IS2_atl03_mds['ancillary_data']['tep'] = {} + IS2_atl03_attrs['ancillary_data']['tep'] = {} + for key,val in fileID['ancillary_data']['tep'].items(): # get each HDF5 variable - IS2_atl03_mds["ancillary_data"]["tep"][key] = val[:] + IS2_atl03_mds['ancillary_data']['tep'][key] = val[:] # Getting attributes of group and included variables if ATTRIBUTES: # Variable Attributes - IS2_atl03_attrs["ancillary_data"]["tep"][key] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs["ancillary_data"]["tep"][key][att_name] = att_val + IS2_atl03_attrs['ancillary_data']['tep'][key] = {} + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs['ancillary_data']['tep'][key][att_name] 
= att_val # channel dead time and first photon bias derived from ATLAS calibration - cal1, cal2 = ("ancillary_data", "calibrations") - for var in ["dead_time", "first_photon_bias"]: + cal1,cal2 = ('ancillary_data','calibrations') + for var in ['dead_time','first_photon_bias']: IS2_atl03_mds[cal1][var] = {} IS2_atl03_attrs[cal1][var] = {} - for key, val in fileID[cal1][cal2][var].items(): + for key,val in fileID[cal1][cal2][var].items(): # get each HDF5 variable if isinstance(val, h5py.Dataset): IS2_atl03_mds[cal1][var][key] = val[:] elif isinstance(val, h5py.Group): IS2_atl03_mds[cal1][var][key] = {} - for k, v in val.items(): + for k,v in val.items(): IS2_atl03_mds[cal1][var][key][k] = v[:] # Getting attributes of group and included variables if ATTRIBUTES: # Variable Attributes IS2_atl03_attrs[cal1][var][key] = {} - for att_name, att_val in val.attrs.items(): + for att_name,att_val in val.attrs.items(): IS2_atl03_attrs[cal1][var][key][att_name] = att_val if isinstance(val, h5py.Group): - for k, v in val.items(): + for k,v in val.items(): IS2_atl03_attrs[cal1][var][key][k] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs[cal1][var][key][k][att_name] = att_val + for att_name,att_val in val.attrs.items(): + IS2_atl03_attrs[cal1][var][key][k][att_name]=att_val # get ATLAS impulse response variables for the transmitter echo path (TEP) - tep1, tep2 = ("atlas_impulse_response", "tep_histogram") + tep1,tep2 = ('atlas_impulse_response','tep_histogram') IS2_atl03_mds[tep1] = {} IS2_atl03_attrs[tep1] = {} - for pce in ["pce1_spot1", "pce2_spot3"]: - IS2_atl03_mds[tep1][pce] = {tep2: {}} - IS2_atl03_attrs[tep1][pce] = {tep2: {}} + for pce in ['pce1_spot1','pce2_spot3']: + IS2_atl03_mds[tep1][pce] = {tep2:{}} + IS2_atl03_attrs[tep1][pce] = {tep2:{}} # for each TEP variable - for key, val in fileID[tep1][pce][tep2].items(): + for key,val in fileID[tep1][pce][tep2].items(): IS2_atl03_mds[tep1][pce][tep2][key] = val[:] # Getting attributes of included 
variables if ATTRIBUTES: # Global Group Attributes - for att_name, att_val in fileID[tep1][pce][tep2].attrs.items(): + for att_name,att_val in fileID[tep1][pce][tep2].attrs.items(): IS2_atl03_attrs[tep1][pce][tep2][att_name] = att_val # Variable Attributes IS2_atl03_attrs[tep1][pce][tep2][key] = {} - for att_name, att_val in val.attrs.items(): + for att_name,att_val in val.attrs.items(): IS2_atl03_attrs[tep1][pce][tep2][key][att_name] = att_val # Global File Attributes if ATTRIBUTES: - for att_name, att_val in fileID.attrs.items(): + for att_name,att_val in fileID.attrs.items(): IS2_atl03_attrs[att_name] = att_val # Closing the HDF5 file fileID.close() # Return the datasets and variables - return (IS2_atl03_mds, IS2_atl03_attrs, IS2_atl03_beams) + return (IS2_atl03_mds,IS2_atl03_attrs,IS2_atl03_beams) + # convert_wgs_to_utm function, see https://stackoverflow.com/a/40140326/4556479 @@ -262,22 +264,22 @@ def convert_wgs_to_utm(lon: float, lat: float): """Based on lat and lng, return best utm epsg-code""" utm_band = str((math.floor((lon + 180) / 6) % 60) + 1) if len(utm_band) == 1: - utm_band = "0" + utm_band + utm_band = '0' + utm_band if lat >= 0: - epsg_code = "epsg:326" + utm_band + epsg_code = 'epsg:326' + utm_band return epsg_code - epsg_code = "epsg:327" + utm_band + epsg_code = 'epsg:327' + utm_band return epsg_code def orthometric_correction(lat, lon, Z, epsg): # Define the Proj string - # To transform from WGS84 ellipsoidal height + #To transform from WGS84 ellipsoidal height # to EGM2008 orthometric height using PyProj # proj_string = '+proj=latlong +ellps=WGS84 +datum=WGS84 +vunits=m +no_defs +geoidgrids=egm2008-1.gtx' # # Define the Proj string for WGS84 ellipsoidal height # wgs84_proj_string = '+proj=latlong +ellps=WGS84 +datum=WGS84 +no_defs' - + # # Define the Proj string for EGM2008 orthometric height: egm08_25,egm2008-1 # egm2008_proj_string = \ # '+proj=latlong +ellps=WGS84 +datum=WGS84 +no_defs ' \ @@ -295,11 +297,14 @@ def 
orthometric_correction(lat, lon, Z, epsg): return Y_utm, X_utm, Z_egm08 + def load_data(file_path, read_attributes=False): """ Wrapper around read_granule that lets you decide whether to pull the full attribute tree. + Flowchart step: 1 — Acquire ATL03 data (download or use cloud services). + Parameters ---------- file_path : str @@ -308,8 +313,10 @@ def load_data(file_path, read_attributes=False): If True, read_granule returns the full IS2_atl03_attrs tree. Defaults to False. """ + + + return read_granule(file_path, ATTRIBUTES = read_attributes) - return read_granule(file_path, ATTRIBUTES=read_attributes) # requires that the input gdf has ranged index values i @@ -320,15 +327,16 @@ def load_data(file_path, read_attributes=False): def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): # try loading the shoreline data try: - ICESat2_GDF.insert(0, "lat", ICESat2_GDF.geometry.y, False) - ICESat2_GDF.insert(0, "lon", ICESat2_GDF.geometry.x, False) + ICESat2_GDF.insert(0, 'lat', ICESat2_GDF.geometry.y, False) + ICESat2_GDF.insert(0, 'lon', ICESat2_GDF.geometry.x, False) # allocation of to be used arrays zero_int_array = np.int64(np.zeros_like(ICESat2_GDF.geometry.x)) # Land flag initialized as -1 # If shorelines downloaded already, will be set to 0 or 1 - ICESat2_GDF.insert(0, "is_land", zero_int_array - 1, False) + ICESat2_GDF.insert(0, 'is_land', + zero_int_array - 1, False) # set the projection ICESat2_GDF.set_crs("EPSG:4326", inplace=True) @@ -339,15 +347,13 @@ def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): # engine str, 'fiona' or 'pyogrio' # somtime it gives error if using fiona # land_polygon_gdf = gpd.read_file(shoreline_data_path, bbox=ICESat2_GDF, engine='fiona') - land_polygon_gdf = gpd.read_file( - shoreline_data_path, bbox=ICESat2_GDF, engine="pyogrio" - ) + land_polygon_gdf = gpd.read_file(shoreline_data_path, bbox=ICESat2_GDF, engine='pyogrio') # continue with getting a new array of 0-or-1 labels for each photon land_point_labels 
= np.zeros_like(ICESat2_GDF.is_land.values) # update labels for points in the land polygons - pts_in_land = gpd.sjoin(ICESat2_GDF, land_polygon_gdf, predicate="within") + pts_in_land = gpd.sjoin(ICESat2_GDF, land_polygon_gdf, predicate='within') # get land or not bool value land_loc = ICESat2_GDF.index.isin(pts_in_land.index) @@ -359,6 +365,7 @@ def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): return land_point_labels except Exception as e: + print(e) print("Error loading shoreline data, returning -1s for is_land flag") @@ -369,64 +376,37 @@ def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): return -np.ones_like(ICESat2_GDF.is_land.values) -def create_photon_dataframe( - lat_ph, - lon_ph, - ref_elev, - ref_azimuth, - geoid, - h_ph, - quality_ph, - is_land_label_interp1d, - signal_conf_photon, - x_atc, - relative_AT_dist, - solar_elevation=None, - background_rate=None, -): +def create_photon_dataframe(lat_ph, lon_ph, ref_elev, ref_azimuth, geoid, h_ph, \ + quality_ph, is_land_label_interp1d, signal_conf_photon, x_atc, relative_AT_dist, + solar_elevation=None, background_rate=None): # Apply geoid correction to the photon heights to convert them from ellipsoidal to orthometric heights h_ph_geoid_cor = h_ph[:] - geoid[:] - - # Determine the EPSG code for the UTM zone based on the first photon's longitude and latitude + + # Determine the EPSG code for the UTM zone based on the first photon's longitude and latitude epsg_code = convert_wgs_to_utm(lon_ph[0], lat_ph[0]) - + # Perform orthometric correction to obtain UTM coordinates and corrected heights lat_utm, lon_utm, h_ph_cor = orthometric_correction(lat_ph, lon_ph, h_ph, epsg_code) - + # Put the data into the dataframe - sea_photon_dataset = pd.DataFrame( - { - "latitude": lat_ph, - "longitude": lon_ph, - "lat": lat_utm, - "lon": lon_utm, - "photon_height": h_ph_geoid_cor, - "quality_ph": quality_ph, - "is_land_label": is_land_label_interp1d, - "photon_conf": signal_conf_photon, - 
"ref_elevation": ref_elev, - "ref_azimuth": ref_azimuth, - "relative_AT_dist": relative_AT_dist, - }, - columns=[ - "latitude", - "longitude", - "lat", - "lon", - "photon_height", - "quality_ph", - "is_land_label", - "photon_conf", - "ref_elevation", - "ref_azimuth", - "relative_AT_dist", - ], - ) + sea_photon_dataset = pd.DataFrame({ + 'latitude': lat_ph, + 'longitude': lon_ph, + 'lat': lat_utm, + 'lon': lon_utm, + 'photon_height': h_ph_geoid_cor, + 'quality_ph': quality_ph, + 'is_land_label': is_land_label_interp1d, + 'photon_conf': signal_conf_photon, + 'ref_elevation': ref_elev, + 'ref_azimuth': ref_azimuth, + 'relative_AT_dist': relative_AT_dist + }, columns=['latitude', 'longitude', 'lat', 'lon', 'photon_height', 'quality_ph', 'is_land_label', 'photon_conf', 'ref_elevation', 'ref_azimuth', 'relative_AT_dist']) if solar_elevation is not None: - sea_photon_dataset["solar_elevation"] = solar_elevation + sea_photon_dataset['solar_elevation'] = solar_elevation if background_rate is not None: - sea_photon_dataset["background_rate"] = background_rate + sea_photon_dataset['background_rate'] = background_rate return sea_photon_dataset @@ -450,75 +430,128 @@ def interpolate_by_time(source_time, source_values, target_time): if unique_time.size < 2: return np.full_like(target_time, unique_values[0], dtype=float) - return np.interp( - target_time, - unique_time, - unique_values, - left=unique_values[0], - right=unique_values[-1], - ) + return np.interp(target_time, unique_time, unique_values, left=unique_values[0], right=unique_values[-1]) def apply_optional_solar_background_filter( sea_photon_dataset, enabled=False, day_threshold=6.0, - background_quantile=0.8, - min_signal_conf=2, + median_window_deg=10.0, + noise_multiplier=1.5, + min_signal_conf=2 ): """ Optionally filter photons under strong daytime solar background conditions. - Keeps high-confidence photons in high-background segments. 
+ + Uses a degree-based rolling median of background_rate over solar_elevation + to compute the expected background at each elevation. Photons whose actual + background_rate exceeds expected * noise_multiplier are flagged noisy. + Noisy photons are removed unless their signal confidence >= min_signal_conf. + + Flowchart step: 5 — Compute solar background from atmospheric signal (using + selected x and z bin length scales) and subtract from subsurface photon + histogram data. """ if not enabled: return sea_photon_dataset - required_cols = {"solar_elevation", "background_rate", "photon_conf"} + required_cols = {'solar_elevation', 'background_rate', 'photon_conf'} if not required_cols.issubset(set(sea_photon_dataset.columns)): - logger.warning( - "Solar background filter skipped because required columns are missing." - ) + logger.warning("Solar background filter skipped because required columns are missing.") return sea_photon_dataset filtered_dataset = sea_photon_dataset.copy() - daytime_mask = filtered_dataset["solar_elevation"] >= day_threshold - daytime_background = filtered_dataset.loc[daytime_mask, "background_rate"] - daytime_background = daytime_background[np.isfinite(daytime_background)] + daytime_mask = filtered_dataset['solar_elevation'] >= day_threshold + + # Select valid daytime photons for rolling median computation + daytime = filtered_dataset.loc[daytime_mask].copy() + valid = np.isfinite(daytime['background_rate']) & np.isfinite(daytime['solar_elevation']) + daytime = daytime.loc[valid] - if daytime_background.empty: + if len(daytime) < 10: logger.info( - "Solar background filter enabled, but no valid daytime photons were found." + "Solar background filter enabled, but fewer than 10 valid daytime " + "photons were found. Skipping filter." 
) return filtered_dataset - quantile_threshold = daytime_background.quantile(background_quantile) - noisy_daytime_mask = daytime_mask & ( - filtered_dataset["background_rate"] >= quantile_threshold + # Clamp solar elevations to physical range [DAY_THRESHOLD, 90] to avoid + # HDF5 fill values (e.g. 3.4e+38) blowing up the bin array. + daytime = daytime.copy() + daytime['solar_elevation'] = daytime['solar_elevation'].clip( + lower=day_threshold, upper=90.0 + ) + + # Vectorized degree-based rolling median: + # Bin elevations into fine steps, compute one median per bin using + # np.searchsorted on the sorted array, then map back to photons. + daytime = daytime.sort_values('solar_elevation') + elevations = daytime['solar_elevation'].values + bg_rates = daytime['background_rate'].values + half_window = median_window_deg / 2.0 + + bin_step = 0.1 # degrees — fine enough for smooth curve + bin_centers = np.arange( + elevations[0], elevations[-1] + bin_step, bin_step ) - keep_mask = (~noisy_daytime_mask) | ( - filtered_dataset["photon_conf"] >= min_signal_conf + bin_medians = np.empty(len(bin_centers)) + for i in range(len(bin_centers)): + lo = np.searchsorted(elevations, bin_centers[i] - half_window, side='left') + hi = np.searchsorted(elevations, bin_centers[i] + half_window, side='right') + if lo < hi: + bin_medians[i] = np.median(bg_rates[lo:hi]) + else: + bin_medians[i] = np.nan + + # Map bin medians back to each photon via nearest-bin lookup (vectorized) + bin_indices = np.searchsorted(bin_centers, elevations, side='right') - 1 + bin_indices = np.clip(bin_indices, 0, len(bin_centers) - 1) + expected_bg = bin_medians[bin_indices] + + # Map expected background onto the full DataFrame. Non-daytime and + # non-finite daytime rows stay NaN, so they are never flagged noisy. 
+ filtered_dataset['expected_bg'] = np.nan + filtered_dataset.loc[daytime.index, 'expected_bg'] = expected_bg + + # Flag noisy daytime photons + noisy_daytime_mask = daytime_mask & ( + filtered_dataset['background_rate'] >= filtered_dataset['expected_bg'] * noise_multiplier ) + keep_mask = (~noisy_daytime_mask) | (filtered_dataset['photon_conf'] >= min_signal_conf) + + # Clean up temporary column + filtered_dataset.drop(columns=['expected_bg'], inplace=True, errors='ignore') before_count = len(filtered_dataset) filtered_dataset = filtered_dataset.loc[keep_mask].copy() after_count = len(filtered_dataset) logger.info( - "Solar background filter removed %s photons (from %s to %s).", - before_count - after_count, - before_count, - after_count, + "Solar background filter (multiplier=%.1f) removed %s photons (from %s to %s).", + noise_multiplier, before_count - after_count, before_count, after_count ) return filtered_dataset def apply_optional_ir_ap_filter( - sea_photon_dataset, enabled=False, quality_max=0, min_signal_conf=0 + sea_photon_dataset, + enabled=False, + quality_max=0, + min_signal_conf=0 ): """ - Optionally remove likely flagged photons as an IR/afterpulse proxy. - This uses available ATL03 photon fields: - - quality_ph: keep <= quality_max - - photon_conf: keep >= min_signal_conf + Optionally remove photons flagged as afterpulse using the quality_ph bitmask. + + Flowchart step: 2 — Remove photons flagged as afterpulse or impulse response. + + ATL03 v007 quality_ph bit encoding: + bit 0 (value 1) = possible afterpulse <- targeted by this filter + bit 1 (value 2) = possible impulse response effect + bit 2 (value 4) = possible TEP + + Only photons with bit 0 set (afterpulse) are removed. Photons flagged + for impulse response or TEP only are kept, avoiding over-filtering on + turbid high-background sites where many photons carry non-zero flags. 
""" if not enabled: return sea_photon_dataset @@ -526,14 +559,19 @@ def apply_optional_ir_ap_filter( filtered_dataset = sea_photon_dataset.copy() keep_mask = np.ones(len(filtered_dataset), dtype=bool) - if "quality_ph" in filtered_dataset.columns: - quality_values = pd.to_numeric(filtered_dataset["quality_ph"], errors="coerce") - keep_mask &= quality_values <= quality_max + if 'quality_ph' in filtered_dataset.columns: + quality_values = pd.to_numeric(filtered_dataset['quality_ph'], errors='coerce').fillna(0).astype(int) + afterpulse_mask = (quality_values & 1) == 1 # bit 0 set = afterpulse + keep_mask &= ~afterpulse_mask + logger.info( + "IR/AP filter: %d photons flagged as afterpulse (quality_ph bit 0) will be removed.", + int(afterpulse_mask.sum()) + ) else: logger.warning("IR/AP filter enabled, but quality_ph is missing.") - if "photon_conf" in filtered_dataset.columns: - conf_values = pd.to_numeric(filtered_dataset["photon_conf"], errors="coerce") + if 'photon_conf' in filtered_dataset.columns: + conf_values = pd.to_numeric(filtered_dataset['photon_conf'], errors='coerce') keep_mask &= conf_values >= min_signal_conf else: logger.warning("IR/AP filter enabled, but photon_conf is missing.") @@ -543,9 +581,7 @@ def apply_optional_ir_ap_filter( after_count = len(filtered_dataset) logger.info( "IR/AP proxy filter removed %s photons (from %s to %s).", - before_count - after_count, - before_count, - after_count, + before_count - after_count, before_count, after_count ) return filtered_dataset @@ -565,126 +601,105 @@ def Extract_sea_photons(IS2_atl03_mds, target_strong_beams, shoreline_data_path) Segment_ref_azimuth = {} background_rate = {} background_counts = {} - + # Initialize a list to store data for each beam beam_datasets = [] - + + # Loop over each strong beam in target_strong_beams for gtx in target_strong_beams: - print("Processing strong beam ID:", gtx) - + print('Processing strong beam ID:', gtx) + # Access the data for the current beam IS2_val = 
IS2_atl03_mds[gtx] - + # Initialize dictionaries to store segment data for the beam - Segment_ID[gtx] = IS2_val["geolocation"]["segment_id"] + Segment_ID[gtx] = IS2_val['geolocation']['segment_id'] n_seg = len(Segment_ID[gtx]) - (n_pe,) = IS2_val["heights"]["delta_time"].shape - Segment_Index_begin[gtx] = IS2_val["geolocation"]["ph_index_beg"] - 1 - Segment_PE_count[gtx] = IS2_val["geolocation"]["segment_ph_cnt"] - Equator_Segment_Distance[gtx] = IS2_val["geolocation"]["segment_dist_x"] - Segment_Length[gtx] = IS2_val["geolocation"]["segment_length"] - delta_time = IS2_val["geolocation"]["delta_time"] - segment_lat = IS2_val["geolocation"]["reference_photon_lat"][:].copy() - segment_lon = IS2_val["geolocation"]["reference_photon_lon"][:].copy() - ref_elev = IS2_val["geolocation"]["ref_elev"][:].copy() - ref_azimuth = IS2_val["geolocation"]["ref_azimuth"][:].copy() - geoid = IS2_val["geophys_corr"]["geoid"][:].copy() - h_ph = IS2_val["heights"]["h_ph"][:].copy() - photon_delta_time = IS2_val["heights"]["delta_time"][:].copy() - lat_ph = IS2_val["heights"]["lat_ph"][:].copy() - lon_ph = IS2_val["heights"]["lon_ph"][:].copy() - signal_conf_photon = IS2_val["heights"]["signal_conf_ph"][..., 0].copy() - x_atc = IS2_val["heights"]["dist_ph_along"][:].copy() - y_atc = IS2_val["heights"]["dist_ph_across"][:].copy() - quality_ph = IS2_val["heights"]["quality_ph"] + n_pe, = IS2_val['heights']['delta_time'].shape + Segment_Index_begin[gtx] = IS2_val['geolocation']['ph_index_beg'] - 1 + Segment_PE_count[gtx] = IS2_val['geolocation']['segment_ph_cnt'] + Equator_Segment_Distance[gtx] = IS2_val['geolocation']['segment_dist_x'] + Segment_Length[gtx] = IS2_val['geolocation']['segment_length'] + delta_time = IS2_val['geolocation']['delta_time'] + segment_lat = IS2_val['geolocation']['reference_photon_lat'][:].copy() + segment_lon = IS2_val['geolocation']['reference_photon_lon'][:].copy() + ref_elev = IS2_val['geolocation']['ref_elev'][:].copy() + ref_azimuth = 
IS2_val['geolocation']['ref_azimuth'][:].copy() + geoid = IS2_val['geophys_corr']['geoid'][:].copy() + h_ph = IS2_val['heights']['h_ph'][:].copy() + photon_delta_time = IS2_val['heights']['delta_time'][:].copy() + lat_ph = IS2_val['heights']['lat_ph'][:].copy() + lon_ph = IS2_val['heights']['lon_ph'][:].copy() + signal_conf_photon = IS2_val['heights']['signal_conf_ph'][..., 0].copy() + x_atc = IS2_val['heights']['dist_ph_along'][:].copy() + y_atc = IS2_val['heights']['dist_ph_across'][:].copy() + quality_ph = IS2_val['heights']['quality_ph'] # Optional variables for solar background sensitivity testing photon_solar_elevation = np.full_like(photon_delta_time, np.nan, dtype=float) photon_background_rate = np.full_like(photon_delta_time, np.nan, dtype=float) - if "solar_elevation" in IS2_val["geolocation"]: - segment_solar_elevation = IS2_val["geolocation"]["solar_elevation"][ - : - ].copy() + if 'solar_elevation' in IS2_val['geolocation']: + segment_solar_elevation = IS2_val['geolocation']['solar_elevation'][:].copy() photon_solar_elevation = interpolate_by_time( delta_time, segment_solar_elevation, photon_delta_time ) - if ( - "bckgrd_atlas" in IS2_val - and "bckgrd_rate" in IS2_val["bckgrd_atlas"] - and "delta_time" in IS2_val["bckgrd_atlas"] - ): - bckgrd_rate = IS2_val["bckgrd_atlas"]["bckgrd_rate"][:].copy() - bckgrd_time = IS2_val["bckgrd_atlas"]["delta_time"][:].copy() + if 'bckgrd_atlas' in IS2_val and 'bckgrd_rate' in IS2_val['bckgrd_atlas'] and 'delta_time' in IS2_val['bckgrd_atlas']: + bckgrd_rate = IS2_val['bckgrd_atlas']['bckgrd_rate'][:].copy() + bckgrd_time = IS2_val['bckgrd_atlas']['delta_time'][:].copy() photon_background_rate = interpolate_by_time( bckgrd_time, bckgrd_rate, photon_delta_time ) - # Adjust x_atc based on segment distances + # Adjust x_atc based on segment distances for seg_index in range(n_seg): idx = Segment_Index_begin[gtx][seg_index] cnt = Segment_PE_count[gtx][seg_index] - x_atc[idx : idx + cnt] += 
Equator_Segment_Distance[gtx][seg_index] + x_atc[idx:idx + cnt] += Equator_Segment_Distance[gtx][seg_index] - # Calculate relative distances + # Calculate relative distances relative_AT_dist = (x_atc - x_atc[0]) / 1000 - relative_seg_dist = ( - Equator_Segment_Distance[gtx] - Equator_Segment_Distance[gtx][0] - ) / 1000 - + relative_seg_dist = (Equator_Segment_Distance[gtx] - Equator_Segment_Distance[gtx][0]) / 1000 + # Create a GeoDataFrame to hold segment data for shoreline check - Segment_Is_Land["geometry"] = gpd.points_from_xy(segment_lon, segment_lat) + Segment_Is_Land['geometry'] = gpd.points_from_xy(segment_lon, segment_lat) ICESat2_GDF = gpd.GeoDataFrame(Segment_Is_Land, crs="EPSG:4326") - + # Determine if it is land by the land/sea mask - Segment_Is_Land_Labels = isolate_sea_land_photons( - shoreline_data_path, ICESat2_GDF - ) - ICESat2_GDF.loc[:, "is_land"] = Segment_Is_Land_Labels + Segment_Is_Land_Labels = isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF) + ICESat2_GDF.loc[:, 'is_land'] = Segment_Is_Land_Labels # Apply interpolations for required data - is_land_label_interp1d = apply_interpolation( - interpolate_labels(segment_lat, Segment_Is_Land_Labels), lat_ph - ) - ph_ref_elev = apply_interpolation( - interpolate_labels(segment_lat, ref_elev), lat_ph - ) - ph_ref_azimuth = apply_interpolation( - interpolate_labels(segment_lat, ref_azimuth), lat_ph - ) + is_land_label_interp1d = apply_interpolation(interpolate_labels(segment_lat, Segment_Is_Land_Labels), lat_ph) + ph_ref_elev = apply_interpolation(interpolate_labels(segment_lat, ref_elev), lat_ph) + ph_ref_azimuth = apply_interpolation(interpolate_labels(segment_lat, ref_azimuth), lat_ph) ph_geoid = apply_interpolation(interpolate_labels(segment_lat, geoid), lat_ph) - # Create photon DataFrame for the current beam - sea_photon_dataset = create_photon_dataframe( - lat_ph=lat_ph, - lon_ph=lon_ph, - ref_elev=ph_ref_elev, - ref_azimuth=ph_ref_azimuth, - geoid=ph_geoid, - h_ph=h_ph, - 
quality_ph=quality_ph, - is_land_label_interp1d=is_land_label_interp1d, - signal_conf_photon=signal_conf_photon, - x_atc=x_atc, # Note: x_atc is not used in the function. Remove if unnecessary. - relative_AT_dist=relative_AT_dist, - solar_elevation=photon_solar_elevation, - background_rate=photon_background_rate, - ) - # Filter out land photons - sea_photon_dataset = sea_photon_dataset[ - sea_photon_dataset["is_land_label"] != 1 - ] + # Create photon DataFrame for the current beam + sea_photon_dataset = create_photon_dataframe(lat_ph=lat_ph, lon_ph=lon_ph, + ref_elev=ph_ref_elev,ref_azimuth=ph_ref_azimuth, + geoid=ph_geoid,h_ph=h_ph, + quality_ph=quality_ph, + is_land_label_interp1d=is_land_label_interp1d, + signal_conf_photon=signal_conf_photon, + x_atc=x_atc, # Note: x_atc is not used in the function. Remove if unnecessary. + relative_AT_dist=relative_AT_dist, + solar_elevation=photon_solar_elevation, + background_rate=photon_background_rate) + # Filter out land photons + sea_photon_dataset = sea_photon_dataset[sea_photon_dataset['is_land_label'] != 1] + # Add a new column to indicate the beam ID - sea_photon_dataset["beam_id"] = gtx - + sea_photon_dataset['beam_id'] = gtx + # Append the processed dataset for the current beam to the list beam_datasets.append(sea_photon_dataset) - + # Concatenate all beam data into a single DataFrame all_beams_dataset = pd.concat(beam_datasets, ignore_index=True) - + return all_beams_dataset @@ -693,116 +708,103 @@ def filter_photon_dataset_by_hull_area(photon_dataset, hull_area_threshold=3000) Filters photon dataset based on ConvexHull area threshold and returns the filtered dataset, convex hull areas, and convex hull points. 
""" - lat_bins_grouped = photon_dataset.groupby("lat_bins", observed=False) + lat_bins_grouped = photon_dataset.groupby('lat_bins', observed=False) filtered_dataset = photon_dataset.copy() - + convex_hulls = {} convex_hull_areas = {} - + for lat_bin, bin_data in lat_bins_grouped: if len(bin_data) >= 3: - points = bin_data[["lat", "photon_height"]].to_numpy() + points = bin_data[['lat', 'photon_height']].to_numpy() hull = ConvexHull(points) area = hull.volume convex_hull_areas[lat_bin] = area - + # Store the ConvexHull points if area meets the threshold if area >= hull_area_threshold: convex_hulls[lat_bin] = points[hull.vertices] else: # Remove bins with hull area below the threshold - filtered_dataset = filtered_dataset[ - filtered_dataset["lat_bins"] != lat_bin - ] - + filtered_dataset = filtered_dataset[filtered_dataset['lat_bins'] != lat_bin] + return filtered_dataset, convex_hull_areas, convex_hulls + + + + + + + ########################## -##Discard Functions Below +##Discard Functions Below ########################## -# Extracts key information from filenames, +# Extracts key information from filenames, # which could include processed status, product ID, timestamps, identifiers, or metadata. 
def extract_file_params(file_path): - """ + ''' Example: input "processed_ATL06_20231115120000_20231115_001_12_data.h5" - Output: ('processed_', 'ATL06', '2023', '11', '15', '12', '00', '00', + Output: ('processed_', 'ATL06', '2023', '11', '15', '12', '00', '00', '2023', '11', '15', '001', '12', '_data') - """ + ''' # Defines a regex pattern to match strings in the file path - rx = re.compile( - r"(processed_)?(ATL\d{2})_(\d{4})(\d{2})(\d{2})(\d{2})" - r"(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})_(\d{3})_(\d{2})(.*?).h5$" - ) + rx = re.compile(r'(processed_)?(ATL\d{2})_(\d{4})(\d{2})(\d{2})(\d{2})' + r'(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})_(\d{3})_(\d{2})(.*?).h5$') # Searches for all matches of the regex pattern in the given params = rx.findall(file_path).pop() return params def create_mask(binned_data, sea_surface_height, seafloor_height, threshold_height): - """ + ''' create a mask for filtering sea surface, sea floor, and threshold_height - """ - mask = ( - (binned_data["height"] <= sea_surface_height) - & (binned_data["height"] >= seafloor_height) - & (binned_data["height"] >= threshold_height) - ) + ''' + mask = (binned_data['height'] <= sea_surface_height) & \ + (binned_data['height'] >= seafloor_height) & \ + (binned_data['height'] >= threshold_height) return mask -def generate_polygons_from_binned_data( - lat, height, mask, lat_interval, height_interval -): - """ +def generate_polygons_from_binned_data(lat, height, mask, lat_interval, height_interval): + ''' horizontal_vertical_bin_dataset Generate polygons for each bin after masking within a rectangle formed by height and latitude intervals - """ + ''' # Create latitude bins based on the specified interval lat_bins = np.arange(lat.min(), lat.max() + lat_interval, lat_interval) - + # Create height bins within the range [-12, 2] based on the specified interval height_bins = np.arange(-12, 2 + height_interval, height_interval) - + # Identify which bin each masked latitude and height value belongs to lat_bin_indices 
= np.digitize(lat[mask], lat_bins) - + # Determine which height bin each masked point belongs to height_bin_indices = np.digitize(height[mask], height_bins) - + # Combine latitude and height bin indices for each point - combined_bins = list(zip(lat_bin_indices, height_bin_indices, strict=False)) - - # Find unique bin combinations and count the number of points in each bin + combined_bins = list(zip(lat_bin_indices, height_bin_indices)) + + # Find unique bin combinations and count the number of points in each bin unique_bins, counts = np.unique(combined_bins, axis=0, return_counts=True) - - # Filter bins to include only those within valid index ranges - valid_bins = [ - (bin[0], bin[1]) - for bin in unique_bins - if 1 <= bin[0] < len(lat_bins) and 1 <= bin[1] < len(height_bins) - ] + + # Filter bins to include only those within valid index ranges + valid_bins = [(bin[0], bin[1]) for bin in unique_bins if 1 <= bin[0] < len(lat_bins) and 1 <= bin[1] < len(height_bins)] polygons = [] for bin_lat, bin_height in valid_bins: - if ( - bin_lat > 0 - and bin_lat < len(lat_bins) - and bin_height > 0 - and bin_height < len(height_bins) - ): - bin_points = np.array( - [ - (lat[mask][i], height[mask][i]) - for i in range(sum(mask)) - if lat_bin_indices[i] == bin_lat - and height_bin_indices[i] == bin_height - ] - ) + if bin_lat > 0 and bin_lat < len(lat_bins) and bin_height > 0 and bin_height < len(height_bins): + bin_points = np.array([(lat[mask][i], height[mask][i]) for i in range(sum(mask)) + if lat_bin_indices[i] == bin_lat and height_bin_indices[i] == bin_height]) if len(bin_points) > 2: hull = ConvexHull(bin_points) polygons.append(bin_points[hull.vertices]) return lat_bins, height_bins, valid_bins, polygons + + + diff --git a/aok/core/kd_utils/main.py b/aok/core/kd_utils/main.py index d5e9a7f..bf046f3 100644 --- a/aok/core/kd_utils/main.py +++ b/aok/core/kd_utils/main.py @@ -4,9 +4,10 @@ import re import sys -from config import get_args import pandas as pd +from config 
import get_args +from kd_utils.Kd_analysis import process_kd_calculation from kd_utils.bathy_processing import process_subsurface_photon_filtering from kd_utils.data_processing import ( Extract_sea_photons, @@ -15,36 +16,33 @@ filter_photon_dataset_by_hull_area, load_data, ) -from kd_utils.Kd_analysis import process_kd_calculation from kd_utils.sea_photons_analysis import process_sea_photon_binning -from kd_utils.visualization import plot_convex_hulls, plot_kd_photons +from kd_utils.visualization import plot_convex_hulls, plot_kd_photons, plot_photon_quality_flags -logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" -) + +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) PAIR_ID_MAP = { - "gt1l": "gt1_pair", - "gt1r": "gt1_pair", - "gt2l": "gt2_pair", - "gt2r": "gt2_pair", - "gt3l": "gt3_pair", - "gt3r": "gt3_pair", + 'gt1l': 'gt1_pair', + 'gt1r': 'gt1_pair', + 'gt2l': 'gt2_pair', + 'gt2r': 'gt2_pair', + 'gt3l': 'gt3_pair', + 'gt3r': 'gt3_pair', } def get_target_beams(all_beams, beam_attrs, target_beams_arg): strong_beams = [ - gtx - for gtx in all_beams - if beam_attrs[gtx]["atlas_beam_type"].decode("utf-8") == "strong" + gtx for gtx in all_beams + if beam_attrs[gtx]['atlas_beam_type'].decode('utf-8') == 'strong' ] if not target_beams_arg: - return strong_beams + return strong_beams[:1] # auto-select first strong beam - requested = [b.strip() for b in target_beams_arg.split(",") if b.strip()] + requested = [b.strip() for b in target_beams_arg.split(',') if b.strip()] return [b for b in requested if b in strong_beams] @@ -52,35 +50,46 @@ def optionally_combine_paired_beams_for_kd(dataset, horizontal_res, enabled=Fals """ Optionally combine left/right beams into pair groups before Kd fitting. Uses shared along-track bins from relative_AT_dist when available. 
+ + Flowchart step: 15 — Combine data from paired beams if appropriate (not + recommended in optically complex water). """ if (not enabled) or dataset.empty: return dataset combined = dataset.copy() - combined["source_beam_id"] = combined["beam_id"] - combined["beam_id"] = ( - combined["beam_id"].map(PAIR_ID_MAP).fillna(combined["beam_id"]) - ) + combined['source_beam_id'] = combined['beam_id'] + combined['beam_id'] = combined['beam_id'].map(PAIR_ID_MAP).fillna(combined['beam_id']) - if "relative_AT_dist" in combined.columns: - rel_m = pd.to_numeric(combined["relative_AT_dist"], errors="coerce") * 1000.0 - pair_bins = (rel_m / max(horizontal_res, 1)).astype("Int64") + if 'relative_AT_dist' in combined.columns: + rel_m = pd.to_numeric(combined['relative_AT_dist'], errors='coerce') * 1000.0 + pair_bins = (rel_m / max(horizontal_res, 1)).astype('Int64') pair_bins = pair_bins.fillna(-1).astype(int) - combined["lat_bins"] = pair_bins + combined['lat_bins'] = pair_bins return combined def run_pipeline(args): - atl03_h5_file_path = os.path.join( - args.workspace_path, args.atl03_path, args.atl03_file - ) - shoreline_data_path = os.path.join( - args.workspace_path, args.other_data_path, args.shoreline_data - ) - gebco_full_path = os.path.join( - args.workspace_path, args.other_data_path, args.gebco_path - ) + """ + Top-level pipeline orchestrator. Executes all 19 flowchart steps in sequence. + + Flowchart steps directly handled here: + 1 — Load ATL03 data (load_data / Extract_sea_photons). + 2 — Remove afterpulse/impulse-response photons (apply_optional_ir_ap_filter). + 3 — Bin sizes set via args.horizontal_res / args.vertical_res. + 4 — Build histograms (process_sea_photon_binning). + 5 — Solar background filter (apply_optional_solar_background_filter). + 6–14 — Subsurface filtering chain (process_subsurface_photon_filtering). + 15 — Paired beam combine (optionally_combine_paired_beams_for_kd). + 16–17 — Beer's Law Kd fit (process_kd_calculation). 
+ 18 — Save Kd CSV output (subsurface_photon_df_added_kd.to_csv). + 19 — Check data for reasonableness (filter_photon_dataset_by_hull_area + + plot_kd_photons). + """ + atl03_h5_file_path = os.path.join(args.workspace_path, args.atl03_path, args.atl03_file) + shoreline_data_path = os.path.join(args.workspace_path, args.other_data_path, args.shoreline_data) + gebco_full_path = os.path.join(args.workspace_path, args.other_data_path, args.gebco_path) atl24_file_path = args.atl24_file if atl24_file_path and (not os.path.isabs(atl24_file_path)): atl24_file_path = os.path.join(args.workspace_path, atl24_file_path) @@ -90,40 +99,72 @@ def run_pipeline(args): match = re.search(r"_(\d{14})_", atl03_h5_file_path) timestamp = match.group(1) if match else "unknown" + version_match = re.search(r"ATL03_\d{14}_\d{8}_(\d{3})_\d{2}", os.path.basename(atl03_h5_file_path)) + atl03_version = int(version_match.group(1)) if version_match else None + logger.info("ATL03 version detected: %s", f"{atl03_version:03d}" if atl03_version else "unknown") + + if args.enable_ir_ap_filter: + if atl03_version is None or atl03_version < 7: + logger.error( + "IR/AP filter requires ATL03 version 007 or later. " + "Detected version: %s. " + "Skipping IR/AP filter for this run. " + "Switch to a version 007 file (e.g., Wax Delta dataset) to enable this step.", + f"{atl03_version:03d}" if atl03_version else "unknown", + ) + args.enable_ir_ap_filter = False + gebco_pattern = os.path.join(gebco_full_path, "gebco_*.tif") gebco_file_path_lists = [p for p in glob.glob(gebco_pattern)] is2_mds, is2_attrs, is2_beams = load_data(atl03_h5_file_path, False) target_strong_beams = get_target_beams(is2_beams, is2_attrs, args.target_beams) if not target_strong_beams: - logger.error( - "No target strong beams found. Check input file and --target_beams." - ) + logger.error("No target strong beams found. 
Check input file and --target_beams.") sys.exit(1) logger.info("Strong beams selected: %s", target_strong_beams) plot_target_beam = [target_strong_beams[0]] - sea_photon_dataset = Extract_sea_photons( - is2_mds, target_strong_beams, shoreline_data_path - ) + sea_photon_dataset = Extract_sea_photons(is2_mds, target_strong_beams, shoreline_data_path) + + if args.enable_ir_ap_filter and not args.no_plot: + plot_photon_quality_flags(output_path, timestamp, sea_photon_dataset, plot_target_beam) + sea_photon_dataset = apply_optional_ir_ap_filter( sea_photon_dataset, enabled=args.enable_ir_ap_filter, quality_max=args.ir_ap_quality_max, min_signal_conf=args.ir_ap_min_signal_conf, ) + + # Step 5 — Solar background filter is ON by default. + # It self-gates on solar_elevation so has zero effect on nighttime passes. + # Use --disable_solar_background_filter to turn it off. + solar_bg_enabled = not args.disable_solar_background_filter + if not solar_bg_enabled: + if 'solar_elevation' in sea_photon_dataset.columns: + max_solar_elev = sea_photon_dataset['solar_elevation'].max() + if max_solar_elev > args.solar_elevation_day_threshold: + logger.warning( + "Solar background filter is DISABLED via --disable_solar_background_filter " + "but this granule is a daytime pass " + "(max solar elevation = %.1f°, daytime threshold = %.1f°). 
" + "This filter is recommended for daytime passes.", + max_solar_elev, + args.solar_elevation_day_threshold, + ) + sea_photon_dataset = apply_optional_solar_background_filter( sea_photon_dataset, - enabled=args.enable_solar_background_filter, + enabled=solar_bg_enabled, day_threshold=args.solar_elevation_day_threshold, - background_quantile=args.solar_background_quantile, + median_window_deg=args.solar_bg_median_window_deg, + noise_multiplier=args.solar_bg_noise_multiplier, min_signal_conf=args.solar_background_min_signal_conf, ) binned_dataset_sea_surface = process_sea_photon_binning( - sea_photon_dataset, - horizontal_res=args.horizontal_res, - vertical_res=args.vertical_res, + sea_photon_dataset, horizontal_res=args.horizontal_res, vertical_res=args.vertical_res ) post_refraction_refit_enabled = args.enable_post_refraction_refit @@ -134,56 +175,50 @@ def run_pipeline(args): ) post_refraction_refit_enabled = False - ( - sea_surface_height, - sea_surface_label, - filtered_seafloor_subsurface_photon_dataset, - ) = process_subsurface_photon_filtering( - binned_dataset_sea_surface, - gebco_file_path_lists, - args.subsurface_thresh, - args.ignore_subsurface_height_thres, - use_atl24_filter=args.enable_atl24_filter, - atl24_file_path=atl24_file_path, - atl24_max_match_distance_deg=args.atl24_max_match_distance_deg, - use_gebco_filter=args.enable_gebco_filter, - apply_histogram_quality_filter=args.enable_histogram_quality_filter, - histogram_quality_min_ratio=args.histogram_quality_min_ratio, - histogram_quality_depth_min=args.histogram_quality_depth_min, - histogram_quality_depth_max=args.histogram_quality_depth_max, - apply_surface_sigma_filter=args.enable_surface_sigma_filter, - surface_sigma_max=args.surface_sigma_max, - apply_refraction_correction=args.enable_refraction_correction, - refraction_water_temp_c=args.refraction_water_temp_c, - refraction_wavelength_nm=args.refraction_wavelength_nm, - apply_post_refraction_refit=post_refraction_refit_enabled, - 
apply_flattening=args.enable_sea_surface_flattening, - flattening_window_m=args.sea_surface_flattening_window_m, - horizontal_res=args.horizontal_res, - vertical_res=args.vertical_res, - ) + sea_surface_height, sea_surface_label, filtered_seafloor_subsurface_photon_dataset = \ + process_subsurface_photon_filtering( + binned_dataset_sea_surface, + gebco_file_path_lists, + args.subsurface_thresh, + args.ignore_subsurface_height_thres, + use_atl24_filter=args.enable_atl24_filter, + atl24_file_path=atl24_file_path, + atl24_max_match_distance_deg=args.atl24_max_match_distance_deg, + use_gebco_filter=args.enable_gebco_filter, + apply_histogram_quality_filter=args.enable_histogram_quality_filter, + histogram_quality_min_ratio=args.histogram_quality_min_ratio, + histogram_quality_depth_min=args.histogram_quality_depth_min, + histogram_quality_depth_max=args.histogram_quality_depth_max, + histogram_quality_ref_depth_min=args.histogram_quality_ref_depth_min, + histogram_quality_ref_depth_max=args.histogram_quality_ref_depth_max, + apply_surface_sigma_filter=args.enable_surface_sigma_filter, + surface_sigma_max=args.surface_sigma_max, + apply_refraction_correction=args.enable_refraction_correction, + refraction_water_temp_c=args.refraction_water_temp_c, + refraction_wavelength_nm=args.refraction_wavelength_nm, + apply_post_refraction_refit=post_refraction_refit_enabled, + apply_flattening=args.enable_sea_surface_flattening, + flattening_window_m=args.sea_surface_flattening_window_m, + horizontal_res=args.horizontal_res, + vertical_res=args.vertical_res, + ) - final_filtered_subsurface_photon_dataset = ( - filtered_seafloor_subsurface_photon_dataset[ - filtered_seafloor_subsurface_photon_dataset["beam_id"].isin( - target_strong_beams - ) - ].copy() - ) + final_filtered_subsurface_photon_dataset = filtered_seafloor_subsurface_photon_dataset[ + filtered_seafloor_subsurface_photon_dataset['beam_id'].isin(target_strong_beams) + ].copy() if args.enable_convex_hull_filter: - 
final_filtered_subsurface_photon_dataset, convex_hull_areas, convex_hulls = ( + final_filtered_subsurface_photon_dataset, convex_hull_areas, convex_hulls = \ filter_photon_dataset_by_hull_area( final_filtered_subsurface_photon_dataset, - hull_area_threshold=args.convex_hull_area_threshold, + hull_area_threshold=args.convex_hull_area_threshold ) - ) - if plot_target_beam: + if plot_target_beam and not args.no_plot: plot_convex_hulls( final_filtered_subsurface_photon_dataset, plot_target_beam, convex_hulls, - convex_hull_areas, + convex_hull_areas ) subsurface_output_path = os.path.join( @@ -195,50 +230,56 @@ def run_pipeline(args): kd_input_dataset = optionally_combine_paired_beams_for_kd( final_filtered_subsurface_photon_dataset, horizontal_res=args.horizontal_res, - enabled=args.enable_paired_beam_combine, + enabled=args.enable_paired_beam_combine ) - subsurface_photon_df_added_kd = process_kd_calculation(kd_input_dataset) - kd_output_path = os.path.join( - output_path, f"{timestamp}_AddedKdDataset_strongBeams_Further.csv" + wave_mult = args.wave_exclusion_multiplier if args.enable_wave_adaptive_fit else 0.0 + subsurface_photon_df_added_kd = process_kd_calculation( + kd_input_dataset, decay_zone_threshold=args.decay_zone_threshold, + kd_fit_method=args.kd_fit_method, + wave_exclusion_multiplier=wave_mult, + wave_sigma_calm_threshold=args.wave_sigma_calm_threshold, ) + kd_output_path = os.path.join(output_path, f"{timestamp}_AddedKdDataset_strongBeams_Further.csv") subsurface_photon_df_added_kd.to_csv(kd_output_path, index=False) - if "relative_AT_dist" in kd_input_dataset.columns: + if not args.no_plot and 'relative_AT_dist' in kd_input_dataset.columns: unique_photon_dataset = kd_input_dataset[ - ["relative_AT_dist", "lat_bins", "photon_height"] + ['relative_AT_dist', 'lat_bins', 'photon_height'] ].drop_duplicates() - unique_photon_dataset["relative_AT_dist_center"] = ( - unique_photon_dataset.groupby( - "lat_bins", observed=False - 
)["relative_AT_dist"].transform("mean") - ) - - def find_closest(group): - center = group["relative_AT_dist_center"].iloc[0] - group = group.copy() - group["dist_to_center"] = abs(group["relative_AT_dist"] - center) - return group.loc[[group["dist_to_center"].idxmin()]] - - closest_to_center = unique_photon_dataset.groupby( - "lat_bins", observed=False - ).apply(find_closest, include_groups=True) - closest_to_center.index = closest_to_center.index.droplevel(0) - kd_df_merged_distance = closest_to_center.merge( - subsurface_photon_df_added_kd, on="lat_bins", how="left" - ).drop(columns=["relative_AT_dist_center", "dist_to_center"], errors="ignore") - - plot_kd_photons( - output_path, - timestamp, - plot_target_beam, - kd_input_dataset, - kd_df_merged_distance, - ) + unique_photon_dataset['relative_AT_dist_center'] = unique_photon_dataset.groupby( + 'lat_bins', observed=False + )['relative_AT_dist'].transform('mean') + + _closest_rows = [] + for _lat_bin, _group in unique_photon_dataset.groupby('lat_bins', observed=False): + if _group.empty: + continue + _center = _group['relative_AT_dist_center'].iloc[0] + _group = _group.copy() + _group['dist_to_center'] = abs(_group['relative_AT_dist'] - _center) + _closest_rows.append(_group.loc[[_group['dist_to_center'].idxmin()]]) + if not _closest_rows: + logger.warning("No along-track bins survived filtering. Skipping plot.") + else: + closest_to_center = pd.concat(_closest_rows).reset_index(drop=True) + kd_df_merged_distance = closest_to_center.merge( + subsurface_photon_df_added_kd, + on='lat_bins', + how='left' + ).drop(columns=['relative_AT_dist_center', 'dist_to_center'], errors='ignore') + + plot_kd_photons( + output_path, + timestamp, + plot_target_beam, + kd_input_dataset, + kd_df_merged_distance, + ) logger.info("SUCCESS! 
Kd output: %s", kd_output_path) -if __name__ == "__main__": +if __name__ == '__main__': cli_args = get_args() try: run_pipeline(cli_args) diff --git a/aok/core/kd_utils/sea_photons_analysis.py b/aok/core/kd_utils/sea_photons_analysis.py index 8d06e49..729a22c 100644 --- a/aok/core/kd_utils/sea_photons_analysis.py +++ b/aok/core/kd_utils/sea_photons_analysis.py @@ -1,85 +1,82 @@ -from datetime import datetime -import netCDF4 import numpy as np +import geopandas as gpd +from datetime import datetime import pandas as pd +import netCDF4 from scipy.signal import find_peaks from scipy.stats import norm - # Function to apply binning beam-by-beam by calling the function of horizontal_vertical_bin_dataset def process_sea_photon_binning(sea_photon_dataset, horizontal_res, vertical_res): + """ + Apply horizontal and vertical binning to each beam. + + Flowchart steps: + 3 — Choose along-track (x) and vertical (z) bin sizes + (e.g., 500 m in x, 0.25 m in z). + 4 — Build vertical histograms of photon counts using selected x, z bin sizes. 
+ """ # Initialize list to store results from each beam binned_beam_datasets = [] # Group the dataset by 'beam_id' and process each group separately - for beam_id, beam_data in sea_photon_dataset.groupby("beam_id"): - print(f"Processing binning for beam: {beam_id}") + for beam_id, beam_data in sea_photon_dataset.groupby('beam_id'): + print(f'Processing binning for beam: {beam_id}') # Apply binning to the current beam dataset - binned_beam_data = horizontal_vertical_bin_dataset( - beam_data, horizontal_res, vertical_res - ) + binned_beam_data = horizontal_vertical_bin_dataset(beam_data, horizontal_res, vertical_res) # Append the binned data for the current beam to the list binned_beam_datasets.append(binned_beam_data) # Combine all binned beam datasets into a single DataFrame binned_dataset_sea_surface = pd.concat(binned_beam_datasets, ignore_index=True) - + return binned_dataset_sea_surface # Bin data along vertical and horizontal scales def horizontal_vertical_bin_dataset(dataset, lat_res, vertical_res): - """Bin data along vertical and horizontal scales - for later segmentation""" - + """ + Bin data along vertical and horizontal scales for later segmentation. + + Flowchart steps: + 3 — Choose along-track (x) and vertical (z) bin sizes + (e.g., 500 m in x, 0.25 m in z). + 4 — Build vertical histograms of photon counts using selected x, z bin sizes. + 10 — Re-build histograms based on corrected depths (called again after + refraction correction when apply_post_refraction_refit is enabled). 
+ """ + # Filter values within the range (-50, 10), because photons elevation outside this range will be real noise valid_range = (-70, 5) - valid_mask = (dataset["photon_height"] > valid_range[0]) & ( - dataset["photon_height"] < valid_range[1] - ) + valid_mask = (dataset['photon_height'] > valid_range[0]) & (dataset['photon_height'] < valid_range[1]) # Apply the valid_mask to filter unwanted values # and create a copy to avoid SettingWithCopyWarning filtered_dataset = dataset[valid_mask].copy() # Calculate the number of height bins - height_range = abs( - filtered_dataset["photon_height"].max() - - filtered_dataset["photon_height"].min() - ) - height_bin_number = max( - 1, round(height_range / vertical_res) - ) # Ensure at least one bin + height_range = abs(filtered_dataset['photon_height'].max() - filtered_dataset['photon_height'].min()) + height_bin_number = max(1, round(height_range / vertical_res)) # Ensure at least one bin # Calculate the number of latitude bins - lat_range = abs(filtered_dataset["lat"].max() - filtered_dataset["lat"].min()) + lat_range = abs(filtered_dataset['lat'].max() - filtered_dataset['lat'].min()) lat_bin_number = max(1, round(lat_range / lat_res)) # Ensure at least one bin # Create bins for latitude - lat_bins = pd.cut( - filtered_dataset["lat"], bins=lat_bin_number, labels=np.arange(lat_bin_number) - ) + lat_bins = pd.cut(filtered_dataset['lat'], bins=lat_bin_number, labels=np.arange(lat_bin_number)) # Create bins for height - height_bins = pd.cut( - filtered_dataset["photon_height"], - bins=height_bin_number, - labels=np.round( - np.linspace( - filtered_dataset["photon_height"].min(), - filtered_dataset["photon_height"].max(), - num=height_bin_number, - ), - decimals=1, - ), - ) + height_bins = pd.cut(filtered_dataset['photon_height'], bins=height_bin_number, + labels=np.round(np.linspace(filtered_dataset['photon_height'].min(), + filtered_dataset['photon_height'].max(), + num=height_bin_number), decimals=1)) # Add bins to 
dataframe using .loc to avoid SettingWithCopyWarning - filtered_dataset.loc[:, "lat_bins"] = lat_bins - filtered_dataset.loc[:, "height_bins"] = height_bins + filtered_dataset.loc[:, 'lat_bins'] = lat_bins + filtered_dataset.loc[:, 'height_bins'] = height_bins filtered_dataset = filtered_dataset.reset_index(drop=True) return filtered_dataset @@ -87,7 +84,10 @@ def horizontal_vertical_bin_dataset(dataset, lat_res, vertical_res): def get_sea_surface_height_static(binned_data, threshold): """ - Calculate sea surface height and filter subsurface photons using adaptive detection. + Calculate sea surface height and filter subsurface photons using static threshold. + + Flowchart step: 7 — Fit Gaussian curve to identify surface elevation; compute + standard deviation of Gaussian peak. Parameters: binned_data (pandas.DataFrame): Binned photon data with lat, height_bins, photon_height @@ -98,66 +98,61 @@ def get_sea_surface_height_static(binned_data, threshold): sea_surface_dominated_label (array): Labels for surface-dominated bins PhotonDFBelowSurface (pandas.DataFrame): Photons below detected surface """ - - # set flag for the df save + + #set flag for the df save firstTimeIndex = True - + # Create sea height list sea_surface_height = [] mean_lat_bins_seq = [] sea_surface_subsurface_photons_ratio = [] + # Group dataset along horizental (latitude bins) - grouped_data = binned_data.groupby(["lat_bins"], group_keys=True, observed=False) + grouped_data = binned_data.groupby(['lat_bins'], group_keys=True, observed=False) data_groups = dict(list(grouped_data)) # Loop through groups to detect sea surface for k, v in data_groups.items(): # based on lat_utm - lat_bin_average = v["lat"].mean() + lat_bin_average = v['lat'].mean() # Create new dataframe based on occurrence of photons per height bin - new_df = pd.DataFrame(v.groupby(["height_bins"], observed=False).count()) + new_df = pd.DataFrame(v.groupby(['height_bins'], observed=False).count()) # Check if new_df is not empty before 
finding the bin with the highest photon count - if not new_df.empty: + if not new_df.empty: # Find the vertical bin with the highest photon count - largest_h_bin = new_df["lat"].argmax() - + largest_h_bin = new_df['lat'].argmax() + # Select the index of the bin with the highest count largest_h_index = new_df.index[largest_h_bin] - + # Calculate the median value of all photon height values within this bin - photons_sea_surface = v.loc[ - v["height_bins"] == largest_h_index, "photon_height" - ] + photons_sea_surface = v.loc[v['height_bins'] == largest_h_index, 'photon_height'] lat_bin_sea_median = photons_sea_surface.median() - + # Append to sea height list sea_surface_height.append(lat_bin_sea_median) mean_lat_bins_seq.append(lat_bin_average) del new_df - + # Get all photons below sea surface # to determine segment type of each subsurface water column # Use calculated sea height to determine photons at 0.5m below peak - photons_sea_surface_up = v.loc[ - (v["photon_height"] > (lat_bin_sea_median - threshold)) - & (v["photon_height"] < (lat_bin_sea_median + 2 * threshold)) - ] - + photons_sea_surface_up = \ + v.loc[(v['photon_height'] > (lat_bin_sea_median - threshold)) & + (v['photon_height'] < (lat_bin_sea_median + 2*threshold))] + # Calculate the photon ratio between surface and whole photons - if v["photon_height"].shape[0] > 0: - new_photons_ratio_sea_surface = ( - photons_sea_surface_up.shape[0] / v["photon_height"].shape[0] - ) + if v['photon_height'].shape[0] > 0: + new_photons_ratio_sea_surface = \ + photons_sea_surface_up.shape[0] / v['photon_height'].shape[0] else: new_photons_ratio_sea_surface = np.nan - - sea_surface_subsurface_photons_ratio.append( - 1 - new_photons_ratio_sea_surface - ) - + + sea_surface_subsurface_photons_ratio.append((1 - new_photons_ratio_sea_surface)) + else: # Append NaNs if the group is empty sea_surface_height.append(np.nan) @@ -168,50 +163,53 @@ def get_sea_surface_height_static(binned_data, threshold): mean = 
np.nanmean(sea_surface_height, axis=0) sd = np.nanstd(sea_surface_height, axis=0) - final_sea_surface_height = np.where( - (sea_surface_height > (mean + 2 * sd)) | (sea_surface_height < (mean - 2 * sd)), - np.nan, - sea_surface_height, - ).tolist() - - sea_surface_height_abnormal_label = np.where( - np.isnan(final_sea_surface_height), 0, 1 - ) + final_sea_surface_height = np.where((sea_surface_height > (mean + 2 * sd)) | + (sea_surface_height < (mean - 2 * sd)), + np.nan, + sea_surface_height).tolist() + + sea_surface_height_abnormal_label = np.where(np.isnan(final_sea_surface_height), 0, 1) # Determine label based on ratio of sea surface photons and subsurface photons - sea_surface_dominated_label = np.where( - np.array(sea_surface_subsurface_photons_ratio) >= 0.2, 0, 1 - ) - + sea_surface_dominated_label = \ + np.where(np.array(sea_surface_subsurface_photons_ratio) >= 0.2, 0, 1) + # Loop through groups again and return photons below 0.5m of sea height PhotonDFBelowThresholdPeak = pd.DataFrame() for i, (k, v) in enumerate(data_groups.items()): + # Get all values below this bin if not np.isnan(final_sea_surface_height[i]): - NewPhotonDFBelowThresholdPeak = v.loc[ - v["photon_height"] < (final_sea_surface_height[i] - threshold) - ] - + NewPhotonDFBelowThresholdPeak = \ + v.loc[v['photon_height'] < (final_sea_surface_height[i] - threshold)] + if firstTimeIndex: PhotonDFBelowThresholdPeak = NewPhotonDFBelowThresholdPeak - firstTimeIndex = False - else: - PhotonDFBelowThresholdPeak = pd.concat( - [PhotonDFBelowThresholdPeak, NewPhotonDFBelowThresholdPeak] - ) - - return ( - final_sea_surface_height, - sea_surface_height_abnormal_label, - sea_surface_dominated_label, - PhotonDFBelowThresholdPeak, - ) + firstTimeIndex=False + else: + PhotonDFBelowThresholdPeak=\ + pd.concat([PhotonDFBelowThresholdPeak, NewPhotonDFBelowThresholdPeak]) + + + return final_sea_surface_height, \ + sea_surface_height_abnormal_label, \ + sea_surface_dominated_label,\ + 
PhotonDFBelowThresholdPeak def get_sea_surface_height_adaptive(binned_data): """ Calculate sea surface height and filter subsurface photons with enhanced wave removal. + Flowchart steps: + 7 — Fit Gaussian curve to identify surface elevation; compute standard + deviation of Gaussian peak. + 11 — Re-fit Gaussian curve to surface to identify surface peak; compute + standard deviation and remove histogram data within three standard + deviations. (This function is called a second time after refraction + correction and histogram rebuild when apply_post_refraction_refit + is enabled.) + Parameters: binned_data (pandas.DataFrame): Binned photon data with lat, height_bins, photon_height @@ -221,18 +219,18 @@ def get_sea_surface_height_adaptive(binned_data): sea_surface_dominated_label (array): Labels for surface-dominated bins PhotonDFBelowSurface (pandas.DataFrame): Photons below detected surface with waves removed """ - + firstTimeIndex = True sea_surface_height = [] mean_lat_bins_seq = [] sea_surface_subsurface_photons_ratio = [] - grouped_data = binned_data.groupby(["lat_bins"], group_keys=True, observed=False) + grouped_data = binned_data.groupby(['lat_bins'], group_keys=True, observed=False) data_groups = dict(list(grouped_data)) for k, v in data_groups.items(): - lat_bin_average = v["lat"].mean() - + lat_bin_average = v['lat'].mean() + if len(v) < 20: # Increase minimum photon count for robustness sea_surface_height.append(np.nan) mean_lat_bins_seq.append(np.nan) @@ -240,22 +238,18 @@ def get_sea_surface_height_adaptive(binned_data): continue # Finer histogram for better peak resolution - hist, bin_edges = np.histogram( - v["photon_height"], - bins=100, # Finer bins - density=True, - range=(v["photon_height"].min(), v["photon_height"].max()), - ) + hist, bin_edges = np.histogram(v['photon_height'], + bins=100, # Finer bins + density=True, + range=(v['photon_height'].min(), v['photon_height'].max())) bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2 # Enhanced peak 
detection - peaks, properties = find_peaks( - hist, - height=np.max(hist) * 0.3, # Stricter peak height - distance=10, # Wider separation - prominence=np.max(hist) * 0.1, - ) # Require prominent peaks - + peaks, properties = find_peaks(hist, + height=np.max(hist)*0.3, # Stricter peak height + distance=10, # Wider separation + prominence=np.max(hist)*0.1) # Require prominent peaks + if len(peaks) == 0: sea_surface_height.append(np.nan) mean_lat_bins_seq.append(np.nan) @@ -266,26 +260,28 @@ def get_sea_surface_height_adaptive(binned_data): surface_peak_idx = peaks[np.argmax(hist[peaks])] surface_height = bin_centers[surface_peak_idx] - # Wider window for Gaussian fit to capture wave effects - peak_data = v[ - (v["photon_height"] > surface_height - 1.0) - & (v["photon_height"] < surface_height + 1.0) - ] + # Two-pass Gaussian fit: start with ±1 m, widen if sigma is large + # (the ±1 m window underestimates sigma for SWH > 2 m) + half_win = 1.0 + peak_data = v[(v['photon_height'] > surface_height - half_win) & + (v['photon_height'] < surface_height + half_win)] if len(peak_data) > 10: - mu, sigma = norm.fit(peak_data["photon_height"]) + mu, sigma = norm.fit(peak_data['photon_height']) + # Refit with wider window if sigma suggests truncation + if sigma > 0.4: + half_win2 = min(2.5 * sigma, 3.0) + peak_data2 = v[(v['photon_height'] > mu - half_win2) & + (v['photon_height'] < mu + half_win2)] + if len(peak_data2) > 10: + mu, sigma = norm.fit(peak_data2['photon_height']) else: mu, sigma = surface_height, 0.2 # More conservative default - # Stricter threshold: 3σ below mean, plus minimum depth - # Waves can extend beyond 2.5σ, especially in rough conditions. 
- # 3σ is a common statistical cutoff for excluding outliers - adaptive_threshold = min(mu - 3.0 * sigma, mu - 1.0) # Ensure at least 1m below + adaptive_threshold = min(mu - 3.0 * sigma, mu - 1.0) # at least 1 m below # Extended surface layer to remove wave effects - surface_photons = v[ - (v["photon_height"] > adaptive_threshold) - & (v["photon_height"] < mu + 3.0 * sigma) - ] # Wider upper bound + surface_photons = v[(v['photon_height'] > adaptive_threshold) & + (v['photon_height'] < mu + 3.0 * sigma)] # Wider upper bound ratio = len(surface_photons) / len(v) if len(v) > 0 else np.nan sea_surface_height.append(mu) @@ -295,51 +291,45 @@ def get_sea_surface_height_adaptive(binned_data): # Outlier filtering mean = np.nanmean(sea_surface_height) sd = np.nanstd(sea_surface_height) - final_sea_surface_height = np.where( - (sea_surface_height > mean + 2 * sd) | (sea_surface_height < mean - 2 * sd), - np.nan, - sea_surface_height, - ).tolist() - - sea_surface_height_abnormal_label = np.where( - np.isnan(final_sea_surface_height), 0, 1 - ) - sea_surface_dominated_label = np.where( - np.array(sea_surface_subsurface_photons_ratio) >= 0.2, 0, 1 - ) + final_sea_surface_height = np.where((sea_surface_height > mean + 2*sd) | + (sea_surface_height < mean - 2*sd), + np.nan, + sea_surface_height).tolist() + + sea_surface_height_abnormal_label = np.where(np.isnan(final_sea_surface_height), 0, 1) + sea_surface_dominated_label = np.where(np.array(sea_surface_subsurface_photons_ratio) >= 0.2, 0, 1) # Filter subsurface photons with diagnostics PhotonDFBelowSurface = pd.DataFrame() for i, (k, v) in enumerate(data_groups.items()): if not np.isnan(final_sea_surface_height[i]): - peak_data = v[ - (v["photon_height"] > final_sea_surface_height[i] - 1.0) - & (v["photon_height"] < final_sea_surface_height[i] + 1.0) - ] + half_win = 1.0 + peak_data = v[(v['photon_height'] > final_sea_surface_height[i] - half_win) & + (v['photon_height'] < final_sea_surface_height[i] + half_win)] if 
len(peak_data) > 10: - mu, sigma = norm.fit(peak_data["photon_height"]) + mu, sigma = norm.fit(peak_data['photon_height']) + if sigma > 0.4: + half_win2 = min(2.5 * sigma, 3.0) + peak_data2 = v[(v['photon_height'] > mu - half_win2) & + (v['photon_height'] < mu + half_win2)] + if len(peak_data2) > 10: + mu, sigma = norm.fit(peak_data2['photon_height']) adaptive_threshold = min(mu - 3.0 * sigma, mu - 1.0) else: - adaptive_threshold = ( - final_sea_surface_height[i] - 1.0 - ) # Stricter default - - NewPhotonDFBelowSurface = v[v["photon_height"] < adaptive_threshold] + adaptive_threshold = final_sea_surface_height[i] - 1.0 + NewPhotonDFBelowSurface = v[v['photon_height'] < adaptive_threshold] + if firstTimeIndex: PhotonDFBelowSurface = NewPhotonDFBelowSurface firstTimeIndex = False else: - PhotonDFBelowSurface = pd.concat( - [PhotonDFBelowSurface, NewPhotonDFBelowSurface] - ) + PhotonDFBelowSurface = pd.concat([PhotonDFBelowSurface, NewPhotonDFBelowSurface]) - return ( - final_sea_surface_height, - sea_surface_height_abnormal_label, - sea_surface_dominated_label, - PhotonDFBelowSurface, - ) + return (final_sea_surface_height, + sea_surface_height_abnormal_label, + sea_surface_dominated_label, + PhotonDFBelowSurface) def get_water_temp(date_year, date_month, date_day, latitude, longitude): @@ -359,7 +349,7 @@ def get_water_temp(date_year, date_month, date_day, latitude, longitude): month = date_month # date[6:8] day = date_day - day_of_year = str(datetime.strptime(date, "%Y%m%d").timetuple().tm_yday) + day_of_year = str(datetime.strptime(date, '%Y%m%d').timetuple().tm_yday) # Add zero in front of day of year string zero_day_of_year = day_of_year.zfill(3) @@ -370,11 +360,8 @@ def get_water_temp(date_year, date_month, date_day, latitude, longitude): new_lat_min = 0 new_lat_max = 17998 - new_lat = round( - ((old_lat - old_lat_min) / (old_lat_max - old_lat_min)) - * (new_lat_max - new_lat_min) - + new_lat_min - ) + new_lat = round(((old_lat - old_lat_min) / (old_lat_max - 
old_lat_min)) * + (new_lat_max - new_lat_min) + new_lat_min) # Calculate ratio of longitude from mid-point of IS2 track old_lon = longitude.mean() @@ -383,41 +370,24 @@ def get_water_temp(date_year, date_month, date_day, latitude, longitude): new_lon_min = 0 new_lon_max = 35999 - new_lon = round( - ((old_lon - old_lon_min) / (old_lon_max - old_lon_min)) - * (new_lon_max - new_lon_min) - + new_lon_min - ) + new_lon = round(((old_lon - old_lon_min) / (old_lon_max - old_lon_min)) * + (new_lon_max - new_lon_min) + new_lon_min) # Access the SST data using the JPL OpenDap interface - url = ( - "https://opendap.jpl.nasa.gov/opendap/OceanTemperature/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/" - + str(year) - + "/" - + str(zero_day_of_year) - + "/" - + str(date) - + "090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc" - ) + url = 'https://opendap.jpl.nasa.gov/opendap/OceanTemperature/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/' \ + + str(year) + '/' + str(zero_day_of_year) + '/' + str(date) \ + + '090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc' dataset = netCDF4.Dataset(url) # Access the data and convert the temperature from K to C - water_temp = dataset["analysed_sst"][0, new_lat, new_lon] - 273.15 + water_temp = dataset['analysed_sst'][0, new_lat, new_lon] - 273.15 return water_temp -def refraction_correction( - WTemp, - WSmodel, - Wavelength, - Photon_ref_elev, - Ph_ref_azimuth, - PhotonZ, - PhotonX, - PhotonY, - Ph_Conf, -): +def refraction_correction(WTemp, WSmodel, Wavelength, + Photon_ref_elev, Ph_ref_azimuth, + PhotonZ, PhotonX, PhotonY, Ph_Conf): """ WTemp; there is python library that pulls water temp data WSmodel is the value surface height @@ -447,18 +417,18 @@ def refraction_correction( n1 = 1.00029 # refractive index of water - n2 = (a * WaterTemp**2) + (b * wl**2) + (c * WaterTemp) + (d * wl) + e + n2 = (a * WaterTemp ** 2) + (b * wl ** 2) + (c * WaterTemp) + (d * wl) + e # assumption is 0.25416 # This example is refractionCoef = 0.25449 # 1.00029 is 
refraction of air constant - correction_coef = 1 - (n1 / n2) + correction_coef = (1 - (n1 / n2)) # read photon ref_elev to get theta1 theta1 = np.pi / 2 - Photon_ref_elev # eq 1. Theta2 - theta2 = np.arcsin((n1 * np.sin(theta1)) / n2) + theta2 = np.arcsin(((n1 * np.sin(theta1)) / n2)) # eq 3. S # Approximate water Surface = 1.5 @@ -477,7 +447,7 @@ def refraction_correction( phi = theta1 - theta2 # P is the difference between raw and corrected YZ location - P = np.sqrt(R**2 + S**2 - 2 * R * S * np.cos(phi)) + P = np.sqrt(R ** 2 + S ** 2 - 2 * R * S * np.cos(phi)) # alpha is an angle needed alpha = np.arcsin((R * np.sin(phi)) / P) @@ -501,27 +471,19 @@ def refraction_correction( outY = PhotonY + DN outZ = PhotonZ + DZ - """ + ''' print('For selected Bathy photon:') print('lat = ', PhotonY[9000]) print('long = ', PhotonX[9000]) print('Raw Depth = ', PhotonZ[9000]) print('D = ', D[9000]) - + print('ref_elev = ', Photon_ref_elev[9000]) - + print('Delta East = ', DE[9000]) print('Delta North = ', DN[9000]) print('Delta Z = ', DZ[9000]) - """ - return ( - outX, - outY, - outZ, - Ph_Conf, - PhotonX, - PhotonY, - PhotonZ, - Ph_ref_azimuth, - Photon_ref_elev, - ) # We are most interested in out-x, out-y, out-z + ''' + return (outX, outY, outZ, Ph_Conf, PhotonX, PhotonY, PhotonZ, Ph_ref_azimuth, + Photon_ref_elev) # We are most interested in out-x, out-y, out-z + diff --git a/aok/core/kd_utils/visualization.py b/aok/core/kd_utils/visualization.py index 29c39b0..8eee8b8 100644 --- a/aok/core/kd_utils/visualization.py +++ b/aok/core/kd_utils/visualization.py @@ -1,103 +1,66 @@ # utils/visualization.py +import numpy as np import os +import pandas as pd import time - import geopandas as gpd +from pyproj import Transformer, Proj import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -from pyproj import Transformer from scipy.spatial import ConvexHull - def plot_photon_height(sea_photon_dataset, hlims=[-25, 10]): fig, ax = plt.subplots() - ax.scatter( - 
sea_photon_dataset["latitude"], - sea_photon_dataset["photon_height"], - s=1, - c="k", - alpha=0.15, - edgecolors="none", - label="ATL03 Photons", - ) - ax.set_xlabel("Relative AT Distance") - ax.set_ylabel("Height (h_ph)") - ax.set_title("Scatter Plot of ATL03 Photons") + ax.scatter(sea_photon_dataset['latitude'], sea_photon_dataset['photon_height'], s=1, c='k', alpha=0.15, edgecolors='none', label='ATL03 Photons') + ax.set_xlabel('Relative AT Distance') + ax.set_ylabel('Height (h_ph)') + ax.set_title('Scatter Plot of ATL03 Photons') ax.set_ylim(hlims) ax.legend() plt.show() - -def plot_filtered_seafloor_photons( - filtered_seafloor_subsurface_dataset, - sea_photon_dataset, - sea_surface_height, - output_path, -): +def plot_filtered_seafloor_photons(filtered_seafloor_subsurface_dataset, sea_photon_dataset, sea_surface_height, output_path): """ Plots the filtered seafloor photon data along with sea surface and seafloor elevation data. Parameters: - filtered_seafloor_subsurface_dataset: DataFrame containing the filtered subsurface photon data. - - sea_photon_dataset: DataFrame containing the original sea photon data. + - sea_photon_dataset: DataFrame containing the original sea photon data. - sea_surface_height: Array of sea surface height values. - output_path: Path to save the output plot. 
""" - + # get sea_surface_x_axis_bins - sea_surface_x_axis_bins = np.linspace( - filtered_seafloor_subsurface_dataset["relative_AT_dist"].min(), - filtered_seafloor_subsurface_dataset["relative_AT_dist"].max(), - len(sea_surface_height), - ) - + sea_surface_x_axis_bins = np.linspace(filtered_seafloor_subsurface_dataset['relative_AT_dist'].min(), + filtered_seafloor_subsurface_dataset['relative_AT_dist'].max(), + len(sea_surface_height)) + hlims = [-25, 10] fig, ax = plt.subplots() # Scatter plot of subsurface photons - ax.scatter( - sea_photon_dataset["relative_AT_dist"], - sea_photon_dataset["photon_height"], - s=1, - c="k", - alpha=0.15, - edgecolors="none", - label="Subsurface ATL03 Photons", - ) + ax.scatter(sea_photon_dataset['relative_AT_dist'], sea_photon_dataset['photon_height'], + s=1, c='k', alpha=0.15, edgecolors='none', label='Subsurface ATL03 Photons') # Overlay the seafloor elevation data as points - ax.plot( - filtered_seafloor_subsurface_dataset["relative_AT_dist"], - filtered_seafloor_subsurface_dataset["seafloor_elevation"], - linewidth=0.8, - c="b", - alpha=0.4, - label="Seafloor Elevation", - ) - - ax.plot( - sea_surface_x_axis_bins, - [x - 0.5 for x in sea_surface_height], - linewidth=0.8, - color="#DD571C", - alpha=0.4, - label="0.5 m Below Surface Peak", - ) + ax.plot(filtered_seafloor_subsurface_dataset['relative_AT_dist'], filtered_seafloor_subsurface_dataset['seafloor_elevation'], + linewidth=0.8, c='b', alpha=0.4, label='Seafloor Elevation') + + ax.plot(sea_surface_x_axis_bins, [x - 0.5 for x in sea_surface_height], + linewidth=0.8, color='#DD571C', alpha=0.4, label='0.5 m Below Surface Peak') # Set labels and title - ax.set_xlabel("Distance From Start of Track (km)") - ax.set_ylabel("Photon Height (m)") - ax.set_title("Scatter Plot of Filtered Sea Photon Dataset") + ax.set_xlabel('Distance From Start of Track (km)') + ax.set_ylabel('Photon Height (m)') + ax.set_title('Scatter Plot of Filtered Sea Photon Dataset') ax.set_ylim(hlims) # Add 
a legend ax.legend() # Save the plot - plt.legend(loc="upper right") - plt.savefig(output_path, dpi=400, format="jpeg") + plt.legend(loc='upper right') + plt.savefig(output_path, dpi=400, format='jpeg') # Show the plot plt.show() @@ -107,210 +70,179 @@ def plot_convex_hulls(photon_dataset, target_beam_ids, convex_hulls, convex_hull """ Plots ConvexHull polygons and annotated areas for each lat_bin group in the dataset. """ - # filter beam type + #filter beam type # photon_dataset - photon_dataset = photon_dataset[photon_dataset["beam_id"].isin(target_beam_ids)] - + photon_dataset = photon_dataset[photon_dataset['beam_id'].isin(target_beam_ids)] + fig, ax = plt.subplots(figsize=(10, 6)) hlims = [-10, 2] - + for lat_bin, hull_points in convex_hulls.items(): hull = ConvexHull(hull_points) - ax.fill( - hull_points[hull.vertices, 0], - hull_points[hull.vertices, 1], - alpha=0.3, - label=f"Bin {lat_bin}", - ) - + ax.fill(hull_points[hull.vertices, 0], hull_points[hull.vertices, 1], alpha=0.3, label=f'Bin {lat_bin}') + # Calculate centroid to place the label centroid = np.mean(hull_points[hull.vertices], axis=0) - ax.text( - centroid[0], - centroid[1], - f"{convex_hull_areas[lat_bin]:.2f}", - horizontalalignment="center", - verticalalignment="center", - fontsize=10, - color="black", - ) - - ax.scatter( - photon_dataset["lat"], - photon_dataset["photon_height"], - s=1, - c="k", - alpha=0.15, - edgecolors="none", - label="Subsurface ATL03 Photons", - ) + ax.text(centroid[0], centroid[1], f'{convex_hull_areas[lat_bin]:.2f}', + horizontalalignment='center', verticalalignment='center', fontsize=10, color='black') + + ax.scatter(photon_dataset['lat'], photon_dataset['photon_height'], + s=1, c='k', alpha=0.15, edgecolors='none', label='Subsurface ATL03 Photons') ax.set_ylim(hlims) - ax.set_xlabel("Latitude") - ax.set_ylabel("Photon Height") - ax.set_title("ConvexHull of Photons within each Lat Bin") + ax.set_xlabel('Latitude') + ax.set_ylabel('Photon Height') + 
ax.set_title('ConvexHull of Photons within each Lat Bin') plt.show() +# plot photons coloured by quality_ph flag for diagnostic purposes +def plot_photon_quality_flags(output_path, timestamp, sea_photon_dataset, target_beam_ids): + """ + Scatter plot of photon height vs. along-track distance, coloured by quality_ph value. + + quality_ph bit meanings (ATL03 v007): + bit 0 (value 1) = possible afterpulse + bit 1 (value 2) = possible impulse response effect + bit 2 (value 4) = possible TEP + """ + dataset = sea_photon_dataset[sea_photon_dataset['beam_id'].isin(target_beam_ids)].copy() + if dataset.empty: + return + + quality_colors = { + 0: ('dimgray', 'nominal (0)'), + 1: ('red', 'afterpulse (1)'), + 2: ('orange', 'impulse response (2)'), + 3: ('darkred', 'afterpulse + impulse (3)'), + 4: ('royalblue','TEP (4)'), + 5: ('purple', 'afterpulse + TEP (5)'), + 6: ('cyan', 'impulse + TEP (6)'), + 7: ('black', 'all flags (7)'), + } + + fig, ax = plt.subplots(figsize=(12, 5)) + unique_flags = sorted(dataset['quality_ph'].dropna().astype(int).unique()) + for flag in unique_flags: + subset = dataset[dataset['quality_ph'].astype(int) == flag] + color, label = quality_colors.get(flag, ('magenta', f'unknown ({flag})')) + ax.scatter(subset['relative_AT_dist'], subset['photon_height'], + s=0.5, c=color, alpha=0.3, edgecolors='none', label=label) + + ax.set_xlabel('Relative Along-Track Distance (km)', fontsize=12) + ax.set_ylabel('Photon Height (m)', fontsize=12) + ax.set_title('Photon quality_ph flag distribution', fontsize=13) + ax.set_ylim([-15, 5]) + ax.legend(loc='lower right', markerscale=6, fontsize=9) + fig.tight_layout() + save_path = os.path.join(output_path, f'{timestamp}_quality_ph_flags.jpg') + plt.savefig(save_path, dpi=300, format='jpeg') + plt.show() + print(f"Quality flag plot saved: {save_path}") + + # Print summary counts + print("\nquality_ph flag summary:") + for flag in unique_flags: + count = (dataset['quality_ph'].astype(int) == flag).sum() + _, label = 
quality_colors.get(flag, ('', f'unknown ({flag})')) + pct = 100.0 * count / len(dataset) + print(f" {label:35s}: {count:>8,d} ({pct:.1f}%)") + # plot the kd and photon -def plot_kd_photons( - OutputPath, - timestamp, - target_beam_ids, - subsurface_photon_dataset, - Kd_DF_MergedDistance, -): - # filter beam type +def plot_kd_photons(OutputPath, timestamp, target_beam_ids, subsurface_photon_dataset, Kd_DF_MergedDistance): + #filter beam type # photon_dataset - subsurface_photon_dataset = subsurface_photon_dataset[ - subsurface_photon_dataset["beam_id"].isin(target_beam_ids) - ] - Kd_DF_MergedDistance = Kd_DF_MergedDistance[ - Kd_DF_MergedDistance["beam_id"].isin(target_beam_ids) - ] - + subsurface_photon_dataset = subsurface_photon_dataset[subsurface_photon_dataset['beam_id'].isin(target_beam_ids)] + Kd_DF_MergedDistance = Kd_DF_MergedDistance[Kd_DF_MergedDistance['beam_id'].isin(target_beam_ids)] + hlims = [-45, 5] fig, ax1 = plt.subplots(figsize=(10, 6)) - ax1.scatter( - subsurface_photon_dataset["relative_AT_dist"], - subsurface_photon_dataset["photon_height"], - s=1.5, - c="k", - alpha=0.2, - edgecolors="none", - label="Subsurface ATL03 Photon Height", - ) - ax1.set_xlabel("Relative Along-Track Distance", fontsize=18) - ax1.tick_params(axis="x", labelsize=18) # Added line for x-tick label fontsize - ax1.set_ylabel("Photon Height", color="b", fontsize=18) - ax1.tick_params(axis="y", labelcolor="b", labelsize=18) - ax1.plot( - subsurface_photon_dataset["relative_AT_dist"], - subsurface_photon_dataset["seafloor_elevation"], - linewidth=0.8, - c="b", - alpha=0.4, - label="Seafloor Elevation", - ) + ax1.scatter(subsurface_photon_dataset['relative_AT_dist'], subsurface_photon_dataset['photon_height'], + s=1.5, c='k', alpha=0.2, edgecolors='none', label='Subsurface ATL03 Photon Height') + ax1.set_xlabel('Relative Along-Track Distance', fontsize=18) + ax1.tick_params(axis='x', labelsize=18) # Added line for x-tick label fontsize + ax1.set_ylabel('Photon Height', 
color='b', fontsize=18) + ax1.tick_params(axis='y', labelcolor='b', labelsize=18) + if 'seafloor_elevation' in subsurface_photon_dataset.columns: + ax1.plot(subsurface_photon_dataset['relative_AT_dist'], subsurface_photon_dataset['seafloor_elevation'], + linewidth=0.8, c='b', alpha=0.4, label='Seafloor Elevation') # ax1.axhline(y=-6, color='blue', linestyle='--', label='y=-6 m') # ax1.set_xlim([1800, 2100]) # Set x-axis limits - # ax1.set_ylim(hlims) - + ax1.set_ylim(hlims) + ax2 = ax1.twinx() - ax2.scatter( - Kd_DF_MergedDistance["relative_AT_dist"], - Kd_DF_MergedDistance["kd"], - label="Kd values", - color="r", - alpha=0.6, - ) - ax2.set_ylabel("Kd Value", color="r", fontsize=18) - ax2.tick_params(axis="y", labelcolor="r", labelsize=18) + ax2.scatter(Kd_DF_MergedDistance['relative_AT_dist'], Kd_DF_MergedDistance['kd'], + label='Kd values', color='r', alpha=0.6) + ax2.set_ylabel('Kd Value', color='r', fontsize=18) + ax2.tick_params(axis='y', labelcolor='r',labelsize=18) # fig.suptitle('Photon Height and Kd Values along Relative Along-Track Distance') handles1, labels1 = ax1.get_legend_handles_labels() handles2, labels2 = ax2.get_legend_handles_labels() - fig.legend( - handles1 + handles2, - labels1 + labels2, - loc="upper right", - bbox_to_anchor=(0.85, 0.85), - ) + fig.legend(handles1 + handles2, labels1 + labels2, loc='upper right', bbox_to_anchor=(0.85, 0.85)) # fig.legend(handles1 + handles2, labels1 + labels2, loc='lower left', bbox_to_anchor=(0.08, 0.08)) fig.tight_layout() - plt.savefig( - os.path.join(OutputPath, f"{timestamp}_IS2_subsurface_kd_2.jpg"), - dpi=400, - format="jpeg", - ) + plt.savefig(os.path.join(OutputPath, f'{timestamp}_IS2_subsurface_kd_2.jpg'), dpi=400, format='jpeg') plt.show() -def plot_bin_polygon_data( - lat, - height, - seafloor_height, - mask, - lat_bins, - height_bins, - valid_bins, - polygons, - y_min, - y_max, -): + +def plot_bin_polygon_data(lat, height, seafloor_height, mask, lat_bins, height_bins, valid_bins, polygons, 
y_min, y_max): plt.figure(figsize=(12, 6)) plt.subplot(1, 2, 1) plt.title("Original Height Data") - plt.scatter(lat, height, c=height, cmap="viridis", s=10) - plt.plot(np.sort(lat), seafloor_height[np.argsort(lat)], "r-", label="Seafloor") - plt.colorbar(label="Height") - plt.xlabel("Latitude") - plt.ylabel("Height") + plt.scatter(lat, height, c=height, cmap='viridis', s=10) + plt.plot(np.sort(lat), seafloor_height[np.argsort(lat)], 'r-', label='Seafloor') + plt.colorbar(label='Height') + plt.xlabel('Latitude') + plt.ylabel('Height') plt.ylim(y_min, y_max) plt.legend() plt.subplot(1, 2, 2) plt.title("Masked Region (ROI)") - plt.scatter(lat[mask], height[mask], c=height[mask], cmap="viridis", s=10) - plt.plot(np.sort(lat), seafloor_height[np.argsort(lat)], "r-", label="Seafloor") + plt.scatter(lat[mask], height[mask], c=height[mask], cmap='viridis', s=10) + plt.plot(np.sort(lat), seafloor_height[np.argsort(lat)], 'r-', label='Seafloor') for polygon in polygons: - plt.gca().add_patch( - plt.Polygon(polygon, edgecolor="red", facecolor="none", linewidth=1) - ) + plt.gca().add_patch(plt.Polygon(polygon, edgecolor='red', facecolor='none', linewidth=1)) for lb in lat_bins: - plt.axvline(x=lb, color="gray", linestyle="--", linewidth=0.5) + plt.axvline(x=lb, color='gray', linestyle='--', linewidth=0.5) for hb in height_bins: - plt.axhline(y=hb, color="gray", linestyle="--", linewidth=0.5) + plt.axhline(y=hb, color='gray', linestyle='--', linewidth=0.5) - plt.colorbar(label="Height") - plt.xlabel("Latitude") - plt.ylabel("Height") + plt.colorbar(label='Height') + plt.xlabel('Latitude') + plt.ylabel('Height') plt.ylim(y_min, y_max) plt.legend() plt.tight_layout() plt.show() - -def produce_figures( - binned_data, - bath_height, - sea_height, - solo_sea_surface_label, - y_limit_top, - y_limit_bottom, - percentile, - file, - geo_df, - ref_y, - ref_z, - beam, - epsg_num, -): +# +def produce_figures(binned_data, bath_height, sea_height, solo_sea_surface_label, + y_limit_top, 
y_limit_bottom, percentile, file, geo_df, + ref_y, ref_z, beam, epsg_num): """Create figures""" # Create bins for latitude - bath_x_axis_bins = ( - np.linspace(binned_data.lat.min(), binned_data.lat.max(), len(bath_height)) + 20 - ) + bath_x_axis_bins = np.linspace(binned_data.lat.min(), + binned_data.lat.max(), len(bath_height))+20 - sea_surface_x_axis_bins = ( - np.linspace(binned_data.lat.min(), binned_data.lat.max(), len(sea_height)) + 10 - ) + sea_surface_x_axis_bins = np.linspace(binned_data.lat.min(), + binned_data.lat.max(), len(sea_height))+10 # Create new dataframes for median values - bath_median_df = pd.DataFrame({"x": bath_x_axis_bins, "y": bath_height}) + bath_median_df = pd.DataFrame({'x': bath_x_axis_bins, 'y': bath_height}) # Create uniform sea surface based on median sea surface values and filter out surface breaching sea_height1 = [np.nanmedian(sea_height) if i == i else np.nan for i in sea_height] - sea_median_df = pd.DataFrame({"x": sea_surface_x_axis_bins, "y": sea_height1}) + sea_median_df = pd.DataFrame({'x': sea_surface_x_axis_bins, 'y': sea_height1}) # Create uniform solo sea surface label sea_surface_label = solo_sea_surface_label - sea_surface_label_df = pd.DataFrame( - {"x": sea_surface_x_axis_bins, "y": sea_surface_label} - ) + sea_surface_label_df = pd.DataFrame({'x': sea_surface_x_axis_bins, 'y': sea_surface_label}) idx_1 = np.where(sea_surface_label_df.y == 1) idx_0 = np.where(sea_surface_label_df.y == 0) @@ -321,110 +253,65 @@ def produce_figures( # plt.scatter(x=binned_data.lat, # y = binned_data.photon_height, marker='o', lw=0, s=1, alpha = 0.8, # c = 'yellow', label = 'Raw photon height') - plt.scatter(ref_y, ref_z, s=0.5, alpha=0.1, c="black") - plt.scatter( - geo_df.lat, - geo_df.photon_height, - s=0.8, - marker="o", - alpha=0.1, - c="red", - label="Classified Photons", - ) + plt.scatter(ref_y, ref_z, s=0.5, alpha=0.1, c='black') + plt.scatter(geo_df.lat, geo_df.photon_height, s=0.8, marker = 'o', + alpha=0.1, c='red', 
label='Classified Photons') # plt.scatter(x=geo_df.lat, # y = geo_df.photon_height, marker='o', lw=0, s=0.8, # alpha = 0.8, c = 'black', label = 'Corrected photon bin') # Plot median values - plt.scatter( - bath_median_df.x, - bath_median_df.y, - marker="o", - c="r", - alpha=0.8, - s=2, - label="Median bathymetry", - ) - - plt.scatter( - sea_median_df.x, - sea_median_df.y, - marker="o", - c="b", - alpha=1, - s=2, - label="Median sea surface", - ) - - plt.scatter( - sea_surface_label_df.iloc[idx_1].x, - sea_surface_label_df.iloc[idx_1].y, - marker="o", - c="pink", - alpha=1, - s=3, - label="solo_sea_surface", - ) - plt.scatter( - sea_surface_label_df.iloc[idx_0].x, - sea_surface_label_df.iloc[idx_0].y, - marker="o", - c="g", - alpha=1, - s=3, - label="non_solo_sea_surface", - ) + plt.scatter(bath_median_df.x, bath_median_df.y, + marker='o', c='r', alpha=0.8, s=2, label='Median bathymetry') + + plt.scatter(sea_median_df.x, sea_median_df.y, + marker='o', c='b', alpha=1, s=2, label='Median sea surface') + + plt.scatter(sea_surface_label_df.iloc[idx_1].x, sea_surface_label_df.iloc[idx_1].y, + marker='o', c='pink', alpha=1, s=3, label='solo_sea_surface') + plt.scatter(sea_surface_label_df.iloc[idx_0].x, sea_surface_label_df.iloc[idx_0].y, + marker='o', c='g', alpha=1, s=3, label='non_solo_sea_surface') # Insert titles and subtitles - plt.title("Icesat2 Bathymetry\n" + file) - plt.xlabel("Latitude", fontsize=25) - plt.ylabel("Photon Height (m)", fontsize=25) + plt.title('Icesat2 Bathymetry\n' + file) + plt.xlabel('Latitude', fontsize=25) + plt.ylabel('Photon Height (m)', fontsize=25) plt.xticks(fontsize=16) plt.yticks(fontsize=16) - plt.legend(loc="upper left", prop={"size": 20}) + plt.legend(loc="upper left", prop={'size': 20}) # Limit the x and y axes using parameters plt.xlim(left=binned_data.lat.min(), right=binned_data.lat.max()) plt.ylim(top=y_limit_top, bottom=y_limit_bottom) timestr = time.strftime("%Y%m%d_%H%M%S") - file = file.replace(".h5", "") + file = 
file.replace('.h5', '') # Define where to save file plt.tight_layout() - plt.savefig( - "C:/Workstation/ICESat2_HLS/" - + file - + "_gt" - + str(beam) - + "_" - + str(percentile) - + "_EPSG" - + str(epsg_num) - + "_" - + timestr - + ".pdf" - ) + plt.savefig("C:/Workstation/ICESat2_HLS/" + file + '_gt' + + str(beam) + '_' + str(percentile) + + '_EPSG' + str(epsg_num) + '_' + timestr + ".pdf") # plt.show() # plt.close() # convert corrected locations back to wgs84 (useful to contain) - transformer = Transformer.from_crs( - "EPSG:" + str(epsg_num), "EPSG:4326", always_xy=True - ) + transformer = Transformer.from_crs("EPSG:" + str(epsg_num), + "EPSG:4326", always_xy=True) print(transformer) - lon_wgs84, lat_wgs84 = transformer.transform(geo_df.lon.values, geo_df.lat.values) + lon_wgs84, lat_wgs84 = transformer.transform( + geo_df.lon.values, geo_df.lat.values) - geo_df["lon_wgs84"] = lon_wgs84 - geo_df["lat_wgs84"] = lat_wgs84 + geo_df['lon_wgs84'] = lon_wgs84 + geo_df['lat_wgs84'] = lat_wgs84 - geodf = gpd.GeoDataFrame( - geo_df, geometry=gpd.points_from_xy(geo_df.lon_wgs84, geo_df.lat_wgs84) - ) + geodf = gpd.GeoDataFrame(geo_df, + geometry=gpd.points_from_xy(geo_df.lon_wgs84, + geo_df.lat_wgs84)) geodf.set_crs(epsg=4326, inplace=True) # geodf.to_file("C:/Workstation/ICESat2_HLS/" +file + '_gt' + '_' + str(percentile) + '_EPSG' + # str(epsg_num) + '_' + timestr + ".gpkg", - # driver="GPKG") + # driver="GPKG") \ No newline at end of file diff --git a/icesat-2_kdph_py/config.py b/icesat-2_kdph_py/config.py deleted file mode 100644 index e8dc43b..0000000 --- a/icesat-2_kdph_py/config.py +++ /dev/null @@ -1,464 +0,0 @@ -# # config.py - -# path = 'C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/Dataset/ATL03_ICESat2/' - -# #US, Orgon -# # ATL03_h5_file = "processed_ATL03_20181206092124_10500106_005_01.h5"# -# #South America -# # ATL03_h5_file = "processed_ATL03_20220530041141_10391513_005_02.h5"# - - -# #Alaska Cook Inlet -# # ATL03_h5_file = 
"processed_ATL03_20210801125753_05941205_005_01.h5"# -# ATL03_20200805175053_06320803_006_01_subsetted -# ATL03_20210828231447_10131203_006_01_subsetted - -# # Hawaii line without afterpulses -# # ATL03_h5_file = "processed_ATL03_20220122044818_04721407_006_01.h5"# - - -# # ChesapeakeBay -# ATL03_h5_file = "processed_ATL03_20230825074121_10102002_006_02.h5"# - - -# # India -# # ATL03_h5_file = "processed_ATL03_20200331204156_00810707_006_01.h5"# - - -# #China Bohai -# # ATL03_h5_file = "processed_ATL03_20191128115256_09560502_006_01.h5"# -# # ATL03_h5_file = "processed_ATL03_20190530203316_09560302_006_02.h5"# - -# ATL03_h5_file = "processed_ATL03_20190829161305_09560402_006_02.h5"# - - -# ATL03_h5_file_path = path + ATL03_h5_file - -# Current_Path = 'C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/Dataset/' - -# shoreline_data_path = Current_Path + 'Shorelines/GeoPkgGlobalShoreline.gpkg' - -# #load the Global bathy dataset -# GEBCO_paths = [ -# Current_Path + "Bathy/gebco_2024_geotiff/gebco_2024_n0.0_s-90.0_w0.0_e90.0.tif", -# Current_Path + "Bathy/gebco_2024_geotiff/gebco_2024_n0.0_s-90.0_w-90.0_e0.0.tif", -# Current_Path + "Bathy/gebco_2024_geotiff/gebco_2024_n0.0_s-90.0_w90.0_e180.0.tif", -# Current_Path + "Bathy/gebco_2024_geotiff/gebco_2024_n0.0_s-90.0_w-180.0_e-90.0.tif", -# Current_Path + "Bathy/gebco_2024_geotiff/gebco_2024_n90.0_s0.0_w0.0_e90.0.tif", -# Current_Path + "Bathy/gebco_2024_geotiff/gebco_2024_n90.0_s0.0_w-90.0_e0.0.tif", -# Current_Path + "Bathy/gebco_2024_geotiff/gebco_2024_n90.0_s0.0_w90.0_e180.0.tif", -# Current_Path + "Bathy/gebco_2024_geotiff/gebco_2024_n90.0_s0.0_w-180.0_e-90.0.tif", -# ] - -# horizontal_res = 500 - -# vertical_res = 0.25 - -# # Use calculated sea height to determine photons at 0.5m below peak -# subsurface_thresh = 0.5 - -# # subsurface distance below the sea surface beginning to account -# Kd_max_depth = -1 - -# OutputPath='C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/Results/' - -import argparse - - -def get_args(): 
- parser = argparse.ArgumentParser( - description="Configure ICESat-2 HLS analysis script." - ) - - # General paths - parser.add_argument( - "--workspace_path", - type=str, - default="C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/", - help="Root path for all data processing.", - ) - - # parser.add_argument("--atl03_path", type=str, default='Dataset/ATL03_ICESat2/New/', - # help="Base path for ICESat-2 ATL03 data.") - - # # Bohai Sea (v006) - # parser.add_argument("--atl03_path", type=str, default='Dataset/ATL03_ICESat2/', - # help="Base path for ICESat-2 ATL03 data.") - - # Wax Delta (v007) — required for IR/AP filter sensitivity test - parser.add_argument( - "--atl03_path", - type=str, - default="Dataset/ATL03_ICESat2/Wax_Delta/", - help="Base path for ICESat-2 ATL03 data.", - ) - - # #ChesapeakeBay - # parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20230825074121_10102002_006_02.h5", - # help="Name of the ATL03 H5 file to process.") - - # parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20221001113813_01641706_006_01.h5", - # help="Name of the ATL03 H5 file to process.") - - # # India: processed_ATL03_20200331204156_00810707_006_01.h5 - # parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20200331204156_00810707_006_01.h5", - # help="Name of the ATL03 H5 file to process.") - - # #ATL03_20210628230109_00811207_006_01_subsetted - - # parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20210628230109_00811207_006_01.h5", - # help="Name of the ATL03 H5 file to process.") - - # # WaxDelta - # parser.add_argument("--atl03_file", type=str, default="ATL03_20231103172707_06982106_006_01_subsetted.h5", - # help="Name of the ATL03 H5 file to process.") - - # # Bohai (v006) — baseline sensitivity test site - # parser.add_argument("--atl03_file", type=str, default="ATL03_20190829161305_09560402_006_02_subsetted.h5", - # help="Name of the ATL03 H5 file to process.") - - # Wax Delta (v007) — 
required for IR/AP filter; use with atl03_path = Dataset/ATL03_ICESat2/Wax_Delta/ - parser.add_argument( - "--atl03_file", - type=str, - default="ATL03_20231103172707_06982106_007_01_subsetted.h5", - help="Name of the ATL03 H5 file to process.", - ) - - # # Cook Inlet - # parser.add_argument("--atl03_file", type=str, default="ATL03_20220507112145_06931503_006_01_subsetted.h5", - # help="Name of the ATL03 H5 file to process.") - - # #Core Sound - # parser.add_argument("--atl03_file", type=str, default="105430185_ATL03_20220423191113_04841506_006_02_subsetted.h5", - # help="Name of the ATL03 H5 file to process.") - - # # Pamlico Sound - # parser.add_argument("--atl03_file", type=str, default="ATL03_20240415083629_04232306_006_01_subsetted.h5", - # help="Name of the ATL03 H5 file to process.") - - # - # #Cook Inlet: processed_ATL03_20210801125753_05941205_005_01 - # parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20210801125753_05941205_005_01.h5", - # help="Name of the ATL03 H5 file to process.") - - parser.add_argument( - "--other_data_path", - type=str, - default="Dataset/", - help="Path to the current working directory.", - ) - - parser.add_argument( - "--output_path", - type=str, - default="Results/", - help="Directory for saving results.", - ) - - # Shoreline data - parser.add_argument( - "--shoreline_data", - type=str, - # default='Shorelines/GeoPkgGlobalShoreline.gpkg', - default="Shorelines/ne_10m_land/ne_10m_land.shp", - help="Path to the global shoreline dataset.", - ) - - # Bathymetry datasets - parser.add_argument( - "--gebco_path", - nargs="+", - type=str, - default="Bathy/gebco_2024_geotiff/", - help="Path to GEBCO bathymetry datasets.", - ) - - # Resolution settings - parser.add_argument( - "--horizontal_res", - type=int, - default=500, - help="Horizontal resolution for the analysis (in meters).", - ) - - parser.add_argument( - "--vertical_res", - type=float, - default=0.25, - help="Vertical resolution for the analysis.", - ) - - 
# Analysis parameters - parser.add_argument( - "--subsurface_thresh", - type=float, - default=1.0, - help="Threshold for photons below the sea surface.", - ) - - parser.add_argument( - "--ignore_subsurface_height_thres", - type=float, - default=-6, - help="Maximum depth thres for Kd calculations (in meters).", - ) # 5m - - # Empty string '' auto-selects the first strong beam. Pass comma-separated IDs to override. - # Valid IDs: gt1l, gt1r, gt2l, gt2r, gt3l, gt3r — only strong beams for the granule are kept. - # To process all strong beams, use --target_beams "gt1r,gt2r,gt3r". - parser.add_argument( - "--target_beams", - type=str, - default="", - help="Comma-separated beam IDs to process. Empty means auto-select first strong beam.", - ) - - parser.add_argument( - "--disable_solar_background_filter", - action="store_true", - help="Disable solar background filtering (enabled by default; " - "self-gates on solar elevation so has no effect on nighttime passes).", - ) - parser.add_argument( - "--solar_elevation_day_threshold", - type=float, - default=6.0, - help="Solar elevation threshold (deg) to define daytime for optional filtering.", - ) - parser.add_argument( - "--solar_bg_median_window_deg", - type=float, - default=10.0, - help="Rolling median window width (degrees of solar elevation) for " - "the solar-elevation-aware background filter.", - ) - parser.add_argument( - "--solar_bg_noise_multiplier", - type=float, - default=1.5, - help="Ratio of actual/expected background rate above which a photon " - "is flagged as noisy by the solar background filter.", - ) - parser.add_argument( - "--solar_background_min_signal_conf", - type=int, - default=2, - help="Minimum photon signal confidence kept in high daytime background.", - ) - - parser.add_argument( - "--enable_ir_ap_filter", - action="store_true", - help="Optional: remove photons using IR/afterpulse proxy flags.", - ) - parser.add_argument( - "--ir_ap_quality_max", - type=int, - default=0, - help="Maximum allowed 
quality_ph value when IR/AP filter is enabled.", - ) - parser.add_argument( - "--ir_ap_min_signal_conf", - type=int, - default=0, - help="Minimum allowed photon_conf value when IR/AP filter is enabled.", - ) - - parser.add_argument( - "--enable_gebco_filter", - action="store_true", - help="Optional: remove photons below GEBCO seafloor and shallow bins.", - ) - parser.add_argument( - "--enable_sea_surface_flattening", - action="store_true", - help="Optional: flatten small-bin sea-surface variation using larger along-track windows.", - ) - parser.add_argument( - "--sea_surface_flattening_window_m", - type=int, - default=5000, - help="Window size in meters for sea-surface flattening mean level. " - "Must be larger than horizontal_res to have any effect.", - ) - - parser.add_argument( - "--enable_convex_hull_filter", - action="store_true", - help="Optional: apply convex hull area filtering before Kd fitting.", - ) - parser.add_argument( - "--convex_hull_area_threshold", - type=float, - default=100, - help="Minimum convex hull area to keep a horizontal bin when enabled.", - ) - - parser.add_argument( - "--enable_histogram_quality_filter", - action="store_true", - help="Optional: discard along-track bins with weak 6-7 m signal.", - ) - parser.add_argument( - "--histogram_quality_min_ratio", - type=float, - default=0.05, - help="Minimum ratio of photons in depth band to keep a bin.", - ) - parser.add_argument( - "--histogram_quality_depth_min", - type=float, - default=6.0, - help="Minimum depth (m below surface) of quality-check band.", - ) - parser.add_argument( - "--histogram_quality_depth_max", - type=float, - default=7.0, - help="Maximum depth (m below surface) of quality-check band.", - ) - parser.add_argument( - "--histogram_quality_ref_depth_min", - type=float, - default=0.0, - help="Min depth (m) of near-surface reference band for decay ratio.", - ) - parser.add_argument( - "--histogram_quality_ref_depth_max", - type=float, - default=1.0, - help="Max depth (m) of 
near-surface reference band for decay ratio.", - ) - - parser.add_argument( - "--enable_surface_sigma_filter", - action="store_true", - help="Optional: discard bins where Gaussian surface sigma is too large.", - ) - parser.add_argument( - "--surface_sigma_max", - type=float, - default=0.5, - help="Maximum allowed Gaussian sigma (m) for sea-surface peak.", - ) - - parser.add_argument( - "--enable_refraction_correction", - action="store_true", - help="Optional: apply refraction correction to subsurface photons.", - ) - parser.add_argument( - "--refraction_water_temp_c", - type=float, - default=20.0, - help="Water temperature (deg C) used for refraction correction.", - ) - parser.add_argument( - "--refraction_wavelength_nm", - type=float, - default=532.0, - help="Laser wavelength (nm) used for refraction correction.", - ) - - parser.add_argument( - "--enable_post_refraction_refit", - action="store_true", - help="Optional: rebuild histograms and re-fit surface after refraction correction.", - ) - - parser.add_argument( - "--enable_atl24_filter", - action="store_true", - help="Optional: use ATL24 bathymetry matching before GEBCO fallback.", - ) - parser.add_argument( - "--atl24_file", - type=str, - default="", - help="Path to ATL24 point dataset (CSV/Parquet/GPKG/SHP) for bathymetry matching.", - ) - parser.add_argument( - "--atl24_max_match_distance_deg", - type=float, - default=0.01, - help="Maximum lon/lat degree distance for ATL24 nearest-point matching.", - ) - - parser.add_argument( - "--decay_zone_threshold", - type=float, - default=0.0, - help="Fraction of peak photon count below which depth bins are excluded " - "in Step 16 decay zone detection. 0.0 = original behaviour (only " - "zero-count bins removed). E.g. 
0.01 removes bins with < 1%% of peak.", - ) - - parser.add_argument( - "--kd_fit_method", - type=str, - default="log_linear", - choices=["log_linear", "bg_subtract", "breakpoint", "nonlinear", "hybrid"], - help="Beer's Law fitting strategy for Kd calculation. " - "log_linear = original log-space linear regression; " - "bg_subtract = subtract noise floor then log-linear; " - "breakpoint = segmented regression with breakpoint detection; " - "nonlinear = fit C(z) = A*exp(-Kd*z) + N directly.", - ) - - parser.add_argument( - "--enable_wave_adaptive_fit", - action="store_true", - help="Optional: trim shallow depth bins by wave amplitude before Beer's Law fit. " - "Uses per-bin surface sigma to skip the top k*sigma metres in wavy bins.", - ) - parser.add_argument( - "--wave_exclusion_multiplier", - type=float, - default=4.0, - help="Number of surface sigmas to skip from the actual water surface when " - "wave-adaptive fit is enabled. The adaptive surface detection already " - "removes 3*sigma; this adds (k-3)*sigma additional margin for bubble " - "injection zone. Default 4.0 adds 1*sigma beyond the 3-sigma cutoff.", - ) - parser.add_argument( - "--wave_sigma_calm_threshold", - type=float, - default=0.1, - help="Surface sigma (m) below which a bin is considered calm and no " - "wave trimming is applied. 
Default 0.1 m.", - ) - - parser.add_argument( - "--enable_paired_beam_combine", - action="store_true", - help="Optional: combine paired beams (gt1, gt2, gt3) before Kd fitting.", - ) - - parser.add_argument( - "--no_plot", - action="store_true", - help="Suppress all plot generation (useful for batch runs).", - ) - - # switch on consider the coastal water or inland water - # if Inland water, we should use one mask and for coastal water, it should be another mask - - # consider bathymetry or not manual setting - - # night vs daytime - - # solar elevation detemine remove or not for solar background - - # 1-2 hours - - return parser.parse_args() - - -if __name__ == "__main__": - args = get_args() - # Access parameters like this: - atl03_file_path = args.workspace_path + args.atl03_file - - print("Processing file:", atl03_file_path) - print("Output path:", args.output_path) diff --git a/icesat-2_kdph_py/kd_utils/Kd_analysis.py b/icesat-2_kdph_py/kd_utils/Kd_analysis.py deleted file mode 100644 index 9cf2210..0000000 --- a/icesat-2_kdph_py/kd_utils/Kd_analysis.py +++ /dev/null @@ -1,1003 +0,0 @@ -# utils/Kd_analysis.py -# updated to perform a linear fit in log-space, just like MATLAB's polyfitn(zdepth, y, 1) for a first-order polynomial. 
- -import logging - -import numpy as np -import pandas as pd -from scipy.optimize import curve_fit -from sklearn.linear_model import LinearRegression - -# def log_model(z, kd, e0): -# return np.log(e0) - kd * z - -## This is wrong because the input is already a bined data, -## it is not necessary to do a histogram again -# def CalculateKdFromFilteredSubsurfacePhoton(df, vertical_res=0.8): -# if df.empty or 'lat_bins' not in df.columns: -# return pd.DataFrame({'lat_bins': [np.nan], 'kd': [np.nan], 'e0': [np.nan], 'latitude': [np.nan], 'longitude': [np.nan]}) - -# # Get the latitude bin value -# lat_bin_value = df['lat_bins'].iloc[0] if not df['lat_bins'].empty else np.nan -# latitude = df['latitude'].mean() if 'latitude' in df.columns else df['lat'].mean() if 'lat' in df.columns else np.nan -# longitude = df['longitude'].mean() if 'longitude' in df.columns else df['lon'].mean() if 'lon' in df.columns else np.nan - -# # Calculate photon height range -# photon_height_min = df['photon_height'].min() -# photon_height_max = df['photon_height'].max() - -# # Check for sufficient data range -# if np.isnan(photon_height_min) or np.isnan(photon_height_max) or photon_height_min == photon_height_max: -# return pd.DataFrame({'lat_bins': [lat_bin_value], 'kd': [np.nan], 'e0': [np.nan]}) - -# height_bins_range = abs(photon_height_max - photon_height_min) -# height_bins_number = round(height_bins_range / vertical_res) - -# # Ensure there are enough bins -# if height_bins_number < 5: -# return pd.DataFrame({'lat_bins': [lat_bin_value], 'kd': [np.nan], 'e0': [np.nan]}) - -# # Create histogram of photon heights -# bin_edges = np.linspace(photon_height_min, photon_height_max, num=height_bins_number) -# counts, _ = np.histogram(df['photon_height'], bins=bin_edges) -# bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2 - -# # Store histogram data -# hist_df = pd.DataFrame({'zdepth': bin_centers, 'photon_counts': counts}) - -# # x value for model -# # Reverse zdepth to align with the 
MATLAB approach -# hist_df['zdepth'] = hist_df['zdepth'].max() - hist_df['zdepth'] - -# # Log-transform photon counts, replacing zeros with NaN -# # y value for model -# hist_df['log_photon_counts'] = np.log(hist_df['photon_counts'].replace(0, np.nan)) -# hist_df.loc[np.isinf(hist_df['log_photon_counts']), 'log_photon_counts'] = np.nan - -# # Filter out rows with NaNs in either column for regression -# valid_data = hist_df.dropna(subset=['zdepth', 'log_photon_counts']) - -# # Check for enough valid data points -# # Skip the regression if there are fewer than 5 datapoints -# if valid_data['log_photon_counts'].notna().sum() > 3: -# # Drop NaNs for regression -# zdepth_valid = valid_data['zdepth'].values.reshape(-1, 1) -# log_counts_valid = valid_data['log_photon_counts'].values - -# # Perform linear regression -# model = LinearRegression() -# model.fit(zdepth_valid, log_counts_valid) - -# # Extract kd as the negative of the slope and e0 from the intercept -# kd = -model.coef_[0] - -# print('zdepth_valid:',zdepth_valid) -# print('photon_counts:',hist_df['photon_counts']) -# print('kd:',kd) - -# e0 = np.exp(model.intercept_) - -# # Set kd to NaN if negative -# if kd < 0: -# kd = np.nan -# else: -# kd, e0 = np.nan, np.nan - -# return pd.DataFrame({ -# 'lat_bins': [lat_bin_value], -# 'kd': [kd], -# 'e0': [e0], -# 'latitude': [latitude], -# 'longitude': [longitude] -# }) - - -# # one solution is to adjust the vertical_res to 0.25 -# def CalculateKdFromFilteredSubsurfacePhoton(df, vertical_res=0.25): -# if df.empty or 'lat_bins' not in df.columns: -# return pd.DataFrame({'lat_bins': [np.nan], 'kd': [np.nan], 'e0': [np.nan], 'latitude': [np.nan], 'longitude': [np.nan]}) - -# # Get the latitude bin value -# lat_bin_value = df['lat_bins'].iloc[0] if not df['lat_bins'].empty else np.nan -# latitude = df['latitude'].mean() if 'latitude' in df.columns else df['lat'].mean() if 'lat' in df.columns else np.nan -# longitude = df['longitude'].mean() if 'longitude' in df.columns else 
df['lon'].mean() if 'lon' in df.columns else np.nan - -# # Calculate photon height range -# photon_height_min = df['photon_height'].min() -# photon_height_max = df['photon_height'].max() - -# # Check for sufficient data range -# if np.isnan(photon_height_min) or np.isnan(photon_height_max) or photon_height_min == photon_height_max: -# return pd.DataFrame({'lat_bins': [lat_bin_value], 'kd': [np.nan], 'e0': [np.nan]}) - -# height_bins_range = abs(photon_height_max - photon_height_min) -# height_bins_number = round(height_bins_range / vertical_res) - -# # Ensure there are enough bins -# if height_bins_number < 5: -# return pd.DataFrame({'lat_bins': [lat_bin_value], 'kd': [np.nan], 'e0': [np.nan]}) - -# # Create histogram of photon heights -# bin_edges = np.linspace(photon_height_min, photon_height_max, num=height_bins_number) -# counts, _ = np.histogram(df['photon_height'], bins=bin_edges) -# bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2 - -# # Store histogram data -# hist_df = pd.DataFrame({'zdepth': bin_centers, 'photon_counts': counts}) - -# # x value for model -# # Reverse zdepth to align with the MATLAB approach -# hist_df['zdepth'] = hist_df['zdepth'].max() - hist_df['zdepth'] - -# # Log-transform photon counts, replacing zeros with NaN -# # y value for model -# hist_df['log_photon_counts'] = np.log(hist_df['photon_counts'].replace(0, np.nan)) -# hist_df.loc[np.isinf(hist_df['log_photon_counts']), 'log_photon_counts'] = np.nan - -# # Filter out rows with NaNs in either column for regression -# valid_data = hist_df.dropna(subset=['zdepth', 'log_photon_counts']) - -# # Check for enough valid data points -# # Skip the regression if there are fewer than 5 datapoints -# if valid_data['log_photon_counts'].notna().sum() > 3: -# # Drop NaNs for regression -# zdepth_valid = valid_data['zdepth'].values.reshape(-1, 1) -# log_counts_valid = valid_data['log_photon_counts'].values - -# # Perform linear regression -# model = LinearRegression() -# model.fit(zdepth_valid, 
log_counts_valid) - -# # Extract kd as the negative of the slope and e0 from the intercept -# kd = -model.coef_[0] - -# print('zdepth_valid:',zdepth_valid) -# print('photon_counts:',hist_df['photon_counts']) -# print('kd:',kd) - -# e0 = np.exp(model.intercept_) - -# # Set kd to NaN if negative -# if kd < 0: -# kd = np.nan -# else: -# kd, e0 = np.nan, np.nan - -# return pd.DataFrame({ -# 'lat_bins': [lat_bin_value], -# 'kd': [kd], -# 'e0': [e0], -# 'latitude': [latitude], -# 'longitude': [longitude] -# }) - - -def find_exponential_decay_zone(hist_df, decay_threshold=0.0): - """ - Identify the depth bins that lie within the exponential decay zone. - - Flowchart step: 16 — Determine depth where signal decays to 1% of incoming - signal or where photon decay is no longer exponential. Bins with zero photon - counts (log undefined) are excluded; the remaining contiguous non-zero bins - define the exponential decay zone. - - Parameters - ---------- - hist_df : pd.DataFrame - Columns: 'zdepth' (depth from surface, ascending), 'photon_counts'. - decay_threshold : float, optional - Fraction of peak photon count below which bins are excluded. - 0.0 (default) preserves original behaviour (only zero-count bins removed). - E.g. 0.01 removes bins with < 1% of the peak signal. - - Returns - ------- - pd.DataFrame - Filtered rows with a 'log_photon_counts' column added; only bins inside - the exponential decay zone are kept (photon_counts > 0 and log finite). 
- """ - df = hist_df.copy() - - # Apply decay threshold: exclude bins below threshold fraction of peak signal - if decay_threshold > 0.0: - peak_count = df["photon_counts"].max() - if peak_count > 0: - df.loc[ - df["photon_counts"] < decay_threshold * peak_count, "photon_counts" - ] = 0 - - df["log_photon_counts"] = np.log(df["photon_counts"].replace(0, np.nan)) - df.loc[np.isinf(df["log_photon_counts"]), "log_photon_counts"] = np.nan - return df.dropna(subset=["zdepth", "log_photon_counts"]) - - -def fit_beers_law(valid_data): - """ - Fit Beer's Law exponential decay curve to the identified decay zone. - - Flowchart step: 17 — Fit exponential decay curve (Beer's Law) to data - within the zone of exponential decay. Implemented as linear regression on - log(photon_counts) vs depth; kd = -slope, e0 = exp(intercept). - - Parameters - ---------- - valid_data : pd.DataFrame - Output of find_exponential_decay_zone; columns 'zdepth' and - 'log_photon_counts'. - - Returns - ------- - tuple[float, float] - (kd, e0). kd is set to np.nan if negative or if fewer than 4 points. - """ - if valid_data["log_photon_counts"].notna().sum() <= 3: - return np.nan, np.nan - - zdepth_valid = valid_data["zdepth"].values.reshape(-1, 1) - log_counts_valid = valid_data["log_photon_counts"].values - - model = LinearRegression() - model.fit(zdepth_valid, log_counts_valid) - - kd = -model.coef_[0] - e0 = np.exp(model.intercept_) - - if kd < 0: - kd = np.nan - return kd, e0 - - -# --------------------------------------------------------------------------- -# Strategy A: Background subtraction + log-linear fit -# --------------------------------------------------------------------------- -def fit_beers_law_bg_subtract(hist_df, bg_fraction=0.2): - """ - Estimate the noise floor from the deepest bins, subtract it, then fit - Beer's Law in log-space on the noise-subtracted counts. - - Parameters - ---------- - hist_df : pd.DataFrame - Columns: 'zdepth' (ascending from surface), 'photon_counts'. 
- bg_fraction : float - Fraction of the deepest bins used to estimate the background level. - - Returns - ------- - tuple[float, float, float] - (kd, e0, noise_floor). - """ - df = hist_df.copy().sort_values("zdepth") - n_bins = len(df) - if n_bins < 5: - return np.nan, np.nan, np.nan - - # Estimate noise floor from the deepest bg_fraction of bins - n_bg = max(int(n_bins * bg_fraction), 2) - noise_floor = float(df.tail(n_bg)["photon_counts"].median()) - - # Subtract noise and keep only positive residuals - df["signal"] = df["photon_counts"] - noise_floor - df = df[df["signal"] > 0].copy() - if len(df) < 4: - return np.nan, np.nan, noise_floor - - df["log_signal"] = np.log(df["signal"]) - - zdepth = df["zdepth"].values.reshape(-1, 1) - log_signal = df["log_signal"].values - - model = LinearRegression() - model.fit(zdepth, log_signal) - - kd = -model.coef_[0] - e0 = np.exp(model.intercept_) - - if kd < 0: - kd = np.nan - return kd, e0, noise_floor - - -# --------------------------------------------------------------------------- -# Strategy B: Breakpoint / segmented regression -# --------------------------------------------------------------------------- -def fit_beers_law_breakpoint(hist_df): - """ - Find the optimal breakpoint between exponential decay and noise floor, - then fit Beer's Law only to the decay segment. - - The breakpoint is chosen by testing every candidate position and selecting - the one that minimises total residual sum of squares of a two-segment - model: linear slope above the breakpoint, flat constant below. - - Parameters - ---------- - hist_df : pd.DataFrame - Columns: 'zdepth' (ascending from surface), 'photon_counts'. - - Returns - ------- - tuple[float, float, float, float] - (kd, e0, breakpoint_depth, noise_floor). 
- """ - df = hist_df.copy().sort_values("zdepth") - df = df[df["photon_counts"] > 0].copy() - if len(df) < 5: - return np.nan, np.nan, np.nan, np.nan - - df["log_counts"] = np.log(df["photon_counts"]) - depths = df["zdepth"].values - log_counts = df["log_counts"].values - n = len(depths) - - best_rss = np.inf - best_bp_idx = None - - # Try each candidate breakpoint (need >= 4 points in decay segment, - # >= 1 in noise segment) - for bp_idx in range(4, n - 1): - # Decay segment: linear fit on bins 0..bp_idx-1 - z_decay = depths[:bp_idx].reshape(-1, 1) - lc_decay = log_counts[:bp_idx] - model = LinearRegression() - model.fit(z_decay, lc_decay) - rss_decay = float(np.sum((model.predict(z_decay) - lc_decay) ** 2)) - - # Noise segment: flat at the mean of bins bp_idx..end - lc_noise = log_counts[bp_idx:] - noise_mean = lc_noise.mean() - rss_noise = float(np.sum((lc_noise - noise_mean) ** 2)) - - total_rss = rss_decay + rss_noise - if total_rss < best_rss: - best_rss = total_rss - best_bp_idx = bp_idx - - if best_bp_idx is None: - return np.nan, np.nan, np.nan, np.nan - - # Final fit on the decay segment - z_decay = depths[:best_bp_idx].reshape(-1, 1) - lc_decay = log_counts[:best_bp_idx] - model = LinearRegression() - model.fit(z_decay, lc_decay) - - kd = -model.coef_[0] - e0 = np.exp(model.intercept_) - breakpoint_depth = float(depths[best_bp_idx]) - noise_floor = float(np.exp(log_counts[best_bp_idx:].mean())) - - if kd < 0: - kd = np.nan - return kd, e0, breakpoint_depth, noise_floor - - -# --------------------------------------------------------------------------- -# Strategy C: Nonlinear fit C(z) = A * exp(-Kd * z) + N -# --------------------------------------------------------------------------- -def _beer_plus_noise(z, A, kd, N): - """Model: signal = A * exp(-kd * z) + N.""" - return A * np.exp(-kd * z) + N - - -def fit_beers_law_nonlinear(hist_df): - """ - Fit the physically correct model C(z) = A·exp(-Kd·z) + N directly - to raw photon counts using nonlinear 
least-squares (no log transform). - - Parameters - ---------- - hist_df : pd.DataFrame - Columns: 'zdepth' (ascending from surface), 'photon_counts'. - - Returns - ------- - tuple[float, float, float] - (kd, e0, noise_floor). - """ - df = hist_df.copy().sort_values("zdepth") - depths = df["zdepth"].values - counts = df["photon_counts"].values.astype(float) - - if len(depths) < 5: - return np.nan, np.nan, np.nan - - # Initial guesses - A0 = float(counts.max()) - N0 = float(np.median(counts[len(counts) * 3 // 4 :])) # deepest 25% - # Quick log-linear Kd estimate for initial guess (ignore noise) - pos = counts > 0 - if pos.sum() >= 2: - log_c = np.log(counts[pos]) - z_pos = depths[pos] - kd0 = max(float(-(log_c[-1] - log_c[0]) / (z_pos[-1] - z_pos[0] + 1e-9)), 0.01) - else: - kd0 = 0.1 - - try: - popt, _ = curve_fit( - _beer_plus_noise, - depths, - counts, - p0=[A0, kd0, N0], - bounds=([0, 0, 0], [np.inf, np.inf, np.inf]), - maxfev=5000, - ) - A_fit, kd_fit, N_fit = popt - if kd_fit <= 0: - kd_fit = np.nan - return float(kd_fit), float(A_fit), float(N_fit) - except (RuntimeError, ValueError): - return np.nan, np.nan, np.nan - - -# --------------------------------------------------------------------------- -# Hybrid: Stabilised breakpoint zone detection + log-linear fit on decay only -# --------------------------------------------------------------------------- -def fit_beers_law_hybrid( - hist_df, - min_breakpoint_depth=2.0, - expected_noise_floor=None, - noise_floor_tolerance=3.0, -): - """ - Two-stage approach matching the flowchart intent: - Step 16 — Find where exponential decay transitions to noise floor - (stabilised breakpoint with BIC selection). - Step 17 — Fit Beer's Law log-linear ONLY on the decay segment. - - Stabilisation constraints: - * Minimum breakpoint depth prevents shallow breakpoints that yield - unstable regression from too few decay bins. - * Decay segment slope must be negative (Kd > 0). 
- * BIC model selection penalises over-fitting, preventing the breakpoint - from drifting too deep (where the decay segment gets long and noisy). - * When expected_noise_floor is provided (from beam-level median), - candidate breakpoints whose noise segment deviates too far from the - expected value are penalised. - * Falls back to full log-linear if no valid breakpoint improves BIC - over a single-line model. - - Parameters - ---------- - hist_df : pd.DataFrame - Columns: 'zdepth' (ascending from surface), 'photon_counts'. - min_breakpoint_depth : float - Minimum depth (m) below surface for a valid breakpoint. Prevents - breakpoints landing in the first few bins where regression is - unstable. Default 2.0 m. - expected_noise_floor : float or None - If provided, the beam-level median noise floor (photon counts). - Candidate breakpoints whose noise segment mean deviates by more - than ``noise_floor_tolerance`` times from the expected value - receive a BIC penalty. - noise_floor_tolerance : float - Factor controlling how far the candidate noise segment mean may - deviate from ``expected_noise_floor`` before a penalty is applied. - Default 3.0 (allow 3x variation). - - Returns - ------- - tuple[float, float, float, float] - (kd, e0, breakpoint_depth, noise_floor). - breakpoint_depth and noise_floor are np.nan if fallback to full fit. 
- """ - df = hist_df.copy().sort_values("zdepth") - df = df[df["photon_counts"] > 0].copy() - n = len(df) - if n < 6: - return np.nan, np.nan, np.nan, np.nan - - df["log_counts"] = np.log(df["photon_counts"]) - depths = df["zdepth"].values - log_counts = df["log_counts"].values - - # ------------------------------------------------------------------ - # Reference: BIC for a single-line model (no breakpoint) - # ------------------------------------------------------------------ - model_full = LinearRegression() - model_full.fit(depths.reshape(-1, 1), log_counts) - rss_full = float( - np.sum((model_full.predict(depths.reshape(-1, 1)) - log_counts) ** 2) - ) - k_full = 2 # slope + intercept - bic_full = n * np.log(rss_full / n + 1e-10) + k_full * np.log(n) - - # ------------------------------------------------------------------ - # Search: best breakpoint using BIC - # ------------------------------------------------------------------ - best_bic = bic_full # must beat the single-line model - best_bp_idx = None - best_model = None - - for bp_idx in range(4, n - 1): - # -- Minimum breakpoint depth constraint -- - if depths[bp_idx] < min_breakpoint_depth: - continue - - # -- Decay segment (surface to breakpoint) -- - z_decay = depths[:bp_idx].reshape(-1, 1) - lc_decay = log_counts[:bp_idx] - model = LinearRegression() - model.fit(z_decay, lc_decay) - - # Physical constraint: slope must be negative - if model.coef_[0] >= 0: - continue - - pred_decay = model.predict(z_decay) - rss_decay = float(np.sum((pred_decay - lc_decay) ** 2)) - - # -- Noise segment (breakpoint to bottom) -- - lc_noise = log_counts[bp_idx:] - noise_mean = float(lc_noise.mean()) - rss_noise = float(np.sum((lc_noise - noise_mean) ** 2)) - - # -- BIC for piecewise model (3 params: slope, intercept, noise level) - rss_total = rss_decay + rss_noise - k_bp = 3 - bic_bp = n * np.log(rss_total / n + 1e-10) + k_bp * np.log(n) - - # -- Noise floor consistency penalty -- - # When we have a beam-level expected 
noise floor, penalise - # candidates whose noise segment deviates substantially. - if expected_noise_floor is not None and expected_noise_floor > 0: - candidate_nf = float(np.exp(noise_mean)) - ratio = candidate_nf / expected_noise_floor - if ratio > noise_floor_tolerance or ratio < 1.0 / noise_floor_tolerance: - # Add a penalty proportional to log-deviation - bic_bp += n * abs(np.log(ratio)) - - if bic_bp < best_bic: - best_bic = bic_bp - best_bp_idx = bp_idx - best_model = model - - # ------------------------------------------------------------------ - # Result - # ------------------------------------------------------------------ - if best_bp_idx is None: - # No breakpoint beats the single-line model — fall back - kd = -model_full.coef_[0] - e0 = np.exp(model_full.intercept_) - if kd < 0: - kd = np.nan - return kd, e0, np.nan, np.nan - - kd = -best_model.coef_[0] - e0 = np.exp(best_model.intercept_) - breakpoint_depth = float(depths[best_bp_idx]) - noise_floor = float(np.exp(log_counts[best_bp_idx:].mean())) - - if kd < 0: - kd = np.nan - return kd, e0, breakpoint_depth, noise_floor - - -# --------------------------------------------------------------------------- -# Dispatcher: select fitting strategy by name -# --------------------------------------------------------------------------- -KD_FIT_METHODS = ("log_linear", "bg_subtract", "breakpoint", "nonlinear", "hybrid") - - -def _fit_kd_with_method( - hist_df, method="log_linear", decay_threshold=0.0, expected_noise_floor=None -): - """ - Run the requested fitting strategy on a single histogram. - - Returns - ------- - tuple[float, float, float] - (kd, e0, noise_floor). noise_floor is np.nan for log_linear. 
- """ - if method == "log_linear": - valid = find_exponential_decay_zone(hist_df, decay_threshold=decay_threshold) - kd, e0 = fit_beers_law(valid) - return kd, e0, np.nan - - if method == "bg_subtract": - return fit_beers_law_bg_subtract(hist_df) - - if method == "breakpoint": - kd, e0, bp, nf = fit_beers_law_breakpoint(hist_df) - return kd, e0, nf - - if method == "nonlinear": - return fit_beers_law_nonlinear(hist_df) - - if method == "hybrid": - kd, e0, bp, nf = fit_beers_law_hybrid( - hist_df, expected_noise_floor=expected_noise_floor - ) - return kd, e0, nf - - raise ValueError( - f"Unknown kd_fit_method: {method!r}. " f"Choose from {KD_FIT_METHODS}" - ) - - -# another solution is to calculate kd without hist -def CalculateKdFromFilteredSubsurfacePhoton( - df, - vertical_res=0.8, - decay_zone_threshold=0.0, - kd_fit_method="log_linear", - expected_noise_floor=None, - surface_sigma=None, - wave_exclusion_multiplier=0.0, - wave_sigma_calm_threshold=0.1, -): - """ - Calculate Kd for a single along-track bin by fitting Beer's Law in log-space. - - Orchestrates steps 16 and 17 by calling find_exponential_decay_zone and - fit_beers_law in sequence. - - Flowchart steps: - 16 — find_exponential_decay_zone: determine depth range of exponential decay. - 17 — fit_beers_law: fit Beer's Law curve; kd = -slope, e0 = exp(intercept). 
- """ - # Early exit if DataFrame is empty or missing required column - if df.empty or "lat_bins" not in df.columns: - return pd.DataFrame( - { - "lat_bins": [np.nan], - "kd": [np.nan], - "e0": [np.nan], - "noise_floor": [np.nan], - "surface_sigma": [np.nan], - "latitude": [np.nan], - "longitude": [np.nan], - } - ) - - # Retrieve latitude and longitude - lat_bin_value = df["lat_bins"].iloc[0] if not df["lat_bins"].empty else np.nan - latitude = ( - df["latitude"].mean() - if "latitude" in df.columns - else df["lat"].mean() - if "lat" in df.columns - else np.nan - ) - longitude = ( - df["longitude"].mean() - if "longitude" in df.columns - else df["lon"].mean() - if "lon" in df.columns - else np.nan - ) - - # Use value_counts to get photon counts in each height bin - height_counts = df["height_bins"].value_counts().sort_index() - bin_centers = height_counts.index.astype(float) - - # Create a DataFrame for the height bins and counts - hist_df = pd.DataFrame( - {"zdepth": bin_centers, "photon_counts": height_counts.values} - ) - - # Reverse zdepth for model alignment - hist_df["zdepth"] = hist_df["zdepth"].max() - hist_df["zdepth"] - - # Wave-adaptive fit: skip shallow bins contaminated by wave smearing/bubbles. - # zdepth=0 in the histogram corresponds to the adaptive surface detection - # threshold: max(3*sigma, 1.0 m) below the Gaussian surface peak - # (set by get_sea_surface_height_adaptive). The wave-adaptive trim adds - # an additional margin of (k - 3)*sigma beyond the 3-sigma cutoff to - # cover bubble injection beneath wave troughs and compensate for sigma - # underestimation from the truncated ±1 m Gaussian fit window. 
- if surface_sigma is not None and wave_exclusion_multiplier > 0: - if not np.isnan(surface_sigma) and surface_sigma > wave_sigma_calm_threshold: - adaptive_offset = max(3.0 * surface_sigma, 1.0) - wave_skip = max( - 0.0, wave_exclusion_multiplier * surface_sigma - adaptive_offset - ) - trimmed = hist_df[hist_df["zdepth"] >= wave_skip] - if len(trimmed) >= 4: - logging.info( - "Wave-adaptive trim: sigma=%.3f, adaptive_offset=%.2f m, " - "skip=%.2f m, %d->%d bins", - surface_sigma, - adaptive_offset, - wave_skip, - len(hist_df), - len(trimmed), - ) - hist_df = trimmed - # else: keep original hist_df (too few bins after trim) - - kd, e0, noise_floor = _fit_kd_with_method( - hist_df, - method=kd_fit_method, - decay_threshold=decay_zone_threshold, - expected_noise_floor=expected_noise_floor, - ) - - return pd.DataFrame( - { - "lat_bins": [lat_bin_value], - "kd": [kd], - "e0": [e0], - "noise_floor": [noise_floor], - "surface_sigma": [surface_sigma if surface_sigma is not None else np.nan], - "latitude": [latitude], - "longitude": [longitude], - } - ) - - -# Original kd calculation function remains unchanged -def calculate_kd( - filtered_seafloor_subsurface_photon_dataset, - decay_zone_threshold=0.0, - kd_fit_method="log_linear", - wave_exclusion_multiplier=0.0, - wave_sigma_calm_threshold=0.1, -): - """ - Loop over along-track bins and call CalculateKdFromFilteredSubsurfacePhoton - for each bin. - - For the 'hybrid' method, a two-pass approach is used: - Pass 1 — Fit each bin independently to estimate per-bin noise floors. - Pass 2 — Compute the beam-level median noise floor, then re-fit each - bin with the expected noise floor as a stabilisation constraint. - - After fitting, IQR-based outlier filtering removes extreme Kd values - (for hybrid method only). - - Flowchart steps: - 16 — Determine depth of exponential decay zone (per bin, inside - CalculateKdFromFilteredSubsurfacePhoton). - 17 — Fit Beer's Law exponential decay curve (per bin). 
- 18 — Save K_dph attenuation coefficient value and statistical terms - (kd, e0 columns in the returned DataFrame). - """ - logging.info( - "Calculating Kd from filtered subsurface photon dataset (method=%s)", - kd_fit_method, - ) - - empty_df = pd.DataFrame( - { - "lat_bins": [], - "kd": [], - "e0": [], - "noise_floor": [], - "surface_sigma": [], - "latitude": [], - "longitude": [], - } - ) - - groups = list( - filtered_seafloor_subsurface_photon_dataset.groupby("lat_bins", observed=False) - ) - if not groups: - return empty_df - - if kd_fit_method == "hybrid": - # ------------------------------------------------------------------ - # Pass 1: estimate per-bin noise floors (no expected_noise_floor) - # ------------------------------------------------------------------ - logging.info("Hybrid pass 1: estimating per-bin noise floors") - pass1_results = [] - for _, group in groups: - sigma = ( - float(group["surface_sigma"].iloc[0]) - if ("surface_sigma" in group.columns and len(group) > 0) - else None - ) - pass1_results.append( - CalculateKdFromFilteredSubsurfacePhoton( - group, - decay_zone_threshold=decay_zone_threshold, - kd_fit_method=kd_fit_method, - surface_sigma=sigma, - wave_exclusion_multiplier=wave_exclusion_multiplier, - wave_sigma_calm_threshold=wave_sigma_calm_threshold, - ) - ) - if not pass1_results: - return empty_df - pass1_df = pd.concat(pass1_results, ignore_index=True) - - # ------------------------------------------------------------------ - # Compute sliding-window noise floor for pass 2 - # ------------------------------------------------------------------ - # 10 km window = ±10 bins at 500 m resolution (20 bins total). - # Each bin's expected noise floor = median of valid noise floors - # within ±half_window bins. Falls back to beam-level median when - # the local window has < 3 valid estimates. 
- HALF_WINDOW = 10 # ±10 bins = ±5 km at 500 m horizontal_res - MIN_LOCAL = 3 # minimum valid noise floors to use local median - - nf_array = pass1_df["noise_floor"].values.copy() - n_bins = len(nf_array) - - # Beam-level fallback - valid_nf_all = pass1_df["noise_floor"].dropna() - valid_nf_all = valid_nf_all[valid_nf_all > 0] - beam_median_nf = ( - float(valid_nf_all.median()) if len(valid_nf_all) >= MIN_LOCAL else None - ) - - if beam_median_nf is not None: - # Build per-bin expected noise floor via sliding window - expected_nf_per_bin = np.full(n_bins, np.nan) - for i in range(n_bins): - lo = max(0, i - HALF_WINDOW) - hi = min(n_bins, i + HALF_WINDOW + 1) - window_nf = nf_array[lo:hi] - valid = window_nf[~np.isnan(window_nf) & (window_nf > 0)] - if len(valid) >= MIN_LOCAL: - expected_nf_per_bin[i] = float(np.median(valid)) - else: - expected_nf_per_bin[i] = beam_median_nf # fallback - - logging.info( - "Hybrid pass 2: sliding window noise floor " - "(half_window=%d bins, beam_median=%.3f, " - "local range=%.3f-%.3f)", - HALF_WINDOW, - beam_median_nf, - float(np.nanmin(expected_nf_per_bin)), - float(np.nanmax(expected_nf_per_bin)), - ) - else: - expected_nf_per_bin = None - logging.info( - "Hybrid: insufficient noise floor estimates (%d), " "skipping pass 2", - len(valid_nf_all), - ) - - # ------------------------------------------------------------------ - # Pass 2: re-fit with per-bin expected noise floor constraint - # ------------------------------------------------------------------ - if expected_nf_per_bin is not None: - results = [] - for idx, (_, group) in enumerate(groups): - sigma = ( - float(group["surface_sigma"].iloc[0]) - if ("surface_sigma" in group.columns and len(group) > 0) - else None - ) - results.append( - CalculateKdFromFilteredSubsurfacePhoton( - group, - decay_zone_threshold=decay_zone_threshold, - kd_fit_method=kd_fit_method, - expected_noise_floor=float(expected_nf_per_bin[idx]), - surface_sigma=sigma, - 
wave_exclusion_multiplier=wave_exclusion_multiplier, - wave_sigma_calm_threshold=wave_sigma_calm_threshold, - ) - ) - SubsurfacePhotonDFAddedKd = pd.concat(results, ignore_index=True) - else: - SubsurfacePhotonDFAddedKd = pass1_df - - # ------------------------------------------------------------------ - # IQR-based outlier filtering (hybrid only) - # Uses 3×IQR to accommodate real spatial variation (e.g. turbidity - # gradients near river mouths) while removing fitting artefacts. - # ------------------------------------------------------------------ - kd_vals = SubsurfacePhotonDFAddedKd["kd"].dropna() - if len(kd_vals) >= 5: - q1 = float(kd_vals.quantile(0.25)) - q3 = float(kd_vals.quantile(0.75)) - iqr = q3 - q1 - lower = q1 - 3.0 * iqr - upper = q3 + 3.0 * iqr - # Also apply physical cap: Kd > 5.0 m⁻¹ is unrealistic - upper = min(upper, 5.0) - outlier_mask = (SubsurfacePhotonDFAddedKd["kd"] < lower) | ( - SubsurfacePhotonDFAddedKd["kd"] > upper - ) - n_outliers = int(outlier_mask.sum()) - if n_outliers > 0: - logging.info( - "Hybrid IQR filter: removed %d outliers (bounds: [%.4f, %.4f])", - n_outliers, - lower, - upper, - ) - SubsurfacePhotonDFAddedKd.loc[outlier_mask, "kd"] = np.nan - - else: - # ------------------------------------------------------------------ - # Non-hybrid methods: single pass, no IQR filtering - # ------------------------------------------------------------------ - results = [] - for _, group in groups: - sigma = ( - float(group["surface_sigma"].iloc[0]) - if ("surface_sigma" in group.columns and len(group) > 0) - else None - ) - results.append( - CalculateKdFromFilteredSubsurfacePhoton( - group, - decay_zone_threshold=decay_zone_threshold, - kd_fit_method=kd_fit_method, - surface_sigma=sigma, - wave_exclusion_multiplier=wave_exclusion_multiplier, - wave_sigma_calm_threshold=wave_sigma_calm_threshold, - ) - ) - if not results: - return empty_df - SubsurfacePhotonDFAddedKd = pd.concat(results, ignore_index=True) - - return 
SubsurfacePhotonDFAddedKd - - -# Updated function to apply kd calculation beam-by-beam -def process_kd_calculation( - Final_filtered_subsurface_photon_dataset, - decay_zone_threshold=0.0, - kd_fit_method="log_linear", - wave_exclusion_multiplier=0.0, - wave_sigma_calm_threshold=0.1, -): - """ - Beam-by-beam wrapper that calls calculate_kd for every beam and concatenates - the results into a single Kd output DataFrame. - - Flowchart steps: - 16 — Determine depth of exponential decay zone (delegated to - CalculateKdFromFilteredSubsurfacePhoton). - 17 — Fit Beer's Law exponential decay curve (delegated). - 18 — Save K_dph attenuation coefficient value (slope of the Beer's Law - curve) and statistical terms (kd, e0) — the returned DataFrame is - written to CSV by run_pipeline. - """ - # Initialize list to store results for each beam - kd_beam_datasets = [] - - # Group by 'beam_id' to process each beam independently - for beam_id, beam_data in Final_filtered_subsurface_photon_dataset.groupby( - "beam_id" - ): - logging.info(f"Calculating Kd for beam: {beam_id}") - - # Apply the calculate_kd function to the current beam's dataset - SubsurfacePhotonDFAddedKd = calculate_kd( - beam_data, - decay_zone_threshold=decay_zone_threshold, - kd_fit_method=kd_fit_method, - wave_exclusion_multiplier=wave_exclusion_multiplier, - wave_sigma_calm_threshold=wave_sigma_calm_threshold, - ) - - # Add a column to track the beam_id in the results - SubsurfacePhotonDFAddedKd["beam_id"] = beam_id - - # Append the result to the list - kd_beam_datasets.append(SubsurfacePhotonDFAddedKd) - - if not kd_beam_datasets: - logging.warning( - "process_kd_calculation: no beam data remained after filtering. " - "All bins may have been discarded by an upstream filter (e.g. histogram quality). " - "Returning empty DataFrame." 
- ) - return pd.DataFrame( - columns=[ - "lat_bins", - "kd", - "e0", - "noise_floor", - "surface_sigma", - "latitude", - "longitude", - "beam_id", - ] - ) - - # Combine results from all beams into a single DataFrame - combined_kd_dataset = pd.concat(kd_beam_datasets, ignore_index=True) - - return combined_kd_dataset diff --git a/icesat-2_kdph_py/kd_utils/SeafloorFilterByGEBCO_Module.py b/icesat-2_kdph_py/kd_utils/SeafloorFilterByGEBCO_Module.py deleted file mode 100644 index a1ea0df..0000000 --- a/icesat-2_kdph_py/kd_utils/SeafloorFilterByGEBCO_Module.py +++ /dev/null @@ -1,122 +0,0 @@ -import numpy as np -import rasterio -from rtree import index -from shapely.geometry import box - - -# 1. Function to create an R-tree spatial index for raster bounds -def create_spatial_index(gebco_paths): - """ - Create an R-tree spatial index for raster bounds to quickly find relevant rasters. - - Parameters: - gebco_paths (list): List of paths to GEBCO raster files. - - Returns: - raster_data_dict (dict): Dictionary containing loaded rasters and their respective data. - spatial_index (rtree.index.Index): R-tree spatial index for raster bounds. - """ - idx = index.Index() - raster_data_dict = {} - for i, path in enumerate(gebco_paths): - with rasterio.Env(GTIFF_SRS_SOURCE="EPSG"): - gebco_raster = rasterio.open(path) - raster_data = gebco_raster.read(1) - raster_data_dict[path] = (gebco_raster, raster_data) - bounds = gebco_raster.bounds - idx.insert( - i, (bounds.left, bounds.bottom, bounds.right, bounds.top), obj=path - ) - return raster_data_dict, idx - - -# 2. Function to determine which rasters are needed for a given set of coordinates using spatial index in a batch manner -def get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index): - """ - Determine which raster datasets are relevant for the given coordinates using spatial index. - This function uses a more efficient approach by performing a bounding box query for batches of points. 
- - Parameters: - lons (numpy.ndarray): Array of longitudes. - lats (numpy.ndarray): Array of latitudes. - raster_data_dict (dict): Dictionary containing raster datasets and their data. - spatial_index (rtree.index.Index): R-tree spatial index for raster bounds. - - Returns: - relevant_rasters (list): List of relevant raster datasets and their respective data. - """ - # Create a bounding box that covers all points - min_lon, max_lon = lons.min(), lons.max() - min_lat, max_lat = lats.min(), lats.max() - bounding_box = box(min_lon, min_lat, max_lon, max_lat) - - # Get all rasters that intersect with the bounding box - matches = list(spatial_index.intersection((bounding_box.bounds), objects=True)) - relevant_paths = {match.object for match in matches} - relevant_rasters = [ - (raster_data_dict[path][0], raster_data_dict[path][1]) - for path in relevant_paths - ] - return relevant_rasters - - -# 3. Function to get seafloor elevation from the relevant rasters in a vectorized manner -def get_seafloor_elevation(lons, lats, relevant_rasters): - """ - Get seafloor elevations for a batch of points based on longitude and latitude from relevant rasters. - - Parameters: - lons (numpy.ndarray): Array of longitudes. - lats (numpy.ndarray): Array of latitudes. - relevant_rasters (list): List of relevant raster datasets and their respective data. - - Returns: - seafloor_elevations (numpy.ndarray): Array of seafloor elevations. - """ - points = np.vstack((lons, lats)).T - seafloor_elevations = np.full(len(lons), np.nan) - - for gebco_raster, raster_data in relevant_rasters: - # Use rasterio.sample to get values for multiple points in a batch - values = list(gebco_raster.sample(points)) - for idx, value in enumerate(values): - if np.isnan(seafloor_elevations[idx]) and value is not None: - seafloor_elevations[idx] = value[0] - - return seafloor_elevations - - -# 4. 
The main process function that ties everything together -def process_seafloor_data(gebco_paths, sea_photon_dataset): - """ - Main function to process the seafloor data. - - Parameters: - gebco_paths (list): List of paths to GEBCO raster files. - sea_photon_dataset (pandas.DataFrame): Dataset containing longitude, latitude, and photon height. - - Returns: - filtered_sea_photon_dataset (pandas.DataFrame): Filtered dataset containing points above the seafloor. - """ - # Create spatial index and load GEBCO Raster Data - raster_data_dict, spatial_index = create_spatial_index(gebco_paths) - - # Get the relevant rasters using spatial index - lons = sea_photon_dataset["longitude"].values - lats = sea_photon_dataset["latitude"].values - relevant_rasters = get_relevant_rasters_using_index( - lons, lats, raster_data_dict, spatial_index - ) - - # Get seafloor elevation for all points - sea_photon_dataset["seafloor_elevation"] = get_seafloor_elevation( - lons, lats, relevant_rasters - ) - - # Filter out points below the seafloor - filtered_sea_photon_dataset = sea_photon_dataset[ - sea_photon_dataset["photon_height"] > sea_photon_dataset["seafloor_elevation"] - ] - # filtered_sea_photon_dataset.drop(columns=['seafloor_elevation'], inplace=True) - - return filtered_sea_photon_dataset diff --git a/icesat-2_kdph_py/kd_utils/__init__.py b/icesat-2_kdph_py/kd_utils/__init__.py deleted file mode 100644 index 8976647..0000000 --- a/icesat-2_kdph_py/kd_utils/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# utils/__init__.py - -from .bathy_processing import * -from .data_processing import ( - Extract_sea_photons, - create_photon_dataframe, - extract_file_params, - load_data, -) -from .interpolation import ( - apply_interpolation, - geoid_correction, - interpolate_labels, - refraction_correction, -) -from .Kd_analysis import CalculateKdFromFilteredSubsurfacePhoton -from .sea_photons_analysis import * -from .visualization import plot_kd_photons, plot_photon_height diff --git 
a/icesat-2_kdph_py/kd_utils/bathy_processing.py b/icesat-2_kdph_py/kd_utils/bathy_processing.py deleted file mode 100644 index f29418b..0000000 --- a/icesat-2_kdph_py/kd_utils/bathy_processing.py +++ /dev/null @@ -1,1101 +0,0 @@ -# utils/bathy_processing.py -import logging -import os - -import numpy as np -import pandas as pd -import rasterio -from rtree import index -from scipy.spatial import cKDTree -from scipy.stats import norm -from shapely.geometry import box - -from kd_utils.sea_photons_analysis import ( - get_sea_surface_height_adaptive, - horizontal_vertical_bin_dataset, -) - - -def apply_optional_histogram_quality_filter( - binned_dataset_sea_surface, - subsurface_photon_dataset, - sea_surface_height, - min_ratio=0.05, - depth_min=6.0, - depth_max=7.0, - reference_depth_min=0.0, - reference_depth_max=1.0, -): - """ - Optional histogram quality check: keep along-track bins only if enough - signal remains at the target depth band relative to near-surface signal. - - Flowchart step: 6 — Review histograms for quality: if <5% of signal remains - at an uncorrected water depth of 6-7 m below the surface, quality-flag this - along-track bin as low confidence and exclude it from further analyses - (Possible discard). - - The decay ratio is computed as: - photon_count_in_depth_band / photon_count_in_reference_band - - This measures how much signal has decayed at depth compared to near the - surface, consistent with Beer's Law attenuation. - - Parameters - ---------- - binned_dataset_sea_surface : pd.DataFrame - Full binned photon dataset with 'lat_bins' and 'photon_height'. - subsurface_photon_dataset : pd.DataFrame - Subsurface photon dataset to filter. - sea_surface_height : list - Sea surface height per along-track bin. - min_ratio : float - Minimum decay ratio to keep a bin (default 0.05 = 5%). - depth_min : float - Minimum depth of the quality-check band (default 6.0 m). - depth_max : float - Maximum depth of the quality-check band (default 7.0 m). 
- reference_depth_min : float - Minimum depth of the near-surface reference band (default 0.0 m). - reference_depth_max : float - Maximum depth of the near-surface reference band (default 1.0 m). - """ - if subsurface_photon_dataset.empty: - return subsurface_photon_dataset - - grouped = binned_dataset_sea_surface.groupby(["lat_bins"], observed=False) - lat_bin_keys = list(grouped.groups.keys()) - if len(lat_bin_keys) != len(sea_surface_height): - return subsurface_photon_dataset - - quality_df = ( - pd.DataFrame( - {"lat_bins": lat_bin_keys, "sea_surface_height": sea_surface_height} - ) - .dropna(subset=["sea_surface_height"]) - .copy() - ) - if quality_df.empty: - return subsurface_photon_dataset.iloc[0:0].copy() - - ratios = [] - for _, row in quality_df.iterrows(): - lat_bin = row["lat_bins"] - surface = row["sea_surface_height"] - bin_data = binned_dataset_sea_surface[ - binned_dataset_sea_surface["lat_bins"] == lat_bin - ] - if bin_data.empty: - ratios.append(0.0) - continue - - depth = surface - bin_data["photon_height"] - underwater_mask = depth > 0 - - # Reference band: near-surface photon count (incoming signal) - ref_mask = ( - underwater_mask - & (depth >= reference_depth_min) - & (depth <= reference_depth_max) - ) - ref_count = float(ref_mask.sum()) - if ref_count == 0: - ratios.append(0.0) - continue - - # Target band: photon count at the check depth - band_mask = underwater_mask & (depth >= depth_min) & (depth <= depth_max) - band_count = float(band_mask.sum()) - - # Normalize by band width so bands of different sizes are comparable - ref_width = max(reference_depth_max - reference_depth_min, 0.01) - band_width = max(depth_max - depth_min, 0.01) - decay_ratio = (band_count / band_width) / (ref_count / ref_width) - - ratios.append(decay_ratio) - - quality_df["hist_quality_ratio"] = ratios - valid_bins = set( - quality_df.loc[ - quality_df["hist_quality_ratio"] >= min_ratio, "lat_bins" - ].tolist() - ) - return subsurface_photon_dataset[ - 
subsurface_photon_dataset["lat_bins"].isin(valid_bins) - ].copy() - - -def compute_surface_sigma(binned_dataset_sea_surface, sea_surface_height): - """Compute per-bin Gaussian surface sigma from photons near the surface peak. - - Uses a two-pass fit: initial ±1 m window, then expands to ±min(2.5σ, 3 m) - if the initial sigma > 0.4 m (indicating the ±1 m window truncates the - distribution). Returns a dict mapping lat_bin -> sigma (float or NaN). - Bins with fewer than 10 surface photons get NaN. - """ - grouped = binned_dataset_sea_surface.groupby(["lat_bins"], observed=False) - lat_bin_keys = list(grouped.groups.keys()) - if len(lat_bin_keys) != len(sea_surface_height): - return {} - - sigma_map = {} - for lat_bin, surface in zip(lat_bin_keys, sea_surface_height, strict=False): - if pd.isna(surface): - continue - bin_data = binned_dataset_sea_surface[ - binned_dataset_sea_surface["lat_bins"] == lat_bin - ] - if bin_data.empty: - continue - peak_data = bin_data[ - (bin_data["photon_height"] > surface - 1.0) - & (bin_data["photon_height"] < surface + 1.0) - ] - if len(peak_data) > 10: - mu, sigma = norm.fit(peak_data["photon_height"]) - if sigma > 0.4: - half_win2 = min(2.5 * sigma, 3.0) - peak_data2 = bin_data[ - (bin_data["photon_height"] > mu - half_win2) - & (bin_data["photon_height"] < mu + half_win2) - ] - if len(peak_data2) > 10: - _, sigma = norm.fit(peak_data2["photon_height"]) - else: - sigma = np.nan - sigma_map[lat_bin] = sigma - - return sigma_map - - -def apply_optional_surface_sigma_filter( - binned_dataset_sea_surface, - subsurface_photon_dataset, - sea_surface_height, - sigma_max=0.5, -): - """ - Optional Gaussian surface sigma filter: discard along-track bins if the - fitted surface Gaussian sigma exceeds sigma_max. - - Flowchart step: 8 — If standard deviation of Gaussian peak is >X m, - quality-flag this along-track bin as low confidence and discard it - (addresses surface waves) (Possible discard). 
- """ - if subsurface_photon_dataset.empty: - return subsurface_photon_dataset - - sigma_map = compute_surface_sigma(binned_dataset_sea_surface, sea_surface_height) - if not sigma_map: - return subsurface_photon_dataset - - sigma_df = pd.DataFrame( - list(sigma_map.items()), columns=["lat_bins", "surface_sigma"] - ) - valid_bins = set( - sigma_df.loc[sigma_df["surface_sigma"] <= sigma_max, "lat_bins"].tolist() - ) - return subsurface_photon_dataset[ - subsurface_photon_dataset["lat_bins"].isin(valid_bins) - ].copy() - - -def apply_optional_refraction_correction( - binned_dataset_sea_surface, - subsurface_photon_dataset, - sea_surface_height, - water_temp_c=20.0, - wavelength_nm=532.0, -): - """ - Optional refraction correction for subsurface photons. - Updates photon positions/heights using Snell-based geometry. - - Flowchart step: 9 — Compute water depths (distance below surface) and - correct depths for refraction. - """ - required_cols = { - "lat_bins", - "photon_height", - "ref_elevation", - "ref_azimuth", - "lon", - "lat", - } - if subsurface_photon_dataset.empty or ( - not required_cols.issubset(set(subsurface_photon_dataset.columns)) - ): - return subsurface_photon_dataset - - grouped = binned_dataset_sea_surface.groupby(["lat_bins"], observed=False) - lat_bin_keys = list(grouped.groups.keys()) - if len(lat_bin_keys) != len(sea_surface_height): - return subsurface_photon_dataset - - surface_df = pd.DataFrame( - {"lat_bins": lat_bin_keys, "sea_surface_height": sea_surface_height} - ) - corrected = subsurface_photon_dataset.merge( - surface_df, on="lat_bins", how="left" - ).copy() - if corrected["sea_surface_height"].isna().all(): - return subsurface_photon_dataset - - corrected["photon_height_pre_refraction"] = corrected["photon_height"] - corrected["lon_pre_refraction"] = corrected["lon"] - corrected["lat_pre_refraction"] = corrected["lat"] - - # Refraction index parameterization from legacy module. 
- a = -0.000001501562500 - b = 0.000000107084865 - c = -0.000042759374989 - d = -0.000160475520686 - e = 1.398067112092424 - n1 = 1.00029 - n2 = ( - (a * water_temp_c**2) - + (b * wavelength_nm**2) - + (c * water_temp_c) - + (d * wavelength_nm) - + e - ) - - ref_elev = pd.to_numeric(corrected["ref_elevation"], errors="coerce").to_numpy( - dtype=float - ) - ref_az = pd.to_numeric(corrected["ref_azimuth"], errors="coerce").to_numpy( - dtype=float - ) - z = pd.to_numeric(corrected["photon_height"], errors="coerce").to_numpy(dtype=float) - ws = pd.to_numeric(corrected["sea_surface_height"], errors="coerce").to_numpy( - dtype=float - ) - x = pd.to_numeric(corrected["lon"], errors="coerce").to_numpy(dtype=float) - y = pd.to_numeric(corrected["lat"], errors="coerce").to_numpy(dtype=float) - - # Convert to radians if values look like degrees. - if np.nanmax(np.abs(ref_elev)) > (2 * np.pi): - ref_elev = np.deg2rad(ref_elev) - if np.nanmax(np.abs(ref_az)) > (2 * np.pi): - ref_az = np.deg2rad(ref_az) - - valid_mask = ( - np.isfinite(ref_elev) - & np.isfinite(ref_az) - & np.isfinite(z) - & np.isfinite(ws) - & (z <= ws) - ) - if not np.any(valid_mask): - return subsurface_photon_dataset - - theta1 = (np.pi / 2.0) - ref_elev[valid_mask] - theta2_arg = (n1 * np.sin(theta1)) / n2 - theta2_arg = np.clip(theta2_arg, -1.0, 1.0) - theta2 = np.arcsin(theta2_arg) - - D = ws[valid_mask] - z[valid_mask] - cos_theta1 = np.cos(theta1) - cos_theta1 = np.where(np.abs(cos_theta1) < 1e-6, np.nan, cos_theta1) - S = D / cos_theta1 - R = (S * n1) / n2 - Gamma = (np.pi / 2.0) - theta1 - phi = theta1 - theta2 - P_sq = np.maximum(R**2 + S**2 - 2 * R * S * np.cos(phi), 0.0) - P = np.sqrt(P_sq) - alpha_arg = np.divide(R * np.sin(phi), P, out=np.zeros_like(P), where=(P != 0)) - alpha_arg = np.clip(alpha_arg, -1.0, 1.0) - alpha = np.arcsin(alpha_arg) - Beta = Gamma - alpha - - DY = P * np.cos(Beta) - DZ = P * np.sin(Beta) - DE = DY * np.sin(ref_az[valid_mask]) - DN = DY * np.cos(ref_az[valid_mask]) - - 
x_corr = x[valid_mask] + DE - y_corr = y[valid_mask] + DN - z_corr = z[valid_mask] + DZ - - corrected.loc[valid_mask, "lon"] = x_corr - corrected.loc[valid_mask, "lat"] = y_corr - corrected.loc[valid_mask, "photon_height"] = z_corr - return corrected - - -def apply_sea_surface_flattening( - subsurface_photon_dataset, - sea_surface_height, - lat_bin_keys, - horizontal_res, - flattening_window_m=500, -): - """ - Flatten small-bin sea-surface variations to the mean sea level of larger along-track windows. - This follows the standalone SeaSurfaceFlattening module logic, but is optional. - """ - if subsurface_photon_dataset.empty: - return subsurface_photon_dataset - - if len(sea_surface_height) != len(lat_bin_keys): - return subsurface_photon_dataset - - surface_df = ( - pd.DataFrame( - {"lat_bins": lat_bin_keys, "sea_surface_height_local": sea_surface_height} - ) - .dropna(subset=["sea_surface_height_local"]) - .copy() - ) - if surface_df.empty: - return subsurface_photon_dataset - - surface_df["lat_bins_num"] = pd.to_numeric(surface_df["lat_bins"], errors="coerce") - surface_df = surface_df.dropna(subset=["lat_bins_num"]) - if surface_df.empty: - return subsurface_photon_dataset - - big_bin_size = max(1, int(round(flattening_window_m / max(horizontal_res, 1)))) - if big_bin_size <= 1: - logging.warning( - "Sea surface flattening window (%d m) <= horizontal_res (%d m) → " - "big_bin_size=1, flattening has no effect. 
" - "Use --sea_surface_flattening_window_m > --horizontal_res.", - flattening_window_m, - horizontal_res, - ) - surface_df["big_bin"] = ( - surface_df["lat_bins_num"].astype(int) // big_bin_size - ).astype(int) - mean_surface = ( - surface_df.groupby("big_bin", observed=False)["sea_surface_height_local"] - .mean() - .rename("sea_surface_height_bigbin_mean") - .reset_index() - ) - surface_df = surface_df.merge(mean_surface, on="big_bin", how="left") - surface_df["sea_surface_flattening_offset"] = ( - surface_df["sea_surface_height_bigbin_mean"] - - surface_df["sea_surface_height_local"] - ) - - flattened = subsurface_photon_dataset.copy() - flattened["lat_bins_num"] = pd.to_numeric(flattened["lat_bins"], errors="coerce") - flattened["big_bin"] = ( - flattened["lat_bins_num"].fillna(-1).astype(int) // big_bin_size - ).astype(int) - flattened = flattened.merge( - surface_df[ - [ - "lat_bins", - "sea_surface_height_local", - "sea_surface_height_bigbin_mean", - "sea_surface_flattening_offset", - ] - ], - on="lat_bins", - how="left", - ) - flattened["photon_height_original"] = flattened["photon_height"] - flattened["photon_height"] = flattened["photon_height"] - flattened[ - "sea_surface_flattening_offset" - ].fillna(0.0) - - # Re-bin height_bins from flattened photon_height so that downstream - # Kd calculation (which reads height_bins) uses the corrected depths. 
- if "height_bins" in flattened.columns: - vertical_res_actual = horizontal_res # not used; infer from existing bins - # Infer the vertical resolution from the original height_bins spacing - orig_bins = ( - pd.to_numeric(flattened["height_bins"], errors="coerce").dropna().unique() - ) - if len(orig_bins) >= 2: - sorted_bins = np.sort(orig_bins) - diffs = np.diff(sorted_bins) - v_res = float(np.median(diffs[diffs > 0])) if np.any(diffs > 0) else 0.25 - else: - v_res = 0.25 - h_min = flattened["photon_height"].min() - h_max = flattened["photon_height"].max() - n_hbins = max(1, int(round((h_max - h_min) / v_res))) - bin_edges = np.linspace(h_min, h_max, n_hbins + 1) - bin_labels = np.round((bin_edges[:-1] + bin_edges[1:]) / 2, decimals=1) - flattened["height_bins"] = pd.cut( - flattened["photon_height"], - bins=bin_edges, - labels=bin_labels, - include_lowest=True, - ) - - return flattened.drop(columns=["lat_bins_num", "big_bin"], errors="ignore") - - -# 1. Function to create an R-tree spatial index for raster bounds -def create_spatial_index(gebco_paths): - """ - Create an R-tree spatial index for raster bounds to quickly find relevant rasters. - - Parameters: - gebco_paths (list): List of paths to GEBCO raster files. - - Returns: - raster_data_dict (dict): Dictionary containing loaded rasters and their respective data. - spatial_index (rtree.index.Index): R-tree spatial index for raster bounds. - """ - idx = index.Index() - raster_data_dict = {} - for i, path in enumerate(gebco_paths): - with rasterio.Env(GTIFF_SRS_SOURCE="EPSG"): - gebco_raster = rasterio.open(path) - raster_data = gebco_raster.read(1) - raster_data_dict[path] = (gebco_raster, raster_data) - bounds = gebco_raster.bounds - idx.insert( - i, (bounds.left, bounds.bottom, bounds.right, bounds.top), obj=path - ) - return raster_data_dict, idx - - -# 2. 
Function to determine which rasters are needed for a given set of coordinates using spatial index in a batch manner -def get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index): - """ - Determine which raster datasets are relevant for the given coordinates using spatial index. - This function uses a more efficient approach by performing a bounding box query for batches of points. - - Parameters: - lons (numpy.ndarray): Array of longitudes. - lats (numpy.ndarray): Array of latitudes. - raster_data_dict (dict): Dictionary containing raster datasets and their data. - spatial_index (rtree.index.Index): R-tree spatial index for raster bounds. - - Returns: - relevant_rasters (list): List of relevant raster datasets and their respective data. - """ - # Create a bounding box that covers all points - min_lon, max_lon = lons.min(), lons.max() - min_lat, max_lat = lats.min(), lats.max() - bounding_box = box(min_lon, min_lat, max_lon, max_lat) - - # Get all rasters that intersect with the bounding box - matches = list(spatial_index.intersection((bounding_box.bounds), objects=True)) - relevant_paths = {match.object for match in matches} - relevant_rasters = [ - (raster_data_dict[path][0], raster_data_dict[path][1]) - for path in relevant_paths - ] - return relevant_rasters - - -# 3. Function to get seafloor elevation from the relevant rasters in a vectorized manner -def get_seafloor_elevation(lons, lats, relevant_rasters): - """ - Get seafloor elevations for a batch of points based on longitude and latitude from relevant rasters. - - Parameters: - lons (numpy.ndarray): Array of longitudes. - lats (numpy.ndarray): Array of latitudes. - relevant_rasters (list): List of relevant raster datasets and their respective data. - - Returns: - seafloor_elevations (numpy.ndarray): Array of seafloor elevations. 
- """ - points = np.vstack((lons, lats)).T - seafloor_elevations = np.full(len(lons), np.nan) - - for gebco_raster, raster_data in relevant_rasters: - # Use rasterio.sample to get values for multiple points in a batch - values = list(gebco_raster.sample(points)) - for idx, value in enumerate(values): - if np.isnan(seafloor_elevations[idx]) and value is not None: - seafloor_elevations[idx] = value[0] - - return seafloor_elevations - - -def query_gebco_seafloor_elevation(sea_photon_dataset, gebco_paths): - """ - Query GEBCO raster tiles and attach a 'seafloor_elevation' column to the - photon dataset. Does not remove any rows. - - Flowchart step: 12 — Retrieve GEBCO data (or other regional raster - bathymetry data) for the region when ATL24 is unavailable or returns no - matching points. - - Parameters - ---------- - sea_photon_dataset : pd.DataFrame - Must contain 'longitude' and 'latitude' columns. - gebco_paths : list - Paths to GEBCO GeoTIFF raster files. - - Returns - ------- - pd.DataFrame - Input dataset with 'seafloor_elevation' column added (negative = below - sea level). - """ - dataset = sea_photon_dataset.copy() - raster_data_dict, spatial_index = create_spatial_index(gebco_paths) - lons = dataset["longitude"].values - lats = dataset["latitude"].values - relevant_rasters = get_relevant_rasters_using_index( - lons, lats, raster_data_dict, spatial_index - ) - dataset["seafloor_elevation"] = get_seafloor_elevation(lons, lats, relevant_rasters) - return dataset - - -def remove_photons_below_seafloor(sea_photon_dataset): - """ - Remove photons whose height is at or below the seabed elevation. - - Flowchart step: 13 — Remove data below seabed (seabed defined by ATL24 or - GEBCO). - - Parameters - ---------- - sea_photon_dataset : pd.DataFrame - Must contain 'photon_height' and 'seafloor_elevation' columns. - - Returns - ------- - pd.DataFrame - Only rows where photon_height > seafloor_elevation. 
- """ - return sea_photon_dataset[ - sea_photon_dataset["photon_height"] > sea_photon_dataset["seafloor_elevation"] - ].copy() - - -def discard_shallow_seabed_bins(sea_photon_dataset, min_depth_m): - """ - Discard all along-track bins where the seabed is shallower than the minimum - depth threshold. Kd is not calculated for these bins; they are labelled as - low confidence by exclusion. - - Flowchart step: 14 — Discard all photon data in horizontal bins where the - seabed is <5 m deep; Kd is not calculated for these bins — label discarded - bins as low confidence. - - Parameters - ---------- - sea_photon_dataset : pd.DataFrame - Must contain a 'depth' column (= -seafloor_elevation, positive downward). - min_depth_m : float - Minimum seabed depth to retain (e.g., 5.0 m). Bins shallower than this - are discarded. - - Returns - ------- - pd.DataFrame - Only rows where depth >= min_depth_m. - """ - return sea_photon_dataset[sea_photon_dataset["depth"] >= min_depth_m].copy() - - -# 4. The main process function that ties everything together -def process_seafloor_data( - sea_photon_dataset, gebco_paths, Ignore_Subsurface_Height_Thres -): - """ - Query GEBCO bathymetry then apply seabed and shallow-water filters. - - Delegates to three single-step helpers: - query_gebco_seafloor_elevation → step 12 - remove_photons_below_seafloor → step 13 - discard_shallow_seabed_bins → step 14 - - Parameters: - gebco_paths (list): List of path to GEBCO raster files. - sea_photon_dataset (pandas.DataFrame): Dataset containing longitude, - latitude, and photon height. - - Returns: - filtered_sea_photon_dataset (pandas.DataFrame): Filtered dataset - containing points above the seafloor in water deeper than threshold. 
- """ - dataset = query_gebco_seafloor_elevation(sea_photon_dataset, gebco_paths) # step 12 - dataset["depth"] = -dataset["seafloor_elevation"] - dataset = remove_photons_below_seafloor(dataset) # step 13 - return discard_shallow_seabed_bins( - dataset, abs(Ignore_Subsurface_Height_Thres) - ) # step 14 - - -def load_atl24_points(atl24_file_path): - """ - Load ATL24 point dataset and normalize to columns: longitude, latitude, - seafloor_elevation_atl24. - - Flowchart step: 12 — Query ATL24 data for bathymetry match; if no match, - retrieve GEBCO data (or other regional raster bathymetry data) for region. - """ - if (not atl24_file_path) or (not os.path.exists(atl24_file_path)): - return pd.DataFrame( - columns=["longitude", "latitude", "seafloor_elevation_atl24"] - ) - - lower_path = atl24_file_path.lower() - if lower_path.endswith((".h5", ".hdf5")): - try: - import h5py - - beams = ["gt1l", "gt1r", "gt2l", "gt2r", "gt3l", "gt3r"] - frames = [] - with h5py.File(atl24_file_path, "r") as f: - for beam in beams: - if beam not in f: - continue - grp = f[beam] - if not all( - k in grp for k in ("lon_ph", "lat_ph", "ortho_h", "class_ph") - ): - continue - class_ph = grp["class_ph"][:] - bathy_mask = class_ph == 40 # ATL24 bathymetry class - # Apply low_confidence_flag filter if available - if "low_confidence_flag" in grp: - bathy_mask = bathy_mask & (grp["low_confidence_flag"][:] == 0) - if bathy_mask.sum() == 0: - continue - frames.append( - pd.DataFrame( - { - "longitude": grp["lon_ph"][:][bathy_mask].astype(float), - "latitude": grp["lat_ph"][:][bathy_mask].astype(float), - "seafloor_elevation_atl24": grp["ortho_h"][:][ - bathy_mask - ].astype(float), - } - ) - ) - if not frames: - return pd.DataFrame( - columns=["longitude", "latitude", "seafloor_elevation_atl24"] - ) - return pd.concat(frames, ignore_index=True).dropna() - except Exception: - return pd.DataFrame( - columns=["longitude", "latitude", "seafloor_elevation_atl24"] - ) - elif lower_path.endswith((".csv", 
".txt")): - atl24_df = pd.read_csv(atl24_file_path) - elif lower_path.endswith(".parquet"): - atl24_df = pd.read_parquet(atl24_file_path) - elif lower_path.endswith((".gpkg", ".shp", ".geojson")): - try: - import geopandas as gpd - - atl24_gdf = gpd.read_file(atl24_file_path) - atl24_df = pd.DataFrame(atl24_gdf) - if ( - "longitude" not in atl24_df.columns - or "latitude" not in atl24_df.columns - ) and ("geometry" in atl24_gdf.columns): - atl24_df["longitude"] = atl24_gdf.geometry.x - atl24_df["latitude"] = atl24_gdf.geometry.y - except Exception: - return pd.DataFrame( - columns=["longitude", "latitude", "seafloor_elevation_atl24"] - ) - else: - return pd.DataFrame( - columns=["longitude", "latitude", "seafloor_elevation_atl24"] - ) - - lon_candidates = ["longitude", "lon", "x", "LONGITUDE", "LON"] - lat_candidates = ["latitude", "lat", "y", "LATITUDE", "LAT"] - elev_candidates = [ - "seafloor_elevation", - "bottom_elevation", - "bathymetry", - "elevation", - "z", - "depth", - "water_depth", - ] - - lon_col = next((c for c in lon_candidates if c in atl24_df.columns), None) - lat_col = next((c for c in lat_candidates if c in atl24_df.columns), None) - elev_col = next((c for c in elev_candidates if c in atl24_df.columns), None) - if lon_col is None or lat_col is None or elev_col is None: - return pd.DataFrame( - columns=["longitude", "latitude", "seafloor_elevation_atl24"] - ) - - out_df = ( - pd.DataFrame( - { - "longitude": pd.to_numeric(atl24_df[lon_col], errors="coerce"), - "latitude": pd.to_numeric(atl24_df[lat_col], errors="coerce"), - "atl24_raw_bathy": pd.to_numeric(atl24_df[elev_col], errors="coerce"), - } - ) - .dropna(subset=["longitude", "latitude", "atl24_raw_bathy"]) - .copy() - ) - - if "depth" in elev_col.lower() and ("elev" not in elev_col.lower()): - out_df["seafloor_elevation_atl24"] = -out_df["atl24_raw_bathy"].abs() - else: - out_df["seafloor_elevation_atl24"] = out_df["atl24_raw_bathy"] - - return out_df[["longitude", "latitude", 
"seafloor_elevation_atl24"]] - - -def process_seafloor_data_atl24( - sea_photon_dataset, - atl24_file_path, - Ignore_Subsurface_Height_Thres, - max_match_distance_deg=0.01, -): - """ - Match ATL24 bathymetry points to photons, then filter below seafloor and - shallow bins. Returns filtered dataset and number of matched photons. - - Flowchart steps: - 12 — Query ATL24 data for bathymetry match; if no match, retrieve GEBCO data. - 13 — Remove data below seabed (seabed defined by ATL24 or GEBCO). - 14 — Discard all photon data in horizontal bins where the seabed is <5 m - deep; Kd is not calculated for these bins — label discarded bins as - low confidence. - """ - atl24_points = load_atl24_points(atl24_file_path) - if atl24_points.empty: - return sea_photon_dataset, 0 - - photon_coords = sea_photon_dataset[["longitude", "latitude"]].to_numpy(dtype=float) - atl24_coords = atl24_points[["longitude", "latitude"]].to_numpy(dtype=float) - if len(photon_coords) == 0 or len(atl24_coords) == 0: - return sea_photon_dataset, 0 - - tree = cKDTree(atl24_coords) - distances, indices = tree.query( - photon_coords, k=1, distance_upper_bound=max_match_distance_deg - ) - valid_match = np.isfinite(distances) & (indices < len(atl24_points)) - matched_count = int(valid_match.sum()) - if matched_count == 0: - return sea_photon_dataset, 0 - - matched_dataset = sea_photon_dataset.copy() - matched_dataset["seafloor_elevation"] = np.nan - matched_dataset.loc[valid_match, "seafloor_elevation"] = atl24_points[ - "seafloor_elevation_atl24" - ].to_numpy()[indices[valid_match]] - matched_dataset = matched_dataset.dropna(subset=["seafloor_elevation"]).copy() - matched_dataset["depth"] = -matched_dataset["seafloor_elevation"] - - above_floor = remove_photons_below_seafloor(matched_dataset) # step 13 - filtered_dataset = discard_shallow_seabed_bins( - above_floor, abs(Ignore_Subsurface_Height_Thres) - ) # step 14 - return filtered_dataset, matched_count - - -def 
rebuild_and_refit_surface_after_refraction( - binned_dataset_sea_surface, - sea_surface_height, - water_temp_c, - wavelength_nm, - horizontal_res, - vertical_res, -): - """ - Apply refraction correction to the full binned dataset, rebuild histograms - on the corrected depths, and re-fit the Gaussian surface peak. - - Flowchart steps: - 10 — Re-build histograms based on corrected depths - (apply_optional_refraction_correction → horizontal_vertical_bin_dataset). - 11 — Re-fit Gaussian curve to surface to identify surface peak; compute - standard deviation and remove histogram data within three standard - deviations (get_sea_surface_height_adaptive on the rebinned data). - - Parameters - ---------- - binned_dataset_sea_surface : pd.DataFrame - Full binned photon dataset before refraction correction. - sea_surface_height : list - Per-bin surface heights from step 7. - water_temp_c : float - Water temperature for the refraction index calculation. - wavelength_nm : float - Laser wavelength (nm) for the refraction index calculation. - horizontal_res : float - Along-track bin size (m) for histogram rebuild. - vertical_res : float - Vertical bin size (m) for histogram rebuild. - - Returns - ------- - tuple - (sea_surface_height, sea_surface_height_abnormal_label, - solo_sea_surface_label, subsurface_photon_dataset) — same 4-tuple as - get_sea_surface_height_adaptive. - """ - corrected_full_dataset = apply_optional_refraction_correction( - binned_dataset_sea_surface, - binned_dataset_sea_surface.copy(), - sea_surface_height, - water_temp_c=water_temp_c, - wavelength_nm=wavelength_nm, - ) - rebinned_corrected = horizontal_vertical_bin_dataset( # step 10 - corrected_full_dataset, horizontal_res, vertical_res - ) - return get_sea_surface_height_adaptive(rebinned_corrected) # step 11 - - -# 5. 
The function to get the subsurface photon dataset -def get_subsurface_photon( - binned_dataset_sea_surface, - GEBCO_paths, - subsurface_thresh, - Ignore_Subsurface_Height_Thres, - use_atl24_filter=False, - atl24_file_path="", - atl24_max_match_distance_deg=0.01, - use_gebco_filter=False, - apply_histogram_quality_filter=False, - histogram_quality_min_ratio=0.05, - histogram_quality_depth_min=6.0, - histogram_quality_depth_max=7.0, - histogram_quality_ref_depth_min=0.0, - histogram_quality_ref_depth_max=1.0, - apply_surface_sigma_filter=False, - surface_sigma_max=0.5, - apply_refraction_correction=False, - refraction_water_temp_c=20.0, - refraction_wavelength_nm=532.0, - apply_post_refraction_refit=False, - apply_flattening=False, - flattening_window_m=500, - horizontal_res=500, - vertical_res=0.25, -): - """ - Orchestrate per-beam subsurface photon extraction and all optional quality filters. - - Flowchart steps executed in order: - 7 — Fit Gaussian curve to identify surface elevation; compute std dev of - Gaussian peak (via get_sea_surface_height_adaptive). - 6 — Review histograms for quality; quality-flag low-confidence bins and - exclude (optional, apply_histogram_quality_filter). - 8 — If Gaussian std dev > X m, quality-flag and discard bin (optional, - apply_surface_sigma_filter). - 9 — Compute water depths and correct for refraction (optional, - apply_refraction_correction). - 10 — Re-build histograms based on corrected depths (optional, - apply_post_refraction_refit → rebuild_and_refit_surface_after_refraction). - 11 — Re-fit Gaussian curve to identify surface peak; remove histogram data - within three standard deviations (optional, - rebuild_and_refit_surface_after_refraction). - 12 — Query ATL24 / retrieve GEBCO bathymetry data (optional, - use_atl24_filter / use_gebco_filter). - 13 — Remove data below seabed (optional). - 14 — Discard bins where seabed < 5 m deep; label as low confidence - (applied unconditionally via Ignore_Subsurface_Height_Thres). 
- """ - # adaptive threshold - ( - sea_surface_height, - sea_surface_label, - solo_sea_surface_label, - subsurface_photon_dataset, - ) = get_sea_surface_height_adaptive(binned_dataset_sea_surface) - - # Always compute per-bin surface sigma for quality flagging and wave-adaptive fit - sigma_map = compute_surface_sigma(binned_dataset_sea_surface, sea_surface_height) - - if apply_histogram_quality_filter: - subsurface_photon_dataset = apply_optional_histogram_quality_filter( - binned_dataset_sea_surface, - subsurface_photon_dataset, - sea_surface_height, - min_ratio=histogram_quality_min_ratio, - depth_min=histogram_quality_depth_min, - depth_max=histogram_quality_depth_max, - reference_depth_min=histogram_quality_ref_depth_min, - reference_depth_max=histogram_quality_ref_depth_max, - ) - - if apply_surface_sigma_filter: - subsurface_photon_dataset = apply_optional_surface_sigma_filter( - binned_dataset_sea_surface, - subsurface_photon_dataset, - sea_surface_height, - sigma_max=surface_sigma_max, - ) - - if apply_refraction_correction: - subsurface_photon_dataset = apply_optional_refraction_correction( - binned_dataset_sea_surface, - subsurface_photon_dataset, - sea_surface_height, - water_temp_c=refraction_water_temp_c, - wavelength_nm=refraction_wavelength_nm, - ) - if apply_post_refraction_refit: - ( - sea_surface_height, - sea_surface_label, - solo_sea_surface_label, - subsurface_photon_dataset, - ) = rebuild_and_refit_surface_after_refraction( # steps 10 & 11 - binned_dataset_sea_surface, - sea_surface_height, - water_temp_c=refraction_water_temp_c, - wavelength_nm=refraction_wavelength_nm, - horizontal_res=horizontal_res, - vertical_res=vertical_res, - ) - - # Apply minimum depth threshold unconditionally — keeps photons at height - # >= Ignore_Subsurface_Height_Thres regardless of whether GEBCO/ATL24 is on. - # Without this, deep noise photons (down to -70 m) contaminate the Beer's Law fit - # and produce near-zero Kd values, especially at shallow turbid sites. 
- subsurface_photon_dataset = subsurface_photon_dataset[ - subsurface_photon_dataset["photon_height"] >= Ignore_Subsurface_Height_Thres - ].copy() - - if use_atl24_filter: - atl24_filtered, atl24_match_count = process_seafloor_data_atl24( - subsurface_photon_dataset, - atl24_file_path, - abs(Ignore_Subsurface_Height_Thres), - max_match_distance_deg=atl24_max_match_distance_deg, - ) - if atl24_match_count > 0: - filtered_seafloor_subsurface_photon_dataset = atl24_filtered - elif use_gebco_filter: - filtered_seafloor_subsurface_photon_dataset = process_seafloor_data( - subsurface_photon_dataset, - GEBCO_paths, - abs(Ignore_Subsurface_Height_Thres), - ) - else: - filtered_seafloor_subsurface_photon_dataset = subsurface_photon_dataset - elif use_gebco_filter: - filtered_seafloor_subsurface_photon_dataset = process_seafloor_data( - subsurface_photon_dataset, GEBCO_paths, abs(Ignore_Subsurface_Height_Thres) - ) - else: - filtered_seafloor_subsurface_photon_dataset = subsurface_photon_dataset - - if apply_flattening: - lat_bin_keys = list( - binned_dataset_sea_surface.groupby( - ["lat_bins"], observed=False - ).groups.keys() - ) - filtered_seafloor_subsurface_photon_dataset = apply_sea_surface_flattening( - filtered_seafloor_subsurface_photon_dataset, - sea_surface_height, - lat_bin_keys, - horizontal_res=horizontal_res, - flattening_window_m=flattening_window_m, - ) - # Attach per-bin surface_sigma as a quality flag column - filtered_seafloor_subsurface_photon_dataset = ( - filtered_seafloor_subsurface_photon_dataset.copy() - ) - if sigma_map: - filtered_seafloor_subsurface_photon_dataset["surface_sigma"] = ( - filtered_seafloor_subsurface_photon_dataset["lat_bins"].map(sigma_map) - ) - else: - filtered_seafloor_subsurface_photon_dataset["surface_sigma"] = np.nan - - return ( - sea_surface_height, - sea_surface_label, - filtered_seafloor_subsurface_photon_dataset, - ) - - -# Main processing function to apply the subsurface photon filtering beam-by-beam -def 
process_subsurface_photon_filtering( - binned_dataset_sea_surface, - GEBCO_paths, - subsurface_thresh, - Ignore_Subsurface_Height_Thres, - use_atl24_filter=False, - atl24_file_path="", - atl24_max_match_distance_deg=0.01, - use_gebco_filter=False, - apply_histogram_quality_filter=False, - histogram_quality_min_ratio=0.05, - histogram_quality_depth_min=6.0, - histogram_quality_depth_max=7.0, - histogram_quality_ref_depth_min=0.0, - histogram_quality_ref_depth_max=1.0, - apply_surface_sigma_filter=False, - surface_sigma_max=0.5, - apply_refraction_correction=False, - refraction_water_temp_c=20.0, - refraction_wavelength_nm=532.0, - apply_post_refraction_refit=False, - apply_flattening=False, - flattening_window_m=500, - horizontal_res=500, - vertical_res=0.25, -): - """ - Beam-by-beam wrapper that calls get_subsurface_photon for every beam and - concatenates the results. - - Flowchart steps 6–14 are all executed inside this function (delegated to - get_subsurface_photon per beam). See get_subsurface_photon for the - step-by-step breakdown. 
- """ - # Initialize lists to store results for each beam - sea_surface_heights = [] - sea_surface_labels = [] - filtered_beam_datasets = [] - - # Group the binned dataset by 'beam_id' and process each group separately - for beam_id, beam_data in binned_dataset_sea_surface.groupby("beam_id"): - print(f"Processing subsurface filtering for beam: {beam_id}") - - # Apply get_subsurface_photon to the current beam's dataset - ( - sea_surface_height, - sea_surface_label, - filtered_seafloor_subsurface_photon_dataset, - ) = get_subsurface_photon( - beam_data, - GEBCO_paths, - subsurface_thresh, - Ignore_Subsurface_Height_Thres, - use_atl24_filter=use_atl24_filter, - atl24_file_path=atl24_file_path, - atl24_max_match_distance_deg=atl24_max_match_distance_deg, - use_gebco_filter=use_gebco_filter, - apply_histogram_quality_filter=apply_histogram_quality_filter, - histogram_quality_min_ratio=histogram_quality_min_ratio, - histogram_quality_depth_min=histogram_quality_depth_min, - histogram_quality_depth_max=histogram_quality_depth_max, - histogram_quality_ref_depth_min=histogram_quality_ref_depth_min, - histogram_quality_ref_depth_max=histogram_quality_ref_depth_max, - apply_surface_sigma_filter=apply_surface_sigma_filter, - surface_sigma_max=surface_sigma_max, - apply_refraction_correction=apply_refraction_correction, - refraction_water_temp_c=refraction_water_temp_c, - refraction_wavelength_nm=refraction_wavelength_nm, - apply_post_refraction_refit=apply_post_refraction_refit, - apply_flattening=apply_flattening, - flattening_window_m=flattening_window_m, - horizontal_res=horizontal_res, - vertical_res=vertical_res, - ) - - # Append each result to the lists - sea_surface_heights.append(sea_surface_height) - sea_surface_labels.append(sea_surface_label) - filtered_beam_datasets.append(filtered_seafloor_subsurface_photon_dataset) - - # Combine all filtered beam datasets into a single DataFrame - combined_filtered_dataset = pd.concat(filtered_beam_datasets, ignore_index=True) - 
- return sea_surface_heights, sea_surface_labels, combined_filtered_dataset diff --git a/icesat-2_kdph_py/kd_utils/data_processing.py b/icesat-2_kdph_py/kd_utils/data_processing.py deleted file mode 100644 index f21217a..0000000 --- a/icesat-2_kdph_py/kd_utils/data_processing.py +++ /dev/null @@ -1,887 +0,0 @@ -# utils/data_processing.py - - -import io -import logging -import math -import os -import re - -import geopandas as gpd -import h5py - -# from pyproj import Transformer -import numpy as np -import pandas as pd -from pyproj import Proj, Transformer -from scipy.spatial import ConvexHull - -# import fiona -from .interpolation import * - -logger = logging.getLogger(__name__) - - -# this function from icesat2_toolkit -# PURPOSE: read ICESat-2 ATL03 HDF5 data files -def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): - """ - Reads ICESat-2 ATL03 Global Geolocated Photons data files. - - Flowchart step: 1 — Acquire ATL03 data (download or use cloud services). - - Parameters - ---------- - FILENAME: str - full path to ATL03 file - ATTRIBUTES: bool, default False - read file, group and variable attributes - - Returns - ------- - IS2_atl03_mds: dict - ATL03 variables - IS2_atl03_attrs: dict - ATL03 attributes - IS2_atl03_beams: list - valid ICESat-2 beams within ATL03 file - """ - # Open the HDF5 file for reading - if isinstance(FILENAME, io.IOBase): - fileID = h5py.File(FILENAME, "r") - else: - fileID = h5py.File(os.path.expanduser(FILENAME), "r") - - # Output HDF5 file information - logging.info(fileID.filename) - logging.info(list(fileID.keys())) - - # allocate python dictionaries for ICESat-2 ATL03 variables and attributes - IS2_atl03_mds = {} - IS2_atl03_attrs = {} - - # read each input beam within the file - IS2_atl03_beams = [] - for gtx in [k for k in fileID.keys() if bool(re.match(r"gt\d[lr]", k))]: - # check if subsetted beam contains data - # check in both the geolocation and heights groups - try: - fileID[gtx]["geolocation"]["segment_id"] - 
fileID[gtx]["heights"]["delta_time"] - except KeyError: - pass - else: - IS2_atl03_beams.append(gtx) - - # for each included beam - for gtx in IS2_atl03_beams: - # ------------------------------------------- - # 1. make sure the beam-level dict exists - IS2_atl03_attrs.setdefault(gtx, {}) - # 2. always save the two “must-have” attributes - for key in ("atlas_beam_type", "atlas_spot_number"): - IS2_atl03_attrs[gtx][key] = fileID[gtx].attrs[key] - - # get each HDF5 variable - IS2_atl03_mds[gtx] = {} - IS2_atl03_mds[gtx]["heights"] = {} - IS2_atl03_mds[gtx]["geolocation"] = {} - IS2_atl03_mds[gtx]["bckgrd_atlas"] = {} - IS2_atl03_mds[gtx]["geophys_corr"] = {} - # ICESat-2 Measurement Group - for key, val in fileID[gtx]["heights"].items(): - IS2_atl03_mds[gtx]["heights"][key] = val[:] - # ICESat-2 Geolocation Group - for key, val in fileID[gtx]["geolocation"].items(): - IS2_atl03_mds[gtx]["geolocation"][key] = val[:] - # ICESat-2 Background Photon Rate Group - for key, val in fileID[gtx]["bckgrd_atlas"].items(): - IS2_atl03_mds[gtx]["bckgrd_atlas"][key] = val[:] - # ICESat-2 Geophysical Corrections Group: Values for tides (ocean, - # solid earth, pole, load, and equilibrium), inverted barometer (IB) - # effects, and range corrections for tropospheric delays - for key, val in fileID[gtx]["geophys_corr"].items(): - IS2_atl03_mds[gtx]["geophys_corr"][key] = val[:] - - # Getting attributes of included variables - if ATTRIBUTES: - # Getting attributes of IS2_atl03_mds beam variables - IS2_atl03_attrs[gtx] = {} - IS2_atl03_attrs[gtx]["heights"] = {} - IS2_atl03_attrs[gtx]["geolocation"] = {} - IS2_atl03_attrs[gtx]["bckgrd_atlas"] = {} - IS2_atl03_attrs[gtx]["geophys_corr"] = {} - - # Global Group Attributes - for att_name, att_val in fileID[gtx].attrs.items(): - IS2_atl03_attrs[gtx][att_name] = att_val - # ICESat-2 Measurement Group - for key, val in fileID[gtx]["heights"].items(): - IS2_atl03_attrs[gtx]["heights"][key] = {} - for att_name, att_val in val.attrs.items(): - 
IS2_atl03_attrs[gtx]["heights"][key][att_name] = att_val - # ICESat-2 Geolocation Group - for key, val in fileID[gtx]["geolocation"].items(): - IS2_atl03_attrs[gtx]["geolocation"][key] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]["geolocation"][key][att_name] = att_val - # ICESat-2 Background Photon Rate Group - for key, val in fileID[gtx]["bckgrd_atlas"].items(): - IS2_atl03_attrs[gtx]["bckgrd_atlas"][key] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]["bckgrd_atlas"][key][att_name] = att_val - # ICESat-2 Geophysical Corrections Group - for key, val in fileID[gtx]["geophys_corr"].items(): - IS2_atl03_attrs[gtx]["geophys_corr"][key] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]["geophys_corr"][key][att_name] = att_val - - # ICESat-2 spacecraft orientation at time - IS2_atl03_mds["orbit_info"] = {} - IS2_atl03_attrs["orbit_info"] = {} - for key, val in fileID["orbit_info"].items(): - IS2_atl03_mds["orbit_info"][key] = val[:] - # Getting attributes of group and included variables - if ATTRIBUTES: - # Global Group Attributes - for att_name, att_val in fileID["orbit_info"].attrs.items(): - IS2_atl03_attrs["orbit_info"][att_name] = att_val - # Variable Attributes - IS2_atl03_attrs["orbit_info"][key] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs["orbit_info"][key][att_name] = att_val - - # information ancillary to the data product - # number of GPS seconds between the GPS epoch (1980-01-06T00:00:00Z UTC) - # and ATLAS Standard Data Product (SDP) epoch (2018-01-01T00:00:00Z UTC) - # Add this value to delta time parameters to compute full gps_seconds - # could alternatively use the Julian day of the ATLAS SDP epoch: 2458119.5 - # and add leap seconds since 2018-01-01T00:00:00Z UTC (ATLAS SDP epoch) - IS2_atl03_mds["ancillary_data"] = {} - IS2_atl03_attrs["ancillary_data"] = {} - ancillary_keys = [ - "atlas_sdp_gps_epoch", - "data_end_utc", - "data_start_utc", - 
"end_cycle", - "end_geoseg", - "end_gpssow", - "end_gpsweek", - "end_orbit", - "end_region", - "end_rgt", - "granule_end_utc", - "granule_start_utc", - "release", - "start_cycle", - "start_geoseg", - "start_gpssow", - "start_gpsweek", - "start_orbit", - "start_region", - "start_rgt", - "version", - ] - for key in ancillary_keys: - # get each HDF5 variable - IS2_atl03_mds["ancillary_data"][key] = fileID["ancillary_data"][key][:] - # Getting attributes of group and included variables - if ATTRIBUTES: - # Variable Attributes - IS2_atl03_attrs["ancillary_data"][key] = {} - for att_name, att_val in fileID["ancillary_data"][key].attrs.items(): - IS2_atl03_attrs["ancillary_data"][key][att_name] = att_val - - # transmit-echo-path (tep) parameters - IS2_atl03_mds["ancillary_data"]["tep"] = {} - IS2_atl03_attrs["ancillary_data"]["tep"] = {} - for key, val in fileID["ancillary_data"]["tep"].items(): - # get each HDF5 variable - IS2_atl03_mds["ancillary_data"]["tep"][key] = val[:] - # Getting attributes of group and included variables - if ATTRIBUTES: - # Variable Attributes - IS2_atl03_attrs["ancillary_data"]["tep"][key] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs["ancillary_data"]["tep"][key][att_name] = att_val - - # channel dead time and first photon bias derived from ATLAS calibration - cal1, cal2 = ("ancillary_data", "calibrations") - for var in ["dead_time", "first_photon_bias"]: - IS2_atl03_mds[cal1][var] = {} - IS2_atl03_attrs[cal1][var] = {} - for key, val in fileID[cal1][cal2][var].items(): - # get each HDF5 variable - if isinstance(val, h5py.Dataset): - IS2_atl03_mds[cal1][var][key] = val[:] - elif isinstance(val, h5py.Group): - IS2_atl03_mds[cal1][var][key] = {} - for k, v in val.items(): - IS2_atl03_mds[cal1][var][key][k] = v[:] - # Getting attributes of group and included variables - if ATTRIBUTES: - # Variable Attributes - IS2_atl03_attrs[cal1][var][key] = {} - for att_name, att_val in val.attrs.items(): - 
IS2_atl03_attrs[cal1][var][key][att_name] = att_val - if isinstance(val, h5py.Group): - for k, v in val.items(): - IS2_atl03_attrs[cal1][var][key][k] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs[cal1][var][key][k][att_name] = att_val - - # get ATLAS impulse response variables for the transmitter echo path (TEP) - tep1, tep2 = ("atlas_impulse_response", "tep_histogram") - IS2_atl03_mds[tep1] = {} - IS2_atl03_attrs[tep1] = {} - for pce in ["pce1_spot1", "pce2_spot3"]: - IS2_atl03_mds[tep1][pce] = {tep2: {}} - IS2_atl03_attrs[tep1][pce] = {tep2: {}} - # for each TEP variable - for key, val in fileID[tep1][pce][tep2].items(): - IS2_atl03_mds[tep1][pce][tep2][key] = val[:] - # Getting attributes of included variables - if ATTRIBUTES: - # Global Group Attributes - for att_name, att_val in fileID[tep1][pce][tep2].attrs.items(): - IS2_atl03_attrs[tep1][pce][tep2][att_name] = att_val - # Variable Attributes - IS2_atl03_attrs[tep1][pce][tep2][key] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs[tep1][pce][tep2][key][att_name] = att_val - - # Global File Attributes - if ATTRIBUTES: - for att_name, att_val in fileID.attrs.items(): - IS2_atl03_attrs[att_name] = att_val - - # Closing the HDF5 file - fileID.close() - # Return the datasets and variables - return (IS2_atl03_mds, IS2_atl03_attrs, IS2_atl03_beams) - - -# convert_wgs_to_utm function, see https://stackoverflow.com/a/40140326/4556479 -def convert_wgs_to_utm(lon: float, lat: float): - """Based on lat and lng, return best utm epsg-code""" - utm_band = str((math.floor((lon + 180) / 6) % 60) + 1) - if len(utm_band) == 1: - utm_band = "0" + utm_band - if lat >= 0: - epsg_code = "epsg:326" + utm_band - return epsg_code - epsg_code = "epsg:327" + utm_band - return epsg_code - - -def orthometric_correction(lat, lon, Z, epsg): - # Define the Proj string - # To transform from WGS84 ellipsoidal height - # to EGM2008 orthometric height using PyProj - # proj_string = '+proj=latlong 
+ellps=WGS84 +datum=WGS84 +vunits=m +no_defs +geoidgrids=egm2008-1.gtx' - # # Define the Proj string for WGS84 ellipsoidal height - # wgs84_proj_string = '+proj=latlong +ellps=WGS84 +datum=WGS84 +no_defs' - - # # Define the Proj string for EGM2008 orthometric height: egm08_25,egm2008-1 - # egm2008_proj_string = \ - # '+proj=latlong +ellps=WGS84 +datum=WGS84 +no_defs ' \ - # '+geoidgrids=C:/Workstation/ICESat2_HLS/Code/Geoids/egm08_25.gtx' - - # transform ellipsoid (WGS84) height to orthometric height - # transformer = Transformer.from_crs(wgs84_proj_string, egm2008_proj_string, always_xy=True) - transformer = Transformer.from_crs("EPSG:4326", "EPSG:3857", always_xy=True) - X_egm08, Y_egm08, Z_egm08 = transformer.transform(lon, lat, Z) - - # transform WGS84 proj to local UTM - myProj = Proj(epsg) - X_utm, Y_utm = myProj(lon, lat) - - return Y_utm, X_utm, Z_egm08 - - -def load_data(file_path, read_attributes=False): - """ - Wrapper around read_granule that lets you decide - whether to pull the full attribute tree. - - Flowchart step: 1 — Acquire ATL03 data (download or use cloud services). - - Parameters - ---------- - file_path : str - Path to the ATL03 HDF5 granule. - read_attributes : bool, optional - If True, read_granule returns the full IS2_atl03_attrs tree. - Defaults to False. 
- """ - - return read_granule(file_path, ATTRIBUTES=read_attributes) - - -# requires that the input gdf has ranged index values i -# will need to change if index is changed to time or something -# this currently checks point in polygon for EVERY point -# would be significantly sped up if evaluated at 10m or something similar -# maybe later, fine for now -def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): - # try loading the shoreline data - try: - ICESat2_GDF.insert(0, "lat", ICESat2_GDF.geometry.y, False) - ICESat2_GDF.insert(0, "lon", ICESat2_GDF.geometry.x, False) - - # allocation of to be used arrays - zero_int_array = np.int64(np.zeros_like(ICESat2_GDF.geometry.x)) - - # Land flag initialized as -1 - # If shorelines downloaded already, will be set to 0 or 1 - ICESat2_GDF.insert(0, "is_land", zero_int_array - 1, False) - - # set the projection - ICESat2_GDF.set_crs("EPSG:4326", inplace=True) - - # load shoreline dataset to include only the features that intersect the bounding box - # bbox can be GeoDataFrame or GeoSeries | shapely Geometry, default None - # Filter features by given bounding box, GeoSeries, GeoDataFrame or a shapely geometry. 
- # engine str, 'fiona' or 'pyogrio' - # somtime it gives error if using fiona - # land_polygon_gdf = gpd.read_file(shoreline_data_path, bbox=ICESat2_GDF, engine='fiona') - land_polygon_gdf = gpd.read_file( - shoreline_data_path, bbox=ICESat2_GDF, engine="pyogrio" - ) - - # continue with getting a new array of 0-or-1 labels for each photon - land_point_labels = np.zeros_like(ICESat2_GDF.is_land.values) - - # update labels for points in the land polygons - pts_in_land = gpd.sjoin(ICESat2_GDF, land_polygon_gdf, predicate="within") - - # get land or not bool value - land_loc = ICESat2_GDF.index.isin(pts_in_land.index) - - # asigned them to new numpy array - land_point_labels[land_loc] = 1 - land_point_labels[~land_loc] = 0 - - return land_point_labels - - except Exception as e: - print(e) - - print("Error loading shoreline data, returning -1s for is_land flag") - - # if the shoreline data is not available - # return the original label array - - return -np.ones_like(ICESat2_GDF.is_land.values) - - -def create_photon_dataframe( - lat_ph, - lon_ph, - ref_elev, - ref_azimuth, - geoid, - h_ph, - quality_ph, - is_land_label_interp1d, - signal_conf_photon, - x_atc, - relative_AT_dist, - solar_elevation=None, - background_rate=None, -): - # Apply geoid correction to the photon heights to convert them from ellipsoidal to orthometric heights - h_ph_geoid_cor = h_ph[:] - geoid[:] - - # Determine the EPSG code for the UTM zone based on the first photon's longitude and latitude - epsg_code = convert_wgs_to_utm(lon_ph[0], lat_ph[0]) - - # Perform orthometric correction to obtain UTM coordinates and corrected heights - lat_utm, lon_utm, h_ph_cor = orthometric_correction(lat_ph, lon_ph, h_ph, epsg_code) - - # Put the data into the dataframe - sea_photon_dataset = pd.DataFrame( - { - "latitude": lat_ph, - "longitude": lon_ph, - "lat": lat_utm, - "lon": lon_utm, - "photon_height": h_ph_geoid_cor, - "quality_ph": quality_ph, - "is_land_label": is_land_label_interp1d, - "photon_conf": 
signal_conf_photon, - "ref_elevation": ref_elev, - "ref_azimuth": ref_azimuth, - "relative_AT_dist": relative_AT_dist, - }, - columns=[ - "latitude", - "longitude", - "lat", - "lon", - "photon_height", - "quality_ph", - "is_land_label", - "photon_conf", - "ref_elevation", - "ref_azimuth", - "relative_AT_dist", - ], - ) - - if solar_elevation is not None: - sea_photon_dataset["solar_elevation"] = solar_elevation - if background_rate is not None: - sea_photon_dataset["background_rate"] = background_rate - - return sea_photon_dataset - - -def interpolate_by_time(source_time, source_values, target_time): - """Linearly interpolate source_values from source_time to target_time.""" - source_time = np.asarray(source_time) - source_values = np.asarray(source_values) - target_time = np.asarray(target_time) - - valid_mask = np.isfinite(source_time) & np.isfinite(source_values) - if valid_mask.sum() < 2: - return np.full_like(target_time, np.nan, dtype=float) - - sorted_idx = np.argsort(source_time[valid_mask]) - sorted_time = source_time[valid_mask][sorted_idx] - sorted_values = source_values[valid_mask][sorted_idx] - - unique_time, unique_idx = np.unique(sorted_time, return_index=True) - unique_values = sorted_values[unique_idx] - if unique_time.size < 2: - return np.full_like(target_time, unique_values[0], dtype=float) - - return np.interp( - target_time, - unique_time, - unique_values, - left=unique_values[0], - right=unique_values[-1], - ) - - -def apply_optional_solar_background_filter( - sea_photon_dataset, - enabled=False, - day_threshold=6.0, - median_window_deg=10.0, - noise_multiplier=1.5, - min_signal_conf=2, -): - """ - Optionally filter photons under strong daytime solar background conditions. - - Uses a degree-based rolling median of background_rate over solar_elevation - to compute the expected background at each elevation. Photons whose actual - background_rate exceeds expected * noise_multiplier are flagged noisy. 
- Noisy photons are removed unless their signal confidence >= min_signal_conf. - - Flowchart step: 5 — Compute solar background from atmospheric signal (using - selected x and z bin length scales) and subtract from subsurface photon - histogram data. - """ - if not enabled: - return sea_photon_dataset - - required_cols = {"solar_elevation", "background_rate", "photon_conf"} - if not required_cols.issubset(set(sea_photon_dataset.columns)): - logger.warning( - "Solar background filter skipped because required columns are missing." - ) - return sea_photon_dataset - - filtered_dataset = sea_photon_dataset.copy() - daytime_mask = filtered_dataset["solar_elevation"] >= day_threshold - - # Select valid daytime photons for rolling median computation - daytime = filtered_dataset.loc[daytime_mask].copy() - valid = np.isfinite(daytime["background_rate"]) & np.isfinite( - daytime["solar_elevation"] - ) - daytime = daytime.loc[valid] - - if len(daytime) < 10: - logger.info( - "Solar background filter enabled, but fewer than 10 valid daytime " - "photons were found. Skipping filter." - ) - return filtered_dataset - - # Clamp solar elevations to physical range [DAY_THRESHOLD, 90] to avoid - # HDF5 fill values (e.g. 3.4e+38) blowing up the bin array. - daytime = daytime.copy() - daytime["solar_elevation"] = daytime["solar_elevation"].clip( - lower=day_threshold, upper=90.0 - ) - - # Vectorized degree-based rolling median: - # Bin elevations into fine steps, compute one median per bin using - # np.searchsorted on the sorted array, then map back to photons. 
- daytime = daytime.sort_values("solar_elevation") - elevations = daytime["solar_elevation"].values - bg_rates = daytime["background_rate"].values - half_window = median_window_deg / 2.0 - - bin_step = 0.1 # degrees — fine enough for smooth curve - bin_centers = np.arange(elevations[0], elevations[-1] + bin_step, bin_step) - bin_medians = np.empty(len(bin_centers)) - for i in range(len(bin_centers)): - lo = np.searchsorted(elevations, bin_centers[i] - half_window, side="left") - hi = np.searchsorted(elevations, bin_centers[i] + half_window, side="right") - if lo < hi: - bin_medians[i] = np.median(bg_rates[lo:hi]) - else: - bin_medians[i] = np.nan - - # Map bin medians back to each photon via nearest-bin lookup (vectorized) - bin_indices = np.searchsorted(bin_centers, elevations, side="right") - 1 - bin_indices = np.clip(bin_indices, 0, len(bin_centers) - 1) - expected_bg = bin_medians[bin_indices] - - # Map expected background onto the full DataFrame. Non-daytime and - # non-finite daytime rows stay NaN, so they are never flagged noisy. 
- filtered_dataset["expected_bg"] = np.nan - filtered_dataset.loc[daytime.index, "expected_bg"] = expected_bg - - # Flag noisy daytime photons - noisy_daytime_mask = daytime_mask & ( - filtered_dataset["background_rate"] - >= filtered_dataset["expected_bg"] * noise_multiplier - ) - keep_mask = (~noisy_daytime_mask) | ( - filtered_dataset["photon_conf"] >= min_signal_conf - ) - - # Clean up temporary column - filtered_dataset.drop(columns=["expected_bg"], inplace=True, errors="ignore") - - before_count = len(filtered_dataset) - filtered_dataset = filtered_dataset.loc[keep_mask].copy() - after_count = len(filtered_dataset) - logger.info( - "Solar background filter (multiplier=%.1f) removed %s photons (from %s to %s).", - noise_multiplier, - before_count - after_count, - before_count, - after_count, - ) - return filtered_dataset - - -def apply_optional_ir_ap_filter( - sea_photon_dataset, enabled=False, quality_max=0, min_signal_conf=0 -): - """ - Optionally remove photons flagged as afterpulse using the quality_ph bitmask. - - Flowchart step: 2 — Remove photons flagged as afterpulse or impulse response. - - ATL03 v007 quality_ph bit encoding: - bit 0 (value 1) = possible afterpulse <- targeted by this filter - bit 1 (value 2) = possible impulse response effect - bit 2 (value 4) = possible TEP - - Only photons with bit 0 set (afterpulse) are removed. Photons flagged - for impulse response or TEP only are kept, avoiding over-filtering on - turbid high-background sites where many photons carry non-zero flags. 
- """ - if not enabled: - return sea_photon_dataset - - filtered_dataset = sea_photon_dataset.copy() - keep_mask = np.ones(len(filtered_dataset), dtype=bool) - - if "quality_ph" in filtered_dataset.columns: - quality_values = ( - pd.to_numeric(filtered_dataset["quality_ph"], errors="coerce") - .fillna(0) - .astype(int) - ) - afterpulse_mask = (quality_values & 1) == 1 # bit 0 set = afterpulse - keep_mask &= ~afterpulse_mask - logger.info( - "IR/AP filter: %d photons flagged as afterpulse (quality_ph bit 0) will be removed.", - int(afterpulse_mask.sum()), - ) - else: - logger.warning("IR/AP filter enabled, but quality_ph is missing.") - - if "photon_conf" in filtered_dataset.columns: - conf_values = pd.to_numeric(filtered_dataset["photon_conf"], errors="coerce") - keep_mask &= conf_values >= min_signal_conf - else: - logger.warning("IR/AP filter enabled, but photon_conf is missing.") - - before_count = len(filtered_dataset) - filtered_dataset = filtered_dataset.loc[keep_mask].copy() - after_count = len(filtered_dataset) - logger.info( - "IR/AP proxy filter removed %s photons (from %s to %s).", - before_count - after_count, - before_count, - after_count, - ) - return filtered_dataset - - -def Extract_sea_photons(IS2_atl03_mds, target_strong_beams, shoreline_data_path): - Segment_ID = {} - Segment_Index_begin = {} - Segment_PE_count = {} - Equator_Segment_Distance = {} - Segment_Length = {} - Segment_Is_Land = {} - Segment_Lon = {} - Segment_Lat = {} - Segment_Elev = {} - Segment_Time = {} - Segment_ref_elev = {} - Segment_ref_azimuth = {} - background_rate = {} - background_counts = {} - - # Initialize a list to store data for each beam - beam_datasets = [] - - # Loop over each strong beam in target_strong_beams - for gtx in target_strong_beams: - print("Processing strong beam ID:", gtx) - - # Access the data for the current beam - IS2_val = IS2_atl03_mds[gtx] - - # Initialize dictionaries to store segment data for the beam - Segment_ID[gtx] = 
IS2_val["geolocation"]["segment_id"] - n_seg = len(Segment_ID[gtx]) - (n_pe,) = IS2_val["heights"]["delta_time"].shape - Segment_Index_begin[gtx] = IS2_val["geolocation"]["ph_index_beg"] - 1 - Segment_PE_count[gtx] = IS2_val["geolocation"]["segment_ph_cnt"] - Equator_Segment_Distance[gtx] = IS2_val["geolocation"]["segment_dist_x"] - Segment_Length[gtx] = IS2_val["geolocation"]["segment_length"] - delta_time = IS2_val["geolocation"]["delta_time"] - segment_lat = IS2_val["geolocation"]["reference_photon_lat"][:].copy() - segment_lon = IS2_val["geolocation"]["reference_photon_lon"][:].copy() - ref_elev = IS2_val["geolocation"]["ref_elev"][:].copy() - ref_azimuth = IS2_val["geolocation"]["ref_azimuth"][:].copy() - geoid = IS2_val["geophys_corr"]["geoid"][:].copy() - h_ph = IS2_val["heights"]["h_ph"][:].copy() - photon_delta_time = IS2_val["heights"]["delta_time"][:].copy() - lat_ph = IS2_val["heights"]["lat_ph"][:].copy() - lon_ph = IS2_val["heights"]["lon_ph"][:].copy() - signal_conf_photon = IS2_val["heights"]["signal_conf_ph"][..., 0].copy() - x_atc = IS2_val["heights"]["dist_ph_along"][:].copy() - y_atc = IS2_val["heights"]["dist_ph_across"][:].copy() - quality_ph = IS2_val["heights"]["quality_ph"] - - # Optional variables for solar background sensitivity testing - photon_solar_elevation = np.full_like(photon_delta_time, np.nan, dtype=float) - photon_background_rate = np.full_like(photon_delta_time, np.nan, dtype=float) - if "solar_elevation" in IS2_val["geolocation"]: - segment_solar_elevation = IS2_val["geolocation"]["solar_elevation"][ - : - ].copy() - photon_solar_elevation = interpolate_by_time( - delta_time, segment_solar_elevation, photon_delta_time - ) - if ( - "bckgrd_atlas" in IS2_val - and "bckgrd_rate" in IS2_val["bckgrd_atlas"] - and "delta_time" in IS2_val["bckgrd_atlas"] - ): - bckgrd_rate = IS2_val["bckgrd_atlas"]["bckgrd_rate"][:].copy() - bckgrd_time = IS2_val["bckgrd_atlas"]["delta_time"][:].copy() - photon_background_rate = interpolate_by_time( 
- bckgrd_time, bckgrd_rate, photon_delta_time - ) - - # Adjust x_atc based on segment distances - for seg_index in range(n_seg): - idx = Segment_Index_begin[gtx][seg_index] - cnt = Segment_PE_count[gtx][seg_index] - x_atc[idx : idx + cnt] += Equator_Segment_Distance[gtx][seg_index] - - # Calculate relative distances - relative_AT_dist = (x_atc - x_atc[0]) / 1000 - relative_seg_dist = ( - Equator_Segment_Distance[gtx] - Equator_Segment_Distance[gtx][0] - ) / 1000 - - # Create a GeoDataFrame to hold segment data for shoreline check - Segment_Is_Land["geometry"] = gpd.points_from_xy(segment_lon, segment_lat) - ICESat2_GDF = gpd.GeoDataFrame(Segment_Is_Land, crs="EPSG:4326") - - # Determine if it is land by the land/sea mask - Segment_Is_Land_Labels = isolate_sea_land_photons( - shoreline_data_path, ICESat2_GDF - ) - ICESat2_GDF.loc[:, "is_land"] = Segment_Is_Land_Labels - - # Apply interpolations for required data - is_land_label_interp1d = apply_interpolation( - interpolate_labels(segment_lat, Segment_Is_Land_Labels), lat_ph - ) - ph_ref_elev = apply_interpolation( - interpolate_labels(segment_lat, ref_elev), lat_ph - ) - ph_ref_azimuth = apply_interpolation( - interpolate_labels(segment_lat, ref_azimuth), lat_ph - ) - ph_geoid = apply_interpolation(interpolate_labels(segment_lat, geoid), lat_ph) - - # Create photon DataFrame for the current beam - sea_photon_dataset = create_photon_dataframe( - lat_ph=lat_ph, - lon_ph=lon_ph, - ref_elev=ph_ref_elev, - ref_azimuth=ph_ref_azimuth, - geoid=ph_geoid, - h_ph=h_ph, - quality_ph=quality_ph, - is_land_label_interp1d=is_land_label_interp1d, - signal_conf_photon=signal_conf_photon, - x_atc=x_atc, # Note: x_atc is not used in the function. Remove if unnecessary. 
- relative_AT_dist=relative_AT_dist, - solar_elevation=photon_solar_elevation, - background_rate=photon_background_rate, - ) - - # Filter out land photons - sea_photon_dataset = sea_photon_dataset[ - sea_photon_dataset["is_land_label"] != 1 - ] - - # Add a new column to indicate the beam ID - sea_photon_dataset["beam_id"] = gtx - - # Append the processed dataset for the current beam to the list - beam_datasets.append(sea_photon_dataset) - - # Concatenate all beam data into a single DataFrame - all_beams_dataset = pd.concat(beam_datasets, ignore_index=True) - - return all_beams_dataset - - -def filter_photon_dataset_by_hull_area(photon_dataset, hull_area_threshold=3000): - """ - Filters photon dataset based on ConvexHull area threshold and returns the filtered dataset, - convex hull areas, and convex hull points. - """ - lat_bins_grouped = photon_dataset.groupby("lat_bins", observed=False) - filtered_dataset = photon_dataset.copy() - - convex_hulls = {} - convex_hull_areas = {} - - for lat_bin, bin_data in lat_bins_grouped: - if len(bin_data) >= 3: - points = bin_data[["lat", "photon_height"]].to_numpy() - hull = ConvexHull(points) - area = hull.volume - convex_hull_areas[lat_bin] = area - - # Store the ConvexHull points if area meets the threshold - if area >= hull_area_threshold: - convex_hulls[lat_bin] = points[hull.vertices] - else: - # Remove bins with hull area below the threshold - filtered_dataset = filtered_dataset[ - filtered_dataset["lat_bins"] != lat_bin - ] - - return filtered_dataset, convex_hull_areas, convex_hulls - - -########################## -##Discard Functions Below -########################## - - -# Extracts key information from filenames, -# which could include processed status, product ID, timestamps, identifiers, or metadata. 
-def extract_file_params(file_path): - """ - Example: input "processed_ATL06_20231115120000_20231115_001_12_data.h5" - Output: ('processed_', 'ATL06', '2023', '11', '15', '12', '00', '00', - '2023', '11', '15', '001', '12', '_data') - """ - # Defines a regex pattern to match strings in the file path - rx = re.compile( - r"(processed_)?(ATL\d{2})_(\d{4})(\d{2})(\d{2})(\d{2})" - r"(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})_(\d{3})_(\d{2})(.*?).h5$" - ) - # Searches for all matches of the regex pattern in the given - params = rx.findall(file_path).pop() - return params - - -def create_mask(binned_data, sea_surface_height, seafloor_height, threshold_height): - """ - create a mask for filtering sea surface, sea floor, and threshold_height - """ - mask = ( - (binned_data["height"] <= sea_surface_height) - & (binned_data["height"] >= seafloor_height) - & (binned_data["height"] >= threshold_height) - ) - return mask - - -def generate_polygons_from_binned_data( - lat, height, mask, lat_interval, height_interval -): - """ - horizontal_vertical_bin_dataset - Generate polygons for each bin after masking within a rectangle formed by height and latitude intervals - """ - # Create latitude bins based on the specified interval - lat_bins = np.arange(lat.min(), lat.max() + lat_interval, lat_interval) - - # Create height bins within the range [-12, 2] based on the specified interval - height_bins = np.arange(-12, 2 + height_interval, height_interval) - - # Identify which bin each masked latitude and height value belongs to - lat_bin_indices = np.digitize(lat[mask], lat_bins) - - # Determine which height bin each masked point belongs to - height_bin_indices = np.digitize(height[mask], height_bins) - - # Combine latitude and height bin indices for each point - combined_bins = list(zip(lat_bin_indices, height_bin_indices, strict=False)) - - # Find unique bin combinations and count the number of points in each bin - unique_bins, counts = np.unique(combined_bins, axis=0, return_counts=True) - 
- # Filter bins to include only those within valid index ranges - valid_bins = [ - (bin[0], bin[1]) - for bin in unique_bins - if 1 <= bin[0] < len(lat_bins) and 1 <= bin[1] < len(height_bins) - ] - - polygons = [] - for bin_lat, bin_height in valid_bins: - if ( - bin_lat > 0 - and bin_lat < len(lat_bins) - and bin_height > 0 - and bin_height < len(height_bins) - ): - bin_points = np.array( - [ - (lat[mask][i], height[mask][i]) - for i in range(sum(mask)) - if lat_bin_indices[i] == bin_lat - and height_bin_indices[i] == bin_height - ] - ) - if len(bin_points) > 2: - hull = ConvexHull(bin_points) - polygons.append(bin_points[hull.vertices]) - - return lat_bins, height_bins, valid_bins, polygons diff --git a/icesat-2_kdph_py/kd_utils/interpolation.py b/icesat-2_kdph_py/kd_utils/interpolation.py deleted file mode 100644 index 23a4c24..0000000 --- a/icesat-2_kdph_py/kd_utils/interpolation.py +++ /dev/null @@ -1,25 +0,0 @@ -# utils/interpolation.py - -import scipy.interpolate - - -def interpolate_labels(segment_lat, labels): - model = scipy.interpolate.interp1d(segment_lat, labels, fill_value="extrapolate") - return model - - -def apply_interpolation(model, lat_ph): - return model(lat_ph) - - -def geoid_correction(lat_ph, segment_lat, geoid): - model = interpolate_labels(segment_lat, geoid) - return apply_interpolation(model, lat_ph) - - -def refraction_correction(lat_ph, segment_lat, ref_elev, ref_azimuth): - elev_model = interpolate_labels(segment_lat, ref_elev) - azimuth_model = interpolate_labels(segment_lat, ref_azimuth) - return apply_interpolation(elev_model, lat_ph), apply_interpolation( - azimuth_model, lat_ph - ) diff --git a/icesat-2_kdph_py/kd_utils/kd_utils.py b/icesat-2_kdph_py/kd_utils/kd_utils.py deleted file mode 100644 index 5aa45ca..0000000 --- a/icesat-2_kdph_py/kd_utils/kd_utils.py +++ /dev/null @@ -1,1358 +0,0 @@ -#!/usr/bin/env python - -##### -# This group of functions processes ICESat-2 data and creates a bathymetric model. 
-# To do this, it follows a number of steps in the form of functions, including: -# 1. Reading data (ReadATL03()) -# 2. Orthometrically correcting the dataset (OrthometricCorrection()) -# 3. Pulling down the data segment ID (getAtl03SegID()) -# 4. Bin the data along latitudinal and height gradients (bin_data()) -# 5. Calculate sea height (get_sea_height()) -# 6. Get water temperature (get_water_temp()) -# 7. Correct bathymetric surface for refraction (RefractionCorrection()) -# 8. Calculate bathymetric height (get_bath_height()) -# 9. Produce figures (produce_figures()) -##### -from datetime import datetime - -# os.environ["PROJ_LIB"] = r"C:\Users\wayne\anaconda3\Library\share\proj" -import io -import logging -import math -import os -import re -import time - -import geopandas as gpd -import h5py -import hdbscan - -# from pyproj import Transformer -import matplotlib.pyplot as plt -import netCDF4 -import numpy as np -import pandas as pd - -# import fiona -from pyproj import Proj, Transformer -from sklearn.preprocessing import MinMaxScaler - - -# this function from icesat2_toolkit -# PURPOSE: read ICESat-2 ATL03 HDF5 data files -def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): - """ - Reads ICESat-2 ATL03 Global Geolocated Photons data files - - Parameters - ---------- - FILENAME: str - full path to ATL03 file - ATTRIBUTES: bool, default False - read file, group and variable attributes - - Returns - ------- - IS2_atl03_mds: dict - ATL03 variables - IS2_atl03_attrs: dict - ATL03 attributes - IS2_atl03_beams: list - valid ICESat-2 beams within ATL03 file - """ - # Open the HDF5 file for reading - if isinstance(FILENAME, io.IOBase): - fileID = h5py.File(FILENAME, "r") - else: - fileID = h5py.File(os.path.expanduser(FILENAME), "r") - - # Output HDF5 file information - logging.info(fileID.filename) - logging.info(list(fileID.keys())) - - # allocate python dictionaries for ICESat-2 ATL03 variables and attributes - IS2_atl03_mds = {} - IS2_atl03_attrs = {} - - # read 
each input beam within the file - IS2_atl03_beams = [] - for gtx in [k for k in fileID.keys() if bool(re.match(r"gt\d[lr]", k))]: - # check if subsetted beam contains data - # check in both the geolocation and heights groups - try: - fileID[gtx]["geolocation"]["segment_id"] - fileID[gtx]["heights"]["delta_time"] - except KeyError: - pass - else: - IS2_atl03_beams.append(gtx) - - # for each included beam - for gtx in IS2_atl03_beams: - # get each HDF5 variable - IS2_atl03_mds[gtx] = {} - IS2_atl03_mds[gtx]["heights"] = {} - IS2_atl03_mds[gtx]["geolocation"] = {} - IS2_atl03_mds[gtx]["bckgrd_atlas"] = {} - IS2_atl03_mds[gtx]["geophys_corr"] = {} - # ICESat-2 Measurement Group - for key, val in fileID[gtx]["heights"].items(): - IS2_atl03_mds[gtx]["heights"][key] = val[:] - # ICESat-2 Geolocation Group - for key, val in fileID[gtx]["geolocation"].items(): - IS2_atl03_mds[gtx]["geolocation"][key] = val[:] - # ICESat-2 Background Photon Rate Group - for key, val in fileID[gtx]["bckgrd_atlas"].items(): - IS2_atl03_mds[gtx]["bckgrd_atlas"][key] = val[:] - # ICESat-2 Geophysical Corrections Group: Values for tides (ocean, - # solid earth, pole, load, and equilibrium), inverted barometer (IB) - # effects, and range corrections for tropospheric delays - for key, val in fileID[gtx]["geophys_corr"].items(): - IS2_atl03_mds[gtx]["geophys_corr"][key] = val[:] - - # Getting attributes of included variables - if ATTRIBUTES: - # Getting attributes of IS2_atl03_mds beam variables - IS2_atl03_attrs[gtx] = {} - IS2_atl03_attrs[gtx]["heights"] = {} - IS2_atl03_attrs[gtx]["geolocation"] = {} - IS2_atl03_attrs[gtx]["bckgrd_atlas"] = {} - IS2_atl03_attrs[gtx]["geophys_corr"] = {} - # Global Group Attributes - for att_name, att_val in fileID[gtx].attrs.items(): - IS2_atl03_attrs[gtx][att_name] = att_val - # ICESat-2 Measurement Group - for key, val in fileID[gtx]["heights"].items(): - IS2_atl03_attrs[gtx]["heights"][key] = {} - for att_name, att_val in val.attrs.items(): - 
IS2_atl03_attrs[gtx]["heights"][key][att_name] = att_val - # ICESat-2 Geolocation Group - for key, val in fileID[gtx]["geolocation"].items(): - IS2_atl03_attrs[gtx]["geolocation"][key] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]["geolocation"][key][att_name] = att_val - # ICESat-2 Background Photon Rate Group - for key, val in fileID[gtx]["bckgrd_atlas"].items(): - IS2_atl03_attrs[gtx]["bckgrd_atlas"][key] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]["bckgrd_atlas"][key][att_name] = att_val - # ICESat-2 Geophysical Corrections Group - for key, val in fileID[gtx]["geophys_corr"].items(): - IS2_atl03_attrs[gtx]["geophys_corr"][key] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]["geophys_corr"][key][att_name] = att_val - - # ICESat-2 spacecraft orientation at time - IS2_atl03_mds["orbit_info"] = {} - IS2_atl03_attrs["orbit_info"] = {} - for key, val in fileID["orbit_info"].items(): - IS2_atl03_mds["orbit_info"][key] = val[:] - # Getting attributes of group and included variables - if ATTRIBUTES: - # Global Group Attributes - for att_name, att_val in fileID["orbit_info"].attrs.items(): - IS2_atl03_attrs["orbit_info"][att_name] = att_val - # Variable Attributes - IS2_atl03_attrs["orbit_info"][key] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs["orbit_info"][key][att_name] = att_val - - # information ancillary to the data product - # number of GPS seconds between the GPS epoch (1980-01-06T00:00:00Z UTC) - # and ATLAS Standard Data Product (SDP) epoch (2018-01-01T00:00:00Z UTC) - # Add this value to delta time parameters to compute full gps_seconds - # could alternatively use the Julian day of the ATLAS SDP epoch: 2458119.5 - # and add leap seconds since 2018-01-01T00:00:00Z UTC (ATLAS SDP epoch) - IS2_atl03_mds["ancillary_data"] = {} - IS2_atl03_attrs["ancillary_data"] = {} - ancillary_keys = [ - "atlas_sdp_gps_epoch", - "data_end_utc", - "data_start_utc", - 
"end_cycle", - "end_geoseg", - "end_gpssow", - "end_gpsweek", - "end_orbit", - "end_region", - "end_rgt", - "granule_end_utc", - "granule_start_utc", - "release", - "start_cycle", - "start_geoseg", - "start_gpssow", - "start_gpsweek", - "start_orbit", - "start_region", - "start_rgt", - "version", - ] - for key in ancillary_keys: - # get each HDF5 variable - IS2_atl03_mds["ancillary_data"][key] = fileID["ancillary_data"][key][:] - # Getting attributes of group and included variables - if ATTRIBUTES: - # Variable Attributes - IS2_atl03_attrs["ancillary_data"][key] = {} - for att_name, att_val in fileID["ancillary_data"][key].attrs.items(): - IS2_atl03_attrs["ancillary_data"][key][att_name] = att_val - - # transmit-echo-path (tep) parameters - IS2_atl03_mds["ancillary_data"]["tep"] = {} - IS2_atl03_attrs["ancillary_data"]["tep"] = {} - for key, val in fileID["ancillary_data"]["tep"].items(): - # get each HDF5 variable - IS2_atl03_mds["ancillary_data"]["tep"][key] = val[:] - # Getting attributes of group and included variables - if ATTRIBUTES: - # Variable Attributes - IS2_atl03_attrs["ancillary_data"]["tep"][key] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs["ancillary_data"]["tep"][key][att_name] = att_val - - # channel dead time and first photon bias derived from ATLAS calibration - cal1, cal2 = ("ancillary_data", "calibrations") - for var in ["dead_time", "first_photon_bias"]: - IS2_atl03_mds[cal1][var] = {} - IS2_atl03_attrs[cal1][var] = {} - for key, val in fileID[cal1][cal2][var].items(): - # get each HDF5 variable - if isinstance(val, h5py.Dataset): - IS2_atl03_mds[cal1][var][key] = val[:] - elif isinstance(val, h5py.Group): - IS2_atl03_mds[cal1][var][key] = {} - for k, v in val.items(): - IS2_atl03_mds[cal1][var][key][k] = v[:] - # Getting attributes of group and included variables - if ATTRIBUTES: - # Variable Attributes - IS2_atl03_attrs[cal1][var][key] = {} - for att_name, att_val in val.attrs.items(): - 
IS2_atl03_attrs[cal1][var][key][att_name] = att_val - if isinstance(val, h5py.Group): - for k, v in val.items(): - IS2_atl03_attrs[cal1][var][key][k] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs[cal1][var][key][k][att_name] = att_val - - # get ATLAS impulse response variables for the transmitter echo path (TEP) - tep1, tep2 = ("atlas_impulse_response", "tep_histogram") - IS2_atl03_mds[tep1] = {} - IS2_atl03_attrs[tep1] = {} - for pce in ["pce1_spot1", "pce2_spot3"]: - IS2_atl03_mds[tep1][pce] = {tep2: {}} - IS2_atl03_attrs[tep1][pce] = {tep2: {}} - # for each TEP variable - for key, val in fileID[tep1][pce][tep2].items(): - IS2_atl03_mds[tep1][pce][tep2][key] = val[:] - # Getting attributes of included variables - if ATTRIBUTES: - # Global Group Attributes - for att_name, att_val in fileID[tep1][pce][tep2].attrs.items(): - IS2_atl03_attrs[tep1][pce][tep2][att_name] = att_val - # Variable Attributes - IS2_atl03_attrs[tep1][pce][tep2][key] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs[tep1][pce][tep2][key][att_name] = att_val - - # Global File Attributes - if ATTRIBUTES: - for att_name, att_val in fileID.attrs.items(): - IS2_atl03_attrs[att_name] = att_val - - # Closing the HDF5 file - fileID.close() - # Return the datasets and variables - return (IS2_atl03_mds, IS2_atl03_attrs, IS2_atl03_beams) - - -# convert_wgs_to_utm function, see https://stackoverflow.com/a/40140326/4556479 -def convert_wgs_to_utm(lon: float, lat: float): - """Based on lat and lng, return best utm epsg-code""" - utm_band = str((math.floor((lon + 180) / 6) % 60) + 1) - if len(utm_band) == 1: - utm_band = "0" + utm_band - if lat >= 0: - epsg_code = "epsg:326" + utm_band - return epsg_code - epsg_code = "epsg:327" + utm_band - return epsg_code - - -def orthometric_correction(lat, lon, Z, epsg): - # Define the Proj string - # To transform from WGS84 ellipsoidal height - # to EGM2008 orthometric height using PyProj - # proj_string = '+proj=latlong 
+ellps=WGS84 +datum=WGS84 +vunits=m +no_defs +geoidgrids=egm2008-1.gtx' - # # Define the Proj string for WGS84 ellipsoidal height - # wgs84_proj_string = '+proj=latlong +ellps=WGS84 +datum=WGS84 +no_defs' - - # # Define the Proj string for EGM2008 orthometric height: egm08_25,egm2008-1 - # egm2008_proj_string = \ - # '+proj=latlong +ellps=WGS84 +datum=WGS84 +no_defs ' \ - # '+geoidgrids=C:/Workstation/ICESat2_HLS/Code/Geoids/egm08_25.gtx' - - # transform ellipsoid (WGS84) height to orthometric height - # transformer = Transformer.from_crs(wgs84_proj_string, egm2008_proj_string, always_xy=True) - transformer = Transformer.from_crs("EPSG:4326", "EPSG:3857", always_xy=True) - X_egm08, Y_egm08, Z_egm08 = transformer.transform(lon, lat, Z) - - # transform WGS84 proj to local UTM - myProj = Proj(epsg) - X_utm, Y_utm = myProj(lon, lat) - - return Y_utm, X_utm, Z_egm08 - - -# Snippet by Eric Guenther (via Amy N.) for assigning photons to a segment -def get_atl03_seg_id(atl03_ph_index_beg, atl03_segment_id, atl03_heights_len): - # We need to know spacecraft orbit info, - # which is provided across segments. - # This first function assigns photons to the segment they belong to. - # We end up making a new array - # that has more points to match the photon. Segment is defined as every 100m in the long track. - - # Filter all data where atl03_ph_index starts at 0 (0 indicates errors) - indsNotZero = atl03_ph_index_beg != 0 - atl03_ph_index_beg = atl03_ph_index_beg[indsNotZero] - atl03_segment_id = atl03_segment_id[indsNotZero] - - # Subtract 1 from ph_index_beg to start at python 0th pos - atl03_ph_index_beg = atl03_ph_index_beg - 1 - - # Sometimes the ph_index_beg is not at the 0th position, it is not, - # add it in and then add the associated segment id - # Warning, this is assuming that the segment id for the points are from - # the segment id directly before it, this assumption might fail, but I have - # not come across a case yet where it does. 
If you want to play it safe - # you could comment this section out and then if the first position is not - # 0 then all photons before the first position will not be assigned a - # segment id. - # if atl03_ph_index_beg[0] != 0: - # atl03_ph_index_beg = np.append(0,atl03_ph_index_beg) - # first_seg_id = atl03_segment_id[0] -1 - # atl03_segment_id = np.append(first_seg_id,atl03_segment_id) - - # Append atl03_height_len to end of array for final position - atl03_ph_index_beg = np.append(atl03_ph_index_beg, atl03_heights_len) - - # Make array equal to the length of the atl03_heights photon level data - ph_segment_id = np.zeros(atl03_heights_len) - - # Iterate through ph_index_beg, from the first to second to last number - # and set the photons between ph_index_beg i to ph_index_beg i + 1 to - # segment id i - for i in range(len(atl03_ph_index_beg) - 1): - ph_segment_id[atl03_ph_index_beg[i] : atl03_ph_index_beg[i + 1]] = ( - atl03_segment_id[i] - ) - - # Return list of segment_id at the photon level - return ph_segment_id - - -def ref_linear_interp(x, y): - # initialize an empty list - arr = [] - - # get unique x values - ux = np.unique(x) - for u in ux: - # get y values for x=u - idx = y[x == u] - - # try to get the y values for x=u-1 and x=u - # if this is not possible, set min and max to y values for x=u - try: - min = y[x == u - 1][0] - max = y[x == u][0] - except: - min = y[x == u][0] - max = y[x == u][0] - - # try to get the y values for x=u and x=u+1 - # if this is not possible, set min and max to y values for x=u - try: - min = y[x == u][0] - max = y[x == u + 1][0] - except: - min = y[x == u][0] - max = y[x == u][0] - - # if the min and max values are the same, - # fill the sub array with the value - if min == max: - sub = np.full((len(idx)), min) - arr.append(sub) - - # if min and max are different, - # create a sub array using linear interpolation - else: - sub = np.linspace(min, max, len(idx)) - arr.append(sub) - - # concatenate all the sub arrays into a 
single array and return - return np.concatenate(arr, axis=None).ravel() - - -# Bin data along vertical and horizontal scales -def horizontal_vertical_bin_dataset(dataset, lat_res, vertical_res): - """Bin data along vertical and horizontal scales - for later segmentation""" - - # Filter values within the range (-50, 10), because photons elevation outside this range will be real noise - valid_range = (-50, 10) - valid_mask = (dataset["photon_height"] > valid_range[0]) & ( - dataset["photon_height"] < valid_range[1] - ) - - # Apply the valid_mask to filter unwanted values - filtered_dataset = dataset[valid_mask] - - # Calculate the number of height bins - height_range = abs( - filtered_dataset["photon_height"].max() - - filtered_dataset["photon_height"].min() - ) - height_bin_number = max( - 1, round(height_range / vertical_res) - ) # Ensure at least one bin - - # Calculate the number of latitude bins - lat_range = abs(filtered_dataset["lat"].max() - filtered_dataset["lat"].min()) - lat_bin_number = max(1, round(lat_range / lat_res)) # Ensure at least one bin - - # Create bins for latitude - lat_bins = pd.cut( - filtered_dataset["lat"], bins=lat_bin_number, labels=np.arange(lat_bin_number) - ) - - # Create bins for height - height_bins = pd.cut( - filtered_dataset["photon_height"], - bins=height_bin_number, - labels=np.round( - np.linspace( - filtered_dataset["photon_height"].min(), - filtered_dataset["photon_height"].max(), - num=height_bin_number, - ), - decimals=1, - ), - ) - - # Add bins to dataframe using .loc to avoid SettingWithCopyWarning - filtered_dataset.loc[:, "lat_bins"] = lat_bins - filtered_dataset.loc[:, "height_bins"] = height_bins - filtered_dataset = filtered_dataset.reset_index(drop=True) - - return filtered_dataset - - -# thinking about grid searching to detect bathymetric directly -# rather than bin and then search -def horizontal_vertical_grid_density_cal( - dataset, lat_res, vertical_res, density_threshold -): - """Bin data along vertical and 
horizontal scales - and calculate high-density points using a grid method""" - - # Calculate the number of bins required both vertically - lat_bin_number = round(abs(dataset["lat"].min() - dataset["lat"].max()) / lat_res) - # and horizontally based on resolution size - height_bin_number = round( - abs(dataset["photon_height"].min() - dataset["photon_height"].max()) - / vertical_res - ) - - # Create the grid - grid = np.zeros((lat_bin_number, height_bin_number)) - - # Iterate over the dataset and assign points to cells - for _, row in dataset.iterrows(): - lat_index = int((row["lat"] - dataset["lat"].min()) / lat_res) - height_index = int( - (row["photon_height"] - dataset["photon_height"].min()) / vertical_res - ) - grid[lat_index, height_index] += 1 - - # Identify high-density cells - high_density_cells = np.argwhere(grid > density_threshold) - - # Create a copy of the dataset - dataset_copy = dataset.copy() - - # Assign the cell indices as bins to the dataset - dataset_copy["lat_bins"] = pd.cut( - dataset["lat"], bins=lat_bin_number, labels=np.arange(lat_bin_number) - ) - dataset_copy["height_bins"] = pd.cut( - dataset["photon_height"], - bins=height_bin_number, - labels=np.arange(height_bin_number), - ) - - # Reset the index of the copied dataset - dataset_copy = dataset_copy.reset_index(drop=True) - - return dataset_copy, high_density_cells - - -# Bin data along horizontal scale only -def horizontal_bin_dataset(dataset, lat_res): - """Bin data along the horizontal scale (lat) only - for later segmentation""" - - # Calculate the number of bins required horizontally based on resolution size - lat_bin_number = round( - abs(dataset["lat_utm"].min() - dataset["lat_utm"].max()) / lat_res - ) - - # Cut lat bins - # lat_bins = pd.cut(dataset['lat'], bins=lat_bin_number, labels=np.array(range(lat_bin_number))) - lat_bins = pd.cut( - dataset["lat_utm"], bins=lat_bin_number, labels=np.array(range(lat_bin_number)) - ) - - # Create a copy of the dataset - dataset_copy = 
dataset.copy() - - # Add lat bins to the dataframe - dataset_copy["lat_bins"] = lat_bins - - # Reset the index of the copied dataset - dataset_copy = dataset_copy.reset_index(drop=True) - - return dataset_copy - - -# Bin data first, and then throw away only the surface bin -def get_rm_sea_surface_bin(binned_dataset): - """Calculate mean sea height for easier calculation of depth and cleaner - figures""" - - # set flag for the df save - flag = 1 - - # group dataset by lat bins - grouped_data = binned_dataset.groupby(["lat_bins"], group_keys=True) - data_groups = dict(list(grouped_data)) - - # Loop through groups and return average sea height - for k, v in data_groups.items(): - lat_bin_average = v["lat"].mean() - - # Create new dataframe based on occurrence of photons per height bin - new_df = pd.DataFrame(v.groupby("height_bins", observed=False).count()) - - # Return the bin with the highest count - largest_h_bin = new_df["lat"].argmax() - - # Select the index of the bin with the highest count - largest_h_index = new_df.index[largest_h_bin] - - # get all values below this bin - # Use boolean indexing to select only the values below the peak bin - new_photon_array_without_peak_bin = v.loc[v["height_bins"] < largest_h_index] - - if flag == 1: - photon_array_without_peak_bin = new_photon_array_without_peak_bin - flag = 2 - - else: - photon_array_without_peak_bin = photon_array_without_peak_bin.append( - new_photon_array_without_peak_bin - ) - - del new_df - - return photon_array_without_peak_bin - - -def get_sea_surface_height(binned_data, threshold): - """Calculate mean sea height for easier calculation of depth and cleaner figures""" - - # set flag for the df save - firstTimeIndex = True - - # Create sea height list - sea_surface_height = [] - mean_lat_bins_seq = [] - sea_surface_subsurface_photons_ratio = [] - - # Group dataset by latitude bins - grouped_data = binned_data.groupby(["lat_bins"], group_keys=True, observed=False) - data_groups = 
dict(list(grouped_data)) - - # Loop through groups and return average sea height - for k, v in data_groups.items(): - # based on lat_utm - lat_bin_average = v["lat"].mean() - - # Create new dataframe based on occurrence of photons per height bin - new_df = pd.DataFrame(v.groupby(["height_bins"], observed=False).count()) - - # Return the bin with the highest count - largest_h_bin = new_df["lat"].argmax() - - # Select the index of the bin with the highest count - largest_h_index = new_df.index[largest_h_bin] - - # Calculate the median value of all photon height values within this bin - photons_sea_surface = v.loc[ - v["height_bins"] == largest_h_index, "photon_height" - ] - lat_bin_sea_median = photons_sea_surface.median() - - # Append to sea height list - sea_surface_height.append(lat_bin_sea_median) - mean_lat_bins_seq.append(lat_bin_average) - del new_df - - # Get all photons below sea surface - # to determine segment type of each subsurface water column - # Use calculated sea height to determine photons at 0.5m below peak - photons_sea_surface_up = v.loc[ - (v["photon_height"] > (lat_bin_sea_median - threshold)) - & (v["photon_height"] < (lat_bin_sea_median + 2 * threshold)) - ] - - # Calculate the photon ratio between surface and whole photons - if v["photon_height"].shape[0] > 0: - new_photons_ratio_sea_surface = ( - photons_sea_surface_up.shape[0] / v["photon_height"].shape[0] - ) - else: - new_photons_ratio_sea_surface = np.nan - - sea_surface_subsurface_photons_ratio.append(1 - new_photons_ratio_sea_surface) - - # Filter out sea height bin values outside 2 SD of mean. 
- mean = np.nanmean(sea_surface_height, axis=0) - sd = np.nanstd(sea_surface_height, axis=0) - - final_sea_surface_height = np.where( - (sea_surface_height > (mean + 2 * sd)) | (sea_surface_height < (mean - 2 * sd)), - np.nan, - sea_surface_height, - ).tolist() - - sea_surface_height_abnormal_label = np.where( - np.isnan(final_sea_surface_height), 0, 1 - ) - - # Determine label based on ratio of sea surface photons and subsurface photons - sea_surface_dominated_label = np.where( - np.array(sea_surface_subsurface_photons_ratio) >= 0.2, 0, 1 - ) - - # Loop through groups again and return photons below 0.5m of sea height - PhotonDFBelowThresholdPeak = pd.DataFrame() - for i, (k, v) in enumerate(data_groups.items()): - # Get all values below this bin - NewPhotonDFBelowThresholdPeak = v.loc[ - v["photon_height"] < (final_sea_surface_height[i] - threshold) - ] - - if firstTimeIndex: - PhotonDFBelowThresholdPeak = NewPhotonDFBelowThresholdPeak - firstTimeIndex = False - else: - PhotonDFBelowThresholdPeak = pd.concat( - [PhotonDFBelowThresholdPeak, NewPhotonDFBelowThresholdPeak] - ) - - return ( - final_sea_surface_height, - sea_surface_height_abnormal_label, - sea_surface_dominated_label, - PhotonDFBelowThresholdPeak, - ) - - -# -# Arbitrary cutoff below the max value - 0.5 m below peak -# (we may also use 1 m but we can add that later) (apply to raw photon data, each of 6 beams) -def get_photon_below_sea_surface(binned_data, threshold): - """Calculate mean sea height for easier calculation of depth and cleaner figures""" - - # set flag for the df save - firstTimeIndex = 1 - - # Create sea height list - sea_surface_height = [] - # mean_lat_bins_seq=[] - - grouped_data = binned_data.groupby(["lat_bins"], group_keys=True, observed=False) - data_groups = dict(list(grouped_data)) - - # Loop through groups and return average sea height - for k, v in data_groups.items(): - lat_bin_average = v["lat_utm"].mean() - - # Create new dataframe based on occurance of photons per height 
bin - new_df = pd.DataFrame(v.groupby(["height_bins"], observed=False).count()) - - # Return the bin with the highest count - largest_h_bin = new_df["lat_utm"].argmax() - - # Select the index of the bin with the highest count - largest_h = new_df.index[largest_h_bin] - - # Calculate the median value of all values within this bin - lat_bin_sea_median = v.loc[ - v["height_bins"] == largest_h, "photon_height" - ].median() - - # Append to sea height list - sea_surface_height.append(lat_bin_sea_median) - # mean_lat_bins_seq.append(lat_bin_average) - del new_df - - # Filter out sea height bin values outside 2 SD of mean. - mean = np.nanmean(sea_surface_height, axis=0) - sd = np.nanstd(sea_surface_height, axis=0) - Final_sea_height = np.where( - (sea_surface_height > (mean + 2 * sd)) | (sea_surface_height < (mean - 2 * sd)), - np.nan, - sea_surface_height, - ).tolist() - Abnormal_sea_height_label = np.where(np.isnan(Final_sea_height), 1, 0) - - # Loop through groups again and return photons below 0.5m of sea height - for k, v in data_groups.items(): - # get all values below this bin - # Use calculated sea height to determine photons at 0.5m below peak - NewPhotonArrayBelowThresholdPeak = v.loc[ - v["photon_height"] < (sea_surface_height[k] - threshold) - ] - - if firstTimeIndex == 1: - PhotonArrayBelowThresholdPeak = NewPhotonArrayBelowThresholdPeak - firstTimeIndex = 2 - - else: - PhotonArrayBelowThresholdPeak = PhotonArrayBelowThresholdPeak.append( - NewPhotonArrayBelowThresholdPeak - ) - - return PhotonArrayBelowThresholdPeak - - -# Function to get elevation for multiple points -def get_seafloor_bathy_GEBCO_batch(lons, lats, raster, raster_data): - # Convert geographic coordinates to the raster's coordinate system - rows, cols = raster.index(lons, lats) - rows, cols = np.array(rows), np.array(cols) - - # Ensure the indices are within bounds - valid_mask = ( - (rows >= 0) & (rows < raster.height) & (cols >= 0) & (cols < raster.width) - ) - elevations = 
np.full(lons.shape, np.nan) - elevations[valid_mask] = raster_data[rows[valid_mask], cols[valid_mask]] - - return elevations - - -def get_water_temp(date_year, date_month, date_day, latitude, longitude): - """ - Pull down surface water temperature along the track from the JPL GHRSST opendap website. - - The GHRSST data are gridded tiles with dimension 17998 x 35999. - To get the specific grid tile of the SST, you must convert from lat, lon coordinates - to the gridded tile ratio of the SST data product using the coordinates of the IS2 data. - """ - # Get date from data filename - # data_path[-33:-25] - date = date_year + date_month + date_day - # date[0:4] - year = date_year - # date[4:6] - month = date_month - # date[6:8] - day = date_day - day_of_year = str(datetime.strptime(date, "%Y%m%d").timetuple().tm_yday) - # Add zero in front of day of year string - zero_day_of_year = day_of_year.zfill(3) - - # Calculate ratio of latitude from mid-point of IS2 track - old_lat = latitude.mean() - old_lat_min = -90 - old_lat_max = 90 - new_lat_min = 0 - new_lat_max = 17998 - - new_lat = round( - ((old_lat - old_lat_min) / (old_lat_max - old_lat_min)) - * (new_lat_max - new_lat_min) - + new_lat_min - ) - - # Calculate ratio of longitude from mid-point of IS2 track - old_lon = longitude.mean() - old_lon_min = -180 - old_lon_max = 180 - new_lon_min = 0 - new_lon_max = 35999 - - new_lon = round( - ((old_lon - old_lon_min) / (old_lon_max - old_lon_min)) - * (new_lon_max - new_lon_min) - + new_lon_min - ) - - # Access the SST data using the JPL OpenDap interface - url = ( - "https://opendap.jpl.nasa.gov/opendap/OceanTemperature/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/" - + str(year) - + "/" - + str(zero_day_of_year) - + "/" - + str(date) - + "090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc" - ) - - dataset = netCDF4.Dataset(url) - - # Access the data and convert the temperature from K to C - water_temp = dataset["analysed_sst"][0, new_lat, new_lon] - 273.15 - return water_temp 
- - -def refraction_correction( - WTemp, - WSmodel, - Wavelength, - Photon_ref_elev, - Ph_ref_azimuth, - PhotonZ, - PhotonX, - PhotonY, - Ph_Conf, -): - """ - WTemp; there is python library that pulls water temp data - WSmodel is the value surface height - Wavelength is fixed - """ - - # Only process photons below water surface model - PhotonX = PhotonX[PhotonZ <= WSmodel] - PhotonY = PhotonY[PhotonZ <= WSmodel] - Photon_ref_elev = Photon_ref_elev[PhotonZ <= WSmodel] - Ph_ref_azimuth = Ph_ref_azimuth[PhotonZ <= WSmodel] - Ph_Conf = Ph_Conf[PhotonZ <= WSmodel] - PhotonZ = PhotonZ[PhotonZ <= WSmodel] - - # water temp for refraction correction - WaterTemp = WTemp - - # Refraction coefficient # - a = -0.000001501562500 - b = 0.000000107084865 - c = -0.000042759374989 - d = -0.000160475520686 - e = 1.398067112092424 - wl = Wavelength - - # refractive index of air - n1 = 1.00029 - - # refractive index of water - n2 = (a * WaterTemp**2) + (b * wl**2) + (c * WaterTemp) + (d * wl) + e - - # assumption is 0.25416 - # This example is refractionCoef = 0.25449 - # 1.00029 is refraction of air constant - correction_coef = 1 - (n1 / n2) - - # read photon ref_elev to get theta1 - theta1 = np.pi / 2 - Photon_ref_elev - - # eq 1. Theta2 - theta2 = np.arcsin((n1 * np.sin(theta1)) / n2) - - # eq 3. S - # Approximate water Surface = 1.5 - # D = raw uncorrected depth - D = WSmodel - PhotonZ - - # For Triangle DTS - S = D / np.cos(theta1) - - # eq 2. 
R - R = (S * n1) / n2 - Gamma = (np.pi / 2) - theta1 - - # For triangle RPS - # phi is an angle needed - phi = theta1 - theta2 - - # P is the difference between raw and corrected YZ location - P = np.sqrt(R**2 + S**2 - 2 * R * S * np.cos(phi)) - - # alpha is an angle needed - alpha = np.arcsin((R * np.sin(phi)) / P) - - # Beta angle needed for Delta Y an d Delta Z - Beta = Gamma - alpha - - # Delta Y - DY = P * np.cos(Beta) - - # Delta Z - DZ = P * np.sin(Beta) - - # Delta Easting - DE = DY * np.sin(Ph_ref_azimuth) - - # Delta Northing - DN = DY * np.cos(Ph_ref_azimuth) - - outX = PhotonX + DE - outY = PhotonY + DN - outZ = PhotonZ + DZ - - """ - print('For selected Bathy photon:') - print('lat = ', PhotonY[9000]) - print('long = ', PhotonX[9000]) - print('Raw Depth = ', PhotonZ[9000]) - print('D = ', D[9000]) - - print('ref_elev = ', Photon_ref_elev[9000]) - - print('Delta East = ', DE[9000]) - print('Delta North = ', DN[9000]) - print('Delta Z = ', DZ[9000]) - """ - return ( - outX, - outY, - outZ, - Ph_Conf, - PhotonX, - PhotonY, - PhotonZ, - Ph_ref_azimuth, - Photon_ref_elev, - ) # We are most interested in out-x, out-y, out-z - - -def get_bath_height_percentile_thresh( - binned_data, percentile_thresh, sea_surface_height, vertical_res -): - """Detect bathymetric level per bin based on percentile_thresh""" - # Create sea height list - bath_height = [] - - geo_photon_height = [] - geo_longitude = [] - geo_latitude = [] - - # Group data by latitude - # Filter out surface data that are two bins below median surface value calculated above - binned_data_bath = binned_data[ - (binned_data["photon_height"] < sea_surface_height - (vertical_res * 2)) - ] - grouped_data = binned_data_bath.groupby(["lat_bins"], group_keys=True) - data_groups = dict(list(grouped_data)) - - # Create a percentile threshold of photon counts in each grid, - # grouped by both x and y axes. 
- count_threshold = np.percentile( - binned_data.groupby(["lat_bins", "height_bins"]) - .size() - .reset_index() - .groupby("lat_bins")[[0]] - .max(), - percentile_thresh, - ) - - # Loop through groups and return average bathy height - for k, v in data_groups.items(): - new_df = pd.DataFrame(v.groupby("height_bins").count()) - bath_bin = new_df["lat"].argmax() - bath_bin_h = new_df.index[bath_bin] - - # Set threshold of photon counts per bin - # here this script determines whether there is bathymetry signals by - # the photon counts per bin below sea surface height - if new_df.iloc[bath_bin]["lat"] >= count_threshold: - geo_photon_height.append( - v.loc[v["height_bins"] == bath_bin_h, "cor_photon_height"].values - ) - geo_longitude.append(v.loc[v["height_bins"] == bath_bin_h, "lon"].values) - geo_latitude.append(v.loc[v["height_bins"] == bath_bin_h, "lat"].values) - - bath_bin_median = v.loc[ - v["height_bins"] == bath_bin_h, "cor_photon_height" - ].median() - bath_height.append(bath_bin_median) - del new_df - - else: - bath_height.append(np.nan) - del new_df - - geo_longitude_list = np.concatenate(geo_longitude).ravel().tolist() - geo_latitude_list = np.concatenate(geo_latitude).ravel().tolist() - geo_photon_list = np.concatenate(geo_photon_height).ravel().tolist() - geo_depth = sea_surface_height - geo_photon_list - geo_df = pd.DataFrame( - { - "lon": geo_longitude_list, - "lat": geo_latitude_list, - "photon_height": geo_photon_list, - "depth": geo_depth, - } - ) - - del geo_longitude_list, geo_latitude_list, geo_photon_list - - return bath_height, geo_df - - -def get_bath_height_HDBSCAN( - lat_binned_data, percentile_thresh, sea_surface_height, vertical_res -): - """Calculate bathymetric level per lat bin based on horizontal resolution""" - # Create sea height list - bath_height = [] - geo_photon_height = [] - geo_longitude = [] - geo_latitude = [] - - # Group data by latitude - # Filter out surface data that are two bins (2 times height resolution) - # below 
median sea surface value calculated above - binned_data_bath = lat_binned_data[ - (lat_binned_data["photon_height"] < sea_surface_height - (vertical_res * 2)) - ] - - grouped_data = binned_data_bath.groupby(["lat_bins"], group_keys=True) - data_groups = dict(list(grouped_data)) - - # Loop through groups and return average bathymetric height - for k, v in data_groups.items(): - # assign each group of dataset to a new dataframe - new_df = pd.DataFrame(v) - - # Check if the DataFrame is empty - if new_df.empty: - # If the DataFrame is empty, append null values to the lists and skip to the next iteration - bath_height.append(np.nan) - geo_photon_height.append([]) - geo_longitude.append([]) - geo_latitude.append([]) - del new_df - continue - - lat_height_pairs = list( - zip(new_df["lat_utm"], new_df["cor_photon_height"], strict=False) - ) - - # Convert to a numpy array for sklearn - lat_height_pairs_array = np.array(lat_height_pairs) - - # Perform HDBSCAN clustering on the photons below sea surface for each lat bin - # scaler = StandardScaler() - scaler = MinMaxScaler() - data_scaled = scaler.fit_transform(lat_height_pairs_array) - - min_cluster_size = 4 - - # Check the number of data points is greater than the initial min_cluster_size - if len(data_scaled) < min_cluster_size: - min_cluster_size = len(data_scaled) // 2 - if min_cluster_size < 2: - bath_height.append(np.nan) - geo_photon_height.append([]) - geo_longitude.append([]) - geo_latitude.append([]) - del new_df - continue - - # cluster = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, leaf_size=20) - cluster = hdbscan.HDBSCAN( - min_cluster_size=min_cluster_size, - min_samples=3, - cluster_selection_epsilon=20, - leaf_size=25, - core_dist_n_jobs=-1, - ) - cluster.fit(data_scaled) - - # Get the labels assigned to each point by the HDBSCAN model - labels = cluster.labels_ - - # Count the number of points assigned to each cluster - unique, counts = np.unique(labels, return_counts=True) - - # Create a dictionary 
that maps each cluster label to the count of points assigned to it - clusters_dict = dict(zip(unique, counts, strict=False)) - - # Print the dictionary to see the size of each cluster - print(clusters_dict) - - # Identify the label of the largest cluster - max_cluster_label = max(clusters_dict, key=clusters_dict.get) - - # Check if no cluster is found - if max_cluster_label == -1: - bath_height.append(np.nan) - else: - # Add labels to the DataFrame - new_df_copy = new_df.copy() - new_df_copy["cluster"] = labels - - # Subset the DataFrame to get only the data points in the largest cluster - bath_cluster_data = new_df[new_df_copy["cluster"] == max_cluster_label] - - # calculate the median height value as the average bathymetric height - bath_bin_median = bath_cluster_data["cor_photon_height"].median() - bath_height.append(bath_bin_median) - - # Extract the longitude, latitude, and photon height for each lat bin - # and add it to respective lists - geo_photon_height.append(bath_cluster_data["cor_photon_height"].values) - geo_longitude.append(bath_cluster_data["lon_utm"].values) - geo_latitude.append(bath_cluster_data["lat_utm"].values) - - del new_df - - # Convert the lists to a single list - geo_longitude_list = np.concatenate(geo_longitude).ravel().tolist() - geo_latitude_list = np.concatenate(geo_latitude).ravel().tolist() - geo_photon_list = np.concatenate(geo_photon_height).ravel().tolist() - - # Calculate depth - geo_depth = sea_surface_height - np.array(geo_photon_list) - - # Create a DataFrame - geo_df = pd.DataFrame( - { - "lon": geo_longitude_list, - "lat": geo_latitude_list, - "photon_height": geo_photon_list, - "depth": geo_depth.tolist(), - } - ) - - return bath_height, geo_df - - -# requires that the input gdf has ranged index values i -# will need to change if index is changed to time or something -# this currently checks point in polygon for EVERY point -# would be significantly sped up if evaluated at 10m or something similar -# maybe later, fine for 
now -def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): - # try loading the shoreline data - try: - ICESat2_GDF.insert(0, "lat", ICESat2_GDF.geometry.y, False) - ICESat2_GDF.insert(0, "lon", ICESat2_GDF.geometry.x, False) - - # allocation of to be used arrays - zero_int_array = np.int64(np.zeros_like(ICESat2_GDF.geometry.x)) - - # Land flag initialized as -1 - # If shorelines downloaded already, will be set to 0 or 1 - ICESat2_GDF.insert(0, "is_land", zero_int_array - 1, False) - - # set the projection - ICESat2_GDF.set_crs("EPSG:4326", inplace=True) - - # load shoreline dataset to include only the features that intersect the bounding box - # bbox can be GeoDataFrame or GeoSeries | shapely Geometry, default None - # Filter features by given bounding box, GeoSeries, GeoDataFrame or a shapely geometry. - # engine str, 'fiona' or 'pyogrio' - # somtime it gives error if using fiona - # land_polygon_gdf = gpd.read_file(shoreline_data_path, bbox=ICESat2_GDF, engine='fiona') - land_polygon_gdf = gpd.read_file( - shoreline_data_path, bbox=ICESat2_GDF, engine="pyogrio" - ) - - # continue with getting a new array of 0-or-1 labels for each photon - land_point_labels = np.zeros_like(ICESat2_GDF.is_land.values) - - # update labels for points in the land polygons - pts_in_land = gpd.sjoin(ICESat2_GDF, land_polygon_gdf, predicate="within") - - # get land or not bool value - land_loc = ICESat2_GDF.index.isin(pts_in_land.index) - - # asigned them to new numpy array - land_point_labels[land_loc] = 1 - land_point_labels[~land_loc] = 0 - - return land_point_labels - - except Exception as e: - print(e) - - print("Error loading shoreline data, returning -1s for is_land flag") - - # if the shoreline data is not available - # return the original label array - - return -np.ones_like(ICESat2_GDF.is_land.values) - - -def produce_figures( - binned_data, - bath_height, - sea_height, - solo_sea_surface_label, - y_limit_top, - y_limit_bottom, - percentile, - file, - geo_df, - ref_y, 
- ref_z, - beam, - epsg_num, -): - """Create figures""" - - # Create bins for latitude - bath_x_axis_bins = ( - np.linspace(binned_data.lat.min(), binned_data.lat.max(), len(bath_height)) + 20 - ) - - sea_surface_x_axis_bins = ( - np.linspace(binned_data.lat.min(), binned_data.lat.max(), len(sea_height)) + 10 - ) - - # Create new dataframes for median values - bath_median_df = pd.DataFrame({"x": bath_x_axis_bins, "y": bath_height}) - - # Create uniform sea surface based on median sea surface values and filter out surface breaching - sea_height1 = [np.nanmedian(sea_height) if i == i else np.nan for i in sea_height] - sea_median_df = pd.DataFrame({"x": sea_surface_x_axis_bins, "y": sea_height1}) - - # Create uniform solo sea surface label - sea_surface_label = solo_sea_surface_label - sea_surface_label_df = pd.DataFrame( - {"x": sea_surface_x_axis_bins, "y": sea_surface_label} - ) - idx_1 = np.where(sea_surface_label_df.y == 1) - idx_0 = np.where(sea_surface_label_df.y == 0) - - # Define figure size - fig = plt.rcParams["figure.figsize"] = (40, 25) - - # Plot raw points - # plt.scatter(x=binned_data.lat, - # y = binned_data.photon_height, marker='o', lw=0, s=1, alpha = 0.8, - # c = 'yellow', label = 'Raw photon height') - plt.scatter(ref_y, ref_z, s=0.5, alpha=0.1, c="black") - plt.scatter( - geo_df.lat, - geo_df.photon_height, - s=0.8, - marker="o", - alpha=0.1, - c="red", - label="Classified Photons", - ) - - # plt.scatter(x=geo_df.lat, - # y = geo_df.photon_height, marker='o', lw=0, s=0.8, - # alpha = 0.8, c = 'black', label = 'Corrected photon bin') - - # Plot median values - plt.scatter( - bath_median_df.x, - bath_median_df.y, - marker="o", - c="r", - alpha=0.8, - s=2, - label="Median bathymetry", - ) - - plt.scatter( - sea_median_df.x, - sea_median_df.y, - marker="o", - c="b", - alpha=1, - s=2, - label="Median sea surface", - ) - - plt.scatter( - sea_surface_label_df.iloc[idx_1].x, - sea_surface_label_df.iloc[idx_1].y, - marker="o", - c="pink", - alpha=1, - 
s=3, - label="solo_sea_surface", - ) - plt.scatter( - sea_surface_label_df.iloc[idx_0].x, - sea_surface_label_df.iloc[idx_0].y, - marker="o", - c="g", - alpha=1, - s=3, - label="non_solo_sea_surface", - ) - - # Insert titles and subtitles - plt.title("Icesat2 Bathymetry\n" + file) - plt.xlabel("Latitude", fontsize=25) - plt.ylabel("Photon Height (m)", fontsize=25) - plt.xticks(fontsize=16) - plt.yticks(fontsize=16) - - plt.legend(loc="upper left", prop={"size": 20}) - - # Limit the x and y axes using parameters - plt.xlim(left=binned_data.lat.min(), right=binned_data.lat.max()) - plt.ylim(top=y_limit_top, bottom=y_limit_bottom) - - timestr = time.strftime("%Y%m%d_%H%M%S") - file = file.replace(".h5", "") - # Define where to save file - plt.tight_layout() - plt.savefig( - "C:/Workstation/ICESat2_HLS/" - + file - + "_gt" - + str(beam) - + "_" - + str(percentile) - + "_EPSG" - + str(epsg_num) - + "_" - + timestr - + ".pdf" - ) - # plt.show() - # plt.close() - - # convert corrected locations back to wgs84 (useful to contain) - transformer = Transformer.from_crs( - "EPSG:" + str(epsg_num), "EPSG:4326", always_xy=True - ) - print(transformer) - lon_wgs84, lat_wgs84 = transformer.transform(geo_df.lon.values, geo_df.lat.values) - - geo_df["lon_wgs84"] = lon_wgs84 - geo_df["lat_wgs84"] = lat_wgs84 - - geodf = gpd.GeoDataFrame( - geo_df, geometry=gpd.points_from_xy(geo_df.lon_wgs84, geo_df.lat_wgs84) - ) - - geodf.set_crs(epsg=4326, inplace=True) - - # geodf.to_file("C:/Workstation/ICESat2_HLS/" +file + '_gt' + '_' + str(percentile) + '_EPSG' + - # str(epsg_num) + '_' + timestr + ".gpkg", - # driver="GPKG") diff --git a/icesat-2_kdph_py/kd_utils/sea_photons_analysis.py b/icesat-2_kdph_py/kd_utils/sea_photons_analysis.py deleted file mode 100644 index b0c30ec..0000000 --- a/icesat-2_kdph_py/kd_utils/sea_photons_analysis.py +++ /dev/null @@ -1,570 +0,0 @@ -from datetime import datetime - -import netCDF4 -import numpy as np -import pandas as pd -from scipy.signal import 
find_peaks -from scipy.stats import norm - - -# Function to apply binning beam-by-beam by calling the function of horizontal_vertical_bin_dataset -def process_sea_photon_binning(sea_photon_dataset, horizontal_res, vertical_res): - """ - Apply horizontal and vertical binning to each beam. - - Flowchart steps: - 3 — Choose along-track (x) and vertical (z) bin sizes - (e.g., 500 m in x, 0.25 m in z). - 4 — Build vertical histograms of photon counts using selected x, z bin sizes. - """ - # Initialize list to store results from each beam - binned_beam_datasets = [] - - # Group the dataset by 'beam_id' and process each group separately - for beam_id, beam_data in sea_photon_dataset.groupby("beam_id"): - print(f"Processing binning for beam: {beam_id}") - - # Apply binning to the current beam dataset - binned_beam_data = horizontal_vertical_bin_dataset( - beam_data, horizontal_res, vertical_res - ) - - # Append the binned data for the current beam to the list - binned_beam_datasets.append(binned_beam_data) - - # Combine all binned beam datasets into a single DataFrame - binned_dataset_sea_surface = pd.concat(binned_beam_datasets, ignore_index=True) - - return binned_dataset_sea_surface - - -# Bin data along vertical and horizontal scales -def horizontal_vertical_bin_dataset(dataset, lat_res, vertical_res): - """ - Bin data along vertical and horizontal scales for later segmentation. - - Flowchart steps: - 3 — Choose along-track (x) and vertical (z) bin sizes - (e.g., 500 m in x, 0.25 m in z). - 4 — Build vertical histograms of photon counts using selected x, z bin sizes. - 10 — Re-build histograms based on corrected depths (called again after - refraction correction when apply_post_refraction_refit is enabled). 
- """ - - # Filter values within the range (-50, 10), because photons elevation outside this range will be real noise - valid_range = (-70, 5) - valid_mask = (dataset["photon_height"] > valid_range[0]) & ( - dataset["photon_height"] < valid_range[1] - ) - - # Apply the valid_mask to filter unwanted values - # and create a copy to avoid SettingWithCopyWarning - filtered_dataset = dataset[valid_mask].copy() - - # Calculate the number of height bins - height_range = abs( - filtered_dataset["photon_height"].max() - - filtered_dataset["photon_height"].min() - ) - height_bin_number = max( - 1, round(height_range / vertical_res) - ) # Ensure at least one bin - - # Calculate the number of latitude bins - lat_range = abs(filtered_dataset["lat"].max() - filtered_dataset["lat"].min()) - lat_bin_number = max(1, round(lat_range / lat_res)) # Ensure at least one bin - - # Create bins for latitude - lat_bins = pd.cut( - filtered_dataset["lat"], bins=lat_bin_number, labels=np.arange(lat_bin_number) - ) - - # Create bins for height - height_bins = pd.cut( - filtered_dataset["photon_height"], - bins=height_bin_number, - labels=np.round( - np.linspace( - filtered_dataset["photon_height"].min(), - filtered_dataset["photon_height"].max(), - num=height_bin_number, - ), - decimals=1, - ), - ) - - # Add bins to dataframe using .loc to avoid SettingWithCopyWarning - filtered_dataset.loc[:, "lat_bins"] = lat_bins - filtered_dataset.loc[:, "height_bins"] = height_bins - filtered_dataset = filtered_dataset.reset_index(drop=True) - - return filtered_dataset - - -def get_sea_surface_height_static(binned_data, threshold): - """ - Calculate sea surface height and filter subsurface photons using static threshold. - - Flowchart step: 7 — Fit Gaussian curve to identify surface elevation; compute - standard deviation of Gaussian peak. 
- - Parameters: - binned_data (pandas.DataFrame): Binned photon data with lat, height_bins, photon_height - - Returns: - final_sea_surface_height (list): Detected sea surface heights - sea_surface_height_abnormal_label (array): Labels for abnormal heights - sea_surface_dominated_label (array): Labels for surface-dominated bins - PhotonDFBelowSurface (pandas.DataFrame): Photons below detected surface - """ - - # set flag for the df save - firstTimeIndex = True - - # Create sea height list - sea_surface_height = [] - mean_lat_bins_seq = [] - sea_surface_subsurface_photons_ratio = [] - - # Group dataset along horizental (latitude bins) - grouped_data = binned_data.groupby(["lat_bins"], group_keys=True, observed=False) - data_groups = dict(list(grouped_data)) - - # Loop through groups to detect sea surface - for k, v in data_groups.items(): - # based on lat_utm - lat_bin_average = v["lat"].mean() - - # Create new dataframe based on occurrence of photons per height bin - new_df = pd.DataFrame(v.groupby(["height_bins"], observed=False).count()) - - # Check if new_df is not empty before finding the bin with the highest photon count - if not new_df.empty: - # Find the vertical bin with the highest photon count - largest_h_bin = new_df["lat"].argmax() - - # Select the index of the bin with the highest count - largest_h_index = new_df.index[largest_h_bin] - - # Calculate the median value of all photon height values within this bin - photons_sea_surface = v.loc[ - v["height_bins"] == largest_h_index, "photon_height" - ] - lat_bin_sea_median = photons_sea_surface.median() - - # Append to sea height list - sea_surface_height.append(lat_bin_sea_median) - mean_lat_bins_seq.append(lat_bin_average) - del new_df - - # Get all photons below sea surface - # to determine segment type of each subsurface water column - # Use calculated sea height to determine photons at 0.5m below peak - photons_sea_surface_up = v.loc[ - (v["photon_height"] > (lat_bin_sea_median - threshold)) - & 
(v["photon_height"] < (lat_bin_sea_median + 2 * threshold)) - ] - - # Calculate the photon ratio between surface and whole photons - if v["photon_height"].shape[0] > 0: - new_photons_ratio_sea_surface = ( - photons_sea_surface_up.shape[0] / v["photon_height"].shape[0] - ) - else: - new_photons_ratio_sea_surface = np.nan - - sea_surface_subsurface_photons_ratio.append( - 1 - new_photons_ratio_sea_surface - ) - - else: - # Append NaNs if the group is empty - sea_surface_height.append(np.nan) - mean_lat_bins_seq.append(np.nan) - sea_surface_subsurface_photons_ratio.append(np.nan) - - # Filter out sea height bin values outside 2 SD of mean. - mean = np.nanmean(sea_surface_height, axis=0) - sd = np.nanstd(sea_surface_height, axis=0) - - final_sea_surface_height = np.where( - (sea_surface_height > (mean + 2 * sd)) | (sea_surface_height < (mean - 2 * sd)), - np.nan, - sea_surface_height, - ).tolist() - - sea_surface_height_abnormal_label = np.where( - np.isnan(final_sea_surface_height), 0, 1 - ) - - # Determine label based on ratio of sea surface photons and subsurface photons - sea_surface_dominated_label = np.where( - np.array(sea_surface_subsurface_photons_ratio) >= 0.2, 0, 1 - ) - - # Loop through groups again and return photons below 0.5m of sea height - PhotonDFBelowThresholdPeak = pd.DataFrame() - for i, (k, v) in enumerate(data_groups.items()): - # Get all values below this bin - if not np.isnan(final_sea_surface_height[i]): - NewPhotonDFBelowThresholdPeak = v.loc[ - v["photon_height"] < (final_sea_surface_height[i] - threshold) - ] - - if firstTimeIndex: - PhotonDFBelowThresholdPeak = NewPhotonDFBelowThresholdPeak - firstTimeIndex = False - else: - PhotonDFBelowThresholdPeak = pd.concat( - [PhotonDFBelowThresholdPeak, NewPhotonDFBelowThresholdPeak] - ) - - return ( - final_sea_surface_height, - sea_surface_height_abnormal_label, - sea_surface_dominated_label, - PhotonDFBelowThresholdPeak, - ) - - -def get_sea_surface_height_adaptive(binned_data): - """ - 
Calculate sea surface height and filter subsurface photons with enhanced wave removal. - - Flowchart steps: - 7 — Fit Gaussian curve to identify surface elevation; compute standard - deviation of Gaussian peak. - 11 — Re-fit Gaussian curve to surface to identify surface peak; compute - standard deviation and remove histogram data within three standard - deviations. (This function is called a second time after refraction - correction and histogram rebuild when apply_post_refraction_refit - is enabled.) - - Parameters: - binned_data (pandas.DataFrame): Binned photon data with lat, height_bins, photon_height - - Returns: - final_sea_surface_height (list): Detected sea surface heights - sea_surface_height_abnormal_label (array): Labels for abnormal heights - sea_surface_dominated_label (array): Labels for surface-dominated bins - PhotonDFBelowSurface (pandas.DataFrame): Photons below detected surface with waves removed - """ - - firstTimeIndex = True - sea_surface_height = [] - mean_lat_bins_seq = [] - sea_surface_subsurface_photons_ratio = [] - - grouped_data = binned_data.groupby(["lat_bins"], group_keys=True, observed=False) - data_groups = dict(list(grouped_data)) - - for k, v in data_groups.items(): - lat_bin_average = v["lat"].mean() - - if len(v) < 20: # Increase minimum photon count for robustness - sea_surface_height.append(np.nan) - mean_lat_bins_seq.append(np.nan) - sea_surface_subsurface_photons_ratio.append(np.nan) - continue - - # Finer histogram for better peak resolution - hist, bin_edges = np.histogram( - v["photon_height"], - bins=100, # Finer bins - density=True, - range=(v["photon_height"].min(), v["photon_height"].max()), - ) - bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2 - - # Enhanced peak detection - peaks, properties = find_peaks( - hist, - height=np.max(hist) * 0.3, # Stricter peak height - distance=10, # Wider separation - prominence=np.max(hist) * 0.1, - ) # Require prominent peaks - - if len(peaks) == 0: - 
sea_surface_height.append(np.nan) - mean_lat_bins_seq.append(np.nan) - sea_surface_subsurface_photons_ratio.append(np.nan) - continue - - # Use strongest peak as surface, consider nearby peaks as wave effects - surface_peak_idx = peaks[np.argmax(hist[peaks])] - surface_height = bin_centers[surface_peak_idx] - - # Two-pass Gaussian fit: start with ±1 m, widen if sigma is large - # (the ±1 m window underestimates sigma for SWH > 2 m) - half_win = 1.0 - peak_data = v[ - (v["photon_height"] > surface_height - half_win) - & (v["photon_height"] < surface_height + half_win) - ] - if len(peak_data) > 10: - mu, sigma = norm.fit(peak_data["photon_height"]) - # Refit with wider window if sigma suggests truncation - if sigma > 0.4: - half_win2 = min(2.5 * sigma, 3.0) - peak_data2 = v[ - (v["photon_height"] > mu - half_win2) - & (v["photon_height"] < mu + half_win2) - ] - if len(peak_data2) > 10: - mu, sigma = norm.fit(peak_data2["photon_height"]) - else: - mu, sigma = surface_height, 0.2 # More conservative default - - adaptive_threshold = min(mu - 3.0 * sigma, mu - 1.0) # at least 1 m below - - # Extended surface layer to remove wave effects - surface_photons = v[ - (v["photon_height"] > adaptive_threshold) - & (v["photon_height"] < mu + 3.0 * sigma) - ] # Wider upper bound - ratio = len(surface_photons) / len(v) if len(v) > 0 else np.nan - - sea_surface_height.append(mu) - mean_lat_bins_seq.append(lat_bin_average) - sea_surface_subsurface_photons_ratio.append(1 - ratio) - - # Outlier filtering - mean = np.nanmean(sea_surface_height) - sd = np.nanstd(sea_surface_height) - final_sea_surface_height = np.where( - (sea_surface_height > mean + 2 * sd) | (sea_surface_height < mean - 2 * sd), - np.nan, - sea_surface_height, - ).tolist() - - sea_surface_height_abnormal_label = np.where( - np.isnan(final_sea_surface_height), 0, 1 - ) - sea_surface_dominated_label = np.where( - np.array(sea_surface_subsurface_photons_ratio) >= 0.2, 0, 1 - ) - - # Filter subsurface photons with 
diagnostics - PhotonDFBelowSurface = pd.DataFrame() - for i, (k, v) in enumerate(data_groups.items()): - if not np.isnan(final_sea_surface_height[i]): - half_win = 1.0 - peak_data = v[ - (v["photon_height"] > final_sea_surface_height[i] - half_win) - & (v["photon_height"] < final_sea_surface_height[i] + half_win) - ] - if len(peak_data) > 10: - mu, sigma = norm.fit(peak_data["photon_height"]) - if sigma > 0.4: - half_win2 = min(2.5 * sigma, 3.0) - peak_data2 = v[ - (v["photon_height"] > mu - half_win2) - & (v["photon_height"] < mu + half_win2) - ] - if len(peak_data2) > 10: - mu, sigma = norm.fit(peak_data2["photon_height"]) - adaptive_threshold = min(mu - 3.0 * sigma, mu - 1.0) - else: - adaptive_threshold = final_sea_surface_height[i] - 1.0 - - NewPhotonDFBelowSurface = v[v["photon_height"] < adaptive_threshold] - - if firstTimeIndex: - PhotonDFBelowSurface = NewPhotonDFBelowSurface - firstTimeIndex = False - else: - PhotonDFBelowSurface = pd.concat( - [PhotonDFBelowSurface, NewPhotonDFBelowSurface] - ) - - return ( - final_sea_surface_height, - sea_surface_height_abnormal_label, - sea_surface_dominated_label, - PhotonDFBelowSurface, - ) - - -def get_water_temp(date_year, date_month, date_day, latitude, longitude): - """ - Pull down surface water temperature along the track from the JPL GHRSST opendap website. - - The GHRSST data are gridded tiles with dimension 17998 x 35999. - To get the specific grid tile of the SST, you must convert from lat, lon coordinates - to the gridded tile ratio of the SST data product using the coordinates of the IS2 data. 
- """ - # Get date from data filename - # data_path[-33:-25] - date = date_year + date_month + date_day - # date[0:4] - year = date_year - # date[4:6] - month = date_month - # date[6:8] - day = date_day - day_of_year = str(datetime.strptime(date, "%Y%m%d").timetuple().tm_yday) - # Add zero in front of day of year string - zero_day_of_year = day_of_year.zfill(3) - - # Calculate ratio of latitude from mid-point of IS2 track - old_lat = latitude.mean() - old_lat_min = -90 - old_lat_max = 90 - new_lat_min = 0 - new_lat_max = 17998 - - new_lat = round( - ((old_lat - old_lat_min) / (old_lat_max - old_lat_min)) - * (new_lat_max - new_lat_min) - + new_lat_min - ) - - # Calculate ratio of longitude from mid-point of IS2 track - old_lon = longitude.mean() - old_lon_min = -180 - old_lon_max = 180 - new_lon_min = 0 - new_lon_max = 35999 - - new_lon = round( - ((old_lon - old_lon_min) / (old_lon_max - old_lon_min)) - * (new_lon_max - new_lon_min) - + new_lon_min - ) - - # Access the SST data using the JPL OpenDap interface - url = ( - "https://opendap.jpl.nasa.gov/opendap/OceanTemperature/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/" - + str(year) - + "/" - + str(zero_day_of_year) - + "/" - + str(date) - + "090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc" - ) - - dataset = netCDF4.Dataset(url) - - # Access the data and convert the temperature from K to C - water_temp = dataset["analysed_sst"][0, new_lat, new_lon] - 273.15 - return water_temp - - -def refraction_correction( - WTemp, - WSmodel, - Wavelength, - Photon_ref_elev, - Ph_ref_azimuth, - PhotonZ, - PhotonX, - PhotonY, - Ph_Conf, -): - """ - WTemp; there is python library that pulls water temp data - WSmodel is the value surface height - Wavelength is fixed - """ - - # Only process photons below water surface model - PhotonX = PhotonX[PhotonZ <= WSmodel] - PhotonY = PhotonY[PhotonZ <= WSmodel] - Photon_ref_elev = Photon_ref_elev[PhotonZ <= WSmodel] - Ph_ref_azimuth = Ph_ref_azimuth[PhotonZ <= WSmodel] - Ph_Conf = 
Ph_Conf[PhotonZ <= WSmodel] - PhotonZ = PhotonZ[PhotonZ <= WSmodel] - - # water temp for refraction correction - WaterTemp = WTemp - - # Refraction coefficient # - a = -0.000001501562500 - b = 0.000000107084865 - c = -0.000042759374989 - d = -0.000160475520686 - e = 1.398067112092424 - wl = Wavelength - - # refractive index of air - n1 = 1.00029 - - # refractive index of water - n2 = (a * WaterTemp**2) + (b * wl**2) + (c * WaterTemp) + (d * wl) + e - - # assumption is 0.25416 - # This example is refractionCoef = 0.25449 - # 1.00029 is refraction of air constant - correction_coef = 1 - (n1 / n2) - - # read photon ref_elev to get theta1 - theta1 = np.pi / 2 - Photon_ref_elev - - # eq 1. Theta2 - theta2 = np.arcsin((n1 * np.sin(theta1)) / n2) - - # eq 3. S - # Approximate water Surface = 1.5 - # D = raw uncorrected depth - D = WSmodel - PhotonZ - - # For Triangle DTS - S = D / np.cos(theta1) - - # eq 2. R - R = (S * n1) / n2 - Gamma = (np.pi / 2) - theta1 - - # For triangle RPS - # phi is an angle needed - phi = theta1 - theta2 - - # P is the difference between raw and corrected YZ location - P = np.sqrt(R**2 + S**2 - 2 * R * S * np.cos(phi)) - - # alpha is an angle needed - alpha = np.arcsin((R * np.sin(phi)) / P) - - # Beta angle needed for Delta Y an d Delta Z - Beta = Gamma - alpha - - # Delta Y - DY = P * np.cos(Beta) - - # Delta Z - DZ = P * np.sin(Beta) - - # Delta Easting - DE = DY * np.sin(Ph_ref_azimuth) - - # Delta Northing - DN = DY * np.cos(Ph_ref_azimuth) - - outX = PhotonX + DE - outY = PhotonY + DN - outZ = PhotonZ + DZ - - """ - print('For selected Bathy photon:') - print('lat = ', PhotonY[9000]) - print('long = ', PhotonX[9000]) - print('Raw Depth = ', PhotonZ[9000]) - print('D = ', D[9000]) - - print('ref_elev = ', Photon_ref_elev[9000]) - - print('Delta East = ', DE[9000]) - print('Delta North = ', DN[9000]) - print('Delta Z = ', DZ[9000]) - """ - return ( - outX, - outY, - outZ, - Ph_Conf, - PhotonX, - PhotonY, - PhotonZ, - Ph_ref_azimuth, - 
Photon_ref_elev, - ) # We are most interested in out-x, out-y, out-z diff --git a/icesat-2_kdph_py/kd_utils/visualization.py b/icesat-2_kdph_py/kd_utils/visualization.py deleted file mode 100644 index d9ed46f..0000000 --- a/icesat-2_kdph_py/kd_utils/visualization.py +++ /dev/null @@ -1,495 +0,0 @@ -# utils/visualization.py - -import os -import time - -import geopandas as gpd -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -from pyproj import Transformer -from scipy.spatial import ConvexHull - - -def plot_photon_height(sea_photon_dataset, hlims=[-25, 10]): - fig, ax = plt.subplots() - ax.scatter( - sea_photon_dataset["latitude"], - sea_photon_dataset["photon_height"], - s=1, - c="k", - alpha=0.15, - edgecolors="none", - label="ATL03 Photons", - ) - ax.set_xlabel("Relative AT Distance") - ax.set_ylabel("Height (h_ph)") - ax.set_title("Scatter Plot of ATL03 Photons") - ax.set_ylim(hlims) - ax.legend() - plt.show() - - -def plot_filtered_seafloor_photons( - filtered_seafloor_subsurface_dataset, - sea_photon_dataset, - sea_surface_height, - output_path, -): - """ - Plots the filtered seafloor photon data along with sea surface and seafloor elevation data. - - Parameters: - - filtered_seafloor_subsurface_dataset: DataFrame containing the filtered subsurface photon data. - - sea_photon_dataset: DataFrame containing the original sea photon data. - - sea_surface_height: Array of sea surface height values. - - output_path: Path to save the output plot. 
- """ - - # get sea_surface_x_axis_bins - sea_surface_x_axis_bins = np.linspace( - filtered_seafloor_subsurface_dataset["relative_AT_dist"].min(), - filtered_seafloor_subsurface_dataset["relative_AT_dist"].max(), - len(sea_surface_height), - ) - - hlims = [-25, 10] - fig, ax = plt.subplots() - - # Scatter plot of subsurface photons - ax.scatter( - sea_photon_dataset["relative_AT_dist"], - sea_photon_dataset["photon_height"], - s=1, - c="k", - alpha=0.15, - edgecolors="none", - label="Subsurface ATL03 Photons", - ) - - # Overlay the seafloor elevation data as points - ax.plot( - filtered_seafloor_subsurface_dataset["relative_AT_dist"], - filtered_seafloor_subsurface_dataset["seafloor_elevation"], - linewidth=0.8, - c="b", - alpha=0.4, - label="Seafloor Elevation", - ) - - ax.plot( - sea_surface_x_axis_bins, - [x - 0.5 for x in sea_surface_height], - linewidth=0.8, - color="#DD571C", - alpha=0.4, - label="0.5 m Below Surface Peak", - ) - - # Set labels and title - ax.set_xlabel("Distance From Start of Track (km)") - ax.set_ylabel("Photon Height (m)") - ax.set_title("Scatter Plot of Filtered Sea Photon Dataset") - ax.set_ylim(hlims) - - # Add a legend - ax.legend() - - # Save the plot - plt.legend(loc="upper right") - plt.savefig(output_path, dpi=400, format="jpeg") - - # Show the plot - plt.show() - - -def plot_convex_hulls(photon_dataset, target_beam_ids, convex_hulls, convex_hull_areas): - """ - Plots ConvexHull polygons and annotated areas for each lat_bin group in the dataset. 
- """ - # filter beam type - # photon_dataset - photon_dataset = photon_dataset[photon_dataset["beam_id"].isin(target_beam_ids)] - - fig, ax = plt.subplots(figsize=(10, 6)) - hlims = [-10, 2] - - for lat_bin, hull_points in convex_hulls.items(): - hull = ConvexHull(hull_points) - ax.fill( - hull_points[hull.vertices, 0], - hull_points[hull.vertices, 1], - alpha=0.3, - label=f"Bin {lat_bin}", - ) - - # Calculate centroid to place the label - centroid = np.mean(hull_points[hull.vertices], axis=0) - ax.text( - centroid[0], - centroid[1], - f"{convex_hull_areas[lat_bin]:.2f}", - horizontalalignment="center", - verticalalignment="center", - fontsize=10, - color="black", - ) - - ax.scatter( - photon_dataset["lat"], - photon_dataset["photon_height"], - s=1, - c="k", - alpha=0.15, - edgecolors="none", - label="Subsurface ATL03 Photons", - ) - ax.set_ylim(hlims) - ax.set_xlabel("Latitude") - ax.set_ylabel("Photon Height") - ax.set_title("ConvexHull of Photons within each Lat Bin") - plt.show() - - -# plot photons coloured by quality_ph flag for diagnostic purposes -def plot_photon_quality_flags( - output_path, timestamp, sea_photon_dataset, target_beam_ids -): - """ - Scatter plot of photon height vs. along-track distance, coloured by quality_ph value. 
- - quality_ph bit meanings (ATL03 v007): - bit 0 (value 1) = possible afterpulse - bit 1 (value 2) = possible impulse response effect - bit 2 (value 4) = possible TEP - """ - dataset = sea_photon_dataset[ - sea_photon_dataset["beam_id"].isin(target_beam_ids) - ].copy() - if dataset.empty: - return - - quality_colors = { - 0: ("dimgray", "nominal (0)"), - 1: ("red", "afterpulse (1)"), - 2: ("orange", "impulse response (2)"), - 3: ("darkred", "afterpulse + impulse (3)"), - 4: ("royalblue", "TEP (4)"), - 5: ("purple", "afterpulse + TEP (5)"), - 6: ("cyan", "impulse + TEP (6)"), - 7: ("black", "all flags (7)"), - } - - fig, ax = plt.subplots(figsize=(12, 5)) - unique_flags = sorted(dataset["quality_ph"].dropna().astype(int).unique()) - for flag in unique_flags: - subset = dataset[dataset["quality_ph"].astype(int) == flag] - color, label = quality_colors.get(flag, ("magenta", f"unknown ({flag})")) - ax.scatter( - subset["relative_AT_dist"], - subset["photon_height"], - s=0.5, - c=color, - alpha=0.3, - edgecolors="none", - label=label, - ) - - ax.set_xlabel("Relative Along-Track Distance (km)", fontsize=12) - ax.set_ylabel("Photon Height (m)", fontsize=12) - ax.set_title("Photon quality_ph flag distribution", fontsize=13) - ax.set_ylim([-15, 5]) - ax.legend(loc="lower right", markerscale=6, fontsize=9) - fig.tight_layout() - save_path = os.path.join(output_path, f"{timestamp}_quality_ph_flags.jpg") - plt.savefig(save_path, dpi=300, format="jpeg") - plt.show() - print(f"Quality flag plot saved: {save_path}") - - # Print summary counts - print("\nquality_ph flag summary:") - for flag in unique_flags: - count = (dataset["quality_ph"].astype(int) == flag).sum() - _, label = quality_colors.get(flag, ("", f"unknown ({flag})")) - pct = 100.0 * count / len(dataset) - print(f" {label:35s}: {count:>8,d} ({pct:.1f}%)") - - -# plot the kd and photon -def plot_kd_photons( - OutputPath, - timestamp, - target_beam_ids, - subsurface_photon_dataset, - Kd_DF_MergedDistance, -): - # 
filter beam type - # photon_dataset - subsurface_photon_dataset = subsurface_photon_dataset[ - subsurface_photon_dataset["beam_id"].isin(target_beam_ids) - ] - Kd_DF_MergedDistance = Kd_DF_MergedDistance[ - Kd_DF_MergedDistance["beam_id"].isin(target_beam_ids) - ] - - hlims = [-45, 5] - fig, ax1 = plt.subplots(figsize=(10, 6)) - ax1.scatter( - subsurface_photon_dataset["relative_AT_dist"], - subsurface_photon_dataset["photon_height"], - s=1.5, - c="k", - alpha=0.2, - edgecolors="none", - label="Subsurface ATL03 Photon Height", - ) - ax1.set_xlabel("Relative Along-Track Distance", fontsize=18) - ax1.tick_params(axis="x", labelsize=18) # Added line for x-tick label fontsize - ax1.set_ylabel("Photon Height", color="b", fontsize=18) - ax1.tick_params(axis="y", labelcolor="b", labelsize=18) - if "seafloor_elevation" in subsurface_photon_dataset.columns: - ax1.plot( - subsurface_photon_dataset["relative_AT_dist"], - subsurface_photon_dataset["seafloor_elevation"], - linewidth=0.8, - c="b", - alpha=0.4, - label="Seafloor Elevation", - ) - # ax1.axhline(y=-6, color='blue', linestyle='--', label='y=-6 m') - # ax1.set_xlim([1800, 2100]) # Set x-axis limits - ax1.set_ylim(hlims) - - ax2 = ax1.twinx() - ax2.scatter( - Kd_DF_MergedDistance["relative_AT_dist"], - Kd_DF_MergedDistance["kd"], - label="Kd values", - color="r", - alpha=0.6, - ) - ax2.set_ylabel("Kd Value", color="r", fontsize=18) - ax2.tick_params(axis="y", labelcolor="r", labelsize=18) - # fig.suptitle('Photon Height and Kd Values along Relative Along-Track Distance') - handles1, labels1 = ax1.get_legend_handles_labels() - handles2, labels2 = ax2.get_legend_handles_labels() - fig.legend( - handles1 + handles2, - labels1 + labels2, - loc="upper right", - bbox_to_anchor=(0.85, 0.85), - ) - # fig.legend(handles1 + handles2, labels1 + labels2, loc='lower left', bbox_to_anchor=(0.08, 0.08)) - fig.tight_layout() - plt.savefig( - os.path.join(OutputPath, f"{timestamp}_IS2_subsurface_kd_2.jpg"), - dpi=400, - format="jpeg", 
- ) - plt.show() - - -def plot_bin_polygon_data( - lat, - height, - seafloor_height, - mask, - lat_bins, - height_bins, - valid_bins, - polygons, - y_min, - y_max, -): - plt.figure(figsize=(12, 6)) - - plt.subplot(1, 2, 1) - plt.title("Original Height Data") - plt.scatter(lat, height, c=height, cmap="viridis", s=10) - plt.plot(np.sort(lat), seafloor_height[np.argsort(lat)], "r-", label="Seafloor") - plt.colorbar(label="Height") - plt.xlabel("Latitude") - plt.ylabel("Height") - plt.ylim(y_min, y_max) - plt.legend() - - plt.subplot(1, 2, 2) - plt.title("Masked Region (ROI)") - plt.scatter(lat[mask], height[mask], c=height[mask], cmap="viridis", s=10) - plt.plot(np.sort(lat), seafloor_height[np.argsort(lat)], "r-", label="Seafloor") - - for polygon in polygons: - plt.gca().add_patch( - plt.Polygon(polygon, edgecolor="red", facecolor="none", linewidth=1) - ) - - for lb in lat_bins: - plt.axvline(x=lb, color="gray", linestyle="--", linewidth=0.5) - for hb in height_bins: - plt.axhline(y=hb, color="gray", linestyle="--", linewidth=0.5) - - plt.colorbar(label="Height") - plt.xlabel("Latitude") - plt.ylabel("Height") - plt.ylim(y_min, y_max) - plt.legend() - - plt.tight_layout() - plt.show() - - -def produce_figures( - binned_data, - bath_height, - sea_height, - solo_sea_surface_label, - y_limit_top, - y_limit_bottom, - percentile, - file, - geo_df, - ref_y, - ref_z, - beam, - epsg_num, -): - """Create figures""" - - # Create bins for latitude - bath_x_axis_bins = ( - np.linspace(binned_data.lat.min(), binned_data.lat.max(), len(bath_height)) + 20 - ) - - sea_surface_x_axis_bins = ( - np.linspace(binned_data.lat.min(), binned_data.lat.max(), len(sea_height)) + 10 - ) - - # Create new dataframes for median values - bath_median_df = pd.DataFrame({"x": bath_x_axis_bins, "y": bath_height}) - - # Create uniform sea surface based on median sea surface values and filter out surface breaching - sea_height1 = [np.nanmedian(sea_height) if i == i else np.nan for i in sea_height] - 
sea_median_df = pd.DataFrame({"x": sea_surface_x_axis_bins, "y": sea_height1}) - - # Create uniform solo sea surface label - sea_surface_label = solo_sea_surface_label - sea_surface_label_df = pd.DataFrame( - {"x": sea_surface_x_axis_bins, "y": sea_surface_label} - ) - idx_1 = np.where(sea_surface_label_df.y == 1) - idx_0 = np.where(sea_surface_label_df.y == 0) - - # Define figure size - fig = plt.rcParams["figure.figsize"] = (40, 25) - - # Plot raw points - # plt.scatter(x=binned_data.lat, - # y = binned_data.photon_height, marker='o', lw=0, s=1, alpha = 0.8, - # c = 'yellow', label = 'Raw photon height') - plt.scatter(ref_y, ref_z, s=0.5, alpha=0.1, c="black") - plt.scatter( - geo_df.lat, - geo_df.photon_height, - s=0.8, - marker="o", - alpha=0.1, - c="red", - label="Classified Photons", - ) - - # plt.scatter(x=geo_df.lat, - # y = geo_df.photon_height, marker='o', lw=0, s=0.8, - # alpha = 0.8, c = 'black', label = 'Corrected photon bin') - - # Plot median values - plt.scatter( - bath_median_df.x, - bath_median_df.y, - marker="o", - c="r", - alpha=0.8, - s=2, - label="Median bathymetry", - ) - - plt.scatter( - sea_median_df.x, - sea_median_df.y, - marker="o", - c="b", - alpha=1, - s=2, - label="Median sea surface", - ) - - plt.scatter( - sea_surface_label_df.iloc[idx_1].x, - sea_surface_label_df.iloc[idx_1].y, - marker="o", - c="pink", - alpha=1, - s=3, - label="solo_sea_surface", - ) - plt.scatter( - sea_surface_label_df.iloc[idx_0].x, - sea_surface_label_df.iloc[idx_0].y, - marker="o", - c="g", - alpha=1, - s=3, - label="non_solo_sea_surface", - ) - - # Insert titles and subtitles - plt.title("Icesat2 Bathymetry\n" + file) - plt.xlabel("Latitude", fontsize=25) - plt.ylabel("Photon Height (m)", fontsize=25) - plt.xticks(fontsize=16) - plt.yticks(fontsize=16) - - plt.legend(loc="upper left", prop={"size": 20}) - - # Limit the x and y axes using parameters - plt.xlim(left=binned_data.lat.min(), right=binned_data.lat.max()) - plt.ylim(top=y_limit_top, 
bottom=y_limit_bottom) - - timestr = time.strftime("%Y%m%d_%H%M%S") - file = file.replace(".h5", "") - # Define where to save file - plt.tight_layout() - plt.savefig( - "C:/Workstation/ICESat2_HLS/" - + file - + "_gt" - + str(beam) - + "_" - + str(percentile) - + "_EPSG" - + str(epsg_num) - + "_" - + timestr - + ".pdf" - ) - # plt.show() - # plt.close() - - # convert corrected locations back to wgs84 (useful to contain) - transformer = Transformer.from_crs( - "EPSG:" + str(epsg_num), "EPSG:4326", always_xy=True - ) - print(transformer) - lon_wgs84, lat_wgs84 = transformer.transform(geo_df.lon.values, geo_df.lat.values) - - geo_df["lon_wgs84"] = lon_wgs84 - geo_df["lat_wgs84"] = lat_wgs84 - - geodf = gpd.GeoDataFrame( - geo_df, geometry=gpd.points_from_xy(geo_df.lon_wgs84, geo_df.lat_wgs84) - ) - - geodf.set_crs(epsg=4326, inplace=True) - - # geodf.to_file("C:/Workstation/ICESat2_HLS/" +file + '_gt' + '_' + str(percentile) + '_EPSG' + - # str(epsg_num) + '_' + timestr + ".gpkg", - # driver="GPKG") diff --git a/icesat-2_kdph_py/main.py b/icesat-2_kdph_py/main.py deleted file mode 100644 index 9168ef0..0000000 --- a/icesat-2_kdph_py/main.py +++ /dev/null @@ -1,329 +0,0 @@ -import glob -import logging -import os -import re -import sys - -from config import get_args -from kd_utils.bathy_processing import process_subsurface_photon_filtering -from kd_utils.data_processing import ( - Extract_sea_photons, - apply_optional_ir_ap_filter, - apply_optional_solar_background_filter, - filter_photon_dataset_by_hull_area, - load_data, -) -from kd_utils.Kd_analysis import process_kd_calculation -from kd_utils.sea_photons_analysis import process_sea_photon_binning -from kd_utils.visualization import ( - plot_convex_hulls, - plot_kd_photons, - plot_photon_quality_flags, -) -import pandas as pd - -logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" -) -logger = logging.getLogger(__name__) - - -PAIR_ID_MAP = { - "gt1l": "gt1_pair", - "gt1r": 
"gt1_pair", - "gt2l": "gt2_pair", - "gt2r": "gt2_pair", - "gt3l": "gt3_pair", - "gt3r": "gt3_pair", -} - - -def get_target_beams(all_beams, beam_attrs, target_beams_arg): - strong_beams = [ - gtx - for gtx in all_beams - if beam_attrs[gtx]["atlas_beam_type"].decode("utf-8") == "strong" - ] - if not target_beams_arg: - return strong_beams[:1] # auto-select first strong beam - - requested = [b.strip() for b in target_beams_arg.split(",") if b.strip()] - return [b for b in requested if b in strong_beams] - - -def optionally_combine_paired_beams_for_kd(dataset, horizontal_res, enabled=False): - """ - Optionally combine left/right beams into pair groups before Kd fitting. - Uses shared along-track bins from relative_AT_dist when available. - - Flowchart step: 15 — Combine data from paired beams if appropriate (not - recommended in optically complex water). - """ - if (not enabled) or dataset.empty: - return dataset - - combined = dataset.copy() - combined["source_beam_id"] = combined["beam_id"] - combined["beam_id"] = ( - combined["beam_id"].map(PAIR_ID_MAP).fillna(combined["beam_id"]) - ) - - if "relative_AT_dist" in combined.columns: - rel_m = pd.to_numeric(combined["relative_AT_dist"], errors="coerce") * 1000.0 - pair_bins = (rel_m / max(horizontal_res, 1)).astype("Int64") - pair_bins = pair_bins.fillna(-1).astype(int) - combined["lat_bins"] = pair_bins - - return combined - - -def run_pipeline(args): - """ - Top-level pipeline orchestrator. Executes all 19 flowchart steps in sequence. - - Flowchart steps directly handled here: - 1 — Load ATL03 data (load_data / Extract_sea_photons). - 2 — Remove afterpulse/impulse-response photons (apply_optional_ir_ap_filter). - 3 — Bin sizes set via args.horizontal_res / args.vertical_res. - 4 — Build histograms (process_sea_photon_binning). - 5 — Solar background filter (apply_optional_solar_background_filter). - 6–14 — Subsurface filtering chain (process_subsurface_photon_filtering). 
- 15 — Paired beam combine (optionally_combine_paired_beams_for_kd). - 16–17 — Beer's Law Kd fit (process_kd_calculation). - 18 — Save Kd CSV output (subsurface_photon_df_added_kd.to_csv). - 19 — Check data for reasonableness (filter_photon_dataset_by_hull_area + - plot_kd_photons). - """ - atl03_h5_file_path = os.path.join( - args.workspace_path, args.atl03_path, args.atl03_file - ) - shoreline_data_path = os.path.join( - args.workspace_path, args.other_data_path, args.shoreline_data - ) - gebco_full_path = os.path.join( - args.workspace_path, args.other_data_path, args.gebco_path - ) - atl24_file_path = args.atl24_file - if atl24_file_path and (not os.path.isabs(atl24_file_path)): - atl24_file_path = os.path.join(args.workspace_path, atl24_file_path) - output_path = os.path.join(args.workspace_path, args.output_path) - os.makedirs(output_path, exist_ok=True) - - match = re.search(r"_(\d{14})_", atl03_h5_file_path) - timestamp = match.group(1) if match else "unknown" - - version_match = re.search( - r"ATL03_\d{14}_\d{8}_(\d{3})_\d{2}", os.path.basename(atl03_h5_file_path) - ) - atl03_version = int(version_match.group(1)) if version_match else None - logger.info( - "ATL03 version detected: %s", - f"{atl03_version:03d}" if atl03_version else "unknown", - ) - - if args.enable_ir_ap_filter: - if atl03_version is None or atl03_version < 7: - logger.error( - "IR/AP filter requires ATL03 version 007 or later. " - "Detected version: %s. " - "Skipping IR/AP filter for this run. 
" - "Switch to a version 007 file (e.g., Wax Delta dataset) to enable this step.", - f"{atl03_version:03d}" if atl03_version else "unknown", - ) - args.enable_ir_ap_filter = False - - gebco_pattern = os.path.join(gebco_full_path, "gebco_*.tif") - gebco_file_path_lists = [p for p in glob.glob(gebco_pattern)] - - is2_mds, is2_attrs, is2_beams = load_data(atl03_h5_file_path, False) - target_strong_beams = get_target_beams(is2_beams, is2_attrs, args.target_beams) - if not target_strong_beams: - logger.error( - "No target strong beams found. Check input file and --target_beams." - ) - sys.exit(1) - logger.info("Strong beams selected: %s", target_strong_beams) - plot_target_beam = [target_strong_beams[0]] - - sea_photon_dataset = Extract_sea_photons( - is2_mds, target_strong_beams, shoreline_data_path - ) - - if args.enable_ir_ap_filter and not args.no_plot: - plot_photon_quality_flags( - output_path, timestamp, sea_photon_dataset, plot_target_beam - ) - - sea_photon_dataset = apply_optional_ir_ap_filter( - sea_photon_dataset, - enabled=args.enable_ir_ap_filter, - quality_max=args.ir_ap_quality_max, - min_signal_conf=args.ir_ap_min_signal_conf, - ) - - # Step 5 — Solar background filter is ON by default. - # It self-gates on solar_elevation so has zero effect on nighttime passes. - # Use --disable_solar_background_filter to turn it off. - solar_bg_enabled = not args.disable_solar_background_filter - if not solar_bg_enabled: - if "solar_elevation" in sea_photon_dataset.columns: - max_solar_elev = sea_photon_dataset["solar_elevation"].max() - if max_solar_elev > args.solar_elevation_day_threshold: - logger.warning( - "Solar background filter is DISABLED via --disable_solar_background_filter " - "but this granule is a daytime pass " - "(max solar elevation = %.1f°, daytime threshold = %.1f°). 
" - "This filter is recommended for daytime passes.", - max_solar_elev, - args.solar_elevation_day_threshold, - ) - - sea_photon_dataset = apply_optional_solar_background_filter( - sea_photon_dataset, - enabled=solar_bg_enabled, - day_threshold=args.solar_elevation_day_threshold, - median_window_deg=args.solar_bg_median_window_deg, - noise_multiplier=args.solar_bg_noise_multiplier, - min_signal_conf=args.solar_background_min_signal_conf, - ) - - binned_dataset_sea_surface = process_sea_photon_binning( - sea_photon_dataset, - horizontal_res=args.horizontal_res, - vertical_res=args.vertical_res, - ) - - post_refraction_refit_enabled = args.enable_post_refraction_refit - if post_refraction_refit_enabled and (not args.enable_refraction_correction): - logger.warning( - "--enable_post_refraction_refit requested without --enable_refraction_correction. " - "The post-refraction refit step will be skipped." - ) - post_refraction_refit_enabled = False - - ( - sea_surface_height, - sea_surface_label, - filtered_seafloor_subsurface_photon_dataset, - ) = process_subsurface_photon_filtering( - binned_dataset_sea_surface, - gebco_file_path_lists, - args.subsurface_thresh, - args.ignore_subsurface_height_thres, - use_atl24_filter=args.enable_atl24_filter, - atl24_file_path=atl24_file_path, - atl24_max_match_distance_deg=args.atl24_max_match_distance_deg, - use_gebco_filter=args.enable_gebco_filter, - apply_histogram_quality_filter=args.enable_histogram_quality_filter, - histogram_quality_min_ratio=args.histogram_quality_min_ratio, - histogram_quality_depth_min=args.histogram_quality_depth_min, - histogram_quality_depth_max=args.histogram_quality_depth_max, - histogram_quality_ref_depth_min=args.histogram_quality_ref_depth_min, - histogram_quality_ref_depth_max=args.histogram_quality_ref_depth_max, - apply_surface_sigma_filter=args.enable_surface_sigma_filter, - surface_sigma_max=args.surface_sigma_max, - apply_refraction_correction=args.enable_refraction_correction, - 
refraction_water_temp_c=args.refraction_water_temp_c, - refraction_wavelength_nm=args.refraction_wavelength_nm, - apply_post_refraction_refit=post_refraction_refit_enabled, - apply_flattening=args.enable_sea_surface_flattening, - flattening_window_m=args.sea_surface_flattening_window_m, - horizontal_res=args.horizontal_res, - vertical_res=args.vertical_res, - ) - - final_filtered_subsurface_photon_dataset = ( - filtered_seafloor_subsurface_photon_dataset[ - filtered_seafloor_subsurface_photon_dataset["beam_id"].isin( - target_strong_beams - ) - ].copy() - ) - - if args.enable_convex_hull_filter: - final_filtered_subsurface_photon_dataset, convex_hull_areas, convex_hulls = ( - filter_photon_dataset_by_hull_area( - final_filtered_subsurface_photon_dataset, - hull_area_threshold=args.convex_hull_area_threshold, - ) - ) - if plot_target_beam and not args.no_plot: - plot_convex_hulls( - final_filtered_subsurface_photon_dataset, - plot_target_beam, - convex_hulls, - convex_hull_areas, - ) - - subsurface_output_path = os.path.join( - output_path, - f"{timestamp}_strongBeam_{'_'.join(target_strong_beams)}_subsurface_photons.csv", - ) - final_filtered_subsurface_photon_dataset.to_csv(subsurface_output_path, index=False) - - kd_input_dataset = optionally_combine_paired_beams_for_kd( - final_filtered_subsurface_photon_dataset, - horizontal_res=args.horizontal_res, - enabled=args.enable_paired_beam_combine, - ) - wave_mult = args.wave_exclusion_multiplier if args.enable_wave_adaptive_fit else 0.0 - subsurface_photon_df_added_kd = process_kd_calculation( - kd_input_dataset, - decay_zone_threshold=args.decay_zone_threshold, - kd_fit_method=args.kd_fit_method, - wave_exclusion_multiplier=wave_mult, - wave_sigma_calm_threshold=args.wave_sigma_calm_threshold, - ) - kd_output_path = os.path.join( - output_path, f"{timestamp}_AddedKdDataset_strongBeams_Further.csv" - ) - subsurface_photon_df_added_kd.to_csv(kd_output_path, index=False) - - if not args.no_plot and "relative_AT_dist" 
in kd_input_dataset.columns: - unique_photon_dataset = kd_input_dataset[ - ["relative_AT_dist", "lat_bins", "photon_height"] - ].drop_duplicates() - unique_photon_dataset["relative_AT_dist_center"] = ( - unique_photon_dataset.groupby( - "lat_bins", observed=False - )["relative_AT_dist"].transform("mean") - ) - - _closest_rows = [] - for _lat_bin, _group in unique_photon_dataset.groupby( - "lat_bins", observed=False - ): - if _group.empty: - continue - _center = _group["relative_AT_dist_center"].iloc[0] - _group = _group.copy() - _group["dist_to_center"] = abs(_group["relative_AT_dist"] - _center) - _closest_rows.append(_group.loc[[_group["dist_to_center"].idxmin()]]) - if not _closest_rows: - logger.warning("No along-track bins survived filtering. Skipping plot.") - else: - closest_to_center = pd.concat(_closest_rows).reset_index(drop=True) - kd_df_merged_distance = closest_to_center.merge( - subsurface_photon_df_added_kd, on="lat_bins", how="left" - ).drop( - columns=["relative_AT_dist_center", "dist_to_center"], errors="ignore" - ) - - plot_kd_photons( - output_path, - timestamp, - plot_target_beam, - kd_input_dataset, - kd_df_merged_distance, - ) - - logger.info("SUCCESS! 
Kd output: %s", kd_output_path) - - -if __name__ == "__main__": - cli_args = get_args() - try: - run_pipeline(cli_args) - except Exception as e: - logger.error("An error occurred: %s", e) - raise From 3b388931e30c74c19c1eb974ce7edd0bf0835587 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 30 Mar 2026 19:58:53 +0000 Subject: [PATCH 07/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- aok/core/kd_utils/Kd_analysis.py | 460 +++++++++------ aok/core/kd_utils/bathy_processing.py | 582 +++++++++++-------- aok/core/kd_utils/config.py | 506 +++++++++++------ aok/core/kd_utils/data_processing.py | 651 ++++++++++++---------- aok/core/kd_utils/main.py | 204 ++++--- aok/core/kd_utils/sea_photons_analysis.py | 341 +++++++----- aok/core/kd_utils/visualization.py | 462 ++++++++++----- 7 files changed, 2011 insertions(+), 1195 deletions(-) diff --git a/aok/core/kd_utils/Kd_analysis.py b/aok/core/kd_utils/Kd_analysis.py index 4b03867..7d1504b 100644 --- a/aok/core/kd_utils/Kd_analysis.py +++ b/aok/core/kd_utils/Kd_analysis.py @@ -1,61 +1,62 @@ # utils/Kd_analysis.py # updated to perform a linear fit in log-space, just like MATLAB's polyfitn(zdepth, y, 1) for a first-order polynomial. 
+import logging + import numpy as np import pandas as pd -import logging from scipy.optimize import curve_fit from sklearn.linear_model import LinearRegression # def log_model(z, kd, e0): # return np.log(e0) - kd * z -## This is wrong because the input is already a bined data, +## This is wrong because the input is already a bined data, ## it is not necessary to do a histogram again # def CalculateKdFromFilteredSubsurfacePhoton(df, vertical_res=0.8): # if df.empty or 'lat_bins' not in df.columns: # return pd.DataFrame({'lat_bins': [np.nan], 'kd': [np.nan], 'e0': [np.nan], 'latitude': [np.nan], 'longitude': [np.nan]}) - + # # Get the latitude bin value # lat_bin_value = df['lat_bins'].iloc[0] if not df['lat_bins'].empty else np.nan # latitude = df['latitude'].mean() if 'latitude' in df.columns else df['lat'].mean() if 'lat' in df.columns else np.nan # longitude = df['longitude'].mean() if 'longitude' in df.columns else df['lon'].mean() if 'lon' in df.columns else np.nan - + # # Calculate photon height range # photon_height_min = df['photon_height'].min() # photon_height_max = df['photon_height'].max() - + # # Check for sufficient data range # if np.isnan(photon_height_min) or np.isnan(photon_height_max) or photon_height_min == photon_height_max: # return pd.DataFrame({'lat_bins': [lat_bin_value], 'kd': [np.nan], 'e0': [np.nan]}) - -# height_bins_range = abs(photon_height_max - photon_height_min) + +# height_bins_range = abs(photon_height_max - photon_height_min) # height_bins_number = round(height_bins_range / vertical_res) - + # # Ensure there are enough bins # if height_bins_number < 5: # return pd.DataFrame({'lat_bins': [lat_bin_value], 'kd': [np.nan], 'e0': [np.nan]}) - + # # Create histogram of photon heights # bin_edges = np.linspace(photon_height_min, photon_height_max, num=height_bins_number) # counts, _ = np.histogram(df['photon_height'], bins=bin_edges) # bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2 - + # # Store histogram data # hist_df = 
pd.DataFrame({'zdepth': bin_centers, 'photon_counts': counts}) - + # # x value for model # # Reverse zdepth to align with the MATLAB approach # hist_df['zdepth'] = hist_df['zdepth'].max() - hist_df['zdepth'] - + # # Log-transform photon counts, replacing zeros with NaN # # y value for model # hist_df['log_photon_counts'] = np.log(hist_df['photon_counts'].replace(0, np.nan)) # hist_df.loc[np.isinf(hist_df['log_photon_counts']), 'log_photon_counts'] = np.nan - + # # Filter out rows with NaNs in either column for regression # valid_data = hist_df.dropna(subset=['zdepth', 'log_photon_counts']) - + # # Check for enough valid data points # # Skip the regression if there are fewer than 5 datapoints # if valid_data['log_photon_counts'].notna().sum() > 3: @@ -66,22 +67,22 @@ # # Perform linear regression # model = LinearRegression() # model.fit(zdepth_valid, log_counts_valid) - + # # Extract kd as the negative of the slope and e0 from the intercept # kd = -model.coef_[0] - + # print('zdepth_valid:',zdepth_valid) # print('photon_counts:',hist_df['photon_counts']) # print('kd:',kd) - + # e0 = np.exp(model.intercept_) - + # # Set kd to NaN if negative # if kd < 0: # kd = np.nan # else: # kd, e0 = np.nan, np.nan - + # return pd.DataFrame({ # 'lat_bins': [lat_bin_value], # 'kd': [kd], @@ -95,47 +96,47 @@ # def CalculateKdFromFilteredSubsurfacePhoton(df, vertical_res=0.25): # if df.empty or 'lat_bins' not in df.columns: # return pd.DataFrame({'lat_bins': [np.nan], 'kd': [np.nan], 'e0': [np.nan], 'latitude': [np.nan], 'longitude': [np.nan]}) - + # # Get the latitude bin value # lat_bin_value = df['lat_bins'].iloc[0] if not df['lat_bins'].empty else np.nan # latitude = df['latitude'].mean() if 'latitude' in df.columns else df['lat'].mean() if 'lat' in df.columns else np.nan # longitude = df['longitude'].mean() if 'longitude' in df.columns else df['lon'].mean() if 'lon' in df.columns else np.nan - + # # Calculate photon height range # photon_height_min = df['photon_height'].min() # 
photon_height_max = df['photon_height'].max() - + # # Check for sufficient data range # if np.isnan(photon_height_min) or np.isnan(photon_height_max) or photon_height_min == photon_height_max: # return pd.DataFrame({'lat_bins': [lat_bin_value], 'kd': [np.nan], 'e0': [np.nan]}) - -# height_bins_range = abs(photon_height_max - photon_height_min) + +# height_bins_range = abs(photon_height_max - photon_height_min) # height_bins_number = round(height_bins_range / vertical_res) - + # # Ensure there are enough bins # if height_bins_number < 5: # return pd.DataFrame({'lat_bins': [lat_bin_value], 'kd': [np.nan], 'e0': [np.nan]}) - + # # Create histogram of photon heights # bin_edges = np.linspace(photon_height_min, photon_height_max, num=height_bins_number) # counts, _ = np.histogram(df['photon_height'], bins=bin_edges) # bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2 - + # # Store histogram data # hist_df = pd.DataFrame({'zdepth': bin_centers, 'photon_counts': counts}) - + # # x value for model # # Reverse zdepth to align with the MATLAB approach # hist_df['zdepth'] = hist_df['zdepth'].max() - hist_df['zdepth'] - + # # Log-transform photon counts, replacing zeros with NaN # # y value for model # hist_df['log_photon_counts'] = np.log(hist_df['photon_counts'].replace(0, np.nan)) # hist_df.loc[np.isinf(hist_df['log_photon_counts']), 'log_photon_counts'] = np.nan - + # # Filter out rows with NaNs in either column for regression # valid_data = hist_df.dropna(subset=['zdepth', 'log_photon_counts']) - + # # Check for enough valid data points # # Skip the regression if there are fewer than 5 datapoints # if valid_data['log_photon_counts'].notna().sum() > 3: @@ -146,22 +147,22 @@ # # Perform linear regression # model = LinearRegression() # model.fit(zdepth_valid, log_counts_valid) - + # # Extract kd as the negative of the slope and e0 from the intercept # kd = -model.coef_[0] - + # print('zdepth_valid:',zdepth_valid) # print('photon_counts:',hist_df['photon_counts']) # 
print('kd:',kd) - + # e0 = np.exp(model.intercept_) - + # # Set kd to NaN if negative # if kd < 0: # kd = np.nan # else: # kd, e0 = np.nan, np.nan - + # return pd.DataFrame({ # 'lat_bins': [lat_bin_value], # 'kd': [kd], @@ -199,13 +200,15 @@ def find_exponential_decay_zone(hist_df, decay_threshold=0.0): # Apply decay threshold: exclude bins below threshold fraction of peak signal if decay_threshold > 0.0: - peak_count = df['photon_counts'].max() + peak_count = df["photon_counts"].max() if peak_count > 0: - df.loc[df['photon_counts'] < decay_threshold * peak_count, 'photon_counts'] = 0 + df.loc[ + df["photon_counts"] < decay_threshold * peak_count, "photon_counts" + ] = 0 - df['log_photon_counts'] = np.log(df['photon_counts'].replace(0, np.nan)) - df.loc[np.isinf(df['log_photon_counts']), 'log_photon_counts'] = np.nan - return df.dropna(subset=['zdepth', 'log_photon_counts']) + df["log_photon_counts"] = np.log(df["photon_counts"].replace(0, np.nan)) + df.loc[np.isinf(df["log_photon_counts"]), "log_photon_counts"] = np.nan + return df.dropna(subset=["zdepth", "log_photon_counts"]) def fit_beers_law(valid_data): @@ -227,11 +230,11 @@ def fit_beers_law(valid_data): tuple[float, float] (kd, e0). kd is set to np.nan if negative or if fewer than 4 points. """ - if valid_data['log_photon_counts'].notna().sum() <= 3: + if valid_data["log_photon_counts"].notna().sum() <= 3: return np.nan, np.nan - zdepth_valid = valid_data['zdepth'].values.reshape(-1, 1) - log_counts_valid = valid_data['log_photon_counts'].values + zdepth_valid = valid_data["zdepth"].values.reshape(-1, 1) + log_counts_valid = valid_data["log_photon_counts"].values model = LinearRegression() model.fit(zdepth_valid, log_counts_valid) @@ -264,25 +267,25 @@ def fit_beers_law_bg_subtract(hist_df, bg_fraction=0.2): tuple[float, float, float] (kd, e0, noise_floor). 
""" - df = hist_df.copy().sort_values('zdepth') + df = hist_df.copy().sort_values("zdepth") n_bins = len(df) if n_bins < 5: return np.nan, np.nan, np.nan # Estimate noise floor from the deepest bg_fraction of bins n_bg = max(int(n_bins * bg_fraction), 2) - noise_floor = float(df.tail(n_bg)['photon_counts'].median()) + noise_floor = float(df.tail(n_bg)["photon_counts"].median()) # Subtract noise and keep only positive residuals - df['signal'] = df['photon_counts'] - noise_floor - df = df[df['signal'] > 0].copy() + df["signal"] = df["photon_counts"] - noise_floor + df = df[df["signal"] > 0].copy() if len(df) < 4: return np.nan, np.nan, noise_floor - df['log_signal'] = np.log(df['signal']) + df["log_signal"] = np.log(df["signal"]) - zdepth = df['zdepth'].values.reshape(-1, 1) - log_signal = df['log_signal'].values + zdepth = df["zdepth"].values.reshape(-1, 1) + log_signal = df["log_signal"].values model = LinearRegression() model.fit(zdepth, log_signal) @@ -317,14 +320,14 @@ def fit_beers_law_breakpoint(hist_df): tuple[float, float, float, float] (kd, e0, breakpoint_depth, noise_floor). """ - df = hist_df.copy().sort_values('zdepth') - df = df[df['photon_counts'] > 0].copy() + df = hist_df.copy().sort_values("zdepth") + df = df[df["photon_counts"] > 0].copy() if len(df) < 5: return np.nan, np.nan, np.nan, np.nan - df['log_counts'] = np.log(df['photon_counts']) - depths = df['zdepth'].values - log_counts = df['log_counts'].values + df["log_counts"] = np.log(df["photon_counts"]) + depths = df["zdepth"].values + log_counts = df["log_counts"].values n = len(depths) best_rss = np.inf @@ -392,16 +395,16 @@ def fit_beers_law_nonlinear(hist_df): tuple[float, float, float] (kd, e0, noise_floor). 
""" - df = hist_df.copy().sort_values('zdepth') - depths = df['zdepth'].values - counts = df['photon_counts'].values.astype(float) + df = hist_df.copy().sort_values("zdepth") + depths = df["zdepth"].values + counts = df["photon_counts"].values.astype(float) if len(depths) < 5: return np.nan, np.nan, np.nan # Initial guesses A0 = float(counts.max()) - N0 = float(np.median(counts[len(counts) * 3 // 4:])) # deepest 25% + N0 = float(np.median(counts[len(counts) * 3 // 4 :])) # deepest 25% # Quick log-linear Kd estimate for initial guess (ignore noise) pos = counts > 0 if pos.sum() >= 2: @@ -413,10 +416,12 @@ def fit_beers_law_nonlinear(hist_df): try: popt, _ = curve_fit( - _beer_plus_noise, depths, counts, + _beer_plus_noise, + depths, + counts, p0=[A0, kd0, N0], bounds=([0, 0, 0], [np.inf, np.inf, np.inf]), - maxfev=5000 + maxfev=5000, ) A_fit, kd_fit, N_fit = popt if kd_fit <= 0: @@ -429,9 +434,12 @@ def fit_beers_law_nonlinear(hist_df): # --------------------------------------------------------------------------- # Hybrid: Stabilised breakpoint zone detection + log-linear fit on decay only # --------------------------------------------------------------------------- -def fit_beers_law_hybrid(hist_df, min_breakpoint_depth=2.0, - expected_noise_floor=None, - noise_floor_tolerance=3.0): +def fit_beers_law_hybrid( + hist_df, + min_breakpoint_depth=2.0, + expected_noise_floor=None, + noise_floor_tolerance=3.0, +): """ Two-stage approach matching the flowchart intent: Step 16 — Find where exponential decay transitions to noise floor @@ -474,22 +482,24 @@ def fit_beers_law_hybrid(hist_df, min_breakpoint_depth=2.0, (kd, e0, breakpoint_depth, noise_floor). breakpoint_depth and noise_floor are np.nan if fallback to full fit. 
""" - df = hist_df.copy().sort_values('zdepth') - df = df[df['photon_counts'] > 0].copy() + df = hist_df.copy().sort_values("zdepth") + df = df[df["photon_counts"] > 0].copy() n = len(df) if n < 6: return np.nan, np.nan, np.nan, np.nan - df['log_counts'] = np.log(df['photon_counts']) - depths = df['zdepth'].values - log_counts = df['log_counts'].values + df["log_counts"] = np.log(df["photon_counts"]) + depths = df["zdepth"].values + log_counts = df["log_counts"].values # ------------------------------------------------------------------ # Reference: BIC for a single-line model (no breakpoint) # ------------------------------------------------------------------ model_full = LinearRegression() model_full.fit(depths.reshape(-1, 1), log_counts) - rss_full = float(np.sum((model_full.predict(depths.reshape(-1, 1)) - log_counts) ** 2)) + rss_full = float( + np.sum((model_full.predict(depths.reshape(-1, 1)) - log_counts) ** 2) + ) k_full = 2 # slope + intercept bic_full = n * np.log(rss_full / n + 1e-10) + k_full * np.log(n) @@ -567,11 +577,12 @@ def fit_beers_law_hybrid(hist_df, min_breakpoint_depth=2.0, # --------------------------------------------------------------------------- # Dispatcher: select fitting strategy by name # --------------------------------------------------------------------------- -KD_FIT_METHODS = ('log_linear', 'bg_subtract', 'breakpoint', 'nonlinear', 'hybrid') +KD_FIT_METHODS = ("log_linear", "bg_subtract", "breakpoint", "nonlinear", "hybrid") -def _fit_kd_with_method(hist_df, method='log_linear', decay_threshold=0.0, - expected_noise_floor=None): +def _fit_kd_with_method( + hist_df, method="log_linear", decay_threshold=0.0, expected_noise_floor=None +): """ Run the requested fitting strategy on a single histogram. @@ -580,38 +591,43 @@ def _fit_kd_with_method(hist_df, method='log_linear', decay_threshold=0.0, tuple[float, float, float] (kd, e0, noise_floor). noise_floor is np.nan for log_linear. 
""" - if method == 'log_linear': + if method == "log_linear": valid = find_exponential_decay_zone(hist_df, decay_threshold=decay_threshold) kd, e0 = fit_beers_law(valid) return kd, e0, np.nan - elif method == 'bg_subtract': + if method == "bg_subtract": return fit_beers_law_bg_subtract(hist_df) - elif method == 'breakpoint': + if method == "breakpoint": kd, e0, bp, nf = fit_beers_law_breakpoint(hist_df) return kd, e0, nf - elif method == 'nonlinear': + if method == "nonlinear": return fit_beers_law_nonlinear(hist_df) - elif method == 'hybrid': + if method == "hybrid": kd, e0, bp, nf = fit_beers_law_hybrid( hist_df, expected_noise_floor=expected_noise_floor ) return kd, e0, nf - else: - raise ValueError(f"Unknown kd_fit_method: {method!r}. " - f"Choose from {KD_FIT_METHODS}") + raise ValueError( + f"Unknown kd_fit_method: {method!r}. " f"Choose from {KD_FIT_METHODS}" + ) # another solution is to calculate kd without hist -def CalculateKdFromFilteredSubsurfacePhoton(df, vertical_res=0.8, decay_zone_threshold=0.0, - kd_fit_method='log_linear', expected_noise_floor=None, - surface_sigma=None, - wave_exclusion_multiplier=0.0, - wave_sigma_calm_threshold=0.1): +def CalculateKdFromFilteredSubsurfacePhoton( + df, + vertical_res=0.8, + decay_zone_threshold=0.0, + kd_fit_method="log_linear", + expected_noise_floor=None, + surface_sigma=None, + wave_exclusion_multiplier=0.0, + wave_sigma_calm_threshold=0.1, +): """ Calculate Kd for a single along-track bin by fitting Beer's Law in log-space. @@ -623,25 +639,47 @@ def CalculateKdFromFilteredSubsurfacePhoton(df, vertical_res=0.8, decay_zone_thr 17 — fit_beers_law: fit Beer's Law curve; kd = -slope, e0 = exp(intercept). 
""" # Early exit if DataFrame is empty or missing required column - if df.empty or 'lat_bins' not in df.columns: - return pd.DataFrame({'lat_bins': [np.nan], 'kd': [np.nan], 'e0': [np.nan], - 'noise_floor': [np.nan], 'surface_sigma': [np.nan], - 'latitude': [np.nan], 'longitude': [np.nan]}) + if df.empty or "lat_bins" not in df.columns: + return pd.DataFrame( + { + "lat_bins": [np.nan], + "kd": [np.nan], + "e0": [np.nan], + "noise_floor": [np.nan], + "surface_sigma": [np.nan], + "latitude": [np.nan], + "longitude": [np.nan], + } + ) # Retrieve latitude and longitude - lat_bin_value = df['lat_bins'].iloc[0] if not df['lat_bins'].empty else np.nan - latitude = df['latitude'].mean() if 'latitude' in df.columns else df['lat'].mean() if 'lat' in df.columns else np.nan - longitude = df['longitude'].mean() if 'longitude' in df.columns else df['lon'].mean() if 'lon' in df.columns else np.nan + lat_bin_value = df["lat_bins"].iloc[0] if not df["lat_bins"].empty else np.nan + latitude = ( + df["latitude"].mean() + if "latitude" in df.columns + else df["lat"].mean() + if "lat" in df.columns + else np.nan + ) + longitude = ( + df["longitude"].mean() + if "longitude" in df.columns + else df["lon"].mean() + if "lon" in df.columns + else np.nan + ) # Use value_counts to get photon counts in each height bin - height_counts = df['height_bins'].value_counts().sort_index() + height_counts = df["height_bins"].value_counts().sort_index() bin_centers = height_counts.index.astype(float) # Create a DataFrame for the height bins and counts - hist_df = pd.DataFrame({'zdepth': bin_centers, 'photon_counts': height_counts.values}) + hist_df = pd.DataFrame( + {"zdepth": bin_centers, "photon_counts": height_counts.values} + ) # Reverse zdepth for model alignment - hist_df['zdepth'] = hist_df['zdepth'].max() - hist_df['zdepth'] + hist_df["zdepth"] = hist_df["zdepth"].max() - hist_df["zdepth"] # Wave-adaptive fit: skip shallow bins contaminated by wave smearing/bubbles. 
# zdepth=0 in the histogram corresponds to the adaptive surface detection @@ -653,35 +691,51 @@ def CalculateKdFromFilteredSubsurfacePhoton(df, vertical_res=0.8, decay_zone_thr if surface_sigma is not None and wave_exclusion_multiplier > 0: if not np.isnan(surface_sigma) and surface_sigma > wave_sigma_calm_threshold: adaptive_offset = max(3.0 * surface_sigma, 1.0) - wave_skip = max(0.0, wave_exclusion_multiplier * surface_sigma - adaptive_offset) - trimmed = hist_df[hist_df['zdepth'] >= wave_skip] + wave_skip = max( + 0.0, wave_exclusion_multiplier * surface_sigma - adaptive_offset + ) + trimmed = hist_df[hist_df["zdepth"] >= wave_skip] if len(trimmed) >= 4: - logging.info("Wave-adaptive trim: sigma=%.3f, adaptive_offset=%.2f m, " - "skip=%.2f m, %d->%d bins", - surface_sigma, adaptive_offset, wave_skip, - len(hist_df), len(trimmed)) + logging.info( + "Wave-adaptive trim: sigma=%.3f, adaptive_offset=%.2f m, " + "skip=%.2f m, %d->%d bins", + surface_sigma, + adaptive_offset, + wave_skip, + len(hist_df), + len(trimmed), + ) hist_df = trimmed # else: keep original hist_df (too few bins after trim) kd, e0, noise_floor = _fit_kd_with_method( - hist_df, method=kd_fit_method, decay_threshold=decay_zone_threshold, - expected_noise_floor=expected_noise_floor + hist_df, + method=kd_fit_method, + decay_threshold=decay_zone_threshold, + expected_noise_floor=expected_noise_floor, ) - return pd.DataFrame({ - 'lat_bins': [lat_bin_value], - 'kd': [kd], - 'e0': [e0], - 'noise_floor': [noise_floor], - 'surface_sigma': [surface_sigma if surface_sigma is not None else np.nan], - 'latitude': [latitude], - 'longitude': [longitude] - }) + return pd.DataFrame( + { + "lat_bins": [lat_bin_value], + "kd": [kd], + "e0": [e0], + "noise_floor": [noise_floor], + "surface_sigma": [surface_sigma if surface_sigma is not None else np.nan], + "latitude": [latitude], + "longitude": [longitude], + } + ) # Original kd calculation function remains unchanged -def 
calculate_kd(filtered_seafloor_subsurface_photon_dataset, decay_zone_threshold=0.0, kd_fit_method='log_linear', - wave_exclusion_multiplier=0.0, wave_sigma_calm_threshold=0.1): +def calculate_kd( + filtered_seafloor_subsurface_photon_dataset, + decay_zone_threshold=0.0, + kd_fit_method="log_linear", + wave_exclusion_multiplier=0.0, + wave_sigma_calm_threshold=0.1, +): """ Loop over along-track bins and call CalculateKdFromFilteredSubsurfacePhoton for each bin. @@ -701,29 +755,51 @@ def calculate_kd(filtered_seafloor_subsurface_photon_dataset, decay_zone_thresho 18 — Save K_dph attenuation coefficient value and statistical terms (kd, e0 columns in the returned DataFrame). """ - logging.info("Calculating Kd from filtered subsurface photon dataset (method=%s)", kd_fit_method) + logging.info( + "Calculating Kd from filtered subsurface photon dataset (method=%s)", + kd_fit_method, + ) - empty_df = pd.DataFrame({'lat_bins': [], 'kd': [], 'e0': [], 'noise_floor': [], 'surface_sigma': [], 'latitude': [], 'longitude': []}) + empty_df = pd.DataFrame( + { + "lat_bins": [], + "kd": [], + "e0": [], + "noise_floor": [], + "surface_sigma": [], + "latitude": [], + "longitude": [], + } + ) - groups = list(filtered_seafloor_subsurface_photon_dataset.groupby('lat_bins', observed=False)) + groups = list( + filtered_seafloor_subsurface_photon_dataset.groupby("lat_bins", observed=False) + ) if not groups: return empty_df - if kd_fit_method == 'hybrid': + if kd_fit_method == "hybrid": # ------------------------------------------------------------------ # Pass 1: estimate per-bin noise floors (no expected_noise_floor) # ------------------------------------------------------------------ logging.info("Hybrid pass 1: estimating per-bin noise floors") pass1_results = [] for _, group in groups: - sigma = float(group['surface_sigma'].iloc[0]) if ('surface_sigma' in group.columns and len(group) > 0) else None - pass1_results.append(CalculateKdFromFilteredSubsurfacePhoton( - group, 
decay_zone_threshold=decay_zone_threshold, - kd_fit_method=kd_fit_method, - surface_sigma=sigma, - wave_exclusion_multiplier=wave_exclusion_multiplier, - wave_sigma_calm_threshold=wave_sigma_calm_threshold, - )) + sigma = ( + float(group["surface_sigma"].iloc[0]) + if ("surface_sigma" in group.columns and len(group) > 0) + else None + ) + pass1_results.append( + CalculateKdFromFilteredSubsurfacePhoton( + group, + decay_zone_threshold=decay_zone_threshold, + kd_fit_method=kd_fit_method, + surface_sigma=sigma, + wave_exclusion_multiplier=wave_exclusion_multiplier, + wave_sigma_calm_threshold=wave_sigma_calm_threshold, + ) + ) if not pass1_results: return empty_df pass1_df = pd.concat(pass1_results, ignore_index=True) @@ -736,15 +812,17 @@ def calculate_kd(filtered_seafloor_subsurface_photon_dataset, decay_zone_thresho # within ±half_window bins. Falls back to beam-level median when # the local window has < 3 valid estimates. HALF_WINDOW = 10 # ±10 bins = ±5 km at 500 m horizontal_res - MIN_LOCAL = 3 # minimum valid noise floors to use local median + MIN_LOCAL = 3 # minimum valid noise floors to use local median - nf_array = pass1_df['noise_floor'].values.copy() + nf_array = pass1_df["noise_floor"].values.copy() n_bins = len(nf_array) # Beam-level fallback - valid_nf_all = pass1_df['noise_floor'].dropna() + valid_nf_all = pass1_df["noise_floor"].dropna() valid_nf_all = valid_nf_all[valid_nf_all > 0] - beam_median_nf = float(valid_nf_all.median()) if len(valid_nf_all) >= MIN_LOCAL else None + beam_median_nf = ( + float(valid_nf_all.median()) if len(valid_nf_all) >= MIN_LOCAL else None + ) if beam_median_nf is not None: # Build per-bin expected noise floor via sliding window @@ -759,16 +837,21 @@ def calculate_kd(filtered_seafloor_subsurface_photon_dataset, decay_zone_thresho else: expected_nf_per_bin[i] = beam_median_nf # fallback - logging.info("Hybrid pass 2: sliding window noise floor " - "(half_window=%d bins, beam_median=%.3f, " - "local range=%.3f-%.3f)", - 
HALF_WINDOW, beam_median_nf, - float(np.nanmin(expected_nf_per_bin)), - float(np.nanmax(expected_nf_per_bin))) + logging.info( + "Hybrid pass 2: sliding window noise floor " + "(half_window=%d bins, beam_median=%.3f, " + "local range=%.3f-%.3f)", + HALF_WINDOW, + beam_median_nf, + float(np.nanmin(expected_nf_per_bin)), + float(np.nanmax(expected_nf_per_bin)), + ) else: expected_nf_per_bin = None - logging.info("Hybrid: insufficient noise floor estimates (%d), " - "skipping pass 2", len(valid_nf_all)) + logging.info( + "Hybrid: insufficient noise floor estimates (%d), " "skipping pass 2", + len(valid_nf_all), + ) # ------------------------------------------------------------------ # Pass 2: re-fit with per-bin expected noise floor constraint @@ -776,15 +859,22 @@ def calculate_kd(filtered_seafloor_subsurface_photon_dataset, decay_zone_thresho if expected_nf_per_bin is not None: results = [] for idx, (_, group) in enumerate(groups): - sigma = float(group['surface_sigma'].iloc[0]) if ('surface_sigma' in group.columns and len(group) > 0) else None - results.append(CalculateKdFromFilteredSubsurfacePhoton( - group, decay_zone_threshold=decay_zone_threshold, - kd_fit_method=kd_fit_method, - expected_noise_floor=float(expected_nf_per_bin[idx]), - surface_sigma=sigma, - wave_exclusion_multiplier=wave_exclusion_multiplier, - wave_sigma_calm_threshold=wave_sigma_calm_threshold - )) + sigma = ( + float(group["surface_sigma"].iloc[0]) + if ("surface_sigma" in group.columns and len(group) > 0) + else None + ) + results.append( + CalculateKdFromFilteredSubsurfacePhoton( + group, + decay_zone_threshold=decay_zone_threshold, + kd_fit_method=kd_fit_method, + expected_noise_floor=float(expected_nf_per_bin[idx]), + surface_sigma=sigma, + wave_exclusion_multiplier=wave_exclusion_multiplier, + wave_sigma_calm_threshold=wave_sigma_calm_threshold, + ) + ) SubsurfacePhotonDFAddedKd = pd.concat(results, ignore_index=True) else: SubsurfacePhotonDFAddedKd = pass1_df @@ -794,7 +884,7 @@ def 
calculate_kd(filtered_seafloor_subsurface_photon_dataset, decay_zone_thresho # Uses 3×IQR to accommodate real spatial variation (e.g. turbidity # gradients near river mouths) while removing fitting artefacts. # ------------------------------------------------------------------ - kd_vals = SubsurfacePhotonDFAddedKd['kd'].dropna() + kd_vals = SubsurfacePhotonDFAddedKd["kd"].dropna() if len(kd_vals) >= 5: q1 = float(kd_vals.quantile(0.25)) q3 = float(kd_vals.quantile(0.75)) @@ -803,12 +893,18 @@ def calculate_kd(filtered_seafloor_subsurface_photon_dataset, decay_zone_thresho upper = q3 + 3.0 * iqr # Also apply physical cap: Kd > 5.0 m⁻¹ is unrealistic upper = min(upper, 5.0) - outlier_mask = (SubsurfacePhotonDFAddedKd['kd'] < lower) | (SubsurfacePhotonDFAddedKd['kd'] > upper) + outlier_mask = (SubsurfacePhotonDFAddedKd["kd"] < lower) | ( + SubsurfacePhotonDFAddedKd["kd"] > upper + ) n_outliers = int(outlier_mask.sum()) if n_outliers > 0: - logging.info("Hybrid IQR filter: removed %d outliers (bounds: [%.4f, %.4f])", - n_outliers, lower, upper) - SubsurfacePhotonDFAddedKd.loc[outlier_mask, 'kd'] = np.nan + logging.info( + "Hybrid IQR filter: removed %d outliers (bounds: [%.4f, %.4f])", + n_outliers, + lower, + upper, + ) + SubsurfacePhotonDFAddedKd.loc[outlier_mask, "kd"] = np.nan else: # ------------------------------------------------------------------ @@ -816,13 +912,21 @@ def calculate_kd(filtered_seafloor_subsurface_photon_dataset, decay_zone_thresho # ------------------------------------------------------------------ results = [] for _, group in groups: - sigma = float(group['surface_sigma'].iloc[0]) if ('surface_sigma' in group.columns and len(group) > 0) else None - results.append(CalculateKdFromFilteredSubsurfacePhoton( - group, decay_zone_threshold=decay_zone_threshold, kd_fit_method=kd_fit_method, - surface_sigma=sigma, - wave_exclusion_multiplier=wave_exclusion_multiplier, - wave_sigma_calm_threshold=wave_sigma_calm_threshold, - )) + sigma = ( + 
float(group["surface_sigma"].iloc[0]) + if ("surface_sigma" in group.columns and len(group) > 0) + else None + ) + results.append( + CalculateKdFromFilteredSubsurfacePhoton( + group, + decay_zone_threshold=decay_zone_threshold, + kd_fit_method=kd_fit_method, + surface_sigma=sigma, + wave_exclusion_multiplier=wave_exclusion_multiplier, + wave_sigma_calm_threshold=wave_sigma_calm_threshold, + ) + ) if not results: return empty_df SubsurfacePhotonDFAddedKd = pd.concat(results, ignore_index=True) @@ -831,8 +935,13 @@ def calculate_kd(filtered_seafloor_subsurface_photon_dataset, decay_zone_thresho # Updated function to apply kd calculation beam-by-beam -def process_kd_calculation(Final_filtered_subsurface_photon_dataset, decay_zone_threshold=0.0, kd_fit_method='log_linear', - wave_exclusion_multiplier=0.0, wave_sigma_calm_threshold=0.1): +def process_kd_calculation( + Final_filtered_subsurface_photon_dataset, + decay_zone_threshold=0.0, + kd_fit_method="log_linear", + wave_exclusion_multiplier=0.0, + wave_sigma_calm_threshold=0.1, +): """ Beam-by-beam wrapper that calls calculate_kd for every beam and concatenates the results into a single Kd output DataFrame. 
@@ -849,18 +958,22 @@ def process_kd_calculation(Final_filtered_subsurface_photon_dataset, decay_zone_ kd_beam_datasets = [] # Group by 'beam_id' to process each beam independently - for beam_id, beam_data in Final_filtered_subsurface_photon_dataset.groupby('beam_id'): + for beam_id, beam_data in Final_filtered_subsurface_photon_dataset.groupby( + "beam_id" + ): logging.info(f"Calculating Kd for beam: {beam_id}") # Apply the calculate_kd function to the current beam's dataset SubsurfacePhotonDFAddedKd = calculate_kd( - beam_data, decay_zone_threshold=decay_zone_threshold, kd_fit_method=kd_fit_method, + beam_data, + decay_zone_threshold=decay_zone_threshold, + kd_fit_method=kd_fit_method, wave_exclusion_multiplier=wave_exclusion_multiplier, wave_sigma_calm_threshold=wave_sigma_calm_threshold, ) # Add a column to track the beam_id in the results - SubsurfacePhotonDFAddedKd['beam_id'] = beam_id + SubsurfacePhotonDFAddedKd["beam_id"] = beam_id # Append the result to the list kd_beam_datasets.append(SubsurfacePhotonDFAddedKd) @@ -871,9 +984,20 @@ def process_kd_calculation(Final_filtered_subsurface_photon_dataset, decay_zone_ "All bins may have been discarded by an upstream filter (e.g. histogram quality). " "Returning empty DataFrame." 
) - return pd.DataFrame(columns=['lat_bins', 'kd', 'e0', 'noise_floor', 'surface_sigma', 'latitude', 'longitude', 'beam_id']) + return pd.DataFrame( + columns=[ + "lat_bins", + "kd", + "e0", + "noise_floor", + "surface_sigma", + "latitude", + "longitude", + "beam_id", + ] + ) # Combine results from all beams into a single DataFrame combined_kd_dataset = pd.concat(kd_beam_datasets, ignore_index=True) - return combined_kd_dataset \ No newline at end of file + return combined_kd_dataset diff --git a/aok/core/kd_utils/bathy_processing.py b/aok/core/kd_utils/bathy_processing.py index 772f5cd..f29418b 100644 --- a/aok/core/kd_utils/bathy_processing.py +++ b/aok/core/kd_utils/bathy_processing.py @@ -1,18 +1,19 @@ # utils/bathy_processing.py -import os import logging +import os + +import numpy as np import pandas as pd import rasterio -import numpy as np +from rtree import index +from scipy.spatial import cKDTree +from scipy.stats import norm +from shapely.geometry import box + from kd_utils.sea_photons_analysis import ( get_sea_surface_height_adaptive, - get_sea_surface_height_static, horizontal_vertical_bin_dataset, ) -from rtree import index -from shapely.geometry import box, Point -from scipy.spatial import cKDTree -from scipy.stats import norm def apply_optional_histogram_quality_filter( @@ -23,7 +24,7 @@ def apply_optional_histogram_quality_filter( depth_min=6.0, depth_max=7.0, reference_depth_min=0.0, - reference_depth_max=1.0 + reference_depth_max=1.0, ): """ Optional histogram quality check: keep along-track bins only if enough @@ -62,32 +63,41 @@ def apply_optional_histogram_quality_filter( if subsurface_photon_dataset.empty: return subsurface_photon_dataset - grouped = binned_dataset_sea_surface.groupby(['lat_bins'], observed=False) + grouped = binned_dataset_sea_surface.groupby(["lat_bins"], observed=False) lat_bin_keys = list(grouped.groups.keys()) if len(lat_bin_keys) != len(sea_surface_height): return subsurface_photon_dataset - quality_df = pd.DataFrame({ - 
'lat_bins': lat_bin_keys, - 'sea_surface_height': sea_surface_height - }).dropna(subset=['sea_surface_height']).copy() + quality_df = ( + pd.DataFrame( + {"lat_bins": lat_bin_keys, "sea_surface_height": sea_surface_height} + ) + .dropna(subset=["sea_surface_height"]) + .copy() + ) if quality_df.empty: return subsurface_photon_dataset.iloc[0:0].copy() ratios = [] for _, row in quality_df.iterrows(): - lat_bin = row['lat_bins'] - surface = row['sea_surface_height'] - bin_data = binned_dataset_sea_surface[binned_dataset_sea_surface['lat_bins'] == lat_bin] + lat_bin = row["lat_bins"] + surface = row["sea_surface_height"] + bin_data = binned_dataset_sea_surface[ + binned_dataset_sea_surface["lat_bins"] == lat_bin + ] if bin_data.empty: ratios.append(0.0) continue - depth = surface - bin_data['photon_height'] + depth = surface - bin_data["photon_height"] underwater_mask = depth > 0 # Reference band: near-surface photon count (incoming signal) - ref_mask = underwater_mask & (depth >= reference_depth_min) & (depth <= reference_depth_max) + ref_mask = ( + underwater_mask + & (depth >= reference_depth_min) + & (depth <= reference_depth_max) + ) ref_count = float(ref_mask.sum()) if ref_count == 0: ratios.append(0.0) @@ -104,9 +114,15 @@ def apply_optional_histogram_quality_filter( ratios.append(decay_ratio) - quality_df['hist_quality_ratio'] = ratios - valid_bins = set(quality_df.loc[quality_df['hist_quality_ratio'] >= min_ratio, 'lat_bins'].tolist()) - return subsurface_photon_dataset[subsurface_photon_dataset['lat_bins'].isin(valid_bins)].copy() + quality_df["hist_quality_ratio"] = ratios + valid_bins = set( + quality_df.loc[ + quality_df["hist_quality_ratio"] >= min_ratio, "lat_bins" + ].tolist() + ) + return subsurface_photon_dataset[ + subsurface_photon_dataset["lat_bins"].isin(valid_bins) + ].copy() def compute_surface_sigma(binned_dataset_sea_surface, sea_surface_height): @@ -117,32 +133,34 @@ def compute_surface_sigma(binned_dataset_sea_surface, sea_surface_height): 
distribution). Returns a dict mapping lat_bin -> sigma (float or NaN). Bins with fewer than 10 surface photons get NaN. """ - grouped = binned_dataset_sea_surface.groupby(['lat_bins'], observed=False) + grouped = binned_dataset_sea_surface.groupby(["lat_bins"], observed=False) lat_bin_keys = list(grouped.groups.keys()) if len(lat_bin_keys) != len(sea_surface_height): return {} sigma_map = {} - for lat_bin, surface in zip(lat_bin_keys, sea_surface_height): + for lat_bin, surface in zip(lat_bin_keys, sea_surface_height, strict=False): if pd.isna(surface): continue - bin_data = binned_dataset_sea_surface[binned_dataset_sea_surface['lat_bins'] == lat_bin] + bin_data = binned_dataset_sea_surface[ + binned_dataset_sea_surface["lat_bins"] == lat_bin + ] if bin_data.empty: continue peak_data = bin_data[ - (bin_data['photon_height'] > surface - 1.0) & - (bin_data['photon_height'] < surface + 1.0) + (bin_data["photon_height"] > surface - 1.0) + & (bin_data["photon_height"] < surface + 1.0) ] if len(peak_data) > 10: - mu, sigma = norm.fit(peak_data['photon_height']) + mu, sigma = norm.fit(peak_data["photon_height"]) if sigma > 0.4: half_win2 = min(2.5 * sigma, 3.0) peak_data2 = bin_data[ - (bin_data['photon_height'] > mu - half_win2) & - (bin_data['photon_height'] < mu + half_win2) + (bin_data["photon_height"] > mu - half_win2) + & (bin_data["photon_height"] < mu + half_win2) ] if len(peak_data2) > 10: - _, sigma = norm.fit(peak_data2['photon_height']) + _, sigma = norm.fit(peak_data2["photon_height"]) else: sigma = np.nan sigma_map[lat_bin] = sigma @@ -154,7 +172,7 @@ def apply_optional_surface_sigma_filter( binned_dataset_sea_surface, subsurface_photon_dataset, sea_surface_height, - sigma_max=0.5 + sigma_max=0.5, ): """ Optional Gaussian surface sigma filter: discard along-track bins if the @@ -172,10 +190,14 @@ def apply_optional_surface_sigma_filter( return subsurface_photon_dataset sigma_df = pd.DataFrame( - list(sigma_map.items()), columns=['lat_bins', 'surface_sigma'] 
+ list(sigma_map.items()), columns=["lat_bins", "surface_sigma"] + ) + valid_bins = set( + sigma_df.loc[sigma_df["surface_sigma"] <= sigma_max, "lat_bins"].tolist() ) - valid_bins = set(sigma_df.loc[sigma_df['surface_sigma'] <= sigma_max, 'lat_bins'].tolist()) - return subsurface_photon_dataset[subsurface_photon_dataset['lat_bins'].isin(valid_bins)].copy() + return subsurface_photon_dataset[ + subsurface_photon_dataset["lat_bins"].isin(valid_bins) + ].copy() def apply_optional_refraction_correction( @@ -183,7 +205,7 @@ def apply_optional_refraction_correction( subsurface_photon_dataset, sea_surface_height, water_temp_c=20.0, - wavelength_nm=532.0 + wavelength_nm=532.0, ): """ Optional refraction correction for subsurface photons. @@ -192,23 +214,36 @@ def apply_optional_refraction_correction( Flowchart step: 9 — Compute water depths (distance below surface) and correct depths for refraction. """ - required_cols = {'lat_bins', 'photon_height', 'ref_elevation', 'ref_azimuth', 'lon', 'lat'} - if subsurface_photon_dataset.empty or (not required_cols.issubset(set(subsurface_photon_dataset.columns))): + required_cols = { + "lat_bins", + "photon_height", + "ref_elevation", + "ref_azimuth", + "lon", + "lat", + } + if subsurface_photon_dataset.empty or ( + not required_cols.issubset(set(subsurface_photon_dataset.columns)) + ): return subsurface_photon_dataset - grouped = binned_dataset_sea_surface.groupby(['lat_bins'], observed=False) + grouped = binned_dataset_sea_surface.groupby(["lat_bins"], observed=False) lat_bin_keys = list(grouped.groups.keys()) if len(lat_bin_keys) != len(sea_surface_height): return subsurface_photon_dataset - surface_df = pd.DataFrame({'lat_bins': lat_bin_keys, 'sea_surface_height': sea_surface_height}) - corrected = subsurface_photon_dataset.merge(surface_df, on='lat_bins', how='left').copy() - if corrected['sea_surface_height'].isna().all(): + surface_df = pd.DataFrame( + {"lat_bins": lat_bin_keys, "sea_surface_height": sea_surface_height} + ) + 
corrected = subsurface_photon_dataset.merge( + surface_df, on="lat_bins", how="left" + ).copy() + if corrected["sea_surface_height"].isna().all(): return subsurface_photon_dataset - corrected['photon_height_pre_refraction'] = corrected['photon_height'] - corrected['lon_pre_refraction'] = corrected['lon'] - corrected['lat_pre_refraction'] = corrected['lat'] + corrected["photon_height_pre_refraction"] = corrected["photon_height"] + corrected["lon_pre_refraction"] = corrected["lon"] + corrected["lat_pre_refraction"] = corrected["lat"] # Refraction index parameterization from legacy module. a = -0.000001501562500 @@ -217,14 +252,26 @@ def apply_optional_refraction_correction( d = -0.000160475520686 e = 1.398067112092424 n1 = 1.00029 - n2 = (a * water_temp_c ** 2) + (b * wavelength_nm ** 2) + (c * water_temp_c) + (d * wavelength_nm) + e + n2 = ( + (a * water_temp_c**2) + + (b * wavelength_nm**2) + + (c * water_temp_c) + + (d * wavelength_nm) + + e + ) - ref_elev = pd.to_numeric(corrected['ref_elevation'], errors='coerce').to_numpy(dtype=float) - ref_az = pd.to_numeric(corrected['ref_azimuth'], errors='coerce').to_numpy(dtype=float) - z = pd.to_numeric(corrected['photon_height'], errors='coerce').to_numpy(dtype=float) - ws = pd.to_numeric(corrected['sea_surface_height'], errors='coerce').to_numpy(dtype=float) - x = pd.to_numeric(corrected['lon'], errors='coerce').to_numpy(dtype=float) - y = pd.to_numeric(corrected['lat'], errors='coerce').to_numpy(dtype=float) + ref_elev = pd.to_numeric(corrected["ref_elevation"], errors="coerce").to_numpy( + dtype=float + ) + ref_az = pd.to_numeric(corrected["ref_azimuth"], errors="coerce").to_numpy( + dtype=float + ) + z = pd.to_numeric(corrected["photon_height"], errors="coerce").to_numpy(dtype=float) + ws = pd.to_numeric(corrected["sea_surface_height"], errors="coerce").to_numpy( + dtype=float + ) + x = pd.to_numeric(corrected["lon"], errors="coerce").to_numpy(dtype=float) + y = pd.to_numeric(corrected["lat"], 
errors="coerce").to_numpy(dtype=float) # Convert to radians if values look like degrees. if np.nanmax(np.abs(ref_elev)) > (2 * np.pi): @@ -232,7 +279,13 @@ def apply_optional_refraction_correction( if np.nanmax(np.abs(ref_az)) > (2 * np.pi): ref_az = np.deg2rad(ref_az) - valid_mask = np.isfinite(ref_elev) & np.isfinite(ref_az) & np.isfinite(z) & np.isfinite(ws) & (z <= ws) + valid_mask = ( + np.isfinite(ref_elev) + & np.isfinite(ref_az) + & np.isfinite(z) + & np.isfinite(ws) + & (z <= ws) + ) if not np.any(valid_mask): return subsurface_photon_dataset @@ -248,7 +301,7 @@ def apply_optional_refraction_correction( R = (S * n1) / n2 Gamma = (np.pi / 2.0) - theta1 phi = theta1 - theta2 - P_sq = np.maximum(R ** 2 + S ** 2 - 2 * R * S * np.cos(phi), 0.0) + P_sq = np.maximum(R**2 + S**2 - 2 * R * S * np.cos(phi), 0.0) P = np.sqrt(P_sq) alpha_arg = np.divide(R * np.sin(phi), P, out=np.zeros_like(P), where=(P != 0)) alpha_arg = np.clip(alpha_arg, -1.0, 1.0) @@ -264,9 +317,9 @@ def apply_optional_refraction_correction( y_corr = y[valid_mask] + DN z_corr = z[valid_mask] + DZ - corrected.loc[valid_mask, 'lon'] = x_corr - corrected.loc[valid_mask, 'lat'] = y_corr - corrected.loc[valid_mask, 'photon_height'] = z_corr + corrected.loc[valid_mask, "lon"] = x_corr + corrected.loc[valid_mask, "lat"] = y_corr + corrected.loc[valid_mask, "photon_height"] = z_corr return corrected @@ -275,7 +328,7 @@ def apply_sea_surface_flattening( sea_surface_height, lat_bin_keys, horizontal_res, - flattening_window_m=500 + flattening_window_m=500, ): """ Flatten small-bin sea-surface variations to the mean sea level of larger along-track windows. 
@@ -287,15 +340,18 @@ def apply_sea_surface_flattening( if len(sea_surface_height) != len(lat_bin_keys): return subsurface_photon_dataset - surface_df = pd.DataFrame({ - 'lat_bins': lat_bin_keys, - 'sea_surface_height_local': sea_surface_height - }).dropna(subset=['sea_surface_height_local']).copy() + surface_df = ( + pd.DataFrame( + {"lat_bins": lat_bin_keys, "sea_surface_height_local": sea_surface_height} + ) + .dropna(subset=["sea_surface_height_local"]) + .copy() + ) if surface_df.empty: return subsurface_photon_dataset - surface_df['lat_bins_num'] = pd.to_numeric(surface_df['lat_bins'], errors='coerce') - surface_df = surface_df.dropna(subset=['lat_bins_num']) + surface_df["lat_bins_num"] = pd.to_numeric(surface_df["lat_bins"], errors="coerce") + surface_df = surface_df.dropna(subset=["lat_bins_num"]) if surface_df.empty: return subsurface_photon_dataset @@ -305,65 +361,83 @@ def apply_sea_surface_flattening( "Sea surface flattening window (%d m) <= horizontal_res (%d m) → " "big_bin_size=1, flattening has no effect. 
" "Use --sea_surface_flattening_window_m > --horizontal_res.", - flattening_window_m, horizontal_res + flattening_window_m, + horizontal_res, ) - surface_df['big_bin'] = (surface_df['lat_bins_num'].astype(int) // big_bin_size).astype(int) + surface_df["big_bin"] = ( + surface_df["lat_bins_num"].astype(int) // big_bin_size + ).astype(int) mean_surface = ( - surface_df.groupby('big_bin', observed=False)['sea_surface_height_local'] + surface_df.groupby("big_bin", observed=False)["sea_surface_height_local"] .mean() - .rename('sea_surface_height_bigbin_mean') + .rename("sea_surface_height_bigbin_mean") .reset_index() ) - surface_df = surface_df.merge(mean_surface, on='big_bin', how='left') - surface_df['sea_surface_flattening_offset'] = ( - surface_df['sea_surface_height_bigbin_mean'] - surface_df['sea_surface_height_local'] + surface_df = surface_df.merge(mean_surface, on="big_bin", how="left") + surface_df["sea_surface_flattening_offset"] = ( + surface_df["sea_surface_height_bigbin_mean"] + - surface_df["sea_surface_height_local"] ) flattened = subsurface_photon_dataset.copy() - flattened['lat_bins_num'] = pd.to_numeric(flattened['lat_bins'], errors='coerce') - flattened['big_bin'] = (flattened['lat_bins_num'].fillna(-1).astype(int) // big_bin_size).astype(int) + flattened["lat_bins_num"] = pd.to_numeric(flattened["lat_bins"], errors="coerce") + flattened["big_bin"] = ( + flattened["lat_bins_num"].fillna(-1).astype(int) // big_bin_size + ).astype(int) flattened = flattened.merge( - surface_df[['lat_bins', 'sea_surface_height_local', 'sea_surface_height_bigbin_mean', 'sea_surface_flattening_offset']], - on='lat_bins', - how='left' - ) - flattened['photon_height_original'] = flattened['photon_height'] - flattened['photon_height'] = ( - flattened['photon_height'] - flattened['sea_surface_flattening_offset'].fillna(0.0) + surface_df[ + [ + "lat_bins", + "sea_surface_height_local", + "sea_surface_height_bigbin_mean", + "sea_surface_flattening_offset", + ] + ], + 
on="lat_bins", + how="left", ) + flattened["photon_height_original"] = flattened["photon_height"] + flattened["photon_height"] = flattened["photon_height"] - flattened[ + "sea_surface_flattening_offset" + ].fillna(0.0) # Re-bin height_bins from flattened photon_height so that downstream # Kd calculation (which reads height_bins) uses the corrected depths. - if 'height_bins' in flattened.columns: + if "height_bins" in flattened.columns: vertical_res_actual = horizontal_res # not used; infer from existing bins # Infer the vertical resolution from the original height_bins spacing - orig_bins = pd.to_numeric(flattened['height_bins'], errors='coerce').dropna().unique() + orig_bins = ( + pd.to_numeric(flattened["height_bins"], errors="coerce").dropna().unique() + ) if len(orig_bins) >= 2: sorted_bins = np.sort(orig_bins) diffs = np.diff(sorted_bins) v_res = float(np.median(diffs[diffs > 0])) if np.any(diffs > 0) else 0.25 else: v_res = 0.25 - h_min = flattened['photon_height'].min() - h_max = flattened['photon_height'].max() + h_min = flattened["photon_height"].min() + h_max = flattened["photon_height"].max() n_hbins = max(1, int(round((h_max - h_min) / v_res))) bin_edges = np.linspace(h_min, h_max, n_hbins + 1) bin_labels = np.round((bin_edges[:-1] + bin_edges[1:]) / 2, decimals=1) - flattened['height_bins'] = pd.cut( - flattened['photon_height'], bins=bin_edges, labels=bin_labels, - include_lowest=True + flattened["height_bins"] = pd.cut( + flattened["photon_height"], + bins=bin_edges, + labels=bin_labels, + include_lowest=True, ) - return flattened.drop(columns=['lat_bins_num', 'big_bin'], errors='ignore') + return flattened.drop(columns=["lat_bins_num", "big_bin"], errors="ignore") + # 1. Function to create an R-tree spatial index for raster bounds def create_spatial_index(gebco_paths): """ Create an R-tree spatial index for raster bounds to quickly find relevant rasters. - + Parameters: gebco_paths (list): List of paths to GEBCO raster files. 
- + Returns: raster_data_dict (dict): Dictionary containing loaded rasters and their respective data. spatial_index (rtree.index.Index): R-tree spatial index for raster bounds. @@ -371,26 +445,29 @@ def create_spatial_index(gebco_paths): idx = index.Index() raster_data_dict = {} for i, path in enumerate(gebco_paths): - with rasterio.Env(GTIFF_SRS_SOURCE='EPSG'): + with rasterio.Env(GTIFF_SRS_SOURCE="EPSG"): gebco_raster = rasterio.open(path) raster_data = gebco_raster.read(1) raster_data_dict[path] = (gebco_raster, raster_data) bounds = gebco_raster.bounds - idx.insert(i, (bounds.left, bounds.bottom, bounds.right, bounds.top), obj=path) + idx.insert( + i, (bounds.left, bounds.bottom, bounds.right, bounds.top), obj=path + ) return raster_data_dict, idx + # 2. Function to determine which rasters are needed for a given set of coordinates using spatial index in a batch manner def get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index): """ Determine which raster datasets are relevant for the given coordinates using spatial index. This function uses a more efficient approach by performing a bounding box query for batches of points. - + Parameters: lons (numpy.ndarray): Array of longitudes. lats (numpy.ndarray): Array of latitudes. raster_data_dict (dict): Dictionary containing raster datasets and their data. spatial_index (rtree.index.Index): R-tree spatial index for raster bounds. - + Returns: relevant_rasters (list): List of relevant raster datasets and their respective data. 
""" @@ -402,7 +479,10 @@ def get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index # Get all rasters that intersect with the bounding box matches = list(spatial_index.intersection((bounding_box.bounds), objects=True)) relevant_paths = {match.object for match in matches} - relevant_rasters = [(raster_data_dict[path][0], raster_data_dict[path][1]) for path in relevant_paths] + relevant_rasters = [ + (raster_data_dict[path][0], raster_data_dict[path][1]) + for path in relevant_paths + ] return relevant_rasters @@ -410,25 +490,25 @@ def get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index def get_seafloor_elevation(lons, lats, relevant_rasters): """ Get seafloor elevations for a batch of points based on longitude and latitude from relevant rasters. - + Parameters: lons (numpy.ndarray): Array of longitudes. lats (numpy.ndarray): Array of latitudes. relevant_rasters (list): List of relevant raster datasets and their respective data. - + Returns: seafloor_elevations (numpy.ndarray): Array of seafloor elevations. 
""" points = np.vstack((lons, lats)).T seafloor_elevations = np.full(len(lons), np.nan) - + for gebco_raster, raster_data in relevant_rasters: # Use rasterio.sample to get values for multiple points in a batch values = list(gebco_raster.sample(points)) for idx, value in enumerate(values): if np.isnan(seafloor_elevations[idx]) and value is not None: seafloor_elevations[idx] = value[0] - + return seafloor_elevations @@ -456,10 +536,12 @@ def query_gebco_seafloor_elevation(sea_photon_dataset, gebco_paths): """ dataset = sea_photon_dataset.copy() raster_data_dict, spatial_index = create_spatial_index(gebco_paths) - lons = dataset['longitude'].values - lats = dataset['latitude'].values - relevant_rasters = get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index) - dataset['seafloor_elevation'] = get_seafloor_elevation(lons, lats, relevant_rasters) + lons = dataset["longitude"].values + lats = dataset["latitude"].values + relevant_rasters = get_relevant_rasters_using_index( + lons, lats, raster_data_dict, spatial_index + ) + dataset["seafloor_elevation"] = get_seafloor_elevation(lons, lats, relevant_rasters) return dataset @@ -481,7 +563,7 @@ def remove_photons_below_seafloor(sea_photon_dataset): Only rows where photon_height > seafloor_elevation. """ return sea_photon_dataset[ - sea_photon_dataset['photon_height'] > sea_photon_dataset['seafloor_elevation'] + sea_photon_dataset["photon_height"] > sea_photon_dataset["seafloor_elevation"] ].copy() @@ -508,13 +590,13 @@ def discard_shallow_seabed_bins(sea_photon_dataset, min_depth_m): pd.DataFrame Only rows where depth >= min_depth_m. """ - return sea_photon_dataset[ - sea_photon_dataset['depth'] >= min_depth_m - ].copy() + return sea_photon_dataset[sea_photon_dataset["depth"] >= min_depth_m].copy() # 4. 
The main process function that ties everything together -def process_seafloor_data(sea_photon_dataset, gebco_paths, Ignore_Subsurface_Height_Thres): +def process_seafloor_data( + sea_photon_dataset, gebco_paths, Ignore_Subsurface_Height_Thres +): """ Query GEBCO bathymetry then apply seabed and shallow-water filters. @@ -533,9 +615,11 @@ def process_seafloor_data(sea_photon_dataset, gebco_paths, Ignore_Subsurface_Hei containing points above the seafloor in water deeper than threshold. """ dataset = query_gebco_seafloor_elevation(sea_photon_dataset, gebco_paths) # step 12 - dataset['depth'] = -dataset['seafloor_elevation'] - dataset = remove_photons_below_seafloor(dataset) # step 13 - return discard_shallow_seabed_bins(dataset, abs(Ignore_Subsurface_Height_Thres)) # step 14 + dataset["depth"] = -dataset["seafloor_elevation"] + dataset = remove_photons_below_seafloor(dataset) # step 13 + return discard_shallow_seabed_bins( + dataset, abs(Ignore_Subsurface_Height_Thres) + ) # step 14 def load_atl24_points(atl24_file_path): @@ -547,87 +631,123 @@ def load_atl24_points(atl24_file_path): retrieve GEBCO data (or other regional raster bathymetry data) for region. 
""" if (not atl24_file_path) or (not os.path.exists(atl24_file_path)): - return pd.DataFrame(columns=['longitude', 'latitude', 'seafloor_elevation_atl24']) + return pd.DataFrame( + columns=["longitude", "latitude", "seafloor_elevation_atl24"] + ) lower_path = atl24_file_path.lower() - if lower_path.endswith(('.h5', '.hdf5')): + if lower_path.endswith((".h5", ".hdf5")): try: import h5py - beams = ['gt1l', 'gt1r', 'gt2l', 'gt2r', 'gt3l', 'gt3r'] + + beams = ["gt1l", "gt1r", "gt2l", "gt2r", "gt3l", "gt3r"] frames = [] - with h5py.File(atl24_file_path, 'r') as f: + with h5py.File(atl24_file_path, "r") as f: for beam in beams: if beam not in f: continue grp = f[beam] - if not all(k in grp for k in ('lon_ph', 'lat_ph', 'ortho_h', 'class_ph')): + if not all( + k in grp for k in ("lon_ph", "lat_ph", "ortho_h", "class_ph") + ): continue - class_ph = grp['class_ph'][:] + class_ph = grp["class_ph"][:] bathy_mask = class_ph == 40 # ATL24 bathymetry class # Apply low_confidence_flag filter if available - if 'low_confidence_flag' in grp: - bathy_mask = bathy_mask & (grp['low_confidence_flag'][:] == 0) + if "low_confidence_flag" in grp: + bathy_mask = bathy_mask & (grp["low_confidence_flag"][:] == 0) if bathy_mask.sum() == 0: continue - frames.append(pd.DataFrame({ - 'longitude': grp['lon_ph'][:][bathy_mask].astype(float), - 'latitude': grp['lat_ph'][:][bathy_mask].astype(float), - 'seafloor_elevation_atl24': grp['ortho_h'][:][bathy_mask].astype(float), - })) + frames.append( + pd.DataFrame( + { + "longitude": grp["lon_ph"][:][bathy_mask].astype(float), + "latitude": grp["lat_ph"][:][bathy_mask].astype(float), + "seafloor_elevation_atl24": grp["ortho_h"][:][ + bathy_mask + ].astype(float), + } + ) + ) if not frames: - return pd.DataFrame(columns=['longitude', 'latitude', 'seafloor_elevation_atl24']) + return pd.DataFrame( + columns=["longitude", "latitude", "seafloor_elevation_atl24"] + ) return pd.concat(frames, ignore_index=True).dropna() except Exception: - return 
pd.DataFrame(columns=['longitude', 'latitude', 'seafloor_elevation_atl24']) - elif lower_path.endswith(('.csv', '.txt')): + return pd.DataFrame( + columns=["longitude", "latitude", "seafloor_elevation_atl24"] + ) + elif lower_path.endswith((".csv", ".txt")): atl24_df = pd.read_csv(atl24_file_path) - elif lower_path.endswith('.parquet'): + elif lower_path.endswith(".parquet"): atl24_df = pd.read_parquet(atl24_file_path) - elif lower_path.endswith(('.gpkg', '.shp', '.geojson')): + elif lower_path.endswith((".gpkg", ".shp", ".geojson")): try: import geopandas as gpd + atl24_gdf = gpd.read_file(atl24_file_path) atl24_df = pd.DataFrame(atl24_gdf) - if ('longitude' not in atl24_df.columns or 'latitude' not in atl24_df.columns) and ('geometry' in atl24_gdf.columns): - atl24_df['longitude'] = atl24_gdf.geometry.x - atl24_df['latitude'] = atl24_gdf.geometry.y + if ( + "longitude" not in atl24_df.columns + or "latitude" not in atl24_df.columns + ) and ("geometry" in atl24_gdf.columns): + atl24_df["longitude"] = atl24_gdf.geometry.x + atl24_df["latitude"] = atl24_gdf.geometry.y except Exception: - return pd.DataFrame(columns=['longitude', 'latitude', 'seafloor_elevation_atl24']) + return pd.DataFrame( + columns=["longitude", "latitude", "seafloor_elevation_atl24"] + ) else: - return pd.DataFrame(columns=['longitude', 'latitude', 'seafloor_elevation_atl24']) + return pd.DataFrame( + columns=["longitude", "latitude", "seafloor_elevation_atl24"] + ) - lon_candidates = ['longitude', 'lon', 'x', 'LONGITUDE', 'LON'] - lat_candidates = ['latitude', 'lat', 'y', 'LATITUDE', 'LAT'] + lon_candidates = ["longitude", "lon", "x", "LONGITUDE", "LON"] + lat_candidates = ["latitude", "lat", "y", "LATITUDE", "LAT"] elev_candidates = [ - 'seafloor_elevation', 'bottom_elevation', 'bathymetry', 'elevation', 'z', - 'depth', 'water_depth' + "seafloor_elevation", + "bottom_elevation", + "bathymetry", + "elevation", + "z", + "depth", + "water_depth", ] lon_col = next((c for c in lon_candidates if c 
in atl24_df.columns), None) lat_col = next((c for c in lat_candidates if c in atl24_df.columns), None) elev_col = next((c for c in elev_candidates if c in atl24_df.columns), None) if lon_col is None or lat_col is None or elev_col is None: - return pd.DataFrame(columns=['longitude', 'latitude', 'seafloor_elevation_atl24']) + return pd.DataFrame( + columns=["longitude", "latitude", "seafloor_elevation_atl24"] + ) - out_df = pd.DataFrame({ - 'longitude': pd.to_numeric(atl24_df[lon_col], errors='coerce'), - 'latitude': pd.to_numeric(atl24_df[lat_col], errors='coerce'), - 'atl24_raw_bathy': pd.to_numeric(atl24_df[elev_col], errors='coerce') - }).dropna(subset=['longitude', 'latitude', 'atl24_raw_bathy']).copy() + out_df = ( + pd.DataFrame( + { + "longitude": pd.to_numeric(atl24_df[lon_col], errors="coerce"), + "latitude": pd.to_numeric(atl24_df[lat_col], errors="coerce"), + "atl24_raw_bathy": pd.to_numeric(atl24_df[elev_col], errors="coerce"), + } + ) + .dropna(subset=["longitude", "latitude", "atl24_raw_bathy"]) + .copy() + ) - if 'depth' in elev_col.lower() and ('elev' not in elev_col.lower()): - out_df['seafloor_elevation_atl24'] = -out_df['atl24_raw_bathy'].abs() + if "depth" in elev_col.lower() and ("elev" not in elev_col.lower()): + out_df["seafloor_elevation_atl24"] = -out_df["atl24_raw_bathy"].abs() else: - out_df['seafloor_elevation_atl24'] = out_df['atl24_raw_bathy'] + out_df["seafloor_elevation_atl24"] = out_df["atl24_raw_bathy"] - return out_df[['longitude', 'latitude', 'seafloor_elevation_atl24']] + return out_df[["longitude", "latitude", "seafloor_elevation_atl24"]] def process_seafloor_data_atl24( sea_photon_dataset, atl24_file_path, Ignore_Subsurface_Height_Thres, - max_match_distance_deg=0.01 + max_match_distance_deg=0.01, ): """ Match ATL24 bathymetry points to photons, then filter below seafloor and @@ -644,26 +764,32 @@ def process_seafloor_data_atl24( if atl24_points.empty: return sea_photon_dataset, 0 - photon_coords = 
sea_photon_dataset[['longitude', 'latitude']].to_numpy(dtype=float) - atl24_coords = atl24_points[['longitude', 'latitude']].to_numpy(dtype=float) + photon_coords = sea_photon_dataset[["longitude", "latitude"]].to_numpy(dtype=float) + atl24_coords = atl24_points[["longitude", "latitude"]].to_numpy(dtype=float) if len(photon_coords) == 0 or len(atl24_coords) == 0: return sea_photon_dataset, 0 tree = cKDTree(atl24_coords) - distances, indices = tree.query(photon_coords, k=1, distance_upper_bound=max_match_distance_deg) + distances, indices = tree.query( + photon_coords, k=1, distance_upper_bound=max_match_distance_deg + ) valid_match = np.isfinite(distances) & (indices < len(atl24_points)) matched_count = int(valid_match.sum()) if matched_count == 0: return sea_photon_dataset, 0 matched_dataset = sea_photon_dataset.copy() - matched_dataset['seafloor_elevation'] = np.nan - matched_dataset.loc[valid_match, 'seafloor_elevation'] = atl24_points['seafloor_elevation_atl24'].to_numpy()[indices[valid_match]] - matched_dataset = matched_dataset.dropna(subset=['seafloor_elevation']).copy() - matched_dataset['depth'] = -matched_dataset['seafloor_elevation'] - - above_floor = remove_photons_below_seafloor(matched_dataset) # step 13 - filtered_dataset = discard_shallow_seabed_bins(above_floor, abs(Ignore_Subsurface_Height_Thres)) # step 14 + matched_dataset["seafloor_elevation"] = np.nan + matched_dataset.loc[valid_match, "seafloor_elevation"] = atl24_points[ + "seafloor_elevation_atl24" + ].to_numpy()[indices[valid_match]] + matched_dataset = matched_dataset.dropna(subset=["seafloor_elevation"]).copy() + matched_dataset["depth"] = -matched_dataset["seafloor_elevation"] + + above_floor = remove_photons_below_seafloor(matched_dataset) # step 13 + filtered_dataset = discard_shallow_seabed_bins( + above_floor, abs(Ignore_Subsurface_Height_Thres) + ) # step 14 return filtered_dataset, matched_count @@ -715,7 +841,7 @@ def rebuild_and_refit_surface_after_refraction( 
water_temp_c=water_temp_c, wavelength_nm=wavelength_nm, ) - rebinned_corrected = horizontal_vertical_bin_dataset( # step 10 + rebinned_corrected = horizontal_vertical_bin_dataset( # step 10 corrected_full_dataset, horizontal_res, vertical_res ) return get_sea_surface_height_adaptive(rebinned_corrected) # step 11 @@ -728,7 +854,7 @@ def get_subsurface_photon( subsurface_thresh, Ignore_Subsurface_Height_Thres, use_atl24_filter=False, - atl24_file_path='', + atl24_file_path="", atl24_max_match_distance_deg=0.01, use_gebco_filter=False, apply_histogram_quality_filter=False, @@ -746,7 +872,7 @@ def get_subsurface_photon( apply_flattening=False, flattening_window_m=500, horizontal_res=500, - vertical_res=0.25 + vertical_res=0.25, ): """ Orchestrate per-beam subsurface photon extraction and all optional quality filters. @@ -772,8 +898,12 @@ def get_subsurface_photon( (applied unconditionally via Ignore_Subsurface_Height_Thres). """ # adaptive threshold - sea_surface_height, sea_surface_label, solo_sea_surface_label, subsurface_photon_dataset = \ - get_sea_surface_height_adaptive(binned_dataset_sea_surface) + ( + sea_surface_height, + sea_surface_label, + solo_sea_surface_label, + subsurface_photon_dataset, + ) = get_sea_surface_height_adaptive(binned_dataset_sea_surface) # Always compute per-bin surface sigma for quality flagging and wave-adaptive fit sigma_map = compute_surface_sigma(binned_dataset_sea_surface, sea_surface_height) @@ -787,7 +917,7 @@ def get_subsurface_photon( depth_min=histogram_quality_depth_min, depth_max=histogram_quality_depth_max, reference_depth_min=histogram_quality_ref_depth_min, - reference_depth_max=histogram_quality_ref_depth_max + reference_depth_max=histogram_quality_ref_depth_max, ) if apply_surface_sigma_filter: @@ -795,7 +925,7 @@ def get_subsurface_photon( binned_dataset_sea_surface, subsurface_photon_dataset, sea_surface_height, - sigma_max=surface_sigma_max + sigma_max=surface_sigma_max, ) if apply_refraction_correction: @@ -804,25 
+934,29 @@ def get_subsurface_photon( subsurface_photon_dataset, sea_surface_height, water_temp_c=refraction_water_temp_c, - wavelength_nm=refraction_wavelength_nm + wavelength_nm=refraction_wavelength_nm, ) if apply_post_refraction_refit: - sea_surface_height, sea_surface_label, solo_sea_surface_label, subsurface_photon_dataset = \ - rebuild_and_refit_surface_after_refraction( # steps 10 & 11 - binned_dataset_sea_surface, - sea_surface_height, - water_temp_c=refraction_water_temp_c, - wavelength_nm=refraction_wavelength_nm, - horizontal_res=horizontal_res, - vertical_res=vertical_res, - ) + ( + sea_surface_height, + sea_surface_label, + solo_sea_surface_label, + subsurface_photon_dataset, + ) = rebuild_and_refit_surface_after_refraction( # steps 10 & 11 + binned_dataset_sea_surface, + sea_surface_height, + water_temp_c=refraction_water_temp_c, + wavelength_nm=refraction_wavelength_nm, + horizontal_res=horizontal_res, + vertical_res=vertical_res, + ) # Apply minimum depth threshold unconditionally — keeps photons at height # >= Ignore_Subsurface_Height_Thres regardless of whether GEBCO/ATL24 is on. # Without this, deep noise photons (down to -70 m) contaminate the Beer's Law fit # and produce near-zero Kd values, especially at shallow turbid sites. 
subsurface_photon_dataset = subsurface_photon_dataset[ - subsurface_photon_dataset['photon_height'] >= Ignore_Subsurface_Height_Thres + subsurface_photon_dataset["photon_height"] >= Ignore_Subsurface_Height_Thres ].copy() if use_atl24_filter: @@ -830,13 +964,15 @@ def get_subsurface_photon( subsurface_photon_dataset, atl24_file_path, abs(Ignore_Subsurface_Height_Thres), - max_match_distance_deg=atl24_max_match_distance_deg + max_match_distance_deg=atl24_max_match_distance_deg, ) if atl24_match_count > 0: filtered_seafloor_subsurface_photon_dataset = atl24_filtered elif use_gebco_filter: filtered_seafloor_subsurface_photon_dataset = process_seafloor_data( - subsurface_photon_dataset, GEBCO_paths, abs(Ignore_Subsurface_Height_Thres) + subsurface_photon_dataset, + GEBCO_paths, + abs(Ignore_Subsurface_Height_Thres), ) else: filtered_seafloor_subsurface_photon_dataset = subsurface_photon_dataset @@ -849,24 +985,33 @@ def get_subsurface_photon( if apply_flattening: lat_bin_keys = list( - binned_dataset_sea_surface.groupby(['lat_bins'], observed=False).groups.keys() + binned_dataset_sea_surface.groupby( + ["lat_bins"], observed=False + ).groups.keys() ) filtered_seafloor_subsurface_photon_dataset = apply_sea_surface_flattening( filtered_seafloor_subsurface_photon_dataset, sea_surface_height, lat_bin_keys, horizontal_res=horizontal_res, - flattening_window_m=flattening_window_m + flattening_window_m=flattening_window_m, ) # Attach per-bin surface_sigma as a quality flag column - filtered_seafloor_subsurface_photon_dataset = filtered_seafloor_subsurface_photon_dataset.copy() + filtered_seafloor_subsurface_photon_dataset = ( + filtered_seafloor_subsurface_photon_dataset.copy() + ) if sigma_map: - filtered_seafloor_subsurface_photon_dataset['surface_sigma'] = \ - filtered_seafloor_subsurface_photon_dataset['lat_bins'].map(sigma_map) + filtered_seafloor_subsurface_photon_dataset["surface_sigma"] = ( + filtered_seafloor_subsurface_photon_dataset["lat_bins"].map(sigma_map) + ) 
else: - filtered_seafloor_subsurface_photon_dataset['surface_sigma'] = np.nan + filtered_seafloor_subsurface_photon_dataset["surface_sigma"] = np.nan - return sea_surface_height, sea_surface_label, filtered_seafloor_subsurface_photon_dataset + return ( + sea_surface_height, + sea_surface_label, + filtered_seafloor_subsurface_photon_dataset, + ) # Main processing function to apply the subsurface photon filtering beam-by-beam @@ -876,7 +1021,7 @@ def process_subsurface_photon_filtering( subsurface_thresh, Ignore_Subsurface_Height_Thres, use_atl24_filter=False, - atl24_file_path='', + atl24_file_path="", atl24_max_match_distance_deg=0.01, use_gebco_filter=False, apply_histogram_quality_filter=False, @@ -894,7 +1039,7 @@ def process_subsurface_photon_filtering( apply_flattening=False, flattening_window_m=500, horizontal_res=500, - vertical_res=0.25 + vertical_res=0.25, ): """ Beam-by-beam wrapper that calls get_subsurface_photon for every beam and @@ -910,37 +1055,40 @@ def process_subsurface_photon_filtering( filtered_beam_datasets = [] # Group the binned dataset by 'beam_id' and process each group separately - for beam_id, beam_data in binned_dataset_sea_surface.groupby('beam_id'): - print(f'Processing subsurface filtering for beam: {beam_id}') + for beam_id, beam_data in binned_dataset_sea_surface.groupby("beam_id"): + print(f"Processing subsurface filtering for beam: {beam_id}") # Apply get_subsurface_photon to the current beam's dataset - sea_surface_height, sea_surface_label, filtered_seafloor_subsurface_photon_dataset = \ - get_subsurface_photon( - beam_data, - GEBCO_paths, - subsurface_thresh, - Ignore_Subsurface_Height_Thres, - use_atl24_filter=use_atl24_filter, - atl24_file_path=atl24_file_path, - atl24_max_match_distance_deg=atl24_max_match_distance_deg, - use_gebco_filter=use_gebco_filter, - apply_histogram_quality_filter=apply_histogram_quality_filter, - histogram_quality_min_ratio=histogram_quality_min_ratio, - 
histogram_quality_depth_min=histogram_quality_depth_min, - histogram_quality_depth_max=histogram_quality_depth_max, - histogram_quality_ref_depth_min=histogram_quality_ref_depth_min, - histogram_quality_ref_depth_max=histogram_quality_ref_depth_max, - apply_surface_sigma_filter=apply_surface_sigma_filter, - surface_sigma_max=surface_sigma_max, - apply_refraction_correction=apply_refraction_correction, - refraction_water_temp_c=refraction_water_temp_c, - refraction_wavelength_nm=refraction_wavelength_nm, - apply_post_refraction_refit=apply_post_refraction_refit, - apply_flattening=apply_flattening, - flattening_window_m=flattening_window_m, - horizontal_res=horizontal_res, - vertical_res=vertical_res - ) + ( + sea_surface_height, + sea_surface_label, + filtered_seafloor_subsurface_photon_dataset, + ) = get_subsurface_photon( + beam_data, + GEBCO_paths, + subsurface_thresh, + Ignore_Subsurface_Height_Thres, + use_atl24_filter=use_atl24_filter, + atl24_file_path=atl24_file_path, + atl24_max_match_distance_deg=atl24_max_match_distance_deg, + use_gebco_filter=use_gebco_filter, + apply_histogram_quality_filter=apply_histogram_quality_filter, + histogram_quality_min_ratio=histogram_quality_min_ratio, + histogram_quality_depth_min=histogram_quality_depth_min, + histogram_quality_depth_max=histogram_quality_depth_max, + histogram_quality_ref_depth_min=histogram_quality_ref_depth_min, + histogram_quality_ref_depth_max=histogram_quality_ref_depth_max, + apply_surface_sigma_filter=apply_surface_sigma_filter, + surface_sigma_max=surface_sigma_max, + apply_refraction_correction=apply_refraction_correction, + refraction_water_temp_c=refraction_water_temp_c, + refraction_wavelength_nm=refraction_wavelength_nm, + apply_post_refraction_refit=apply_post_refraction_refit, + apply_flattening=apply_flattening, + flattening_window_m=flattening_window_m, + horizontal_res=horizontal_res, + vertical_res=vertical_res, + ) # Append each result to the lists 
sea_surface_heights.append(sea_surface_height) @@ -949,5 +1097,5 @@ def process_subsurface_photon_filtering( # Combine all filtered beam datasets into a single DataFrame combined_filtered_dataset = pd.concat(filtered_beam_datasets, ignore_index=True) - + return sea_surface_heights, sea_surface_labels, combined_filtered_dataset diff --git a/aok/core/kd_utils/config.py b/aok/core/kd_utils/config.py index 199a779..a67825a 100644 --- a/aok/core/kd_utils/config.py +++ b/aok/core/kd_utils/config.py @@ -25,7 +25,6 @@ # # ATL03_h5_file = "processed_ATL03_20200331204156_00810707_006_01.h5"# - # #China Bohai # # ATL03_h5_file = "processed_ATL03_20191128115256_09560502_006_01.h5"# # # ATL03_h5_file = "processed_ATL03_20190530203316_09560302_006_02.h5"# @@ -33,7 +32,6 @@ # ATL03_h5_file = "processed_ATL03_20190829161305_09560402_006_02.h5"# - # ATL03_h5_file_path = path + ATL03_h5_file # Current_Path = 'C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/Dataset/' @@ -66,13 +64,20 @@ import argparse + def get_args(): - parser = argparse.ArgumentParser(description="Configure ICESat-2 HLS analysis script.") - + parser = argparse.ArgumentParser( + description="Configure ICESat-2 HLS analysis script." 
+ ) + # General paths - parser.add_argument("--workspace_path", type=str, default='C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/', - help="Root path for all data processing.") - + parser.add_argument( + "--workspace_path", + type=str, + default="C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/", + help="Root path for all data processing.", + ) + # parser.add_argument("--atl03_path", type=str, default='Dataset/ATL03_ICESat2/New/', # help="Base path for ICESat-2 ATL03 data.") @@ -81,218 +86,379 @@ def get_args(): # help="Base path for ICESat-2 ATL03 data.") # Wax Delta (v007) — required for IR/AP filter sensitivity test - parser.add_argument("--atl03_path", type=str, default='Dataset/ATL03_ICESat2/Wax_Delta/', - help="Base path for ICESat-2 ATL03 data.") - - - + parser.add_argument( + "--atl03_path", + type=str, + default="Dataset/ATL03_ICESat2/Wax_Delta/", + help="Base path for ICESat-2 ATL03 data.", + ) + # #ChesapeakeBay # parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20230825074121_10102002_006_02.h5", - # help="Name of the ATL03 H5 file to process.") - + # help="Name of the ATL03 H5 file to process.") + # parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20221001113813_01641706_006_01.h5", # help="Name of the ATL03 H5 file to process.") - + # # India: processed_ATL03_20200331204156_00810707_006_01.h5 # parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20200331204156_00810707_006_01.h5", # help="Name of the ATL03 H5 file to process.") - + # #ATL03_20210628230109_00811207_006_01_subsetted - + # parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20210628230109_00811207_006_01.h5", # help="Name of the ATL03 H5 file to process.") - - + # # WaxDelta # parser.add_argument("--atl03_file", type=str, default="ATL03_20231103172707_06982106_006_01_subsetted.h5", # help="Name of the ATL03 H5 file to process.") - - + # # Bohai (v006) — baseline sensitivity test site # 
parser.add_argument("--atl03_file", type=str, default="ATL03_20190829161305_09560402_006_02_subsetted.h5", # help="Name of the ATL03 H5 file to process.") # Wax Delta (v007) — required for IR/AP filter; use with atl03_path = Dataset/ATL03_ICESat2/Wax_Delta/ - parser.add_argument("--atl03_file", type=str, default="ATL03_20231103172707_06982106_007_01_subsetted.h5", - help="Name of the ATL03 H5 file to process.") + parser.add_argument( + "--atl03_file", + type=str, + default="ATL03_20231103172707_06982106_007_01_subsetted.h5", + help="Name of the ATL03 H5 file to process.", + ) # # Cook Inlet # parser.add_argument("--atl03_file", type=str, default="ATL03_20220507112145_06931503_006_01_subsetted.h5", # help="Name of the ATL03 H5 file to process.") - - - + # #Core Sound # parser.add_argument("--atl03_file", type=str, default="105430185_ATL03_20220423191113_04841506_006_02_subsetted.h5", # help="Name of the ATL03 H5 file to process.") - + # # Pamlico Sound # parser.add_argument("--atl03_file", type=str, default="ATL03_20240415083629_04232306_006_01_subsetted.h5", # help="Name of the ATL03 H5 file to process.") - - + # # #Cook Inlet: processed_ATL03_20210801125753_05941205_005_01 # parser.add_argument("--atl03_file", type=str, default="processed_ATL03_20210801125753_05941205_005_01.h5", # help="Name of the ATL03 H5 file to process.") - - parser.add_argument("--other_data_path", type=str, default='Dataset/', - help="Path to the current working directory.") - - parser.add_argument("--output_path", type=str, default='Results/', - help="Directory for saving results.") - + + parser.add_argument( + "--other_data_path", + type=str, + default="Dataset/", + help="Path to the current working directory.", + ) + + parser.add_argument( + "--output_path", + type=str, + default="Results/", + help="Directory for saving results.", + ) + # Shoreline data - parser.add_argument("--shoreline_data", type=str, - # default='Shorelines/GeoPkgGlobalShoreline.gpkg', - 
default='Shorelines/ne_10m_land/ne_10m_land.shp', - help="Path to the global shoreline dataset.") - + parser.add_argument( + "--shoreline_data", + type=str, + # default='Shorelines/GeoPkgGlobalShoreline.gpkg', + default="Shorelines/ne_10m_land/ne_10m_land.shp", + help="Path to the global shoreline dataset.", + ) + # Bathymetry datasets - parser.add_argument("--gebco_path", nargs='+', type=str, - default='Bathy/gebco_2024_geotiff/', - help="Path to GEBCO bathymetry datasets.") - + parser.add_argument( + "--gebco_path", + nargs="+", + type=str, + default="Bathy/gebco_2024_geotiff/", + help="Path to GEBCO bathymetry datasets.", + ) + # Resolution settings - parser.add_argument("--horizontal_res", type=int, default=500, - help="Horizontal resolution for the analysis (in meters).") - - parser.add_argument("--vertical_res", type=float, default=0.25, - help="Vertical resolution for the analysis.") - + parser.add_argument( + "--horizontal_res", + type=int, + default=500, + help="Horizontal resolution for the analysis (in meters).", + ) + + parser.add_argument( + "--vertical_res", + type=float, + default=0.25, + help="Vertical resolution for the analysis.", + ) + # Analysis parameters - parser.add_argument("--subsurface_thresh", type=float, default=1.0, - help="Threshold for photons below the sea surface.") - - parser.add_argument("--ignore_subsurface_height_thres", type=float, default=-6, - help="Maximum depth thres for Kd calculations (in meters).") #5m + parser.add_argument( + "--subsurface_thresh", + type=float, + default=1.0, + help="Threshold for photons below the sea surface.", + ) + + parser.add_argument( + "--ignore_subsurface_height_thres", + type=float, + default=-6, + help="Maximum depth thres for Kd calculations (in meters).", + ) # 5m # Empty string '' auto-selects the first strong beam. Pass comma-separated IDs to override. # Valid IDs: gt1l, gt1r, gt2l, gt2r, gt3l, gt3r — only strong beams for the granule are kept. 
# To process all strong beams, use --target_beams "gt1r,gt2r,gt3r". - parser.add_argument("--target_beams", type=str, default='', - help="Comma-separated beam IDs to process. Empty means auto-select first strong beam.") - - parser.add_argument("--disable_solar_background_filter", action="store_true", - help="Disable solar background filtering (enabled by default; " - "self-gates on solar elevation so has no effect on nighttime passes).") - parser.add_argument("--solar_elevation_day_threshold", type=float, default=6.0, - help="Solar elevation threshold (deg) to define daytime for optional filtering.") - parser.add_argument("--solar_bg_median_window_deg", type=float, default=10.0, - help="Rolling median window width (degrees of solar elevation) for " - "the solar-elevation-aware background filter.") - parser.add_argument("--solar_bg_noise_multiplier", type=float, default=1.5, - help="Ratio of actual/expected background rate above which a photon " - "is flagged as noisy by the solar background filter.") - parser.add_argument("--solar_background_min_signal_conf", type=int, default=2, - help="Minimum photon signal confidence kept in high daytime background.") - - parser.add_argument("--enable_ir_ap_filter", action="store_true", - help="Optional: remove photons using IR/afterpulse proxy flags.") - parser.add_argument("--ir_ap_quality_max", type=int, default=0, - help="Maximum allowed quality_ph value when IR/AP filter is enabled.") - parser.add_argument("--ir_ap_min_signal_conf", type=int, default=0, - help="Minimum allowed photon_conf value when IR/AP filter is enabled.") - - parser.add_argument("--enable_gebco_filter", action="store_true", - help="Optional: remove photons below GEBCO seafloor and shallow bins.") - parser.add_argument("--enable_sea_surface_flattening", action="store_true", - help="Optional: flatten small-bin sea-surface variation using larger along-track windows.") - parser.add_argument("--sea_surface_flattening_window_m", type=int, default=5000, - 
help="Window size in meters for sea-surface flattening mean level. " - "Must be larger than horizontal_res to have any effect.") - - parser.add_argument("--enable_convex_hull_filter", action="store_true", - help="Optional: apply convex hull area filtering before Kd fitting.") - parser.add_argument("--convex_hull_area_threshold", type=float, default=100, - help="Minimum convex hull area to keep a horizontal bin when enabled.") - - parser.add_argument("--enable_histogram_quality_filter", action="store_true", - help="Optional: discard along-track bins with weak 6-7 m signal.") - parser.add_argument("--histogram_quality_min_ratio", type=float, default=0.05, - help="Minimum ratio of photons in depth band to keep a bin.") - parser.add_argument("--histogram_quality_depth_min", type=float, default=6.0, - help="Minimum depth (m below surface) of quality-check band.") - parser.add_argument("--histogram_quality_depth_max", type=float, default=7.0, - help="Maximum depth (m below surface) of quality-check band.") - parser.add_argument("--histogram_quality_ref_depth_min", type=float, default=0.0, - help="Min depth (m) of near-surface reference band for decay ratio.") - parser.add_argument("--histogram_quality_ref_depth_max", type=float, default=1.0, - help="Max depth (m) of near-surface reference band for decay ratio.") - - parser.add_argument("--enable_surface_sigma_filter", action="store_true", - help="Optional: discard bins where Gaussian surface sigma is too large.") - parser.add_argument("--surface_sigma_max", type=float, default=0.5, - help="Maximum allowed Gaussian sigma (m) for sea-surface peak.") - - parser.add_argument("--enable_refraction_correction", action="store_true", - help="Optional: apply refraction correction to subsurface photons.") - parser.add_argument("--refraction_water_temp_c", type=float, default=20.0, - help="Water temperature (deg C) used for refraction correction.") - parser.add_argument("--refraction_wavelength_nm", type=float, default=532.0, - 
help="Laser wavelength (nm) used for refraction correction.") - - parser.add_argument("--enable_post_refraction_refit", action="store_true", - help="Optional: rebuild histograms and re-fit surface after refraction correction.") - - parser.add_argument("--enable_atl24_filter", action="store_true", - help="Optional: use ATL24 bathymetry matching before GEBCO fallback.") - parser.add_argument("--atl24_file", type=str, default='', - help="Path to ATL24 point dataset (CSV/Parquet/GPKG/SHP) for bathymetry matching.") - parser.add_argument("--atl24_max_match_distance_deg", type=float, default=0.01, - help="Maximum lon/lat degree distance for ATL24 nearest-point matching.") - - parser.add_argument("--decay_zone_threshold", type=float, default=0.0, - help="Fraction of peak photon count below which depth bins are excluded " - "in Step 16 decay zone detection. 0.0 = original behaviour (only " - "zero-count bins removed). E.g. 0.01 removes bins with < 1%% of peak.") - - parser.add_argument("--kd_fit_method", type=str, default="log_linear", - choices=["log_linear", "bg_subtract", "breakpoint", "nonlinear", "hybrid"], - help="Beer's Law fitting strategy for Kd calculation. " - "log_linear = original log-space linear regression; " - "bg_subtract = subtract noise floor then log-linear; " - "breakpoint = segmented regression with breakpoint detection; " - "nonlinear = fit C(z) = A*exp(-Kd*z) + N directly.") - - parser.add_argument("--enable_wave_adaptive_fit", action="store_true", - help="Optional: trim shallow depth bins by wave amplitude before Beer's Law fit. " - "Uses per-bin surface sigma to skip the top k*sigma metres in wavy bins.") - parser.add_argument("--wave_exclusion_multiplier", type=float, default=4.0, - help="Number of surface sigmas to skip from the actual water surface when " - "wave-adaptive fit is enabled. The adaptive surface detection already " - "removes 3*sigma; this adds (k-3)*sigma additional margin for bubble " - "injection zone. 
Default 4.0 adds 1*sigma beyond the 3-sigma cutoff.") - parser.add_argument("--wave_sigma_calm_threshold", type=float, default=0.1, - help="Surface sigma (m) below which a bin is considered calm and no " - "wave trimming is applied. Default 0.1 m.") - - parser.add_argument("--enable_paired_beam_combine", action="store_true", - help="Optional: combine paired beams (gt1, gt2, gt3) before Kd fitting.") - - parser.add_argument("--no_plot", action="store_true", - help="Suppress all plot generation (useful for batch runs).") + parser.add_argument( + "--target_beams", + type=str, + default="", + help="Comma-separated beam IDs to process. Empty means auto-select first strong beam.", + ) + + parser.add_argument( + "--disable_solar_background_filter", + action="store_true", + help="Disable solar background filtering (enabled by default; " + "self-gates on solar elevation so has no effect on nighttime passes).", + ) + parser.add_argument( + "--solar_elevation_day_threshold", + type=float, + default=6.0, + help="Solar elevation threshold (deg) to define daytime for optional filtering.", + ) + parser.add_argument( + "--solar_bg_median_window_deg", + type=float, + default=10.0, + help="Rolling median window width (degrees of solar elevation) for " + "the solar-elevation-aware background filter.", + ) + parser.add_argument( + "--solar_bg_noise_multiplier", + type=float, + default=1.5, + help="Ratio of actual/expected background rate above which a photon " + "is flagged as noisy by the solar background filter.", + ) + parser.add_argument( + "--solar_background_min_signal_conf", + type=int, + default=2, + help="Minimum photon signal confidence kept in high daytime background.", + ) + + parser.add_argument( + "--enable_ir_ap_filter", + action="store_true", + help="Optional: remove photons using IR/afterpulse proxy flags.", + ) + parser.add_argument( + "--ir_ap_quality_max", + type=int, + default=0, + help="Maximum allowed quality_ph value when IR/AP filter is enabled.", + ) + 
parser.add_argument( + "--ir_ap_min_signal_conf", + type=int, + default=0, + help="Minimum allowed photon_conf value when IR/AP filter is enabled.", + ) + + parser.add_argument( + "--enable_gebco_filter", + action="store_true", + help="Optional: remove photons below GEBCO seafloor and shallow bins.", + ) + parser.add_argument( + "--enable_sea_surface_flattening", + action="store_true", + help="Optional: flatten small-bin sea-surface variation using larger along-track windows.", + ) + parser.add_argument( + "--sea_surface_flattening_window_m", + type=int, + default=5000, + help="Window size in meters for sea-surface flattening mean level. " + "Must be larger than horizontal_res to have any effect.", + ) + + parser.add_argument( + "--enable_convex_hull_filter", + action="store_true", + help="Optional: apply convex hull area filtering before Kd fitting.", + ) + parser.add_argument( + "--convex_hull_area_threshold", + type=float, + default=100, + help="Minimum convex hull area to keep a horizontal bin when enabled.", + ) + + parser.add_argument( + "--enable_histogram_quality_filter", + action="store_true", + help="Optional: discard along-track bins with weak 6-7 m signal.", + ) + parser.add_argument( + "--histogram_quality_min_ratio", + type=float, + default=0.05, + help="Minimum ratio of photons in depth band to keep a bin.", + ) + parser.add_argument( + "--histogram_quality_depth_min", + type=float, + default=6.0, + help="Minimum depth (m below surface) of quality-check band.", + ) + parser.add_argument( + "--histogram_quality_depth_max", + type=float, + default=7.0, + help="Maximum depth (m below surface) of quality-check band.", + ) + parser.add_argument( + "--histogram_quality_ref_depth_min", + type=float, + default=0.0, + help="Min depth (m) of near-surface reference band for decay ratio.", + ) + parser.add_argument( + "--histogram_quality_ref_depth_max", + type=float, + default=1.0, + help="Max depth (m) of near-surface reference band for decay ratio.", + ) + + 
parser.add_argument( + "--enable_surface_sigma_filter", + action="store_true", + help="Optional: discard bins where Gaussian surface sigma is too large.", + ) + parser.add_argument( + "--surface_sigma_max", + type=float, + default=0.5, + help="Maximum allowed Gaussian sigma (m) for sea-surface peak.", + ) + + parser.add_argument( + "--enable_refraction_correction", + action="store_true", + help="Optional: apply refraction correction to subsurface photons.", + ) + parser.add_argument( + "--refraction_water_temp_c", + type=float, + default=20.0, + help="Water temperature (deg C) used for refraction correction.", + ) + parser.add_argument( + "--refraction_wavelength_nm", + type=float, + default=532.0, + help="Laser wavelength (nm) used for refraction correction.", + ) + + parser.add_argument( + "--enable_post_refraction_refit", + action="store_true", + help="Optional: rebuild histograms and re-fit surface after refraction correction.", + ) + + parser.add_argument( + "--enable_atl24_filter", + action="store_true", + help="Optional: use ATL24 bathymetry matching before GEBCO fallback.", + ) + parser.add_argument( + "--atl24_file", + type=str, + default="", + help="Path to ATL24 point dataset (CSV/Parquet/GPKG/SHP) for bathymetry matching.", + ) + parser.add_argument( + "--atl24_max_match_distance_deg", + type=float, + default=0.01, + help="Maximum lon/lat degree distance for ATL24 nearest-point matching.", + ) + + parser.add_argument( + "--decay_zone_threshold", + type=float, + default=0.0, + help="Fraction of peak photon count below which depth bins are excluded " + "in Step 16 decay zone detection. 0.0 = original behaviour (only " + "zero-count bins removed). E.g. 0.01 removes bins with < 1%% of peak.", + ) + + parser.add_argument( + "--kd_fit_method", + type=str, + default="log_linear", + choices=["log_linear", "bg_subtract", "breakpoint", "nonlinear", "hybrid"], + help="Beer's Law fitting strategy for Kd calculation. 
" + "log_linear = original log-space linear regression; " + "bg_subtract = subtract noise floor then log-linear; " + "breakpoint = segmented regression with breakpoint detection; " + "nonlinear = fit C(z) = A*exp(-Kd*z) + N directly.", + ) + + parser.add_argument( + "--enable_wave_adaptive_fit", + action="store_true", + help="Optional: trim shallow depth bins by wave amplitude before Beer's Law fit. " + "Uses per-bin surface sigma to skip the top k*sigma metres in wavy bins.", + ) + parser.add_argument( + "--wave_exclusion_multiplier", + type=float, + default=4.0, + help="Number of surface sigmas to skip from the actual water surface when " + "wave-adaptive fit is enabled. The adaptive surface detection already " + "removes 3*sigma; this adds (k-3)*sigma additional margin for bubble " + "injection zone. Default 4.0 adds 1*sigma beyond the 3-sigma cutoff.", + ) + parser.add_argument( + "--wave_sigma_calm_threshold", + type=float, + default=0.1, + help="Surface sigma (m) below which a bin is considered calm and no " + "wave trimming is applied. 
Default 0.1 m.", + ) + + parser.add_argument( + "--enable_paired_beam_combine", + action="store_true", + help="Optional: combine paired beams (gt1, gt2, gt3) before Kd fitting.", + ) + + parser.add_argument( + "--no_plot", + action="store_true", + help="Suppress all plot generation (useful for batch runs).", + ) # switch on consider the coastal water or inland water # if Inland water, we should use one mask and for coastal water, it should be another mask - + # consider bathymetry or not manual setting - - - #night vs daytime - - #solar elevation detemine remove or not for solar background - - - # 1-2 hours - - + + # night vs daytime + + # solar elevation detemine remove or not for solar background + + # 1-2 hours + return parser.parse_args() + if __name__ == "__main__": args = get_args() # Access parameters like this: atl03_file_path = args.workspace_path + args.atl03_file - + print("Processing file:", atl03_file_path) print("Output path:", args.output_path) diff --git a/aok/core/kd_utils/data_processing.py b/aok/core/kd_utils/data_processing.py index 6d5b28e..8860c99 100644 --- a/aok/core/kd_utils/data_processing.py +++ b/aok/core/kd_utils/data_processing.py @@ -1,42 +1,28 @@ # utils/data_processing.py -import re -import os import io -import time -import math -import h5py import logging -import netCDF4 -import numpy as np +import math +import os +import re + import geopandas as gpd +import h5py +# from pyproj import Transformer +import numpy as np import pandas as pd -from datetime import datetime -import matplotlib.pyplot as plt -from sklearn.preprocessing import StandardScaler, MinMaxScaler +from pyproj import Proj, Transformer from scipy.spatial import ConvexHull -import argparse -import subprocess # import fiona -import utm -import pyproj -from pyproj import Transformer, Proj - -# from pyproj import Transformer -from matplotlib.widgets import LassoSelector -from matplotlib.path import Path - -from sklearn.cluster import DBSCAN from .interpolation import * - 
logger = logging.getLogger(__name__) -#this function from icesat2_toolkit +# this function from icesat2_toolkit # PURPOSE: read ICESat-2 ATL03 HDF5 data files def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): """ @@ -62,9 +48,9 @@ def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): """ # Open the HDF5 file for reading if isinstance(FILENAME, io.IOBase): - fileID = h5py.File(FILENAME, 'r') + fileID = h5py.File(FILENAME, "r") else: - fileID = h5py.File(os.path.expanduser(FILENAME), 'r') + fileID = h5py.File(os.path.expanduser(FILENAME), "r") # Output HDF5 file information logging.info(fileID.filename) @@ -76,12 +62,12 @@ def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): # read each input beam within the file IS2_atl03_beams = [] - for gtx in [k for k in fileID.keys() if bool(re.match(r'gt\d[lr]',k))]: + for gtx in [k for k in fileID.keys() if bool(re.match(r"gt\d[lr]", k))]: # check if subsetted beam contains data # check in both the geolocation and heights groups try: - fileID[gtx]['geolocation']['segment_id'] - fileID[gtx]['heights']['delta_time'] + fileID[gtx]["geolocation"]["segment_id"] + fileID[gtx]["heights"]["delta_time"] except KeyError: pass else: @@ -89,82 +75,81 @@ def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): # for each included beam for gtx in IS2_atl03_beams: - # ------------------------------------------- # 1. make sure the beam-level dict exists IS2_atl03_attrs.setdefault(gtx, {}) # 2. 
always save the two “must-have” attributes - for key in ('atlas_beam_type', 'atlas_spot_number'): + for key in ("atlas_beam_type", "atlas_spot_number"): IS2_atl03_attrs[gtx][key] = fileID[gtx].attrs[key] - + # get each HDF5 variable IS2_atl03_mds[gtx] = {} - IS2_atl03_mds[gtx]['heights'] = {} - IS2_atl03_mds[gtx]['geolocation'] = {} - IS2_atl03_mds[gtx]['bckgrd_atlas'] = {} - IS2_atl03_mds[gtx]['geophys_corr'] = {} + IS2_atl03_mds[gtx]["heights"] = {} + IS2_atl03_mds[gtx]["geolocation"] = {} + IS2_atl03_mds[gtx]["bckgrd_atlas"] = {} + IS2_atl03_mds[gtx]["geophys_corr"] = {} # ICESat-2 Measurement Group - for key,val in fileID[gtx]['heights'].items(): - IS2_atl03_mds[gtx]['heights'][key] = val[:] + for key, val in fileID[gtx]["heights"].items(): + IS2_atl03_mds[gtx]["heights"][key] = val[:] # ICESat-2 Geolocation Group - for key,val in fileID[gtx]['geolocation'].items(): - IS2_atl03_mds[gtx]['geolocation'][key] = val[:] + for key, val in fileID[gtx]["geolocation"].items(): + IS2_atl03_mds[gtx]["geolocation"][key] = val[:] # ICESat-2 Background Photon Rate Group - for key,val in fileID[gtx]['bckgrd_atlas'].items(): - IS2_atl03_mds[gtx]['bckgrd_atlas'][key] = val[:] + for key, val in fileID[gtx]["bckgrd_atlas"].items(): + IS2_atl03_mds[gtx]["bckgrd_atlas"][key] = val[:] # ICESat-2 Geophysical Corrections Group: Values for tides (ocean, # solid earth, pole, load, and equilibrium), inverted barometer (IB) # effects, and range corrections for tropospheric delays - for key,val in fileID[gtx]['geophys_corr'].items(): - IS2_atl03_mds[gtx]['geophys_corr'][key] = val[:] + for key, val in fileID[gtx]["geophys_corr"].items(): + IS2_atl03_mds[gtx]["geophys_corr"][key] = val[:] # Getting attributes of included variables if ATTRIBUTES: # Getting attributes of IS2_atl03_mds beam variables IS2_atl03_attrs[gtx] = {} - IS2_atl03_attrs[gtx]['heights'] = {} - IS2_atl03_attrs[gtx]['geolocation'] = {} - IS2_atl03_attrs[gtx]['bckgrd_atlas'] = {} - IS2_atl03_attrs[gtx]['geophys_corr'] = {} 
- + IS2_atl03_attrs[gtx]["heights"] = {} + IS2_atl03_attrs[gtx]["geolocation"] = {} + IS2_atl03_attrs[gtx]["bckgrd_atlas"] = {} + IS2_atl03_attrs[gtx]["geophys_corr"] = {} + # Global Group Attributes - for att_name,att_val in fileID[gtx].attrs.items(): + for att_name, att_val in fileID[gtx].attrs.items(): IS2_atl03_attrs[gtx][att_name] = att_val # ICESat-2 Measurement Group - for key,val in fileID[gtx]['heights'].items(): - IS2_atl03_attrs[gtx]['heights'][key] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]['heights'][key][att_name]=att_val + for key, val in fileID[gtx]["heights"].items(): + IS2_atl03_attrs[gtx]["heights"][key] = {} + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]["heights"][key][att_name] = att_val # ICESat-2 Geolocation Group - for key,val in fileID[gtx]['geolocation'].items(): - IS2_atl03_attrs[gtx]['geolocation'][key] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]['geolocation'][key][att_name]=att_val + for key, val in fileID[gtx]["geolocation"].items(): + IS2_atl03_attrs[gtx]["geolocation"][key] = {} + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]["geolocation"][key][att_name] = att_val # ICESat-2 Background Photon Rate Group - for key,val in fileID[gtx]['bckgrd_atlas'].items(): - IS2_atl03_attrs[gtx]['bckgrd_atlas'][key] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]['bckgrd_atlas'][key][att_name]=att_val + for key, val in fileID[gtx]["bckgrd_atlas"].items(): + IS2_atl03_attrs[gtx]["bckgrd_atlas"][key] = {} + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]["bckgrd_atlas"][key][att_name] = att_val # ICESat-2 Geophysical Corrections Group - for key,val in fileID[gtx]['geophys_corr'].items(): - IS2_atl03_attrs[gtx]['geophys_corr'][key] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]['geophys_corr'][key][att_name]=att_val + for key, val in fileID[gtx]["geophys_corr"].items(): + 
IS2_atl03_attrs[gtx]["geophys_corr"][key] = {} + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs[gtx]["geophys_corr"][key][att_name] = att_val # ICESat-2 spacecraft orientation at time - IS2_atl03_mds['orbit_info'] = {} - IS2_atl03_attrs['orbit_info'] = {} - for key,val in fileID['orbit_info'].items(): - IS2_atl03_mds['orbit_info'][key] = val[:] + IS2_atl03_mds["orbit_info"] = {} + IS2_atl03_attrs["orbit_info"] = {} + for key, val in fileID["orbit_info"].items(): + IS2_atl03_mds["orbit_info"][key] = val[:] # Getting attributes of group and included variables if ATTRIBUTES: # Global Group Attributes - for att_name,att_val in fileID['orbit_info'].attrs.items(): - IS2_atl03_attrs['orbit_info'][att_name] = att_val + for att_name, att_val in fileID["orbit_info"].attrs.items(): + IS2_atl03_attrs["orbit_info"][att_name] = att_val # Variable Attributes - IS2_atl03_attrs['orbit_info'][key] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs['orbit_info'][key][att_name] = att_val + IS2_atl03_attrs["orbit_info"][key] = {} + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs["orbit_info"][key][att_name] = att_val # information ancillary to the data product # number of GPS seconds between the GPS epoch (1980-01-06T00:00:00Z UTC) @@ -172,91 +157,108 @@ def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): # Add this value to delta time parameters to compute full gps_seconds # could alternatively use the Julian day of the ATLAS SDP epoch: 2458119.5 # and add leap seconds since 2018-01-01T00:00:00Z UTC (ATLAS SDP epoch) - IS2_atl03_mds['ancillary_data'] = {} - IS2_atl03_attrs['ancillary_data'] = {} - ancillary_keys = ['atlas_sdp_gps_epoch','data_end_utc','data_start_utc', - 'end_cycle','end_geoseg','end_gpssow','end_gpsweek','end_orbit', - 'end_region','end_rgt','granule_end_utc','granule_start_utc','release', - 'start_cycle','start_geoseg','start_gpssow','start_gpsweek', - 'start_orbit','start_region','start_rgt','version'] + 
IS2_atl03_mds["ancillary_data"] = {} + IS2_atl03_attrs["ancillary_data"] = {} + ancillary_keys = [ + "atlas_sdp_gps_epoch", + "data_end_utc", + "data_start_utc", + "end_cycle", + "end_geoseg", + "end_gpssow", + "end_gpsweek", + "end_orbit", + "end_region", + "end_rgt", + "granule_end_utc", + "granule_start_utc", + "release", + "start_cycle", + "start_geoseg", + "start_gpssow", + "start_gpsweek", + "start_orbit", + "start_region", + "start_rgt", + "version", + ] for key in ancillary_keys: # get each HDF5 variable - IS2_atl03_mds['ancillary_data'][key] = fileID['ancillary_data'][key][:] + IS2_atl03_mds["ancillary_data"][key] = fileID["ancillary_data"][key][:] # Getting attributes of group and included variables if ATTRIBUTES: # Variable Attributes - IS2_atl03_attrs['ancillary_data'][key] = {} - for att_name,att_val in fileID['ancillary_data'][key].attrs.items(): - IS2_atl03_attrs['ancillary_data'][key][att_name] = att_val + IS2_atl03_attrs["ancillary_data"][key] = {} + for att_name, att_val in fileID["ancillary_data"][key].attrs.items(): + IS2_atl03_attrs["ancillary_data"][key][att_name] = att_val # transmit-echo-path (tep) parameters - IS2_atl03_mds['ancillary_data']['tep'] = {} - IS2_atl03_attrs['ancillary_data']['tep'] = {} - for key,val in fileID['ancillary_data']['tep'].items(): + IS2_atl03_mds["ancillary_data"]["tep"] = {} + IS2_atl03_attrs["ancillary_data"]["tep"] = {} + for key, val in fileID["ancillary_data"]["tep"].items(): # get each HDF5 variable - IS2_atl03_mds['ancillary_data']['tep'][key] = val[:] + IS2_atl03_mds["ancillary_data"]["tep"][key] = val[:] # Getting attributes of group and included variables if ATTRIBUTES: # Variable Attributes - IS2_atl03_attrs['ancillary_data']['tep'][key] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs['ancillary_data']['tep'][key][att_name] = att_val + IS2_atl03_attrs["ancillary_data"]["tep"][key] = {} + for att_name, att_val in val.attrs.items(): + 
IS2_atl03_attrs["ancillary_data"]["tep"][key][att_name] = att_val # channel dead time and first photon bias derived from ATLAS calibration - cal1,cal2 = ('ancillary_data','calibrations') - for var in ['dead_time','first_photon_bias']: + cal1, cal2 = ("ancillary_data", "calibrations") + for var in ["dead_time", "first_photon_bias"]: IS2_atl03_mds[cal1][var] = {} IS2_atl03_attrs[cal1][var] = {} - for key,val in fileID[cal1][cal2][var].items(): + for key, val in fileID[cal1][cal2][var].items(): # get each HDF5 variable if isinstance(val, h5py.Dataset): IS2_atl03_mds[cal1][var][key] = val[:] elif isinstance(val, h5py.Group): IS2_atl03_mds[cal1][var][key] = {} - for k,v in val.items(): + for k, v in val.items(): IS2_atl03_mds[cal1][var][key][k] = v[:] # Getting attributes of group and included variables if ATTRIBUTES: # Variable Attributes IS2_atl03_attrs[cal1][var][key] = {} - for att_name,att_val in val.attrs.items(): + for att_name, att_val in val.attrs.items(): IS2_atl03_attrs[cal1][var][key][att_name] = att_val if isinstance(val, h5py.Group): - for k,v in val.items(): + for k, v in val.items(): IS2_atl03_attrs[cal1][var][key][k] = {} - for att_name,att_val in val.attrs.items(): - IS2_atl03_attrs[cal1][var][key][k][att_name]=att_val + for att_name, att_val in val.attrs.items(): + IS2_atl03_attrs[cal1][var][key][k][att_name] = att_val # get ATLAS impulse response variables for the transmitter echo path (TEP) - tep1,tep2 = ('atlas_impulse_response','tep_histogram') + tep1, tep2 = ("atlas_impulse_response", "tep_histogram") IS2_atl03_mds[tep1] = {} IS2_atl03_attrs[tep1] = {} - for pce in ['pce1_spot1','pce2_spot3']: - IS2_atl03_mds[tep1][pce] = {tep2:{}} - IS2_atl03_attrs[tep1][pce] = {tep2:{}} + for pce in ["pce1_spot1", "pce2_spot3"]: + IS2_atl03_mds[tep1][pce] = {tep2: {}} + IS2_atl03_attrs[tep1][pce] = {tep2: {}} # for each TEP variable - for key,val in fileID[tep1][pce][tep2].items(): + for key, val in fileID[tep1][pce][tep2].items(): 
IS2_atl03_mds[tep1][pce][tep2][key] = val[:] # Getting attributes of included variables if ATTRIBUTES: # Global Group Attributes - for att_name,att_val in fileID[tep1][pce][tep2].attrs.items(): + for att_name, att_val in fileID[tep1][pce][tep2].attrs.items(): IS2_atl03_attrs[tep1][pce][tep2][att_name] = att_val # Variable Attributes IS2_atl03_attrs[tep1][pce][tep2][key] = {} - for att_name,att_val in val.attrs.items(): + for att_name, att_val in val.attrs.items(): IS2_atl03_attrs[tep1][pce][tep2][key][att_name] = att_val # Global File Attributes if ATTRIBUTES: - for att_name,att_val in fileID.attrs.items(): + for att_name, att_val in fileID.attrs.items(): IS2_atl03_attrs[att_name] = att_val # Closing the HDF5 file fileID.close() # Return the datasets and variables - return (IS2_atl03_mds,IS2_atl03_attrs,IS2_atl03_beams) - + return (IS2_atl03_mds, IS2_atl03_attrs, IS2_atl03_beams) # convert_wgs_to_utm function, see https://stackoverflow.com/a/40140326/4556479 @@ -264,22 +266,22 @@ def convert_wgs_to_utm(lon: float, lat: float): """Based on lat and lng, return best utm epsg-code""" utm_band = str((math.floor((lon + 180) / 6) % 60) + 1) if len(utm_band) == 1: - utm_band = '0' + utm_band + utm_band = "0" + utm_band if lat >= 0: - epsg_code = 'epsg:326' + utm_band + epsg_code = "epsg:326" + utm_band return epsg_code - epsg_code = 'epsg:327' + utm_band + epsg_code = "epsg:327" + utm_band return epsg_code def orthometric_correction(lat, lon, Z, epsg): # Define the Proj string - #To transform from WGS84 ellipsoidal height + # To transform from WGS84 ellipsoidal height # to EGM2008 orthometric height using PyProj # proj_string = '+proj=latlong +ellps=WGS84 +datum=WGS84 +vunits=m +no_defs +geoidgrids=egm2008-1.gtx' # # Define the Proj string for WGS84 ellipsoidal height # wgs84_proj_string = '+proj=latlong +ellps=WGS84 +datum=WGS84 +no_defs' - + # # Define the Proj string for EGM2008 orthometric height: egm08_25,egm2008-1 # egm2008_proj_string = \ # '+proj=latlong 
+ellps=WGS84 +datum=WGS84 +no_defs ' \ @@ -297,7 +299,6 @@ def orthometric_correction(lat, lon, Z, epsg): return Y_utm, X_utm, Z_egm08 - def load_data(file_path, read_attributes=False): """ Wrapper around read_granule that lets you decide @@ -313,10 +314,8 @@ def load_data(file_path, read_attributes=False): If True, read_granule returns the full IS2_atl03_attrs tree. Defaults to False. """ - - - return read_granule(file_path, ATTRIBUTES = read_attributes) + return read_granule(file_path, ATTRIBUTES=read_attributes) # requires that the input gdf has ranged index values i @@ -327,16 +326,15 @@ def load_data(file_path, read_attributes=False): def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): # try loading the shoreline data try: - ICESat2_GDF.insert(0, 'lat', ICESat2_GDF.geometry.y, False) - ICESat2_GDF.insert(0, 'lon', ICESat2_GDF.geometry.x, False) + ICESat2_GDF.insert(0, "lat", ICESat2_GDF.geometry.y, False) + ICESat2_GDF.insert(0, "lon", ICESat2_GDF.geometry.x, False) # allocation of to be used arrays zero_int_array = np.int64(np.zeros_like(ICESat2_GDF.geometry.x)) # Land flag initialized as -1 # If shorelines downloaded already, will be set to 0 or 1 - ICESat2_GDF.insert(0, 'is_land', - zero_int_array - 1, False) + ICESat2_GDF.insert(0, "is_land", zero_int_array - 1, False) # set the projection ICESat2_GDF.set_crs("EPSG:4326", inplace=True) @@ -347,13 +345,15 @@ def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): # engine str, 'fiona' or 'pyogrio' # somtime it gives error if using fiona # land_polygon_gdf = gpd.read_file(shoreline_data_path, bbox=ICESat2_GDF, engine='fiona') - land_polygon_gdf = gpd.read_file(shoreline_data_path, bbox=ICESat2_GDF, engine='pyogrio') + land_polygon_gdf = gpd.read_file( + shoreline_data_path, bbox=ICESat2_GDF, engine="pyogrio" + ) # continue with getting a new array of 0-or-1 labels for each photon land_point_labels = np.zeros_like(ICESat2_GDF.is_land.values) # update labels for points in the land polygons 
- pts_in_land = gpd.sjoin(ICESat2_GDF, land_polygon_gdf, predicate='within') + pts_in_land = gpd.sjoin(ICESat2_GDF, land_polygon_gdf, predicate="within") # get land or not bool value land_loc = ICESat2_GDF.index.isin(pts_in_land.index) @@ -365,7 +365,6 @@ def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): return land_point_labels except Exception as e: - print(e) print("Error loading shoreline data, returning -1s for is_land flag") @@ -376,37 +375,64 @@ def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): return -np.ones_like(ICESat2_GDF.is_land.values) -def create_photon_dataframe(lat_ph, lon_ph, ref_elev, ref_azimuth, geoid, h_ph, \ - quality_ph, is_land_label_interp1d, signal_conf_photon, x_atc, relative_AT_dist, - solar_elevation=None, background_rate=None): +def create_photon_dataframe( + lat_ph, + lon_ph, + ref_elev, + ref_azimuth, + geoid, + h_ph, + quality_ph, + is_land_label_interp1d, + signal_conf_photon, + x_atc, + relative_AT_dist, + solar_elevation=None, + background_rate=None, +): # Apply geoid correction to the photon heights to convert them from ellipsoidal to orthometric heights h_ph_geoid_cor = h_ph[:] - geoid[:] - - # Determine the EPSG code for the UTM zone based on the first photon's longitude and latitude + + # Determine the EPSG code for the UTM zone based on the first photon's longitude and latitude epsg_code = convert_wgs_to_utm(lon_ph[0], lat_ph[0]) - + # Perform orthometric correction to obtain UTM coordinates and corrected heights lat_utm, lon_utm, h_ph_cor = orthometric_correction(lat_ph, lon_ph, h_ph, epsg_code) - + # Put the data into the dataframe - sea_photon_dataset = pd.DataFrame({ - 'latitude': lat_ph, - 'longitude': lon_ph, - 'lat': lat_utm, - 'lon': lon_utm, - 'photon_height': h_ph_geoid_cor, - 'quality_ph': quality_ph, - 'is_land_label': is_land_label_interp1d, - 'photon_conf': signal_conf_photon, - 'ref_elevation': ref_elev, - 'ref_azimuth': ref_azimuth, - 'relative_AT_dist': relative_AT_dist - }, 
columns=['latitude', 'longitude', 'lat', 'lon', 'photon_height', 'quality_ph', 'is_land_label', 'photon_conf', 'ref_elevation', 'ref_azimuth', 'relative_AT_dist']) + sea_photon_dataset = pd.DataFrame( + { + "latitude": lat_ph, + "longitude": lon_ph, + "lat": lat_utm, + "lon": lon_utm, + "photon_height": h_ph_geoid_cor, + "quality_ph": quality_ph, + "is_land_label": is_land_label_interp1d, + "photon_conf": signal_conf_photon, + "ref_elevation": ref_elev, + "ref_azimuth": ref_azimuth, + "relative_AT_dist": relative_AT_dist, + }, + columns=[ + "latitude", + "longitude", + "lat", + "lon", + "photon_height", + "quality_ph", + "is_land_label", + "photon_conf", + "ref_elevation", + "ref_azimuth", + "relative_AT_dist", + ], + ) if solar_elevation is not None: - sea_photon_dataset['solar_elevation'] = solar_elevation + sea_photon_dataset["solar_elevation"] = solar_elevation if background_rate is not None: - sea_photon_dataset['background_rate'] = background_rate + sea_photon_dataset["background_rate"] = background_rate return sea_photon_dataset @@ -430,7 +456,13 @@ def interpolate_by_time(source_time, source_values, target_time): if unique_time.size < 2: return np.full_like(target_time, unique_values[0], dtype=float) - return np.interp(target_time, unique_time, unique_values, left=unique_values[0], right=unique_values[-1]) + return np.interp( + target_time, + unique_time, + unique_values, + left=unique_values[0], + right=unique_values[-1], + ) def apply_optional_solar_background_filter( @@ -439,7 +471,7 @@ def apply_optional_solar_background_filter( day_threshold=6.0, median_window_deg=10.0, noise_multiplier=1.5, - min_signal_conf=2 + min_signal_conf=2, ): """ Optionally filter photons under strong daytime solar background conditions. 
@@ -456,17 +488,21 @@ def apply_optional_solar_background_filter( if not enabled: return sea_photon_dataset - required_cols = {'solar_elevation', 'background_rate', 'photon_conf'} + required_cols = {"solar_elevation", "background_rate", "photon_conf"} if not required_cols.issubset(set(sea_photon_dataset.columns)): - logger.warning("Solar background filter skipped because required columns are missing.") + logger.warning( + "Solar background filter skipped because required columns are missing." + ) return sea_photon_dataset filtered_dataset = sea_photon_dataset.copy() - daytime_mask = filtered_dataset['solar_elevation'] >= day_threshold + daytime_mask = filtered_dataset["solar_elevation"] >= day_threshold # Select valid daytime photons for rolling median computation daytime = filtered_dataset.loc[daytime_mask].copy() - valid = np.isfinite(daytime['background_rate']) & np.isfinite(daytime['solar_elevation']) + valid = np.isfinite(daytime["background_rate"]) & np.isfinite( + daytime["solar_elevation"] + ) daytime = daytime.loc[valid] if len(daytime) < 10: @@ -479,65 +515,66 @@ def apply_optional_solar_background_filter( # Clamp solar elevations to physical range [DAY_THRESHOLD, 90] to avoid # HDF5 fill values (e.g. 3.4e+38) blowing up the bin array. daytime = daytime.copy() - daytime['solar_elevation'] = daytime['solar_elevation'].clip( + daytime["solar_elevation"] = daytime["solar_elevation"].clip( lower=day_threshold, upper=90.0 ) # Vectorized degree-based rolling median: # Bin elevations into fine steps, compute one median per bin using # np.searchsorted on the sorted array, then map back to photons. 
- daytime = daytime.sort_values('solar_elevation') - elevations = daytime['solar_elevation'].values - bg_rates = daytime['background_rate'].values + daytime = daytime.sort_values("solar_elevation") + elevations = daytime["solar_elevation"].values + bg_rates = daytime["background_rate"].values half_window = median_window_deg / 2.0 bin_step = 0.1 # degrees — fine enough for smooth curve - bin_centers = np.arange( - elevations[0], elevations[-1] + bin_step, bin_step - ) + bin_centers = np.arange(elevations[0], elevations[-1] + bin_step, bin_step) bin_medians = np.empty(len(bin_centers)) for i in range(len(bin_centers)): - lo = np.searchsorted(elevations, bin_centers[i] - half_window, side='left') - hi = np.searchsorted(elevations, bin_centers[i] + half_window, side='right') + lo = np.searchsorted(elevations, bin_centers[i] - half_window, side="left") + hi = np.searchsorted(elevations, bin_centers[i] + half_window, side="right") if lo < hi: bin_medians[i] = np.median(bg_rates[lo:hi]) else: bin_medians[i] = np.nan # Map bin medians back to each photon via nearest-bin lookup (vectorized) - bin_indices = np.searchsorted(bin_centers, elevations, side='right') - 1 + bin_indices = np.searchsorted(bin_centers, elevations, side="right") - 1 bin_indices = np.clip(bin_indices, 0, len(bin_centers) - 1) expected_bg = bin_medians[bin_indices] # Map expected background onto the full DataFrame. Non-daytime and # non-finite daytime rows stay NaN, so they are never flagged noisy. 
- filtered_dataset['expected_bg'] = np.nan - filtered_dataset.loc[daytime.index, 'expected_bg'] = expected_bg + filtered_dataset["expected_bg"] = np.nan + filtered_dataset.loc[daytime.index, "expected_bg"] = expected_bg # Flag noisy daytime photons noisy_daytime_mask = daytime_mask & ( - filtered_dataset['background_rate'] >= filtered_dataset['expected_bg'] * noise_multiplier + filtered_dataset["background_rate"] + >= filtered_dataset["expected_bg"] * noise_multiplier + ) + keep_mask = (~noisy_daytime_mask) | ( + filtered_dataset["photon_conf"] >= min_signal_conf ) - keep_mask = (~noisy_daytime_mask) | (filtered_dataset['photon_conf'] >= min_signal_conf) # Clean up temporary column - filtered_dataset.drop(columns=['expected_bg'], inplace=True, errors='ignore') + filtered_dataset.drop(columns=["expected_bg"], inplace=True, errors="ignore") before_count = len(filtered_dataset) filtered_dataset = filtered_dataset.loc[keep_mask].copy() after_count = len(filtered_dataset) logger.info( "Solar background filter (multiplier=%.1f) removed %s photons (from %s to %s).", - noise_multiplier, before_count - after_count, before_count, after_count + noise_multiplier, + before_count - after_count, + before_count, + after_count, ) return filtered_dataset def apply_optional_ir_ap_filter( - sea_photon_dataset, - enabled=False, - quality_max=0, - min_signal_conf=0 + sea_photon_dataset, enabled=False, quality_max=0, min_signal_conf=0 ): """ Optionally remove photons flagged as afterpulse using the quality_ph bitmask. 
@@ -559,19 +596,23 @@ def apply_optional_ir_ap_filter( filtered_dataset = sea_photon_dataset.copy() keep_mask = np.ones(len(filtered_dataset), dtype=bool) - if 'quality_ph' in filtered_dataset.columns: - quality_values = pd.to_numeric(filtered_dataset['quality_ph'], errors='coerce').fillna(0).astype(int) - afterpulse_mask = (quality_values & 1) == 1 # bit 0 set = afterpulse + if "quality_ph" in filtered_dataset.columns: + quality_values = ( + pd.to_numeric(filtered_dataset["quality_ph"], errors="coerce") + .fillna(0) + .astype(int) + ) + afterpulse_mask = (quality_values & 1) == 1 # bit 0 set = afterpulse keep_mask &= ~afterpulse_mask logger.info( "IR/AP filter: %d photons flagged as afterpulse (quality_ph bit 0) will be removed.", - int(afterpulse_mask.sum()) + int(afterpulse_mask.sum()), ) else: logger.warning("IR/AP filter enabled, but quality_ph is missing.") - if 'photon_conf' in filtered_dataset.columns: - conf_values = pd.to_numeric(filtered_dataset['photon_conf'], errors='coerce') + if "photon_conf" in filtered_dataset.columns: + conf_values = pd.to_numeric(filtered_dataset["photon_conf"], errors="coerce") keep_mask &= conf_values >= min_signal_conf else: logger.warning("IR/AP filter enabled, but photon_conf is missing.") @@ -581,7 +622,9 @@ def apply_optional_ir_ap_filter( after_count = len(filtered_dataset) logger.info( "IR/AP proxy filter removed %s photons (from %s to %s).", - before_count - after_count, before_count, after_count + before_count - after_count, + before_count, + after_count, ) return filtered_dataset @@ -601,105 +644,126 @@ def Extract_sea_photons(IS2_atl03_mds, target_strong_beams, shoreline_data_path) Segment_ref_azimuth = {} background_rate = {} background_counts = {} - + # Initialize a list to store data for each beam beam_datasets = [] - - + # Loop over each strong beam in target_strong_beams for gtx in target_strong_beams: - print('Processing strong beam ID:', gtx) - + print("Processing strong beam ID:", gtx) + # Access the data for 
the current beam IS2_val = IS2_atl03_mds[gtx] - + # Initialize dictionaries to store segment data for the beam - Segment_ID[gtx] = IS2_val['geolocation']['segment_id'] + Segment_ID[gtx] = IS2_val["geolocation"]["segment_id"] n_seg = len(Segment_ID[gtx]) - n_pe, = IS2_val['heights']['delta_time'].shape - Segment_Index_begin[gtx] = IS2_val['geolocation']['ph_index_beg'] - 1 - Segment_PE_count[gtx] = IS2_val['geolocation']['segment_ph_cnt'] - Equator_Segment_Distance[gtx] = IS2_val['geolocation']['segment_dist_x'] - Segment_Length[gtx] = IS2_val['geolocation']['segment_length'] - delta_time = IS2_val['geolocation']['delta_time'] - segment_lat = IS2_val['geolocation']['reference_photon_lat'][:].copy() - segment_lon = IS2_val['geolocation']['reference_photon_lon'][:].copy() - ref_elev = IS2_val['geolocation']['ref_elev'][:].copy() - ref_azimuth = IS2_val['geolocation']['ref_azimuth'][:].copy() - geoid = IS2_val['geophys_corr']['geoid'][:].copy() - h_ph = IS2_val['heights']['h_ph'][:].copy() - photon_delta_time = IS2_val['heights']['delta_time'][:].copy() - lat_ph = IS2_val['heights']['lat_ph'][:].copy() - lon_ph = IS2_val['heights']['lon_ph'][:].copy() - signal_conf_photon = IS2_val['heights']['signal_conf_ph'][..., 0].copy() - x_atc = IS2_val['heights']['dist_ph_along'][:].copy() - y_atc = IS2_val['heights']['dist_ph_across'][:].copy() - quality_ph = IS2_val['heights']['quality_ph'] + (n_pe,) = IS2_val["heights"]["delta_time"].shape + Segment_Index_begin[gtx] = IS2_val["geolocation"]["ph_index_beg"] - 1 + Segment_PE_count[gtx] = IS2_val["geolocation"]["segment_ph_cnt"] + Equator_Segment_Distance[gtx] = IS2_val["geolocation"]["segment_dist_x"] + Segment_Length[gtx] = IS2_val["geolocation"]["segment_length"] + delta_time = IS2_val["geolocation"]["delta_time"] + segment_lat = IS2_val["geolocation"]["reference_photon_lat"][:].copy() + segment_lon = IS2_val["geolocation"]["reference_photon_lon"][:].copy() + ref_elev = IS2_val["geolocation"]["ref_elev"][:].copy() + 
ref_azimuth = IS2_val["geolocation"]["ref_azimuth"][:].copy() + geoid = IS2_val["geophys_corr"]["geoid"][:].copy() + h_ph = IS2_val["heights"]["h_ph"][:].copy() + photon_delta_time = IS2_val["heights"]["delta_time"][:].copy() + lat_ph = IS2_val["heights"]["lat_ph"][:].copy() + lon_ph = IS2_val["heights"]["lon_ph"][:].copy() + signal_conf_photon = IS2_val["heights"]["signal_conf_ph"][..., 0].copy() + x_atc = IS2_val["heights"]["dist_ph_along"][:].copy() + y_atc = IS2_val["heights"]["dist_ph_across"][:].copy() + quality_ph = IS2_val["heights"]["quality_ph"] # Optional variables for solar background sensitivity testing photon_solar_elevation = np.full_like(photon_delta_time, np.nan, dtype=float) photon_background_rate = np.full_like(photon_delta_time, np.nan, dtype=float) - if 'solar_elevation' in IS2_val['geolocation']: - segment_solar_elevation = IS2_val['geolocation']['solar_elevation'][:].copy() + if "solar_elevation" in IS2_val["geolocation"]: + segment_solar_elevation = IS2_val["geolocation"]["solar_elevation"][ + : + ].copy() photon_solar_elevation = interpolate_by_time( delta_time, segment_solar_elevation, photon_delta_time ) - if 'bckgrd_atlas' in IS2_val and 'bckgrd_rate' in IS2_val['bckgrd_atlas'] and 'delta_time' in IS2_val['bckgrd_atlas']: - bckgrd_rate = IS2_val['bckgrd_atlas']['bckgrd_rate'][:].copy() - bckgrd_time = IS2_val['bckgrd_atlas']['delta_time'][:].copy() + if ( + "bckgrd_atlas" in IS2_val + and "bckgrd_rate" in IS2_val["bckgrd_atlas"] + and "delta_time" in IS2_val["bckgrd_atlas"] + ): + bckgrd_rate = IS2_val["bckgrd_atlas"]["bckgrd_rate"][:].copy() + bckgrd_time = IS2_val["bckgrd_atlas"]["delta_time"][:].copy() photon_background_rate = interpolate_by_time( bckgrd_time, bckgrd_rate, photon_delta_time ) - # Adjust x_atc based on segment distances + # Adjust x_atc based on segment distances for seg_index in range(n_seg): idx = Segment_Index_begin[gtx][seg_index] cnt = Segment_PE_count[gtx][seg_index] - x_atc[idx:idx + cnt] += 
Equator_Segment_Distance[gtx][seg_index] + x_atc[idx : idx + cnt] += Equator_Segment_Distance[gtx][seg_index] - # Calculate relative distances + # Calculate relative distances relative_AT_dist = (x_atc - x_atc[0]) / 1000 - relative_seg_dist = (Equator_Segment_Distance[gtx] - Equator_Segment_Distance[gtx][0]) / 1000 - + relative_seg_dist = ( + Equator_Segment_Distance[gtx] - Equator_Segment_Distance[gtx][0] + ) / 1000 + # Create a GeoDataFrame to hold segment data for shoreline check - Segment_Is_Land['geometry'] = gpd.points_from_xy(segment_lon, segment_lat) + Segment_Is_Land["geometry"] = gpd.points_from_xy(segment_lon, segment_lat) ICESat2_GDF = gpd.GeoDataFrame(Segment_Is_Land, crs="EPSG:4326") - + # Determine if it is land by the land/sea mask - Segment_Is_Land_Labels = isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF) - ICESat2_GDF.loc[:, 'is_land'] = Segment_Is_Land_Labels + Segment_Is_Land_Labels = isolate_sea_land_photons( + shoreline_data_path, ICESat2_GDF + ) + ICESat2_GDF.loc[:, "is_land"] = Segment_Is_Land_Labels # Apply interpolations for required data - is_land_label_interp1d = apply_interpolation(interpolate_labels(segment_lat, Segment_Is_Land_Labels), lat_ph) - ph_ref_elev = apply_interpolation(interpolate_labels(segment_lat, ref_elev), lat_ph) - ph_ref_azimuth = apply_interpolation(interpolate_labels(segment_lat, ref_azimuth), lat_ph) + is_land_label_interp1d = apply_interpolation( + interpolate_labels(segment_lat, Segment_Is_Land_Labels), lat_ph + ) + ph_ref_elev = apply_interpolation( + interpolate_labels(segment_lat, ref_elev), lat_ph + ) + ph_ref_azimuth = apply_interpolation( + interpolate_labels(segment_lat, ref_azimuth), lat_ph + ) ph_geoid = apply_interpolation(interpolate_labels(segment_lat, geoid), lat_ph) - - # Create photon DataFrame for the current beam - sea_photon_dataset = create_photon_dataframe(lat_ph=lat_ph, lon_ph=lon_ph, - ref_elev=ph_ref_elev,ref_azimuth=ph_ref_azimuth, - geoid=ph_geoid,h_ph=h_ph, - 
quality_ph=quality_ph, - is_land_label_interp1d=is_land_label_interp1d, - signal_conf_photon=signal_conf_photon, - x_atc=x_atc, # Note: x_atc is not used in the function. Remove if unnecessary. - relative_AT_dist=relative_AT_dist, - solar_elevation=photon_solar_elevation, - background_rate=photon_background_rate) + # Create photon DataFrame for the current beam + sea_photon_dataset = create_photon_dataframe( + lat_ph=lat_ph, + lon_ph=lon_ph, + ref_elev=ph_ref_elev, + ref_azimuth=ph_ref_azimuth, + geoid=ph_geoid, + h_ph=h_ph, + quality_ph=quality_ph, + is_land_label_interp1d=is_land_label_interp1d, + signal_conf_photon=signal_conf_photon, + x_atc=x_atc, # Note: x_atc is not used in the function. Remove if unnecessary. + relative_AT_dist=relative_AT_dist, + solar_elevation=photon_solar_elevation, + background_rate=photon_background_rate, + ) # Filter out land photons - sea_photon_dataset = sea_photon_dataset[sea_photon_dataset['is_land_label'] != 1] - + sea_photon_dataset = sea_photon_dataset[ + sea_photon_dataset["is_land_label"] != 1 + ] + # Add a new column to indicate the beam ID - sea_photon_dataset['beam_id'] = gtx - + sea_photon_dataset["beam_id"] = gtx + # Append the processed dataset for the current beam to the list beam_datasets.append(sea_photon_dataset) - + # Concatenate all beam data into a single DataFrame all_beams_dataset = pd.concat(beam_datasets, ignore_index=True) - + return all_beams_dataset @@ -708,103 +772,116 @@ def filter_photon_dataset_by_hull_area(photon_dataset, hull_area_threshold=3000) Filters photon dataset based on ConvexHull area threshold and returns the filtered dataset, convex hull areas, and convex hull points. 
""" - lat_bins_grouped = photon_dataset.groupby('lat_bins', observed=False) + lat_bins_grouped = photon_dataset.groupby("lat_bins", observed=False) filtered_dataset = photon_dataset.copy() - + convex_hulls = {} convex_hull_areas = {} - + for lat_bin, bin_data in lat_bins_grouped: if len(bin_data) >= 3: - points = bin_data[['lat', 'photon_height']].to_numpy() + points = bin_data[["lat", "photon_height"]].to_numpy() hull = ConvexHull(points) area = hull.volume convex_hull_areas[lat_bin] = area - + # Store the ConvexHull points if area meets the threshold if area >= hull_area_threshold: convex_hulls[lat_bin] = points[hull.vertices] else: # Remove bins with hull area below the threshold - filtered_dataset = filtered_dataset[filtered_dataset['lat_bins'] != lat_bin] - - return filtered_dataset, convex_hull_areas, convex_hulls - - - - - - + filtered_dataset = filtered_dataset[ + filtered_dataset["lat_bins"] != lat_bin + ] + return filtered_dataset, convex_hull_areas, convex_hulls ########################## -##Discard Functions Below +##Discard Functions Below ########################## -# Extracts key information from filenames, +# Extracts key information from filenames, # which could include processed status, product ID, timestamps, identifiers, or metadata. 
def extract_file_params(file_path): - ''' + """ Example: input "processed_ATL06_20231115120000_20231115_001_12_data.h5" - Output: ('processed_', 'ATL06', '2023', '11', '15', '12', '00', '00', + Output: ('processed_', 'ATL06', '2023', '11', '15', '12', '00', '00', '2023', '11', '15', '001', '12', '_data') - ''' + """ # Defines a regex pattern to match strings in the file path - rx = re.compile(r'(processed_)?(ATL\d{2})_(\d{4})(\d{2})(\d{2})(\d{2})' - r'(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})_(\d{3})_(\d{2})(.*?).h5$') + rx = re.compile( + r"(processed_)?(ATL\d{2})_(\d{4})(\d{2})(\d{2})(\d{2})" + r"(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})_(\d{3})_(\d{2})(.*?).h5$" + ) # Searches for all matches of the regex pattern in the given params = rx.findall(file_path).pop() return params def create_mask(binned_data, sea_surface_height, seafloor_height, threshold_height): - ''' + """ create a mask for filtering sea surface, sea floor, and threshold_height - ''' - mask = (binned_data['height'] <= sea_surface_height) & \ - (binned_data['height'] >= seafloor_height) & \ - (binned_data['height'] >= threshold_height) + """ + mask = ( + (binned_data["height"] <= sea_surface_height) + & (binned_data["height"] >= seafloor_height) + & (binned_data["height"] >= threshold_height) + ) return mask -def generate_polygons_from_binned_data(lat, height, mask, lat_interval, height_interval): - ''' +def generate_polygons_from_binned_data( + lat, height, mask, lat_interval, height_interval +): + """ horizontal_vertical_bin_dataset Generate polygons for each bin after masking within a rectangle formed by height and latitude intervals - ''' + """ # Create latitude bins based on the specified interval lat_bins = np.arange(lat.min(), lat.max() + lat_interval, lat_interval) - + # Create height bins within the range [-12, 2] based on the specified interval height_bins = np.arange(-12, 2 + height_interval, height_interval) - + # Identify which bin each masked latitude and height value belongs to lat_bin_indices 
= np.digitize(lat[mask], lat_bins) - + # Determine which height bin each masked point belongs to height_bin_indices = np.digitize(height[mask], height_bins) - + # Combine latitude and height bin indices for each point - combined_bins = list(zip(lat_bin_indices, height_bin_indices)) - - # Find unique bin combinations and count the number of points in each bin + combined_bins = list(zip(lat_bin_indices, height_bin_indices, strict=False)) + + # Find unique bin combinations and count the number of points in each bin unique_bins, counts = np.unique(combined_bins, axis=0, return_counts=True) - - # Filter bins to include only those within valid index ranges - valid_bins = [(bin[0], bin[1]) for bin in unique_bins if 1 <= bin[0] < len(lat_bins) and 1 <= bin[1] < len(height_bins)] + + # Filter bins to include only those within valid index ranges + valid_bins = [ + (bin[0], bin[1]) + for bin in unique_bins + if 1 <= bin[0] < len(lat_bins) and 1 <= bin[1] < len(height_bins) + ] polygons = [] for bin_lat, bin_height in valid_bins: - if bin_lat > 0 and bin_lat < len(lat_bins) and bin_height > 0 and bin_height < len(height_bins): - bin_points = np.array([(lat[mask][i], height[mask][i]) for i in range(sum(mask)) - if lat_bin_indices[i] == bin_lat and height_bin_indices[i] == bin_height]) + if ( + bin_lat > 0 + and bin_lat < len(lat_bins) + and bin_height > 0 + and bin_height < len(height_bins) + ): + bin_points = np.array( + [ + (lat[mask][i], height[mask][i]) + for i in range(sum(mask)) + if lat_bin_indices[i] == bin_lat + and height_bin_indices[i] == bin_height + ] + ) if len(bin_points) > 2: hull = ConvexHull(bin_points) polygons.append(bin_points[hull.vertices]) return lat_bins, height_bins, valid_bins, polygons - - - diff --git a/aok/core/kd_utils/main.py b/aok/core/kd_utils/main.py index bf046f3..51e39de 100644 --- a/aok/core/kd_utils/main.py +++ b/aok/core/kd_utils/main.py @@ -4,10 +4,9 @@ import re import sys +from config import get_args import pandas as pd -from config 
import get_args -from kd_utils.Kd_analysis import process_kd_calculation from kd_utils.bathy_processing import process_subsurface_photon_filtering from kd_utils.data_processing import ( Extract_sea_photons, @@ -16,33 +15,40 @@ filter_photon_dataset_by_hull_area, load_data, ) +from kd_utils.Kd_analysis import process_kd_calculation from kd_utils.sea_photons_analysis import process_sea_photon_binning -from kd_utils.visualization import plot_convex_hulls, plot_kd_photons, plot_photon_quality_flags - +from kd_utils.visualization import ( + plot_convex_hulls, + plot_kd_photons, + plot_photon_quality_flags, +) -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) logger = logging.getLogger(__name__) PAIR_ID_MAP = { - 'gt1l': 'gt1_pair', - 'gt1r': 'gt1_pair', - 'gt2l': 'gt2_pair', - 'gt2r': 'gt2_pair', - 'gt3l': 'gt3_pair', - 'gt3r': 'gt3_pair', + "gt1l": "gt1_pair", + "gt1r": "gt1_pair", + "gt2l": "gt2_pair", + "gt2r": "gt2_pair", + "gt3l": "gt3_pair", + "gt3r": "gt3_pair", } def get_target_beams(all_beams, beam_attrs, target_beams_arg): strong_beams = [ - gtx for gtx in all_beams - if beam_attrs[gtx]['atlas_beam_type'].decode('utf-8') == 'strong' + gtx + for gtx in all_beams + if beam_attrs[gtx]["atlas_beam_type"].decode("utf-8") == "strong" ] if not target_beams_arg: return strong_beams[:1] # auto-select first strong beam - requested = [b.strip() for b in target_beams_arg.split(',') if b.strip()] + requested = [b.strip() for b in target_beams_arg.split(",") if b.strip()] return [b for b in requested if b in strong_beams] @@ -58,14 +64,16 @@ def optionally_combine_paired_beams_for_kd(dataset, horizontal_res, enabled=Fals return dataset combined = dataset.copy() - combined['source_beam_id'] = combined['beam_id'] - combined['beam_id'] = combined['beam_id'].map(PAIR_ID_MAP).fillna(combined['beam_id']) + combined["source_beam_id"] = 
combined["beam_id"] + combined["beam_id"] = ( + combined["beam_id"].map(PAIR_ID_MAP).fillna(combined["beam_id"]) + ) - if 'relative_AT_dist' in combined.columns: - rel_m = pd.to_numeric(combined['relative_AT_dist'], errors='coerce') * 1000.0 - pair_bins = (rel_m / max(horizontal_res, 1)).astype('Int64') + if "relative_AT_dist" in combined.columns: + rel_m = pd.to_numeric(combined["relative_AT_dist"], errors="coerce") * 1000.0 + pair_bins = (rel_m / max(horizontal_res, 1)).astype("Int64") pair_bins = pair_bins.fillna(-1).astype(int) - combined['lat_bins'] = pair_bins + combined["lat_bins"] = pair_bins return combined @@ -87,9 +95,15 @@ def run_pipeline(args): 19 — Check data for reasonableness (filter_photon_dataset_by_hull_area + plot_kd_photons). """ - atl03_h5_file_path = os.path.join(args.workspace_path, args.atl03_path, args.atl03_file) - shoreline_data_path = os.path.join(args.workspace_path, args.other_data_path, args.shoreline_data) - gebco_full_path = os.path.join(args.workspace_path, args.other_data_path, args.gebco_path) + atl03_h5_file_path = os.path.join( + args.workspace_path, args.atl03_path, args.atl03_file + ) + shoreline_data_path = os.path.join( + args.workspace_path, args.other_data_path, args.shoreline_data + ) + gebco_full_path = os.path.join( + args.workspace_path, args.other_data_path, args.gebco_path + ) atl24_file_path = args.atl24_file if atl24_file_path and (not os.path.isabs(atl24_file_path)): atl24_file_path = os.path.join(args.workspace_path, atl24_file_path) @@ -99,9 +113,14 @@ def run_pipeline(args): match = re.search(r"_(\d{14})_", atl03_h5_file_path) timestamp = match.group(1) if match else "unknown" - version_match = re.search(r"ATL03_\d{14}_\d{8}_(\d{3})_\d{2}", os.path.basename(atl03_h5_file_path)) + version_match = re.search( + r"ATL03_\d{14}_\d{8}_(\d{3})_\d{2}", os.path.basename(atl03_h5_file_path) + ) atl03_version = int(version_match.group(1)) if version_match else None - logger.info("ATL03 version detected: %s", 
f"{atl03_version:03d}" if atl03_version else "unknown") + logger.info( + "ATL03 version detected: %s", + f"{atl03_version:03d}" if atl03_version else "unknown", + ) if args.enable_ir_ap_filter: if atl03_version is None or atl03_version < 7: @@ -120,15 +139,21 @@ def run_pipeline(args): is2_mds, is2_attrs, is2_beams = load_data(atl03_h5_file_path, False) target_strong_beams = get_target_beams(is2_beams, is2_attrs, args.target_beams) if not target_strong_beams: - logger.error("No target strong beams found. Check input file and --target_beams.") + logger.error( + "No target strong beams found. Check input file and --target_beams." + ) sys.exit(1) logger.info("Strong beams selected: %s", target_strong_beams) plot_target_beam = [target_strong_beams[0]] - sea_photon_dataset = Extract_sea_photons(is2_mds, target_strong_beams, shoreline_data_path) + sea_photon_dataset = Extract_sea_photons( + is2_mds, target_strong_beams, shoreline_data_path + ) if args.enable_ir_ap_filter and not args.no_plot: - plot_photon_quality_flags(output_path, timestamp, sea_photon_dataset, plot_target_beam) + plot_photon_quality_flags( + output_path, timestamp, sea_photon_dataset, plot_target_beam + ) sea_photon_dataset = apply_optional_ir_ap_filter( sea_photon_dataset, @@ -142,8 +167,8 @@ def run_pipeline(args): # Use --disable_solar_background_filter to turn it off. 
solar_bg_enabled = not args.disable_solar_background_filter if not solar_bg_enabled: - if 'solar_elevation' in sea_photon_dataset.columns: - max_solar_elev = sea_photon_dataset['solar_elevation'].max() + if "solar_elevation" in sea_photon_dataset.columns: + max_solar_elev = sea_photon_dataset["solar_elevation"].max() if max_solar_elev > args.solar_elevation_day_threshold: logger.warning( "Solar background filter is DISABLED via --disable_solar_background_filter " @@ -164,7 +189,9 @@ def run_pipeline(args): ) binned_dataset_sea_surface = process_sea_photon_binning( - sea_photon_dataset, horizontal_res=args.horizontal_res, vertical_res=args.vertical_res + sea_photon_dataset, + horizontal_res=args.horizontal_res, + vertical_res=args.vertical_res, ) post_refraction_refit_enabled = args.enable_post_refraction_refit @@ -175,50 +202,58 @@ def run_pipeline(args): ) post_refraction_refit_enabled = False - sea_surface_height, sea_surface_label, filtered_seafloor_subsurface_photon_dataset = \ - process_subsurface_photon_filtering( - binned_dataset_sea_surface, - gebco_file_path_lists, - args.subsurface_thresh, - args.ignore_subsurface_height_thres, - use_atl24_filter=args.enable_atl24_filter, - atl24_file_path=atl24_file_path, - atl24_max_match_distance_deg=args.atl24_max_match_distance_deg, - use_gebco_filter=args.enable_gebco_filter, - apply_histogram_quality_filter=args.enable_histogram_quality_filter, - histogram_quality_min_ratio=args.histogram_quality_min_ratio, - histogram_quality_depth_min=args.histogram_quality_depth_min, - histogram_quality_depth_max=args.histogram_quality_depth_max, - histogram_quality_ref_depth_min=args.histogram_quality_ref_depth_min, - histogram_quality_ref_depth_max=args.histogram_quality_ref_depth_max, - apply_surface_sigma_filter=args.enable_surface_sigma_filter, - surface_sigma_max=args.surface_sigma_max, - apply_refraction_correction=args.enable_refraction_correction, - refraction_water_temp_c=args.refraction_water_temp_c, - 
refraction_wavelength_nm=args.refraction_wavelength_nm, - apply_post_refraction_refit=post_refraction_refit_enabled, - apply_flattening=args.enable_sea_surface_flattening, - flattening_window_m=args.sea_surface_flattening_window_m, - horizontal_res=args.horizontal_res, - vertical_res=args.vertical_res, - ) + ( + sea_surface_height, + sea_surface_label, + filtered_seafloor_subsurface_photon_dataset, + ) = process_subsurface_photon_filtering( + binned_dataset_sea_surface, + gebco_file_path_lists, + args.subsurface_thresh, + args.ignore_subsurface_height_thres, + use_atl24_filter=args.enable_atl24_filter, + atl24_file_path=atl24_file_path, + atl24_max_match_distance_deg=args.atl24_max_match_distance_deg, + use_gebco_filter=args.enable_gebco_filter, + apply_histogram_quality_filter=args.enable_histogram_quality_filter, + histogram_quality_min_ratio=args.histogram_quality_min_ratio, + histogram_quality_depth_min=args.histogram_quality_depth_min, + histogram_quality_depth_max=args.histogram_quality_depth_max, + histogram_quality_ref_depth_min=args.histogram_quality_ref_depth_min, + histogram_quality_ref_depth_max=args.histogram_quality_ref_depth_max, + apply_surface_sigma_filter=args.enable_surface_sigma_filter, + surface_sigma_max=args.surface_sigma_max, + apply_refraction_correction=args.enable_refraction_correction, + refraction_water_temp_c=args.refraction_water_temp_c, + refraction_wavelength_nm=args.refraction_wavelength_nm, + apply_post_refraction_refit=post_refraction_refit_enabled, + apply_flattening=args.enable_sea_surface_flattening, + flattening_window_m=args.sea_surface_flattening_window_m, + horizontal_res=args.horizontal_res, + vertical_res=args.vertical_res, + ) - final_filtered_subsurface_photon_dataset = filtered_seafloor_subsurface_photon_dataset[ - filtered_seafloor_subsurface_photon_dataset['beam_id'].isin(target_strong_beams) - ].copy() + final_filtered_subsurface_photon_dataset = ( + filtered_seafloor_subsurface_photon_dataset[ + 
filtered_seafloor_subsurface_photon_dataset["beam_id"].isin( + target_strong_beams + ) + ].copy() + ) if args.enable_convex_hull_filter: - final_filtered_subsurface_photon_dataset, convex_hull_areas, convex_hulls = \ + final_filtered_subsurface_photon_dataset, convex_hull_areas, convex_hulls = ( filter_photon_dataset_by_hull_area( final_filtered_subsurface_photon_dataset, - hull_area_threshold=args.convex_hull_area_threshold + hull_area_threshold=args.convex_hull_area_threshold, ) + ) if plot_target_beam and not args.no_plot: plot_convex_hulls( final_filtered_subsurface_photon_dataset, plot_target_beam, convex_hulls, - convex_hull_areas + convex_hull_areas, ) subsurface_output_path = os.path.join( @@ -230,43 +265,50 @@ def run_pipeline(args): kd_input_dataset = optionally_combine_paired_beams_for_kd( final_filtered_subsurface_photon_dataset, horizontal_res=args.horizontal_res, - enabled=args.enable_paired_beam_combine + enabled=args.enable_paired_beam_combine, ) wave_mult = args.wave_exclusion_multiplier if args.enable_wave_adaptive_fit else 0.0 subsurface_photon_df_added_kd = process_kd_calculation( - kd_input_dataset, decay_zone_threshold=args.decay_zone_threshold, + kd_input_dataset, + decay_zone_threshold=args.decay_zone_threshold, kd_fit_method=args.kd_fit_method, wave_exclusion_multiplier=wave_mult, wave_sigma_calm_threshold=args.wave_sigma_calm_threshold, ) - kd_output_path = os.path.join(output_path, f"{timestamp}_AddedKdDataset_strongBeams_Further.csv") + kd_output_path = os.path.join( + output_path, f"{timestamp}_AddedKdDataset_strongBeams_Further.csv" + ) subsurface_photon_df_added_kd.to_csv(kd_output_path, index=False) - if not args.no_plot and 'relative_AT_dist' in kd_input_dataset.columns: + if not args.no_plot and "relative_AT_dist" in kd_input_dataset.columns: unique_photon_dataset = kd_input_dataset[ - ['relative_AT_dist', 'lat_bins', 'photon_height'] + ["relative_AT_dist", "lat_bins", "photon_height"] ].drop_duplicates() - 
unique_photon_dataset['relative_AT_dist_center'] = unique_photon_dataset.groupby( - 'lat_bins', observed=False - )['relative_AT_dist'].transform('mean') + unique_photon_dataset["relative_AT_dist_center"] = ( + unique_photon_dataset.groupby( + "lat_bins", observed=False + )["relative_AT_dist"].transform("mean") + ) _closest_rows = [] - for _lat_bin, _group in unique_photon_dataset.groupby('lat_bins', observed=False): + for _lat_bin, _group in unique_photon_dataset.groupby( + "lat_bins", observed=False + ): if _group.empty: continue - _center = _group['relative_AT_dist_center'].iloc[0] + _center = _group["relative_AT_dist_center"].iloc[0] _group = _group.copy() - _group['dist_to_center'] = abs(_group['relative_AT_dist'] - _center) - _closest_rows.append(_group.loc[[_group['dist_to_center'].idxmin()]]) + _group["dist_to_center"] = abs(_group["relative_AT_dist"] - _center) + _closest_rows.append(_group.loc[[_group["dist_to_center"].idxmin()]]) if not _closest_rows: logger.warning("No along-track bins survived filtering. Skipping plot.") else: closest_to_center = pd.concat(_closest_rows).reset_index(drop=True) kd_df_merged_distance = closest_to_center.merge( - subsurface_photon_df_added_kd, - on='lat_bins', - how='left' - ).drop(columns=['relative_AT_dist_center', 'dist_to_center'], errors='ignore') + subsurface_photon_df_added_kd, on="lat_bins", how="left" + ).drop( + columns=["relative_AT_dist_center", "dist_to_center"], errors="ignore" + ) plot_kd_photons( output_path, @@ -279,7 +321,7 @@ def run_pipeline(args): logger.info("SUCCESS! 
Kd output: %s", kd_output_path) -if __name__ == '__main__': +if __name__ == "__main__": cli_args = get_args() try: run_pipeline(cli_args) diff --git a/aok/core/kd_utils/sea_photons_analysis.py b/aok/core/kd_utils/sea_photons_analysis.py index 729a22c..f986142 100644 --- a/aok/core/kd_utils/sea_photons_analysis.py +++ b/aok/core/kd_utils/sea_photons_analysis.py @@ -1,12 +1,12 @@ +from datetime import datetime +import netCDF4 import numpy as np -import geopandas as gpd -from datetime import datetime import pandas as pd -import netCDF4 from scipy.signal import find_peaks from scipy.stats import norm + # Function to apply binning beam-by-beam by calling the function of horizontal_vertical_bin_dataset def process_sea_photon_binning(sea_photon_dataset, horizontal_res, vertical_res): """ @@ -21,18 +21,20 @@ def process_sea_photon_binning(sea_photon_dataset, horizontal_res, vertical_res) binned_beam_datasets = [] # Group the dataset by 'beam_id' and process each group separately - for beam_id, beam_data in sea_photon_dataset.groupby('beam_id'): - print(f'Processing binning for beam: {beam_id}') + for beam_id, beam_data in sea_photon_dataset.groupby("beam_id"): + print(f"Processing binning for beam: {beam_id}") # Apply binning to the current beam dataset - binned_beam_data = horizontal_vertical_bin_dataset(beam_data, horizontal_res, vertical_res) + binned_beam_data = horizontal_vertical_bin_dataset( + beam_data, horizontal_res, vertical_res + ) # Append the binned data for the current beam to the list binned_beam_datasets.append(binned_beam_data) # Combine all binned beam datasets into a single DataFrame binned_dataset_sea_surface = pd.concat(binned_beam_datasets, ignore_index=True) - + return binned_dataset_sea_surface @@ -48,35 +50,52 @@ def horizontal_vertical_bin_dataset(dataset, lat_res, vertical_res): 10 — Re-build histograms based on corrected depths (called again after refraction correction when apply_post_refraction_refit is enabled). 
""" - + # Filter values within the range (-50, 10), because photons elevation outside this range will be real noise valid_range = (-70, 5) - valid_mask = (dataset['photon_height'] > valid_range[0]) & (dataset['photon_height'] < valid_range[1]) + valid_mask = (dataset["photon_height"] > valid_range[0]) & ( + dataset["photon_height"] < valid_range[1] + ) # Apply the valid_mask to filter unwanted values # and create a copy to avoid SettingWithCopyWarning filtered_dataset = dataset[valid_mask].copy() # Calculate the number of height bins - height_range = abs(filtered_dataset['photon_height'].max() - filtered_dataset['photon_height'].min()) - height_bin_number = max(1, round(height_range / vertical_res)) # Ensure at least one bin + height_range = abs( + filtered_dataset["photon_height"].max() + - filtered_dataset["photon_height"].min() + ) + height_bin_number = max( + 1, round(height_range / vertical_res) + ) # Ensure at least one bin # Calculate the number of latitude bins - lat_range = abs(filtered_dataset['lat'].max() - filtered_dataset['lat'].min()) + lat_range = abs(filtered_dataset["lat"].max() - filtered_dataset["lat"].min()) lat_bin_number = max(1, round(lat_range / lat_res)) # Ensure at least one bin # Create bins for latitude - lat_bins = pd.cut(filtered_dataset['lat'], bins=lat_bin_number, labels=np.arange(lat_bin_number)) + lat_bins = pd.cut( + filtered_dataset["lat"], bins=lat_bin_number, labels=np.arange(lat_bin_number) + ) # Create bins for height - height_bins = pd.cut(filtered_dataset['photon_height'], bins=height_bin_number, - labels=np.round(np.linspace(filtered_dataset['photon_height'].min(), - filtered_dataset['photon_height'].max(), - num=height_bin_number), decimals=1)) + height_bins = pd.cut( + filtered_dataset["photon_height"], + bins=height_bin_number, + labels=np.round( + np.linspace( + filtered_dataset["photon_height"].min(), + filtered_dataset["photon_height"].max(), + num=height_bin_number, + ), + decimals=1, + ), + ) # Add bins to 
dataframe using .loc to avoid SettingWithCopyWarning - filtered_dataset.loc[:, 'lat_bins'] = lat_bins - filtered_dataset.loc[:, 'height_bins'] = height_bins + filtered_dataset.loc[:, "lat_bins"] = lat_bins + filtered_dataset.loc[:, "height_bins"] = height_bins filtered_dataset = filtered_dataset.reset_index(drop=True) return filtered_dataset @@ -98,61 +117,66 @@ def get_sea_surface_height_static(binned_data, threshold): sea_surface_dominated_label (array): Labels for surface-dominated bins PhotonDFBelowSurface (pandas.DataFrame): Photons below detected surface """ - - #set flag for the df save + + # set flag for the df save firstTimeIndex = True - + # Create sea height list sea_surface_height = [] mean_lat_bins_seq = [] sea_surface_subsurface_photons_ratio = [] - # Group dataset along horizental (latitude bins) - grouped_data = binned_data.groupby(['lat_bins'], group_keys=True, observed=False) + grouped_data = binned_data.groupby(["lat_bins"], group_keys=True, observed=False) data_groups = dict(list(grouped_data)) # Loop through groups to detect sea surface for k, v in data_groups.items(): # based on lat_utm - lat_bin_average = v['lat'].mean() + lat_bin_average = v["lat"].mean() # Create new dataframe based on occurrence of photons per height bin - new_df = pd.DataFrame(v.groupby(['height_bins'], observed=False).count()) + new_df = pd.DataFrame(v.groupby(["height_bins"], observed=False).count()) # Check if new_df is not empty before finding the bin with the highest photon count - if not new_df.empty: + if not new_df.empty: # Find the vertical bin with the highest photon count - largest_h_bin = new_df['lat'].argmax() - + largest_h_bin = new_df["lat"].argmax() + # Select the index of the bin with the highest count largest_h_index = new_df.index[largest_h_bin] - + # Calculate the median value of all photon height values within this bin - photons_sea_surface = v.loc[v['height_bins'] == largest_h_index, 'photon_height'] + photons_sea_surface = v.loc[ + v["height_bins"] 
== largest_h_index, "photon_height" + ] lat_bin_sea_median = photons_sea_surface.median() - + # Append to sea height list sea_surface_height.append(lat_bin_sea_median) mean_lat_bins_seq.append(lat_bin_average) del new_df - + # Get all photons below sea surface # to determine segment type of each subsurface water column # Use calculated sea height to determine photons at 0.5m below peak - photons_sea_surface_up = \ - v.loc[(v['photon_height'] > (lat_bin_sea_median - threshold)) & - (v['photon_height'] < (lat_bin_sea_median + 2*threshold))] - + photons_sea_surface_up = v.loc[ + (v["photon_height"] > (lat_bin_sea_median - threshold)) + & (v["photon_height"] < (lat_bin_sea_median + 2 * threshold)) + ] + # Calculate the photon ratio between surface and whole photons - if v['photon_height'].shape[0] > 0: - new_photons_ratio_sea_surface = \ - photons_sea_surface_up.shape[0] / v['photon_height'].shape[0] + if v["photon_height"].shape[0] > 0: + new_photons_ratio_sea_surface = ( + photons_sea_surface_up.shape[0] / v["photon_height"].shape[0] + ) else: new_photons_ratio_sea_surface = np.nan - - sea_surface_subsurface_photons_ratio.append((1 - new_photons_ratio_sea_surface)) - + + sea_surface_subsurface_photons_ratio.append( + 1 - new_photons_ratio_sea_surface + ) + else: # Append NaNs if the group is empty sea_surface_height.append(np.nan) @@ -163,38 +187,44 @@ def get_sea_surface_height_static(binned_data, threshold): mean = np.nanmean(sea_surface_height, axis=0) sd = np.nanstd(sea_surface_height, axis=0) - final_sea_surface_height = np.where((sea_surface_height > (mean + 2 * sd)) | - (sea_surface_height < (mean - 2 * sd)), - np.nan, - sea_surface_height).tolist() - - sea_surface_height_abnormal_label = np.where(np.isnan(final_sea_surface_height), 0, 1) + final_sea_surface_height = np.where( + (sea_surface_height > (mean + 2 * sd)) | (sea_surface_height < (mean - 2 * sd)), + np.nan, + sea_surface_height, + ).tolist() + + sea_surface_height_abnormal_label = np.where( + 
np.isnan(final_sea_surface_height), 0, 1 + ) # Determine label based on ratio of sea surface photons and subsurface photons - sea_surface_dominated_label = \ - np.where(np.array(sea_surface_subsurface_photons_ratio) >= 0.2, 0, 1) - + sea_surface_dominated_label = np.where( + np.array(sea_surface_subsurface_photons_ratio) >= 0.2, 0, 1 + ) + # Loop through groups again and return photons below 0.5m of sea height PhotonDFBelowThresholdPeak = pd.DataFrame() for i, (k, v) in enumerate(data_groups.items()): - # Get all values below this bin if not np.isnan(final_sea_surface_height[i]): - NewPhotonDFBelowThresholdPeak = \ - v.loc[v['photon_height'] < (final_sea_surface_height[i] - threshold)] - + NewPhotonDFBelowThresholdPeak = v.loc[ + v["photon_height"] < (final_sea_surface_height[i] - threshold) + ] + if firstTimeIndex: PhotonDFBelowThresholdPeak = NewPhotonDFBelowThresholdPeak - firstTimeIndex=False - else: - PhotonDFBelowThresholdPeak=\ - pd.concat([PhotonDFBelowThresholdPeak, NewPhotonDFBelowThresholdPeak]) - - - return final_sea_surface_height, \ - sea_surface_height_abnormal_label, \ - sea_surface_dominated_label,\ - PhotonDFBelowThresholdPeak + firstTimeIndex = False + else: + PhotonDFBelowThresholdPeak = pd.concat( + [PhotonDFBelowThresholdPeak, NewPhotonDFBelowThresholdPeak] + ) + + return ( + final_sea_surface_height, + sea_surface_height_abnormal_label, + sea_surface_dominated_label, + PhotonDFBelowThresholdPeak, + ) def get_sea_surface_height_adaptive(binned_data): @@ -219,18 +249,18 @@ def get_sea_surface_height_adaptive(binned_data): sea_surface_dominated_label (array): Labels for surface-dominated bins PhotonDFBelowSurface (pandas.DataFrame): Photons below detected surface with waves removed """ - + firstTimeIndex = True sea_surface_height = [] mean_lat_bins_seq = [] sea_surface_subsurface_photons_ratio = [] - grouped_data = binned_data.groupby(['lat_bins'], group_keys=True, observed=False) + grouped_data = binned_data.groupby(["lat_bins"], 
group_keys=True, observed=False) data_groups = dict(list(grouped_data)) for k, v in data_groups.items(): - lat_bin_average = v['lat'].mean() - + lat_bin_average = v["lat"].mean() + if len(v) < 20: # Increase minimum photon count for robustness sea_surface_height.append(np.nan) mean_lat_bins_seq.append(np.nan) @@ -238,18 +268,22 @@ def get_sea_surface_height_adaptive(binned_data): continue # Finer histogram for better peak resolution - hist, bin_edges = np.histogram(v['photon_height'], - bins=100, # Finer bins - density=True, - range=(v['photon_height'].min(), v['photon_height'].max())) + hist, bin_edges = np.histogram( + v["photon_height"], + bins=100, # Finer bins + density=True, + range=(v["photon_height"].min(), v["photon_height"].max()), + ) bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2 # Enhanced peak detection - peaks, properties = find_peaks(hist, - height=np.max(hist)*0.3, # Stricter peak height - distance=10, # Wider separation - prominence=np.max(hist)*0.1) # Require prominent peaks - + peaks, properties = find_peaks( + hist, + height=np.max(hist) * 0.3, # Stricter peak height + distance=10, # Wider separation + prominence=np.max(hist) * 0.1, + ) # Require prominent peaks + if len(peaks) == 0: sea_surface_height.append(np.nan) mean_lat_bins_seq.append(np.nan) @@ -263,25 +297,31 @@ def get_sea_surface_height_adaptive(binned_data): # Two-pass Gaussian fit: start with ±1 m, widen if sigma is large # (the ±1 m window underestimates sigma for SWH > 2 m) half_win = 1.0 - peak_data = v[(v['photon_height'] > surface_height - half_win) & - (v['photon_height'] < surface_height + half_win)] + peak_data = v[ + (v["photon_height"] > surface_height - half_win) + & (v["photon_height"] < surface_height + half_win) + ] if len(peak_data) > 10: - mu, sigma = norm.fit(peak_data['photon_height']) + mu, sigma = norm.fit(peak_data["photon_height"]) # Refit with wider window if sigma suggests truncation if sigma > 0.4: half_win2 = min(2.5 * sigma, 3.0) - peak_data2 = 
v[(v['photon_height'] > mu - half_win2) & - (v['photon_height'] < mu + half_win2)] + peak_data2 = v[ + (v["photon_height"] > mu - half_win2) + & (v["photon_height"] < mu + half_win2) + ] if len(peak_data2) > 10: - mu, sigma = norm.fit(peak_data2['photon_height']) + mu, sigma = norm.fit(peak_data2["photon_height"]) else: mu, sigma = surface_height, 0.2 # More conservative default adaptive_threshold = min(mu - 3.0 * sigma, mu - 1.0) # at least 1 m below # Extended surface layer to remove wave effects - surface_photons = v[(v['photon_height'] > adaptive_threshold) & - (v['photon_height'] < mu + 3.0 * sigma)] # Wider upper bound + surface_photons = v[ + (v["photon_height"] > adaptive_threshold) + & (v["photon_height"] < mu + 3.0 * sigma) + ] # Wider upper bound ratio = len(surface_photons) / len(v) if len(v) > 0 else np.nan sea_surface_height.append(mu) @@ -291,45 +331,58 @@ def get_sea_surface_height_adaptive(binned_data): # Outlier filtering mean = np.nanmean(sea_surface_height) sd = np.nanstd(sea_surface_height) - final_sea_surface_height = np.where((sea_surface_height > mean + 2*sd) | - (sea_surface_height < mean - 2*sd), - np.nan, - sea_surface_height).tolist() - - sea_surface_height_abnormal_label = np.where(np.isnan(final_sea_surface_height), 0, 1) - sea_surface_dominated_label = np.where(np.array(sea_surface_subsurface_photons_ratio) >= 0.2, 0, 1) + final_sea_surface_height = np.where( + (sea_surface_height > mean + 2 * sd) | (sea_surface_height < mean - 2 * sd), + np.nan, + sea_surface_height, + ).tolist() + + sea_surface_height_abnormal_label = np.where( + np.isnan(final_sea_surface_height), 0, 1 + ) + sea_surface_dominated_label = np.where( + np.array(sea_surface_subsurface_photons_ratio) >= 0.2, 0, 1 + ) # Filter subsurface photons with diagnostics PhotonDFBelowSurface = pd.DataFrame() for i, (k, v) in enumerate(data_groups.items()): if not np.isnan(final_sea_surface_height[i]): half_win = 1.0 - peak_data = v[(v['photon_height'] > 
final_sea_surface_height[i] - half_win) & - (v['photon_height'] < final_sea_surface_height[i] + half_win)] + peak_data = v[ + (v["photon_height"] > final_sea_surface_height[i] - half_win) + & (v["photon_height"] < final_sea_surface_height[i] + half_win) + ] if len(peak_data) > 10: - mu, sigma = norm.fit(peak_data['photon_height']) + mu, sigma = norm.fit(peak_data["photon_height"]) if sigma > 0.4: half_win2 = min(2.5 * sigma, 3.0) - peak_data2 = v[(v['photon_height'] > mu - half_win2) & - (v['photon_height'] < mu + half_win2)] + peak_data2 = v[ + (v["photon_height"] > mu - half_win2) + & (v["photon_height"] < mu + half_win2) + ] if len(peak_data2) > 10: - mu, sigma = norm.fit(peak_data2['photon_height']) + mu, sigma = norm.fit(peak_data2["photon_height"]) adaptive_threshold = min(mu - 3.0 * sigma, mu - 1.0) else: adaptive_threshold = final_sea_surface_height[i] - 1.0 - NewPhotonDFBelowSurface = v[v['photon_height'] < adaptive_threshold] - + NewPhotonDFBelowSurface = v[v["photon_height"] < adaptive_threshold] + if firstTimeIndex: PhotonDFBelowSurface = NewPhotonDFBelowSurface firstTimeIndex = False else: - PhotonDFBelowSurface = pd.concat([PhotonDFBelowSurface, NewPhotonDFBelowSurface]) + PhotonDFBelowSurface = pd.concat( + [PhotonDFBelowSurface, NewPhotonDFBelowSurface] + ) - return (final_sea_surface_height, - sea_surface_height_abnormal_label, - sea_surface_dominated_label, - PhotonDFBelowSurface) + return ( + final_sea_surface_height, + sea_surface_height_abnormal_label, + sea_surface_dominated_label, + PhotonDFBelowSurface, + ) def get_water_temp(date_year, date_month, date_day, latitude, longitude): @@ -349,7 +402,7 @@ def get_water_temp(date_year, date_month, date_day, latitude, longitude): month = date_month # date[6:8] day = date_day - day_of_year = str(datetime.strptime(date, '%Y%m%d').timetuple().tm_yday) + day_of_year = str(datetime.strptime(date, "%Y%m%d").timetuple().tm_yday) # Add zero in front of day of year string zero_day_of_year = 
day_of_year.zfill(3) @@ -360,8 +413,11 @@ def get_water_temp(date_year, date_month, date_day, latitude, longitude): new_lat_min = 0 new_lat_max = 17998 - new_lat = round(((old_lat - old_lat_min) / (old_lat_max - old_lat_min)) * - (new_lat_max - new_lat_min) + new_lat_min) + new_lat = round( + ((old_lat - old_lat_min) / (old_lat_max - old_lat_min)) + * (new_lat_max - new_lat_min) + + new_lat_min + ) # Calculate ratio of longitude from mid-point of IS2 track old_lon = longitude.mean() @@ -370,24 +426,41 @@ def get_water_temp(date_year, date_month, date_day, latitude, longitude): new_lon_min = 0 new_lon_max = 35999 - new_lon = round(((old_lon - old_lon_min) / (old_lon_max - old_lon_min)) * - (new_lon_max - new_lon_min) + new_lon_min) + new_lon = round( + ((old_lon - old_lon_min) / (old_lon_max - old_lon_min)) + * (new_lon_max - new_lon_min) + + new_lon_min + ) # Access the SST data using the JPL OpenDap interface - url = 'https://opendap.jpl.nasa.gov/opendap/OceanTemperature/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/' \ - + str(year) + '/' + str(zero_day_of_year) + '/' + str(date) \ - + '090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc' + url = ( + "https://opendap.jpl.nasa.gov/opendap/OceanTemperature/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/" + + str(year) + + "/" + + str(zero_day_of_year) + + "/" + + str(date) + + "090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc" + ) dataset = netCDF4.Dataset(url) # Access the data and convert the temperature from K to C - water_temp = dataset['analysed_sst'][0, new_lat, new_lon] - 273.15 + water_temp = dataset["analysed_sst"][0, new_lat, new_lon] - 273.15 return water_temp -def refraction_correction(WTemp, WSmodel, Wavelength, - Photon_ref_elev, Ph_ref_azimuth, - PhotonZ, PhotonX, PhotonY, Ph_Conf): +def refraction_correction( + WTemp, + WSmodel, + Wavelength, + Photon_ref_elev, + Ph_ref_azimuth, + PhotonZ, + PhotonX, + PhotonY, + Ph_Conf, +): """ WTemp; there is python library that pulls water temp data WSmodel is the value 
surface height @@ -417,18 +490,18 @@ def refraction_correction(WTemp, WSmodel, Wavelength, n1 = 1.00029 # refractive index of water - n2 = (a * WaterTemp ** 2) + (b * wl ** 2) + (c * WaterTemp) + (d * wl) + e + n2 = (a * WaterTemp**2) + (b * wl**2) + (c * WaterTemp) + (d * wl) + e # assumption is 0.25416 # This example is refractionCoef = 0.25449 # 1.00029 is refraction of air constant - correction_coef = (1 - (n1 / n2)) + correction_coef = 1 - (n1 / n2) # read photon ref_elev to get theta1 theta1 = np.pi / 2 - Photon_ref_elev # eq 1. Theta2 - theta2 = np.arcsin(((n1 * np.sin(theta1)) / n2)) + theta2 = np.arcsin((n1 * np.sin(theta1)) / n2) # eq 3. S # Approximate water Surface = 1.5 @@ -447,7 +520,7 @@ def refraction_correction(WTemp, WSmodel, Wavelength, phi = theta1 - theta2 # P is the difference between raw and corrected YZ location - P = np.sqrt(R ** 2 + S ** 2 - 2 * R * S * np.cos(phi)) + P = np.sqrt(R**2 + S**2 - 2 * R * S * np.cos(phi)) # alpha is an angle needed alpha = np.arcsin((R * np.sin(phi)) / P) @@ -471,19 +544,27 @@ def refraction_correction(WTemp, WSmodel, Wavelength, outY = PhotonY + DN outZ = PhotonZ + DZ - ''' + """ print('For selected Bathy photon:') print('lat = ', PhotonY[9000]) print('long = ', PhotonX[9000]) print('Raw Depth = ', PhotonZ[9000]) print('D = ', D[9000]) - + print('ref_elev = ', Photon_ref_elev[9000]) - + print('Delta East = ', DE[9000]) print('Delta North = ', DN[9000]) print('Delta Z = ', DZ[9000]) - ''' - return (outX, outY, outZ, Ph_Conf, PhotonX, PhotonY, PhotonZ, Ph_ref_azimuth, - Photon_ref_elev) # We are most interested in out-x, out-y, out-z - + """ + return ( + outX, + outY, + outZ, + Ph_Conf, + PhotonX, + PhotonY, + PhotonZ, + Ph_ref_azimuth, + Photon_ref_elev, + ) # We are most interested in out-x, out-y, out-z diff --git a/aok/core/kd_utils/visualization.py b/aok/core/kd_utils/visualization.py index 8eee8b8..900c07c 100644 --- a/aok/core/kd_utils/visualization.py +++ b/aok/core/kd_utils/visualization.py @@ -1,66 
+1,103 @@ # utils/visualization.py -import numpy as np import os -import pandas as pd import time + import geopandas as gpd -from pyproj import Transformer, Proj import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from pyproj import Transformer from scipy.spatial import ConvexHull + def plot_photon_height(sea_photon_dataset, hlims=[-25, 10]): fig, ax = plt.subplots() - ax.scatter(sea_photon_dataset['latitude'], sea_photon_dataset['photon_height'], s=1, c='k', alpha=0.15, edgecolors='none', label='ATL03 Photons') - ax.set_xlabel('Relative AT Distance') - ax.set_ylabel('Height (h_ph)') - ax.set_title('Scatter Plot of ATL03 Photons') + ax.scatter( + sea_photon_dataset["latitude"], + sea_photon_dataset["photon_height"], + s=1, + c="k", + alpha=0.15, + edgecolors="none", + label="ATL03 Photons", + ) + ax.set_xlabel("Relative AT Distance") + ax.set_ylabel("Height (h_ph)") + ax.set_title("Scatter Plot of ATL03 Photons") ax.set_ylim(hlims) ax.legend() plt.show() -def plot_filtered_seafloor_photons(filtered_seafloor_subsurface_dataset, sea_photon_dataset, sea_surface_height, output_path): + +def plot_filtered_seafloor_photons( + filtered_seafloor_subsurface_dataset, + sea_photon_dataset, + sea_surface_height, + output_path, +): """ Plots the filtered seafloor photon data along with sea surface and seafloor elevation data. Parameters: - filtered_seafloor_subsurface_dataset: DataFrame containing the filtered subsurface photon data. - - sea_photon_dataset: DataFrame containing the original sea photon data. + - sea_photon_dataset: DataFrame containing the original sea photon data. - sea_surface_height: Array of sea surface height values. - output_path: Path to save the output plot. 
""" - + # get sea_surface_x_axis_bins - sea_surface_x_axis_bins = np.linspace(filtered_seafloor_subsurface_dataset['relative_AT_dist'].min(), - filtered_seafloor_subsurface_dataset['relative_AT_dist'].max(), - len(sea_surface_height)) - + sea_surface_x_axis_bins = np.linspace( + filtered_seafloor_subsurface_dataset["relative_AT_dist"].min(), + filtered_seafloor_subsurface_dataset["relative_AT_dist"].max(), + len(sea_surface_height), + ) + hlims = [-25, 10] fig, ax = plt.subplots() # Scatter plot of subsurface photons - ax.scatter(sea_photon_dataset['relative_AT_dist'], sea_photon_dataset['photon_height'], - s=1, c='k', alpha=0.15, edgecolors='none', label='Subsurface ATL03 Photons') + ax.scatter( + sea_photon_dataset["relative_AT_dist"], + sea_photon_dataset["photon_height"], + s=1, + c="k", + alpha=0.15, + edgecolors="none", + label="Subsurface ATL03 Photons", + ) # Overlay the seafloor elevation data as points - ax.plot(filtered_seafloor_subsurface_dataset['relative_AT_dist'], filtered_seafloor_subsurface_dataset['seafloor_elevation'], - linewidth=0.8, c='b', alpha=0.4, label='Seafloor Elevation') - - ax.plot(sea_surface_x_axis_bins, [x - 0.5 for x in sea_surface_height], - linewidth=0.8, color='#DD571C', alpha=0.4, label='0.5 m Below Surface Peak') + ax.plot( + filtered_seafloor_subsurface_dataset["relative_AT_dist"], + filtered_seafloor_subsurface_dataset["seafloor_elevation"], + linewidth=0.8, + c="b", + alpha=0.4, + label="Seafloor Elevation", + ) + + ax.plot( + sea_surface_x_axis_bins, + [x - 0.5 for x in sea_surface_height], + linewidth=0.8, + color="#DD571C", + alpha=0.4, + label="0.5 m Below Surface Peak", + ) # Set labels and title - ax.set_xlabel('Distance From Start of Track (km)') - ax.set_ylabel('Photon Height (m)') - ax.set_title('Scatter Plot of Filtered Sea Photon Dataset') + ax.set_xlabel("Distance From Start of Track (km)") + ax.set_ylabel("Photon Height (m)") + ax.set_title("Scatter Plot of Filtered Sea Photon Dataset") ax.set_ylim(hlims) # Add 
a legend ax.legend() # Save the plot - plt.legend(loc='upper right') - plt.savefig(output_path, dpi=400, format='jpeg') + plt.legend(loc="upper right") + plt.savefig(output_path, dpi=400, format="jpeg") # Show the plot plt.show() @@ -70,32 +107,54 @@ def plot_convex_hulls(photon_dataset, target_beam_ids, convex_hulls, convex_hull """ Plots ConvexHull polygons and annotated areas for each lat_bin group in the dataset. """ - #filter beam type + # filter beam type # photon_dataset - photon_dataset = photon_dataset[photon_dataset['beam_id'].isin(target_beam_ids)] - + photon_dataset = photon_dataset[photon_dataset["beam_id"].isin(target_beam_ids)] + fig, ax = plt.subplots(figsize=(10, 6)) hlims = [-10, 2] - + for lat_bin, hull_points in convex_hulls.items(): hull = ConvexHull(hull_points) - ax.fill(hull_points[hull.vertices, 0], hull_points[hull.vertices, 1], alpha=0.3, label=f'Bin {lat_bin}') - + ax.fill( + hull_points[hull.vertices, 0], + hull_points[hull.vertices, 1], + alpha=0.3, + label=f"Bin {lat_bin}", + ) + # Calculate centroid to place the label centroid = np.mean(hull_points[hull.vertices], axis=0) - ax.text(centroid[0], centroid[1], f'{convex_hull_areas[lat_bin]:.2f}', - horizontalalignment='center', verticalalignment='center', fontsize=10, color='black') - - ax.scatter(photon_dataset['lat'], photon_dataset['photon_height'], - s=1, c='k', alpha=0.15, edgecolors='none', label='Subsurface ATL03 Photons') + ax.text( + centroid[0], + centroid[1], + f"{convex_hull_areas[lat_bin]:.2f}", + horizontalalignment="center", + verticalalignment="center", + fontsize=10, + color="black", + ) + + ax.scatter( + photon_dataset["lat"], + photon_dataset["photon_height"], + s=1, + c="k", + alpha=0.15, + edgecolors="none", + label="Subsurface ATL03 Photons", + ) ax.set_ylim(hlims) - ax.set_xlabel('Latitude') - ax.set_ylabel('Photon Height') - ax.set_title('ConvexHull of Photons within each Lat Bin') + ax.set_xlabel("Latitude") + ax.set_ylabel("Photon Height") + 
ax.set_title("ConvexHull of Photons within each Lat Bin") plt.show() + # plot photons coloured by quality_ph flag for diagnostic purposes -def plot_photon_quality_flags(output_path, timestamp, sea_photon_dataset, target_beam_ids): +def plot_photon_quality_flags( + output_path, timestamp, sea_photon_dataset, target_beam_ids +): """ Scatter plot of photon height vs. along-track distance, coloured by quality_ph value. @@ -104,145 +163,219 @@ def plot_photon_quality_flags(output_path, timestamp, sea_photon_dataset, target bit 1 (value 2) = possible impulse response effect bit 2 (value 4) = possible TEP """ - dataset = sea_photon_dataset[sea_photon_dataset['beam_id'].isin(target_beam_ids)].copy() + dataset = sea_photon_dataset[ + sea_photon_dataset["beam_id"].isin(target_beam_ids) + ].copy() if dataset.empty: return quality_colors = { - 0: ('dimgray', 'nominal (0)'), - 1: ('red', 'afterpulse (1)'), - 2: ('orange', 'impulse response (2)'), - 3: ('darkred', 'afterpulse + impulse (3)'), - 4: ('royalblue','TEP (4)'), - 5: ('purple', 'afterpulse + TEP (5)'), - 6: ('cyan', 'impulse + TEP (6)'), - 7: ('black', 'all flags (7)'), + 0: ("dimgray", "nominal (0)"), + 1: ("red", "afterpulse (1)"), + 2: ("orange", "impulse response (2)"), + 3: ("darkred", "afterpulse + impulse (3)"), + 4: ("royalblue", "TEP (4)"), + 5: ("purple", "afterpulse + TEP (5)"), + 6: ("cyan", "impulse + TEP (6)"), + 7: ("black", "all flags (7)"), } fig, ax = plt.subplots(figsize=(12, 5)) - unique_flags = sorted(dataset['quality_ph'].dropna().astype(int).unique()) + unique_flags = sorted(dataset["quality_ph"].dropna().astype(int).unique()) for flag in unique_flags: - subset = dataset[dataset['quality_ph'].astype(int) == flag] - color, label = quality_colors.get(flag, ('magenta', f'unknown ({flag})')) - ax.scatter(subset['relative_AT_dist'], subset['photon_height'], - s=0.5, c=color, alpha=0.3, edgecolors='none', label=label) - - ax.set_xlabel('Relative Along-Track Distance (km)', fontsize=12) - 
ax.set_ylabel('Photon Height (m)', fontsize=12) - ax.set_title('Photon quality_ph flag distribution', fontsize=13) + subset = dataset[dataset["quality_ph"].astype(int) == flag] + color, label = quality_colors.get(flag, ("magenta", f"unknown ({flag})")) + ax.scatter( + subset["relative_AT_dist"], + subset["photon_height"], + s=0.5, + c=color, + alpha=0.3, + edgecolors="none", + label=label, + ) + + ax.set_xlabel("Relative Along-Track Distance (km)", fontsize=12) + ax.set_ylabel("Photon Height (m)", fontsize=12) + ax.set_title("Photon quality_ph flag distribution", fontsize=13) ax.set_ylim([-15, 5]) - ax.legend(loc='lower right', markerscale=6, fontsize=9) + ax.legend(loc="lower right", markerscale=6, fontsize=9) fig.tight_layout() - save_path = os.path.join(output_path, f'{timestamp}_quality_ph_flags.jpg') - plt.savefig(save_path, dpi=300, format='jpeg') + save_path = os.path.join(output_path, f"{timestamp}_quality_ph_flags.jpg") + plt.savefig(save_path, dpi=300, format="jpeg") plt.show() print(f"Quality flag plot saved: {save_path}") # Print summary counts print("\nquality_ph flag summary:") for flag in unique_flags: - count = (dataset['quality_ph'].astype(int) == flag).sum() - _, label = quality_colors.get(flag, ('', f'unknown ({flag})')) + count = (dataset["quality_ph"].astype(int) == flag).sum() + _, label = quality_colors.get(flag, ("", f"unknown ({flag})")) pct = 100.0 * count / len(dataset) print(f" {label:35s}: {count:>8,d} ({pct:.1f}%)") # plot the kd and photon -def plot_kd_photons(OutputPath, timestamp, target_beam_ids, subsurface_photon_dataset, Kd_DF_MergedDistance): - #filter beam type +def plot_kd_photons( + OutputPath, + timestamp, + target_beam_ids, + subsurface_photon_dataset, + Kd_DF_MergedDistance, +): + # filter beam type # photon_dataset - subsurface_photon_dataset = subsurface_photon_dataset[subsurface_photon_dataset['beam_id'].isin(target_beam_ids)] - Kd_DF_MergedDistance = 
Kd_DF_MergedDistance[Kd_DF_MergedDistance['beam_id'].isin(target_beam_ids)] - + subsurface_photon_dataset = subsurface_photon_dataset[ + subsurface_photon_dataset["beam_id"].isin(target_beam_ids) + ] + Kd_DF_MergedDistance = Kd_DF_MergedDistance[ + Kd_DF_MergedDistance["beam_id"].isin(target_beam_ids) + ] + hlims = [-45, 5] fig, ax1 = plt.subplots(figsize=(10, 6)) - ax1.scatter(subsurface_photon_dataset['relative_AT_dist'], subsurface_photon_dataset['photon_height'], - s=1.5, c='k', alpha=0.2, edgecolors='none', label='Subsurface ATL03 Photon Height') - ax1.set_xlabel('Relative Along-Track Distance', fontsize=18) - ax1.tick_params(axis='x', labelsize=18) # Added line for x-tick label fontsize - ax1.set_ylabel('Photon Height', color='b', fontsize=18) - ax1.tick_params(axis='y', labelcolor='b', labelsize=18) - if 'seafloor_elevation' in subsurface_photon_dataset.columns: - ax1.plot(subsurface_photon_dataset['relative_AT_dist'], subsurface_photon_dataset['seafloor_elevation'], - linewidth=0.8, c='b', alpha=0.4, label='Seafloor Elevation') + ax1.scatter( + subsurface_photon_dataset["relative_AT_dist"], + subsurface_photon_dataset["photon_height"], + s=1.5, + c="k", + alpha=0.2, + edgecolors="none", + label="Subsurface ATL03 Photon Height", + ) + ax1.set_xlabel("Relative Along-Track Distance", fontsize=18) + ax1.tick_params(axis="x", labelsize=18) # Added line for x-tick label fontsize + ax1.set_ylabel("Photon Height", color="b", fontsize=18) + ax1.tick_params(axis="y", labelcolor="b", labelsize=18) + if "seafloor_elevation" in subsurface_photon_dataset.columns: + ax1.plot( + subsurface_photon_dataset["relative_AT_dist"], + subsurface_photon_dataset["seafloor_elevation"], + linewidth=0.8, + c="b", + alpha=0.4, + label="Seafloor Elevation", + ) # ax1.axhline(y=-6, color='blue', linestyle='--', label='y=-6 m') # ax1.set_xlim([1800, 2100]) # Set x-axis limits ax1.set_ylim(hlims) - + ax2 = ax1.twinx() - ax2.scatter(Kd_DF_MergedDistance['relative_AT_dist'], 
Kd_DF_MergedDistance['kd'], - label='Kd values', color='r', alpha=0.6) - ax2.set_ylabel('Kd Value', color='r', fontsize=18) - ax2.tick_params(axis='y', labelcolor='r',labelsize=18) + ax2.scatter( + Kd_DF_MergedDistance["relative_AT_dist"], + Kd_DF_MergedDistance["kd"], + label="Kd values", + color="r", + alpha=0.6, + ) + ax2.set_ylabel("Kd Value", color="r", fontsize=18) + ax2.tick_params(axis="y", labelcolor="r", labelsize=18) # fig.suptitle('Photon Height and Kd Values along Relative Along-Track Distance') handles1, labels1 = ax1.get_legend_handles_labels() handles2, labels2 = ax2.get_legend_handles_labels() - fig.legend(handles1 + handles2, labels1 + labels2, loc='upper right', bbox_to_anchor=(0.85, 0.85)) + fig.legend( + handles1 + handles2, + labels1 + labels2, + loc="upper right", + bbox_to_anchor=(0.85, 0.85), + ) # fig.legend(handles1 + handles2, labels1 + labels2, loc='lower left', bbox_to_anchor=(0.08, 0.08)) fig.tight_layout() - plt.savefig(os.path.join(OutputPath, f'{timestamp}_IS2_subsurface_kd_2.jpg'), dpi=400, format='jpeg') + plt.savefig( + os.path.join(OutputPath, f"{timestamp}_IS2_subsurface_kd_2.jpg"), + dpi=400, + format="jpeg", + ) plt.show() - -def plot_bin_polygon_data(lat, height, seafloor_height, mask, lat_bins, height_bins, valid_bins, polygons, y_min, y_max): +def plot_bin_polygon_data( + lat, + height, + seafloor_height, + mask, + lat_bins, + height_bins, + valid_bins, + polygons, + y_min, + y_max, +): plt.figure(figsize=(12, 6)) plt.subplot(1, 2, 1) plt.title("Original Height Data") - plt.scatter(lat, height, c=height, cmap='viridis', s=10) - plt.plot(np.sort(lat), seafloor_height[np.argsort(lat)], 'r-', label='Seafloor') - plt.colorbar(label='Height') - plt.xlabel('Latitude') - plt.ylabel('Height') + plt.scatter(lat, height, c=height, cmap="viridis", s=10) + plt.plot(np.sort(lat), seafloor_height[np.argsort(lat)], "r-", label="Seafloor") + plt.colorbar(label="Height") + plt.xlabel("Latitude") + plt.ylabel("Height") plt.ylim(y_min, 
y_max) plt.legend() plt.subplot(1, 2, 2) plt.title("Masked Region (ROI)") - plt.scatter(lat[mask], height[mask], c=height[mask], cmap='viridis', s=10) - plt.plot(np.sort(lat), seafloor_height[np.argsort(lat)], 'r-', label='Seafloor') + plt.scatter(lat[mask], height[mask], c=height[mask], cmap="viridis", s=10) + plt.plot(np.sort(lat), seafloor_height[np.argsort(lat)], "r-", label="Seafloor") for polygon in polygons: - plt.gca().add_patch(plt.Polygon(polygon, edgecolor='red', facecolor='none', linewidth=1)) + plt.gca().add_patch( + plt.Polygon(polygon, edgecolor="red", facecolor="none", linewidth=1) + ) for lb in lat_bins: - plt.axvline(x=lb, color='gray', linestyle='--', linewidth=0.5) + plt.axvline(x=lb, color="gray", linestyle="--", linewidth=0.5) for hb in height_bins: - plt.axhline(y=hb, color='gray', linestyle='--', linewidth=0.5) + plt.axhline(y=hb, color="gray", linestyle="--", linewidth=0.5) - plt.colorbar(label='Height') - plt.xlabel('Latitude') - plt.ylabel('Height') + plt.colorbar(label="Height") + plt.xlabel("Latitude") + plt.ylabel("Height") plt.ylim(y_min, y_max) plt.legend() plt.tight_layout() plt.show() -# -def produce_figures(binned_data, bath_height, sea_height, solo_sea_surface_label, - y_limit_top, y_limit_bottom, percentile, file, geo_df, - ref_y, ref_z, beam, epsg_num): + +def produce_figures( + binned_data, + bath_height, + sea_height, + solo_sea_surface_label, + y_limit_top, + y_limit_bottom, + percentile, + file, + geo_df, + ref_y, + ref_z, + beam, + epsg_num, +): """Create figures""" # Create bins for latitude - bath_x_axis_bins = np.linspace(binned_data.lat.min(), - binned_data.lat.max(), len(bath_height))+20 + bath_x_axis_bins = ( + np.linspace(binned_data.lat.min(), binned_data.lat.max(), len(bath_height)) + 20 + ) - sea_surface_x_axis_bins = np.linspace(binned_data.lat.min(), - binned_data.lat.max(), len(sea_height))+10 + sea_surface_x_axis_bins = ( + np.linspace(binned_data.lat.min(), binned_data.lat.max(), len(sea_height)) + 10 + ) # 
Create new dataframes for median values - bath_median_df = pd.DataFrame({'x': bath_x_axis_bins, 'y': bath_height}) + bath_median_df = pd.DataFrame({"x": bath_x_axis_bins, "y": bath_height}) # Create uniform sea surface based on median sea surface values and filter out surface breaching sea_height1 = [np.nanmedian(sea_height) if i == i else np.nan for i in sea_height] - sea_median_df = pd.DataFrame({'x': sea_surface_x_axis_bins, 'y': sea_height1}) + sea_median_df = pd.DataFrame({"x": sea_surface_x_axis_bins, "y": sea_height1}) # Create uniform solo sea surface label sea_surface_label = solo_sea_surface_label - sea_surface_label_df = pd.DataFrame({'x': sea_surface_x_axis_bins, 'y': sea_surface_label}) + sea_surface_label_df = pd.DataFrame( + {"x": sea_surface_x_axis_bins, "y": sea_surface_label} + ) idx_1 = np.where(sea_surface_label_df.y == 1) idx_0 = np.where(sea_surface_label_df.y == 0) @@ -253,65 +386,110 @@ def produce_figures(binned_data, bath_height, sea_height, solo_sea_surface_label # plt.scatter(x=binned_data.lat, # y = binned_data.photon_height, marker='o', lw=0, s=1, alpha = 0.8, # c = 'yellow', label = 'Raw photon height') - plt.scatter(ref_y, ref_z, s=0.5, alpha=0.1, c='black') - plt.scatter(geo_df.lat, geo_df.photon_height, s=0.8, marker = 'o', - alpha=0.1, c='red', label='Classified Photons') + plt.scatter(ref_y, ref_z, s=0.5, alpha=0.1, c="black") + plt.scatter( + geo_df.lat, + geo_df.photon_height, + s=0.8, + marker="o", + alpha=0.1, + c="red", + label="Classified Photons", + ) # plt.scatter(x=geo_df.lat, # y = geo_df.photon_height, marker='o', lw=0, s=0.8, # alpha = 0.8, c = 'black', label = 'Corrected photon bin') # Plot median values - plt.scatter(bath_median_df.x, bath_median_df.y, - marker='o', c='r', alpha=0.8, s=2, label='Median bathymetry') - - plt.scatter(sea_median_df.x, sea_median_df.y, - marker='o', c='b', alpha=1, s=2, label='Median sea surface') - - plt.scatter(sea_surface_label_df.iloc[idx_1].x, sea_surface_label_df.iloc[idx_1].y, - 
marker='o', c='pink', alpha=1, s=3, label='solo_sea_surface') - plt.scatter(sea_surface_label_df.iloc[idx_0].x, sea_surface_label_df.iloc[idx_0].y, - marker='o', c='g', alpha=1, s=3, label='non_solo_sea_surface') + plt.scatter( + bath_median_df.x, + bath_median_df.y, + marker="o", + c="r", + alpha=0.8, + s=2, + label="Median bathymetry", + ) + + plt.scatter( + sea_median_df.x, + sea_median_df.y, + marker="o", + c="b", + alpha=1, + s=2, + label="Median sea surface", + ) + + plt.scatter( + sea_surface_label_df.iloc[idx_1].x, + sea_surface_label_df.iloc[idx_1].y, + marker="o", + c="pink", + alpha=1, + s=3, + label="solo_sea_surface", + ) + plt.scatter( + sea_surface_label_df.iloc[idx_0].x, + sea_surface_label_df.iloc[idx_0].y, + marker="o", + c="g", + alpha=1, + s=3, + label="non_solo_sea_surface", + ) # Insert titles and subtitles - plt.title('Icesat2 Bathymetry\n' + file) - plt.xlabel('Latitude', fontsize=25) - plt.ylabel('Photon Height (m)', fontsize=25) + plt.title("Icesat2 Bathymetry\n" + file) + plt.xlabel("Latitude", fontsize=25) + plt.ylabel("Photon Height (m)", fontsize=25) plt.xticks(fontsize=16) plt.yticks(fontsize=16) - plt.legend(loc="upper left", prop={'size': 20}) + plt.legend(loc="upper left", prop={"size": 20}) # Limit the x and y axes using parameters plt.xlim(left=binned_data.lat.min(), right=binned_data.lat.max()) plt.ylim(top=y_limit_top, bottom=y_limit_bottom) timestr = time.strftime("%Y%m%d_%H%M%S") - file = file.replace('.h5', '') + file = file.replace(".h5", "") # Define where to save file plt.tight_layout() - plt.savefig("C:/Workstation/ICESat2_HLS/" + file + '_gt' + - str(beam) + '_' + str(percentile) + - '_EPSG' + str(epsg_num) + '_' + timestr + ".pdf") + plt.savefig( + "C:/Workstation/ICESat2_HLS/" + + file + + "_gt" + + str(beam) + + "_" + + str(percentile) + + "_EPSG" + + str(epsg_num) + + "_" + + timestr + + ".pdf" + ) # plt.show() # plt.close() # convert corrected locations back to wgs84 (useful to contain) - transformer = 
Transformer.from_crs("EPSG:" + str(epsg_num), - "EPSG:4326", always_xy=True) + transformer = Transformer.from_crs( + "EPSG:" + str(epsg_num), "EPSG:4326", always_xy=True + ) print(transformer) - lon_wgs84, lat_wgs84 = transformer.transform( - geo_df.lon.values, geo_df.lat.values) + lon_wgs84, lat_wgs84 = transformer.transform(geo_df.lon.values, geo_df.lat.values) - geo_df['lon_wgs84'] = lon_wgs84 - geo_df['lat_wgs84'] = lat_wgs84 + geo_df["lon_wgs84"] = lon_wgs84 + geo_df["lat_wgs84"] = lat_wgs84 - geodf = gpd.GeoDataFrame(geo_df, - geometry=gpd.points_from_xy(geo_df.lon_wgs84, - geo_df.lat_wgs84)) + geodf = gpd.GeoDataFrame( + geo_df, geometry=gpd.points_from_xy(geo_df.lon_wgs84, geo_df.lat_wgs84) + ) geodf.set_crs(epsg=4326, inplace=True) # geodf.to_file("C:/Workstation/ICESat2_HLS/" +file + '_gt' + '_' + str(percentile) + '_EPSG' + # str(epsg_num) + '_' + timestr + ".gpkg", - # driver="GPKG") \ No newline at end of file + # driver="GPKG") From d12d38bd2324ac9ee3eef631c4fa9d6edd5f9929 Mon Sep 17 00:00:00 2001 From: Chao Date: Mon, 30 Mar 2026 16:10:41 -0400 Subject: [PATCH 08/11] move config.py and main.py to aok/core and remove unused legacy modules Co-Authored-By: Claude Opus 4.6 --- aok/core/{kd_utils => }/config.py | 0 aok/core/kd_utils/SeaSurfaceFlattening.py | 601 -------- .../kd_utils/SeafloorFilterByGEBCO_Module.py | 122 -- .../IS2_BG_Rate_Land_Ocean_plot.py | 780 ---------- .../analyze_nighttime_BG_rate_icesat2.py | 820 ---------- .../analyze_solarbckgrd_icesat2.py | 785 ---------- aok/core/kd_utils/SolarBckgrd/config.py | 119 -- aok/core/kd_utils/kd_utils.py | 1358 ----------------- aok/core/kd_utils/sliderule_adapter.py | 222 --- aok/core/{kd_utils => }/main.py | 0 10 files changed, 4807 deletions(-) rename aok/core/{kd_utils => }/config.py (100%) delete mode 100644 aok/core/kd_utils/SeaSurfaceFlattening.py delete mode 100644 aok/core/kd_utils/SeafloorFilterByGEBCO_Module.py delete mode 100644 
aok/core/kd_utils/SolarBckgrd/IS2_BG_Rate_Land_Ocean_plot.py delete mode 100644 aok/core/kd_utils/SolarBckgrd/analyze_nighttime_BG_rate_icesat2.py delete mode 100644 aok/core/kd_utils/SolarBckgrd/analyze_solarbckgrd_icesat2.py delete mode 100644 aok/core/kd_utils/SolarBckgrd/config.py delete mode 100644 aok/core/kd_utils/kd_utils.py delete mode 100644 aok/core/kd_utils/sliderule_adapter.py rename aok/core/{kd_utils => }/main.py (100%) diff --git a/aok/core/kd_utils/config.py b/aok/core/config.py similarity index 100% rename from aok/core/kd_utils/config.py rename to aok/core/config.py diff --git a/aok/core/kd_utils/SeaSurfaceFlattening.py b/aok/core/kd_utils/SeaSurfaceFlattening.py deleted file mode 100644 index 1c02e21..0000000 --- a/aok/core/kd_utils/SeaSurfaceFlattening.py +++ /dev/null @@ -1,601 +0,0 @@ -""" -Created on Thu Jul 25 13:25:53 2024 - -@author: wayne -""" - -import scipy.interpolate - -from kd_utils import * - -path = "C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/Dataset/ATL03_ICESat2/" -ATL03_h5_file = "processed_ATL03_20220122044818_04721407_006_01.h5" -ATL03_h5_file_path = path + ATL03_h5_file - - -# Set the shoreline data path -Current_Path = "C:/Workstation/ICESat2_HLS/Kd_ComparisionPaper/Dataset/" - -# load the shoreline package -shoreline_data_path = Current_Path + "Shorelines/GeoPkgGlobalShoreline.gpkg" - - -# speed of light -c = 299792458.0 - -# bin data first based on horizontal resolution -horizontal_res = 10 - -# vertical resolution -# vertical_res=0.1 -vertical_res = 0.25 -# vertical_res=1 - -# sea subsurface thresh -# determine subsurface by setting how depth below the sea surface -subsurface_thresh = 0.5 - - -# define specific rectangular target zone for batch analysis -# i.e., From subsurface depth 0.5 to specific KdMaxDepth depth (e.g, 6) -Kd_max_depth = 6 - - -######################################## -# readATL03_github -IS2_atl03_mds, IS2_atl03_attrs, IS2_atl03_beams = read_granule( - ATL03_h5_file_path, ATTRIBUTES=True -) - -# 
extract parameters from ICESat-2 ATLAS HDF5 file name -rx = re.compile( - r"(processed_)?(ATL\d{2})_(\d{4})(\d{2})(\d{2})(\d{2})" - r"(\d{2})(\d{2})_(\d{4})(\d{2})(\d{2})_(\d{3})_(\d{2})(.*?).h5$" -) -SUB, PRD, YY, MM, DD, HH, MN, SS, TRK, CYCL, GRAN, RL, VERS, AUX = rx.findall( - ATL03_h5_file_path -).pop() - - -# initialize data storage variables -# variables of interest for generating three types classification -Segment_ID = {} -Segment_Index_begin = {} -Segment_PE_count = {} - -Equator_Segment_Distance = {} -Segment_Length = {} - -Segment_Is_Land = {} - -# mean geolocation, height and delta time -Segment_Lon = {} -Segment_Lat = {} -Segment_Elev = {} -Segment_Time = {} - -Segment_ref_elev = {} -Segment_ref_azimuth = {} - -# relative_AT_dist = {} -# relative_seg_dist = {} - -# background photon rate -background_rate = {} -background_counts = {} - - -######################################## -# proc02_manually_select_water_forGitHub -# remove water surface based on simple 0.5 m - - -# raw beam-by-beam photon data after cutoff below the max value 0.5 below peak -IS2_atl03Cut50cmBelowPeak = {} - -# for each input beam within the file -for gtx in sorted(IS2_atl03_beams[0:1]): - # for gtx in sorted(IS2_atl03_beams): - # data and attributes for beam gtx - IS2_val = IS2_atl03_mds[gtx] - IS2_attrs = IS2_atl03_attrs[gtx] - - # ATL03 Segment ID - Segment_ID[gtx] = IS2_val["geolocation"]["segment_id"] - # number of valid overlapping ATL03 segments - n_seg = len(Segment_ID[gtx]) - # number of photon events - (n_pe,) = IS2_val["heights"]["delta_time"].shape - - # first photon ID (1-based) in each segment (convert to 0-based indexing) - Segment_Index_begin[gtx] = IS2_val["geolocation"]["ph_index_beg"] - 1 - - # number of photon events in the segment - Segment_PE_count[gtx] = IS2_val["geolocation"]["segment_ph_cnt"] - - # along-track distance from the equator crossing to the start of each ATL03 20 meter geolocation segment - Equator_Segment_Distance[gtx] = 
IS2_val["geolocation"]["segment_dist_x"] - - # along-track length for each ATL03 segment - Segment_Length[gtx] = IS2_val["geolocation"]["segment_length"] - - # Transmit time of the reference photon - delta_time = IS2_val["geolocation"]["delta_time"] - - # get geolocation lat/lon - segment_lat = IS2_val["geolocation"]["reference_photon_lat"][:].copy() - segment_lon = IS2_val["geolocation"]["reference_photon_lon"][:].copy() - - # get parameters ref elev and azimuth for refraction correction - ref_elev = IS2_val["geolocation"]["ref_elev"][:].copy() - ref_azimuth = IS2_val["geolocation"]["ref_azimuth"][:].copy() - - # get geoid for converting WGS84 ellipsoid to geoid correction - geoid = IS2_val["geophys_corr"]["geoid"][:].copy() - - # photon event heights - h_ph = IS2_val["heights"]["h_ph"][:].copy() - lat_ph = IS2_val["heights"]["lat_ph"][:].copy() - lon_ph = IS2_val["heights"]["lon_ph"][:].copy() - # dist_ph_along = IS2_val['heights']['dist_ph_along'][:].copy() - signal_conf_photon = IS2_val["heights"]["signal_conf_ph"][..., 0].copy() - - # along-track and across-track distance for photon events - x_atc = IS2_val["heights"]["dist_ph_along"][:].copy() - y_atc = IS2_val["heights"]["dist_ph_across"][:].copy() - - # photon quality - quality_ph = IS2_val["heights"]["quality_ph"] - - # calculate along track distance - for seg_index in range(n_seg): - # index for 20m segment j - idx = Segment_Index_begin[gtx][seg_index] - # number of photons in 20m segment - cnt = Segment_PE_count[gtx][seg_index] - # add segment distance to along-track coordinates - x_atc[idx : idx + cnt] += Equator_Segment_Distance[gtx][seg_index] - - # calculate along track distance relative to the beginning of the cut segment - relative_AT_dist = (x_atc - x_atc[0]) / 1000 - - relative_seg_dist = ( - Equator_Segment_Distance[gtx] - Equator_Segment_Distance[gtx][0] - ) / 1000 - - # this function is a significant slowdown without pygeos - Segment_Is_Land["geometry"] = gpd.points_from_xy(segment_lon, 
segment_lat) - - # create a geo dataframe - ICESat2_GDF = gpd.GeoDataFrame(Segment_Is_Land, crs="EPSG:4326") - - # Step1: isolate water using landmask - # spatial joint to determine land/sea mask - Segment_Is_Land_Labels = isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF) - - # # set the labels back to ICESat2 geopandas dataframe - ICESat2_GDF.loc[:, "is_land"] = Segment_Is_Land_Labels - - # interpolate is_land labels based on photon lat - island_interp1d_model = scipy.interpolate.interp1d( - segment_lat[:], Segment_Is_Land_Labels[:], fill_value="extrapolate" - ) - - # Apply interpid model - is_land_label_interp1d = island_interp1d_model(lat_ph[:]) - - # Ref_elev on a per photon level (assign seg ref_elev to photons) - ref_elev_interp1d_model = scipy.interpolate.interp1d( - segment_lat[:], ref_elev[:], fill_value="extrapolate" - ) - # apply interpid model - ph_ref_elev = ref_elev_interp1d_model(lat_ph[:]) - - # Ref_azimuth on a per photon level (assign seg ref_azimuth to photons) - ref_azimuth_interp1d_model = scipy.interpolate.interp1d( - segment_lat[:], ref_azimuth[:], fill_value="extrapolate" - ) - - # apply interpid model - ph_ref_azimuth = ref_azimuth_interp1d_model(lat_ph[:]) - - # interpolate geoid based on segment lat - geoid_interp1d_model = scipy.interpolate.interp1d( - segment_lat[:], geoid[:], fill_value="extrapolate" - ) - - # Apply interpid model - ph_geoid = geoid_interp1d_model(lat_ph[:]) - - # geoid correct - h_ph_geoid_cor = h_ph[:] - ph_geoid[:] - - # Find the epsg code - epsg_code = convert_wgs_to_utm(lon_ph[0], lat_ph[0]) - epsg_num = int(epsg_code.split(":")[-1]) - - # Orthometrically correct the data using the epsg code - lat_utm, lon_utm, h_ph_cor = orthometric_correction(lat_ph, lon_ph, h_ph, epsg_code) - - # Aggregate data into dataframe - sea_photon_dataset = pd.DataFrame( - { - "latitude": lat_ph, - "longitude": lon_ph, - "lat": lat_utm, - "lon": lon_utm, - "photon_height": h_ph_geoid_cor, - "quality_ph": quality_ph, - 
"is_land_label": is_land_label_interp1d, - "photon_conf": signal_conf_photon, - "ref_elevation": ph_ref_elev, - "ref_azimuth": ph_ref_azimuth, - "relative_AT_dist": relative_AT_dist, - }, - columns=[ - "latitude", - "longitude", - "lat", - "lon", - "photon_height", - "quality_ph", - "is_land_label", - "photon_conf", - "ref_elevation", - "ref_azimuth", - "relative_AT_dist", - ], - ) - - # #remove the saturated photons - # sea_photon_dataset=sea_photon_dataset[sea_photon_dataset['quality_ph'] < 1] - - # remove the land photons - sea_photon_dataset = sea_photon_dataset[sea_photon_dataset["is_land_label"] != 1] - - # bin the data by 25 cm - binned_dataset_sea_surface = horizontal_vertical_bin_dataset( - sea_photon_dataset, horizontal_res, vertical_res - ) - - # get sea surface height - # threshold to determine how much depth below sea surface will be accounted - # Output: - # ->final_sea_surface_height - # ->sea_surface_height_abnormal_label - # ->sea_surface_dominated_label - ( - sea_surface_height, - sea_surface_label, - solo_sea_surface_label, - subsurface_photon_dataset, - ) = get_sea_surface_height(binned_dataset_sea_surface, subsurface_thresh) - - -sea_surface_height.reverse() - - -#################################################################### -# Tasks: Sea Surface flattening -# I have a list value of sea_surface_height over each 10 m bin, I would like to -# 1) calculate the mean sea_surface_height value along each big bin with 500 m ; -# 2) Within each 10-m along track increment, -# adjust all of the photons (i.e., subsurface_photon_dataset) within that 10-m column either up or down -# based on the deviation of the local sea surface height from the mean sea surface height - -###################### -# Step 1: Creating a DataFrame for sea_surface_height with a corresponding 10 m small bin index -# each element in sea_surface_height corresponds to a 10 m bin -df_sea_surface = pd.DataFrame( - { - "sea_surface_height": sea_surface_height, # sea_surface_height 
is provided as a list - "small_bins": np.arange(len(sea_surface_height)), # Index for each 10 m bin - } -) - -# # Map 'small_bins'(i.e., 'lat_bins') to 500 m 'big_bins' -df_sea_surface["big_bins"] = ( - df_sea_surface["small_bins"] // 50 -) # 50 10 m bins in each 500 m bin - - -###################### -# Step 2: Calculate Mean Sea_surface_height value for Each 500-m Big Bin -mean_sea_surface_elevation_bigBin = ( - df_sea_surface.groupby("big_bins")["sea_surface_height"].mean().reset_index() -) -mean_sea_surface_elevation_bigBin.rename( - columns={"sea_surface_height": "mean_sea_surface_height_big_bins"}, inplace=True -) - -# Prepare subsurface_photon_dataset for merging -subsurface_photon_dataset["big_bins"] = ( - subsurface_photon_dataset["lat_bins"].astype(int) // 50 -) - -# Merge the mean elevation data with the subsurface_photon_dataset -subsurface_photon_dataset = subsurface_photon_dataset.merge( - mean_sea_surface_elevation_bigBin, on="big_bins", how="left" -) - -# Merge sea_surface_height data with subsurface_photon_dataset based on the small_bins -subsurface_photon_dataset = subsurface_photon_dataset.merge( - df_sea_surface[["small_bins", "sea_surface_height"]], - left_on="lat_bins", - right_on="small_bins", - how="left", -) - - -####################### -# Step 3: Within each 10 m small bin, adjust the photon heights -# based on the deviation between their sea surface height at each small bin and the mean sea surface height within each 500-m bin -subsurface_photon_dataset["adjusted_photon_height"] = subsurface_photon_dataset[ - "photon_height" -] - ( - subsurface_photon_dataset["mean_sea_surface_height_big_bins"] - - subsurface_photon_dataset["sea_surface_height"] -) - -subsurface_photon_dataset["Dif_photon_height"] = ( - subsurface_photon_dataset["mean_sea_surface_height_big_bins"] - - subsurface_photon_dataset["sea_surface_height"] -) -###################################### -################################### -###################################### 
-################################### -###Visualization - -from matplotlib.patches import Rectangle -import matplotlib.pyplot as plt -import matplotlib.transforms as mtransforms -from mpl_toolkits.axes_grid1.inset_locator import inset_axes - -sea_surface_x_axis_bins = np.linspace( - relative_seg_dist.min(), relative_seg_dist.max(), len(sea_surface_height) -) - - -mean_sea_surface_x_axis_bins = np.linspace( - relative_seg_dist.min(), - relative_seg_dist.max(), - len(mean_sea_surface_elevation_bigBin), -) - - -# Adjusting the global font size -plt.rcParams["font.size"] = 18 # You can choose your own font size here - -# Define your zoom area limits for the x-axis -zoom_xlim = (10.5, 11.5) # for example, from 10 to 20 on your x-axis - -# Filter the data based on your x-axis limits -# Here, we're creating a boolean index that matches your zoom criteria -x_index_within_zoom = (sea_photon_dataset.relative_AT_dist >= zoom_xlim[0]) & ( - sea_photon_dataset.relative_AT_dist <= zoom_xlim[1] -) - -# Apply the index to your x and y data -zoomed_relative_AT_dist = sea_photon_dataset.relative_AT_dist[x_index_within_zoom] -zoomed_photon_height = sea_photon_dataset.photon_height[x_index_within_zoom] - -# get the x index for adjusted subsurface_photon -subsurface_x_index_within_zoom = ( - subsurface_photon_dataset.relative_AT_dist >= zoom_xlim[0] -) & (subsurface_photon_dataset.relative_AT_dist <= zoom_xlim[1]) -subsurface_zoomed_relative_AT_dist = subsurface_photon_dataset.relative_AT_dist[ - subsurface_x_index_within_zoom -] -zoomed_photon_height_adjusted = subsurface_photon_dataset.adjusted_photon_height[ - subsurface_x_index_within_zoom -] -zoomed_photon_height_Dif = subsurface_photon_dataset.Dif_photon_height[ - subsurface_x_index_within_zoom -] - - -# Now proceed with creating your main plot -hlims = [-40, 5] # height limits -sub_hlims = [-10, 3] # height limits - - -######################### -fig = plt.figure(figsize=(16, 8), dpi=300, facecolor="w", edgecolor="k") -ax = 
fig.add_subplot(111) -ax.set_xlabel("Distance From Start of Track (km)") -ax.set_ylabel("Photon Height (m)") - -ax.scatter( - sea_photon_dataset.relative_AT_dist, - sea_photon_dataset.photon_height, - s=10, - c="k", - alpha=0.15, - edgecolors="none", - label="ATL03 Photons", -) -ax.plot( - sea_surface_x_axis_bins, - sea_surface_height, - linewidth=0.8, - color="b", - label="Surface Peak", -) -ax.plot( - sea_surface_x_axis_bins, - [x - 0.5 for x in sea_surface_height], - linewidth=0.8, - color="#DD571C", - label="0.5 m Below Surface Peak", -) -ax.plot( - sea_surface_x_axis_bins, - [x - 1 for x in sea_surface_height], - linewidth=0.8, - color="#FDA172", - label="1 m Below Surface Peak", -) -ax.plot( - sea_surface_x_axis_bins, - [x - 2 for x in sea_surface_height], - linewidth=0.8, - color="#FCAE1E", - label="2 m Below Surface Peak", -) - -# Create an inset axis for the zoomed area -# axins = inset_axes(ax, width=5, height=3, loc='lower left') # adjust the location as needed - -# Assuming 'ax' is your main axes -trans = mtransforms.blended_transform_factory(ax.figure.transFigure, ax.transAxes) -axins = inset_axes( - ax, - width=5, - height=3, - loc=3, - bbox_to_anchor=( - 0.2, - 0.1, - 0.4, - 0.4, - ), # The numbers are x0, y0, width, and height in percentages of the figure size. 
- bbox_transform=trans, - borderpad=0, -) - -# Plot the filtered data on the inset axes -axins.scatter( - zoomed_relative_AT_dist, - zoomed_photon_height, - s=10, - c="k", - alpha=0.15, - edgecolors="none", - label="Zoomed ATL03 Photons", -) - -axins.scatter( - subsurface_zoomed_relative_AT_dist, - zoomed_photon_height_adjusted, - s=10, - c="r", - alpha=0.15, - edgecolors="none", - label="Zoomed Adjusted ATL03 Photons", -) - - -axins.scatter( - subsurface_zoomed_relative_AT_dist, - zoomed_photon_height_Dif, - s=20, - c="g", - alpha=0.15, - edgecolors="none", - label="Zoomed Dif ATL03 Photons", -) - -# Add horizontal line at y=0 -axins.axhline(y=0, color="blue", alpha=0.15, linestyle="--", linewidth=1) - -# # For the lines, you might need to adjust your data arrays to match the zoomed range -# # assuming your sea_surface_x_axis_bins aligns directly with your sea_surface_height data -# zoomed_sea_surface_x = [x for x in sea_surface_x_axis_bins if zoom_xlim[0] <= x <= zoom_xlim[1]] -# zoomed_sea_surface_height = sea_surface_height[:len(zoomed_sea_surface_x)] # adjust based on your data alignment - -# First, we need to find the indices of the items in sea_surface_x_axis_bins that fall within the zoomed range. -zoomed_indices = [ - index - for index, x in enumerate(sea_surface_x_axis_bins) - if zoom_xlim[0] <= x <= zoom_xlim[1] -] - -# Then, use these indices to select the corresponding items from both sea_surface_x_axis_bins and sea_surface_height. 
-zoomed_sea_surface_x = [sea_surface_x_axis_bins[i] for i in zoomed_indices] -zoomed_sea_surface_height = [sea_surface_height[i] for i in zoomed_indices] - - -# 0197f6,#BE5504,#FA8128 -axins.plot( - zoomed_sea_surface_x, - zoomed_sea_surface_height, - linewidth=1, - color="b", - label="Zoomed Surface Peak", -) -axins.plot( - zoomed_sea_surface_x, - [x - 0.5 for x in zoomed_sea_surface_height], - linewidth=1, - color="#DD571C", - label="0.5 m Below Zoomed Surface Peak", -) -axins.plot( - zoomed_sea_surface_x, - [x - 1 for x in zoomed_sea_surface_height], - linewidth=1, - color="#FDA172", - label="1 m Below Zoomed Surface Peak", -) -axins.plot( - zoomed_sea_surface_x, - [x - 2 for x in zoomed_sea_surface_height], - linewidth=1, - color="#FCAE1E", - label="2 m Below Zoomed Surface Peak", -) - -# mean sea surface height at 500m bins -axins.plot( - mean_sea_surface_x_axis_bins, - mean_sea_surface_elevation_bigBin, - linewidth=1, - color="r", - label="Zoomed Surface Peak 500m bin", -) - - -# Set the x limits for the inset axes; y limits remain unchanged -axins.set_xlim(zoom_xlim) -axins.set_ylim(sub_hlims) - -axins.set_title("Zoomed in") - - -# Adding the rectangle indicating the zoomed area -# Draw a rectangle on the main plot to indicate the zoomed area -# We use the data coordinates for the rectangle, as it needs to correspond to the actual data points on the main axes. 
-rec_hlims = [-10, 3] -rectangle = Rectangle( - (zoom_xlim[0], rec_hlims[0]), - zoom_xlim[1] - zoom_xlim[0], - rec_hlims[1] - rec_hlims[0], - linewidth=1, - edgecolor="r", - facecolor="none", - linestyle="--", -) -ax.add_patch(rectangle) - - -# Finalize the main plot adjustments -ax.legend(loc="lower right") -ax.set_ylim(hlims) - -plt.savefig( - path - + "/" - + YY - + MM - + DD - + "Track" - + TRK - + "Plot" - + gtx - + "_With_depth_colored.jpg", - dpi=500, - bbox_inches="tight", -) -# Display the plot -plt.show() diff --git a/aok/core/kd_utils/SeafloorFilterByGEBCO_Module.py b/aok/core/kd_utils/SeafloorFilterByGEBCO_Module.py deleted file mode 100644 index 68234ae..0000000 --- a/aok/core/kd_utils/SeafloorFilterByGEBCO_Module.py +++ /dev/null @@ -1,122 +0,0 @@ -import numpy as np -import rasterio -from rtree import index -from shapely.geometry import box - - -# 1. Function to create an R-tree spatial index for raster bounds -def create_spatial_index(gebco_paths): - """ - Create an R-tree spatial index for raster bounds to quickly find relevant rasters. - - Parameters: - gebco_paths (list): List of paths to GEBCO raster files. - - Returns: - raster_data_dict (dict): Dictionary containing loaded rasters and their respective data. - spatial_index (rtree.index.Index): R-tree spatial index for raster bounds. - """ - idx = index.Index() - raster_data_dict = {} - for i, path in enumerate(gebco_paths): - with rasterio.Env(GTIFF_SRS_SOURCE="EPSG"): - gebco_raster = rasterio.open(path) - raster_data = gebco_raster.read(1) - raster_data_dict[path] = (gebco_raster, raster_data) - bounds = gebco_raster.bounds - idx.insert( - i, (bounds.left, bounds.bottom, bounds.right, bounds.top), obj=path - ) - return raster_data_dict, idx - - -# 2. 
Function to determine which rasters are needed for a given set of coordinates using spatial index in a batch manner -def get_relevant_rasters_using_index(lons, lats, raster_data_dict, spatial_index): - """ - Determine which raster datasets are relevant for the given coordinates using spatial index. - This function uses a more efficient approach by performing a bounding box query for batches of points. - - Parameters: - lons (numpy.ndarray): Array of longitudes. - lats (numpy.ndarray): Array of latitudes. - raster_data_dict (dict): Dictionary containing raster datasets and their data. - spatial_index (rtree.index.Index): R-tree spatial index for raster bounds. - - Returns: - relevant_rasters (list): List of relevant raster datasets and their respective data. - """ - # Create a bounding box that covers all points - min_lon, max_lon = lons.min(), lons.max() - min_lat, max_lat = lats.min(), lats.max() - bounding_box = box(min_lon, min_lat, max_lon, max_lat) - - # Get all rasters that intersect with the bounding box - matches = list(spatial_index.intersection((bounding_box.bounds), objects=True)) - relevant_paths = {match.object for match in matches} - relevant_rasters = [ - (raster_data_dict[path][0], raster_data_dict[path][1]) - for path in relevant_paths - ] - return relevant_rasters - - -# 3. Function to get seafloor elevation from the relevant rasters in a vectorized manner -def get_seafloor_elevation(lons, lats, relevant_rasters): - """ - Get seafloor elevations for a batch of points based on longitude and latitude from relevant rasters. - - Parameters: - lons (numpy.ndarray): Array of longitudes. - lats (numpy.ndarray): Array of latitudes. - relevant_rasters (list): List of relevant raster datasets and their respective data. - - Returns: - seafloor_elevations (numpy.ndarray): Array of seafloor elevations. 
- """ - points = np.vstack((lons, lats)).T - seafloor_elevations = np.full(len(lons), np.nan) - - for gebco_raster, raster_data in relevant_rasters: - # Use rasterio.sample to get values for multiple points in a batch - values = list(gebco_raster.sample(points)) - for idx, value in enumerate(values): - if np.isnan(seafloor_elevations[idx]) and value is not None: - seafloor_elevations[idx] = value[0] - - return seafloor_elevations - - -# 4. The main process function that ties everything together -def process_seafloor_data(gebco_paths, sea_photon_dataset): - """ - Main function to process the seafloor data. - - Parameters: - gebco_paths (list): List of paths to GEBCO raster files. - sea_photon_dataset (pandas.DataFrame): Dataset containing longitude, latitude, and photon height. - - Returns: - filtered_sea_photon_dataset (pandas.DataFrame): Filtered dataset containing points above the seafloor. - """ - # Create spatial index and load GEBCO Raster Data - raster_data_dict, spatial_index = create_spatial_index(gebco_paths) - - # Get the relevant rasters using spatial index - lons = sea_photon_dataset["longitude"].values - lats = sea_photon_dataset["latitude"].values - relevant_rasters = get_relevant_rasters_using_index( - lons, lats, raster_data_dict, spatial_index - ) - - # Get seafloor elevation for all points - sea_photon_dataset["seafloor_elevation"] = get_seafloor_elevation( - lons, lats, relevant_rasters - ) - - # Filter out points below the seafloor - filtered_sea_photon_dataset = sea_photon_dataset[ - sea_photon_dataset["photon_height"] > sea_photon_dataset["seafloor_elevation"] - ] - # filtered_sea_photon_dataset.drop(columns=['seafloor_elevation'], inplace=True) - - return filtered_sea_photon_dataset diff --git a/aok/core/kd_utils/SolarBckgrd/IS2_BG_Rate_Land_Ocean_plot.py b/aok/core/kd_utils/SolarBckgrd/IS2_BG_Rate_Land_Ocean_plot.py deleted file mode 100644 index e73f1f0..0000000 --- a/aok/core/kd_utils/SolarBckgrd/IS2_BG_Rate_Land_Ocean_plot.py +++ 
/dev/null @@ -1,780 +0,0 @@ -import logging -from multiprocessing import Pool -import os - -from config import get_args -import geopandas as gpd -import h5py -import matplotlib.pyplot as plt -from noise_utils.data_processing import isolate_sea_land_photons -import numpy as np -import pandas as pd -from scipy.interpolate import interp1d -from scipy.optimize import curve_fit - -# Set up logging -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -# Define model functions -def power_model(x, a, b, c): - return a * np.power(x, b) + c - - -def exp_model(x, a, b, c): - """Exponential model: a * exp(b * x) + c""" - return a * np.exp(b * x) + c - - -def twilight_model(x, a, b, c): - return a * np.exp(-b * x**2) + c # Gaussian form for twilight - - -def process_file(args): - """Process a single ATL03 file, skipping files missing required datasets.""" - file, beam, shoreline_data_path = args - try: - with h5py.File(file, "r") as f: - # Check for required datasets - required_keys = [ - f"{beam}/bckgrd_atlas/bckgrd_rate", - f"{beam}/bckgrd_atlas/delta_time", - f"{beam}/heights/delta_time", - f"{beam}/heights/h_ph", - f"{beam}/geolocation/reference_photon_lat", - f"{beam}/geolocation/reference_photon_lon", - f"{beam}/geolocation/solar_azimuth", - f"{beam}/geolocation/solar_elevation", - ] - for key in required_keys: - if key not in f: - raise KeyError(f"Missing dataset: {key}") - - bckgrd_rate = f[f"{beam}/bckgrd_atlas/bckgrd_rate"][:].astype(np.float64) - bckgrd_time = f[f"{beam}/bckgrd_atlas/delta_time"][:] - heights = f[f"{beam}/heights/h_ph"][:] - segment_lat = f[f"{beam}/geolocation/reference_photon_lat"][:] - segment_lon = f[f"{beam}/geolocation/reference_photon_lon"][:] - segment_delta_time = f[f"{beam}/geolocation/delta_time"][:] - - Segment_Is_Land = pd.DataFrame( - { - "latitude": segment_lat, - "longitude": segment_lon, - "delta_time": segment_delta_time, - } - ) - - # Create a GeoDataFrame with geometry - Segment_Is_Land["geometry"] = 
gpd.points_from_xy( - Segment_Is_Land["longitude"], Segment_Is_Land["latitude"] - ) - ICESat2_GDF = gpd.GeoDataFrame(Segment_Is_Land, crs="EPSG:4326") - - # Determine land/ocean classification - Segment_Is_Land_Labels = isolate_sea_land_photons( - shoreline_data_path, ICESat2_GDF - ) - ICESat2_GDF["is_land"] = Segment_Is_Land_Labels - - # Interpolate is_land labels to bckgrd_time - interp_is_land = interp1d( - ICESat2_GDF["delta_time"], - ICESat2_GDF["is_land"].astype(int), - bounds_error=False, - fill_value=( - ICESat2_GDF["is_land"].astype(int).iloc[0], - ICESat2_GDF["is_land"].astype(int).iloc[-1], - ), - ) - is_land_interpolated = interp_is_land(bckgrd_time).round().astype(int) - - # Compute total background rates for land and ocean - land_mask = is_land_interpolated == 1 - ocean_mask = is_land_interpolated == 0 - total_background_rate_land = ( - np.mean(bckgrd_rate[land_mask]) if np.sum(land_mask) > 0 else np.nan - ) - total_background_rate_ocean = ( - np.mean(bckgrd_rate[ocean_mask]) if np.sum(ocean_mask) > 0 else np.nan - ) - - # Solar elevation handling - FILL_VALUE = 3.4028235e38 - solar_elevations = f[f"{beam}/geolocation/solar_elevation"][:].astype( - np.float64 - ) - valid_solar = solar_elevations[ - (solar_elevations != FILL_VALUE) - & (np.abs(solar_elevations) < 1e30) - & np.isfinite(solar_elevations) - ] - solar_elevation = np.nan if valid_solar.size == 0 else np.mean(valid_solar) - - mean_photon_height = np.mean(heights) if heights.size > 0 else np.nan - - result = { - "file": os.path.basename(file), - "total_background_rate_land": total_background_rate_land, - "total_background_rate_ocean": total_background_rate_ocean, - "solar_elevation": solar_elevation, - "mean_photon_height": mean_photon_height, - } - logger.info(f"Processed {file}: {result}") - return result - except Exception as e: - logger.error(f"Error processing {file}: {e!s}") - return {"file": os.path.basename(file), "error": str(e)} - - -def fit_surface_model( - df, - classification, - 
surface_type, - model_func, - p0, - bounds, - x_col="solar_elevation", - y_col="solar_background_rate", -): - """Fit a model to a specific classification and surface type.""" - subset = df[ - (df["classification"] == classification) & (df["surface_type"] == surface_type) - ].copy() - if len(subset) < 3: - print(f"Warning: Insufficient data for {classification} {surface_type} model.") - return None, None, None - - mask = ( - subset[x_col].notna() - & subset[y_col].notna() - & np.isfinite(subset[x_col]) - & np.isfinite(subset[y_col]) - ) - valid_x = subset.loc[mask, x_col] - valid_y = subset.loc[mask, y_col] - - if len(valid_x) < 3: - print( - f"Warning: Fewer than 3 valid points for {classification} {surface_type} model." - ) - return None, None, None - - try: - popt, _ = curve_fit( - model_func, valid_x, valid_y, p0=p0, bounds=bounds, maxfev=20000 - ) - y_pred = model_func(valid_x, *popt) - r2 = 1 - np.sum((valid_y - y_pred) ** 2) / np.sum( - (valid_y - np.mean(valid_y)) ** 2 - ) - return popt, r2, lambda x: model_func(x, *popt) - except RuntimeError as e: - print(f"Fit failed for {classification} {surface_type}: {e}") - return None, None, None - - -def load_and_process_files(atl03_directory, beam, shoreline_data_path): - """Load and process ATL03 files in parallel.""" - atl03_files = [ - os.path.join(atl03_directory, f) - for f in os.listdir(atl03_directory) - if f.endswith(".h5") and "ATL03" in f - ] - print(f"Found {len(atl03_files)} ATL03 files.") - - if not atl03_files: - raise ValueError("No ATL03 files found in the directory.") - - with Pool() as pool: - results = pool.map( - process_file, [(f, beam, shoreline_data_path) for f in atl03_files] - ) - - print("Results from process_file:", results) - df = pd.DataFrame(results) - - if "error" in df.columns: - error_files = df[df["error"].notna()]["file"].tolist() - print(f"Errors in {len(error_files)} files: {error_files}") - df = df[df["error"].isna()].drop(columns=["error"]) - - if df.empty: - raise ValueError( 
- "No files were successfully processed. Check the error messages above." - ) - - print("DataFrame columns after processing:", df.columns) - return df - - -def assign_surface_type(df): - """Assign surface type based on background rates.""" - required_columns = ["total_background_rate_land", "total_background_rate_ocean"] - missing_columns = [col for col in required_columns if col not in df.columns] - if missing_columns: - raise KeyError(f"Missing required columns in DataFrame: {missing_columns}") - - df["surface_type"] = np.where( - df["total_background_rate_land"].notna() - & (df["total_background_rate_land"] > 0), - "Land", - np.where( - df["total_background_rate_ocean"].notna() - & (df["total_background_rate_ocean"] > 0), - "Ocean", - "Mixed", - ), - ) - return df - - -def classify_data(df, twilight_threshold=6.0): - """Classify data into Daytime, Nighttime, Twilight, or Unknown.""" - df["classification"] = np.where( - df["solar_elevation"].isna(), - "Unknown", - np.where( - df["solar_elevation"] > twilight_threshold, - "Daytime", - np.where( - df["solar_elevation"] < -twilight_threshold, "Nighttime", "Twilight" - ), - ), - ) - daytime_df = df[df["classification"] == "Daytime"].copy() - nighttime_df = df[df["classification"] == "Nighttime"].copy() - twilight_df = df[df["classification"] == "Twilight"].copy() - unknown_df = df[df["classification"] == "Unknown"].copy() - print( - f"Daytime: {len(daytime_df)}, Nighttime: {len(nighttime_df)}, Twilight: {len(twilight_df)}, Unknown: {len(unknown_df)}" - ) - return df, daytime_df, nighttime_df, twilight_df, unknown_df - - -def calculate_non_solar_rate(nighttime_df): - """Calculate the non-solar background rate from nighttime data.""" - non_solar_rate_land = np.nanmean(nighttime_df["total_background_rate_land"]) - non_solar_rate_ocean = np.nanmean(nighttime_df["total_background_rate_ocean"]) - non_solar_rate = np.nanmean([non_solar_rate_land, non_solar_rate_ocean]) - print(f"Non-solar background rate (land): 
{non_solar_rate_land:.2e} ph/s") - print(f"Non-solar background rate (ocean): {non_solar_rate_ocean:.2e} ph/s") - print(f"Non-solar background rate (combined): {non_solar_rate:.2e} ph/s") - return non_solar_rate - - -def compute_solar_background_rates(df, non_solar_rate): - """Compute solar background rates for land and ocean.""" - df["solar_background_rate_land"] = ( - df["total_background_rate_land"] - non_solar_rate - ).clip(lower=0) - df["solar_background_rate_ocean"] = ( - df["total_background_rate_ocean"] - non_solar_rate - ).clip(lower=0) - df["solar_background_rate"] = np.where( - df["surface_type"] == "Land", - df["solar_background_rate_land"], - np.where( - df["surface_type"] == "Ocean", df["solar_background_rate_ocean"], np.nan - ), - ) - return df - - -def fit_daytime_model(df, daytime_df, non_solar_rate): - """Fit daytime models for land and ocean.""" - daytime_models = {} - for surface_type in ["Land", "Ocean"]: - subset = daytime_df[daytime_df["surface_type"] == surface_type].copy() - if len(subset) < 3: - print(f"Warning: Insufficient daytime {surface_type} data for modeling.") - continue - - mask = ( - subset["solar_elevation"].notna() - & subset["solar_background_rate"].notna() - & np.isfinite(subset["solar_elevation"]) - & np.isfinite(subset["solar_background_rate"]) - ) - valid_x = subset.loc[mask, "solar_elevation"] - valid_y = subset.loc[mask, "solar_background_rate"] - - if len(valid_x) < 3: - print( - f"Warning: Fewer than 3 valid points for daytime {surface_type} model." 
- ) - continue - - # Remove outliers - log_y = np.log10(valid_y + 1) - mean_log_y = np.mean(log_y) - std_log_y = np.std(log_y) - outlier_mask = (log_y > mean_log_y - 3 * std_log_y) & ( - log_y < mean_log_y + 3 * std_log_y - ) - valid_x = valid_x[outlier_mask] - valid_y = valid_y[outlier_mask] - - print(f"Daytime regression ({surface_type}): {len(valid_x)} valid data points") - try: - p0_exp = [np.max(valid_y) - non_solar_rate, 0.05, non_solar_rate] - bounds_exp = ( - [0, 0.01, non_solar_rate * 0.9], - [1e8, 0.5, non_solar_rate * 1.1], - ) - popt_exp, _ = curve_fit( - exp_model, valid_x, valid_y, p0=p0_exp, bounds=bounds_exp, maxfev=20000 - ) - r2_exp = 1 - np.sum( - (valid_y - exp_model(valid_x, *popt_exp)) ** 2 - ) / np.sum((valid_y - np.mean(valid_y)) ** 2) - - x_shifted = valid_x - min(valid_x) + 1 - p0_power = [np.max(valid_y) - non_solar_rate, 0.6, non_solar_rate] - bounds_power = ( - [0, 0.1, non_solar_rate * 0.9], - [1e8, 1.0, non_solar_rate * 1.1], - ) - popt_power, _ = curve_fit( - power_model, - x_shifted, - valid_y, - p0=p0_power, - bounds=bounds_power, - maxfev=20000, - ) - r2_power = 1 - np.sum( - (valid_y - power_model(x_shifted, *popt_power)) ** 2 - ) / np.sum((valid_y - np.mean(valid_y)) ** 2) - - if r2_power > r2_exp: - print( - f"Daytime model ({surface_type}, power law): rate = {popt_power[0]:.2e} * solar_elevation^{popt_power[1]:.2f} + {popt_power[2]:.2e}, R² = {r2_power:.3f}" - ) - daytime_models[surface_type] = { - "params": popt_power, - "r2": r2_power, - "model": lambda x: power_model(x - min(valid_x) + 1, *popt_power), - } - else: - print( - f"Daytime model ({surface_type}, exponential): rate = {popt_exp[0]:.2e} * exp({popt_exp[1]:.2f} * solar_elevation) + {popt_exp[2]:.2e}, R² = {r2_exp:.3f}" - ) - daytime_models[surface_type] = { - "params": popt_exp, - "r2": r2_exp, - "model": lambda x: exp_model(x, *popt_exp), - } - - # Apply model - daytime_mask = (df["classification"] == "Daytime") & ( - df["surface_type"] == surface_type - ) - 
valid_daytime_mask = ( - daytime_mask - & df["solar_elevation"].notna() - & np.isfinite(df["solar_elevation"]) - ) - x_for_model = df.loc[valid_daytime_mask, "solar_elevation"] - if r2_power > r2_exp: - x_for_model_shifted = x_for_model - min(valid_x) + 1 - df.loc[valid_daytime_mask, "solar_background_rate"] = daytime_models[ - surface_type - ]["model"](x_for_model_shifted).clip(lower=0) - else: - df.loc[valid_daytime_mask, "solar_background_rate"] = daytime_models[ - surface_type - ]["model"](x_for_model).clip(lower=0) - df.loc[daytime_mask & ~valid_daytime_mask, "solar_background_rate"] = ( - non_solar_rate - ) - except RuntimeError as e: - print( - f"Daytime model fitting failed for {surface_type}: {e}. Using non-solar rate as fallback." - ) - df.loc[ - (df["classification"] == "Daytime") - & (df["surface_type"] == surface_type), - "solar_background_rate", - ] = non_solar_rate - return df, daytime_models - - -def fit_twilight_model(df, twilight_df, non_solar_rate): - """Fit twilight models for land and ocean.""" - twilight_models = {} - if len(twilight_df) > 3: - df.loc[df["classification"] == "Twilight", "solar_background_rate"] = ( - df["solar_background_rate"].fillna(0).clip(lower=0) - ) - for surface_type in ["Land", "Ocean"]: - subset = twilight_df[twilight_df["surface_type"] == surface_type].copy() - if len(subset) < 3: - print( - f"Warning: Insufficient twilight {surface_type} data for modeling." - ) - continue - - mask = ( - subset["solar_elevation"].notna() - & subset["solar_background_rate"].notna() - & np.isfinite(subset["solar_elevation"]) - & np.isfinite(subset["solar_background_rate"]) - ) - valid_twilight_x = subset.loc[mask, "solar_elevation"] - valid_twilight_y = subset.loc[mask, "solar_background_rate"] - - if len(valid_twilight_x) < 3: - print( - f"Warning: Fewer than 3 valid points for twilight {surface_type} model." 
- ) - continue - - print( - f"Twilight regression ({surface_type}): {len(valid_twilight_x)} valid data points" - ) - try: - initial_a = np.max(valid_twilight_y) - non_solar_rate - initial_b = 0.01 - initial_c = non_solar_rate - p0 = [initial_a, initial_b, initial_c] - bounds = ( - [0, 0.001, non_solar_rate * 0.9], - [1e6, 1.0, non_solar_rate * 1.1], - ) - - popt, _ = curve_fit( - twilight_model, - valid_twilight_x, - valid_twilight_y, - p0=p0, - bounds=bounds, - maxfev=50000, - ) - y_pred = twilight_model(valid_twilight_x, *popt) - r_squared = 1 - np.sum((valid_twilight_y - y_pred) ** 2) / np.sum( - (valid_twilight_y - np.mean(valid_twilight_y)) ** 2 - ) - - if r_squared < 0: - mean_rate = np.mean(valid_twilight_y) - twilight_models[surface_type] = { - "params": [mean_rate], - "r2": 0.0, - "model": lambda x: np.full_like(x, mean_rate), - } - print( - f"Twilight model ({surface_type}) failed with R² = {r_squared:.3f}. Using constant model: {mean_rate:.2e} ph/s" - ) - - twilight_mask = (df["classification"] == "Twilight") & ( - df["surface_type"] == surface_type - ) - valid_twilight_mask = ( - twilight_mask - & df["solar_elevation"].notna() - & np.isfinite(df["solar_elevation"]) - ) - df.loc[valid_twilight_mask, "solar_background_rate"] = mean_rate - df.loc[ - twilight_mask & ~valid_twilight_mask, "solar_background_rate" - ] = non_solar_rate - else: - print( - f"Twilight model ({surface_type}): rate = {popt[0]:.2e} * exp(-{popt[1]:.2f} * solar_elevation^2) + {popt[2]:.2e}, R² = {r_squared:.3f}" - ) - twilight_models[surface_type] = { - "params": popt, - "r2": r_squared, - "model": lambda x: twilight_model(x, *popt), - } - - twilight_mask = (df["classification"] == "Twilight") & ( - df["surface_type"] == surface_type - ) - valid_twilight_mask = ( - twilight_mask - & df["solar_elevation"].notna() - & np.isfinite(df["solar_elevation"]) - ) - x_for_model = df.loc[valid_twilight_mask, "solar_elevation"] - df.loc[valid_twilight_mask, "solar_background_rate"] = ( - 
twilight_models[ - surface_type - ]["model"](x_for_model).clip(lower=0) - ) - df.loc[ - twilight_mask & ~valid_twilight_mask, "solar_background_rate" - ] = non_solar_rate - except RuntimeError as e: - print( - f"Twilight model fitting failed for {surface_type}: {e}. Setting twilight rates to non-solar rate." - ) - df.loc[ - (df["classification"] == "Twilight") - & (df["surface_type"] == surface_type), - "solar_background_rate", - ] = non_solar_rate - else: - print( - "Too few twilight points for modeling. Setting twilight rates to non-solar rate." - ) - df.loc[df["classification"] == "Twilight", "solar_background_rate"] = ( - non_solar_rate - ) - return df, twilight_models - - -def create_diagnostic_plot( - df, - daytime_df, - twilight_df, - daytime_models, - twilight_models, - non_solar_rate, - output_dir, -): - """Create diagnostic plot of background rate vs. solar elevation.""" - plt.figure(figsize=(10, 6)) - plt.scatter( - df["solar_elevation"], - df["total_background_rate"], - c="gray", - alpha=0.5, - label="Total", - ) - plt.scatter( - daytime_df[daytime_df["surface_type"] == "Land"]["solar_elevation"], - daytime_df[daytime_df["surface_type"] == "Land"]["solar_background_rate"], - c="blue", - alpha=0.5, - label="Daytime Land", - ) - plt.scatter( - daytime_df[daytime_df["surface_type"] == "Ocean"]["solar_elevation"], - daytime_df[daytime_df["surface_type"] == "Ocean"]["solar_background_rate"], - c="cyan", - alpha=0.5, - label="Daytime Ocean", - ) - plt.scatter( - twilight_df[twilight_df["surface_type"] == "Land"]["solar_elevation"], - twilight_df[twilight_df["surface_type"] == "Land"]["solar_background_rate"], - c="orange", - alpha=0.5, - label="Twilight Land", - ) - plt.scatter( - twilight_df[twilight_df["surface_type"] == "Ocean"]["solar_elevation"], - twilight_df[twilight_df["surface_type"] == "Ocean"]["solar_background_rate"], - c="yellow", - alpha=0.5, - label="Twilight Ocean", - ) - plt.axhline( - non_solar_rate, - color="r", - linestyle="--", - 
label=f"Non-Solar: {non_solar_rate:.2e} ph/s", - ) - - # Plot daytime fits - for surface_type in daytime_models: - if daytime_models[surface_type]["model"] is not None: - x_fit = np.linspace( - min(daytime_df["solar_elevation"]), - max(daytime_df["solar_elevation"]), - 100, - ) - y_fit = daytime_models[surface_type]["model"]( - x_fit - if "power" not in daytime_models[surface_type] - else x_fit - min(daytime_df["solar_elevation"]) + 1 - ) - plt.plot( - x_fit, - y_fit, - label=f'Daytime {surface_type} Fit (R²={daytime_models[surface_type]["r2"]:.3f})', - linestyle="-" if surface_type == "Land" else "--", - ) - - # Plot twilight fits - for surface_type in twilight_models: - if twilight_models[surface_type]["model"] is not None: - x_twilight = np.linspace( - min(twilight_df["solar_elevation"]), - max(twilight_df["solar_elevation"]), - 100, - ) - y_twilight = twilight_models[surface_type]["model"](x_twilight) - plt.plot( - x_twilight, - y_twilight, - label=f'Twilight {surface_type} Fit (R²={twilight_models[surface_type]["r2"]:.3f})', - linestyle="-" if surface_type == "Land" else "--", - ) - - plt.yscale("log") - plt.ylim(1e3, 1e8) - plt.xlabel("Solar Elevation (degrees)") - plt.ylabel("Background Rate (ph/s)") - plt.legend() - plt.title("Background Rate vs. Solar Elevation (Land vs. 
Ocean)") - plt.savefig(os.path.join(output_dir, "background_vs_elevation_exp_fit_2021.png")) - plt.close() - - -def create_calibration_plot(df, daytime_df, output_dir): - """Create a calibration plot using mean photon height as a proxy.""" - if not daytime_df.empty: - sample_file = daytime_df.iloc[0] - heights = sample_file["mean_photon_height"] - if np.isfinite(heights): - solar_rate = sample_file["solar_background_rate"] - bin_size = 0.1 - hist, edges = np.histogram( - np.array([heights] * 1000), - bins=np.arange(min(heights) - 1, max(heights) + 1, bin_size), - ) - photons_per_bin = ( - solar_rate - * (1000 / np.sum(hist)) - * bin_size - / (max(heights) - min(heights) + 2) - ) - calibrated_counts = np.maximum(hist - photons_per_bin, 0) - calibrated_heights = [] - for i, count in enumerate(calibrated_counts): - if count > 0: - calibrated_heights.extend( - np.random.uniform(edges[i], edges[i + 1], int(count)) - ) - - plt.figure(figsize=(8, 6)) - plt.hist( - [heights] * 1000, bins=100, alpha=0.5, label="Original (Simulated)" - ) - plt.hist(calibrated_heights, bins=100, alpha=0.5, label="Calibrated") - plt.xlabel("Photon Height (m)") - plt.ylabel("Count") - plt.legend() - plt.title(f'Calibration Example: {sample_file["file"]}') - plt.savefig(os.path.join(output_dir, "calibration_example.png")) - plt.close() - - -def save_results(df, output_dir): - """Save the results to a CSV file.""" - df.to_csv(os.path.join(output_dir, "atl03_analysis_updated.csv"), index=False) - - -# New function to calculate averages -def calculate_background_averages(daytime_df, nighttime_df): - """Calculate average background rates for daytime and nighttime, land and ocean.""" - # Calculate averages, ignoring NaN values - daytime_land_avg = np.nanmean(daytime_df["total_background_rate_land"]) - daytime_ocean_avg = np.nanmean(daytime_df["total_background_rate_ocean"]) - nighttime_land_avg = np.nanmean(nighttime_df["total_background_rate_land"]) - nighttime_ocean_avg = 
np.nanmean(nighttime_df["total_background_rate_ocean"]) - - # Log the averages - logger.info(f"Average Daytime Land Background Rate: {daytime_land_avg:.2e} ph/s") - logger.info(f"Average Daytime Ocean Background Rate: {daytime_ocean_avg:.2e} ph/s") - logger.info( - f"Average Nighttime Land Background Rate: {nighttime_land_avg:.2e} ph/s" - ) - logger.info( - f"Average Nighttime Ocean Background Rate: {nighttime_ocean_avg:.2e} ph/s" - ) - - return { - "daytime_land": daytime_land_avg, - "daytime_ocean": daytime_ocean_avg, - "nighttime_land": nighttime_land_avg, - "nighttime_ocean": nighttime_ocean_avg, - } - - -# New function to plot averages -def plot_background_averages(averages, output_dir): - """Plot the average background rates for daytime and nighttime, land and ocean.""" - labels = ["Daytime Land", "Daytime Ocean", "Nighttime Land", "Nighttime Ocean"] - rates = [ - averages["daytime_land"] if not pd.isna(averages["daytime_land"]) else 0, - averages["daytime_ocean"] if not pd.isna(averages["daytime_ocean"]) else 0, - averages["nighttime_land"] if not pd.isna(averages["nighttime_land"]) else 0, - averages["nighttime_ocean"] if not pd.isna(averages["nighttime_ocean"]) else 0, - ] - - plt.figure(figsize=(8, 6)) - bars = plt.bar(labels, rates, color=["blue", "cyan", "purple", "lightgreen"]) - - plt.ylabel("Average Background Rate (ph/s)") - plt.title( - "Average Background Rate Comparison\n(Daytime vs. Nighttime, Land vs. 
Ocean)" - ) - plt.yscale("log") - plt.ylim(1e3, 1e8) - - for bar in bars: - yval = bar.get_height() - plt.text( - bar.get_x() + bar.get_width() / 2, - yval, - f"{yval:.2e}", - ha="center", - va="bottom", - ) - - plt.savefig(os.path.join(output_dir, "average_background_rate_comparison.png")) - plt.close() - logger.info("Average background rate comparison plot generated.") - - -def analyze_icesat2_data( - atl03_directory, - beam="gt1l", - twilight_threshold=6.0, - output_dir="output", - shoreline_data_path=None, -): - """Main analysis function orchestrating the workflow.""" - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - df = load_and_process_files(atl03_directory, beam, shoreline_data_path) - if df.empty: - raise ValueError("No data processed successfully. Check logs for errors.") - - df = assign_surface_type(df) - - # Compute total_background_rate based on surface_type - df["total_background_rate"] = np.where( - df["surface_type"] == "Land", - df["total_background_rate_land"], - np.where( - df["surface_type"] == "Ocean", df["total_background_rate_ocean"], np.nan - ), - ) - - df, daytime_df, nighttime_df, twilight_df, unknown_df = classify_data( - df, twilight_threshold - ) - - averages = calculate_background_averages(daytime_df, nighttime_df) - plot_background_averages(averages, output_dir) - - -if __name__ == "__main__": - args = get_args() - atl03_directory = os.path.join(args.workspace_path, args.atl03_path) - shoreline_data_path = os.path.join(args.workspace_path, args.shoreline_data) - output_dir = os.path.join(args.workspace_path, args.output_path) - analyze_icesat2_data( - atl03_directory, - args.beam, - output_dir=output_dir, - shoreline_data_path=shoreline_data_path, - ) - print("Analysis complete.") diff --git a/aok/core/kd_utils/SolarBckgrd/analyze_nighttime_BG_rate_icesat2.py b/aok/core/kd_utils/SolarBckgrd/analyze_nighttime_BG_rate_icesat2.py deleted file mode 100644 index 48dfa66..0000000 --- 
a/aok/core/kd_utils/SolarBckgrd/analyze_nighttime_BG_rate_icesat2.py +++ /dev/null @@ -1,820 +0,0 @@ -import logging -from multiprocessing import Pool -import os - -from config import get_args -import geopandas as gpd -import h5py -import matplotlib.pyplot as plt -from noise_utils.data_processing import isolate_sea_land_photons -import numpy as np -import pandas as pd -from scipy.interpolate import interp1d -from scipy.optimize import curve_fit - -# Set up logging -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -# Define model functions -def power_model(x, a, b, c): - return a * np.power(x, b) + c - - -def exp_model(x, a, b, c): - """Exponential model: a * exp(b * x) + c""" - return a * np.exp(b * x) + c - - -def twilight_model(x, a, b, c): - return a * np.exp(-b * x**2) + c # Gaussian form for twilight - - -def process_file(args): - """Process a single ATL03 file, skipping files missing required datasets.""" - file, beam, shoreline_data_path = args - try: - with h5py.File(file, "r") as f: - # Check for required datasets - required_keys = [ - f"{beam}/bckgrd_atlas/bckgrd_rate", - f"{beam}/bckgrd_atlas/delta_time", - f"{beam}/heights/delta_time", - f"{beam}/heights/h_ph", - f"{beam}/geolocation/reference_photon_lat", - f"{beam}/geolocation/reference_photon_lon", - f"{beam}/geolocation/solar_azimuth", - f"{beam}/geolocation/solar_elevation", - ] - for key in required_keys: - if key not in f: - raise KeyError(f"Missing dataset: {key}") - - bckgrd_rate = f[f"{beam}/bckgrd_atlas/bckgrd_rate"][:].astype(np.float64) - bckgrd_time = f[f"{beam}/bckgrd_atlas/delta_time"][:] - heights = f[f"{beam}/heights/h_ph"][:] - segment_lat = f[f"{beam}/geolocation/reference_photon_lat"][:] - segment_lon = f[f"{beam}/geolocation/reference_photon_lon"][:] - segment_delta_time = f[f"{beam}/geolocation/delta_time"][:] - - Segment_Is_Land = pd.DataFrame( - { - "latitude": segment_lat, # Use unique column names to avoid conflicts - "longitude": segment_lon, 
- "delta_time": segment_delta_time, - } - ) - - # Create a GeoDataFrame with geometry - Segment_Is_Land["geometry"] = gpd.points_from_xy( - Segment_Is_Land["longitude"], Segment_Is_Land["latitude"] - ) - ICESat2_GDF = gpd.GeoDataFrame(Segment_Is_Land, crs="EPSG:4326") - - # Determine land/ocean classification - Segment_Is_Land_Labels = isolate_sea_land_photons( - shoreline_data_path, ICESat2_GDF - ) - ICESat2_GDF["is_land"] = Segment_Is_Land_Labels - - # Interpolate is_land labels to bckgrd_time - interp_is_land = interp1d( - ICESat2_GDF["delta_time"], - ICESat2_GDF["is_land"].astype(int), - bounds_error=False, - fill_value=( - ICESat2_GDF["is_land"].astype(int).iloc[0], - ICESat2_GDF["is_land"].astype(int).iloc[-1], - ), - ) - is_land_interpolated = interp_is_land(bckgrd_time).round().astype(int) - - # ##########Debug################################# - # # Interpolate latitude and longitude to bckgrd_time - # interp_lat = interp1d( - # ICESat2_GDF['delta_time'], - # ICESat2_GDF['latitude'], - # bounds_error=False, - # fill_value=(ICESat2_GDF['latitude'].iloc[0], ICESat2_GDF['latitude'].iloc[-1]) - # ) - # interp_lon = interp1d( - # ICESat2_GDF['delta_time'], - # ICESat2_GDF['longitude'], - # bounds_error=False, - # fill_value=(ICESat2_GDF['longitude'].iloc[0], ICESat2_GDF['longitude'].iloc[-1]) - # ) - # interpolated_lat = interp_lat(bckgrd_time) - # interpolated_lon = interp_lon(bckgrd_time) - - # # Create a GeoDataFrame for is_land_interpolated with lat/lon - # is_land_df = pd.DataFrame({ - # 'delta_time': bckgrd_time, - # 'latitude': interpolated_lat, - # 'longitude': interpolated_lon, - # 'is_land': is_land_interpolated - # }) - # is_land_gdf = gpd.GeoDataFrame( - # is_land_df, - # geometry=gpd.points_from_xy(is_land_df['longitude'], is_land_df['latitude']), - # crs="EPSG:4326" - # ) - - # # Save to a GeoPackage file - # file_dir = os.path.dirname(file) - # is_land_output_dir = os.path.join(file_dir, "is_land_interpolated") - # if not 
os.path.exists(is_land_output_dir): - # os.makedirs(is_land_output_dir) - # output_filename = os.path.join(is_land_output_dir, f"{os.path.basename(file)}_is_land.gpkg") - # is_land_gdf.to_file(output_filename, driver="GPKG") - # logger.info(f"Saved is_land_interpolated to {output_filename}") - ############################################## - - # Compute total background rates for land and ocean - land_mask = is_land_interpolated == 1 - ocean_mask = is_land_interpolated == 0 - total_background_rate_land = ( - np.mean(bckgrd_rate[land_mask]) if np.sum(land_mask) > 0 else np.nan - ) - total_background_rate_ocean = ( - np.mean(bckgrd_rate[ocean_mask]) if np.sum(ocean_mask) > 0 else np.nan - ) - - # Solar elevation handling - FILL_VALUE = 3.4028235e38 - solar_elevations = f[f"{beam}/geolocation/solar_elevation"][:].astype( - np.float64 - ) - valid_solar = solar_elevations[ - (solar_elevations != FILL_VALUE) - & (np.abs(solar_elevations) < 1e30) - & np.isfinite(solar_elevations) - ] - solar_elevation = np.nan if valid_solar.size == 0 else np.mean(valid_solar) - - mean_photon_height = np.mean(heights) if heights.size > 0 else np.nan - - result = { - "file": os.path.basename(file), - "total_background_rate_land": total_background_rate_land, - "total_background_rate_ocean": total_background_rate_ocean, - "solar_elevation": solar_elevation, - "mean_photon_height": mean_photon_height, - } - logger.info(f"Processed {file}: {result}") - return result - except Exception as e: - logger.error(f"Error processing {file}: {e!s}") - return {"file": os.path.basename(file), "error": str(e)} - - -def fit_surface_model( - df, - classification, - surface_type, - model_func, - p0, - bounds, - x_col="solar_elevation", - y_col="solar_background_rate", -): - """Fit a model to a specific classification and surface type.""" - subset = df[ - (df["classification"] == classification) & (df["surface_type"] == surface_type) - ].copy() - if len(subset) < 3: - print(f"Warning: Insufficient data for 
{classification} {surface_type} model.") - return None, None, None - - mask = ( - subset[x_col].notna() - & subset[y_col].notna() - & np.isfinite(subset[x_col]) - & np.isfinite(subset[y_col]) - ) - valid_x = subset.loc[mask, x_col] - valid_y = subset.loc[mask, y_col] - - if len(valid_x) < 3: - print( - f"Warning: Fewer than 3 valid points for {classification} {surface_type} model." - ) - return None, None, None - - try: - popt, _ = curve_fit( - model_func, valid_x, valid_y, p0=p0, bounds=bounds, maxfev=20000 - ) - y_pred = model_func(valid_x, *popt) - r2 = 1 - np.sum((valid_y - y_pred) ** 2) / np.sum( - (valid_y - np.mean(valid_y)) ** 2 - ) - return popt, r2, lambda x: model_func(x, *popt) - except RuntimeError as e: - print(f"Fit failed for {classification} {surface_type}: {e}") - return None, None, None - - -def load_and_process_files(atl03_directory, beam, shoreline_data_path): - """Load and process ATL03 files in parallel.""" - atl03_files = [ - os.path.join(atl03_directory, f) - for f in os.listdir(atl03_directory) - if f.endswith(".h5") and "ATL03" in f - ] - print(f"Found {len(atl03_files)} ATL03 files.") - - if not atl03_files: - raise ValueError("No ATL03 files found in the directory.") - - with Pool() as pool: - results = pool.map( - process_file, [(f, beam, shoreline_data_path) for f in atl03_files] - ) - - print("Results from process_file:", results) # Debug output - df = pd.DataFrame(results) - - if "error" in df.columns: - error_files = df[df["error"].notna()]["file"].tolist() - print(f"Errors in {len(error_files)} files: {error_files}") - df = df[df["error"].isna()].drop(columns=["error"]) - - if df.empty: - raise ValueError( - "No files were successfully processed. Check the error messages above." 
- ) - - print("DataFrame columns after processing:", df.columns) - return df - - -def assign_surface_type(df): - """Assign surface type based on background rates.""" - required_columns = ["total_background_rate_land", "total_background_rate_ocean"] - missing_columns = [col for col in required_columns if col not in df.columns] - if missing_columns: - raise KeyError(f"Missing required columns in DataFrame: {missing_columns}") - - df["surface_type"] = np.where( - df["total_background_rate_land"].notna() - & (df["total_background_rate_land"] > 0), - "Land", - np.where( - df["total_background_rate_ocean"].notna() - & (df["total_background_rate_ocean"] > 0), - "Ocean", - "Mixed", - ), - ) - return df - - -def classify_data(df, twilight_threshold=6.0): - """Classify data into Daytime, Nighttime, Twilight, or Unknown.""" - df["classification"] = np.where( - df["solar_elevation"].isna(), - "Unknown", - np.where( - df["solar_elevation"] > twilight_threshold, - "Daytime", - np.where( - df["solar_elevation"] < -twilight_threshold, "Nighttime", "Twilight" - ), - ), - ) - daytime_df = df[df["classification"] == "Daytime"].copy() - nighttime_df = df[df["classification"] == "Nighttime"].copy() - twilight_df = df[df["classification"] == "Twilight"].copy() - unknown_df = df[df["classification"] == "Unknown"].copy() - print( - f"Daytime: {len(daytime_df)}, Nighttime: {len(nighttime_df)}, Twilight: {len(twilight_df)}, Unknown: {len(unknown_df)}" - ) - return df, daytime_df, nighttime_df, twilight_df, unknown_df - - -def calculate_non_solar_rate(nighttime_df): - """Calculate the non-solar background rate from nighttime data.""" - non_solar_rate_land = np.nanmean(nighttime_df["total_background_rate_land"]) - non_solar_rate_ocean = np.nanmean(nighttime_df["total_background_rate_ocean"]) - non_solar_rate = np.nanmean([non_solar_rate_land, non_solar_rate_ocean]) - print(f"Non-solar background rate (land): {non_solar_rate_land:.2e} ph/s") - print(f"Non-solar background rate (ocean): 
{non_solar_rate_ocean:.2e} ph/s") - print(f"Non-solar background rate (combined): {non_solar_rate:.2e} ph/s") - return non_solar_rate - - -def compute_solar_background_rates(df, non_solar_rate): - """Compute solar background rates for land and ocean.""" - df["solar_background_rate_land"] = ( - df["total_background_rate_land"] - non_solar_rate - ).clip(lower=0) - df["solar_background_rate_ocean"] = ( - df["total_background_rate_ocean"] - non_solar_rate - ).clip(lower=0) - df["solar_background_rate"] = np.where( - df["surface_type"] == "Land", - df["solar_background_rate_land"], - np.where( - df["surface_type"] == "Ocean", df["solar_background_rate_ocean"], np.nan - ), - ) - return df - - -def fit_daytime_model(df, daytime_df, non_solar_rate): - """Fit daytime models for land and ocean.""" - daytime_models = {} - for surface_type in ["Land", "Ocean"]: - subset = daytime_df[daytime_df["surface_type"] == surface_type].copy() - if len(subset) < 3: - print(f"Warning: Insufficient daytime {surface_type} data for modeling.") - continue - - mask = ( - subset["solar_elevation"].notna() - & subset["solar_background_rate"].notna() - & np.isfinite(subset["solar_elevation"]) - & np.isfinite(subset["solar_background_rate"]) - ) - valid_x = subset.loc[mask, "solar_elevation"] - valid_y = subset.loc[mask, "solar_background_rate"] - - if len(valid_x) < 3: - print( - f"Warning: Fewer than 3 valid points for daytime {surface_type} model." 
- ) - continue - - # Remove outliers - log_y = np.log10(valid_y + 1) - mean_log_y = np.mean(log_y) - std_log_y = np.std(log_y) - outlier_mask = (log_y > mean_log_y - 3 * std_log_y) & ( - log_y < mean_log_y + 3 * std_log_y - ) - valid_x = valid_x[outlier_mask] - valid_y = valid_y[outlier_mask] - - print(f"Daytime regression ({surface_type}): {len(valid_x)} valid data points") - try: - p0_exp = [np.max(valid_y) - non_solar_rate, 0.05, non_solar_rate] - bounds_exp = ( - [0, 0.01, non_solar_rate * 0.9], - [1e8, 0.5, non_solar_rate * 1.1], - ) - popt_exp, _ = curve_fit( - exp_model, valid_x, valid_y, p0=p0_exp, bounds=bounds_exp, maxfev=20000 - ) - r2_exp = 1 - np.sum( - (valid_y - exp_model(valid_x, *popt_exp)) ** 2 - ) / np.sum((valid_y - np.mean(valid_y)) ** 2) - - x_shifted = valid_x - min(valid_x) + 1 - p0_power = [np.max(valid_y) - non_solar_rate, 0.6, non_solar_rate] - bounds_power = ( - [0, 0.1, non_solar_rate * 0.9], - [1e8, 1.0, non_solar_rate * 1.1], - ) - popt_power, _ = curve_fit( - power_model, - x_shifted, - valid_y, - p0=p0_power, - bounds=bounds_power, - maxfev=20000, - ) - r2_power = 1 - np.sum( - (valid_y - power_model(x_shifted, *popt_power)) ** 2 - ) / np.sum((valid_y - np.mean(valid_y)) ** 2) - - if r2_power > r2_exp: - print( - f"Daytime model ({surface_type}, power law): rate = {popt_power[0]:.2e} * solar_elevation^{popt_power[1]:.2f} + {popt_power[2]:.2e}, R² = {r2_power:.3f}" - ) - daytime_models[surface_type] = { - "params": popt_power, - "r2": r2_power, - "model": lambda x: power_model(x - min(valid_x) + 1, *popt_power), - } - else: - print( - f"Daytime model ({surface_type}, exponential): rate = {popt_exp[0]:.2e} * exp({popt_exp[1]:.2f} * solar_elevation) + {popt_exp[2]:.2e}, R² = {r2_exp:.3f}" - ) - daytime_models[surface_type] = { - "params": popt_exp, - "r2": r2_exp, - "model": lambda x: exp_model(x, *popt_exp), - } - - # Apply model - daytime_mask = (df["classification"] == "Daytime") & ( - df["surface_type"] == surface_type - ) - 
valid_daytime_mask = ( - daytime_mask - & df["solar_elevation"].notna() - & np.isfinite(df["solar_elevation"]) - ) - x_for_model = df.loc[valid_daytime_mask, "solar_elevation"] - if r2_power > r2_exp: - x_for_model_shifted = x_for_model - min(valid_x) + 1 - df.loc[valid_daytime_mask, "solar_background_rate"] = daytime_models[ - surface_type - ]["model"](x_for_model_shifted).clip(lower=0) - else: - df.loc[valid_daytime_mask, "solar_background_rate"] = daytime_models[ - surface_type - ]["model"](x_for_model).clip(lower=0) - df.loc[daytime_mask & ~valid_daytime_mask, "solar_background_rate"] = ( - non_solar_rate - ) - except RuntimeError as e: - print( - f"Daytime model fitting failed for {surface_type}: {e}. Using non-solar rate as fallback." - ) - df.loc[ - (df["classification"] == "Daytime") - & (df["surface_type"] == surface_type), - "solar_background_rate", - ] = non_solar_rate - return df, daytime_models - - -def fit_twilight_model(df, twilight_df, non_solar_rate): - """Fit twilight models for land and ocean.""" - twilight_models = {} - if len(twilight_df) > 3: - df.loc[df["classification"] == "Twilight", "solar_background_rate"] = ( - df["solar_background_rate"].fillna(0).clip(lower=0) - ) - for surface_type in ["Land", "Ocean"]: - subset = twilight_df[twilight_df["surface_type"] == surface_type].copy() - if len(subset) < 3: - print( - f"Warning: Insufficient twilight {surface_type} data for modeling." - ) - continue - - mask = ( - subset["solar_elevation"].notna() - & subset["solar_background_rate"].notna() - & np.isfinite(subset["solar_elevation"]) - & np.isfinite(subset["solar_background_rate"]) - ) - valid_twilight_x = subset.loc[mask, "solar_elevation"] - valid_twilight_y = subset.loc[mask, "solar_background_rate"] - - if len(valid_twilight_x) < 3: - print( - f"Warning: Fewer than 3 valid points for twilight {surface_type} model." 
- ) - continue - - print( - f"Twilight regression ({surface_type}): {len(valid_twilight_x)} valid data points" - ) - try: - initial_a = np.max(valid_twilight_y) - non_solar_rate - initial_b = 0.01 # Smaller b for a flatter Gaussian - initial_c = non_solar_rate - p0 = [initial_a, initial_b, initial_c] - bounds = ( - [0, 0.001, non_solar_rate * 0.9], - [1e6, 1.0, non_solar_rate * 1.1], - ) - - popt, _ = curve_fit( - twilight_model, - valid_twilight_x, - valid_twilight_y, - p0=p0, - bounds=bounds, - maxfev=50000, - ) - y_pred = twilight_model(valid_twilight_x, *popt) - r_squared = 1 - np.sum((valid_twilight_y - y_pred) ** 2) / np.sum( - (valid_twilight_y - np.mean(valid_twilight_y)) ** 2 - ) - - if r_squared < 0: - # Fallback to a constant model (mean of the data) - mean_rate = np.mean(valid_twilight_y) - twilight_models[surface_type] = { - "params": [mean_rate], - "r2": 0.0, # R² of a constant model is 0 by definition - "model": lambda x: np.full_like(x, mean_rate), - } - print( - f"Twilight model ({surface_type}) failed with R² = {r_squared:.3f}. 
Using constant model: {mean_rate:.2e} ph/s" - ) - - twilight_mask = (df["classification"] == "Twilight") & ( - df["surface_type"] == surface_type - ) - valid_twilight_mask = ( - twilight_mask - & df["solar_elevation"].notna() - & np.isfinite(df["solar_elevation"]) - ) - df.loc[valid_twilight_mask, "solar_background_rate"] = mean_rate - df.loc[ - twilight_mask & ~valid_twilight_mask, "solar_background_rate" - ] = non_solar_rate - else: - print( - f"Twilight model ({surface_type}): rate = {popt[0]:.2e} * exp(-{popt[1]:.2f} * solar_elevation^2) + {popt[2]:.2e}, R² = {r_squared:.3f}" - ) - twilight_models[surface_type] = { - "params": popt, - "r2": r_squared, - "model": lambda x: twilight_model(x, *popt), - } - - twilight_mask = (df["classification"] == "Twilight") & ( - df["surface_type"] == surface_type - ) - valid_twilight_mask = ( - twilight_mask - & df["solar_elevation"].notna() - & np.isfinite(df["solar_elevation"]) - ) - x_for_model = df.loc[valid_twilight_mask, "solar_elevation"] - df.loc[valid_twilight_mask, "solar_background_rate"] = ( - twilight_models[ - surface_type - ]["model"](x_for_model).clip(lower=0) - ) - df.loc[ - twilight_mask & ~valid_twilight_mask, "solar_background_rate" - ] = non_solar_rate - except RuntimeError as e: - print( - f"Twilight model fitting failed for {surface_type}: {e}. Setting twilight rates to non-solar rate." - ) - df.loc[ - (df["classification"] == "Twilight") - & (df["surface_type"] == surface_type), - "solar_background_rate", - ] = non_solar_rate - else: - print( - "Too few twilight points for modeling. Setting twilight rates to non-solar rate." - ) - df.loc[df["classification"] == "Twilight", "solar_background_rate"] = ( - non_solar_rate - ) - return df, twilight_models - - -def create_diagnostic_plot( - df, - daytime_df, - twilight_df, - nighttime_df, - daytime_models, - twilight_models, - non_solar_rate, - output_dir, -): - """Create diagnostic plot of background rate vs. 
solar elevation, including nighttime comparison.""" - plt.figure(figsize=(10, 6)) - - # Total background rate (all data) - plt.scatter( - df["solar_elevation"], - df["total_background_rate"], - c="gray", - alpha=0.5, - label="Total", - ) - - # Daytime data - plt.scatter( - daytime_df[daytime_df["surface_type"] == "Land"]["solar_elevation"], - daytime_df[daytime_df["surface_type"] == "Land"]["solar_background_rate"], - c="blue", - alpha=0.5, - label="Daytime Land", - ) - plt.scatter( - daytime_df[daytime_df["surface_type"] == "Ocean"]["solar_elevation"], - daytime_df[daytime_df["surface_type"] == "Ocean"]["solar_background_rate"], - c="cyan", - alpha=0.5, - label="Daytime Ocean", - ) - - # Twilight data - plt.scatter( - twilight_df[twilight_df["surface_type"] == "Land"]["solar_elevation"], - twilight_df[twilight_df["surface_type"] == "Land"]["solar_background_rate"], - c="orange", - alpha=0.5, - label="Twilight Land", - ) - plt.scatter( - twilight_df[twilight_df["surface_type"] == "Ocean"]["solar_elevation"], - twilight_df[twilight_df["surface_type"] == "Ocean"]["solar_background_rate"], - c="yellow", - alpha=0.5, - label="Twilight Ocean", - ) - - # Nighttime data (using total_background_rate since solar contribution is minimal) - plt.scatter( - nighttime_df[nighttime_df["surface_type"] == "Land"]["solar_elevation"], - nighttime_df[nighttime_df["surface_type"] == "Land"]["total_background_rate"], - c="purple", - alpha=0.5, - label="Nighttime Land", - ) - plt.scatter( - nighttime_df[nighttime_df["surface_type"] == "Ocean"]["solar_elevation"], - nighttime_df[nighttime_df["surface_type"] == "Ocean"]["total_background_rate"], - c="lightgreen", - alpha=0.5, - label="Nighttime Ocean", - ) - - # Non-solar background rate line - plt.axhline( - non_solar_rate, - color="r", - linestyle="--", - label=f"Non-Solar: {non_solar_rate:.2e} ph/s", - ) - - # Plot daytime fits - for surface_type in daytime_models: - if daytime_models[surface_type]["model"] is not None: - x_fit = 
np.linspace( - min(daytime_df["solar_elevation"]), - max(daytime_df["solar_elevation"]), - 100, - ) - y_fit = daytime_models[surface_type]["model"]( - x_fit - if "power" not in daytime_models[surface_type] - else x_fit - min(daytime_df["solar_elevation"]) + 1 - ) - plt.plot( - x_fit, - y_fit, - label=f'Daytime {surface_type} Fit (R²={daytime_models[surface_type]["r2"]:.3f})', - linestyle="-" if surface_type == "Land" else "--", - ) - - # Plot twilight fits - for surface_type in twilight_models: - if twilight_models[surface_type]["model"] is not None: - x_twilight = np.linspace( - min(twilight_df["solar_elevation"]), - max(twilight_df["solar_elevation"]), - 100, - ) - y_twilight = twilight_models[surface_type]["model"](x_twilight) - plt.plot( - x_twilight, - y_twilight, - label=f'Twilight {surface_type} Fit (R²={twilight_models[surface_type]["r2"]:.3f})', - linestyle="-" if surface_type == "Land" else "--", - ) - - # Set plot properties - plt.yscale("log") - plt.ylim(1e3, 1e8) - plt.xlabel("Solar Elevation (degrees)") - plt.ylabel("Background Rate (ph/s)") - plt.legend() - plt.title( - "Background Rate vs. Solar Elevation (Land vs. 
Ocean) with Nighttime Comparison" - ) - - # Save the plot - plt.savefig( - os.path.join(output_dir, "background_vs_elevation_with_nighttime_2021.png") - ) - plt.close() - - -def create_calibration_plot(df, daytime_df, output_dir): - """Create a calibration plot using mean photon height as a proxy.""" - if not daytime_df.empty: - sample_file = daytime_df.iloc[0] - heights = sample_file["mean_photon_height"] - if np.isfinite(heights): - solar_rate = sample_file["solar_background_rate"] - bin_size = 0.1 - # Simulate histogram with 1000 points - hist, edges = np.histogram( - np.array([heights] * 1000), - bins=np.arange(min(heights) - 1, max(heights) + 1, bin_size), - ) - photons_per_bin = ( - solar_rate - * (1000 / np.sum(hist)) - * bin_size - / (max(heights) - min(heights) + 2) - ) - calibrated_counts = np.maximum(hist - photons_per_bin, 0) - calibrated_heights = [] - for i, count in enumerate(calibrated_counts): - if count > 0: - calibrated_heights.extend( - np.random.uniform(edges[i], edges[i + 1], int(count)) - ) - - plt.figure(figsize=(8, 6)) - plt.hist( - [heights] * 1000, bins=100, alpha=0.5, label="Original (Simulated)" - ) - plt.hist(calibrated_heights, bins=100, alpha=0.5, label="Calibrated") - plt.xlabel("Photon Height (m)") - plt.ylabel("Count") - plt.legend() - plt.title(f'Calibration Example: {sample_file["file"]}') - plt.savefig(os.path.join(output_dir, "calibration_example.png")) - plt.close() - - -def save_results(df, output_dir): - """Save the results to a CSV file.""" - df.to_csv(os.path.join(output_dir, "atl03_analysis_updated.csv"), index=False) - - -def analyze_icesat2_data( - atl03_directory, - beam="gt1l", - twilight_threshold=6.0, - output_dir="output", - shoreline_data_path=None, -): - """Main analysis function orchestrating the workflow.""" - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - df = load_and_process_files(atl03_directory, beam, shoreline_data_path) - if df.empty: - raise ValueError("No data processed successfully. 
Check logs for errors.") - - df = assign_surface_type(df) - - # Compute total_background_rate based on surface_type - df["total_background_rate"] = np.where( - df["surface_type"] == "Land", - df["total_background_rate_land"], - np.where( - df["surface_type"] == "Ocean", df["total_background_rate_ocean"], np.nan - ), - ) - - df, daytime_df, nighttime_df, twilight_df, unknown_df = classify_data( - df, twilight_threshold - ) - - non_solar_rate = calculate_non_solar_rate(nighttime_df) - df = compute_solar_background_rates(df, non_solar_rate) - - # Recreate filtered DataFrames to include the updated columns from compute_solar_background_rates - daytime_df = df[df["classification"] == "Daytime"].copy() - nighttime_df = df[df["classification"] == "Nighttime"].copy() - twilight_df = df[df["classification"] == "Twilight"].copy() - unknown_df = df[df["classification"] == "Unknown"].copy() - - df, daytime_models = fit_daytime_model(df, daytime_df, non_solar_rate) - df, twilight_models = fit_twilight_model(df, twilight_df, non_solar_rate) - save_results(df, output_dir) - - # Updated call to include nighttime_df - create_diagnostic_plot( - df, - daytime_df, - twilight_df, - nighttime_df, - daytime_models, - twilight_models, - non_solar_rate, - output_dir, - ) - # create_calibration_plot(df, daytime_df, output_dir) - - return df, { - "daytime_models": daytime_models, - "twilight_models": twilight_models, - "non_solar_rate": non_solar_rate, - } - - -if __name__ == "__main__": - args = get_args() - atl03_directory = os.path.join(args.workspace_path, args.atl03_path) - shoreline_data_path = os.path.join(args.workspace_path, args.shoreline_data) - output_dir = os.path.join(args.workspace_path, args.output_path) - df, models = analyze_icesat2_data( - atl03_directory, - args.beam, - output_dir=output_dir, - shoreline_data_path=shoreline_data_path, - ) - print("Analysis complete.") diff --git a/aok/core/kd_utils/SolarBckgrd/analyze_solarbckgrd_icesat2.py 
b/aok/core/kd_utils/SolarBckgrd/analyze_solarbckgrd_icesat2.py deleted file mode 100644 index 1267057..0000000 --- a/aok/core/kd_utils/SolarBckgrd/analyze_solarbckgrd_icesat2.py +++ /dev/null @@ -1,785 +0,0 @@ -import logging -from multiprocessing import Pool -import os - -from config import get_args -import geopandas as gpd -import h5py -import matplotlib.pyplot as plt -from noise_utils.data_processing import isolate_sea_land_photons -import numpy as np -import pandas as pd -from scipy.interpolate import interp1d -from scipy.optimize import curve_fit - -# Set up logging -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -# Define model functions -def power_model(x, a, b, c): - return a * np.power(x, b) + c - - -def exp_model(x, a, b, c): - """Exponential model: a * exp(b * x) + c""" - return a * np.exp(b * x) + c - - -def twilight_model(x, a, b, c): - return a * np.exp(-b * x**2) + c # Gaussian form for twilight - - -def process_file(args): - """Process a single ATL03 file, skipping files missing required datasets.""" - file, beam, shoreline_data_path = args - try: - with h5py.File(file, "r") as f: - # Check for required datasets - required_keys = [ - f"{beam}/bckgrd_atlas/bckgrd_rate", - f"{beam}/bckgrd_atlas/delta_time", - f"{beam}/heights/delta_time", - f"{beam}/heights/h_ph", - f"{beam}/geolocation/reference_photon_lat", - f"{beam}/geolocation/reference_photon_lon", - f"{beam}/geolocation/solar_azimuth", - f"{beam}/geolocation/solar_elevation", - ] - for key in required_keys: - if key not in f: - raise KeyError(f"Missing dataset: {key}") - - bckgrd_rate = f[f"{beam}/bckgrd_atlas/bckgrd_rate"][:].astype(np.float64) - bckgrd_time = f[f"{beam}/bckgrd_atlas/delta_time"][:] - heights = f[f"{beam}/heights/h_ph"][:] - segment_lat = f[f"{beam}/geolocation/reference_photon_lat"][:] - segment_lon = f[f"{beam}/geolocation/reference_photon_lon"][:] - segment_delta_time = f[f"{beam}/geolocation/delta_time"][:] - - Segment_Is_Land = 
pd.DataFrame( - { - "latitude": segment_lat, # Use unique column names to avoid conflicts - "longitude": segment_lon, - "delta_time": segment_delta_time, - } - ) - - # Create a GeoDataFrame with geometry - Segment_Is_Land["geometry"] = gpd.points_from_xy( - Segment_Is_Land["longitude"], Segment_Is_Land["latitude"] - ) - ICESat2_GDF = gpd.GeoDataFrame(Segment_Is_Land, crs="EPSG:4326") - - # Determine land/ocean classification - Segment_Is_Land_Labels = isolate_sea_land_photons( - shoreline_data_path, ICESat2_GDF - ) - ICESat2_GDF["is_land"] = Segment_Is_Land_Labels - - # Interpolate is_land labels to bckgrd_time - interp_is_land = interp1d( - ICESat2_GDF["delta_time"], - ICESat2_GDF["is_land"].astype(int), - bounds_error=False, - fill_value=( - ICESat2_GDF["is_land"].astype(int).iloc[0], - ICESat2_GDF["is_land"].astype(int).iloc[-1], - ), - ) - is_land_interpolated = interp_is_land(bckgrd_time).round().astype(int) - - # ##########Debug################################# - # # Interpolate latitude and longitude to bckgrd_time - # interp_lat = interp1d( - # ICESat2_GDF['delta_time'], - # ICESat2_GDF['latitude'], - # bounds_error=False, - # fill_value=(ICESat2_GDF['latitude'].iloc[0], ICESat2_GDF['latitude'].iloc[-1]) - # ) - # interp_lon = interp1d( - # ICESat2_GDF['delta_time'], - # ICESat2_GDF['longitude'], - # bounds_error=False, - # fill_value=(ICESat2_GDF['longitude'].iloc[0], ICESat2_GDF['longitude'].iloc[-1]) - # ) - # interpolated_lat = interp_lat(bckgrd_time) - # interpolated_lon = interp_lon(bckgrd_time) - - # # Create a GeoDataFrame for is_land_interpolated with lat/lon - # is_land_df = pd.DataFrame({ - # 'delta_time': bckgrd_time, - # 'latitude': interpolated_lat, - # 'longitude': interpolated_lon, - # 'is_land': is_land_interpolated - # }) - # is_land_gdf = gpd.GeoDataFrame( - # is_land_df, - # geometry=gpd.points_from_xy(is_land_df['longitude'], is_land_df['latitude']), - # crs="EPSG:4326" - # ) - - # # Save to a GeoPackage file - # file_dir = 
os.path.dirname(file) - # is_land_output_dir = os.path.join(file_dir, "is_land_interpolated") - # if not os.path.exists(is_land_output_dir): - # os.makedirs(is_land_output_dir) - # output_filename = os.path.join(is_land_output_dir, f"{os.path.basename(file)}_is_land.gpkg") - # is_land_gdf.to_file(output_filename, driver="GPKG") - # logger.info(f"Saved is_land_interpolated to {output_filename}") - ############################################## - - # Compute total background rates for land and ocean - land_mask = is_land_interpolated == 1 - ocean_mask = is_land_interpolated == 0 - total_background_rate_land = ( - np.mean(bckgrd_rate[land_mask]) if np.sum(land_mask) > 0 else np.nan - ) - total_background_rate_ocean = ( - np.mean(bckgrd_rate[ocean_mask]) if np.sum(ocean_mask) > 0 else np.nan - ) - - # Solar elevation handling - FILL_VALUE = 3.4028235e38 - solar_elevations = f[f"{beam}/geolocation/solar_elevation"][:].astype( - np.float64 - ) - valid_solar = solar_elevations[ - (solar_elevations != FILL_VALUE) - & (np.abs(solar_elevations) < 1e30) - & np.isfinite(solar_elevations) - ] - solar_elevation = np.nan if valid_solar.size == 0 else np.mean(valid_solar) - - mean_photon_height = np.mean(heights) if heights.size > 0 else np.nan - - result = { - "file": os.path.basename(file), - "total_background_rate_land": total_background_rate_land, - "total_background_rate_ocean": total_background_rate_ocean, - "solar_elevation": solar_elevation, - "mean_photon_height": mean_photon_height, - } - logger.info(f"Processed {file}: {result}") - return result - except Exception as e: - logger.error(f"Error processing {file}: {e!s}") - return {"file": os.path.basename(file), "error": str(e)} - - -def fit_surface_model( - df, - classification, - surface_type, - model_func, - p0, - bounds, - x_col="solar_elevation", - y_col="solar_background_rate", -): - """Fit a model to a specific classification and surface type.""" - subset = df[ - (df["classification"] == classification) & 
(df["surface_type"] == surface_type) - ].copy() - if len(subset) < 3: - print(f"Warning: Insufficient data for {classification} {surface_type} model.") - return None, None, None - - mask = ( - subset[x_col].notna() - & subset[y_col].notna() - & np.isfinite(subset[x_col]) - & np.isfinite(subset[y_col]) - ) - valid_x = subset.loc[mask, x_col] - valid_y = subset.loc[mask, y_col] - - if len(valid_x) < 3: - print( - f"Warning: Fewer than 3 valid points for {classification} {surface_type} model." - ) - return None, None, None - - try: - popt, _ = curve_fit( - model_func, valid_x, valid_y, p0=p0, bounds=bounds, maxfev=20000 - ) - y_pred = model_func(valid_x, *popt) - r2 = 1 - np.sum((valid_y - y_pred) ** 2) / np.sum( - (valid_y - np.mean(valid_y)) ** 2 - ) - return popt, r2, lambda x: model_func(x, *popt) - except RuntimeError as e: - print(f"Fit failed for {classification} {surface_type}: {e}") - return None, None, None - - -def load_and_process_files(atl03_directory, beam, shoreline_data_path): - """Load and process ATL03 files in parallel.""" - atl03_files = [ - os.path.join(atl03_directory, f) - for f in os.listdir(atl03_directory) - if f.endswith(".h5") and "ATL03" in f - ] - print(f"Found {len(atl03_files)} ATL03 files.") - - if not atl03_files: - raise ValueError("No ATL03 files found in the directory.") - - with Pool() as pool: - results = pool.map( - process_file, [(f, beam, shoreline_data_path) for f in atl03_files] - ) - - print("Results from process_file:", results) # Debug output - df = pd.DataFrame(results) - - if "error" in df.columns: - error_files = df[df["error"].notna()]["file"].tolist() - print(f"Errors in {len(error_files)} files: {error_files}") - df = df[df["error"].isna()].drop(columns=["error"]) - - if df.empty: - raise ValueError( - "No files were successfully processed. Check the error messages above." 
- ) - - print("DataFrame columns after processing:", df.columns) - return df - - -def assign_surface_type(df): - """Assign surface type based on background rates.""" - required_columns = ["total_background_rate_land", "total_background_rate_ocean"] - missing_columns = [col for col in required_columns if col not in df.columns] - if missing_columns: - raise KeyError(f"Missing required columns in DataFrame: {missing_columns}") - - df["surface_type"] = np.where( - df["total_background_rate_land"].notna() - & (df["total_background_rate_land"] > 0), - "Land", - np.where( - df["total_background_rate_ocean"].notna() - & (df["total_background_rate_ocean"] > 0), - "Ocean", - "Mixed", - ), - ) - return df - - -def classify_data(df, twilight_threshold=6.0): - """Classify data into Daytime, Nighttime, Twilight, or Unknown.""" - df["classification"] = np.where( - df["solar_elevation"].isna(), - "Unknown", - np.where( - df["solar_elevation"] > twilight_threshold, - "Daytime", - np.where( - df["solar_elevation"] < -twilight_threshold, "Nighttime", "Twilight" - ), - ), - ) - daytime_df = df[df["classification"] == "Daytime"].copy() - nighttime_df = df[df["classification"] == "Nighttime"].copy() - twilight_df = df[df["classification"] == "Twilight"].copy() - unknown_df = df[df["classification"] == "Unknown"].copy() - print( - f"Daytime: {len(daytime_df)}, Nighttime: {len(nighttime_df)}, Twilight: {len(twilight_df)}, Unknown: {len(unknown_df)}" - ) - return df, daytime_df, nighttime_df, twilight_df, unknown_df - - -def calculate_non_solar_rate(nighttime_df): - """Calculate the non-solar background rate from nighttime data.""" - non_solar_rate_land = np.nanmean(nighttime_df["total_background_rate_land"]) - non_solar_rate_ocean = np.nanmean(nighttime_df["total_background_rate_ocean"]) - non_solar_rate = np.nanmean([non_solar_rate_land, non_solar_rate_ocean]) - print(f"Non-solar background rate (land): {non_solar_rate_land:.2e} ph/s") - print(f"Non-solar background rate (ocean): 
{non_solar_rate_ocean:.2e} ph/s") - print(f"Non-solar background rate (combined): {non_solar_rate:.2e} ph/s") - return non_solar_rate - - -def compute_solar_background_rates(df, non_solar_rate): - """Compute solar background rates for land and ocean.""" - df["solar_background_rate_land"] = ( - df["total_background_rate_land"] - non_solar_rate - ).clip(lower=0) - df["solar_background_rate_ocean"] = ( - df["total_background_rate_ocean"] - non_solar_rate - ).clip(lower=0) - df["solar_background_rate"] = np.where( - df["surface_type"] == "Land", - df["solar_background_rate_land"], - np.where( - df["surface_type"] == "Ocean", df["solar_background_rate_ocean"], np.nan - ), - ) - return df - - -def fit_daytime_model(df, daytime_df, non_solar_rate): - """Fit daytime models for land and ocean.""" - daytime_models = {} - for surface_type in ["Land", "Ocean"]: - subset = daytime_df[daytime_df["surface_type"] == surface_type].copy() - if len(subset) < 3: - print(f"Warning: Insufficient daytime {surface_type} data for modeling.") - continue - - mask = ( - subset["solar_elevation"].notna() - & subset["solar_background_rate"].notna() - & np.isfinite(subset["solar_elevation"]) - & np.isfinite(subset["solar_background_rate"]) - ) - valid_x = subset.loc[mask, "solar_elevation"] - valid_y = subset.loc[mask, "solar_background_rate"] - - if len(valid_x) < 3: - print( - f"Warning: Fewer than 3 valid points for daytime {surface_type} model." 
- ) - continue - - # Remove outliers - log_y = np.log10(valid_y + 1) - mean_log_y = np.mean(log_y) - std_log_y = np.std(log_y) - outlier_mask = (log_y > mean_log_y - 3 * std_log_y) & ( - log_y < mean_log_y + 3 * std_log_y - ) - valid_x = valid_x[outlier_mask] - valid_y = valid_y[outlier_mask] - - print(f"Daytime regression ({surface_type}): {len(valid_x)} valid data points") - try: - p0_exp = [np.max(valid_y) - non_solar_rate, 0.05, non_solar_rate] - bounds_exp = ( - [0, 0.01, non_solar_rate * 0.9], - [1e8, 0.5, non_solar_rate * 1.1], - ) - popt_exp, _ = curve_fit( - exp_model, valid_x, valid_y, p0=p0_exp, bounds=bounds_exp, maxfev=20000 - ) - r2_exp = 1 - np.sum( - (valid_y - exp_model(valid_x, *popt_exp)) ** 2 - ) / np.sum((valid_y - np.mean(valid_y)) ** 2) - - x_shifted = valid_x - min(valid_x) + 1 - p0_power = [np.max(valid_y) - non_solar_rate, 0.6, non_solar_rate] - bounds_power = ( - [0, 0.1, non_solar_rate * 0.9], - [1e8, 1.0, non_solar_rate * 1.1], - ) - popt_power, _ = curve_fit( - power_model, - x_shifted, - valid_y, - p0=p0_power, - bounds=bounds_power, - maxfev=20000, - ) - r2_power = 1 - np.sum( - (valid_y - power_model(x_shifted, *popt_power)) ** 2 - ) / np.sum((valid_y - np.mean(valid_y)) ** 2) - - if r2_power > r2_exp: - print( - f"Daytime model ({surface_type}, power law): rate = {popt_power[0]:.2e} * solar_elevation^{popt_power[1]:.2f} + {popt_power[2]:.2e}, R² = {r2_power:.3f}" - ) - daytime_models[surface_type] = { - "params": popt_power, - "r2": r2_power, - "model": lambda x: power_model(x - min(valid_x) + 1, *popt_power), - } - else: - print( - f"Daytime model ({surface_type}, exponential): rate = {popt_exp[0]:.2e} * exp({popt_exp[1]:.2f} * solar_elevation) + {popt_exp[2]:.2e}, R² = {r2_exp:.3f}" - ) - daytime_models[surface_type] = { - "params": popt_exp, - "r2": r2_exp, - "model": lambda x: exp_model(x, *popt_exp), - } - - # Apply model - daytime_mask = (df["classification"] == "Daytime") & ( - df["surface_type"] == surface_type - ) - 
valid_daytime_mask = ( - daytime_mask - & df["solar_elevation"].notna() - & np.isfinite(df["solar_elevation"]) - ) - x_for_model = df.loc[valid_daytime_mask, "solar_elevation"] - if r2_power > r2_exp: - x_for_model_shifted = x_for_model - min(valid_x) + 1 - df.loc[valid_daytime_mask, "solar_background_rate"] = daytime_models[ - surface_type - ]["model"](x_for_model_shifted).clip(lower=0) - else: - df.loc[valid_daytime_mask, "solar_background_rate"] = daytime_models[ - surface_type - ]["model"](x_for_model).clip(lower=0) - df.loc[daytime_mask & ~valid_daytime_mask, "solar_background_rate"] = ( - non_solar_rate - ) - except RuntimeError as e: - print( - f"Daytime model fitting failed for {surface_type}: {e}. Using non-solar rate as fallback." - ) - df.loc[ - (df["classification"] == "Daytime") - & (df["surface_type"] == surface_type), - "solar_background_rate", - ] = non_solar_rate - return df, daytime_models - - -def fit_twilight_model(df, twilight_df, non_solar_rate): - """Fit twilight models for land and ocean.""" - twilight_models = {} - if len(twilight_df) > 3: - df.loc[df["classification"] == "Twilight", "solar_background_rate"] = ( - df["solar_background_rate"].fillna(0).clip(lower=0) - ) - for surface_type in ["Land", "Ocean"]: - subset = twilight_df[twilight_df["surface_type"] == surface_type].copy() - if len(subset) < 3: - print( - f"Warning: Insufficient twilight {surface_type} data for modeling." - ) - continue - - mask = ( - subset["solar_elevation"].notna() - & subset["solar_background_rate"].notna() - & np.isfinite(subset["solar_elevation"]) - & np.isfinite(subset["solar_background_rate"]) - ) - valid_twilight_x = subset.loc[mask, "solar_elevation"] - valid_twilight_y = subset.loc[mask, "solar_background_rate"] - - if len(valid_twilight_x) < 3: - print( - f"Warning: Fewer than 3 valid points for twilight {surface_type} model." 
- ) - continue - - print( - f"Twilight regression ({surface_type}): {len(valid_twilight_x)} valid data points" - ) - try: - initial_a = np.max(valid_twilight_y) - non_solar_rate - initial_b = 0.01 # Smaller b for a flatter Gaussian - initial_c = non_solar_rate - p0 = [initial_a, initial_b, initial_c] - bounds = ( - [0, 0.001, non_solar_rate * 0.9], - [1e6, 1.0, non_solar_rate * 1.1], - ) - - popt, _ = curve_fit( - twilight_model, - valid_twilight_x, - valid_twilight_y, - p0=p0, - bounds=bounds, - maxfev=50000, - ) - y_pred = twilight_model(valid_twilight_x, *popt) - r_squared = 1 - np.sum((valid_twilight_y - y_pred) ** 2) / np.sum( - (valid_twilight_y - np.mean(valid_twilight_y)) ** 2 - ) - - if r_squared < 0: - # Fallback to a constant model (mean of the data) - mean_rate = np.mean(valid_twilight_y) - twilight_models[surface_type] = { - "params": [mean_rate], - "r2": 0.0, # R² of a constant model is 0 by definition - "model": lambda x: np.full_like(x, mean_rate), - } - print( - f"Twilight model ({surface_type}) failed with R² = {r_squared:.3f}. 
Using constant model: {mean_rate:.2e} ph/s" - ) - - twilight_mask = (df["classification"] == "Twilight") & ( - df["surface_type"] == surface_type - ) - valid_twilight_mask = ( - twilight_mask - & df["solar_elevation"].notna() - & np.isfinite(df["solar_elevation"]) - ) - df.loc[valid_twilight_mask, "solar_background_rate"] = mean_rate - df.loc[ - twilight_mask & ~valid_twilight_mask, "solar_background_rate" - ] = non_solar_rate - else: - print( - f"Twilight model ({surface_type}): rate = {popt[0]:.2e} * exp(-{popt[1]:.2f} * solar_elevation^2) + {popt[2]:.2e}, R² = {r_squared:.3f}" - ) - twilight_models[surface_type] = { - "params": popt, - "r2": r_squared, - "model": lambda x: twilight_model(x, *popt), - } - - twilight_mask = (df["classification"] == "Twilight") & ( - df["surface_type"] == surface_type - ) - valid_twilight_mask = ( - twilight_mask - & df["solar_elevation"].notna() - & np.isfinite(df["solar_elevation"]) - ) - x_for_model = df.loc[valid_twilight_mask, "solar_elevation"] - df.loc[valid_twilight_mask, "solar_background_rate"] = ( - twilight_models[ - surface_type - ]["model"](x_for_model).clip(lower=0) - ) - df.loc[ - twilight_mask & ~valid_twilight_mask, "solar_background_rate" - ] = non_solar_rate - except RuntimeError as e: - print( - f"Twilight model fitting failed for {surface_type}: {e}. Setting twilight rates to non-solar rate." - ) - df.loc[ - (df["classification"] == "Twilight") - & (df["surface_type"] == surface_type), - "solar_background_rate", - ] = non_solar_rate - else: - print( - "Too few twilight points for modeling. Setting twilight rates to non-solar rate." - ) - df.loc[df["classification"] == "Twilight", "solar_background_rate"] = ( - non_solar_rate - ) - return df, twilight_models - - -def create_diagnostic_plot( - df, - daytime_df, - twilight_df, - daytime_models, - twilight_models, - non_solar_rate, - output_dir, -): - """Create diagnostic plot of background rate vs. 
solar elevation.""" - plt.figure(figsize=(10, 6)) - plt.scatter( - df["solar_elevation"], - df["total_background_rate"], - c="gray", - alpha=0.5, - label="Total", - ) - plt.scatter( - daytime_df[daytime_df["surface_type"] == "Land"]["solar_elevation"], - daytime_df[daytime_df["surface_type"] == "Land"]["solar_background_rate"], - c="blue", - alpha=0.5, - label="Daytime Land", - ) - plt.scatter( - daytime_df[daytime_df["surface_type"] == "Ocean"]["solar_elevation"], - daytime_df[daytime_df["surface_type"] == "Ocean"]["solar_background_rate"], - c="cyan", - alpha=0.5, - label="Daytime Ocean", - ) - plt.scatter( - twilight_df[twilight_df["surface_type"] == "Land"]["solar_elevation"], - twilight_df[twilight_df["surface_type"] == "Land"]["solar_background_rate"], - c="orange", - alpha=0.5, - label="Twilight Land", - ) - plt.scatter( - twilight_df[twilight_df["surface_type"] == "Ocean"]["solar_elevation"], - twilight_df[twilight_df["surface_type"] == "Ocean"]["solar_background_rate"], - c="yellow", - alpha=0.5, - label="Twilight Ocean", - ) - plt.axhline( - non_solar_rate, - color="r", - linestyle="--", - label=f"Non-Solar: {non_solar_rate:.2e} ph/s", - ) - - # Plot daytime fits - for surface_type in daytime_models: - if daytime_models[surface_type]["model"] is not None: - x_fit = np.linspace( - min(daytime_df["solar_elevation"]), - max(daytime_df["solar_elevation"]), - 100, - ) - y_fit = daytime_models[surface_type]["model"]( - x_fit - if "power" not in daytime_models[surface_type] - else x_fit - min(daytime_df["solar_elevation"]) + 1 - ) - plt.plot( - x_fit, - y_fit, - label=f'Daytime {surface_type} Fit (R²={daytime_models[surface_type]["r2"]:.3f})', - linestyle="-" if surface_type == "Land" else "--", - ) - - # Plot twilight fits - for surface_type in twilight_models: - if twilight_models[surface_type]["model"] is not None: - x_twilight = np.linspace( - min(twilight_df["solar_elevation"]), - max(twilight_df["solar_elevation"]), - 100, - ) - y_twilight = 
twilight_models[surface_type]["model"](x_twilight) - plt.plot( - x_twilight, - y_twilight, - label=f'Twilight {surface_type} Fit (R²={twilight_models[surface_type]["r2"]:.3f})', - linestyle="-" if surface_type == "Land" else "--", - ) - - plt.yscale("log") - plt.ylim(1e3, 1e8) - plt.xlabel("Solar Elevation (degrees)") - plt.ylabel("Background Rate (ph/s)") - plt.legend() - plt.title("Background Rate vs. Solar Elevation (Land vs. Ocean)") - plt.savefig(os.path.join(output_dir, "background_vs_elevation_exp_fit_2021.png")) - plt.close() - - -def create_calibration_plot(df, daytime_df, output_dir): - """Create a calibration plot using mean photon height as a proxy.""" - if not daytime_df.empty: - sample_file = daytime_df.iloc[0] - heights = sample_file["mean_photon_height"] - if np.isfinite(heights): - solar_rate = sample_file["solar_background_rate"] - bin_size = 0.1 - # Simulate histogram with 1000 points - hist, edges = np.histogram( - np.array([heights] * 1000), - bins=np.arange(min(heights) - 1, max(heights) + 1, bin_size), - ) - photons_per_bin = ( - solar_rate - * (1000 / np.sum(hist)) - * bin_size - / (max(heights) - min(heights) + 2) - ) - calibrated_counts = np.maximum(hist - photons_per_bin, 0) - calibrated_heights = [] - for i, count in enumerate(calibrated_counts): - if count > 0: - calibrated_heights.extend( - np.random.uniform(edges[i], edges[i + 1], int(count)) - ) - - plt.figure(figsize=(8, 6)) - plt.hist( - [heights] * 1000, bins=100, alpha=0.5, label="Original (Simulated)" - ) - plt.hist(calibrated_heights, bins=100, alpha=0.5, label="Calibrated") - plt.xlabel("Photon Height (m)") - plt.ylabel("Count") - plt.legend() - plt.title(f'Calibration Example: {sample_file["file"]}') - plt.savefig(os.path.join(output_dir, "calibration_example.png")) - plt.close() - - -def save_results(df, output_dir): - """Save the results to a CSV file.""" - df.to_csv(os.path.join(output_dir, "atl03_analysis_updated.csv"), index=False) - - -def analyze_icesat2_data( - 
atl03_directory, - beam="gt1l", - twilight_threshold=6.0, - output_dir="output", - shoreline_data_path=None, -): - """Main analysis function orchestrating the workflow.""" - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - df = load_and_process_files(atl03_directory, beam, shoreline_data_path) - if df.empty: - raise ValueError("No data processed successfully. Check logs for errors.") - - df = assign_surface_type(df) - - # Compute total_background_rate based on surface_type - df["total_background_rate"] = np.where( - df["surface_type"] == "Land", - df["total_background_rate_land"], - np.where( - df["surface_type"] == "Ocean", df["total_background_rate_ocean"], np.nan - ), - ) - - df, daytime_df, nighttime_df, twilight_df, unknown_df = classify_data( - df, twilight_threshold - ) - - non_solar_rate = calculate_non_solar_rate(nighttime_df) - df = compute_solar_background_rates(df, non_solar_rate) - - # Recreate filtered DataFrames to include the updated columns from compute_solar_background_rates - daytime_df = df[df["classification"] == "Daytime"].copy() - nighttime_df = df[df["classification"] == "Nighttime"].copy() - twilight_df = df[df["classification"] == "Twilight"].copy() - unknown_df = df[df["classification"] == "Unknown"].copy() - - df, daytime_models = fit_daytime_model(df, daytime_df, non_solar_rate) - df, twilight_models = fit_twilight_model(df, twilight_df, non_solar_rate) - save_results(df, output_dir) - create_diagnostic_plot( - df, - daytime_df, - twilight_df, - daytime_models, - twilight_models, - non_solar_rate, - output_dir, - ) - create_calibration_plot(df, daytime_df, output_dir) - - return df, { - "daytime_models": daytime_models, - "twilight_models": twilight_models, - "non_solar_rate": non_solar_rate, - } - - -if __name__ == "__main__": - args = get_args() - atl03_directory = os.path.join(args.workspace_path, args.atl03_path) - shoreline_data_path = os.path.join(args.workspace_path, args.shoreline_data) - output_dir = 
os.path.join(args.workspace_path, args.output_path) - df, models = analyze_icesat2_data( - atl03_directory, - args.beam, - output_dir=output_dir, - shoreline_data_path=shoreline_data_path, - ) - print("Analysis complete.") diff --git a/aok/core/kd_utils/SolarBckgrd/config.py b/aok/core/kd_utils/SolarBckgrd/config.py deleted file mode 100644 index ed34089..0000000 --- a/aok/core/kd_utils/SolarBckgrd/config.py +++ /dev/null @@ -1,119 +0,0 @@ -import argparse - - -def get_args(): - parser = argparse.ArgumentParser( - description="Analyze ICESat-2 ATL03 data for solar background." - ) - - # task name - parser.add_argument("--taskID", type=str, default="2022_granules", help="task name") - # workspace path - parser.add_argument( - "--workspace_path", - type=str, - default="/work/users/w/a/wayne128/ICESat2/IS2_kd_py/", - help="Root path for all data processing.", - ) - - # parser.add_argument("--atl03_path", type=str, default='Dataset/ATL03_ICESat2/Extracted_HDF_Files/NC/NCCENTRAL/', - # help="Base path for ICESat-2 ATL03 data.") - - # parser.add_argument("--atl03_path", type=str, default='Dataset/ATL03_ICESat2/Extracted_HDF_Files/NC/NCNORTH/', - # help="Base path for ICESat-2 ATL03 data.") - - # parser.add_argument("--atl03_path", type=str, default='Dataset/ATL03_ICESat2/Extracted_HDF_Files/NC/NCSOUTH/', - # help="Base path for ICESat-2 ATL03 data.") - - # parser.add_argument("--atl03_path", type=str, default='Dataset/ATL03_ICESat2/Extracted_HDF_Files/CD/2018_granules/h5_files/', - # help="Base path for ICESat-2 ATL03 data.") - - # parser.add_argument("--atl03_path", type=str, default='Dataset/ATL03_ICESat2/Extracted_HDF_Files/CD/2019_granules/h5_files/', - # help="Base path for ICESat-2 ATL03 data.") - - # parser.add_argument("--atl03_path", type=str, default='Dataset/ATL03_ICESat2/Extracted_HDF_Files/CD/2020_granules/h5_files/', - # help="Base path for ICESat-2 ATL03 data.") - - # parser.add_argument("--atl03_path", type=str, 
default='Dataset/ATL03_ICESat2/Extracted_HDF_Files/CD/2021_granules/h5_files/', - # help="Base path for ICESat-2 ATL03 data.") - - parser.add_argument( - "--atl03_path", - type=str, - default="Dataset/ATL03_ICESat2/Extracted_HDF_Files/CD/2020_granules/h5_files/", - help="Base path for ICESat-2 ATL03 data.", - ) - - # parser.add_argument("--parallel", action="store_true", help="Process files in parallel") - parser.add_argument("--beam", default="gt1l", help="Beam to process (e.g., gt1l)") - - parser.add_argument( - "--atl03_file", - type=str, - default="processed_ATL03_20221001234100_01721701_006_01.h5", - help="Name of the ATL03 H5 file to process.", - ) # processed_ATL03_20221001114643_01641707_006_01 - - parser.add_argument( - "--output_path", - type=str, - default="Dataset/Results/", - help="Directory for saving results.", - ) - - # Shoreline data - parser.add_argument( - "--shoreline_data", - type=str, - # default='Shorelines/GeoPkgGlobalShoreline.gpkg', - default="Dataset/Shorelines/ne_10m_land/ne_10m_land.shp", - help="Path to the global shoreline dataset.", - ) - - # Bathymetry datasets - parser.add_argument( - "--gebco_path", - nargs="+", - type=str, - default="Dataset/Bathy/gebco_2024_geotiff/", - help="Path to GEBCO bathymetry datasets.", - ) - - # Resolution settings - parser.add_argument( - "--horizontal_res", - type=int, - default=500, - help="Horizontal resolution for the analysis (in meters).", - ) - parser.add_argument( - "--vertical_res", - type=float, - default=0.25, - help="Vertical resolution for the analysis.", - ) - - # Analysis parameters - parser.add_argument( - "--subsurface_thresh", - type=float, - default=0.5, - help="Threshold for photons below the sea surface.", - ) - parser.add_argument( - "--ignore_subsurface_height_thres", - type=float, - default=-1, - help="Maximum depth thres for Kd calculations (in meters).", - ) - - return parser.parse_args() - - -if __name__ == "__main__": - args = get_args() - # Access parameters like this: - 
atl03_file_path = args.workspace_path + args.atl03_file - - print("Processing file:", atl03_file_path) - print("Output path:", args.output_path) diff --git a/aok/core/kd_utils/kd_utils.py b/aok/core/kd_utils/kd_utils.py deleted file mode 100644 index 5aa45ca..0000000 --- a/aok/core/kd_utils/kd_utils.py +++ /dev/null @@ -1,1358 +0,0 @@ -#!/usr/bin/env python - -##### -# This group of functions processes ICESat-2 data and creates a bathymetric model. -# To do this, it follows a number of steps in the form of functions, including: -# 1. Reading data (ReadATL03()) -# 2. Orthometrically correcting the dataset (OrthometricCorrection()) -# 3. Pulling down the data segment ID (getAtl03SegID()) -# 4. Bin the data along latitudinal and height gradients (bin_data()) -# 5. Calculate sea height (get_sea_height()) -# 6. Get water temperature (get_water_temp()) -# 7. Correct bathymetric surface for refraction (RefractionCorrection()) -# 8. Calculate bathymetric height (get_bath_height()) -# 9. Produce figures (produce_figures()) -##### -from datetime import datetime - -# os.environ["PROJ_LIB"] = r"C:\Users\wayne\anaconda3\Library\share\proj" -import io -import logging -import math -import os -import re -import time - -import geopandas as gpd -import h5py -import hdbscan - -# from pyproj import Transformer -import matplotlib.pyplot as plt -import netCDF4 -import numpy as np -import pandas as pd - -# import fiona -from pyproj import Proj, Transformer -from sklearn.preprocessing import MinMaxScaler - - -# this function from icesat2_toolkit -# PURPOSE: read ICESat-2 ATL03 HDF5 data files -def read_granule(FILENAME, ATTRIBUTES=False, **kwargs): - """ - Reads ICESat-2 ATL03 Global Geolocated Photons data files - - Parameters - ---------- - FILENAME: str - full path to ATL03 file - ATTRIBUTES: bool, default False - read file, group and variable attributes - - Returns - ------- - IS2_atl03_mds: dict - ATL03 variables - IS2_atl03_attrs: dict - ATL03 attributes - IS2_atl03_beams: list - 
valid ICESat-2 beams within ATL03 file - """ - # Open the HDF5 file for reading - if isinstance(FILENAME, io.IOBase): - fileID = h5py.File(FILENAME, "r") - else: - fileID = h5py.File(os.path.expanduser(FILENAME), "r") - - # Output HDF5 file information - logging.info(fileID.filename) - logging.info(list(fileID.keys())) - - # allocate python dictionaries for ICESat-2 ATL03 variables and attributes - IS2_atl03_mds = {} - IS2_atl03_attrs = {} - - # read each input beam within the file - IS2_atl03_beams = [] - for gtx in [k for k in fileID.keys() if bool(re.match(r"gt\d[lr]", k))]: - # check if subsetted beam contains data - # check in both the geolocation and heights groups - try: - fileID[gtx]["geolocation"]["segment_id"] - fileID[gtx]["heights"]["delta_time"] - except KeyError: - pass - else: - IS2_atl03_beams.append(gtx) - - # for each included beam - for gtx in IS2_atl03_beams: - # get each HDF5 variable - IS2_atl03_mds[gtx] = {} - IS2_atl03_mds[gtx]["heights"] = {} - IS2_atl03_mds[gtx]["geolocation"] = {} - IS2_atl03_mds[gtx]["bckgrd_atlas"] = {} - IS2_atl03_mds[gtx]["geophys_corr"] = {} - # ICESat-2 Measurement Group - for key, val in fileID[gtx]["heights"].items(): - IS2_atl03_mds[gtx]["heights"][key] = val[:] - # ICESat-2 Geolocation Group - for key, val in fileID[gtx]["geolocation"].items(): - IS2_atl03_mds[gtx]["geolocation"][key] = val[:] - # ICESat-2 Background Photon Rate Group - for key, val in fileID[gtx]["bckgrd_atlas"].items(): - IS2_atl03_mds[gtx]["bckgrd_atlas"][key] = val[:] - # ICESat-2 Geophysical Corrections Group: Values for tides (ocean, - # solid earth, pole, load, and equilibrium), inverted barometer (IB) - # effects, and range corrections for tropospheric delays - for key, val in fileID[gtx]["geophys_corr"].items(): - IS2_atl03_mds[gtx]["geophys_corr"][key] = val[:] - - # Getting attributes of included variables - if ATTRIBUTES: - # Getting attributes of IS2_atl03_mds beam variables - IS2_atl03_attrs[gtx] = {} - 
IS2_atl03_attrs[gtx]["heights"] = {} - IS2_atl03_attrs[gtx]["geolocation"] = {} - IS2_atl03_attrs[gtx]["bckgrd_atlas"] = {} - IS2_atl03_attrs[gtx]["geophys_corr"] = {} - # Global Group Attributes - for att_name, att_val in fileID[gtx].attrs.items(): - IS2_atl03_attrs[gtx][att_name] = att_val - # ICESat-2 Measurement Group - for key, val in fileID[gtx]["heights"].items(): - IS2_atl03_attrs[gtx]["heights"][key] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]["heights"][key][att_name] = att_val - # ICESat-2 Geolocation Group - for key, val in fileID[gtx]["geolocation"].items(): - IS2_atl03_attrs[gtx]["geolocation"][key] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]["geolocation"][key][att_name] = att_val - # ICESat-2 Background Photon Rate Group - for key, val in fileID[gtx]["bckgrd_atlas"].items(): - IS2_atl03_attrs[gtx]["bckgrd_atlas"][key] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]["bckgrd_atlas"][key][att_name] = att_val - # ICESat-2 Geophysical Corrections Group - for key, val in fileID[gtx]["geophys_corr"].items(): - IS2_atl03_attrs[gtx]["geophys_corr"][key] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs[gtx]["geophys_corr"][key][att_name] = att_val - - # ICESat-2 spacecraft orientation at time - IS2_atl03_mds["orbit_info"] = {} - IS2_atl03_attrs["orbit_info"] = {} - for key, val in fileID["orbit_info"].items(): - IS2_atl03_mds["orbit_info"][key] = val[:] - # Getting attributes of group and included variables - if ATTRIBUTES: - # Global Group Attributes - for att_name, att_val in fileID["orbit_info"].attrs.items(): - IS2_atl03_attrs["orbit_info"][att_name] = att_val - # Variable Attributes - IS2_atl03_attrs["orbit_info"][key] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs["orbit_info"][key][att_name] = att_val - - # information ancillary to the data product - # number of GPS seconds between the GPS epoch (1980-01-06T00:00:00Z UTC) - # 
and ATLAS Standard Data Product (SDP) epoch (2018-01-01T00:00:00Z UTC) - # Add this value to delta time parameters to compute full gps_seconds - # could alternatively use the Julian day of the ATLAS SDP epoch: 2458119.5 - # and add leap seconds since 2018-01-01T00:00:00Z UTC (ATLAS SDP epoch) - IS2_atl03_mds["ancillary_data"] = {} - IS2_atl03_attrs["ancillary_data"] = {} - ancillary_keys = [ - "atlas_sdp_gps_epoch", - "data_end_utc", - "data_start_utc", - "end_cycle", - "end_geoseg", - "end_gpssow", - "end_gpsweek", - "end_orbit", - "end_region", - "end_rgt", - "granule_end_utc", - "granule_start_utc", - "release", - "start_cycle", - "start_geoseg", - "start_gpssow", - "start_gpsweek", - "start_orbit", - "start_region", - "start_rgt", - "version", - ] - for key in ancillary_keys: - # get each HDF5 variable - IS2_atl03_mds["ancillary_data"][key] = fileID["ancillary_data"][key][:] - # Getting attributes of group and included variables - if ATTRIBUTES: - # Variable Attributes - IS2_atl03_attrs["ancillary_data"][key] = {} - for att_name, att_val in fileID["ancillary_data"][key].attrs.items(): - IS2_atl03_attrs["ancillary_data"][key][att_name] = att_val - - # transmit-echo-path (tep) parameters - IS2_atl03_mds["ancillary_data"]["tep"] = {} - IS2_atl03_attrs["ancillary_data"]["tep"] = {} - for key, val in fileID["ancillary_data"]["tep"].items(): - # get each HDF5 variable - IS2_atl03_mds["ancillary_data"]["tep"][key] = val[:] - # Getting attributes of group and included variables - if ATTRIBUTES: - # Variable Attributes - IS2_atl03_attrs["ancillary_data"]["tep"][key] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs["ancillary_data"]["tep"][key][att_name] = att_val - - # channel dead time and first photon bias derived from ATLAS calibration - cal1, cal2 = ("ancillary_data", "calibrations") - for var in ["dead_time", "first_photon_bias"]: - IS2_atl03_mds[cal1][var] = {} - IS2_atl03_attrs[cal1][var] = {} - for key, val in fileID[cal1][cal2][var].items(): 
- # get each HDF5 variable - if isinstance(val, h5py.Dataset): - IS2_atl03_mds[cal1][var][key] = val[:] - elif isinstance(val, h5py.Group): - IS2_atl03_mds[cal1][var][key] = {} - for k, v in val.items(): - IS2_atl03_mds[cal1][var][key][k] = v[:] - # Getting attributes of group and included variables - if ATTRIBUTES: - # Variable Attributes - IS2_atl03_attrs[cal1][var][key] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs[cal1][var][key][att_name] = att_val - if isinstance(val, h5py.Group): - for k, v in val.items(): - IS2_atl03_attrs[cal1][var][key][k] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs[cal1][var][key][k][att_name] = att_val - - # get ATLAS impulse response variables for the transmitter echo path (TEP) - tep1, tep2 = ("atlas_impulse_response", "tep_histogram") - IS2_atl03_mds[tep1] = {} - IS2_atl03_attrs[tep1] = {} - for pce in ["pce1_spot1", "pce2_spot3"]: - IS2_atl03_mds[tep1][pce] = {tep2: {}} - IS2_atl03_attrs[tep1][pce] = {tep2: {}} - # for each TEP variable - for key, val in fileID[tep1][pce][tep2].items(): - IS2_atl03_mds[tep1][pce][tep2][key] = val[:] - # Getting attributes of included variables - if ATTRIBUTES: - # Global Group Attributes - for att_name, att_val in fileID[tep1][pce][tep2].attrs.items(): - IS2_atl03_attrs[tep1][pce][tep2][att_name] = att_val - # Variable Attributes - IS2_atl03_attrs[tep1][pce][tep2][key] = {} - for att_name, att_val in val.attrs.items(): - IS2_atl03_attrs[tep1][pce][tep2][key][att_name] = att_val - - # Global File Attributes - if ATTRIBUTES: - for att_name, att_val in fileID.attrs.items(): - IS2_atl03_attrs[att_name] = att_val - - # Closing the HDF5 file - fileID.close() - # Return the datasets and variables - return (IS2_atl03_mds, IS2_atl03_attrs, IS2_atl03_beams) - - -# convert_wgs_to_utm function, see https://stackoverflow.com/a/40140326/4556479 -def convert_wgs_to_utm(lon: float, lat: float): - """Based on lat and lng, return best utm epsg-code""" - utm_band = 
str((math.floor((lon + 180) / 6) % 60) + 1) - if len(utm_band) == 1: - utm_band = "0" + utm_band - if lat >= 0: - epsg_code = "epsg:326" + utm_band - return epsg_code - epsg_code = "epsg:327" + utm_band - return epsg_code - - -def orthometric_correction(lat, lon, Z, epsg): - # Define the Proj string - # To transform from WGS84 ellipsoidal height - # to EGM2008 orthometric height using PyProj - # proj_string = '+proj=latlong +ellps=WGS84 +datum=WGS84 +vunits=m +no_defs +geoidgrids=egm2008-1.gtx' - # # Define the Proj string for WGS84 ellipsoidal height - # wgs84_proj_string = '+proj=latlong +ellps=WGS84 +datum=WGS84 +no_defs' - - # # Define the Proj string for EGM2008 orthometric height: egm08_25,egm2008-1 - # egm2008_proj_string = \ - # '+proj=latlong +ellps=WGS84 +datum=WGS84 +no_defs ' \ - # '+geoidgrids=C:/Workstation/ICESat2_HLS/Code/Geoids/egm08_25.gtx' - - # transform ellipsoid (WGS84) height to orthometric height - # transformer = Transformer.from_crs(wgs84_proj_string, egm2008_proj_string, always_xy=True) - transformer = Transformer.from_crs("EPSG:4326", "EPSG:3857", always_xy=True) - X_egm08, Y_egm08, Z_egm08 = transformer.transform(lon, lat, Z) - - # transform WGS84 proj to local UTM - myProj = Proj(epsg) - X_utm, Y_utm = myProj(lon, lat) - - return Y_utm, X_utm, Z_egm08 - - -# Snippet by Eric Guenther (via Amy N.) for assigning photons to a segment -def get_atl03_seg_id(atl03_ph_index_beg, atl03_segment_id, atl03_heights_len): - # We need to know spacecraft orbit info, - # which is provided across segments. - # This first function assigns photons to the segment they belong to. - # We end up making a new array - # that has more points to match the photon. Segment is defined as every 100m in the long track. 
- - # Filter all data where atl03_ph_index starts at 0 (0 indicates errors) - indsNotZero = atl03_ph_index_beg != 0 - atl03_ph_index_beg = atl03_ph_index_beg[indsNotZero] - atl03_segment_id = atl03_segment_id[indsNotZero] - - # Subtract 1 from ph_index_beg to start at python 0th pos - atl03_ph_index_beg = atl03_ph_index_beg - 1 - - # Sometimes the ph_index_beg is not at the 0th position, it is not, - # add it in and then add the associated segment id - # Warning, this is assuming that the segment id for the points are from - # the segment id directly before it, this assumption might fail, but I have - # not come across a case yet where it does. If you want to play it safe - # you could comment this section out and then if the first position is not - # 0 then all photons before the first position will not be assigned a - # segment id. - # if atl03_ph_index_beg[0] != 0: - # atl03_ph_index_beg = np.append(0,atl03_ph_index_beg) - # first_seg_id = atl03_segment_id[0] -1 - # atl03_segment_id = np.append(first_seg_id,atl03_segment_id) - - # Append atl03_height_len to end of array for final position - atl03_ph_index_beg = np.append(atl03_ph_index_beg, atl03_heights_len) - - # Make array equal to the length of the atl03_heights photon level data - ph_segment_id = np.zeros(atl03_heights_len) - - # Iterate through ph_index_beg, from the first to second to last number - # and set the photons between ph_index_beg i to ph_index_beg i + 1 to - # segment id i - for i in range(len(atl03_ph_index_beg) - 1): - ph_segment_id[atl03_ph_index_beg[i] : atl03_ph_index_beg[i + 1]] = ( - atl03_segment_id[i] - ) - - # Return list of segment_id at the photon level - return ph_segment_id - - -def ref_linear_interp(x, y): - # initialize an empty list - arr = [] - - # get unique x values - ux = np.unique(x) - for u in ux: - # get y values for x=u - idx = y[x == u] - - # try to get the y values for x=u-1 and x=u - # if this is not possible, set min and max to y values for x=u - try: - min = y[x == 
u - 1][0] - max = y[x == u][0] - except: - min = y[x == u][0] - max = y[x == u][0] - - # try to get the y values for x=u and x=u+1 - # if this is not possible, set min and max to y values for x=u - try: - min = y[x == u][0] - max = y[x == u + 1][0] - except: - min = y[x == u][0] - max = y[x == u][0] - - # if the min and max values are the same, - # fill the sub array with the value - if min == max: - sub = np.full((len(idx)), min) - arr.append(sub) - - # if min and max are different, - # create a sub array using linear interpolation - else: - sub = np.linspace(min, max, len(idx)) - arr.append(sub) - - # concatenate all the sub arrays into a single array and return - return np.concatenate(arr, axis=None).ravel() - - -# Bin data along vertical and horizontal scales -def horizontal_vertical_bin_dataset(dataset, lat_res, vertical_res): - """Bin data along vertical and horizontal scales - for later segmentation""" - - # Filter values within the range (-50, 10), because photons elevation outside this range will be real noise - valid_range = (-50, 10) - valid_mask = (dataset["photon_height"] > valid_range[0]) & ( - dataset["photon_height"] < valid_range[1] - ) - - # Apply the valid_mask to filter unwanted values - filtered_dataset = dataset[valid_mask] - - # Calculate the number of height bins - height_range = abs( - filtered_dataset["photon_height"].max() - - filtered_dataset["photon_height"].min() - ) - height_bin_number = max( - 1, round(height_range / vertical_res) - ) # Ensure at least one bin - - # Calculate the number of latitude bins - lat_range = abs(filtered_dataset["lat"].max() - filtered_dataset["lat"].min()) - lat_bin_number = max(1, round(lat_range / lat_res)) # Ensure at least one bin - - # Create bins for latitude - lat_bins = pd.cut( - filtered_dataset["lat"], bins=lat_bin_number, labels=np.arange(lat_bin_number) - ) - - # Create bins for height - height_bins = pd.cut( - filtered_dataset["photon_height"], - bins=height_bin_number, - labels=np.round( - 
np.linspace( - filtered_dataset["photon_height"].min(), - filtered_dataset["photon_height"].max(), - num=height_bin_number, - ), - decimals=1, - ), - ) - - # Add bins to dataframe using .loc to avoid SettingWithCopyWarning - filtered_dataset.loc[:, "lat_bins"] = lat_bins - filtered_dataset.loc[:, "height_bins"] = height_bins - filtered_dataset = filtered_dataset.reset_index(drop=True) - - return filtered_dataset - - -# thinking about grid searching to detect bathymetric directly -# rather than bin and then search -def horizontal_vertical_grid_density_cal( - dataset, lat_res, vertical_res, density_threshold -): - """Bin data along vertical and horizontal scales - and calculate high-density points using a grid method""" - - # Calculate the number of bins required both vertically - lat_bin_number = round(abs(dataset["lat"].min() - dataset["lat"].max()) / lat_res) - # and horizontally based on resolution size - height_bin_number = round( - abs(dataset["photon_height"].min() - dataset["photon_height"].max()) - / vertical_res - ) - - # Create the grid - grid = np.zeros((lat_bin_number, height_bin_number)) - - # Iterate over the dataset and assign points to cells - for _, row in dataset.iterrows(): - lat_index = int((row["lat"] - dataset["lat"].min()) / lat_res) - height_index = int( - (row["photon_height"] - dataset["photon_height"].min()) / vertical_res - ) - grid[lat_index, height_index] += 1 - - # Identify high-density cells - high_density_cells = np.argwhere(grid > density_threshold) - - # Create a copy of the dataset - dataset_copy = dataset.copy() - - # Assign the cell indices as bins to the dataset - dataset_copy["lat_bins"] = pd.cut( - dataset["lat"], bins=lat_bin_number, labels=np.arange(lat_bin_number) - ) - dataset_copy["height_bins"] = pd.cut( - dataset["photon_height"], - bins=height_bin_number, - labels=np.arange(height_bin_number), - ) - - # Reset the index of the copied dataset - dataset_copy = dataset_copy.reset_index(drop=True) - - return dataset_copy, 
high_density_cells - - -# Bin data along horizontal scale only -def horizontal_bin_dataset(dataset, lat_res): - """Bin data along the horizontal scale (lat) only - for later segmentation""" - - # Calculate the number of bins required horizontally based on resolution size - lat_bin_number = round( - abs(dataset["lat_utm"].min() - dataset["lat_utm"].max()) / lat_res - ) - - # Cut lat bins - # lat_bins = pd.cut(dataset['lat'], bins=lat_bin_number, labels=np.array(range(lat_bin_number))) - lat_bins = pd.cut( - dataset["lat_utm"], bins=lat_bin_number, labels=np.array(range(lat_bin_number)) - ) - - # Create a copy of the dataset - dataset_copy = dataset.copy() - - # Add lat bins to the dataframe - dataset_copy["lat_bins"] = lat_bins - - # Reset the index of the copied dataset - dataset_copy = dataset_copy.reset_index(drop=True) - - return dataset_copy - - -# Bin data first, and then throw away only the surface bin -def get_rm_sea_surface_bin(binned_dataset): - """Calculate mean sea height for easier calculation of depth and cleaner - figures""" - - # set flag for the df save - flag = 1 - - # group dataset by lat bins - grouped_data = binned_dataset.groupby(["lat_bins"], group_keys=True) - data_groups = dict(list(grouped_data)) - - # Loop through groups and return average sea height - for k, v in data_groups.items(): - lat_bin_average = v["lat"].mean() - - # Create new dataframe based on occurrence of photons per height bin - new_df = pd.DataFrame(v.groupby("height_bins", observed=False).count()) - - # Return the bin with the highest count - largest_h_bin = new_df["lat"].argmax() - - # Select the index of the bin with the highest count - largest_h_index = new_df.index[largest_h_bin] - - # get all values below this bin - # Use boolean indexing to select only the values below the peak bin - new_photon_array_without_peak_bin = v.loc[v["height_bins"] < largest_h_index] - - if flag == 1: - photon_array_without_peak_bin = new_photon_array_without_peak_bin - flag = 2 - - else: - 
photon_array_without_peak_bin = photon_array_without_peak_bin.append( - new_photon_array_without_peak_bin - ) - - del new_df - - return photon_array_without_peak_bin - - -def get_sea_surface_height(binned_data, threshold): - """Calculate mean sea height for easier calculation of depth and cleaner figures""" - - # set flag for the df save - firstTimeIndex = True - - # Create sea height list - sea_surface_height = [] - mean_lat_bins_seq = [] - sea_surface_subsurface_photons_ratio = [] - - # Group dataset by latitude bins - grouped_data = binned_data.groupby(["lat_bins"], group_keys=True, observed=False) - data_groups = dict(list(grouped_data)) - - # Loop through groups and return average sea height - for k, v in data_groups.items(): - # based on lat_utm - lat_bin_average = v["lat"].mean() - - # Create new dataframe based on occurrence of photons per height bin - new_df = pd.DataFrame(v.groupby(["height_bins"], observed=False).count()) - - # Return the bin with the highest count - largest_h_bin = new_df["lat"].argmax() - - # Select the index of the bin with the highest count - largest_h_index = new_df.index[largest_h_bin] - - # Calculate the median value of all photon height values within this bin - photons_sea_surface = v.loc[ - v["height_bins"] == largest_h_index, "photon_height" - ] - lat_bin_sea_median = photons_sea_surface.median() - - # Append to sea height list - sea_surface_height.append(lat_bin_sea_median) - mean_lat_bins_seq.append(lat_bin_average) - del new_df - - # Get all photons below sea surface - # to determine segment type of each subsurface water column - # Use calculated sea height to determine photons at 0.5m below peak - photons_sea_surface_up = v.loc[ - (v["photon_height"] > (lat_bin_sea_median - threshold)) - & (v["photon_height"] < (lat_bin_sea_median + 2 * threshold)) - ] - - # Calculate the photon ratio between surface and whole photons - if v["photon_height"].shape[0] > 0: - new_photons_ratio_sea_surface = ( - photons_sea_surface_up.shape[0] 
/ v["photon_height"].shape[0] - ) - else: - new_photons_ratio_sea_surface = np.nan - - sea_surface_subsurface_photons_ratio.append(1 - new_photons_ratio_sea_surface) - - # Filter out sea height bin values outside 2 SD of mean. - mean = np.nanmean(sea_surface_height, axis=0) - sd = np.nanstd(sea_surface_height, axis=0) - - final_sea_surface_height = np.where( - (sea_surface_height > (mean + 2 * sd)) | (sea_surface_height < (mean - 2 * sd)), - np.nan, - sea_surface_height, - ).tolist() - - sea_surface_height_abnormal_label = np.where( - np.isnan(final_sea_surface_height), 0, 1 - ) - - # Determine label based on ratio of sea surface photons and subsurface photons - sea_surface_dominated_label = np.where( - np.array(sea_surface_subsurface_photons_ratio) >= 0.2, 0, 1 - ) - - # Loop through groups again and return photons below 0.5m of sea height - PhotonDFBelowThresholdPeak = pd.DataFrame() - for i, (k, v) in enumerate(data_groups.items()): - # Get all values below this bin - NewPhotonDFBelowThresholdPeak = v.loc[ - v["photon_height"] < (final_sea_surface_height[i] - threshold) - ] - - if firstTimeIndex: - PhotonDFBelowThresholdPeak = NewPhotonDFBelowThresholdPeak - firstTimeIndex = False - else: - PhotonDFBelowThresholdPeak = pd.concat( - [PhotonDFBelowThresholdPeak, NewPhotonDFBelowThresholdPeak] - ) - - return ( - final_sea_surface_height, - sea_surface_height_abnormal_label, - sea_surface_dominated_label, - PhotonDFBelowThresholdPeak, - ) - - -# -# Arbitrary cutoff below the max value - 0.5 m below peak -# (we may also use 1 m but we can add that later) (apply to raw photon data, each of 6 beams) -def get_photon_below_sea_surface(binned_data, threshold): - """Calculate mean sea height for easier calculation of depth and cleaner figures""" - - # set flag for the df save - firstTimeIndex = 1 - - # Create sea height list - sea_surface_height = [] - # mean_lat_bins_seq=[] - - grouped_data = binned_data.groupby(["lat_bins"], group_keys=True, observed=False) - data_groups 
= dict(list(grouped_data)) - - # Loop through groups and return average sea height - for k, v in data_groups.items(): - lat_bin_average = v["lat_utm"].mean() - - # Create new dataframe based on occurance of photons per height bin - new_df = pd.DataFrame(v.groupby(["height_bins"], observed=False).count()) - - # Return the bin with the highest count - largest_h_bin = new_df["lat_utm"].argmax() - - # Select the index of the bin with the highest count - largest_h = new_df.index[largest_h_bin] - - # Calculate the median value of all values within this bin - lat_bin_sea_median = v.loc[ - v["height_bins"] == largest_h, "photon_height" - ].median() - - # Append to sea height list - sea_surface_height.append(lat_bin_sea_median) - # mean_lat_bins_seq.append(lat_bin_average) - del new_df - - # Filter out sea height bin values outside 2 SD of mean. - mean = np.nanmean(sea_surface_height, axis=0) - sd = np.nanstd(sea_surface_height, axis=0) - Final_sea_height = np.where( - (sea_surface_height > (mean + 2 * sd)) | (sea_surface_height < (mean - 2 * sd)), - np.nan, - sea_surface_height, - ).tolist() - Abnormal_sea_height_label = np.where(np.isnan(Final_sea_height), 1, 0) - - # Loop through groups again and return photons below 0.5m of sea height - for k, v in data_groups.items(): - # get all values below this bin - # Use calculated sea height to determine photons at 0.5m below peak - NewPhotonArrayBelowThresholdPeak = v.loc[ - v["photon_height"] < (sea_surface_height[k] - threshold) - ] - - if firstTimeIndex == 1: - PhotonArrayBelowThresholdPeak = NewPhotonArrayBelowThresholdPeak - firstTimeIndex = 2 - - else: - PhotonArrayBelowThresholdPeak = PhotonArrayBelowThresholdPeak.append( - NewPhotonArrayBelowThresholdPeak - ) - - return PhotonArrayBelowThresholdPeak - - -# Function to get elevation for multiple points -def get_seafloor_bathy_GEBCO_batch(lons, lats, raster, raster_data): - # Convert geographic coordinates to the raster's coordinate system - rows, cols = raster.index(lons, 
lats) - rows, cols = np.array(rows), np.array(cols) - - # Ensure the indices are within bounds - valid_mask = ( - (rows >= 0) & (rows < raster.height) & (cols >= 0) & (cols < raster.width) - ) - elevations = np.full(lons.shape, np.nan) - elevations[valid_mask] = raster_data[rows[valid_mask], cols[valid_mask]] - - return elevations - - -def get_water_temp(date_year, date_month, date_day, latitude, longitude): - """ - Pull down surface water temperature along the track from the JPL GHRSST opendap website. - - The GHRSST data are gridded tiles with dimension 17998 x 35999. - To get the specific grid tile of the SST, you must convert from lat, lon coordinates - to the gridded tile ratio of the SST data product using the coordinates of the IS2 data. - """ - # Get date from data filename - # data_path[-33:-25] - date = date_year + date_month + date_day - # date[0:4] - year = date_year - # date[4:6] - month = date_month - # date[6:8] - day = date_day - day_of_year = str(datetime.strptime(date, "%Y%m%d").timetuple().tm_yday) - # Add zero in front of day of year string - zero_day_of_year = day_of_year.zfill(3) - - # Calculate ratio of latitude from mid-point of IS2 track - old_lat = latitude.mean() - old_lat_min = -90 - old_lat_max = 90 - new_lat_min = 0 - new_lat_max = 17998 - - new_lat = round( - ((old_lat - old_lat_min) / (old_lat_max - old_lat_min)) - * (new_lat_max - new_lat_min) - + new_lat_min - ) - - # Calculate ratio of longitude from mid-point of IS2 track - old_lon = longitude.mean() - old_lon_min = -180 - old_lon_max = 180 - new_lon_min = 0 - new_lon_max = 35999 - - new_lon = round( - ((old_lon - old_lon_min) / (old_lon_max - old_lon_min)) - * (new_lon_max - new_lon_min) - + new_lon_min - ) - - # Access the SST data using the JPL OpenDap interface - url = ( - "https://opendap.jpl.nasa.gov/opendap/OceanTemperature/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/" - + str(year) - + "/" - + str(zero_day_of_year) - + "/" - + str(date) - + 
"090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc" - ) - - dataset = netCDF4.Dataset(url) - - # Access the data and convert the temperature from K to C - water_temp = dataset["analysed_sst"][0, new_lat, new_lon] - 273.15 - return water_temp - - -def refraction_correction( - WTemp, - WSmodel, - Wavelength, - Photon_ref_elev, - Ph_ref_azimuth, - PhotonZ, - PhotonX, - PhotonY, - Ph_Conf, -): - """ - WTemp; there is python library that pulls water temp data - WSmodel is the value surface height - Wavelength is fixed - """ - - # Only process photons below water surface model - PhotonX = PhotonX[PhotonZ <= WSmodel] - PhotonY = PhotonY[PhotonZ <= WSmodel] - Photon_ref_elev = Photon_ref_elev[PhotonZ <= WSmodel] - Ph_ref_azimuth = Ph_ref_azimuth[PhotonZ <= WSmodel] - Ph_Conf = Ph_Conf[PhotonZ <= WSmodel] - PhotonZ = PhotonZ[PhotonZ <= WSmodel] - - # water temp for refraction correction - WaterTemp = WTemp - - # Refraction coefficient # - a = -0.000001501562500 - b = 0.000000107084865 - c = -0.000042759374989 - d = -0.000160475520686 - e = 1.398067112092424 - wl = Wavelength - - # refractive index of air - n1 = 1.00029 - - # refractive index of water - n2 = (a * WaterTemp**2) + (b * wl**2) + (c * WaterTemp) + (d * wl) + e - - # assumption is 0.25416 - # This example is refractionCoef = 0.25449 - # 1.00029 is refraction of air constant - correction_coef = 1 - (n1 / n2) - - # read photon ref_elev to get theta1 - theta1 = np.pi / 2 - Photon_ref_elev - - # eq 1. Theta2 - theta2 = np.arcsin((n1 * np.sin(theta1)) / n2) - - # eq 3. S - # Approximate water Surface = 1.5 - # D = raw uncorrected depth - D = WSmodel - PhotonZ - - # For Triangle DTS - S = D / np.cos(theta1) - - # eq 2. 
R - R = (S * n1) / n2 - Gamma = (np.pi / 2) - theta1 - - # For triangle RPS - # phi is an angle needed - phi = theta1 - theta2 - - # P is the difference between raw and corrected YZ location - P = np.sqrt(R**2 + S**2 - 2 * R * S * np.cos(phi)) - - # alpha is an angle needed - alpha = np.arcsin((R * np.sin(phi)) / P) - - # Beta angle needed for Delta Y an d Delta Z - Beta = Gamma - alpha - - # Delta Y - DY = P * np.cos(Beta) - - # Delta Z - DZ = P * np.sin(Beta) - - # Delta Easting - DE = DY * np.sin(Ph_ref_azimuth) - - # Delta Northing - DN = DY * np.cos(Ph_ref_azimuth) - - outX = PhotonX + DE - outY = PhotonY + DN - outZ = PhotonZ + DZ - - """ - print('For selected Bathy photon:') - print('lat = ', PhotonY[9000]) - print('long = ', PhotonX[9000]) - print('Raw Depth = ', PhotonZ[9000]) - print('D = ', D[9000]) - - print('ref_elev = ', Photon_ref_elev[9000]) - - print('Delta East = ', DE[9000]) - print('Delta North = ', DN[9000]) - print('Delta Z = ', DZ[9000]) - """ - return ( - outX, - outY, - outZ, - Ph_Conf, - PhotonX, - PhotonY, - PhotonZ, - Ph_ref_azimuth, - Photon_ref_elev, - ) # We are most interested in out-x, out-y, out-z - - -def get_bath_height_percentile_thresh( - binned_data, percentile_thresh, sea_surface_height, vertical_res -): - """Detect bathymetric level per bin based on percentile_thresh""" - # Create sea height list - bath_height = [] - - geo_photon_height = [] - geo_longitude = [] - geo_latitude = [] - - # Group data by latitude - # Filter out surface data that are two bins below median surface value calculated above - binned_data_bath = binned_data[ - (binned_data["photon_height"] < sea_surface_height - (vertical_res * 2)) - ] - grouped_data = binned_data_bath.groupby(["lat_bins"], group_keys=True) - data_groups = dict(list(grouped_data)) - - # Create a percentile threshold of photon counts in each grid, - # grouped by both x and y axes. 
- count_threshold = np.percentile( - binned_data.groupby(["lat_bins", "height_bins"]) - .size() - .reset_index() - .groupby("lat_bins")[[0]] - .max(), - percentile_thresh, - ) - - # Loop through groups and return average bathy height - for k, v in data_groups.items(): - new_df = pd.DataFrame(v.groupby("height_bins").count()) - bath_bin = new_df["lat"].argmax() - bath_bin_h = new_df.index[bath_bin] - - # Set threshold of photon counts per bin - # here this script determines whether there is bathymetry signals by - # the photon counts per bin below sea surface height - if new_df.iloc[bath_bin]["lat"] >= count_threshold: - geo_photon_height.append( - v.loc[v["height_bins"] == bath_bin_h, "cor_photon_height"].values - ) - geo_longitude.append(v.loc[v["height_bins"] == bath_bin_h, "lon"].values) - geo_latitude.append(v.loc[v["height_bins"] == bath_bin_h, "lat"].values) - - bath_bin_median = v.loc[ - v["height_bins"] == bath_bin_h, "cor_photon_height" - ].median() - bath_height.append(bath_bin_median) - del new_df - - else: - bath_height.append(np.nan) - del new_df - - geo_longitude_list = np.concatenate(geo_longitude).ravel().tolist() - geo_latitude_list = np.concatenate(geo_latitude).ravel().tolist() - geo_photon_list = np.concatenate(geo_photon_height).ravel().tolist() - geo_depth = sea_surface_height - geo_photon_list - geo_df = pd.DataFrame( - { - "lon": geo_longitude_list, - "lat": geo_latitude_list, - "photon_height": geo_photon_list, - "depth": geo_depth, - } - ) - - del geo_longitude_list, geo_latitude_list, geo_photon_list - - return bath_height, geo_df - - -def get_bath_height_HDBSCAN( - lat_binned_data, percentile_thresh, sea_surface_height, vertical_res -): - """Calculate bathymetric level per lat bin based on horizontal resolution""" - # Create sea height list - bath_height = [] - geo_photon_height = [] - geo_longitude = [] - geo_latitude = [] - - # Group data by latitude - # Filter out surface data that are two bins (2 times height resolution) - # below 
median sea surface value calculated above - binned_data_bath = lat_binned_data[ - (lat_binned_data["photon_height"] < sea_surface_height - (vertical_res * 2)) - ] - - grouped_data = binned_data_bath.groupby(["lat_bins"], group_keys=True) - data_groups = dict(list(grouped_data)) - - # Loop through groups and return average bathymetric height - for k, v in data_groups.items(): - # assign each group of dataset to a new dataframe - new_df = pd.DataFrame(v) - - # Check if the DataFrame is empty - if new_df.empty: - # If the DataFrame is empty, append null values to the lists and skip to the next iteration - bath_height.append(np.nan) - geo_photon_height.append([]) - geo_longitude.append([]) - geo_latitude.append([]) - del new_df - continue - - lat_height_pairs = list( - zip(new_df["lat_utm"], new_df["cor_photon_height"], strict=False) - ) - - # Convert to a numpy array for sklearn - lat_height_pairs_array = np.array(lat_height_pairs) - - # Perform HDBSCAN clustering on the photons below sea surface for each lat bin - # scaler = StandardScaler() - scaler = MinMaxScaler() - data_scaled = scaler.fit_transform(lat_height_pairs_array) - - min_cluster_size = 4 - - # Check the number of data points is greater than the initial min_cluster_size - if len(data_scaled) < min_cluster_size: - min_cluster_size = len(data_scaled) // 2 - if min_cluster_size < 2: - bath_height.append(np.nan) - geo_photon_height.append([]) - geo_longitude.append([]) - geo_latitude.append([]) - del new_df - continue - - # cluster = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, leaf_size=20) - cluster = hdbscan.HDBSCAN( - min_cluster_size=min_cluster_size, - min_samples=3, - cluster_selection_epsilon=20, - leaf_size=25, - core_dist_n_jobs=-1, - ) - cluster.fit(data_scaled) - - # Get the labels assigned to each point by the HDBSCAN model - labels = cluster.labels_ - - # Count the number of points assigned to each cluster - unique, counts = np.unique(labels, return_counts=True) - - # Create a dictionary 
that maps each cluster label to the count of points assigned to it - clusters_dict = dict(zip(unique, counts, strict=False)) - - # Print the dictionary to see the size of each cluster - print(clusters_dict) - - # Identify the label of the largest cluster - max_cluster_label = max(clusters_dict, key=clusters_dict.get) - - # Check if no cluster is found - if max_cluster_label == -1: - bath_height.append(np.nan) - else: - # Add labels to the DataFrame - new_df_copy = new_df.copy() - new_df_copy["cluster"] = labels - - # Subset the DataFrame to get only the data points in the largest cluster - bath_cluster_data = new_df[new_df_copy["cluster"] == max_cluster_label] - - # calculate the median height value as the average bathymetric height - bath_bin_median = bath_cluster_data["cor_photon_height"].median() - bath_height.append(bath_bin_median) - - # Extract the longitude, latitude, and photon height for each lat bin - # and add it to respective lists - geo_photon_height.append(bath_cluster_data["cor_photon_height"].values) - geo_longitude.append(bath_cluster_data["lon_utm"].values) - geo_latitude.append(bath_cluster_data["lat_utm"].values) - - del new_df - - # Convert the lists to a single list - geo_longitude_list = np.concatenate(geo_longitude).ravel().tolist() - geo_latitude_list = np.concatenate(geo_latitude).ravel().tolist() - geo_photon_list = np.concatenate(geo_photon_height).ravel().tolist() - - # Calculate depth - geo_depth = sea_surface_height - np.array(geo_photon_list) - - # Create a DataFrame - geo_df = pd.DataFrame( - { - "lon": geo_longitude_list, - "lat": geo_latitude_list, - "photon_height": geo_photon_list, - "depth": geo_depth.tolist(), - } - ) - - return bath_height, geo_df - - -# requires that the input gdf has ranged index values i -# will need to change if index is changed to time or something -# this currently checks point in polygon for EVERY point -# would be significantly sped up if evaluated at 10m or something similar -# maybe later, fine for 
now -def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): - # try loading the shoreline data - try: - ICESat2_GDF.insert(0, "lat", ICESat2_GDF.geometry.y, False) - ICESat2_GDF.insert(0, "lon", ICESat2_GDF.geometry.x, False) - - # allocation of to be used arrays - zero_int_array = np.int64(np.zeros_like(ICESat2_GDF.geometry.x)) - - # Land flag initialized as -1 - # If shorelines downloaded already, will be set to 0 or 1 - ICESat2_GDF.insert(0, "is_land", zero_int_array - 1, False) - - # set the projection - ICESat2_GDF.set_crs("EPSG:4326", inplace=True) - - # load shoreline dataset to include only the features that intersect the bounding box - # bbox can be GeoDataFrame or GeoSeries | shapely Geometry, default None - # Filter features by given bounding box, GeoSeries, GeoDataFrame or a shapely geometry. - # engine str, 'fiona' or 'pyogrio' - # somtime it gives error if using fiona - # land_polygon_gdf = gpd.read_file(shoreline_data_path, bbox=ICESat2_GDF, engine='fiona') - land_polygon_gdf = gpd.read_file( - shoreline_data_path, bbox=ICESat2_GDF, engine="pyogrio" - ) - - # continue with getting a new array of 0-or-1 labels for each photon - land_point_labels = np.zeros_like(ICESat2_GDF.is_land.values) - - # update labels for points in the land polygons - pts_in_land = gpd.sjoin(ICESat2_GDF, land_polygon_gdf, predicate="within") - - # get land or not bool value - land_loc = ICESat2_GDF.index.isin(pts_in_land.index) - - # asigned them to new numpy array - land_point_labels[land_loc] = 1 - land_point_labels[~land_loc] = 0 - - return land_point_labels - - except Exception as e: - print(e) - - print("Error loading shoreline data, returning -1s for is_land flag") - - # if the shoreline data is not available - # return the original label array - - return -np.ones_like(ICESat2_GDF.is_land.values) - - -def produce_figures( - binned_data, - bath_height, - sea_height, - solo_sea_surface_label, - y_limit_top, - y_limit_bottom, - percentile, - file, - geo_df, - ref_y, 
- ref_z, - beam, - epsg_num, -): - """Create figures""" - - # Create bins for latitude - bath_x_axis_bins = ( - np.linspace(binned_data.lat.min(), binned_data.lat.max(), len(bath_height)) + 20 - ) - - sea_surface_x_axis_bins = ( - np.linspace(binned_data.lat.min(), binned_data.lat.max(), len(sea_height)) + 10 - ) - - # Create new dataframes for median values - bath_median_df = pd.DataFrame({"x": bath_x_axis_bins, "y": bath_height}) - - # Create uniform sea surface based on median sea surface values and filter out surface breaching - sea_height1 = [np.nanmedian(sea_height) if i == i else np.nan for i in sea_height] - sea_median_df = pd.DataFrame({"x": sea_surface_x_axis_bins, "y": sea_height1}) - - # Create uniform solo sea surface label - sea_surface_label = solo_sea_surface_label - sea_surface_label_df = pd.DataFrame( - {"x": sea_surface_x_axis_bins, "y": sea_surface_label} - ) - idx_1 = np.where(sea_surface_label_df.y == 1) - idx_0 = np.where(sea_surface_label_df.y == 0) - - # Define figure size - fig = plt.rcParams["figure.figsize"] = (40, 25) - - # Plot raw points - # plt.scatter(x=binned_data.lat, - # y = binned_data.photon_height, marker='o', lw=0, s=1, alpha = 0.8, - # c = 'yellow', label = 'Raw photon height') - plt.scatter(ref_y, ref_z, s=0.5, alpha=0.1, c="black") - plt.scatter( - geo_df.lat, - geo_df.photon_height, - s=0.8, - marker="o", - alpha=0.1, - c="red", - label="Classified Photons", - ) - - # plt.scatter(x=geo_df.lat, - # y = geo_df.photon_height, marker='o', lw=0, s=0.8, - # alpha = 0.8, c = 'black', label = 'Corrected photon bin') - - # Plot median values - plt.scatter( - bath_median_df.x, - bath_median_df.y, - marker="o", - c="r", - alpha=0.8, - s=2, - label="Median bathymetry", - ) - - plt.scatter( - sea_median_df.x, - sea_median_df.y, - marker="o", - c="b", - alpha=1, - s=2, - label="Median sea surface", - ) - - plt.scatter( - sea_surface_label_df.iloc[idx_1].x, - sea_surface_label_df.iloc[idx_1].y, - marker="o", - c="pink", - alpha=1, - 
s=3, - label="solo_sea_surface", - ) - plt.scatter( - sea_surface_label_df.iloc[idx_0].x, - sea_surface_label_df.iloc[idx_0].y, - marker="o", - c="g", - alpha=1, - s=3, - label="non_solo_sea_surface", - ) - - # Insert titles and subtitles - plt.title("Icesat2 Bathymetry\n" + file) - plt.xlabel("Latitude", fontsize=25) - plt.ylabel("Photon Height (m)", fontsize=25) - plt.xticks(fontsize=16) - plt.yticks(fontsize=16) - - plt.legend(loc="upper left", prop={"size": 20}) - - # Limit the x and y axes using parameters - plt.xlim(left=binned_data.lat.min(), right=binned_data.lat.max()) - plt.ylim(top=y_limit_top, bottom=y_limit_bottom) - - timestr = time.strftime("%Y%m%d_%H%M%S") - file = file.replace(".h5", "") - # Define where to save file - plt.tight_layout() - plt.savefig( - "C:/Workstation/ICESat2_HLS/" - + file - + "_gt" - + str(beam) - + "_" - + str(percentile) - + "_EPSG" - + str(epsg_num) - + "_" - + timestr - + ".pdf" - ) - # plt.show() - # plt.close() - - # convert corrected locations back to wgs84 (useful to contain) - transformer = Transformer.from_crs( - "EPSG:" + str(epsg_num), "EPSG:4326", always_xy=True - ) - print(transformer) - lon_wgs84, lat_wgs84 = transformer.transform(geo_df.lon.values, geo_df.lat.values) - - geo_df["lon_wgs84"] = lon_wgs84 - geo_df["lat_wgs84"] = lat_wgs84 - - geodf = gpd.GeoDataFrame( - geo_df, geometry=gpd.points_from_xy(geo_df.lon_wgs84, geo_df.lat_wgs84) - ) - - geodf.set_crs(epsg=4326, inplace=True) - - # geodf.to_file("C:/Workstation/ICESat2_HLS/" +file + '_gt' + '_' + str(percentile) + '_EPSG' + - # str(epsg_num) + '_' + timestr + ".gpkg", - # driver="GPKG") diff --git a/aok/core/kd_utils/sliderule_adapter.py b/aok/core/kd_utils/sliderule_adapter.py deleted file mode 100644 index a706b0e..0000000 --- a/aok/core/kd_utils/sliderule_adapter.py +++ /dev/null @@ -1,222 +0,0 @@ -import logging - -import numpy as np -import pandas as pd -from pyproj import Geod, Proj - -from kd_utils.data_processing import convert_wgs_to_utm - 
-logger = logging.getLogger(__name__) - - -def _get_beam_id_from_track_pair(df: pd.DataFrame) -> pd.Series: - if "gt" in df.columns: - return df["gt"].astype(str) - if "track" in df.columns and "pair" in df.columns: - suffix = df["pair"].map({0: "l", 1: "r"}).fillna("x") - return "gt" + df["track"].astype(int).astype(str) + suffix - return pd.Series(["unknown"] * len(df), index=df.index, dtype="object") - - -def _infer_strong_beam_suffix(df: pd.DataFrame) -> str | None: - if "sc_orient" not in df.columns or df.empty: - return None - orient = df["sc_orient"].dropna() - if orient.empty: - return None - # Same logic as your Florida script. - return "l" if int(orient.iloc[0]) == 0 else "r" - - -def _normalize_signal_conf(values: pd.Series) -> pd.Series: - def to_scalar(v): - if isinstance(v, (list, tuple, np.ndarray)): - if len(v) == 0: - return np.nan - return float(v[0]) - try: - return float(v) - except Exception: - return np.nan - - return values.apply(to_scalar) - - -def _relative_distance_km_per_beam(df: pd.DataFrame) -> pd.Series: - out = pd.Series(np.nan, index=df.index, dtype=float) - for beam_id, beam_df in df.groupby("beam_id"): - beam_df = beam_df.sort_values("delta_time") - if "segment_dist" in beam_df.columns: - rel_km = (beam_df["segment_dist"] - beam_df["segment_dist"].min()) / 1000.0 - out.loc[beam_df.index] = rel_km.values - continue - - lons = beam_df["longitude"].to_numpy(dtype=float) - lats = beam_df["latitude"].to_numpy(dtype=float) - if len(lons) == 0: - continue - geod = Geod(ellps="WGS84") - cumulative_m = np.zeros(len(lons), dtype=float) - for i in range(1, len(lons)): - _, _, dist_m = geod.inv(lons[i - 1], lats[i - 1], lons[i], lats[i]) - cumulative_m[i] = cumulative_m[i - 1] + max(dist_m, 0.0) - out.loc[beam_df.index] = cumulative_m / 1000.0 - return out - - -def _project_to_utm(df: pd.DataFrame) -> pd.DataFrame: - projected = df.copy() - projected["lon"] = np.nan - projected["lat"] = np.nan - for beam_id, beam_df in 
projected.groupby("beam_id"): - if beam_df.empty: - continue - lon0 = float(beam_df["longitude"].iloc[0]) - lat0 = float(beam_df["latitude"].iloc[0]) - epsg = convert_wgs_to_utm(lon0, lat0) - proj = Proj(epsg) - x, y = proj( - beam_df["longitude"].to_numpy(dtype=float), - beam_df["latitude"].to_numpy(dtype=float), - ) - projected.loc[beam_df.index, "lon"] = x - projected.loc[beam_df.index, "lat"] = y - return projected - - -def sliderule_to_framework_dataset( - atl03_gdf, - strong_beams_only: bool = True, -) -> pd.DataFrame: - """ - Convert SlideRule ATL03 photon GeoDataFrame to the framework's sea_photon_dataset schema. - """ - if atl03_gdf is None or len(atl03_gdf) == 0: - return pd.DataFrame( - columns=[ - "beam_id", - "latitude", - "longitude", - "lat", - "lon", - "photon_height", - "quality_ph", - "photon_conf", - "ref_elevation", - "ref_azimuth", - "relative_AT_dist", - "solar_elevation", - "background_rate", - "is_land_label", - ] - ) - - src = pd.DataFrame(atl03_gdf).copy() - src["beam_id"] = _get_beam_id_from_track_pair(src) - if strong_beams_only: - strong_suffix = _infer_strong_beam_suffix(src) - if strong_suffix in ("l", "r"): - src = src[src["beam_id"].str.endswith(strong_suffix)] - - out = pd.DataFrame( - { - "beam_id": src["beam_id"].astype(str), - "latitude": pd.to_numeric(src.get("lat_ph", np.nan), errors="coerce"), - "longitude": pd.to_numeric(src.get("lon_ph", np.nan), errors="coerce"), - "photon_height": pd.to_numeric(src.get("h_ph", np.nan), errors="coerce"), - "quality_ph": pd.to_numeric(src.get("quality_ph", np.nan), errors="coerce"), - "ref_elevation": pd.to_numeric( - src.get("ref_elev", np.nan), errors="coerce" - ), - "ref_azimuth": pd.to_numeric( - src.get("ref_azimuth", np.nan), errors="coerce" - ), - "solar_elevation": pd.to_numeric( - src.get("solar_elevation", np.nan), errors="coerce" - ), - "background_rate": pd.to_numeric( - src.get("bckgrd_rate", np.nan), errors="coerce" - ), - "delta_time": pd.to_numeric(src.get("delta_time", 
np.nan), errors="coerce"), - "segment_dist": pd.to_numeric( - src.get("segment_dist", np.nan), errors="coerce" - ), - } - ) - - if "signal_conf_ph" in src.columns: - out["photon_conf"] = _normalize_signal_conf(src["signal_conf_ph"]) - else: - out["photon_conf"] = np.nan - - out = out.dropna(subset=["latitude", "longitude", "photon_height"]).copy() - out = _project_to_utm(out) - out["relative_AT_dist"] = _relative_distance_km_per_beam(out) - out["is_land_label"] = 0 - - return out.drop(columns=["delta_time", "segment_dist"], errors="ignore") - - -def fetch_sliderule_atl03( - region: list[dict[str, float]], - t0: str, - t1: str, - rgt: int | None = None, - ph_fields: list[str] | None = None, - sliderule_url: str = "slideruleearth.io", - verbose: bool = False, -): - """ - Fetch ATL03 photons from SlideRule. - """ - try: - from sliderule import icesat2, sliderule - except Exception as e: - raise ImportError( - "sliderule package is required for SlideRule ingestion." - ) from e - - sliderule.init(sliderule_url, verbose=verbose) - fields = ph_fields or [ - "delta_time", - "lat_ph", - "lon_ph", - "h_ph", - "quality_ph", - "signal_conf_ph", - "segment_dist", - "ref_elev", - "ref_azimuth", - "solar_elevation", - "bckgrd_rate", - ] - params = {"poly": region, "t0": t0, "t1": t1, "atl03_ph_fields": fields} - if rgt is not None: - params["rgt"] = int(rgt) - return icesat2.atl03sp(params) - - -def fetch_and_prepare_sliderule_dataset( - region: list[dict[str, float]], - t0: str, - t1: str, - rgt: int | None = None, - strong_beams_only: bool = True, - sliderule_url: str = "slideruleearth.io", - verbose: bool = False, -) -> pd.DataFrame: - """ - One-call helper: - fetch ATL03 photons from SlideRule and convert to framework dataset schema. 
- """ - atl03_gdf = fetch_sliderule_atl03( - region=region, - t0=t0, - t1=t1, - rgt=rgt, - sliderule_url=sliderule_url, - verbose=verbose, - ) - return sliderule_to_framework_dataset( - atl03_gdf, strong_beams_only=strong_beams_only - ) diff --git a/aok/core/kd_utils/main.py b/aok/core/main.py similarity index 100% rename from aok/core/kd_utils/main.py rename to aok/core/main.py From 87d726154ac4b859643c22df4dd896bbbdf5bbe8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 30 Mar 2026 20:12:24 +0000 Subject: [PATCH 09/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- aok/core/main.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/aok/core/main.py b/aok/core/main.py index 51e39de..9168ef0 100644 --- a/aok/core/main.py +++ b/aok/core/main.py @@ -5,8 +5,6 @@ import sys from config import get_args -import pandas as pd - from kd_utils.bathy_processing import process_subsurface_photon_filtering from kd_utils.data_processing import ( Extract_sea_photons, @@ -22,6 +20,7 @@ plot_kd_photons, plot_photon_quality_flags, ) +import pandas as pd logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" From 359b6f5f91f0aad9842fed1e51d0d09bc51236db Mon Sep 17 00:00:00 2001 From: Jessica Scheick Date: Tue, 31 Mar 2026 15:32:06 -0400 Subject: [PATCH 10/11] fix codespell errors --- aok/core/config.py | 2 +- aok/core/kd_utils/data_processing.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/aok/core/config.py b/aok/core/config.py index a67825a..6fa054c 100644 --- a/aok/core/config.py +++ b/aok/core/config.py @@ -448,7 +448,7 @@ def get_args(): # night vs daytime - # solar elevation detemine remove or not for solar background + # solar elevation determine remove or not for solar background # 1-2 hours diff --git a/aok/core/kd_utils/data_processing.py 
b/aok/core/kd_utils/data_processing.py index 8860c99..7f74001 100644 --- a/aok/core/kd_utils/data_processing.py +++ b/aok/core/kd_utils/data_processing.py @@ -343,7 +343,7 @@ def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): # bbox can be GeoDataFrame or GeoSeries | shapely Geometry, default None # Filter features by given bounding box, GeoSeries, GeoDataFrame or a shapely geometry. # engine str, 'fiona' or 'pyogrio' - # somtime it gives error if using fiona + # sometime it gives error if using fiona # land_polygon_gdf = gpd.read_file(shoreline_data_path, bbox=ICESat2_GDF, engine='fiona') land_polygon_gdf = gpd.read_file( shoreline_data_path, bbox=ICESat2_GDF, engine="pyogrio" @@ -358,7 +358,7 @@ def isolate_sea_land_photons(shoreline_data_path, ICESat2_GDF): # get land or not bool value land_loc = ICESat2_GDF.index.isin(pts_in_land.index) - # asigned them to new numpy array + # assigned them to new numpy array land_point_labels[land_loc] = 1 land_point_labels[~land_loc] = 0 From d561d1b5d661ab7a515e6f619908d189d79603d0 Mon Sep 17 00:00:00 2001 From: hhollandmoritz Date: Tue, 31 Mar 2026 18:41:21 -0400 Subject: [PATCH 11/11] fixes two errors blocking pip install via pyproject.toml, dependencies in dynamic dependencies should be specified via the requirements.txt file; and fallback version of aok now set to 0.0.0 --- pyproject.toml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 22a7696..7bc1d22 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,9 +47,7 @@ py-modules = ["_aok_version"] ##### DEPENDENCIES ##### [tool.setuptools.dynamic] -dependencies = [ - "numpy", -] +dependencies = {file = ["requirements.txt"]} [project.optional-dependencies] dev = [ @@ -69,7 +67,7 @@ exclude = ["*tests"] version_file = "_aok_version.py" version_file_template = 'version = "{version}"' local_scheme = "node-and-date" -fallback_version = "unknown" +fallback_version = "0.0.0" ##### LINTING, FORMATTING, 
TYPING #####