From 7ac63a782f07162470c5a8a3acc7a244a69bef1c Mon Sep 17 00:00:00 2001 From: Kristin Cowalcijk Date: Tue, 10 Mar 2026 20:05:15 +0800 Subject: [PATCH] feat(raster): add GDAL raster support refactor(raster): align executor semantics with main fix(submodules): align pointers with main fix(submodules): restore gitmodules config Revert array and builder to main fix(sedona-gdal): restore missing files and methods after rebase Resolve conflict resolving problems --- Cargo.lock | 19 + c/sedona-gdal/src/dyn_load.rs | 4 + c/sedona-gdal/src/gdal.rs | 17 + c/sedona-gdal/src/gdal_dyn_bindgen.rs | 39 + c/sedona-gdal/src/vsi.rs | 87 + rust/sedona-raster-functions/src/crs_utils.rs | 28 + rust/sedona-raster-functions/src/lib.rs | 1 + rust/sedona-raster-functions/src/register.rs | 1 + rust/sedona-raster-functions/src/rs_count.rs | 338 ++++ rust/sedona-raster-gdal/Cargo.toml | 56 +- .../benches/bench_common.rs | 238 +++ .../benches/rs_as_geotiff.rs | 110 ++ rust/sedona-raster-gdal/benches/rs_clip.rs | 40 + .../benches/rs_from_gdal_raster.rs | 115 ++ .../benches/rs_from_path.rs | 49 + .../benches/rs_map_algebra.rs | 308 ++++ .../benches/rs_polygonize.rs | 72 + rust/sedona-raster-gdal/benches/rs_value.rs | 183 ++ .../benches/rs_zonal_stats.rs | 40 + rust/sedona-raster-gdal/src/lib.rs | 61 +- .../src/raster_band_reader.rs | 539 ++++++ rust/sedona-raster-gdal/src/rs_as_geotiff.rs | 519 ++++++ rust/sedona-raster-gdal/src/rs_as_raster.rs | 714 ++++++++ rust/sedona-raster-gdal/src/rs_clip.rs | 928 +++++++++++ .../src/rs_from_gdal_raster.rs | 254 +++ rust/sedona-raster-gdal/src/rs_from_path.rs | 447 +++++ .../src/rs_geotiff_tiles.rs | 621 +++++++ rust/sedona-raster-gdal/src/rs_map_algebra.rs | 796 +++++++++ rust/sedona-raster-gdal/src/rs_metadata.rs | 271 +++ rust/sedona-raster-gdal/src/rs_polygonize.rs | 366 ++++ rust/sedona-raster-gdal/src/rs_value.rs | 658 ++++++++ rust/sedona-raster-gdal/src/rs_zonal_stats.rs | 1475 +++++++++++++++++ rust/sedona-raster-gdal/src/utils.rs | 39 +- 
rust/sedona/src/context.rs | 10 + sedona-cli/Cargo.toml | 2 +- 35 files changed, 9415 insertions(+), 30 deletions(-) create mode 100644 rust/sedona-raster-functions/src/rs_count.rs create mode 100644 rust/sedona-raster-gdal/benches/bench_common.rs create mode 100644 rust/sedona-raster-gdal/benches/rs_as_geotiff.rs create mode 100644 rust/sedona-raster-gdal/benches/rs_clip.rs create mode 100644 rust/sedona-raster-gdal/benches/rs_from_gdal_raster.rs create mode 100644 rust/sedona-raster-gdal/benches/rs_from_path.rs create mode 100644 rust/sedona-raster-gdal/benches/rs_map_algebra.rs create mode 100644 rust/sedona-raster-gdal/benches/rs_polygonize.rs create mode 100644 rust/sedona-raster-gdal/benches/rs_value.rs create mode 100644 rust/sedona-raster-gdal/benches/rs_zonal_stats.rs create mode 100644 rust/sedona-raster-gdal/src/raster_band_reader.rs create mode 100644 rust/sedona-raster-gdal/src/rs_as_geotiff.rs create mode 100644 rust/sedona-raster-gdal/src/rs_as_raster.rs create mode 100644 rust/sedona-raster-gdal/src/rs_clip.rs create mode 100644 rust/sedona-raster-gdal/src/rs_from_gdal_raster.rs create mode 100644 rust/sedona-raster-gdal/src/rs_from_path.rs create mode 100644 rust/sedona-raster-gdal/src/rs_geotiff_tiles.rs create mode 100644 rust/sedona-raster-gdal/src/rs_map_algebra.rs create mode 100644 rust/sedona-raster-gdal/src/rs_metadata.rs create mode 100644 rust/sedona-raster-gdal/src/rs_polygonize.rs create mode 100644 rust/sedona-raster-gdal/src/rs_value.rs create mode 100644 rust/sedona-raster-gdal/src/rs_zonal_stats.rs diff --git a/Cargo.lock b/Cargo.lock index a5f233704..14d2d35cf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2539,6 +2539,12 @@ version = "3.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dea2df4cf52843e0452895c455a1a2cfbb842a1e7329671acf418fdc53ed4c59" +[[package]] +name = "evalexpr" +version = "13.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"25929004897f2bbab309121a60400d36992f6d911d09baa6c172f6cc55706601" + [[package]] name = "fastrand" version = "2.4.1" @@ -5735,18 +5741,31 @@ dependencies = [ name = "sedona-raster-gdal" version = "0.4.0" dependencies = [ + "arrow", "arrow-array", "arrow-buffer", + "arrow-schema", + "async-trait", "criterion", + "datafusion", "datafusion-common", + "datafusion-common-runtime", + "datafusion-expr", + "evalexpr", + "futures", "lru", "sedona-common", + "sedona-expr", "sedona-gdal", + "sedona-geometry", + "sedona-proj", "sedona-raster", + "sedona-raster-functions", "sedona-schema", "sedona-testing", "tempfile", "tokio", + "wkb", ] [[package]] diff --git a/c/sedona-gdal/src/dyn_load.rs b/c/sedona-gdal/src/dyn_load.rs index 9b8612ada..0c8be4b0a 100644 --- a/c/sedona-gdal/src/dyn_load.rs +++ b/c/sedona-gdal/src/dyn_load.rs @@ -157,6 +157,10 @@ fn load_all_symbols(lib: &Library, api: &mut SedonaGdalApi) -> Result<(), GdalIn load_fn!(lib, api, VSIFileFromMemBuffer); load_fn!(lib, api, VSIFCloseL); load_fn!(lib, api, VSIUnlink); + load_fn!(lib, api, VSIGetDirectorySeparator); + load_fn!(lib, api, VSIOpenDir); + load_fn!(lib, api, VSIGetNextDirEntry); + load_fn!(lib, api, VSICloseDir); load_fn!(lib, api, VSIGetMemFileBuffer); load_fn!(lib, api, VSIFree); load_fn!(lib, api, VSIMalloc); diff --git a/c/sedona-gdal/src/gdal.rs b/c/sedona-gdal/src/gdal.rs index 481f9d273..fad4288ff 100644 --- a/c/sedona-gdal/src/gdal.rs +++ b/c/sedona-gdal/src/gdal.rs @@ -171,6 +171,23 @@ impl Gdal { vsi::get_vsi_mem_file_bytes_owned(self.api, file_name) } + /// Open a VSI directory for iteration. + /// See also [`vsi::open_dir`]. + pub fn open_vsi_dir( + &self, + path: &str, + recurse_depth: i32, + options: Option<&crate::cpl::CslStringList>, + ) -> Result { + crate::vsi::open_dir(self.api, path, recurse_depth, options) + } + + /// Return the directory separator used by GDAL for a given VSI path. + /// See also [`vsi::get_directory_separator`]. 
+ pub fn vsi_directory_separator(&self, path: &str) -> Result { + crate::vsi::get_directory_separator(self.api, path) + } + // -- Raster operations --------------------------------------------------- /// Create a bare in-memory MEM dataset with GDAL-owned bands. diff --git a/c/sedona-gdal/src/gdal_dyn_bindgen.rs b/c/sedona-gdal/src/gdal_dyn_bindgen.rs index 1a63f4cd5..f6a6e7dc3 100644 --- a/c/sedona-gdal/src/gdal_dyn_bindgen.rs +++ b/c/sedona-gdal/src/gdal_dyn_bindgen.rs @@ -31,6 +31,9 @@ pub type GDALRWFlag = c_int; pub type OGRwkbByteOrder = c_int; pub type GDALOpenFlags = c_uint; pub type GDALRIOResampleAlg = c_int; +pub type GUIntBig = u64; +pub type GIntBig = i64; +pub type vsi_l_offset = GUIntBig; // --- Opaque handle types --- @@ -44,6 +47,31 @@ pub type OGRFeatureH = *mut c_void; pub type OGRFieldDefnH = *mut c_void; pub type VSILFILE = *mut c_void; +// --- VSI mode constants --- + +pub const VSI_S_IFMT: i32 = 0o170000; +pub const VSI_S_IFDIR: i32 = 0o040000; +pub const VSI_S_IFREG: i32 = 0o100000; + +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct VSIDIR { + _unused: [u8; 0], +} + +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct VSIDIREntry { + pub pszName: *mut c_char, + pub nMode: c_int, + pub nSize: vsi_l_offset, + pub nMTime: GIntBig, + pub bModeKnown: c_char, + pub bSizeKnown: c_char, + pub bMTimeKnown: c_char, + pub papszExtra: *mut *mut c_char, +} + // --- Enum types --- #[repr(C)] @@ -459,6 +487,17 @@ pub(crate) struct SedonaGdalApi { >, pub VSIFCloseL: Option c_int>, pub VSIUnlink: Option c_int>, + pub VSIGetDirectorySeparator: + Option *const c_char>, + pub VSIOpenDir: Option< + unsafe extern "C" fn( + pszPath: *const c_char, + nRecurseDepth: c_int, + papszOptions: *const *const c_char, + ) -> *mut VSIDIR, + >, + pub VSIGetNextDirEntry: Option *const VSIDIREntry>, + pub VSICloseDir: Option, pub VSIGetMemFileBuffer: Option< unsafe extern "C" fn( pszFilename: *const c_char, diff --git a/c/sedona-gdal/src/vsi.rs 
b/c/sedona-gdal/src/vsi.rs index 20a60e11c..c9f204b43 100644 --- a/c/sedona-gdal/src/vsi.rs +++ b/c/sedona-gdal/src/vsi.rs @@ -290,3 +290,90 @@ mod tests { .unwrap(); } } + +pub struct VsiDirEntry { + pub name: String, + pub mode: Option, + pub size: Option, + pub mtime: Option, +} + +pub struct VsiDir { + api: &'static crate::gdal_api::GdalApi, + handle: *mut crate::gdal_dyn_bindgen::VSIDIR, +} + +impl VsiDir { + pub fn next_entry(&mut self) -> Option { + let entry = unsafe { (self.api.inner.VSIGetNextDirEntry?)(self.handle) }; + if entry.is_null() { + return None; + } + let entry = unsafe { &*entry }; + + let name = if entry.pszName.is_null() { + String::new() + } else { + unsafe { std::ffi::CStr::from_ptr(entry.pszName) } + .to_string_lossy() + .into_owned() + }; + + Some(VsiDirEntry { + name, + mode: (entry.bModeKnown != 0).then_some(entry.nMode), + size: (entry.bSizeKnown != 0).then_some(entry.nSize), + mtime: (entry.bMTimeKnown != 0).then_some(entry.nMTime), + }) + } +} + +impl Iterator for VsiDir { + type Item = VsiDirEntry; + fn next(&mut self) -> Option { + self.next_entry() + } +} + +impl Drop for VsiDir { + fn drop(&mut self) { + if !self.handle.is_null() { + if let Some(close) = self.api.inner.VSICloseDir { + unsafe { close(self.handle) }; + } + self.handle = std::ptr::null_mut(); + } + } +} + +pub fn open_dir( + api: &'static crate::gdal_api::GdalApi, + path: &str, + recurse_depth: i32, + options: Option<&crate::cpl::CslStringList>, +) -> crate::errors::Result { + let c_path = std::ffi::CString::new(path)?; + let options_ptr: *const *const std::os::raw::c_char = options + .map(|opts| opts.as_ptr() as *const *const std::os::raw::c_char) + .unwrap_or(std::ptr::null()); + let handle = + unsafe { (api.inner.VSIOpenDir.unwrap())(c_path.as_ptr(), recurse_depth, options_ptr) }; + if handle.is_null() { + return Err(api.last_null_pointer_err("VSIOpenDir")); + } + Ok(VsiDir { api, handle }) +} + +pub fn get_directory_separator( + api: &'static 
crate::gdal_api::GdalApi, + path: &str, +) -> crate::errors::Result { + let c_path = std::ffi::CString::new(path)?; + let separator_ptr = unsafe { (api.inner.VSIGetDirectorySeparator.unwrap())(c_path.as_ptr()) }; + if separator_ptr.is_null() { + return Err(api.last_null_pointer_err("VSIGetDirectorySeparator")); + } + Ok(unsafe { std::ffi::CStr::from_ptr(separator_ptr) } + .to_string_lossy() + .into_owned()) +} diff --git a/rust/sedona-raster-functions/src/crs_utils.rs b/rust/sedona-raster-functions/src/crs_utils.rs index b65c08b27..abdf7ae62 100644 --- a/rust/sedona-raster-functions/src/crs_utils.rs +++ b/rust/sedona-raster-functions/src/crs_utils.rs @@ -64,6 +64,34 @@ pub fn crs_transform_wkb( Ok(out) } +/// Transform a single coordinate pair from one CRS to another. +/// +/// This is a utility used by raster/spatial functions when only an `(x, y)` +/// coordinate needs reprojection and full geometry decoding would be unnecessary. +/// +/// **Behavior** +/// - Builds a PROJ pipeline for `from_crs` -> `to_crs`. +/// - Applies the transformation in place and returns the transformed coordinate. +/// +/// **Errors** +/// - Returns an error if PROJ cannot build the CRS-to-CRS transform, +/// or if the coordinate transformation itself fails. 
+pub fn crs_transform_coord( + engine: &dyn CrsEngine, + coord: (f64, f64), + from_crs: &str, + to_crs: &str, +) -> Result<(f64, f64)> { + let trans = engine + .get_transform_crs_to_crs(from_crs, to_crs, None, "") + .map_err(|e| DataFusionError::External(Box::new(e)))?; + let mut coord = coord; + trans + .transform_coord(&mut coord) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + Ok(coord) +} + #[cfg(test)] mod tests { use super::*; diff --git a/rust/sedona-raster-functions/src/lib.rs b/rust/sedona-raster-functions/src/lib.rs index dbc3749ed..10b2affb2 100644 --- a/rust/sedona-raster-functions/src/lib.rs +++ b/rust/sedona-raster-functions/src/lib.rs @@ -22,6 +22,7 @@ pub mod register; pub mod rs_band_accessors; pub mod rs_bandpath; pub mod rs_convexhull; +pub mod rs_count; pub mod rs_envelope; pub mod rs_example; pub mod rs_georeference; diff --git a/rust/sedona-raster-functions/src/register.rs b/rust/sedona-raster-functions/src/register.rs index e86fe8e98..8c4ae51c3 100644 --- a/rust/sedona-raster-functions/src/register.rs +++ b/rust/sedona-raster-functions/src/register.rs @@ -42,6 +42,7 @@ pub fn default_function_set() -> FunctionSet { crate::rs_band_accessors::rs_bandnodatavalue_udf, crate::rs_bandpath::rs_bandpath_udf, crate::rs_convexhull::rs_convexhull_udf, + crate::rs_count::rs_count_udf, crate::rs_envelope::rs_envelope_udf, crate::rs_example::rs_example_udf, crate::rs_georeference::rs_georeference_udf, diff --git a/rust/sedona-raster-functions/src/rs_count.rs b/rust/sedona-raster-functions/src/rs_count.rs new file mode 100644 index 000000000..24cd4e96f --- /dev/null +++ b/rust/sedona-raster-functions/src/rs_count.rs @@ -0,0 +1,338 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; +use std::vec; + +use crate::executor::RasterExecutor; +use arrow_array::builder::Int64Builder; +use arrow_array::{cast::AsArray, types::Int32Type, Array, BooleanArray}; +use arrow_schema::DataType; +use datafusion_common::error::Result; +use datafusion_common::exec_err; +use datafusion_expr::{ColumnarValue, Volatility}; +use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF}; +use sedona_raster::traits::RasterRef; +use sedona_schema::raster::{BandDataType, StorageType}; +use sedona_schema::{datatypes::SedonaType, matchers::ArgMatcher}; + +/// Byte size of a single pixel for the given band data type. +fn data_type_byte_size(data_type: &BandDataType) -> usize { + match data_type { + BandDataType::UInt8 | BandDataType::Int8 => 1, + BandDataType::UInt16 | BandDataType::Int16 => 2, + BandDataType::UInt32 | BandDataType::Int32 | BandDataType::Float32 => 4, + BandDataType::UInt64 | BandDataType::Int64 | BandDataType::Float64 => 8, + } +} + +/// RS_Count() scalar UDF implementation +/// +/// Returns the count of pixels in the specified band. +/// When excludeNoData is true (default), pixels equal to the nodata value are excluded. +/// When excludeNoData is false, returns width * height. +/// Accepts optional band_index (1-based, default 1) and excludeNoData (default true) parameters. 
+pub fn rs_count_udf() -> SedonaScalarUDF { + SedonaScalarUDF::new( + "rs_count", + vec![ + Arc::new(RsCount {}), + Arc::new(RsCountWithBand {}), + Arc::new(RsCountWithBandAndExclude {}), + ], + Volatility::Immutable, + ) +} + +// --------------------------------------------------------------------------- +// 1-arg kernel: RS_Count(raster) +// --------------------------------------------------------------------------- + +#[derive(Debug)] +struct RsCount {} + +impl SedonaScalarKernel for RsCount { + fn return_type(&self, args: &[SedonaType]) -> Result> { + let matcher = ArgMatcher::new( + vec![ArgMatcher::is_raster()], + SedonaType::Arrow(DataType::Int64), + ); + matcher.match_args(args) + } + + fn invoke_batch( + &self, + arg_types: &[SedonaType], + args: &[ColumnarValue], + ) -> Result { + let executor = RasterExecutor::new(arg_types, args); + let mut builder = Int64Builder::with_capacity(executor.num_iterations()); + + executor.execute_raster_void(|_i, raster_opt| { + count_pixels(raster_opt, 1, true, &mut builder) + })?; + + executor.finish(Arc::new(builder.finish())) + } +} + +// --------------------------------------------------------------------------- +// 2-arg kernel: RS_Count(raster, band) +// --------------------------------------------------------------------------- + +#[derive(Debug)] +struct RsCountWithBand {} + +impl SedonaScalarKernel for RsCountWithBand { + fn return_type(&self, args: &[SedonaType]) -> Result> { + let matcher = ArgMatcher::new( + vec![ArgMatcher::is_raster(), ArgMatcher::is_integer()], + SedonaType::Arrow(DataType::Int64), + ); + matcher.match_args(args) + } + + fn invoke_batch( + &self, + arg_types: &[SedonaType], + args: &[ColumnarValue], + ) -> Result { + let executor = RasterExecutor::new(arg_types, args); + let band_index_array = args[1].clone().into_array(executor.num_iterations())?; + let band_index_array = band_index_array.as_primitive::(); + + let mut builder = Int64Builder::with_capacity(executor.num_iterations()); + + 
executor.execute_raster_void(|i, raster_opt| { + let band_index = if band_index_array.is_null(i) { + 1 + } else { + band_index_array.value(i) + }; + count_pixels(raster_opt, band_index, true, &mut builder) + })?; + + executor.finish(Arc::new(builder.finish())) + } +} + +// --------------------------------------------------------------------------- +// 3-arg kernel: RS_Count(raster, band, excludeNoData) +// --------------------------------------------------------------------------- + +#[derive(Debug)] +struct RsCountWithBandAndExclude {} + +impl SedonaScalarKernel for RsCountWithBandAndExclude { + fn return_type(&self, args: &[SedonaType]) -> Result> { + let matcher = ArgMatcher::new( + vec![ + ArgMatcher::is_raster(), + ArgMatcher::is_integer(), + ArgMatcher::is_boolean(), + ], + SedonaType::Arrow(DataType::Int64), + ); + matcher.match_args(args) + } + + fn invoke_batch( + &self, + arg_types: &[SedonaType], + args: &[ColumnarValue], + ) -> Result { + let executor = RasterExecutor::new(arg_types, args); + let band_index_array = args[1].clone().into_array(executor.num_iterations())?; + let band_index_array = band_index_array.as_primitive::(); + let exclude_array = args[2].clone().into_array(executor.num_iterations())?; + let exclude_array = exclude_array + .as_any() + .downcast_ref::() + .expect("Expected BooleanArray for excludeNoData"); + + let mut builder = Int64Builder::with_capacity(executor.num_iterations()); + + executor.execute_raster_void(|i, raster_opt| { + let band_index = if band_index_array.is_null(i) { + 1 + } else { + band_index_array.value(i) + }; + let exclude_nodata = if exclude_array.is_null(i) { + true + } else { + exclude_array.value(i) + }; + count_pixels(raster_opt, band_index, exclude_nodata, &mut builder) + })?; + + executor.finish(Arc::new(builder.finish())) + } +} + +fn count_pixels( + raster_opt: Option<&sedona_raster::array::RasterRefImpl<'_>>, + band_index: i32, + exclude_nodata: bool, + builder: &mut Int64Builder, +) -> Result<()> { + 
match raster_opt { + None => { + builder.append_null(); + Ok(()) + } + Some(raster) => { + let num_bands = raster.bands().len(); + if band_index < 1 || band_index as usize > num_bands { + return exec_err!( + "Provided band index {} is not in the range [1, {}]", + band_index, + num_bands + ); + } + + let total_pixels = raster.metadata().width() as i64 * raster.metadata().height() as i64; + + let band = raster.bands().band(band_index as usize)?; + let band_meta = band.metadata(); + let nodata_bytes = band_meta.nodata_value(); + + // If not excluding nodata, or no nodata value defined, return total pixel count + if !exclude_nodata || nodata_bytes.is_none() { + builder.append_value(total_pixels); + return Ok(()); + } + + let nodata_bytes = nodata_bytes.unwrap(); + + // OutDbRef bands don't have inline pixel data + if band_meta.storage_type()? == StorageType::OutDbRef { + return exec_err!( + "RS_Count with excludeNoData=true does not support out-db raster bands" + ); + } + + let dt = band_meta.data_type()?; + let pixel_size = data_type_byte_size(&dt); + let data = band.data(); + + let nodata_count = data + .chunks_exact(pixel_size) + .filter(|chunk| *chunk == nodata_bytes) + .count() as i64; + + builder.append_value(total_pixels - nodata_count); + Ok(()) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::{BooleanArray, Int32Array, Int64Array}; + use datafusion_common::ScalarValue; + use datafusion_expr::ScalarUDF; + use sedona_schema::datatypes::RASTER; + use sedona_testing::compare::assert_array_equal; + use sedona_testing::rasters::{generate_test_rasters, raster_from_single_band}; + use sedona_testing::testers::ScalarUdfTester; + + #[test] + fn udf_count_metadata() { + let udf: ScalarUDF = rs_count_udf().into(); + assert_eq!(udf.name(), "rs_count"); + } + + #[test] + fn udf_count_default() { + let udf: ScalarUDF = rs_count_udf().into(); + let tester = ScalarUdfTester::new(udf, vec![RASTER]); + tester.assert_return_type(DataType::Int64); + + // 
generate_test_rasters: raster 0 is 1x2=2 pixels, data=[0,0,1,0] (u16) + // nodata=0 -> pixel 0 is nodata, pixel 1 is not -> count=1 + // raster 1: null + // raster 2: 3x4=12 pixels, data=[0..11] as u16 + // nodata=0 -> pixel 0 is nodata -> count=11 + let rasters = generate_test_rasters(3, Some(1)).unwrap(); + let expected: Arc = + Arc::new(Int64Array::from(vec![Some(1), None, Some(11)])); + let result = tester.invoke_array(Arc::new(rasters)).unwrap(); + assert_array_equal(&result, &expected); + } + + #[test] + fn udf_count_exclude_false() { + let udf: ScalarUDF = rs_count_udf().into(); + let tester = ScalarUdfTester::new( + udf, + vec![ + RASTER, + SedonaType::Arrow(DataType::Int32), + SedonaType::Arrow(DataType::Boolean), + ], + ); + + // With excludeNoData=false, should return total pixel count + let rasters = generate_test_rasters(3, Some(1)).unwrap(); + let bands = Int32Array::from(vec![1, 1, 1]); + let exclude = BooleanArray::from(vec![false, false, false]); + let expected: Arc = + Arc::new(Int64Array::from(vec![Some(2), None, Some(12)])); + + let result = tester + .invoke_arrays(vec![Arc::new(rasters), Arc::new(bands), Arc::new(exclude)]) + .unwrap(); + assert_array_equal(&result, &expected); + } + + #[test] + fn udf_count_no_nodata_defined() { + // When no nodata is defined, excludeNoData=true should still return total pixels + let data = vec![0u8; 9]; + let rasters = raster_from_single_band(3, 3, BandDataType::UInt8, &data, Some("OGC:CRS84")); + + let udf: ScalarUDF = rs_count_udf().into(); + let tester = ScalarUdfTester::new(udf, vec![RASTER]); + let result = tester.invoke_array(Arc::new(rasters)).unwrap(); + let int_array = result + .as_any() + .downcast_ref::() + .expect("Expected Int64Array"); + assert_eq!(int_array.value(0), 9); + } + + #[test] + fn udf_count_null_scalar() { + let udf: ScalarUDF = rs_count_udf().into(); + let tester = ScalarUdfTester::new(udf, vec![RASTER]); + let result = tester.invoke_scalar(ScalarValue::Null).unwrap(); + 
tester.assert_scalar_result_equals(result, ScalarValue::Int64(None)); + } + + #[test] + fn udf_count_invalid_band_errors() { + let udf: ScalarUDF = rs_count_udf().into(); + let tester = ScalarUdfTester::new(udf, vec![RASTER, SedonaType::Arrow(DataType::Int32)]); + + let rasters = generate_test_rasters(1, None).unwrap(); + let bands = Int32Array::from(vec![5]); // out of range + let result = tester.invoke_arrays(vec![Arc::new(rasters), Arc::new(bands)]); + assert!(result.is_err()); + } +} diff --git a/rust/sedona-raster-gdal/Cargo.toml b/rust/sedona-raster-gdal/Cargo.toml index 5dba31c98..2860605e3 100644 --- a/rust/sedona-raster-gdal/Cargo.toml +++ b/rust/sedona-raster-gdal/Cargo.toml @@ -31,18 +31,72 @@ rust-version.workspace = true result_large_err = "allow" [dependencies] +arrow = { workspace = true } arrow-array = { workspace = true } arrow-buffer = { workspace = true } +arrow-schema = { workspace = true } +async-trait = { workspace = true } +datafusion = { workspace = true, default_features = false } datafusion-common = { workspace = true } +datafusion-common-runtime = { workspace = true } +datafusion-expr = { workspace = true } +evalexpr = "13" +futures = { workspace = true } lru = { workspace = true } sedona-common = { workspace = true } -sedona-gdal = { workspace = true } +sedona-expr = { workspace = true } +sedona-geometry = { workspace = true } sedona-raster = { workspace = true } +sedona-raster-functions = { workspace = true } sedona-schema = { workspace = true } +sedona-proj = { workspace = true } +sedona-gdal = { workspace = true } +wkb = { workspace = true } [dev-dependencies] criterion = { workspace = true } sedona-gdal = { workspace = true, features = ["gdal-sys"] } +sedona-proj = { workspace = true, features = ["proj-sys"] } sedona-testing = { workspace = true } tempfile = { workspace = true } tokio = { workspace = true, features = ["rt-multi-thread"] } + +[[bench]] +harness = false +name = "rs_value" +path = "benches/rs_value.rs" + +[[bench]] 
+harness = false +name = "rs_from_gdal_raster" +path = "benches/rs_from_gdal_raster.rs" + +[[bench]] +harness = false +name = "rs_as_geotiff" +path = "benches/rs_as_geotiff.rs" + +[[bench]] +harness = false +name = "rs_polygonize" +path = "benches/rs_polygonize.rs" + +[[bench]] +harness = false +name = "rs_clip" +path = "benches/rs_clip.rs" + +[[bench]] +harness = false +name = "rs_zonal_stats" +path = "benches/rs_zonal_stats.rs" + +[[bench]] +harness = false +name = "rs_map_algebra" +path = "benches/rs_map_algebra.rs" + +[[bench]] +harness = false +name = "rs_from_path" +path = "benches/rs_from_path.rs" diff --git a/rust/sedona-raster-gdal/benches/bench_common.rs b/rust/sedona-raster-gdal/benches/bench_common.rs new file mode 100644 index 000000000..f68558840 --- /dev/null +++ b/rust/sedona-raster-gdal/benches/bench_common.rs @@ -0,0 +1,238 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Common utilities for sedona-raster-gdal benchmarks + +#![allow(dead_code)] + +use std::hint::black_box; +use std::sync::Arc; + +use arrow_array::{ArrayRef, BinaryArray, StringArray, StructArray}; +use arrow_schema::Field; +use datafusion_common::{config::ConfigOptions, ScalarValue}; +use datafusion_expr::{ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDF}; +use sedona_raster::builder::RasterBuilder; +use sedona_raster::traits::{BandMetadata, RasterMetadata}; +use sedona_schema::crs::lnglat; +use sedona_schema::datatypes::SedonaType; +use sedona_schema::raster::{BandDataType, StorageType}; + +/// Load test rasters from GeoTIFF files using GDAL +/// +/// This creates in-db raster arrays by parsing GeoTIFF bytes via GDAL +pub fn load_rasters_from_geotiff(name: &str, count: usize) -> StructArray { + let test_file = sedona_testing::data::test_raster(name).expect("Failed to find test raster"); + let content = std::fs::read(&test_file).expect("Failed to read test raster file"); + + // Create multiple copies of the raster content + let binary_data: Vec> = (0..count).map(|_| Some(content.as_slice())).collect(); + let binary_array = BinaryArray::from(binary_data); + + // Use the UDF directly + let udf: ScalarUDF = sedona_raster_gdal::rs_from_gdal_raster_udf().into(); + + let args = ScalarFunctionArgs { + args: vec![ColumnarValue::Array(Arc::new(binary_array) as ArrayRef)], + arg_fields: vec![Arc::new(Field::new( + "binary", + arrow_schema::DataType::Binary, + true, + ))], + number_rows: count, + return_field: Arc::new(Field::new( + "raster", + sedona_schema::datatypes::RASTER.storage_type().clone(), + true, + )), + config_options: Arc::new(ConfigOptions::default()), + }; + + let result = udf + .invoke_with_args(args) + .expect("Failed to invoke RS_FromGDALRaster"); + + match result { + ColumnarValue::Array(array) => array + .as_any() + .downcast_ref::() + .expect("Expected StructArray") + .clone(), + ColumnarValue::Scalar(scalar) => { + if let 
ScalarValue::Struct(arc_struct) = scalar { + arc_struct.as_ref().clone() + } else { + panic!("Expected Struct scalar"); + } + } + } +} + +/// Load a single raster and wrap as scalar for benchmarks +pub fn load_raster_as_scalar(name: &str) -> ColumnarValue { + let raster = load_rasters_from_geotiff(name, 1); + let scalar = ScalarValue::try_from_array(&raster, 0).expect("Failed to create scalar"); + ColumnarValue::Scalar(scalar) +} + +/// Load rasters as array for benchmarks +pub fn load_rasters_as_array(name: &str, count: usize) -> ColumnarValue { + let raster = load_rasters_from_geotiff(name, count); + ColumnarValue::Array(Arc::new(raster) as ArrayRef) +} + +/// Invoke a UDF with given arguments and consume the result +/// +/// For proper type matching, pass arg_types alongside args to specify SedonaTypes +/// (especially important for RASTER types). +pub fn invoke_udf( + udf: &ScalarUDF, + args: &[ColumnarValue], + arg_types: &[SedonaType], +) -> datafusion_common::Result<()> { + // Get number of rows from first array argument, default to 1 + let number_rows = args + .iter() + .find_map(|arg| { + if let ColumnarValue::Array(array) = arg { + Some(array.len()) + } else { + None + } + }) + .unwrap_or(1); + + // Create fields for each argument using the provided SedonaTypes + let arg_fields: Vec> = arg_types + .iter() + .enumerate() + .map(|(i, sedona_type)| { + Arc::new( + sedona_type + .to_storage_field(&format!("arg{}", i), true) + .unwrap(), + ) + }) + .collect(); + + let scalar_arguments = args + .iter() + .map(|arg| match arg { + ColumnarValue::Scalar(scalar) => Some(scalar), + ColumnarValue::Array(_) => None, + }) + .collect::>(); + + let return_field = udf.return_field_from_args(ReturnFieldArgs { + arg_fields: &arg_fields, + scalar_arguments: &scalar_arguments, + })?; + + let func_args = ScalarFunctionArgs { + args: args.to_vec(), + arg_fields, + number_rows, + return_field, + config_options: Arc::new(ConfigOptions::default()), + }; + + let result = 
udf.invoke_with_args(func_args)?; + black_box(result); + Ok(()) +} + +/// Create a string scalar value +pub fn string_scalar(s: &str) -> ColumnarValue { + ColumnarValue::Scalar(ScalarValue::Utf8(Some(s.to_string()))) +} + +/// Create an i32 scalar value +pub fn int32_scalar(v: i32) -> ColumnarValue { + ColumnarValue::Scalar(ScalarValue::Int32(Some(v))) +} + +/// Create an f64 scalar value +pub fn float64_scalar(v: f64) -> ColumnarValue { + ColumnarValue::Scalar(ScalarValue::Float64(Some(v))) +} + +/// Create a string array with repeated values +#[allow(dead_code)] +pub fn string_array(s: &str, count: usize) -> ColumnarValue { + let strings: Vec<&str> = (0..count).map(|_| s).collect(); + let array = StringArray::from(strings); + ColumnarValue::Array(Arc::new(array) as ArrayRef) +} + +/// Default batch size for benchmarks +#[allow(dead_code)] +pub const BENCH_BATCH_SIZE: usize = 100; + +/// Small batch size for slower operations +#[allow(dead_code)] +pub const SMALL_BATCH_SIZE: usize = 10; + +/// Tiny batch size for very slow operations (like polygonize, clip) +#[allow(dead_code)] +pub const TINY_BATCH_SIZE: usize = 5; + +/// Generate synthetic in-db rasters for benchmarking +/// +/// This creates rasters without GDAL dependency, useful for pure-Rust benchmarks +#[allow(dead_code)] +pub fn generate_synthetic_rasters(count: usize, width: usize, height: usize) -> StructArray { + let mut builder = RasterBuilder::new(count); + let crs = lnglat().unwrap().to_crs_string(); + + for _ in 0..count { + let raster_metadata = RasterMetadata { + width: width as u64, + height: height as u64, + upperleft_x: 0.0, + upperleft_y: 0.0, + scale_x: 1.0, + scale_y: -1.0, + skew_x: 0.0, + skew_y: 0.0, + }; + + builder.start_raster(&raster_metadata, Some(&crs)).unwrap(); + + // Add 3 bands (like RGB) + for _ in 0..3 { + builder + .start_band(BandMetadata { + datatype: BandDataType::UInt8, + nodata_value: Some(vec![0u8]), + storage_type: StorageType::InDb, + outdb_url: None, + outdb_band_id: 
None, + }) + .unwrap(); + + // Fill with random-ish data + let pixel_count = width * height; + let band_data: Vec = (0..pixel_count).map(|i| (i % 256) as u8).collect(); + builder.band_data_writer().append_value(&band_data); + builder.finish_band().unwrap(); + } + + builder.finish_raster().unwrap(); + } + + builder.finish().unwrap() +} diff --git a/rust/sedona-raster-gdal/benches/rs_as_geotiff.rs b/rust/sedona-raster-gdal/benches/rs_as_geotiff.rs new file mode 100644 index 000000000..e6ad93508 --- /dev/null +++ b/rust/sedona-raster-gdal/benches/rs_as_geotiff.rs @@ -0,0 +1,110 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmarks for RS_AsGeoTiff UDF +//! +//! RS_AsGeoTiff converts rasters to GeoTIFF binary format. +//! Supported variants: +//! - RS_AsGeoTiff(raster) +//! - RS_AsGeoTiff(raster, compression) +//! - RS_AsGeoTiff(raster, compression, quality) +//! 
- RS_AsGeoTiff(raster, compression, quality, tileWidth, tileHeight) + +mod bench_common; + +use arrow_array::Array; +use arrow_schema::DataType; +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; +use datafusion_common::ScalarValue; +use datafusion_expr::{ColumnarValue, ScalarUDF}; +use sedona_raster_gdal::rs_as_geotiff_udf; +use sedona_schema::datatypes::{SedonaType, RASTER}; + +use bench_common::*; + +fn bench_rs_as_geotiff_basic(c: &mut Criterion) { + let udf: ScalarUDF = rs_as_geotiff_udf().into(); + + // Arg types: (raster) + let arg_types = vec![RASTER]; + + let mut group = c.benchmark_group("rs_as_geotiff"); + + for raster_name in &["test1.tiff", "test4.tiff"] { + let raster_scalar = load_raster_as_scalar(raster_name); + + // Estimate throughput based on source raster size + let raster_arr = load_rasters_from_geotiff(raster_name, 1); + let estimated_size = raster_arr.get_array_memory_size(); + group.throughput(Throughput::Bytes(estimated_size as u64)); + + group.bench_with_input( + BenchmarkId::new("basic", raster_name), + &raster_scalar, + |b, raster| { + b.iter(|| { + let args = vec![raster.clone()]; + invoke_udf(&udf, &args, &arg_types).unwrap() + }) + }, + ); + } + + group.finish(); +} + +fn bench_rs_as_geotiff_with_compression(c: &mut Criterion) { + let udf: ScalarUDF = rs_as_geotiff_udf().into(); + + // Arg types: (raster, compression, quality) + let arg_types = vec![ + RASTER, + SedonaType::Arrow(DataType::Utf8), + SedonaType::Arrow(DataType::Float64), + ]; + + let mut group = c.benchmark_group("rs_as_geotiff_compression"); + + let raster_name = "test4.tiff"; + let raster_scalar = load_raster_as_scalar(raster_name); + let quality_scalar = ColumnarValue::Scalar(ScalarValue::Float64(Some(75.0))); + + for compression in &["none", "lzw", "deflate"] { + let compression_scalar = + ColumnarValue::Scalar(ScalarValue::Utf8(Some(compression.to_string()))); + + group.bench_with_input( + BenchmarkId::new("compression", 
compression), + &(&raster_scalar, &compression_scalar, &quality_scalar), + |b, (raster, comp, qual)| { + b.iter(|| { + let args = vec![(*raster).clone(), (*comp).clone(), (*qual).clone()]; + invoke_udf(&udf, &args, &arg_types).unwrap() + }) + }, + ); + } + + group.finish(); +} + +criterion_group!( + benches, + bench_rs_as_geotiff_basic, + bench_rs_as_geotiff_with_compression +); +criterion_main!(benches); diff --git a/rust/sedona-raster-gdal/benches/rs_clip.rs b/rust/sedona-raster-gdal/benches/rs_clip.rs new file mode 100644 index 000000000..a8052ace0 --- /dev/null +++ b/rust/sedona-raster-gdal/benches/rs_clip.rs @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmarks for RS_Clip UDF +//! +//! RS_Clip clips rasters to a geometry boundary. +//! Signature: RS_Clip(raster, geometry, [nodata], [crop]) +//! +//! NOTE: This benchmark requires geometry creation which needs additional setup. +//! The underlying GDAL segfault issue has been fixed. +//! TODO: Implement full benchmark once geometry utilities are available in bench context. 
+ +use criterion::{criterion_group, criterion_main, Criterion}; + +fn bench_rs_clip_placeholder(c: &mut Criterion) { + let mut group = c.benchmark_group("rs_clip"); + + // Placeholder benchmark - needs geometry creation utilities + // The underlying GDAL memory dataset issue has been fixed + group.bench_function("placeholder", |b| b.iter(|| std::hint::black_box(42))); + + group.finish(); +} + +criterion_group!(benches, bench_rs_clip_placeholder); +criterion_main!(benches); diff --git a/rust/sedona-raster-gdal/benches/rs_from_gdal_raster.rs b/rust/sedona-raster-gdal/benches/rs_from_gdal_raster.rs new file mode 100644 index 000000000..1974f05a2 --- /dev/null +++ b/rust/sedona-raster-gdal/benches/rs_from_gdal_raster.rs @@ -0,0 +1,115 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmarks for RS_FromGDALRaster UDF +//! +//! RS_FromGDALRaster parses binary raster data (GeoTIFF, etc.) into in-db rasters. +//! This benchmark covers: +//! - Different raster sizes +//! - Single vs batch parsing +//! 
- Parsing efficiency + +mod bench_common; + +use std::sync::Arc; + +use arrow_array::{ArrayRef, BinaryArray}; +use arrow_schema::DataType; +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; +use datafusion_expr::{ColumnarValue, ScalarUDF}; +use sedona_raster_gdal::rs_from_gdal_raster_udf; +use sedona_schema::datatypes::SedonaType; + +use bench_common::*; + +fn load_geotiff_bytes(name: &str) -> Vec { + let test_file = sedona_testing::data::test_raster(name).expect("Failed to find test raster"); + std::fs::read(&test_file).expect("Failed to read test raster file") +} + +fn bench_rs_from_gdal_raster_single(c: &mut Criterion) { + let udf: ScalarUDF = rs_from_gdal_raster_udf().into(); + + // Arg types: (binary) + let arg_types = vec![SedonaType::Arrow(DataType::Binary)]; + + let mut group = c.benchmark_group("rs_from_gdal_raster_single"); + + for raster_name in &["test1.tiff", "test4.tiff"] { + let bytes = load_geotiff_bytes(raster_name); + + group.throughput(Throughput::Bytes(bytes.len() as u64)); + + let binary_scalar = + ColumnarValue::Scalar(datafusion_common::ScalarValue::Binary(Some(bytes.clone()))); + + group.bench_with_input( + BenchmarkId::new("parse", raster_name), + &(&binary_scalar, &arg_types), + |b, (binary, arg_types)| { + b.iter(|| { + let args = vec![(*binary).clone()]; + invoke_udf(&udf, &args, arg_types).unwrap() + }) + }, + ); + } + + group.finish(); +} + +fn bench_rs_from_gdal_raster_batch(c: &mut Criterion) { + let udf: ScalarUDF = rs_from_gdal_raster_udf().into(); + + // Arg types: (binary) + let arg_types = vec![SedonaType::Arrow(DataType::Binary)]; + + let mut group = c.benchmark_group("rs_from_gdal_raster_batch"); + + for batch_size in &[10, 50, 100] { + let bytes = load_geotiff_bytes("test4.tiff"); + let total_bytes = bytes.len() * batch_size; + + group.throughput(Throughput::Bytes(total_bytes as u64)); + + // Create batch of binary data + let binary_data: Vec> = + (0..*batch_size).map(|_| 
Some(bytes.as_slice())).collect(); + let binary_array = BinaryArray::from(binary_data); + let binary_columnar = ColumnarValue::Array(Arc::new(binary_array) as ArrayRef); + + group.bench_with_input( + BenchmarkId::new("batch", batch_size), + &(&binary_columnar, &arg_types), + |b, (binary, arg_types)| { + b.iter(|| { + let args = vec![(*binary).clone()]; + invoke_udf(&udf, &args, arg_types).unwrap() + }) + }, + ); + } + + group.finish(); +} + +criterion_group!( + benches, + bench_rs_from_gdal_raster_single, + bench_rs_from_gdal_raster_batch +); +criterion_main!(benches); diff --git a/rust/sedona-raster-gdal/benches/rs_from_path.rs b/rust/sedona-raster-gdal/benches/rs_from_path.rs new file mode 100644 index 000000000..52807f0c9 --- /dev/null +++ b/rust/sedona-raster-gdal/benches/rs_from_path.rs @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmarks for RS_FromPath UDF +//! +//! RS_FromPath creates out-db rasters from file paths. +//! +//! NOTE: This benchmark is currently disabled because RS_FromPath has a known issue +//! with RasterBuilder not correctly handling null data for out-db rasters. +//! 
The out-db path support is still evolving; this file currently contains a placeholder benchmark. +//! +//! Once the out-db raster support is fixed, this benchmark should cover: +//! - Loading rasters with and without extent calculation +//! - Different raster files +//! - Batch processing + +use criterion::{criterion_group, criterion_main, Criterion}; + +fn bench_rs_from_path_placeholder(c: &mut Criterion) { + let mut group = c.benchmark_group("rs_from_path"); + + // Placeholder benchmark - actual benchmarks disabled due to known issue + // with RasterBuilder not handling null data for out-db rasters + group.bench_function("placeholder", |b| { + b.iter(|| { + // No-op: RS_FromPath benchmarks disabled until out-db raster issue is resolved + std::hint::black_box(42) + }) + }); + + group.finish(); +} + +criterion_group!(benches, bench_rs_from_path_placeholder); +criterion_main!(benches); diff --git a/rust/sedona-raster-gdal/benches/rs_map_algebra.rs b/rust/sedona-raster-gdal/benches/rs_map_algebra.rs new file mode 100644 index 000000000..14f759bab --- /dev/null +++ b/rust/sedona-raster-gdal/benches/rs_map_algebra.rs @@ -0,0 +1,308 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Benchmarks for RS_MapAlgebra UDF +//! +//! RS_MapAlgebra applies mathematical expressions to raster pixels. +//! This benchmark covers: +//! - Different expression complexities +//! - Single vs multiple output bands +//! - Different raster sizes +//! - Different output pixel types + +mod bench_common; + +use arrow_schema::DataType; +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; +use datafusion_expr::ScalarUDF; +use sedona_raster_gdal::rs_map_algebra_udf; +use sedona_schema::datatypes::{SedonaType, RASTER}; + +use bench_common::*; + +fn bench_rs_map_algebra_expressions(c: &mut Criterion) { + let udf: ScalarUDF = rs_map_algebra_udf().into(); + + // Arg types: (raster, pixel_type, expr) + let arg_types = vec![ + RASTER, + SedonaType::Arrow(DataType::Utf8), + SedonaType::Arrow(DataType::Utf8), + ]; + + let mut group = c.benchmark_group("rs_map_algebra_expressions"); + group.sample_size(20); + + let raster = load_raster_as_scalar("test4.tiff"); + let pixel_type = string_scalar("D"); // Float64 output + + // Simple expression: identity + let expr_identity = string_scalar("rast0"); + group.bench_with_input( + BenchmarkId::new("expr", "identity"), + &(&expr_identity, &arg_types), + |b, (expr, arg_types)| { + b.iter(|| { + let args = vec![raster.clone(), pixel_type.clone(), (*expr).clone()]; + invoke_udf(&udf, &args, arg_types).unwrap() + }) + }, + ); + + // Simple arithmetic + let expr_arithmetic = string_scalar("rast0 * 2 + 1"); + group.bench_with_input( + BenchmarkId::new("expr", "arithmetic"), + &(&expr_arithmetic, &arg_types), + |b, (expr, arg_types)| { + b.iter(|| { + let args = vec![raster.clone(), pixel_type.clone(), (*expr).clone()]; + invoke_udf(&udf, &args, arg_types).unwrap() + }) + }, + ); + + // More complex arithmetic (single band, but exercises the expression parser) + let expr_complex_arith = string_scalar("(rast0 * 0.5 + 128) / 2.0 - rast0 * 0.1"); + group.bench_with_input( + BenchmarkId::new("expr", "complex_arithmetic"), + 
&(&expr_complex_arith, &arg_types), + |b, (expr, arg_types)| { + b.iter(|| { + let args = vec![raster.clone(), pixel_type.clone(), (*expr).clone()]; + invoke_udf(&udf, &args, arg_types).unwrap() + }) + }, + ); + + // Using position variables (x, y) in expressions + let expr_position = string_scalar("rast0 + x + y"); + group.bench_with_input( + BenchmarkId::new("expr", "with_position"), + &(&expr_position, &arg_types), + |b, (expr, arg_types)| { + b.iter(|| { + let args = vec![raster.clone(), pixel_type.clone(), (*expr).clone()]; + invoke_udf(&udf, &args, arg_types).unwrap() + }) + }, + ); + + // Using width/height constants in expressions + let expr_normalized = string_scalar("rast0 / 255.0 * (width + height)"); + group.bench_with_input( + BenchmarkId::new("expr", "with_dimensions"), + &(&expr_normalized, &arg_types), + |b, (expr, arg_types)| { + b.iter(|| { + let args = vec![raster.clone(), pixel_type.clone(), (*expr).clone()]; + invoke_udf(&udf, &args, arg_types).unwrap() + }) + }, + ); + + group.finish(); +} + +fn bench_rs_map_algebra_pixel_types(c: &mut Criterion) { + let udf: ScalarUDF = rs_map_algebra_udf().into(); + + // Arg types: (raster, pixel_type, expr) + let arg_types = vec![ + RASTER, + SedonaType::Arrow(DataType::Utf8), + SedonaType::Arrow(DataType::Utf8), + ]; + + let mut group = c.benchmark_group("rs_map_algebra_pixel_types"); + group.sample_size(20); + + let raster = load_raster_as_scalar("test4.tiff"); + let expr = string_scalar("rast0 * 2"); + + // Different output pixel types + for (pixel_type, type_name) in &[ + ("B", "UInt8"), + ("S", "Int16"), + ("I", "Int32"), + ("F", "Float32"), + ("D", "Float64"), + ] { + let pixel_type_scalar = string_scalar(pixel_type); + + group.bench_with_input( + BenchmarkId::new("output_type", type_name), + &(&pixel_type_scalar, &arg_types), + |b, (pt, arg_types)| { + b.iter(|| { + let args = vec![raster.clone(), (*pt).clone(), expr.clone()]; + invoke_udf(&udf, &args, arg_types).unwrap() + }) + }, + ); + } + + 
group.finish(); +} + +fn bench_rs_map_algebra_with_nodata(c: &mut Criterion) { + let udf: ScalarUDF = rs_map_algebra_udf().into(); + + // Different arg types for each signature + let arg_types_basic = vec![ + RASTER, + SedonaType::Arrow(DataType::Utf8), + SedonaType::Arrow(DataType::Utf8), + ]; + let arg_types_nodata = vec![ + RASTER, + SedonaType::Arrow(DataType::Utf8), + SedonaType::Arrow(DataType::Utf8), + SedonaType::Arrow(DataType::Float64), + ]; + let arg_types_bands = vec![ + RASTER, + SedonaType::Arrow(DataType::Utf8), + SedonaType::Arrow(DataType::Utf8), + SedonaType::Arrow(DataType::Float64), + SedonaType::Arrow(DataType::Int32), + ]; + + let mut group = c.benchmark_group("rs_map_algebra_with_nodata"); + group.sample_size(20); + + let raster = load_raster_as_scalar("test4.tiff"); + let pixel_type = string_scalar("D"); + let expr = string_scalar("rast0 * 2 + 1"); + + // Without nodata + group.bench_function("without_nodata", |b| { + b.iter(|| { + let args = vec![raster.clone(), pixel_type.clone(), expr.clone()]; + invoke_udf(&udf, &args, &arg_types_basic).unwrap() + }) + }); + + // With nodata + let nodata = float64_scalar(-9999.0); + group.bench_function("with_nodata", |b| { + b.iter(|| { + let args = vec![ + raster.clone(), + pixel_type.clone(), + expr.clone(), + nodata.clone(), + ]; + invoke_udf(&udf, &args, &arg_types_nodata).unwrap() + }) + }); + + // With nodata and multiple output bands + let num_bands = int32_scalar(3); + group.bench_function("multi_band_output", |b| { + b.iter(|| { + let args = vec![ + raster.clone(), + pixel_type.clone(), + expr.clone(), + nodata.clone(), + num_bands.clone(), + ]; + invoke_udf(&udf, &args, &arg_types_bands).unwrap() + }) + }); + + group.finish(); +} + +fn bench_rs_map_algebra_raster_sizes(c: &mut Criterion) { + let udf: ScalarUDF = rs_map_algebra_udf().into(); + + // Arg types: (raster, pixel_type, expr) + let arg_types = vec![ + RASTER, + SedonaType::Arrow(DataType::Utf8), + SedonaType::Arrow(DataType::Utf8), + 
]; + + let mut group = c.benchmark_group("rs_map_algebra_raster_sizes"); + group.sample_size(20); + + let pixel_type = string_scalar("D"); + let expr = string_scalar("rast0 * 2 + 1"); + + for raster_name in &["test1.tiff", "test4.tiff"] { + let raster = load_raster_as_scalar(raster_name); + + group.bench_with_input( + BenchmarkId::new("raster", raster_name), + &(&raster, &arg_types), + |b, (raster, arg_types)| { + b.iter(|| { + let args = vec![(*raster).clone(), pixel_type.clone(), expr.clone()]; + invoke_udf(&udf, &args, arg_types).unwrap() + }) + }, + ); + } + + group.finish(); +} + +fn bench_rs_map_algebra_batch(c: &mut Criterion) { + let udf: ScalarUDF = rs_map_algebra_udf().into(); + + // Arg types: (raster, pixel_type, expr) + let arg_types = vec![ + RASTER, + SedonaType::Arrow(DataType::Utf8), + SedonaType::Arrow(DataType::Utf8), + ]; + + let mut group = c.benchmark_group("rs_map_algebra_batch"); + group.sample_size(10); + + let pixel_type = string_scalar("D"); + let expr = string_scalar("rast0 * 2"); + + for batch_size in &[5, 10, 20] { + let rasters = load_rasters_as_array("test4.tiff", *batch_size); + + group.bench_with_input( + BenchmarkId::new("batch", batch_size), + &(&rasters, &arg_types), + |b, (rasters, arg_types)| { + b.iter(|| { + let args = vec![(*rasters).clone(), pixel_type.clone(), expr.clone()]; + invoke_udf(&udf, &args, arg_types).unwrap() + }) + }, + ); + } + + group.finish(); +} + +criterion_group!( + benches, + bench_rs_map_algebra_expressions, + bench_rs_map_algebra_pixel_types, + bench_rs_map_algebra_with_nodata, + bench_rs_map_algebra_raster_sizes, + bench_rs_map_algebra_batch +); +criterion_main!(benches); diff --git a/rust/sedona-raster-gdal/benches/rs_polygonize.rs b/rust/sedona-raster-gdal/benches/rs_polygonize.rs new file mode 100644 index 000000000..26e80eae7 --- /dev/null +++ b/rust/sedona-raster-gdal/benches/rs_polygonize.rs @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor 
license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmarks for RS_Polygonize UDF +//! +//! RS_Polygonize converts raster pixels to polygon geometries. +//! Signature: RS_Polygonize(raster, band) +//! +//! This benchmark covers: +//! - Different raster sizes +//! - Different band numbers + +mod bench_common; + +use arrow_array::Array; +use arrow_schema::DataType; +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; +use datafusion_common::ScalarValue; +use datafusion_expr::{ColumnarValue, ScalarUDF}; +use sedona_raster_gdal::rs_polygonize_udf; +use sedona_schema::datatypes::{SedonaType, RASTER}; + +use bench_common::*; + +fn bench_rs_polygonize_single(c: &mut Criterion) { + let udf: ScalarUDF = rs_polygonize_udf().into(); + + // Arg types: (raster, band_index) + let arg_types = vec![RASTER, SedonaType::Arrow(DataType::Int32)]; + + let mut group = c.benchmark_group("rs_polygonize"); + + for raster_name in &["test1.tiff", "test4.tiff"] { + let raster_scalar = load_raster_as_scalar(raster_name); + let band_scalar = ColumnarValue::Scalar(ScalarValue::Int32(Some(1))); + + // Estimate throughput based on source raster size + let raster_arr = load_rasters_from_geotiff(raster_name, 1); + let estimated_size = raster_arr.get_array_memory_size(); + 
group.throughput(Throughput::Bytes(estimated_size as u64)); + + group.bench_with_input( + BenchmarkId::new("single", raster_name), + &(&raster_scalar, &band_scalar), + |b, (raster, band)| { + b.iter(|| { + let args = vec![(*raster).clone(), (*band).clone()]; + invoke_udf(&udf, &args, &arg_types).unwrap() + }) + }, + ); + } + + group.finish(); +} + +criterion_group!(benches, bench_rs_polygonize_single); +criterion_main!(benches); diff --git a/rust/sedona-raster-gdal/benches/rs_value.rs b/rust/sedona-raster-gdal/benches/rs_value.rs new file mode 100644 index 000000000..9614f4a26 --- /dev/null +++ b/rust/sedona-raster-gdal/benches/rs_value.rs @@ -0,0 +1,183 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmarks for RS_Value UDF +//! +//! RS_Value extracts pixel values from raster at specified coordinates. +//! This benchmark covers: +//! - Grid coordinate (col/row) based lookup with different positions +//! - Different raster sizes +//! 
- Batch processing + +mod bench_common; + +use arrow_schema::DataType; +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; +use datafusion_expr::ScalarUDF; +use sedona_raster_gdal::rs_value_udf; +use sedona_schema::datatypes::{SedonaType, RASTER}; + +use bench_common::*; + +fn bench_rs_value_grid(c: &mut Criterion) { + let udf: ScalarUDF = rs_value_udf().into(); + + // Arg types for grid-based RS_Value: (raster, col_x, row_y, band) + let arg_types = vec![ + RASTER, + SedonaType::Arrow(DataType::Int32), + SedonaType::Arrow(DataType::Int32), + SedonaType::Arrow(DataType::Int32), + ]; + + let mut group = c.benchmark_group("rs_value_grid"); + + for raster_name in &["test1.tiff", "test4.tiff"] { + let raster = load_raster_as_scalar(raster_name); + + // Test at pixel (0, 0) + let col_x = int32_scalar(0); + let row_y = int32_scalar(0); + let band = int32_scalar(1); + + group.bench_with_input( + BenchmarkId::new("pos_0_0", raster_name), + &(&raster, &col_x, &row_y, &band, &arg_types), + |b, (raster, col_x, row_y, band, arg_types)| { + b.iter(|| { + let args = vec![ + (*raster).clone(), + (*col_x).clone(), + (*row_y).clone(), + (*band).clone(), + ]; + invoke_udf(&udf, &args, arg_types).unwrap() + }) + }, + ); + + // Test at a different position + let col_x = int32_scalar(5); + let row_y = int32_scalar(5); + group.bench_with_input( + BenchmarkId::new("pos_5_5", raster_name), + &(&raster, &col_x, &row_y, &band, &arg_types), + |b, (raster, col_x, row_y, band, arg_types)| { + b.iter(|| { + let args = vec![ + (*raster).clone(), + (*col_x).clone(), + (*row_y).clone(), + (*band).clone(), + ]; + invoke_udf(&udf, &args, arg_types).unwrap() + }) + }, + ); + } + + group.finish(); +} + +fn bench_rs_value_bands(c: &mut Criterion) { + let udf: ScalarUDF = rs_value_udf().into(); + + // Arg types for grid-based RS_Value: (raster, col_x, row_y, band) + let arg_types = vec![ + RASTER, + SedonaType::Arrow(DataType::Int32), + SedonaType::Arrow(DataType::Int32), + 
SedonaType::Arrow(DataType::Int32), + ]; + + let mut group = c.benchmark_group("rs_value_bands"); + + let raster = load_raster_as_scalar("test4.tiff"); + let col_x = int32_scalar(0); + let row_y = int32_scalar(0); + + // Different band numbers + for band_num in &[1, 2, 3] { + let band = int32_scalar(*band_num); + + group.bench_with_input( + BenchmarkId::new("band", band_num), + &(&raster, &col_x, &row_y, &band, &arg_types), + |b, (raster, col_x, row_y, band, arg_types)| { + b.iter(|| { + let args = vec![ + (*raster).clone(), + (*col_x).clone(), + (*row_y).clone(), + (*band).clone(), + ]; + invoke_udf(&udf, &args, arg_types).unwrap() + }) + }, + ); + } + + group.finish(); +} + +fn bench_rs_value_batch(c: &mut Criterion) { + let udf: ScalarUDF = rs_value_udf().into(); + + // Arg types for grid-based RS_Value: (raster, col_x, row_y, band) + let arg_types = vec![ + RASTER, + SedonaType::Arrow(DataType::Int32), + SedonaType::Arrow(DataType::Int32), + SedonaType::Arrow(DataType::Int32), + ]; + + let mut group = c.benchmark_group("rs_value_batch"); + + // Batch processing: multiple rasters with same query position + for batch_size in &[10, 50, 100] { + let rasters = load_rasters_as_array("test4.tiff", *batch_size); + let col_x = int32_scalar(0); + let row_y = int32_scalar(0); + let band = int32_scalar(1); + + group.bench_with_input( + BenchmarkId::new("batch", batch_size), + &(&rasters, &arg_types), + |b, (rasters, arg_types)| { + b.iter(|| { + let args = vec![ + (*rasters).clone(), + col_x.clone(), + row_y.clone(), + band.clone(), + ]; + invoke_udf(&udf, &args, arg_types).unwrap() + }) + }, + ); + } + + group.finish(); +} + +criterion_group!( + benches, + bench_rs_value_grid, + bench_rs_value_bands, + bench_rs_value_batch +); +criterion_main!(benches); diff --git a/rust/sedona-raster-gdal/benches/rs_zonal_stats.rs b/rust/sedona-raster-gdal/benches/rs_zonal_stats.rs new file mode 100644 index 000000000..6b0c6b7c9 --- /dev/null +++ 
b/rust/sedona-raster-gdal/benches/rs_zonal_stats.rs @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmarks for RS_ZonalStats and RS_ZonalStatsAll UDFs +//! +//! RS_ZonalStats computes statistics for pixels within a geometry. +//! RS_ZonalStatsAll computes all statistics at once. +//! +//! NOTE: This benchmark requires geometry creation which needs additional setup. +//! The underlying GDAL segfault issue has been fixed. +//! TODO: Implement full benchmark once geometry utilities are available in bench context. 
+ +use criterion::{criterion_group, criterion_main, Criterion}; + +fn bench_rs_zonal_stats_placeholder(c: &mut Criterion) { + let mut group = c.benchmark_group("rs_zonal_stats"); + + // Placeholder benchmark - needs geometry creation utilities + // The underlying GDAL memory dataset issue has been fixed + group.bench_function("placeholder", |b| b.iter(|| std::hint::black_box(42))); + + group.finish(); +} + +criterion_group!(benches, bench_rs_zonal_stats_placeholder); +criterion_main!(benches); diff --git a/rust/sedona-raster-gdal/src/lib.rs b/rust/sedona-raster-gdal/src/lib.rs index 8e8c871fb..803906a82 100644 --- a/rust/sedona-raster-gdal/src/lib.rs +++ b/rust/sedona-raster-gdal/src/lib.rs @@ -25,21 +25,64 @@ //! - GDAL datatype and nodata conversion helpers //! - path normalization for GDAL VSI-backed raster sources -// Temporary until https://github.com/apache/sedona-db/issues/804 is resolved. -#[allow(dead_code)] -mod gdal_common; -// Temporary until https://github.com/apache/sedona-db/issues/804 is resolved. -#[allow(dead_code)] -mod gdal_dataset_provider; - -mod utils; +pub mod raster_band_reader; +pub mod rs_as_geotiff; +pub mod rs_as_raster; +pub mod rs_clip; +pub mod rs_from_gdal_raster; +pub mod rs_from_path; +pub mod rs_geotiff_tiles; +pub mod rs_map_algebra; +pub mod rs_metadata; +pub mod rs_polygonize; +pub mod rs_value; +pub mod rs_zonal_stats; +pub mod utils; #[cfg(test)] mod source_uri; +mod gdal_common; +mod gdal_dataset_provider; + // Re-export main dataset conversion functions pub use gdal_common::{ band_data_type_to_gdal, bytes_to_f64, gdal_to_band_data_type, gdal_type_byte_size, nodata_bytes_to_f64, nodata_f64_to_bytes, }; -pub use utils::{append_as_indb_raster, dataset_to_indb_raster}; +pub use utils::{append_as_indb_raster, dataset_to_indb_raster, load_as_indb_raster}; + +// Expose provider/cache initializers for callers that need GDAL datasets from a `RasterRef`. 
+// Crate-internal callers construct providers from an explicit `Gdal` plus `thread_local_cache()`. + +// Re-export UDF constructors +pub use rs_as_geotiff::{rs_as_geotiff_udf, CompressionType}; +pub use rs_as_raster::rs_as_raster_udf; +pub use rs_clip::rs_clip_udf; +pub use rs_from_gdal_raster::rs_from_gdal_raster_udf; +pub use rs_from_path::rs_from_path_udf; +pub use rs_map_algebra::rs_map_algebra_udf; +pub use rs_metadata::rs_metadata_udf; +pub use rs_polygonize::rs_polygonize_udf; +pub use rs_value::rs_value_udf; +pub use rs_zonal_stats::{rs_zonal_stats_all_udf, rs_zonal_stats_udf, StatType, ZonalStatistics}; + +// Re-export UDTF constructors +pub use rs_geotiff_tiles::rs_geotiff_tiles_udtf; + +/// Returns all GDAL-based raster UDFs +pub fn all_gdal_udfs() -> Vec { + vec![ + rs_from_path_udf().into(), + rs_from_gdal_raster_udf().into(), + rs_as_geotiff_udf().into(), + rs_as_raster_udf().into(), + rs_value_udf().into(), + rs_polygonize_udf().into(), + rs_clip_udf().into(), + rs_zonal_stats_udf().into(), + rs_zonal_stats_all_udf().into(), + rs_map_algebra_udf().into(), + rs_metadata_udf().into(), + ] +} diff --git a/rust/sedona-raster-gdal/src/raster_band_reader.rs b/rust/sedona-raster-gdal/src/raster_band_reader.rs new file mode 100644 index 000000000..8555f55cc --- /dev/null +++ b/rust/sedona-raster-gdal/src/raster_band_reader.rs @@ -0,0 +1,539 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use datafusion_common::{exec_datafusion_err, exec_err, DataFusionError, Result};
+use sedona_gdal::dataset::Dataset;
+use sedona_gdal::gdal::Gdal;
+use sedona_gdal::raster::rasterband::RasterBand;
+use sedona_gdal::raster::types::GdalDataType;
+use sedona_gdal::raster::types::GdalType;
+use sedona_raster::traits::RasterRef;
+use sedona_schema::raster::{BandDataType, StorageType};
+
+use crate::gdal_common::band_data_type_to_gdal;
+use crate::gdal_dataset_provider::{thread_local_provider, RasterDataset};
+
+/// Reads pixel values from one raster, band by band.
+///
+/// In-db bands are decoded directly from the band's little-endian byte buffer.
+/// Out-db bands are read through a GDAL dataset that is created on first use
+/// and then kept for the lifetime of the reader. Band indices are 1-based.
+pub(crate) struct RasterBandReader<'a> {
+    gdal: &'a Gdal,
+    raster: &'a dyn RasterRef,
+    /// Lazily created GDAL dataset backing `raster`.
+    dataset: Option<RasterDataset<'a>>,
+}
+
+impl<'a> RasterBandReader<'a> {
+    /// Creates a reader; no GDAL dataset is opened until one is required.
+    pub fn new(gdal: &'a Gdal, raster: &'a dyn RasterRef) -> Self {
+        Self {
+            gdal,
+            raster,
+            dataset: None,
+        }
+    }
+
+    /// Returns the cached dataset, creating it first when the raster has at
+    /// least one out-db band. Returns `None` for purely in-db rasters whose
+    /// dataset has not been created by an earlier read.
+    pub fn dataset(&mut self) -> Result<Option<&RasterDataset<'a>>> {
+        if self.dataset.is_none() && self.raster_has_outdb()? {
+            self.ensure_dataset()?;
+        }
+
+        Ok(self.dataset.as_ref())
+    }
+
+    /// Same as [`Self::dataset`], but exposes the underlying GDAL `Dataset`.
+    #[allow(unused)]
+    pub fn gdal_dataset(&mut self) -> Result<Option<&Dataset>> {
+        Ok(self.dataset()?.map(|dataset| dataset.as_dataset()))
+    }
+
+    /// Reads a single pixel of band `band_idx` as `f64`.
+    ///
+    /// # Errors
+    /// Fails when `(col, row)` is outside the raster, the band index is out of
+    /// range, or (out-db) the GDAL dataset cannot be read or has a different
+    /// size than the raster metadata.
+    pub fn read_pixel_f64(&mut self, band_idx: usize, col: usize, row: usize) -> Result<f64> {
+        let (width, height) = self.raster_size();
+        if col >= width || row >= height {
+            return exec_err!("Pixel coordinates out of bounds");
+        }
+
+        let (storage_type, data_type) = self.band_meta(band_idx)?;
+
+        match storage_type {
+            StorageType::InDb => {
+                let band = self.band_ref(band_idx)?;
+                let data = band.data();
+                read_pixel_from_bytes(data, row * width + col, &data_type)
+            }
+            StorageType::OutDbRef => {
+                let gdal_band = self.outdb_band(band_idx)?;
+                let buffer = gdal_band
+                    .read_as::<f64>((col as isize, row as isize), (1, 1), (1, 1), None)
+                    .map_err(|e| exec_datafusion_err!("Failed to read pixel: {e}"))?;
+                buffer
+                    .data()
+                    .first()
+                    .copied()
+                    .ok_or_else(|| exec_datafusion_err!("Failed to read pixel: empty buffer"))
+            }
+        }
+    }
+
+    /// Reads the whole band as row-major `f64` values.
+    pub fn read_band_f64(&mut self, band_idx: usize) -> Result<Vec<f64>> {
+        let (width, height) = self.raster_size();
+        let (storage_type, data_type) = self.band_meta(band_idx)?;
+
+        match storage_type {
+            StorageType::InDb => {
+                let band = self.band_ref(band_idx)?;
+                let data = band.data();
+                let values = (0..width * height)
+                    .map(|idx| read_pixel_from_bytes(data, idx, &data_type))
+                    .collect::<Result<Vec<f64>>>();
+                values
+            }
+            StorageType::OutDbRef => self.read_window_f64(band_idx, (0, 0), (width, height)),
+        }
+    }
+
+    /// Reads the window `offset`/`size` (both `(x, y)`) as row-major `f64` values.
+    ///
+    /// # Errors
+    /// Fails with "Window out of bounds" when the window does not fit inside
+    /// the raster, in addition to the band/dataset errors of the other readers.
+    #[allow(dead_code)]
+    pub fn read_window_f64(
+        &mut self,
+        band_idx: usize,
+        offset: (usize, usize),
+        size: (usize, usize),
+    ) -> Result<Vec<f64>> {
+        let (storage_type, data_type) = self.band_meta(band_idx)?;
+        let (width, height) = self.raster_size();
+        check_window(offset, size, (width, height))?;
+
+        let (xoff, yoff) = offset;
+        let (win_w, win_h) = size;
+
+        match storage_type {
+            StorageType::InDb => {
+                let band = self.band_ref(band_idx)?;
+                let data = band.data();
+                let mut result = Vec::with_capacity(win_w * win_h);
+                for row in yoff..yoff + win_h {
+                    let base = row * width + xoff;
+                    for idx in base..base + win_w {
+                        result.push(read_pixel_from_bytes(data, idx, &data_type)?);
+                    }
+                }
+                Ok(result)
+            }
+            StorageType::OutDbRef => {
+                let gdal_band = self.outdb_band(band_idx)?;
+                let buffer = gdal_band
+                    .read_as::<f64>(
+                        (xoff as isize, yoff as isize),
+                        (win_w, win_h),
+                        (win_w, win_h),
+                        None,
+                    )
+                    .map_err(|e| exec_datafusion_err!("Failed to read window: {e}"))?;
+                Ok(buffer.data().to_vec())
+            }
+        }
+    }
+
+    /// Reads the whole band as little-endian bytes in the band's native type.
+    ///
+    /// Out-db bands are validated against the raster size first, so a dataset
+    /// of a different size is reported as an error instead of being silently
+    /// truncated to `width * height` values.
+    pub fn read_band_bytes(&mut self, band_idx: usize) -> Result<Vec<u8>> {
+        let (width, height) = self.raster_size();
+        let (storage_type, data_type) = self.band_meta(band_idx)?;
+
+        match storage_type {
+            StorageType::InDb => {
+                let band = self.band_ref(band_idx)?;
+                Ok(band.data().to_vec())
+            }
+            StorageType::OutDbRef => {
+                let gdal_band = self.outdb_band(band_idx)?;
+                read_band_bytes_from_gdal(&gdal_band, data_type, width * height)
+            }
+        }
+    }
+
+    /// Reads the window `offset`/`size` as little-endian bytes in the band's
+    /// native type, row-major.
+    ///
+    /// # Errors
+    /// Fails when the window does not fit inside the raster, when an in-db
+    /// band buffer is shorter than the window requires, or on GDAL errors.
+    #[allow(dead_code)]
+    pub fn read_window_bytes(
+        &mut self,
+        band_idx: usize,
+        offset: (usize, usize),
+        size: (usize, usize),
+    ) -> Result<Vec<u8>> {
+        let (storage_type, data_type) = self.band_meta(band_idx)?;
+        let (width, height) = self.raster_size();
+        check_window(offset, size, (width, height))?;
+
+        let (xoff, yoff) = offset;
+        let (win_w, win_h) = size;
+
+        match storage_type {
+            StorageType::InDb => {
+                let band = self.band_ref(band_idx)?;
+                if win_w == 0 || win_h == 0 {
+                    return Ok(Vec::new());
+                }
+
+                let data = band.data();
+                let byte_size = band_data_type_size(&data_type);
+                let row_bytes = win_w * byte_size;
+                let mut result = Vec::with_capacity(win_h * row_bytes);
+                // Pixels of one window row are contiguous, so copy row by row.
+                for row in yoff..yoff + win_h {
+                    let start = (row * width + xoff) * byte_size;
+                    let row_slice = data.get(start..start + row_bytes).ok_or_else(|| {
+                        exec_datafusion_err!("Band data is too short for the requested window")
+                    })?;
+                    result.extend_from_slice(row_slice);
+                }
+                Ok(result)
+            }
+            StorageType::OutDbRef => {
+                let gdal_band = self.outdb_band(band_idx)?;
+                read_window_bytes_from_gdal(&gdal_band, data_type, offset, size)
+            }
+        }
+    }
+
+    /// Raster `(width, height)` taken from the raster metadata.
+    fn raster_size(&self) -> (usize, usize) {
+        let metadata = self.raster.metadata();
+        (metadata.width() as usize, metadata.height() as usize)
+    }
+
+    /// Returns the GDAL band for an out-db band after checking that the
+    /// dataset has the same size as the raster metadata.
+    fn outdb_band(&mut self, band_idx: usize) -> Result<RasterBand> {
+        let (raster_w, raster_h) = self.raster_size();
+
+        let dataset = self.ensure_dataset()?;
+        let gdal_band = dataset
+            .as_dataset()
+            .rasterband(band_idx)
+            .map_err(|e| exec_datafusion_err!("Failed to get band: {e}"))?;
+        let (src_width, src_height) = gdal_band.size();
+        if raster_w != src_width || raster_h != src_height {
+            return exec_err!(
+                "Out-db dataset size mismatch: raster=({raster_w},{raster_h}) dataset=({src_width},{src_height})"
+            );
+        }
+
+        Ok(gdal_band)
+    }
+
+    /// Creates the GDAL dataset on first call and returns the cached one afterwards.
+    fn ensure_dataset(&mut self) -> Result<&RasterDataset<'a>> {
+        if self.dataset.is_none() {
+            let provider = thread_local_provider(self.gdal)
+                .map_err(|e| exec_datafusion_err!("Failed to init GDAL provider: {e}"))?;
+            let dataset = provider
+                .raster_ref_to_gdal(self.raster)
+                .map_err(|e| exec_datafusion_err!("Failed to create GDAL dataset: {e}"))?;
+            self.dataset = Some(dataset);
+        }
+
+        Ok(self.dataset.as_ref().expect("dataset should be set"))
+    }
+
+    /// Whether any band of the raster is stored out-of-db.
+    fn raster_has_outdb(&self) -> Result<bool> {
+        let bands = self.raster.bands();
+        for idx in 1..=bands.len() {
+            let band = bands
+                .band(idx)
+                .map_err(|e| exec_datafusion_err!("Failed to get band {}: {e}", idx))?;
+            if band.metadata().storage_type()? == StorageType::OutDbRef {
+                return Ok(true);
+            }
+        }
+        Ok(false)
+    }
+
+    /// Looks up band `band_idx` (1-based), rejecting out-of-range indices.
+    // NOTE(review): the return type was lost in extraction and is restored by
+    // inference from `bands.band(..)` — confirm against `sedona_raster::traits`.
+    fn band_ref(&self, band_idx: usize) -> Result<Box<dyn sedona_raster::traits::BandRef + '_>> {
+        let bands = self.raster.bands();
+        if band_idx == 0 || band_idx > bands.len() {
+            return exec_err!("Band {} is out of range (1-{})", band_idx, bands.len());
+        }
+        bands
+            .band(band_idx)
+            .map_err(|e| exec_datafusion_err!("Failed to get band: {e}"))
+    }
+
+    /// Storage type and pixel data type of band `band_idx`.
+    fn band_meta(&self, band_idx: usize) -> Result<(StorageType, BandDataType)> {
+        let band = self.band_ref(band_idx)?;
+        let meta = band.metadata();
+        Ok((meta.storage_type()?, meta.data_type()?))
+    }
+}
+
+/// Checks that the window fits inside `extent` without overflowing `usize`.
+fn check_window(
+    offset: (usize, usize),
+    size: (usize, usize),
+    extent: (usize, usize),
+) -> Result<()> {
+    let x_end = offset.0.checked_add(size.0);
+    let y_end = offset.1.checked_add(size.1);
+    match (x_end, y_end) {
+        (Some(x_end), Some(y_end)) if x_end <= extent.0 && y_end <= extent.1 => Ok(()),
+        _ => exec_err!("Window out of bounds"),
+    }
+}
+
+/// Size in bytes of one pixel of `data_type`.
+fn band_data_type_size(data_type: &BandDataType) -> usize {
+    match data_type {
+        BandDataType::UInt8 | BandDataType::Int8 => 1,
+        BandDataType::UInt16 | BandDataType::Int16 => 2,
+        BandDataType::UInt32 | BandDataType::Int32 | BandDataType::Float32 => 4,
+        BandDataType::UInt64 | BandDataType::Int64 | BandDataType::Float64 => 8,
+    }
+}
+
+/// Copies the first `N` bytes of `bytes` into an array; callers pass exactly `N` bytes.
+fn le_array<const N: usize>(bytes: &[u8]) -> [u8; N] {
+    let mut out = [0u8; N];
+    out.copy_from_slice(&bytes[..N]);
+    out
+}
+
+/// Decodes the pixel at index `offset` (in pixels, not bytes) from a
+/// little-endian band buffer and widens it to `f64`.
+fn read_pixel_from_bytes(data: &[u8], offset: usize, data_type: &BandDataType) -> Result<f64> {
+    let byte_size = band_data_type_size(data_type);
+    let bytes = offset
+        .checked_mul(byte_size)
+        .and_then(|start| Some(start..start.checked_add(byte_size)?))
+        .and_then(|range| data.get(range))
+        .ok_or_else(|| exec_datafusion_err!("Pixel offset out of bounds"))?;
+
+    let value = match data_type {
+        BandDataType::UInt8 => bytes[0] as f64,
+        BandDataType::Int8 => (bytes[0] as i8) as f64,
+        BandDataType::UInt16 => u16::from_le_bytes(le_array(bytes)) as f64,
+        BandDataType::Int16 => i16::from_le_bytes(le_array(bytes)) as f64,
+        BandDataType::UInt32 => u32::from_le_bytes(le_array(bytes)) as f64,
+        BandDataType::Int32 => i32::from_le_bytes(le_array(bytes)) as f64,
+        BandDataType::UInt64 => u64::from_le_bytes(le_array(bytes)) as f64,
+        BandDataType::Int64 => i64::from_le_bytes(le_array(bytes)) as f64,
+        BandDataType::Float32 => f32::from_le_bytes(le_array(bytes)) as f64,
+        BandDataType::Float64 => f64::from_le_bytes(le_array(bytes)),
+    };
+
+    Ok(value)
+}
+
+/// Reads a whole GDAL band in its native type and returns little-endian bytes.
+fn read_band_bytes_from_gdal(
+    band: &RasterBand,
+    data_type: BandDataType,
+    pixel_count: usize,
+) -> Result<Vec<u8>> {
+    let gdal_type = band_data_type_to_gdal(&data_type);
+    match gdal_type {
+        GdalDataType::UInt8 => read_gdal_bytes::<u8>(band, pixel_count),
+        GdalDataType::Int8 => read_gdal_bytes::<i8>(band, pixel_count),
+        GdalDataType::UInt16 => read_gdal_bytes::<u16>(band, pixel_count),
+        GdalDataType::Int16 => read_gdal_bytes::<i16>(band, pixel_count),
+        GdalDataType::UInt32 => read_gdal_bytes::<u32>(band, pixel_count),
+        GdalDataType::Int32 => read_gdal_bytes::<i32>(band, pixel_count),
+        GdalDataType::UInt64 => read_gdal_bytes::<u64>(band, pixel_count),
+        GdalDataType::Int64 => read_gdal_bytes::<i64>(band, pixel_count),
+        GdalDataType::Float32 => read_gdal_bytes::<f32>(band, pixel_count),
+        GdalDataType::Float64 => read_gdal_bytes::<f64>(band, pixel_count),
+        _ => Err(DataFusionError::NotImplemented(
+            "Unsupported GDAL data type".to_string(),
+        )),
+    }
+}
+
+/// Reads a window of a GDAL band in its native type and returns little-endian bytes.
+#[allow(dead_code)]
+fn read_window_bytes_from_gdal(
+    band: &RasterBand,
+    data_type: BandDataType,
+    offset: (usize, usize),
+    size: (usize, usize),
+) -> Result<Vec<u8>> {
+    let gdal_type = band_data_type_to_gdal(&data_type);
+    match gdal_type {
+        GdalDataType::UInt8 => read_gdal_window_bytes::<u8>(band, offset, size),
+        GdalDataType::Int8 => read_gdal_window_bytes::<i8>(band, offset, size),
+        GdalDataType::UInt16 => read_gdal_window_bytes::<u16>(band, offset, size),
+        GdalDataType::Int16 => read_gdal_window_bytes::<i16>(band, offset, size),
+        GdalDataType::UInt32 => read_gdal_window_bytes::<u32>(band, offset, size),
+        GdalDataType::Int32 => read_gdal_window_bytes::<i32>(band, offset, size),
+        GdalDataType::UInt64 => read_gdal_window_bytes::<u64>(band, offset, size),
+        GdalDataType::Int64 => read_gdal_window_bytes::<i64>(band, offset, size),
+        GdalDataType::Float32 => read_gdal_window_bytes::<f32>(band, offset, size),
+        GdalDataType::Float64 => read_gdal_window_bytes::<f64>(band, offset, size),
+        _ => Err(DataFusionError::NotImplemented(
+            "Unsupported GDAL data type".to_string(),
+        )),
+    }
+}
+
+/// Reads the full band as `T` and encodes at most `pixel_count` values.
+fn read_gdal_bytes<T: GdalType + Copy + ToLeBytes>(
+    band: &RasterBand,
+    pixel_count: usize,
+) -> Result<Vec<u8>> {
+    let (width, height) = band.size();
+    let buffer = band
+        .read_as::<T>((0, 0), (width, height), (width, height), None)
+        .map_err(|e| exec_datafusion_err!("Failed to read band: {e}"))?;
+    let values = buffer.data();
+    Ok(encode_le(&values[..values.len().min(pixel_count)]))
+}
+
+/// Reads the window `offset`/`size` as `T` and encodes every value.
+#[allow(dead_code)]
+fn read_gdal_window_bytes<T: GdalType + Copy + ToLeBytes>(
+    band: &RasterBand,
+    offset: (usize, usize),
+    size: (usize, usize),
+) -> Result<Vec<u8>> {
+    let buffer = band
+        .read_as::<T>(
+            (offset.0 as isize, offset.1 as isize),
+            (size.0, size.1),
+            (size.0, size.1),
+            None,
+        )
+        .map_err(|e| exec_datafusion_err!("Failed to read window: {e}"))?;
+    Ok(encode_le(buffer.data()))
+}
+
+/// Encodes `values` as consecutive little-endian byte groups in one allocation.
+fn encode_le<T: ToLeBytes>(values: &[T]) -> Vec<u8> {
+    let mut out = Vec::with_capacity(std::mem::size_of_val(values));
+    for value in values {
+        value.write_le(&mut out);
+    }
+    out
+}
+
+/// Numeric types that can append their little-endian encoding to a buffer.
+///
+/// Appending into a shared buffer avoids allocating a `Vec` per pixel.
+trait ToLeBytes {
+    fn write_le(&self, out: &mut Vec<u8>);
+}
+
+macro_rules! impl_to_le_bytes {
+    ($($ty:ty),* $(,)?) => {
+        $(
+            impl ToLeBytes for $ty {
+                fn write_le(&self, out: &mut Vec<u8>) {
+                    out.extend_from_slice(&<$ty>::to_le_bytes(*self));
+                }
+            }
+        )*
+    };
+}
+
+impl_to_le_bytes!(u8, i8, u16, i16, u32, i32, u64, i64, f32, f64);
diff --git a/rust/sedona-raster-gdal/src/rs_as_geotiff.rs b/rust/sedona-raster-gdal/src/rs_as_geotiff.rs
new file mode 100644
index 000000000..e4a35d5c7
--- /dev/null
+++ b/rust/sedona-raster-gdal/src/rs_as_geotiff.rs
@@ -0,0 +1,519 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.
See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! RS_AsGeoTiff UDF - Export raster as GeoTiff binary
+//!
+//! Encodes each input raster as GeoTiff bytes. Supported overloads:
+//! - RS_AsGeoTiff(raster)
+//! - RS_AsGeoTiff(raster, tileSize)
+//! - RS_AsGeoTiff(raster, compressionType, imageQuality)
+//! - RS_AsGeoTiff(raster, compressionType, imageQuality, tileSize)
+//! - RS_AsGeoTiff(raster, compressionType, imageQuality, tileWidth, tileHeight)
+
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::sync::Arc;
+
+use crate::gdal_common::with_gdal;
+use arrow_array::builder::BinaryBuilder;
+use arrow_schema::DataType;
+use datafusion_common::cast::{as_float64_array, as_string_array, as_uint32_array};
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::error::Result;
+use datafusion_common::{exec_datafusion_err, ScalarValue};
+use datafusion_expr::{ColumnarValue, Volatility};
+use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF};
+use sedona_raster::array::RasterRefImpl;
+use sedona_raster_functions::RasterExecutor;
+use sedona_schema::datatypes::SedonaType;
+use sedona_schema::matchers::ArgMatcher;
+
+// Use thread-local provider to create GDAL datasets from `RasterRef`.
+use crate::gdal_dataset_provider::configure_thread_local_options;
+
+/// Counter for generating unique VSI memory file names
+static VSI_FILE_COUNTER: AtomicUsize = AtomicUsize::new(0);
+
+/// Compression types supported for GeoTiff output
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum CompressionType {
+    None,
+    PackBits,
+    Deflate,
+    Huffman,
+    Lzw,
+    Jpeg,
+}
+
+impl CompressionType {
+    /// Parse compression type from string (case-insensitive).
+    ///
+    /// Returns `None` for names that are not one of the supported types.
+    pub fn parse(s: &str) -> Option<Self> {
+        match s.to_lowercase().as_str() {
+            "none" => Some(CompressionType::None),
+            "packbits" => Some(CompressionType::PackBits),
+            "deflate" => Some(CompressionType::Deflate),
+            "huffman" => Some(CompressionType::Huffman),
+            "lzw" => Some(CompressionType::Lzw),
+            "jpeg" => Some(CompressionType::Jpeg),
+            _ => None,
+        }
+    }
+
+    /// Value passed to GDAL's `COMPRESS=` creation option.
+    pub fn gdal_value(&self) -> &'static str {
+        match self {
+            CompressionType::None => "NONE",
+            CompressionType::PackBits => "PACKBITS",
+            CompressionType::Deflate => "DEFLATE",
+            CompressionType::Huffman => "CCITTRLE",
+            CompressionType::Lzw => "LZW",
+            CompressionType::Jpeg => "JPEG",
+        }
+    }
+}
+
+/// RS_AsGeoTiff() scalar UDF implementation
+///
+/// Registers one kernel per supported overload; see the module documentation.
+pub fn rs_as_geotiff_udf() -> SedonaScalarUDF {
+    SedonaScalarUDF::new(
+        "rs_asgeotiff",
+        vec![
+            Arc::new(RsAsGeoTiff::new(Variant::Basic)),
+            Arc::new(RsAsGeoTiff::new(Variant::WithTileSize)),
+            Arc::new(RsAsGeoTiff::new(Variant::WithCompressionQuality)),
+            Arc::new(RsAsGeoTiff::new(Variant::WithCompressionQualityTileSize)),
+            Arc::new(RsAsGeoTiff::new(Variant::WithCompressionQualityTileWH)),
+        ],
+        Volatility::Immutable,
+    )
+}
+
+/// Variants for different overloads
+#[derive(Debug, Clone, Copy)]
+enum Variant {
+    /// `(raster)`
+    Basic,
+    /// `(raster, tileSize)`
+    WithTileSize,
+    /// `(raster, compression, quality)`
+    WithCompressionQuality,
+    /// `(raster, compression, quality, tileSize)`
+    WithCompressionQualityTileSize,
+    /// `(raster, compression, quality, tileWidth, tileHeight)`
+    WithCompressionQualityTileWH,
+}
+
+/// Kernel implementation for RS_AsGeoTiff
+#[derive(Debug)]
+struct RsAsGeoTiff {
+    variant: Variant,
+}
+
+impl RsAsGeoTiff {
+    fn new(variant: Variant) -> Self {
+        Self { variant }
+    }
+
+    /// Generate a unique VSI memory file path.
+    ///
+    /// Uniqueness comes from the process-wide counter; the thread id is only
+    /// included to make the path easier to attribute.
+    fn generate_vsi_path() -> String {
+        let counter = VSI_FILE_COUNTER.fetch_add(1, Ordering::SeqCst);
+        let thread_id = std::thread::current().id();
+        format!("/vsimem/rs_as_geotiff_{:?}_{}.tif", thread_id, counter)
+    }
+
+    /// Convert raster to GeoTiff bytes.
+    ///
+    /// * `compression` - `COMPRESS` creation option; GDAL's default when `None`.
+    /// * `quality` - JPEG quality as a fraction in `0.0..=1.0`; only used with
+    ///   [`CompressionType::Jpeg`].
+    /// * `tile_width` / `tile_height` - output is tiled only when both are set.
+    ///
+    /// The GeoTiff is written to a temporary `/vsimem/` file which is removed
+    /// again on every exit path, including failures.
+    fn raster_to_geotiff(
+        gdal: &sedona_gdal::gdal::Gdal,
+        raster: &RasterRefImpl,
+        compression: Option<CompressionType>,
+        quality: Option<f64>,
+        tile_width: Option<u32>,
+        tile_height: Option<u32>,
+    ) -> Result<Vec<u8>> {
+        let provider = crate::gdal_dataset_provider::thread_local_provider(gdal)
+            .map_err(|e| exec_datafusion_err!("Failed to init GDAL provider: {}", e))?;
+        let raster_ds = provider
+            .raster_ref_to_gdal(raster)
+            .map_err(|e| exec_datafusion_err!("Failed to create GDAL dataset: {}", e))?;
+        let source_dataset = raster_ds.as_dataset();
+
+        let driver = gdal
+            .get_driver_by_name("GTiff")
+            .map_err(|e| exec_datafusion_err!("Failed to get GTiff driver: {}", e))?;
+
+        // Build creation options as string list
+        let mut options_list: Vec<String> = Vec::new();
+
+        if let Some(comp) = compression {
+            options_list.push(format!("COMPRESS={}", comp.gdal_value()));
+
+            match comp {
+                CompressionType::Jpeg => {
+                    if let Some(q) = quality {
+                        // JPEG quality is 1-100, we receive 0.0-1.0
+                        let jpeg_quality = (q * 100.0).round() as i32;
+                        options_list.push(format!("JPEG_QUALITY={}", jpeg_quality.clamp(1, 100)));
+                    }
+                }
+                CompressionType::Deflate | CompressionType::Lzw => {
+                    // Horizontal differencing usually improves compression.
+                    // NOTE(review): applied regardless of the band data type —
+                    // confirm it is appropriate for floating-point bands.
+                    options_list.push("PREDICTOR=2".to_string());
+                }
+                CompressionType::None | CompressionType::PackBits | CompressionType::Huffman => {}
+            }
+        }
+
+        // Add tiling options
+        if let (Some(tw), Some(th)) = (tile_width, tile_height) {
+            options_list.push("TILED=YES".to_string());
+            options_list.push(format!("BLOCKXSIZE={}", tw));
+            options_list.push(format!("BLOCKYSIZE={}", th));
+        }
+
+        let options_refs: Vec<&str> = options_list.iter().map(|s| s.as_str()).collect();
+
+        // Generate VSI path for output
+        let vsi_path = Self::generate_vsi_path();
+
+        // Create copy to VSI memory file. A failed copy may leave a partial
+        // file behind, so unlink it before reporting the error.
+        let output_dataset = source_dataset
+            .create_copy(&driver, &vsi_path, &options_refs)
+            .map_err(|e| {
+                let _ = gdal.unlink_mem_file(&vsi_path);
+                exec_datafusion_err!("Failed to create GeoTiff: {}", e)
+            })?;
+
+        // Close the output dataset to flush data
+        drop(output_dataset);
+
+        // Read bytes from VSI memory file and clean up
+        let bytes = gdal.get_vsi_mem_file_bytes_owned(&vsi_path).map_err(|e| {
+            let _ = gdal.unlink_mem_file(&vsi_path);
+            exec_datafusion_err!("Failed to read GeoTiff bytes: {}", e)
+        })?;
+
+        // Clean up VSI file
+        let _ = gdal.unlink_mem_file(&vsi_path);
+
+        Ok(bytes)
+    }
+}
+
+impl SedonaScalarKernel for RsAsGeoTiff {
+    fn return_type(&self, args: &[SedonaType]) -> Result<Option<SedonaType>> {
+        let matchers = match self.variant {
+            Variant::Basic => vec![ArgMatcher::is_raster()],
+            Variant::WithTileSize => vec![
+                ArgMatcher::is_raster(),
+                ArgMatcher::is_integer(), // tileSize
+            ],
+            Variant::WithCompressionQuality => vec![
+                ArgMatcher::is_raster(),
+                ArgMatcher::is_string(),  // compressionType
+                ArgMatcher::is_numeric(), // imageQuality
+            ],
+            Variant::WithCompressionQualityTileSize => vec![
+                ArgMatcher::is_raster(),
+                ArgMatcher::is_string(),  // compressionType
+                ArgMatcher::is_numeric(), // imageQuality
+                ArgMatcher::is_integer(), // tileSize
+            ],
+            Variant::WithCompressionQualityTileWH => vec![
+                ArgMatcher::is_raster(),
+                ArgMatcher::is_string(),  // compressionType
+                ArgMatcher::is_numeric(), // imageQuality
+                ArgMatcher::is_integer(), // tileWidth
+                ArgMatcher::is_integer(), // tileHeight
+            ],
+        };
+
+        let matcher = ArgMatcher::new(matchers, SedonaType::Arrow(DataType::Binary));
+        matcher.match_args(args)
+    }
+
+    fn invoke_batch(
+        &self,
+        arg_types: &[SedonaType],
+        args: &[ColumnarValue],
+    ) -> Result<ColumnarValue> {
+        self.invoke_batch_from_args(arg_types, args, &SedonaType::Arrow(DataType::Null), 0, None)
+    }
+
+    fn invoke_batch_from_args(
+        &self,
+        arg_types: &[SedonaType],
+        args: &[ColumnarValue],
+        _return_type: &SedonaType,
+        _num_rows: usize,
+        config_options: Option<&ConfigOptions>,
+    ) -> Result<ColumnarValue> {
+        let executor = RasterExecutor::new(arg_types, args);
+        let num_iterations = executor.num_iterations();
+
+        // Position of each optional parameter in `args` for this overload.
+        // A single tileSize argument feeds both tile width and tile height.
+        let (compression_idx, quality_idx, tile_width_idx, tile_height_idx) = match self.variant {
+            Variant::Basic => (None, None, None, None),
+            Variant::WithTileSize => (None, None, Some(1), Some(1)),
+            Variant::WithCompressionQuality => (Some(1), Some(2), None, None),
+            Variant::WithCompressionQualityTileSize => (Some(1), Some(2), Some(3), Some(3)),
+            Variant::WithCompressionQualityTileWH => (Some(1), Some(2), Some(3), Some(4)),
+        };
+
+        // Materialize a parameter as an array of `num_iterations` values:
+        // the cast argument when the overload has it, all-null otherwise.
+        let column = |idx: Option<usize>,
+                      data_type: DataType,
+                      null: ScalarValue|
+         -> Result<arrow_array::ArrayRef> {
+            match idx {
+                Some(i) => args[i]
+                    .clone()
+                    .cast_to(&data_type, None)?
+                    .into_array(num_iterations),
+                None => null.to_array_of_size(num_iterations),
+            }
+        };
+
+        let compression_array = column(compression_idx, DataType::Utf8, ScalarValue::Utf8(None))?;
+        let quality_array = column(quality_idx, DataType::Float64, ScalarValue::Float64(None))?;
+        let tile_width_array =
+            column(tile_width_idx, DataType::UInt32, ScalarValue::UInt32(None))?;
+        let tile_height_array = if tile_height_idx == tile_width_idx {
+            // Same source argument (or both absent): share the array.
+            tile_width_array.clone()
+        } else {
+            column(tile_height_idx, DataType::UInt32, ScalarValue::UInt32(None))?
+        };
+
+        // Downcast all parameter arrays once before the loop
+        let compression_array = as_string_array(&compression_array)?;
+        let quality_array = as_float64_array(&quality_array)?;
+        let tile_width_array = as_uint32_array(&tile_width_array)?;
+        let tile_height_array = as_uint32_array(&tile_height_array)?;
+
+        // Create iterators for each parameter array
+        let mut compression_iter = compression_array.iter();
+        let mut quality_iter = quality_array.iter();
+        let mut tile_width_iter = tile_width_array.iter();
+        let mut tile_height_iter = tile_height_array.iter();
+
+        // Build output binary array
+        let mut builder = BinaryBuilder::with_capacity(num_iterations, num_iterations * 1024);
+
+        with_gdal(|gdal| {
+            configure_thread_local_options(gdal, config_options)?;
+            executor.execute_raster_void(|_i, raster_opt| {
+                // Advance every parameter iterator on each row, including rows
+                // with a null raster, so they stay aligned with the executor.
+                let compression_opt = compression_iter.next().unwrap();
+                let quality = quality_iter.next().unwrap();
+                let tile_width = tile_width_iter.next().unwrap();
+                let tile_height = tile_height_iter.next().unwrap();
+
+                let raster = match raster_opt {
+                    Some(raster) => raster,
+                    None => {
+                        builder.append_null();
+                        return Ok(());
+                    }
+                };
+
+                let compression = compression_opt
+                    .map(|comp_str| {
+                        CompressionType::parse(comp_str).ok_or_else(|| {
+                            exec_datafusion_err!(
+                                "Unknown compression type: {}. Valid values: None, PackBits, Deflate, Huffman, LZW, JPEG",
+                                comp_str
+                            )
+                        })
+                    })
+                    .transpose()?;
+
+                let bytes = Self::raster_to_geotiff(
+                    gdal,
+                    raster,
+                    compression,
+                    quality,
+                    tile_width,
+                    tile_height,
+                )?;
+                builder.append_value(&bytes);
+
+                Ok(())
+            })?;
+
+            executor.finish(Arc::new(builder.finish()))
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use sedona_raster::traits::RasterRef;
+
+    #[test]
+    fn test_compression_type_parse() {
+        assert_eq!(CompressionType::parse("none"), Some(CompressionType::None));
+        assert_eq!(CompressionType::parse("NONE"), Some(CompressionType::None));
+        assert_eq!(
+            CompressionType::parse("deflate"),
+            Some(CompressionType::Deflate)
+        );
+        assert_eq!(
+            CompressionType::parse("DEFLATE"),
+            Some(CompressionType::Deflate)
+        );
+        assert_eq!(CompressionType::parse("lzw"), Some(CompressionType::Lzw));
+        assert_eq!(CompressionType::parse("jpeg"), Some(CompressionType::Jpeg));
+        assert_eq!(CompressionType::parse("invalid"), None);
+    }
+
+    #[test]
+    fn test_generate_vsi_path() {
+        let path1 = RsAsGeoTiff::generate_vsi_path();
+        let path2 = RsAsGeoTiff::generate_vsi_path();
+
+        assert!(path1.starts_with("/vsimem/rs_as_geotiff_"));
+        assert!(path1.ends_with(".tif"));
+        assert!(path2.starts_with("/vsimem/rs_as_geotiff_"));
+        assert_ne!(path1, path2);
+    }
+
+    #[test]
+    fn udf_as_geotiff() {
+        let udf: datafusion_expr::ScalarUDF = rs_as_geotiff_udf().into();
+        assert_eq!(udf.name(), "rs_asgeotiff");
+    }
+
+    #[test]
+    fn test_roundtrip_geotiff() {
+        use crate::rs_from_gdal_raster::RsFromGDALRaster;
+        use sedona_raster::array::RasterStructArray;
+        use sedona_testing::data::test_raster;
+
+        // Load test4.tiff as in-db raster
+        let path = test_raster("test4.tiff").expect("test4.tiff should exist");
+        with_gdal(|gdal| {
+            let raster_arr = crate::utils::load_as_indb_raster(gdal, &path)?;
+            let raster_array = RasterStructArray::new(&raster_arr);
+            assert_eq!(raster_array.len(), 1);
+            let raster = raster_array.get(0).expect("Should get raster");
+
+            let geotiff_bytes =
+                RsAsGeoTiff::raster_to_geotiff(gdal, &raster, None, None, None, None)?;
+            assert!(geotiff_bytes.len() > 4, "GeoTiff should have content");
+            assert!(
+                &geotiff_bytes[0..2] == b"II" || &geotiff_bytes[0..2] == b"MM",
+                "Should be valid TIFF header"
+            );
+
+            let roundtrip_arr = RsFromGDALRaster::parse_gdal_raster(gdal, &geotiff_bytes)?;
+            let roundtrip_array = RasterStructArray::new(&roundtrip_arr);
+            let roundtrip_raster = roundtrip_array.get(0).expect("Should get roundtrip raster");
+
+            assert_eq!(
+                roundtrip_raster.metadata().width(),
+                raster.metadata().width()
+            );
+            assert_eq!(
+                roundtrip_raster.metadata().height(),
+                raster.metadata().height()
+            );
+            assert_eq!(roundtrip_raster.bands().len(), raster.bands().len());
+            Ok::<_, datafusion_common::DataFusionError>(())
+        })
+        .expect("Should roundtrip GeoTiff");
+    }
+
+    #[test]
+    fn test_geotiff_with_compression() {
+        use sedona_raster::array::RasterStructArray;
+        use sedona_testing::data::test_raster;
+
+        // Load test raster
+        let path = test_raster("test4.tiff").expect("test4.tiff should exist");
+        with_gdal(|gdal| {
+            let raster_arr = crate::utils::load_as_indb_raster(gdal, &path)?;
+            let raster_array = RasterStructArray::new(&raster_arr);
+            let raster = raster_array.get(0).expect("Should get raster");
+
+            // The quality argument is ignored for non-JPEG compression.
+            let lzw_bytes = RsAsGeoTiff::raster_to_geotiff(
+                gdal,
+                &raster,
+                Some(CompressionType::Lzw),
+                Some(75.0),
+                None,
+                None,
+            )?;
+            assert!(
+                !lzw_bytes.is_empty(),
+                "LZW compressed GeoTiff should have content"
+            );
+
+            let deflate_bytes = RsAsGeoTiff::raster_to_geotiff(
+                gdal,
+                &raster,
+                Some(CompressionType::Deflate),
+                Some(6.0),
+                None,
+                None,
+            )?;
+            assert!(
+                !deflate_bytes.is_empty(),
+                "DEFLATE compressed GeoTiff should have content"
+            );
+            assert!(&lzw_bytes[0..2] == b"II" || &lzw_bytes[0..2] == b"MM");
+            assert!(&deflate_bytes[0..2] == b"II" || &deflate_bytes[0..2] == b"MM");
+            Ok::<_, datafusion_common::DataFusionError>(())
+        })
+        .expect("Should convert with LZW and DEFLATE compression");
+    }
+}
diff --git a/rust/sedona-raster-gdal/src/rs_as_raster.rs b/rust/sedona-raster-gdal/src/rs_as_raster.rs
new file mode 100644
index 000000000..6c02a3fa2
--- /dev/null
+++ b/rust/sedona-raster-gdal/src/rs_as_raster.rs
@@ -0,0 +1,714 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! RS_AsRaster UDF - Rasterize a vector geometry onto a raster grid.
+//!
+//! RS_AsRaster converts a vector geometry into a raster dataset by assigning a
+//! specified value to all pixels covered by the geometry.
+ +use std::sync::Arc; + +use arrow_array::Array; +use datafusion_common::cast::{ + as_binary_array, as_boolean_array, as_float64_array, as_string_array, +}; +use datafusion_common::config::ConfigOptions; +use datafusion_common::error::Result; +use datafusion_common::{exec_datafusion_err, exec_err, ScalarValue}; +use datafusion_expr::{ColumnarValue, Volatility}; +use sedona_gdal::dataset::Dataset; +use sedona_gdal::gdal::Gdal; +use sedona_gdal::mem::MemDatasetBuilder; +use sedona_gdal::raster::types::Buffer; + +use arrow_schema::DataType; +use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF}; +use sedona_raster::array::RasterRefImpl; +use sedona_raster::builder::RasterBuilder; +use sedona_raster::traits::{BandMetadata, RasterMetadata, RasterRef}; +use sedona_raster_functions::RasterExecutor; +use sedona_schema::datatypes::{SedonaType, RASTER}; +use sedona_schema::matchers::ArgMatcher; +use sedona_schema::raster::{BandDataType, StorageType}; + +use crate::gdal_common::{band_data_type_to_gdal, nodata_f64_to_bytes, with_gdal}; +use crate::gdal_dataset_provider::configure_thread_local_options; + +/// RS_AsRaster() scalar UDF implementation +pub fn rs_as_raster_udf() -> SedonaScalarUDF { + SedonaScalarUDF::new( + "rs_asraster", + vec![ + Arc::new(RsAsRaster { arg_count: 3 }), + Arc::new(RsAsRaster { arg_count: 4 }), + Arc::new(RsAsRaster { arg_count: 5 }), + Arc::new(RsAsRaster { arg_count: 6 }), + Arc::new(RsAsRaster { arg_count: 7 }), + ], + Volatility::Immutable, + ) +} + +#[derive(Debug)] +struct RsAsRaster { + /// Number of arguments in the matched signature (3..=7) + arg_count: usize, +} + +impl SedonaScalarKernel for RsAsRaster { + fn return_type(&self, args: &[SedonaType]) -> Result> { + let mut matchers = vec![ + ArgMatcher::is_geometry_or_geography(), + ArgMatcher::is_raster(), + ArgMatcher::is_string(), + ]; + + if self.arg_count >= 4 { + matchers.push(ArgMatcher::is_boolean()); + } + if self.arg_count >= 5 { + 
matchers.push(ArgMatcher::is_numeric()); + } + if self.arg_count >= 6 { + matchers.push(ArgMatcher::is_numeric()); + } + if self.arg_count >= 7 { + matchers.push(ArgMatcher::is_boolean()); + } + + let matcher = ArgMatcher::new(matchers, RASTER); + matcher.match_args(args) + } + + fn invoke_batch( + &self, + arg_types: &[SedonaType], + args: &[ColumnarValue], + ) -> Result { + self.invoke_batch_from_args(arg_types, args, &SedonaType::Arrow(DataType::Null), 0, None) + } + + fn invoke_batch_from_args( + &self, + arg_types: &[SedonaType], + args: &[ColumnarValue], + _return_type: &SedonaType, + _num_rows: usize, + config_options: Option<&ConfigOptions>, + ) -> Result { + let num_iterations = calc_num_iterations(args); + + // Convert all non-raster/non-geometry args to arrays upfront via into_array + // arg[2]: pixelType (always present, string) + let pixel_type_array = args[2] + .clone() + .cast_to(&DataType::Utf8, None)? + .into_array(num_iterations)?; + let pixel_type_array = as_string_array(&pixel_type_array)?; + + // arg[3]: all_touched (if arg_count >= 4, boolean; default false) + let all_touched_array = if self.arg_count >= 4 { + args[3] + .clone() + .cast_to(&DataType::Boolean, None)? + .into_array(num_iterations)? + } else { + ScalarValue::Boolean(Some(false)).to_array_of_size(num_iterations)? + }; + let all_touched_array = as_boolean_array(&all_touched_array)?; + + // arg[4]: burn_value (if arg_count >= 5, numeric -> f64; default 1.0) + let burn_value_array = if self.arg_count >= 5 { + args[4] + .clone() + .cast_to(&DataType::Float64, None)? + .into_array(num_iterations)? + } else { + ScalarValue::Float64(Some(1.0)).to_array_of_size(num_iterations)? + }; + let burn_value_array = as_float64_array(&burn_value_array)?; + + // arg[5]: nodata_value (if arg_count >= 6, numeric -> f64; default None) + let nodata_value_array = if self.arg_count >= 6 { + args[5] + .clone() + .cast_to(&DataType::Float64, None)? + .into_array(num_iterations)? 
+ } else { + ScalarValue::Float64(None).to_array_of_size(num_iterations)? + }; + let nodata_value_array = as_float64_array(&nodata_value_array)?; + + // arg[6]: use_geometry_extent (if arg_count >= 7, boolean; default true) + let use_geom_extent_array = if self.arg_count >= 7 { + args[6] + .clone() + .cast_to(&DataType::Boolean, None)? + .into_array(num_iterations)? + } else { + ScalarValue::Boolean(Some(true)).to_array_of_size(num_iterations)? + }; + let use_geom_extent_array = as_boolean_array(&use_geom_extent_array)?; + + // Convert geometry (arg[0]) to binary array + let geom_array = args[0].clone().into_array(num_iterations)?; + let geom_array = as_binary_array(&geom_array)?; + let mut geom_iter = geom_array.iter(); + + let mut pixel_type_iter = pixel_type_array.iter(); + let mut all_touched_iter = all_touched_array.iter(); + let mut burn_value_iter = burn_value_array.iter(); + let mut nodata_value_iter = nodata_value_array.iter(); + let mut use_geom_extent_iter = use_geom_extent_array.iter(); + + let mut builder = RasterBuilder::new(num_iterations); + + // Raster is at arg[1] — create executor with raster-only subset + let exec_arg_types = vec![arg_types[1].clone()]; + let exec_args = vec![args[1].clone()]; + let executor = + RasterExecutor::new_with_num_iterations(&exec_arg_types, &exec_args, num_iterations); + + with_gdal(|gdal| { + configure_thread_local_options(gdal, config_options)?; + executor.execute_raster_void(|_i, raster_opt| { + let geom_opt = geom_iter.next().unwrap(); + let pixel_type_opt = pixel_type_iter.next().unwrap(); + let all_touched_opt = all_touched_iter.next().unwrap(); + let burn_value_opt = burn_value_iter.next().unwrap(); + let nodata_value_opt = nodata_value_iter.next().unwrap(); + let use_geom_extent_opt = use_geom_extent_iter.next().unwrap(); + + let raster = match raster_opt { + Some(r) => r, + None => { + builder.append_null()?; + return Ok(()); + } + }; + let geom_wkb = match geom_opt { + Some(g) => g, + None => { + 
builder.append_null()?; + return Ok(()); + } + }; + let pixel_type_str = match pixel_type_opt { + Some(s) => s, + None => { + builder.append_null()?; + return Ok(()); + } + }; + + let band_type = parse_pixel_type(pixel_type_str)?; + let all_touched = all_touched_opt.unwrap_or(false); + let burn_value = burn_value_opt.unwrap_or(1.0); + let nodata_value = nodata_value_opt; + let use_geometry_extent = use_geom_extent_opt.unwrap_or(true); + + match as_raster( + gdal, + geom_wkb, + raster, + band_type, + all_touched, + burn_value, + nodata_value, + use_geometry_extent, + ) { + Ok((out_metadata, out_band_metadata, out_band_bytes)) => { + builder + .start_raster(&out_metadata, raster.crs()) + .map_err(|e| { + exec_datafusion_err!("Failed to start output raster: {}", e) + })?; + + builder.start_band(out_band_metadata).map_err(|e| { + exec_datafusion_err!("Failed to start output raster band: {}", e) + })?; + + builder.band_data_writer().append_value(&out_band_bytes); + builder.finish_band().map_err(|e| { + exec_datafusion_err!("Failed to finish output raster band: {}", e) + })?; + + builder.finish_raster().map_err(|e| { + exec_datafusion_err!("Failed to finish output raster: {}", e) + })?; + } + Err(e) => { + eprintln!("RS_AsRaster error: {}", e); + builder.append_null()?; + } + } + + Ok(()) + })?; + + // Use finish_result to check ALL original args for scalar/array decision, + // since the executor only has the raster subset. 
+ finish_result(args, Arc::new(builder.finish()?)) + }) + } +} + +fn parse_pixel_type(s: &str) -> Result { + match s.trim().to_ascii_uppercase().as_str() { + "D" => Ok(BandDataType::Float64), + "F" => Ok(BandDataType::Float32), + "I" => Ok(BandDataType::Int32), + "S" => Ok(BandDataType::Int16), + "US" => Ok(BandDataType::UInt16), + "B" => Ok(BandDataType::UInt8), + "I8" | "INT8" => Ok(BandDataType::Int8), + "U64" | "UINT64" => Ok(BandDataType::UInt64), + "I64" | "INT64" => Ok(BandDataType::Int64), + other => exec_err!( + "Unsupported pixelType: {} (expected one of D, F, I, S, US, B, I8, U64, I64)", + other + ), + } +} + +#[allow(clippy::too_many_arguments)] +fn as_raster( + gdal: &Gdal, + geom_wkb: &[u8], + reference_raster: &RasterRefImpl<'_>, + band_type: BandDataType, + all_touched: bool, + burn_value: f64, + nodata_value: Option, + use_geometry_extent: bool, +) -> Result<(RasterMetadata, BandMetadata, Vec)> { + let ref_md = reference_raster.metadata(); + + if ref_md.skew_x() != 0.0 || ref_md.skew_y() != 0.0 { + return exec_err!( + "RS_AsRaster currently requires skew_x=0 and skew_y=0 in the reference raster" + ); + } + + // Parse geometry + let geometry = gdal + .geometry_from_wkb(geom_wkb) + .map_err(|e| exec_datafusion_err!("Failed to parse geometry from WKB: {}", e))?; + + // Compute output grid + let (out_width, out_height, out_ulx, out_uly) = if use_geometry_extent { + let env = geometry.envelope(); + let ulx = ref_md.upper_left_x(); + let uly = ref_md.upper_left_y(); + let scale_x = ref_md.scale_x(); + let scale_y = ref_md.scale_y(); + + if scale_x == 0.0 || scale_y == 0.0 { + return exec_err!("Reference raster has zero scale"); + } + + let start_col = ((env.MinX - ulx) / scale_x).floor() as isize; + let end_col_excl = ((env.MaxX - ulx) / scale_x).ceil() as isize; + + // Note: scale_y is typically negative. 
+ let start_row = ((env.MaxY - uly) / scale_y).floor() as isize; + let end_row_excl = ((env.MinY - uly) / scale_y).ceil() as isize; + + let width = (end_col_excl - start_col).max(0) as usize; + let height = (end_row_excl - start_row).max(0) as usize; + + if width == 0 || height == 0 { + return exec_err!("Geometry extent produced an empty raster"); + } + + let out_ulx = ulx + (start_col as f64) * scale_x; + let out_uly = uly + (start_row as f64) * scale_y; + (width, height, out_ulx, out_uly) + } else { + ( + ref_md.width() as usize, + ref_md.height() as usize, + ref_md.upper_left_x(), + ref_md.upper_left_y(), + ) + }; + + // Create output GDAL dataset + let gdal_type = band_data_type_to_gdal(&band_type); + let out_dataset = MemDatasetBuilder::create(gdal, out_width, out_height, 1, gdal_type) + .map_err(|e| exec_datafusion_err!("Failed to create dataset: {}", e))?; + + let geotransform = [ + out_ulx, + ref_md.scale_x(), + ref_md.skew_x(), + out_uly, + ref_md.skew_y(), + ref_md.scale_y(), + ]; + out_dataset + .set_geo_transform(&geotransform) + .map_err(|e| exec_datafusion_err!("Failed to set geotransform: {}", e))?; + + // Set spatial reference based on reference raster dataset (if present) + let provider = crate::gdal_dataset_provider::thread_local_provider(gdal) + .map_err(|e| exec_datafusion_err!("Failed to init GDAL provider: {}", e))?; + let ref_raster_ds = provider + .raster_ref_to_gdal(reference_raster) + .map_err(|e| exec_datafusion_err!("Failed to create GDAL dataset: {}", e))?; + if let Ok(srs) = ref_raster_ds.as_dataset().spatial_ref() { + out_dataset + .set_spatial_ref(&srs) + .map_err(|e| exec_datafusion_err!("Failed to set spatial reference: {}", e))?; + } + + // Initialize output band to nodata (if provided) or 0 + let init_value = nodata_value.unwrap_or(0.0); + initialize_band(&out_dataset, &band_type, out_width, out_height, init_value)?; + + // Set nodata metadata on band + if let Some(nodata) = nodata_value { + let band = out_dataset + 
.rasterband(1) + .map_err(|e| exec_datafusion_err!("Failed to get output band: {}", e))?; + match band_type { + BandDataType::UInt64 => { + band.set_no_data_value_u64(Some(nodata as u64)) + .map_err(|e| exec_datafusion_err!("Failed to set nodata value: {}", e))?; + } + BandDataType::Int64 => { + band.set_no_data_value_i64(Some(nodata as i64)) + .map_err(|e| exec_datafusion_err!("Failed to set nodata value: {}", e))?; + } + _ => band + .set_no_data_value(Some(nodata)) + .map_err(|e| exec_datafusion_err!("Failed to set nodata value: {}", e))?, + } + } + + gdal.rasterize_affine(&out_dataset, &[1], &[geometry], &[burn_value], all_touched) + .map_err(|e| exec_datafusion_err!("Failed to rasterize geometry: {}", e))?; + + // Read band data as bytes + let band_bytes = read_band_as_bytes(&out_dataset, 1, out_width, out_height, &band_type)?; + + let out_metadata = RasterMetadata { + width: out_width as u64, + height: out_height as u64, + upperleft_x: out_ulx, + upperleft_y: out_uly, + scale_x: ref_md.scale_x(), + scale_y: ref_md.scale_y(), + skew_x: ref_md.skew_x(), + skew_y: ref_md.skew_y(), + }; + + let out_band_metadata = BandMetadata { + nodata_value: nodata_value.map(|v| nodata_f64_to_bytes(v, &band_type)), + storage_type: StorageType::InDb, + datatype: band_type, + outdb_url: None, + outdb_band_id: None, + }; + + Ok((out_metadata, out_band_metadata, band_bytes)) +} + +fn initialize_band( + dataset: &Dataset, + band_type: &BandDataType, + width: usize, + height: usize, + init_value: f64, +) -> Result<()> { + match band_type { + BandDataType::UInt8 => initialize_band_t::(dataset, width, height, init_value as u8), + BandDataType::Int8 => initialize_band_t::(dataset, width, height, init_value as i8), + BandDataType::UInt16 => initialize_band_t::(dataset, width, height, init_value as u16), + BandDataType::Int16 => initialize_band_t::(dataset, width, height, init_value as i16), + BandDataType::UInt32 => initialize_band_t::(dataset, width, height, init_value as u32), + 
BandDataType::Int32 => initialize_band_t::(dataset, width, height, init_value as i32), + BandDataType::UInt64 => initialize_band_t::(dataset, width, height, init_value as u64), + BandDataType::Int64 => initialize_band_t::(dataset, width, height, init_value as i64), + BandDataType::Float32 => { + initialize_band_t::(dataset, width, height, init_value as f32) + } + BandDataType::Float64 => initialize_band_t::(dataset, width, height, init_value), + } +} + +fn initialize_band_t( + dataset: &Dataset, + width: usize, + height: usize, + init_value: T, +) -> Result<()> { + let band = dataset + .rasterband(1) + .map_err(|e| exec_datafusion_err!("Failed to get output band: {}", e))?; + + let values = vec![init_value; width * height]; + let mut buffer = Buffer::new((width, height), values); + band.write((0, 0), (width, height), &mut buffer) + .map_err(|e| exec_datafusion_err!("Failed to initialize band: {}", e))?; + + Ok(()) +} + +fn read_band_as_bytes( + dataset: &Dataset, + band_idx: usize, + width: usize, + height: usize, + band_type: &BandDataType, +) -> Result> { + let band = dataset + .rasterband(band_idx) + .map_err(|e| exec_datafusion_err!("Failed to get band {}: {}", band_idx, e))?; + + let data = match band_type { + BandDataType::UInt8 => { + let buffer = band + .read_as::((0, 0), (width, height), (width, height), None) + .map_err(|e| { + exec_datafusion_err!("Failed to read band {} data: {}", band_idx, e) + })?; + buffer.data().to_vec() + } + BandDataType::Int8 => { + let buffer = band + .read_as::((0, 0), (width, height), (width, height), None) + .map_err(|e| { + exec_datafusion_err!("Failed to read band {} data: {}", band_idx, e) + })?; + buffer.data().iter().map(|v| *v as u8).collect() + } + BandDataType::UInt16 => { + let buffer = band + .read_as::((0, 0), (width, height), (width, height), None) + .map_err(|e| { + exec_datafusion_err!("Failed to read band {} data: {}", band_idx, e) + })?; + buffer.data().iter().flat_map(|v| v.to_le_bytes()).collect() + } + 
BandDataType::Int16 => { + let buffer = band + .read_as::((0, 0), (width, height), (width, height), None) + .map_err(|e| { + exec_datafusion_err!("Failed to read band {} data: {}", band_idx, e) + })?; + buffer.data().iter().flat_map(|v| v.to_le_bytes()).collect() + } + BandDataType::UInt32 => { + let buffer = band + .read_as::((0, 0), (width, height), (width, height), None) + .map_err(|e| { + exec_datafusion_err!("Failed to read band {} data: {}", band_idx, e) + })?; + buffer.data().iter().flat_map(|v| v.to_le_bytes()).collect() + } + BandDataType::Int32 => { + let buffer = band + .read_as::((0, 0), (width, height), (width, height), None) + .map_err(|e| { + exec_datafusion_err!("Failed to read band {} data: {}", band_idx, e) + })?; + buffer.data().iter().flat_map(|v| v.to_le_bytes()).collect() + } + BandDataType::UInt64 => { + let buffer = band + .read_as::((0, 0), (width, height), (width, height), None) + .map_err(|e| { + exec_datafusion_err!("Failed to read band {} data: {}", band_idx, e) + })?; + buffer.data().iter().flat_map(|v| v.to_le_bytes()).collect() + } + BandDataType::Int64 => { + let buffer = band + .read_as::((0, 0), (width, height), (width, height), None) + .map_err(|e| { + exec_datafusion_err!("Failed to read band {} data: {}", band_idx, e) + })?; + buffer.data().iter().flat_map(|v| v.to_le_bytes()).collect() + } + BandDataType::Float32 => { + let buffer = band + .read_as::((0, 0), (width, height), (width, height), None) + .map_err(|e| { + exec_datafusion_err!("Failed to read band {} data: {}", band_idx, e) + })?; + buffer.data().iter().flat_map(|v| v.to_le_bytes()).collect() + } + BandDataType::Float64 => { + let buffer = band + .read_as::((0, 0), (width, height), (width, height), None) + .map_err(|e| { + exec_datafusion_err!("Failed to read band {} data: {}", band_idx, e) + })?; + buffer.data().iter().flat_map(|v| v.to_le_bytes()).collect() + } + }; + + Ok(data) +} + +// ----------------------------------------------------------------------------- 
+// ColumnarValue helpers +// ----------------------------------------------------------------------------- + +fn calc_num_iterations(args: &[ColumnarValue]) -> usize { + for arg in args { + if let ColumnarValue::Array(array) = arg { + return array.len(); + } + } + 1 +} + +fn finish_result(args: &[ColumnarValue], out: Arc) -> Result { + for arg in args { + if let ColumnarValue::Array(_) = arg { + return Ok(ColumnarValue::Array(out)); + } + } + Ok(ColumnarValue::Scalar(ScalarValue::try_from_array(&out, 0)?)) +} + +#[cfg(test)] +mod tests { + use super::*; + use sedona_raster::array::RasterStructArray; + + fn wkb_from_wkt(gdal: &sedona_gdal::gdal::Gdal, wkt: &str) -> Result> { + let geom = gdal.geometry_from_wkt(wkt).unwrap(); + geom.wkb().map_err(|e| exec_datafusion_err!("{e}")) + } + + fn bytes_to_f64_vec(bytes: &[u8]) -> Vec { + bytes + .chunks_exact(8) + .map(|c| f64::from_le_bytes(c.try_into().unwrap())) + .collect() + } + + #[test] + fn test_parse_pixel_type() { + assert_eq!(parse_pixel_type("D").unwrap(), BandDataType::Float64); + assert_eq!(parse_pixel_type("f").unwrap(), BandDataType::Float32); + assert_eq!(parse_pixel_type("I").unwrap(), BandDataType::Int32); + assert_eq!(parse_pixel_type("S").unwrap(), BandDataType::Int16); + assert_eq!(parse_pixel_type("US").unwrap(), BandDataType::UInt16); + assert_eq!(parse_pixel_type("B").unwrap(), BandDataType::UInt8); + assert_eq!(parse_pixel_type("I8").unwrap(), BandDataType::Int8); + assert_eq!(parse_pixel_type("U64").unwrap(), BandDataType::UInt64); + assert_eq!(parse_pixel_type("I64").unwrap(), BandDataType::Int64); + } + + #[test] + fn test_rs_as_raster_use_reference_extent() { + let test_file = sedona_testing::data::test_raster("test4.tiff").unwrap(); + with_gdal(|gdal| { + let raster_array = crate::utils::load_as_indb_raster(gdal, &test_file)?; + let raster_struct = RasterStructArray::new(&raster_array); + let raster = raster_struct.get(0).unwrap(); + let md = raster.metadata(); + let ulx = md.upper_left_x(); 
+ let uly = md.upper_left_y(); + let scale_x = md.scale_x(); + let scale_y = md.scale_y(); + + let minx = ulx; + let maxx = ulx + scale_x; + let maxy = uly; + let miny = uly + scale_y; + + let wkt = format!( + "POLYGON(({minx} {miny}, {minx} {maxy}, {maxx} {maxy}, {maxx} {miny}, {minx} {miny}))" + ); + let geom_wkb = wkb_from_wkt(gdal, &wkt)?; + + let (out_md, _band_md, out_bytes) = as_raster( + gdal, + &geom_wkb, + &raster, + BandDataType::Float64, + false, + 255.0, + Some(0.0), + false, + )?; + + assert_eq!(out_md.width, md.width() as u64); + assert_eq!(out_md.height, md.height() as u64); + assert_eq!(out_md.upperleft_x, md.upper_left_x()); + assert_eq!(out_md.upperleft_y, md.upper_left_y()); + + let values = bytes_to_f64_vec(&out_bytes); + assert_eq!(values[0], 255.0); + Ok::<_, datafusion_common::DataFusionError>(()) + }) + .unwrap(); + } + + #[test] + fn test_rs_as_raster_use_geometry_extent() { + let test_file = sedona_testing::data::test_raster("test4.tiff").unwrap(); + with_gdal(|gdal| { + let raster_array = crate::utils::load_as_indb_raster(gdal, &test_file)?; + let raster_struct = RasterStructArray::new(&raster_array); + let raster = raster_struct.get(0).unwrap(); + let md = raster.metadata(); + + let ulx = md.upper_left_x(); + let uly = md.upper_left_y(); + let scale_x = md.scale_x(); + let scale_y = md.scale_y(); + + let minx = ulx; + let maxx = ulx + scale_x; + let maxy = uly; + let miny = uly + scale_y; + + let wkt = format!( + "POLYGON(({minx} {miny}, {minx} {maxy}, {maxx} {maxy}, {maxx} {miny}, {minx} {miny}))" + ); + let geom_wkb = wkb_from_wkt(gdal, &wkt)?; + + let (out_md, _band_md, out_bytes) = as_raster( + gdal, + &geom_wkb, + &raster, + BandDataType::Float64, + false, + 255.0, + Some(0.0), + true, + )?; + + assert_eq!(out_md.width, 1); + assert_eq!(out_md.height, 1); + assert_eq!(out_md.upperleft_x, md.upper_left_x()); + assert_eq!(out_md.upperleft_y, md.upper_left_y()); + + let values = bytes_to_f64_vec(&out_bytes); + assert_eq!(values.len(), 
1); + assert_eq!(values[0], 255.0); + Ok::<_, datafusion_common::DataFusionError>(()) + }) + .unwrap(); + } +} diff --git a/rust/sedona-raster-gdal/src/rs_clip.rs b/rust/sedona-raster-gdal/src/rs_clip.rs new file mode 100644 index 000000000..2a6bc4aa2 --- /dev/null +++ b/rust/sedona-raster-gdal/src/rs_clip.rs @@ -0,0 +1,928 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! RS_Clip UDF - Clip a raster to a geometry boundary +//! +//! Similar to PostGIS ST_Clip, this function clips a raster to the extent of a geometry. +//! Pixels outside the geometry are set to nodata (or the minimum possible value for the +//! band pixel data type if nodata is not specified). 
+ +use std::convert::TryFrom; +use std::sync::Arc; + +use arrow_array::Array; +use datafusion_common::cast::{as_boolean_array, as_float64_array, as_int32_array}; +use datafusion_common::config::ConfigOptions; +use datafusion_common::error::Result; +use datafusion_common::exec_err; +use datafusion_common::{exec_datafusion_err, ScalarValue}; +use datafusion_expr::{ColumnarValue, Volatility}; +use sedona_common::sedona_internal_err; +use sedona_gdal::gdal::Gdal; +use sedona_gdal::mem::MemDatasetBuilder; +use sedona_gdal::raster::types::Buffer; +use sedona_gdal::raster::types::GdalDataType; +use sedona_proj::transform::with_global_proj_engine; + +use arrow_schema::DataType; +use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF}; +use sedona_raster::array::RasterRefImpl; +use sedona_raster::builder::RasterBuilder; +use sedona_raster::traits::{BandMetadata, RasterMetadata, RasterRef}; +use sedona_raster_functions::crs_utils::{crs_transform_wkb, resolve_crs}; +use sedona_raster_functions::RasterExecutor; +use sedona_schema::datatypes::{SedonaType, RASTER}; +use sedona_schema::matchers::ArgMatcher; +use sedona_schema::raster::{BandDataType, StorageType}; + +use crate::gdal_common::with_gdal; +use crate::gdal_common::{nodata_bytes_to_f64, nodata_f64_to_bytes}; +use crate::gdal_dataset_provider::configure_thread_local_options; +use crate::raster_band_reader::RasterBandReader; + +/// RS_Clip() scalar UDF implementation +/// +/// Clips a raster to a geometry boundary. 
+/// +/// Signatures: +/// - `RS_Clip(raster, band, geom)` — 3 args +/// - `RS_Clip(raster, band, geom, allTouched)` — 4 args +/// - `RS_Clip(raster, band, geom, allTouched, noDataValue)` — 5 args +/// - `RS_Clip(raster, band, geom, allTouched, noDataValue, crop)` — 6 args +/// - `RS_Clip(raster, band, geom, allTouched, noDataValue, crop, lenient)` — 7 args +pub fn rs_clip_udf() -> SedonaScalarUDF { + SedonaScalarUDF::new( + "rs_clip", + vec![ + Arc::new(RsClip { arg_count: 3 }), // (raster, band, geom) + Arc::new(RsClip { arg_count: 4 }), // (raster, band, geom, allTouched) + Arc::new(RsClip { arg_count: 5 }), // (raster, band, geom, allTouched, noDataValue) + Arc::new(RsClip { arg_count: 6 }), // (raster, band, geom, allTouched, noDataValue, crop) + Arc::new(RsClip { arg_count: 7 }), // (raster, band, geom, allTouched, noDataValue, crop, lenient) + ], + Volatility::Immutable, + ) +} + +/// Kernel implementation for RS_Clip +#[derive(Debug)] +struct RsClip { + /// Number of arguments in the matched signature (3..=7) + arg_count: usize, +} + +impl SedonaScalarKernel for RsClip { + fn return_type(&self, args: &[SedonaType]) -> Result> { + let matchers = match self.arg_count { + 3 => vec![ + // RS_Clip(raster, band, geom) + ArgMatcher::is_raster(), + ArgMatcher::is_integer(), + ArgMatcher::is_geometry_or_geography(), + ], + 4 => vec![ + // RS_Clip(raster, band, geom, allTouched) + ArgMatcher::is_raster(), + ArgMatcher::is_integer(), + ArgMatcher::is_geometry_or_geography(), + ArgMatcher::is_boolean(), + ], + 5 => vec![ + // RS_Clip(raster, band, geom, allTouched, noDataValue) + ArgMatcher::is_raster(), + ArgMatcher::is_integer(), + ArgMatcher::is_geometry_or_geography(), + ArgMatcher::is_boolean(), + ArgMatcher::is_numeric(), + ], + 6 => vec![ + // RS_Clip(raster, band, geom, allTouched, noDataValue, crop) + ArgMatcher::is_raster(), + ArgMatcher::is_integer(), + ArgMatcher::is_geometry_or_geography(), + ArgMatcher::is_boolean(), + ArgMatcher::is_numeric(), + 
ArgMatcher::is_boolean(), + ], + 7 => vec![ + // RS_Clip(raster, band, geom, allTouched, noDataValue, crop, lenient) + ArgMatcher::is_raster(), + ArgMatcher::is_integer(), + ArgMatcher::is_geometry_or_geography(), + ArgMatcher::is_boolean(), + ArgMatcher::is_numeric(), + ArgMatcher::is_boolean(), + ArgMatcher::is_boolean(), + ], + _ => { + return sedona_internal_err!("RS_Clip: unexpected arg_count {}", self.arg_count); + } + }; + + let matcher = ArgMatcher::new(matchers, RASTER); + matcher.match_args(args) + } + + fn invoke_batch( + &self, + arg_types: &[SedonaType], + args: &[ColumnarValue], + ) -> Result { + self.invoke_batch_from_args(arg_types, args, &SedonaType::Arrow(DataType::Null), 0, None) + } + + fn invoke_batch_from_args( + &self, + arg_types: &[SedonaType], + args: &[ColumnarValue], + _return_type: &SedonaType, + _num_rows: usize, + config_options: Option<&ConfigOptions>, + ) -> Result { + let num_iterations = calc_num_iterations(args); + + // Band is always at index 1, geom is always at index 2. + let geom_arg_idx: usize = 2; + + // Expand band to array + let band_array = args[1] + .clone() + .cast_to(&arrow_schema::DataType::Int32, None)? + .into_array(num_iterations)?; + let band_array = as_int32_array(&band_array)?.clone(); + + // allTouched at index 3 (when arg_count >= 4) + let all_touched_array = if self.arg_count >= 4 { + args[3] + .clone() + .cast_to(&arrow_schema::DataType::Boolean, None)? + .into_array(num_iterations)? + } else { + ScalarValue::Boolean(Some(false)).to_array_of_size(num_iterations)? + }; + let all_touched_array = as_boolean_array(&all_touched_array)?.clone(); + + // noDataValue at index 4 (when arg_count >= 5) + let nodata_array = if self.arg_count >= 5 { + args[4] + .clone() + .cast_to(&arrow_schema::DataType::Float64, None)? + .into_array(num_iterations)? + } else { + ScalarValue::Float64(None).to_array_of_size(num_iterations)? 
+ }; + let nodata_array = as_float64_array(&nodata_array)?.clone(); + + // crop at index 5 (when arg_count >= 6), default true + let crop_array = if self.arg_count >= 6 { + args[5] + .clone() + .cast_to(&arrow_schema::DataType::Boolean, None)? + .into_array(num_iterations)? + } else { + ScalarValue::Boolean(Some(true)).to_array_of_size(num_iterations)? + }; + let crop_array = as_boolean_array(&crop_array)?.clone(); + + // lenient at index 6 (when arg_count >= 7), default true + let lenient_array = if self.arg_count >= 7 { + args[6] + .clone() + .cast_to(&arrow_schema::DataType::Boolean, None)? + .into_array(num_iterations)? + } else { + ScalarValue::Boolean(Some(true)).to_array_of_size(num_iterations)? + }; + let lenient_array = as_boolean_array(&lenient_array)?.clone(); + + let mut band_iter = band_array.iter(); + let mut all_touched_iter = all_touched_array.iter(); + let mut nodata_iter = nodata_array.iter(); + let mut crop_iter = crop_array.iter(); + let mut lenient_iter = lenient_array.iter(); + + // Build output rasters + let mut builder = RasterBuilder::new(num_iterations); + + let exec_arg_types = vec![arg_types[0].clone(), arg_types[geom_arg_idx].clone()]; + let exec_args = vec![args[0].clone(), args[geom_arg_idx].clone()]; + let executor = + RasterExecutor::new_with_num_iterations(&exec_arg_types, &exec_args, num_iterations); + + with_gdal(|gdal| { + configure_thread_local_options(gdal, config_options)?; + with_global_proj_engine(|engine| { + executor.execute_raster_wkb_crs_void(|raster_opt, wkb_opt, geom_crs| { + let band = band_iter.next().unwrap_or(Some(0)).unwrap_or(0); + let all_touched = all_touched_iter + .next() + .unwrap_or(Some(false)) + .unwrap_or(false); + let nodata_value = nodata_iter.next().unwrap_or(None); + let crop = crop_iter.next().unwrap_or(Some(true)).unwrap_or(true); + let lenient = lenient_iter.next().unwrap_or(Some(true)).unwrap_or(true); + + let (raster, geom_wkb) = match (raster_opt, wkb_opt) { + (Some(r), Some(w)) => (r, w), + _ 
=> { + builder.append_null()?; + return Ok(()); + } + }; + + let raster_crs = resolve_crs(raster.crs())?; + let geom_wkb = match (geom_crs, raster_crs.as_deref()) { + (Some(geom_crs), Some(raster_crs)) => { + crs_transform_wkb(geom_wkb, geom_crs, raster_crs, engine)? + } + (None, None) => geom_wkb.to_vec(), + (Some(_), None) => { + return exec_err!( + "Cannot operate on geometry and raster: raster has no CRS but geometry does" + ) + } + (None, Some(_)) => { + return exec_err!( + "Cannot operate on geometry and raster: geometry has no CRS but raster does" + ) + } + }; + + let band_index = usize::try_from(band.max(1)).unwrap_or(1); + match clip_raster( + gdal, + raster, + &geom_wkb, + band_index, + nodata_value, + all_touched, + crop, + ) { + Ok(Some(clipped_data)) => { + build_clipped_raster(&mut builder, raster, &clipped_data)? + } + Ok(None) => { + // No intersection between raster and geometry + if lenient { + builder.append_null()?; + } else { + return exec_err!("RS_Clip: raster and geometry do not intersect"); + } + } + Err(e) => { + if lenient { + eprintln!("RS_Clip error: {}", e); + builder.append_null()?; + } else { + return Err(e); + } + } + } + + Ok(()) + }) + })?; + + executor.finish(Arc::new(builder.finish()?)) + }) + } +} + +/// Data for a clipped raster +struct ClippedRasterData { + /// Clipped band data (one Vec per band) + band_data: Vec>, + /// Band metadata (data types, nodata values) + band_metadata: Vec, + /// Crop window in pixel coordinates (col_off, row_off, width, height). + /// `None` means the full original raster extent was kept (crop=false). + crop_window: Option, +} + +/// A rectangular crop window in pixel coordinates. +#[derive(Debug, Clone, Copy)] +struct CropWindow { + col_off: usize, + row_off: usize, + width: usize, + height: usize, +} + +/// Clip a raster to a geometry. +/// +/// Returns `Ok(None)` when the geometry does not intersect the raster extent +/// (caller decides how to handle based on `lenient`). 
+fn clip_raster(
+    gdal: &Gdal,
+    raster: &RasterRefImpl<'_>,
+    geom_wkb: &[u8],
+    band_num: usize, // 1-based band number; 0 selects every band
+    custom_nodata: Option<f64>, // overrides each band's own nodata value when set
+    all_touched: bool,
+    crop: bool, // when true, also shrink the output to the mask's bounding window
+) -> Result<Option<ClippedRasterData>> {
+    let metadata = raster.metadata();
+    let bands = raster.bands();
+    let mut band_reader = RasterBandReader::new(gdal, raster);
+    let width = metadata.width() as usize;
+    let height = metadata.height() as usize;
+
+    // Parse geometry from WKB
+    let geometry = gdal
+        .geometry_from_wkb(geom_wkb)
+        .map_err(|e| exec_datafusion_err!("Failed to parse geometry from WKB: {}", e))?;
+
+    // Create a single-band UInt8 mask raster (same dimensions as input)
+    let mask_dataset = MemDatasetBuilder::create(gdal, width, height, 1, GdalDataType::UInt8)
+        .map_err(|e| exec_datafusion_err!("Failed to create mask dataset: {}", e))?;
+
+    // Set the same geotransform as the input raster (GDAL order: x0, sx, kx, y0, ky, sy)
+    let geotransform = [
+        metadata.upper_left_x(),
+        metadata.scale_x(),
+        metadata.skew_x(),
+        metadata.upper_left_y(),
+        metadata.skew_y(),
+        metadata.scale_y(),
+    ];
+    mask_dataset
+        .set_geo_transform(&geotransform)
+        .map_err(|e| exec_datafusion_err!("Failed to set geotransform: {}", e))?;
+
+    // Initialize mask to 0 (outside)
+    let mask_band = mask_dataset
+        .rasterband(1)
+        .map_err(|e| exec_datafusion_err!("Failed to get mask band: {}", e))?;
+    let zeros = vec![0u8; width * height];
+    let mut buffer = Buffer::new((width, height), zeros);
+    mask_band
+        .write((0, 0), (width, height), &mut buffer)
+        .map_err(|e| exec_datafusion_err!("Failed to initialize mask: {}", e))?;
+
+    gdal.rasterize_affine(
+        &mask_dataset,
+        &[1], // band 1
+        &[geometry],
+        &[1.0], // burn value = 1 (inside)
+        all_touched,
+    )
+    .map_err(|e| exec_datafusion_err!("Failed to rasterize geometry: {}", e))?;
+
+    // Read the mask back as one byte per pixel, row-major
+    let mask_band = mask_dataset
+        .rasterband(1)
+        .map_err(|e| exec_datafusion_err!("Failed to get mask band: {}", e))?;
+    let mask_buffer = mask_band
+        .read_as::<u8>((0, 0), (width, height), (width, height), None)
+        .map_err(|e| exec_datafusion_err!("Failed to read mask: {}", e))?;
+    let mask = mask_buffer.data();
+
+    // Check if there are any non-zero pixels in the mask (i.e. geometry intersects raster)
+    let has_intersection = mask.iter().any(|&v| v != 0);
+    if !has_intersection {
+        return Ok(None);
+    }
+
+    // Compute crop window if crop=true
+    let crop_window = if crop {
+        compute_crop_window(mask, width, height)
+    } else {
+        None
+    };
+
+    // Determine which bands to process
+    let band_indices: Vec<usize> = if band_num == 0 {
+        (1..=bands.len()).collect()
+    } else {
+        if band_num > bands.len() {
+            return exec_err!("Band {} is out of range (1-{})", band_num, bands.len());
+        }
+        vec![band_num]
+    };
+
+    // Process each band
+    let mut clipped_band_data = Vec::with_capacity(band_indices.len());
+    let mut clipped_band_metadata = Vec::with_capacity(band_indices.len());
+
+    for &band_idx in &band_indices {
+        let band = bands
+            .band(band_idx)
+            .map_err(|e| exec_datafusion_err!("Failed to get band {}: {}", band_idx, e))?;
+
+        let band_metadata = band.metadata();
+        let data_type = band_metadata.data_type()?;
+        let original_data = band_reader.read_band_bytes(band_idx)?;
+
+        // Determine nodata value: explicit argument, else the band's own, else 0.0
+        let nodata = custom_nodata
+            .or_else(|| nodata_bytes_to_f64(band_metadata.nodata_value(), &data_type))
+            .unwrap_or(0.0);
+
+        // Apply mask to band data, optionally cropping
+        let clipped_data = if let Some(cw) = crop_window {
+            apply_mask_and_crop(&original_data, mask, width, &data_type, nodata, &cw)?
+        } else {
+            apply_mask_to_band(&original_data, mask, width, height, &data_type, nodata)?
+        };
+
+        // Build band metadata; the output band is always in-db and always carries a nodata value
+        let new_band_metadata = BandMetadata {
+            nodata_value: Some(nodata_f64_to_bytes(nodata, &data_type)),
+            storage_type: StorageType::InDb,
+            datatype: data_type,
+            outdb_url: None,
+            outdb_band_id: None,
+        };
+
+        clipped_band_data.push(clipped_data);
+        clipped_band_metadata.push(new_band_metadata);
+    }
+
+    Ok(Some(ClippedRasterData {
+        band_data: clipped_band_data,
+        band_metadata: clipped_band_metadata,
+        crop_window,
+    }))
+}
+
+/// Compute the minimal bounding pixel window that contains all non-zero mask pixels.
+fn compute_crop_window(mask: &[u8], width: usize, height: usize) -> Option<CropWindow> {
+    let mut min_col = width;
+    let mut max_col = 0usize;
+    let mut min_row = height;
+    let mut max_row = 0usize;
+
+    for row in 0..height {
+        for col in 0..width {
+            if mask[row * width + col] != 0 {
+                min_col = min_col.min(col);
+                max_col = max_col.max(col);
+                min_row = min_row.min(row);
+                max_row = max_row.max(row);
+            }
+        }
+    }
+
+    if min_col > max_col || min_row > max_row {
+        return None; // no non-zero pixels (shouldn't happen if caller checked)
+    }
+
+    Some(CropWindow {
+        col_off: min_col,
+        row_off: min_row,
+        width: max_col - min_col + 1,
+        height: max_row - min_row + 1,
+    })
+}
+
+/// Apply mask to band data (no cropping — preserves original dimensions).
+fn apply_mask_to_band(
+    original_data: &[u8],
+    mask: &[u8],
+    width: usize,
+    height: usize,
+    data_type: &BandDataType,
+    nodata: f64,
+) -> Result<Vec<u8>> {
+    let byte_size = data_type_byte_size(data_type);
+    let mut result = original_data.to_vec();
+
+    for (pixel_idx, &mask_val) in mask.iter().enumerate().take(width * height) {
+        if mask_val == 0 {
+            // Pixel is outside geometry - set to nodata
+            let byte_offset = pixel_idx * byte_size;
+            write_nodata_value(&mut result, byte_offset, data_type, nodata)?;
+        }
+    }
+
+    Ok(result)
+}
+
+/// Apply mask AND crop to the given crop window in one pass.
+fn apply_mask_and_crop(
+    original_data: &[u8],
+    mask: &[u8],
+    full_width: usize, // row stride, in pixels, used to index `mask` and `original_data`
+    data_type: &BandDataType,
+    nodata: f64,
+    cw: &CropWindow,
+) -> Result<Vec<u8>> {
+    let byte_size = data_type_byte_size(data_type);
+    let crop_pixel_count = cw.width * cw.height;
+    let nodata_bytes = nodata_value_bytes(data_type, nodata);
+    let mut result = vec![0u8; crop_pixel_count * byte_size];
+
+    for row in 0..cw.height {
+        let src_row = cw.row_off + row;
+        for col in 0..cw.width {
+            let src_col = cw.col_off + col;
+            let src_pixel_idx = src_row * full_width + src_col;
+            let dst_pixel_idx = row * cw.width + col;
+            let src_byte_offset = src_pixel_idx * byte_size;
+            let dst_byte_offset = dst_pixel_idx * byte_size;
+
+            if mask[src_pixel_idx] != 0 {
+                // Inside geometry — copy original pixel
+                result[dst_byte_offset..dst_byte_offset + byte_size]
+                    .copy_from_slice(&original_data[src_byte_offset..src_byte_offset + byte_size]);
+            } else {
+                // Outside geometry — write nodata
+                result[dst_byte_offset..dst_byte_offset + byte_size].copy_from_slice(&nodata_bytes);
+            }
+        }
+    }
+
+    Ok(result)
+}
+
+/// Convert a nodata f64 value to its byte representation for the given data type.
+fn nodata_value_bytes(data_type: &BandDataType, nodata: f64) -> Vec<u8> {
+    match data_type {
+        BandDataType::UInt8 => vec![nodata as u8],
+        BandDataType::Int8 => (nodata as i8).to_le_bytes().to_vec(),
+        BandDataType::UInt16 => (nodata as u16).to_le_bytes().to_vec(),
+        BandDataType::Int16 => (nodata as i16).to_le_bytes().to_vec(),
+        BandDataType::UInt32 => (nodata as u32).to_le_bytes().to_vec(),
+        BandDataType::Int32 => (nodata as i32).to_le_bytes().to_vec(),
+        BandDataType::UInt64 => (nodata as u64).to_le_bytes().to_vec(),
+        BandDataType::Int64 => (nodata as i64).to_le_bytes().to_vec(),
+        BandDataType::Float32 => (nodata as f32).to_le_bytes().to_vec(),
+        BandDataType::Float64 => nodata.to_le_bytes().to_vec(),
+    }
+}
+
+/// Write the nodata value, encoded little-endian for `data_type`, into `data` at byte `offset`.
+fn write_nodata_value(
+    data: &mut [u8],
+    offset: usize,
+    data_type: &BandDataType,
+    nodata: f64,
+) -> Result<()> {
+    match data_type {
+        BandDataType::UInt8 => {
+            data[offset] = nodata as u8;
+        }
+        BandDataType::Int8 => {
+            data[offset] = (nodata as i8).to_le_bytes()[0];
+        }
+        BandDataType::UInt16 => {
+            let bytes = (nodata as u16).to_le_bytes();
+            data[offset..offset + 2].copy_from_slice(&bytes);
+        }
+        BandDataType::Int16 => {
+            let bytes = (nodata as i16).to_le_bytes();
+            data[offset..offset + 2].copy_from_slice(&bytes);
+        }
+        BandDataType::UInt32 => {
+            let bytes = (nodata as u32).to_le_bytes();
+            data[offset..offset + 4].copy_from_slice(&bytes);
+        }
+        BandDataType::Int32 => {
+            let bytes = (nodata as i32).to_le_bytes();
+            data[offset..offset + 4].copy_from_slice(&bytes);
+        }
+        BandDataType::UInt64 => {
+            let bytes = (nodata as u64).to_le_bytes();
+            data[offset..offset + 8].copy_from_slice(&bytes);
+        }
+        BandDataType::Int64 => {
+            let bytes = (nodata as i64).to_le_bytes();
+            data[offset..offset + 8].copy_from_slice(&bytes);
+        }
+        BandDataType::Float32 => {
+            let bytes = (nodata as f32).to_le_bytes();
+            data[offset..offset + 4].copy_from_slice(&bytes);
+        }
+        BandDataType::Float64 => {
+            let bytes = nodata.to_le_bytes();
+            data[offset..offset + 8].copy_from_slice(&bytes);
+        }
+    }
+    Ok(())
+}
+
+/// Append one clipped raster (its metadata plus every clipped band) to `builder`.
+fn build_clipped_raster(
+    builder: &mut RasterBuilder,
+    original_raster: &RasterRefImpl<'_>,
+    clipped_data: &ClippedRasterData,
+) -> Result<()> {
+    let original_metadata = original_raster.metadata();
+
+    let metadata = if let Some(cw) = clipped_data.crop_window {
+        // Cropped: adjust dimensions and upper-left coordinate.
+        // new_upper_left = original_upper_left + pixel_offset * scale + pixel_offset * skew
+        let new_upper_left_x = original_metadata.upper_left_x()
+            + cw.col_off as f64 * original_metadata.scale_x()
+            + cw.row_off as f64 * original_metadata.skew_x();
+        let new_upper_left_y = original_metadata.upper_left_y()
+            + cw.row_off as f64 * original_metadata.scale_y()
+            + cw.col_off as f64 * original_metadata.skew_y();
+
+        RasterMetadata {
+            width: cw.width as u64,
+            height: cw.height as u64,
+            upperleft_x: new_upper_left_x,
+            upperleft_y: new_upper_left_y,
+            scale_x: original_metadata.scale_x(),
+            scale_y: original_metadata.scale_y(),
+            skew_x: original_metadata.skew_x(),
+            skew_y: original_metadata.skew_y(),
+        }
+    } else {
+        // No crop: use original raster dimensions and geotransform
+        RasterMetadata {
+            width: original_metadata.width(),
+            height: original_metadata.height(),
+            upperleft_x: original_metadata.upper_left_x(),
+            upperleft_y: original_metadata.upper_left_y(),
+            scale_x: original_metadata.scale_x(),
+            scale_y: original_metadata.scale_y(),
+            skew_x: original_metadata.skew_x(),
+            skew_y: original_metadata.skew_y(),
+        }
+    };
+
+    builder
+        .start_raster(&metadata, original_raster.crs())
+        .map_err(|e| exec_datafusion_err!("Failed to start raster: {}", e))?;
+
+    // Add clipped bands; data and metadata vectors are parallel (one entry per band)
+    for (band_data, band_metadata) in clipped_data
+        .band_data
+        .iter()
+        .zip(clipped_data.band_metadata.iter())
+    {
+        builder
+            .start_band(band_metadata.clone())
+            .map_err(|e| exec_datafusion_err!("Failed to start band: {}", e))?;
+        builder.band_data_writer().append_value(band_data);
+        builder
+            .finish_band()
+            .map_err(|e| exec_datafusion_err!("Failed to finish band: {}", e))?;
+    }
+
+    builder
+        .finish_raster()
+        .map_err(|e| exec_datafusion_err!("Failed to finish raster: {}", e))?;
+
+    Ok(())
+}
+
+/// Size in bytes of one pixel of the given band data type.
+fn data_type_byte_size(data_type: &BandDataType) -> usize {
+    match data_type {
+        BandDataType::UInt8 => 1,
+        BandDataType::Int8 => 1,
+        BandDataType::UInt16 | BandDataType::Int16 => 2,
+        BandDataType::UInt32 | BandDataType::Int32 | BandDataType::Float32 => 4,
+        BandDataType::UInt64 | BandDataType::Int64 => 8,
+        BandDataType::Float64 => 8,
+    }
+}
+
+fn calc_num_iterations(args: &[ColumnarValue]) -> usize {
+    for arg in args {
+        if let ColumnarValue::Array(array) = arg {
+            return array.len(); // length of the first array-valued argument
+        }
+    }
+    1 // every argument is a scalar
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use sedona_raster::array::RasterStructArray;
+    use sedona_schema::crs::deserialize_crs;
+    use sedona_schema::datatypes::Edges;
+
+    fn wkb_from_wkt(gdal: &sedona_gdal::gdal::Gdal, wkt: &str) -> Result<Vec<u8>> {
+        let geometry = gdal.geometry_from_wkt(wkt).unwrap();
+        geometry.wkb().map_err(|e| exec_datafusion_err!("{e}"))
+    }
+
+    #[test]
+    fn test_rs_clip_basic() {
+        // Load test raster
+        let test_file = sedona_testing::data::test_raster("test4.tiff").unwrap();
+        with_gdal(|gdal| {
+            let raster_array = crate::utils::load_as_indb_raster(gdal, &test_file)?;
+            let raster_struct = RasterStructArray::new(&raster_array);
+            let raster = raster_struct.get(0).unwrap();
+
+            let metadata = raster.metadata();
+            let min_x = metadata.upper_left_x();
+            let max_y = metadata.upper_left_y();
+            let max_x = min_x + (metadata.width() as f64 * metadata.scale_x()) / 2.0;
+            let min_y = max_y + (metadata.height() as f64 * metadata.scale_y()) / 2.0;
+
+            let wkt = format!(
+                "POLYGON(({} {}, {} {}, {} {}, {} {}, {} {}))",
+                min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y, min_x, min_y
+            );
+
+            let geom_wkb = wkb_from_wkt(gdal, &wkt)?;
+            let clipped = clip_raster(gdal, &raster, &geom_wkb, 0, None, false, false)?
+                .expect("Should have intersection");
+            let mut reader = RasterBandReader::new(gdal, &raster);
+            let original_len = reader.read_band_bytes(1)?.len();
+            assert!(
+                !clipped.band_data.is_empty(),
+                "Should have at least one band"
+            );
+            assert_eq!(
+                clipped.band_data[0].len(),
+                original_len,
+                "Clipped band should have same size as original when crop=false"
+            );
+            assert!(
+                clipped.crop_window.is_none(),
+                "crop_window should be None when crop=false"
+            );
+            Ok::<_, datafusion_common::DataFusionError>(())
+        })
+        .unwrap();
+    }
+
+    #[test]
+    fn test_rs_clip_crop() {
+        let test_file = sedona_testing::data::test_raster("test4.tiff").unwrap();
+        with_gdal(|gdal| {
+            let raster_array = crate::utils::load_as_indb_raster(gdal, &test_file)?;
+            let raster_struct = RasterStructArray::new(&raster_array);
+            let raster = raster_struct.get(0).unwrap();
+
+            let metadata = raster.metadata();
+            let min_x = metadata.upper_left_x();
+            let max_y = metadata.upper_left_y();
+            let max_x = min_x + (metadata.width() as f64 * metadata.scale_x()) / 4.0;
+            let min_y = max_y + (metadata.height() as f64 * metadata.scale_y()) / 4.0;
+
+            let wkt = format!(
+                "POLYGON(({} {}, {} {}, {} {}, {} {}, {} {}))",
+                min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y, min_x, min_y
+            );
+
+            let geom_wkb = wkb_from_wkt(gdal, &wkt)?;
+            let clipped = clip_raster(gdal, &raster, &geom_wkb, 0, None, false, true)?
+                .expect("Should have intersection");
+            assert!(
+                clipped.crop_window.is_some(),
+                "crop_window should be set when crop=true"
+            );
+
+            let cw = clipped.crop_window.unwrap();
+            let byte_size = data_type_byte_size(&clipped.band_metadata[0].datatype);
+            assert_eq!(
+                clipped.band_data[0].len(),
+                cw.width * cw.height * byte_size,
+                "Cropped band data size should match crop window"
+            );
+            assert!(
+                (cw.width as u64) < metadata.width(),
+                "Cropped width should be smaller"
+            );
+            assert!(
+                (cw.height as u64) < metadata.height(),
+                "Cropped height should be smaller"
+            );
+            Ok::<_, datafusion_common::DataFusionError>(())
+        })
+        .unwrap();
+    }
+
+    #[test]
+    fn test_rs_clip_no_intersection() {
+        let test_file = sedona_testing::data::test_raster("test4.tiff").unwrap();
+        with_gdal(|gdal| {
+            let raster_array = crate::utils::load_as_indb_raster(gdal, &test_file)?;
+            let raster_struct = RasterStructArray::new(&raster_array);
+            let raster = raster_struct.get(0).unwrap();
+            let wkt = "POLYGON((1000 1000, 1001 1000, 1001 1001, 1000 1001, 1000 1000))";
+            let geom_wkb = wkb_from_wkt(gdal, wkt)?;
+            let result = clip_raster(gdal, &raster, &geom_wkb, 0, None, false, true)?;
+            assert!(result.is_none(), "Should return None for no intersection");
+            Ok::<_, datafusion_common::DataFusionError>(())
+        })
+        .unwrap();
+    }
+
+    #[test]
+    fn test_rs_clip_crs_mismatch() {
+        use sedona_expr::scalar_udf::SedonaScalarKernel;
+
+        let test_file = sedona_testing::data::test_raster("test4.tiff").unwrap();
+        let (raster_array, geom_wkb) = with_gdal(|gdal| {
+            let raster_array = crate::utils::load_as_indb_raster(gdal, &test_file)?;
+            let raster_struct = RasterStructArray::new(&raster_array);
+            let raster = raster_struct.get(0).unwrap();
+
+            let metadata = raster.metadata();
+            let min_x = metadata.upper_left_x();
+            let max_y = metadata.upper_left_y();
+            let max_x = min_x + (metadata.width() as f64 * metadata.scale_x()) / 2.0;
+            let min_y = max_y + (metadata.height() as f64 * metadata.scale_y()) / 2.0;
+
+            let wkt = format!(
+                "POLYGON(({} {}, {} {}, {} {}, {} {}, {} {}))",
+                min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y, min_x, min_y
+            );
+            let geom_wkb = wkb_from_wkt(gdal, &wkt)?;
+            Ok::<_, datafusion_common::DataFusionError>((raster_array, geom_wkb))
+        })
+        .unwrap();
+
+        // Generate the EPSG:3857 geometry using the same PROJ engine that the
+        // UDF uses for CRS transforms. This makes the test robust to axis-order
+        // and normalization differences between build configurations.
+        let crs_4326 = deserialize_crs("EPSG:4326").unwrap().unwrap();
+        let crs_3857 = deserialize_crs("EPSG:3857").unwrap().unwrap();
+
+        let geom_wkb_merc = with_global_proj_engine(|engine| {
+            crs_transform_wkb(&geom_wkb, crs_4326.as_ref(), crs_3857.as_ref(), engine)
+        })
+        .unwrap();
+
+        // Test with 3-arg variant: RS_Clip(raster, band, geom)
+        let kernel = RsClip { arg_count: 3 };
+
+        let raster_scalar = ColumnarValue::Scalar(ScalarValue::Struct(Arc::new(raster_array)));
+        let geom_type_4326 = SedonaType::Wkb(Edges::Planar, Some(crs_4326));
+        let geom_type_3857 = SedonaType::Wkb(Edges::Planar, Some(crs_3857));
+        let band_type = SedonaType::Arrow(DataType::Int32);
+        let band_val = ColumnarValue::Scalar(ScalarValue::Int32(Some(1)));
+
+        let result_4326 = kernel
+            .invoke_batch(
+                &[RASTER, band_type.clone(), geom_type_4326],
+                &[
+                    raster_scalar.clone(),
+                    band_val.clone(),
+                    ColumnarValue::Scalar(ScalarValue::Binary(Some(geom_wkb))),
+                ],
+            )
+            .unwrap();
+
+        let result_3857 = kernel
+            .invoke_batch(
+                &[RASTER, band_type, geom_type_3857],
+                &[
+                    raster_scalar,
+                    band_val,
+                    ColumnarValue::Scalar(ScalarValue::Binary(Some(geom_wkb_merc))),
+                ],
+            )
+            .unwrap();
+
+        let band_data_4326 = match result_4326 {
+            ColumnarValue::Scalar(ScalarValue::Struct(struct_array)) => {
+                let array = RasterStructArray::new(struct_array.as_ref());
+                let raster = array.get(0).unwrap();
+                let data = raster.bands().band(1).unwrap().data().to_vec();
+                data
+            }
+            _ => panic!("Expected raster scalar result"),
+        };
+
+        let band_data_3857 = match result_3857 {
+            ColumnarValue::Scalar(ScalarValue::Struct(struct_array)) => {
+                let array = RasterStructArray::new(struct_array.as_ref());
+                let raster = array.get(0).unwrap();
+                let data = raster.bands().band(1).unwrap().data().to_vec();
+                data
+            }
+            _ => panic!("Expected raster scalar result"),
+        };
+
+        assert_eq!(band_data_4326, band_data_3857);
+    }
+
+    #[test]
+    fn test_write_nodata_value() {
+        let mut data = vec![0u8; 8];
+
+        // Test UInt8
+        write_nodata_value(&mut data, 0, &BandDataType::UInt8, 255.0).unwrap();
+        assert_eq!(data[0], 255);
+
+        // Test Float32
+        write_nodata_value(&mut data, 0, &BandDataType::Float32, -9999.0).unwrap();
+        let value = f32::from_le_bytes([data[0], data[1], data[2], data[3]]);
+        assert!((value - (-9999.0)).abs() < 0.001);
+    }
+}
diff --git a/rust/sedona-raster-gdal/src/rs_from_gdal_raster.rs b/rust/sedona-raster-gdal/src/rs_from_gdal_raster.rs
new file mode 100644
index 000000000..ba7e42006
--- /dev/null
+++ b/rust/sedona-raster-gdal/src/rs_from_gdal_raster.rs
@@ -0,0 +1,254 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! RS_FromGDALRaster UDF - Parse binary content using GDAL driver as in-db raster
+//!
+//! 
Similar to PostGIS's ST_FromGDALRaster. Parses binary content using GDAL driver
+//! and loads it as an in-db raster with all band data stored inline.
+
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::sync::Arc;
+
+use arrow_array::{Array, ArrayRef, StructArray};
+use arrow_schema::DataType;
+use datafusion_common::cast::as_binary_array;
+use datafusion_common::config::ConfigOptions;
+use datafusion_common::error::Result;
+use datafusion_common::exec_datafusion_err;
+use datafusion_expr::{ColumnarValue, Volatility};
+use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF};
+use sedona_gdal::gdal::Gdal;
+use sedona_raster::builder::RasterBuilder;
+use sedona_schema::datatypes::{SedonaType, RASTER};
+use sedona_schema::matchers::ArgMatcher;
+
+use crate::gdal_common::with_gdal;
+use crate::gdal_dataset_provider::configure_thread_local_options;
+
+/// Process-wide counter used to make each generated VSI memory file name unique
+static VSI_FILE_COUNTER: AtomicUsize = AtomicUsize::new(0);
+
+/// RS_FromGDALRaster() scalar UDF implementation
+///
+/// Parse binary content using GDAL driver and load it as in-db raster
+pub fn rs_from_gdal_raster_udf() -> SedonaScalarUDF {
+    SedonaScalarUDF::new(
+        "rs_fromgdalraster",
+        vec![Arc::new(RsFromGDALRaster)],
+        Volatility::Immutable,
+    )
+}
+
+/// Kernel implementation for RS_FromGDALRaster
+#[derive(Debug)]
+pub(crate) struct RsFromGDALRaster;
+
+impl RsFromGDALRaster {
+    /// Generate a unique `/vsimem/` path from the current thread id and the global counter
+    fn generate_vsi_path() -> String {
+        let counter = VSI_FILE_COUNTER.fetch_add(1, Ordering::SeqCst);
+        let thread_id = std::thread::current().id();
+        format!(
+            "/vsimem/rs_from_gdal_raster_{:?}_{}.bin",
+            thread_id, counter
+        )
+    }
+
+    /// Parse `content` through a temporary `/vsimem/` file and return it as an in-db raster
+    pub(crate) fn parse_gdal_raster(gdal: &Gdal, content: &[u8]) -> Result<StructArray> {
+        // Create a temporary VSI memory file
+        let vsi_path = Self::generate_vsi_path();
+        let content_copy = content.to_vec();
+
+        // Write content to VSI memory file
+        gdal.create_mem_file(&vsi_path, &content_copy)
+            .map_err(|e| exec_datafusion_err!("Failed to create VSI memory file: {}", e))?;
+
+        // Delegate to load_as_indb_raster, then always clean up (unlink errors are ignored)
+        let result = crate::utils::load_as_indb_raster(gdal, &vsi_path);
+        let _ = gdal.unlink_mem_file(&vsi_path);
+        result
+    }
+}
+
+impl SedonaScalarKernel for RsFromGDALRaster {
+    fn return_type(&self, args: &[SedonaType]) -> Result<Option<SedonaType>> {
+        let matcher = ArgMatcher::new(vec![ArgMatcher::is_binary()], RASTER);
+        matcher.match_args(args)
+    }
+
+    fn invoke_batch(
+        &self,
+        arg_types: &[SedonaType],
+        args: &[ColumnarValue],
+    ) -> Result<ColumnarValue> {
+        self.invoke_batch_from_args(arg_types, args, &SedonaType::Arrow(DataType::Null), 0, None)
+    }
+
+    fn invoke_batch_from_args(
+        &self,
+        _arg_types: &[SedonaType],
+        args: &[ColumnarValue],
+        _return_type: &SedonaType,
+        _num_rows: usize,
+        config_options: Option<&ConfigOptions>,
+    ) -> Result<ColumnarValue> {
+        with_gdal(|gdal| {
+            configure_thread_local_options(gdal, config_options)?;
+
+            let content_array = match &args[0] {
+                ColumnarValue::Scalar(scalar) => scalar.to_array().map_err(|e| {
+                    exec_datafusion_err!("Failed to convert scalar to array: {}", e)
+                })?,
+                ColumnarValue::Array(array) => array.clone(),
+            };
+
+            let binary_array = as_binary_array(&content_array)?;
+
+            let len = binary_array.len();
+
+            if len == 0 {
+                let builder = RasterBuilder::new(0);
+                let result = builder
+                    .finish()
+                    .map_err(|e| exec_datafusion_err!("Failed to build empty raster: {}", e))?;
+                return Ok(ColumnarValue::Array(Arc::new(result)));
+            }
+
+            let mut combined_arrays: Vec<ArrayRef> = Vec::with_capacity(len);
+
+            for i in 0..len {
+                if binary_array.is_null(i) {
+                    let mut builder = RasterBuilder::new(1);
+                    builder
+                        .append_null()
+                        .map_err(|e| exec_datafusion_err!("Failed to append null: {}", e))?;
+                    let result = builder
+                        .finish()
+                        .map_err(|e| exec_datafusion_err!("Failed to build null raster: {}", e))?;
+                    combined_arrays.push(Arc::new(result));
+                } else {
+                    let content = binary_array.value(i);
+                    let raster = Self::parse_gdal_raster(gdal, content)?;
+                    combined_arrays.push(Arc::new(raster));
+                }
+            }
+
+            let refs: Vec<&dyn Array> = combined_arrays.iter().map(|a| a.as_ref()).collect();
+            let result = arrow::compute::concat(&refs)
+                .map_err(|e| exec_datafusion_err!("Failed to concatenate rasters: {}", e))?;
+
+            match &args[0] {
+                ColumnarValue::Scalar(_) => {
+                    let scalar = datafusion_common::ScalarValue::try_from_array(&result, 0)?;
+                    Ok(ColumnarValue::Scalar(scalar))
+                }
+                ColumnarValue::Array(_) => Ok(ColumnarValue::Array(result)),
+            }
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::gdal_common::with_gdal;
+    use datafusion_common::cast::as_struct_array;
+    use sedona_raster::traits::RasterRef;
+
+    #[test]
+    fn test_generate_vsi_path() {
+        let path1 = RsFromGDALRaster::generate_vsi_path();
+        let path2 = RsFromGDALRaster::generate_vsi_path();
+
+        assert!(path1.starts_with("/vsimem/rs_from_gdal_raster_"));
+        assert!(path2.starts_with("/vsimem/rs_from_gdal_raster_"));
+        assert_ne!(path1, path2);
+    }
+
+    #[test]
+    fn udf_from_gdal_raster() {
+        let udf: datafusion_expr::ScalarUDF = rs_from_gdal_raster_udf().into();
+        assert_eq!(udf.name(), "rs_fromgdalraster");
+    }
+
+    #[test]
+    fn test_parse_geotiff_bytes() {
+        use sedona_raster::array::RasterStructArray;
+        use sedona_testing::data::test_raster;
+
+        // Read test4.tiff file into bytes
+        let path = test_raster("test4.tiff").expect("test4.tiff should exist");
+        let content = std::fs::read(&path).expect("Should read file");
+
+        // Parse the GeoTiff bytes into a raster
+        let result = with_gdal(|gdal| RsFromGDALRaster::parse_gdal_raster(gdal, &content))
+            .expect("Should parse GeoTiff bytes");
+
+        // Verify the raster
+        let raster_array = RasterStructArray::new(&result);
+        assert_eq!(raster_array.len(), 1);
+
+        let raster = raster_array.get(0).expect("Should get raster");
+        assert_eq!(raster.metadata().width(), 10);
+        assert_eq!(raster.metadata().height(), 10);
+        assert_eq!(raster.bands().len(), 1);
+        // Check CRS - test4.tiff has EPSG:4326
+        assert!(raster.crs().is_some());
+
+        // Verify it's an in-db raster (should have band data, not outdb_url)
+        let band = raster.bands().band(1).expect("Should have band 1");
+        assert!(
+            band.metadata().outdb_url().is_none(),
+            "In-db raster should not have outdb_url"
+        );
+    }
+
+    #[test]
+    fn test_invoke_rs_from_gdal_raster() {
+        use arrow_array::BinaryArray;
+        use sedona_expr::scalar_udf::SedonaScalarKernel;
+        use sedona_testing::data::test_raster;
+
+        // Read test file into bytes
+        let path = test_raster("test4.tiff").expect("test4.tiff should exist");
+        let content = std::fs::read(&path).expect("Should read file");
+
+        // Create binary array with the content
+        let binary_arr = Arc::new(BinaryArray::from(vec![content.as_slice()]));
+        let input = ColumnarValue::Array(binary_arr);
+
+        // Invoke the UDF
+        let kernel = RsFromGDALRaster;
+        let result = kernel
+            .invoke_batch_from_args(&[], &[input], &SedonaType::Arrow(DataType::Null), 0, None)
+            .expect("Should invoke successfully");
+
+        // Verify result
+        match result {
+            ColumnarValue::Array(arr) => {
+                let struct_arr = as_struct_array(&arr).unwrap();
+                let raster_array = sedona_raster::array::RasterStructArray::new(struct_arr);
+                assert_eq!(raster_array.len(), 1);
+                let raster = raster_array.get(0).expect("Should get raster");
+                assert_eq!(raster.metadata().width(), 10);
+                assert_eq!(raster.metadata().height(), 10);
+            }
+            _ => panic!("Expected array result"),
+        }
+    }
+}
diff --git a/rust/sedona-raster-gdal/src/rs_from_path.rs b/rust/sedona-raster-gdal/src/rs_from_path.rs
new file mode 100644
index 000000000..082783b75
--- /dev/null
+++ b/rust/sedona-raster-gdal/src/rs_from_path.rs
@@ -0,0 +1,447 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! RS_FromPath UDF - Load out-db raster from file path +//! +//! Returns an out-db raster from a path to an image file. Supported formats include: +//! - GeoTiff (*.tif, *.tiff) +//! - Arc Info ASCII Grid (*.asc) +//! - And other GDAL-supported raster formats + +use std::collections::HashMap; +use std::sync::Arc; + +use arrow::compute::cast; +use arrow_array::{Array, ArrayRef, StructArray}; +use datafusion_common::cast::as_string_array; +use datafusion_common::config::ConfigOptions; +use datafusion_common::error::Result; +use datafusion_common::exec_datafusion_err; +use datafusion_expr::{ColumnarValue, Volatility}; +use sedona_gdal::gdal::Gdal; +use sedona_gdal::gdal_dyn_bindgen::{GDAL_OF_RASTER, GDAL_OF_READONLY}; +use sedona_gdal::raster::types::DatasetOptions; +use sedona_gdal::spatial_ref::SpatialRef; + +use arrow_schema::DataType; +use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF}; +use sedona_raster::builder::RasterBuilder; +use sedona_raster::traits::{BandMetadata, RasterMetadata}; +use sedona_schema::datatypes::{SedonaType, RASTER}; +use sedona_schema::matchers::ArgMatcher; +use sedona_schema::raster::StorageType; + +use crate::gdal_common::{ + gdal_to_band_data_type, nodata_f64_to_bytes, normalize_outdb_source_path, with_gdal, +}; +use crate::gdal_dataset_provider::configure_thread_local_options; + +/// RS_FromPath() scalar UDF implementation +/// +/// 
Returns an out-db raster from a path to an image file +pub fn rs_from_path_udf() -> SedonaScalarUDF { + SedonaScalarUDF::new( + "rs_frompath", + vec![ + Arc::new(RsFromPath::new(false)), // RS_FromPath(path) + Arc::new(RsFromPath::new(true)), // RS_FromPath(path, params) + ], + Volatility::Volatile, // Reads from filesystem + ) +} + +/// Kernel implementation for RS_FromPath +#[derive(Debug)] +pub(crate) struct RsFromPath { + with_params: bool, +} + +impl RsFromPath { + pub(crate) fn new(with_params: bool) -> Self { + Self { with_params } + } + + /// Parse parameters string into a HashMap + /// Format: "key1=value1;key2=value2" + #[allow(dead_code)] + fn parse_params(params: &str) -> HashMap { + params + .split(';') + .filter_map(|pair| { + let parts: Vec<&str> = pair.trim().splitn(2, '=').collect(); + if parts.len() == 2 { + Some((parts[0].trim().to_string(), parts[1].trim().to_string())) + } else { + None + } + }) + .collect() + } + + /// Load raster metadata from file and create out-db raster + fn load_outdb_raster(gdal: &Gdal, path: &str, _params: Option<&str>) -> Result { + let gdal_path = normalize_outdb_source_path(path); + let dataset = gdal + .open_ex_with_options( + &gdal_path, + DatasetOptions { + open_flags: GDAL_OF_RASTER | GDAL_OF_READONLY, + ..Default::default() + }, + ) + .map_err(|e| { + exec_datafusion_err!( + "Failed to open raster file '{}'(GDAL path '{}'): {}", + path, + gdal_path, + e + ) + })?; + + let (width, height) = dataset.raster_size(); + + let geotransform = dataset + .geo_transform() + .map_err(|e| exec_datafusion_err!("Failed to get geotransform: {}", e))?; + + let metadata = RasterMetadata { + width: width as u64, + height: height as u64, + upperleft_x: geotransform[0], + upperleft_y: geotransform[3], + scale_x: geotransform[1], + scale_y: geotransform[5], + skew_x: geotransform[2], + skew_y: geotransform[4], + }; + + let crs = dataset + .spatial_ref() + .ok() + .and_then(|sr: SpatialRef| sr.to_projjson().ok()); + + let mut builder 
= RasterBuilder::new(1); + builder + .start_raster(&metadata, crs.as_deref()) + .map_err(|e| exec_datafusion_err!("Failed to start raster: {}", e))?; + + let band_count = dataset.raster_count(); + for band_idx in 1..=band_count { + let band = dataset + .rasterband(band_idx) + .map_err(|e| exec_datafusion_err!("Failed to get band {}: {}", band_idx, e))?; + + let gdal_type = band.band_type(); + let band_data_type = gdal_to_band_data_type(gdal_type) + .map_err(|_| exec_datafusion_err!("Unsupported band data type: {:?}", gdal_type))?; + + let nodata_bytes = band + .no_data_value() + .map(|no_data| nodata_f64_to_bytes(no_data, &band_data_type)); + + let band_metadata = BandMetadata { + nodata_value: nodata_bytes, + storage_type: StorageType::OutDbRef, + datatype: band_data_type, + outdb_url: Some(path.to_string()), + outdb_band_id: Some(band_idx as u32), + }; + + builder + .start_band(band_metadata) + .map_err(|e| exec_datafusion_err!("Failed to start band: {}", e))?; + + // For out-db rasters, we don't store the actual band data + // but the schema requires the `data` field to be non-null. + // Use an empty (0-length) value as a placeholder; readers must consult + // `storage_type` + `outdb_*` metadata to load the actual pixels. 
+ builder.band_data_writer().append_value([]); + + builder + .finish_band() + .map_err(|e| exec_datafusion_err!("Failed to finish band: {}", e))?; + } + + builder + .finish_raster() + .map_err(|e| exec_datafusion_err!("Failed to finish raster: {}", e))?; + + builder + .finish() + .map_err(|e| exec_datafusion_err!("Failed to build raster: {}", e)) + } +} + +impl SedonaScalarKernel for RsFromPath { + fn return_type(&self, args: &[SedonaType]) -> Result> { + let matchers = if self.with_params { + vec![ + ArgMatcher::is_string(), // path + ArgMatcher::is_string(), // params + ] + } else { + vec![ArgMatcher::is_string()] // path only + }; + + let matcher = ArgMatcher::new(matchers, RASTER); + matcher.match_args(args) + } + + fn invoke_batch( + &self, + arg_types: &[SedonaType], + args: &[ColumnarValue], + ) -> Result { + self.invoke_batch_from_args(arg_types, args, &SedonaType::Arrow(DataType::Null), 0, None) + } + + fn invoke_batch_from_args( + &self, + _arg_types: &[SedonaType], + args: &[ColumnarValue], + _return_type: &SedonaType, + _num_rows: usize, + config_options: Option<&ConfigOptions>, + ) -> Result { + with_gdal(|gdal| { + configure_thread_local_options(gdal, config_options)?; + + let (paths, params_opt) = match &args[0] { + ColumnarValue::Scalar(scalar) => { + let path = scalar.to_array().map_err(|e| { + exec_datafusion_err!("Failed to convert scalar to array: {}", e) + })?; + let params = if self.with_params { + match &args[1] { + ColumnarValue::Scalar(s) => Some(s.to_array().map_err(|e| { + exec_datafusion_err!("Failed to convert params scalar: {}", e) + })?), + ColumnarValue::Array(a) => Some(a.clone()), + } + } else { + None + }; + (path, params) + } + ColumnarValue::Array(array) => { + let params = if self.with_params { + match &args[1] { + ColumnarValue::Scalar(s) => Some(s.to_array().map_err(|e| { + exec_datafusion_err!("Failed to convert params scalar: {}", e) + })?), + ColumnarValue::Array(a) => Some(a.clone()), + } + } else { + None + }; + 
(array.clone(), params) + } + }; + + let paths = cast(&paths, &DataType::Utf8)?; + let path_array = as_string_array(&paths)?; + + let params_casted = params_opt.map(|p| cast(&p, &DataType::Utf8)).transpose()?; + let params_array = params_casted + .as_ref() + .map(|p| as_string_array(p)) + .transpose()?; + + let len = path_array.len(); + + if len == 0 { + let builder = RasterBuilder::new(0); + let result = builder + .finish() + .map_err(|e| exec_datafusion_err!("Failed to build empty raster: {}", e))?; + return Ok(ColumnarValue::Array(Arc::new(result))); + } + + let mut combined_arrays: Vec = Vec::with_capacity(len); + + for i in 0..len { + if path_array.is_null(i) { + let mut builder = RasterBuilder::new(1); + builder + .append_null() + .map_err(|e| exec_datafusion_err!("Failed to append null: {}", e))?; + let result = builder + .finish() + .map_err(|e| exec_datafusion_err!("Failed to build null raster: {}", e))?; + combined_arrays.push(Arc::new(result)); + } else { + let path = path_array.value(i); + let params = params_array.and_then(|pa| { + if pa.is_null(i) { + None + } else { + Some(pa.value(i)) + } + }); + + let raster = Self::load_outdb_raster(gdal, path, params)?; + combined_arrays.push(Arc::new(raster)); + } + } + + let refs: Vec<&dyn Array> = combined_arrays.iter().map(|a| a.as_ref()).collect(); + let result = arrow::compute::concat(&refs) + .map_err(|e| exec_datafusion_err!("Failed to concatenate rasters: {}", e))?; + + match &args[0] { + ColumnarValue::Scalar(_) => { + let scalar = datafusion_common::ScalarValue::try_from_array(&result, 0)?; + Ok(ColumnarValue::Scalar(scalar)) + } + ColumnarValue::Array(_) => Ok(ColumnarValue::Array(result)), + } + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::gdal_common::with_gdal; + + #[test] + fn test_parse_params() { + let params = "key1=value1;key2=value2"; + let parsed = RsFromPath::parse_params(params); + assert_eq!(parsed.get("key1"), Some(&"value1".to_string())); + 
assert_eq!(parsed.get("key2"), Some(&"value2".to_string())); + + // Empty params + let parsed = RsFromPath::parse_params(""); + assert!(parsed.is_empty()); + + // Single param + let parsed = RsFromPath::parse_params("option=true"); + assert_eq!(parsed.get("option"), Some(&"true".to_string())); + } + + #[test] + fn udf_from_path() { + let udf: datafusion_expr::ScalarUDF = rs_from_path_udf().into(); + assert_eq!(udf.name(), "rs_frompath"); + } + + #[test] + fn test_load_outdb_raster_from_file() { + use sedona_testing::data::test_raster; + + // Load test4.tiff - a simple 10x10 GeoTIFF + let path = test_raster("test4.tiff").expect("test4.tiff should exist"); + + let raster = with_gdal(|gdal| RsFromPath::load_outdb_raster(gdal, &path, None)) + .expect("Should load raster from path"); + + // Verify the StructArray has correct length + assert_eq!(raster.len(), 1); + + // Verify metadata directly from the struct array + use datafusion_common::cast::{ + as_list_array, as_string_array, as_string_view_array, as_struct_array, as_uint32_array, + as_uint64_array, + }; + use sedona_schema::raster::{ + band_indices, band_metadata_indices, metadata_indices, raster_indices, + }; + + let metadata_struct = as_struct_array(raster.column(raster_indices::METADATA)).unwrap(); + let width = as_uint64_array(metadata_struct.column(metadata_indices::WIDTH)) + .unwrap() + .value(0); + let height = as_uint64_array(metadata_struct.column(metadata_indices::HEIGHT)) + .unwrap() + .value(0); + + assert_eq!(width, 10); + assert_eq!(height, 10); + + // Check CRS + let crs = as_string_view_array(raster.column(raster_indices::CRS)).unwrap(); + assert!(!crs.is_null(0)); + + // Verify bands - check that it's out-db via the metadata + let bands_list = as_list_array(raster.column(raster_indices::BANDS)).unwrap(); + let bands_struct = as_struct_array(bands_list.values()).unwrap(); + let band_metadata_struct = + as_struct_array(bands_struct.column(band_indices::METADATA)).unwrap(); + + // Check outdb_url is 
set (meaning it's an out-db raster) + let outdb_url = + as_string_array(band_metadata_struct.column(band_metadata_indices::OUTDB_URL)).unwrap(); + assert!( + !outdb_url.is_null(0), + "Out-db raster should have outdb_url set" + ); + assert!(outdb_url.value(0).contains("test4.tiff")); + + // Check storage type is OutDbRef + let storage_type = + as_uint32_array(band_metadata_struct.column(band_metadata_indices::STORAGE_TYPE)) + .unwrap(); + assert_eq!( + storage_type.value(0), + sedona_schema::raster::StorageType::OutDbRef as u32 + ); + } + + #[test] + fn test_invoke_rs_from_path() { + use arrow_array::StringArray; + use datafusion_common::cast::{as_struct_array, as_uint64_array}; + use sedona_expr::scalar_udf::SedonaScalarKernel; + use sedona_schema::raster::{metadata_indices, raster_indices}; + use sedona_testing::data::test_raster; + + let path = test_raster("test4.tiff").expect("test4.tiff should exist"); + + // Create input array with the path + let paths = Arc::new(StringArray::from(vec![path.as_str()])); + let input = ColumnarValue::Array(paths); + + // Invoke the UDF + let kernel = RsFromPath { with_params: false }; + let result = kernel + .invoke_batch_from_args(&[], &[input], &SedonaType::Arrow(DataType::Null), 0, None) + .expect("Should invoke successfully"); + + // Verify result + match result { + ColumnarValue::Array(arr) => { + let struct_arr = as_struct_array(&arr).unwrap(); + assert_eq!(struct_arr.len(), 1); + + // Check dimensions from metadata + let metadata_struct = + as_struct_array(struct_arr.column(raster_indices::METADATA)).unwrap(); + let width = as_uint64_array(metadata_struct.column(metadata_indices::WIDTH)) + .unwrap() + .value(0); + let height = as_uint64_array(metadata_struct.column(metadata_indices::HEIGHT)) + .unwrap() + .value(0); + + assert_eq!(width, 10); + assert_eq!(height, 10); + } + _ => panic!("Expected array result"), + } + } +} diff --git a/rust/sedona-raster-gdal/src/rs_geotiff_tiles.rs 
b/rust/sedona-raster-gdal/src/rs_geotiff_tiles.rs new file mode 100644 index 000000000..b3d4c07c9 --- /dev/null +++ b/rust/sedona-raster-gdal/src/rs_geotiff_tiles.rs @@ -0,0 +1,621 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! rs_geotiff_tiles UDTF +//! +//! Read a GeoTIFF file or directory of GeoTIFF files as a table where each row is one +//! internal tile (block) of the source dataset. +//! +//! Output schema: +//! - path: string +//! - x: tile x index (0-based) +//! - y: tile y index (0-based) +//! 
- rast: out-db raster pointing at the source GeoTIFF band(s) + +use std::any::Any; +use std::path::Path; +use std::sync::Arc; + +use arrow_array::{builder::StringBuilder, builder::UInt32Builder, ArrayRef, RecordBatch}; +use arrow_schema::{DataType, Field, Schema, SchemaRef}; +use async_trait::async_trait; +use datafusion::catalog::TableFunctionImpl; +use datafusion::execution::context::TaskContext; +use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType}; +use datafusion::physical_plan::expressions::Column; +use datafusion::physical_plan::projection::ProjectionExec; +use datafusion::physical_plan::stream::RecordBatchStreamAdapter; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PhysicalExpr, +}; +use datafusion::{ + common::{plan_err, Result}, + datasource::TableType, + physical_expr::EquivalenceProperties, + physical_plan::PlanProperties, + prelude::Expr, +}; +use datafusion_common::{exec_datafusion_err, exec_err, DataFusionError, ScalarValue}; +use datafusion_common_runtime::SpawnedTask; +use futures::{StreamExt, TryStreamExt}; +use sedona_gdal::gdal_dyn_bindgen::{VSI_S_IFMT, VSI_S_IFREG}; +use sedona_gdal::spatial_ref::SpatialRef; +use sedona_raster::builder::RasterBuilder; +use sedona_raster::traits::{BandMetadata, RasterMetadata}; +use sedona_schema::raster::StorageType; + +use crate::gdal_common::{ + convert_gdal_err, gdal_to_band_data_type, nodata_f64_to_bytes, normalize_outdb_source_path, + open_gdal_dataset, with_gdal, +}; + +/// Create the rs_geotiff_tiles table function +pub fn rs_geotiff_tiles_udtf() -> Arc { + Arc::new(RsGeoTiffTilesFunction {}) +} + +#[derive(Debug)] +pub struct RsGeoTiffTilesFunction {} + +impl TableFunctionImpl for RsGeoTiffTilesFunction { + fn call(&self, exprs: &[Expr]) -> Result> { + if exprs.is_empty() || exprs.len() > 2 { + return plan_err!( + "rs_geotiff_tiles() expected 1 or 2 arguments (path[, recursive]) but got {}", + exprs.len() + ); + } + + let dir = match 
&exprs[0] { + Expr::Literal(ScalarValue::Utf8(Some(s)), _) => s.clone(), + Expr::Literal(ScalarValue::Utf8View(Some(s)), _) => s.to_string(), + Expr::Literal(ScalarValue::LargeUtf8(Some(s)), _) => s.clone(), + other => { + return plan_err!("rs_geotiff_tiles() expected literal string path but got {other}") + } + }; + + let recursive = if exprs.len() == 2 { + match &exprs[1] { + Expr::Literal(ScalarValue::Boolean(Some(v)), _) => *v, + other => { + return plan_err!( + "rs_geotiff_tiles() expected literal boolean recursive but got {other}" + ) + } + } + } else { + false + }; + + Ok(Arc::new(GeoTiffTilesProvider::try_new(dir, recursive)?)) + } +} + +#[derive(Debug)] +pub struct GeoTiffTilesProvider { + dir: String, + recursive: bool, + schema: SchemaRef, +} + +impl GeoTiffTilesProvider { + pub fn try_new(dir: String, recursive: bool) -> Result { + let rast_field = sedona_schema::datatypes::RASTER + .to_storage_field("rast", false) + .map_err(|e| exec_datafusion_err!("{e}"))?; + let schema = Schema::new(vec![ + Field::new("path", DataType::Utf8, false), + Field::new("x", DataType::UInt32, false), + Field::new("y", DataType::UInt32, false), + rast_field, + ]); + + Ok(Self { + dir, + recursive, + schema: Arc::new(schema), + }) + } +} + +#[async_trait] +impl datafusion::catalog::TableProvider for GeoTiffTilesProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + + fn table_type(&self) -> TableType { + TableType::View + } + + async fn scan( + &self, + _state: &dyn datafusion::catalog::Session, + projection: Option<&Vec>, + _filters: &[Expr], + _limit: Option, + ) -> Result> { + let exec = Arc::new(GeoTiffTilesExec::new( + self.dir.clone(), + self.recursive, + self.schema.clone(), + )); + + if let Some(projection) = projection { + let schema = self.schema(); + let exprs: Vec<_> = projection + .iter() + .map(|index| -> (Arc, String) { + let name = schema.field(*index).name(); + (Arc::new(Column::new(name, 
*index)), name.clone()) + }) + .collect(); + Ok(Arc::new(ProjectionExec::try_new(exprs, exec)?)) + } else { + Ok(exec) + } + } +} + +#[derive(Debug)] +struct GeoTiffTilesExec { + dir: String, + recursive: bool, + schema: SchemaRef, + properties: PlanProperties, +} + +impl GeoTiffTilesExec { + fn new(dir: String, recursive: bool, schema: SchemaRef) -> Self { + let properties = PlanProperties::new( + EquivalenceProperties::new(schema.clone()), + Partitioning::UnknownPartitioning(1), + EmissionType::Incremental, + Boundedness::Bounded, + ); + + Self { + dir, + recursive, + schema, + properties, + } + } +} + +impl DisplayAs for GeoTiffTilesExec { + fn fmt_as(&self, _t: DisplayFormatType, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!( + f, + "GeoTiffTilesExec: path='{}', recursive={}", + self.dir, self.recursive + ) + } +} + +impl ExecutionPlan for GeoTiffTilesExec { + fn name(&self) -> &str { + "GeoTiffTilesExec" + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + + fn properties(&self) -> &PlanProperties { + &self.properties + } + + fn children(&self) -> Vec<&Arc> { + Vec::new() + } + + fn with_new_children( + self: Arc, + _children: Vec>, + ) -> Result> { + Ok(self) + } + + fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> Result { + let schema_worker = self.schema.clone(); + let schema_empty = self.schema.clone(); + let schema_adapter = self.schema.clone(); + let dir = self.dir.clone(); + let recursive = self.recursive; + + // Collect paths synchronously + let paths = list_geotiffs(&dir, recursive)?; + + // Create a stream that processes files in parallel (bounded) + let stream = futures::stream::iter(paths) + .map(move |path| { + let schema = schema_worker.clone(); + SpawnedTask::spawn_blocking(move || build_batch_for_file(path, schema)) + }) + .buffered(4) // Run up to 4 concurrent GDAL opens/reads + .map(move |res| match res { + Ok(Ok(Some(batch))) => Ok(batch), + 
Ok(Ok(None)) => Ok(RecordBatch::new_empty(schema_empty.clone())), + Ok(Err(e)) => Err(e), + Err(e) => Err(exec_datafusion_err!("Task failed: {e}")), + }) + .try_filter(|batch| futures::future::ready(batch.num_rows() > 0)); + + Ok(Box::pin(RecordBatchStreamAdapter::new( + schema_adapter, + Box::pin(stream), + ))) + } +} + +pub(crate) fn build_batch_for_file( + path: impl AsRef, + schema: SchemaRef, +) -> Result> { + let path_str = path.as_ref().to_string_lossy().to_string(); + with_gdal(|gdal| { + let ds = open_gdal_dataset(gdal, &path_str, None) + .map_err(|e| exec_datafusion_err!("Failed to open GeoTIFF {path_str}: {e}"))?; + let (width, height) = ds.raster_size(); + + let band_count = ds.raster_count(); + if band_count == 0 { + return Ok(None); + } + + let band1 = ds + .rasterband(1) + .map_err(|e| exec_datafusion_err!("Failed to get band 1 for {path_str}: {e}"))?; + let (block_x, block_y) = band1.block_size(); + let block_x = block_x.max(1) as u32; + let block_y = block_y.max(1) as u32; + + let tiles_x = div_ceil_u32(width as u32, block_x); + let tiles_y = div_ceil_u32(height as u32, block_y); + + let geotransform = ds + .geo_transform() + .map_err(|e| exec_datafusion_err!("Failed to get geotransform for {path_str}: {e}"))?; + + let base_metadata = RasterMetadata { + width: width as u64, + height: height as u64, + upperleft_x: geotransform[0], + upperleft_y: geotransform[3], + scale_x: geotransform[1], + scale_y: geotransform[5], + skew_x: geotransform[2], + skew_y: geotransform[4], + }; + + let crs = ds + .spatial_ref() + .ok() + .and_then(|sr: SpatialRef| sr.to_projjson().ok()); + + let total_tiles = (tiles_x * tiles_y) as usize; + let mut path_builder = + StringBuilder::with_capacity(total_tiles, total_tiles * path_str.len()); + let mut x_builder = UInt32Builder::with_capacity(total_tiles); + let mut y_builder = UInt32Builder::with_capacity(total_tiles); + let mut rast_builder = RasterBuilder::new(total_tiles); + + for tile_y in 0..tiles_y { + for tile_x in 
0..tiles_x { + let px = tile_x * block_x; + let py = tile_y * block_y; + + let tw = (width as u32).saturating_sub(px).min(block_x); + let th = (height as u32).saturating_sub(py).min(block_y); + if tw == 0 || th == 0 { + continue; + } + + let tile_ulx = base_metadata.upperleft_x + + (px as f64) * base_metadata.scale_x + + (py as f64) * base_metadata.skew_x; + let tile_uly = base_metadata.upperleft_y + + (px as f64) * base_metadata.skew_y + + (py as f64) * base_metadata.scale_y; + + let tile_metadata = RasterMetadata { + width: tw as u64, + height: th as u64, + upperleft_x: tile_ulx, + upperleft_y: tile_uly, + scale_x: base_metadata.scale_x, + scale_y: base_metadata.scale_y, + skew_x: base_metadata.skew_x, + skew_y: base_metadata.skew_y, + }; + + path_builder.append_value(&path_str); + x_builder.append_value(tile_x); + y_builder.append_value(tile_y); + + rast_builder + .start_raster(&tile_metadata, crs.as_deref()) + .map_err(|e| { + exec_datafusion_err!( + "Failed to start raster for {path_str} tile ({tile_x},{tile_y}): {e}" + ) + })?; + + for band_idx in 1..=band_count { + let band = ds.rasterband(band_idx).map_err(|e| { + exec_datafusion_err!("Failed to get band {band_idx} for {path_str}: {e}") + })?; + + let gdal_type = band.band_type(); + let band_data_type = gdal_to_band_data_type(gdal_type).map_err(|_| { + exec_datafusion_err!( + "Unsupported band data type {gdal_type:?} for {path_str} band {band_idx}" + ) + })?; + + let nodata_bytes = band + .no_data_value() + .map(|v| nodata_f64_to_bytes(v, &band_data_type)); + + let band_metadata = BandMetadata { + nodata_value: nodata_bytes, + storage_type: StorageType::OutDbRef, + datatype: band_data_type, + outdb_url: Some(path_str.clone()), + outdb_band_id: Some(band_idx as u32), + }; + + rast_builder.start_band(band_metadata).map_err(|e| { + exec_datafusion_err!("Failed to start band {band_idx} for {path_str}: {e}") + })?; + + rast_builder.band_data_writer().append_value([]); + + rast_builder.finish_band().map_err(|e| { 
+ exec_datafusion_err!("Failed to finish band {band_idx} for {path_str}: {e}") + })?; + } + + rast_builder.finish_raster().map_err(|e| { + exec_datafusion_err!( + "Failed to finish raster for {path_str} tile ({tile_x},{tile_y}): {e}" + ) + })?; + } + } + + let rast_array: ArrayRef = Arc::new( + rast_builder + .finish() + .map_err(|e| exec_datafusion_err!("Failed to build rasters: {e}"))?, + ); + let path_array: ArrayRef = Arc::new(path_builder.finish()); + let x_array: ArrayRef = Arc::new(x_builder.finish()); + let y_array: ArrayRef = Arc::new(y_builder.finish()); + + let batch = RecordBatch::try_new(schema, vec![path_array, x_array, y_array, rast_array]) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + + Ok(Some(batch)) + }) +} + +fn list_geotiffs(path: &str, recursive: bool) -> Result> { + let normalized_path = normalize_outdb_source_path(path); + + if with_gdal(|gdal| Ok(open_gdal_dataset(gdal, &normalized_path, None).is_ok()))? { + if !is_geotiff_path_str(&normalized_path) { + return exec_err!("rs_geotiff_tiles(): path is not a GeoTIFF file: {path}"); + } + return Ok(vec![normalized_path]); + } + + let recurse_depth = if recursive { -1 } else { 0 }; + let list_result = with_gdal(|gdal| { + let separator = gdal + .vsi_directory_separator(&normalized_path) + .map_err(convert_gdal_err)?; + let mut dir = gdal + .open_vsi_dir(&normalized_path, recurse_depth, None) + .map_err(convert_gdal_err)?; + + let mut out = Vec::new(); + for entry in &mut dir { + let Some(mode) = entry.mode else { + continue; + }; + + // Ignore this entry if it is not a regular file + if (mode & VSI_S_IFMT) != VSI_S_IFREG { + continue; + } + + let child_path = join_vsi_path(&normalized_path, &separator, &entry.name); + if is_geotiff_path_str(&child_path) { + out.push(child_path); + } + } + + out.sort(); + Ok(out) + }); + + match list_result { + Ok(paths) => Ok(paths), + Err(_) if is_geotiff_path_str(&normalized_path) => { + exec_err!("rs_geotiff_tiles(): failed to open GeoTIFF file: 
/// Joins a VSI directory path and a child name, inserting `separator` only when
/// `base` does not already end with it.
fn join_vsi_path(base: &str, separator: &str, child_name: &str) -> String {
    let glue = if base.ends_with(separator) {
        ""
    } else {
        separator
    };
    [base, glue, child_name].concat()
}

/// Reports whether the file name in `path` has a `.tif` or `.tiff` extension,
/// compared case-insensitively.
///
/// Anything from the first `#` and then from the first `?` is dropped before the
/// file name is taken as the text after the last `/` or `\`.
fn is_geotiff_path_str(path: &str) -> bool {
    let trimmed = path.split_once('#').map_or(path, |(head, _)| head);
    let trimmed = trimmed.split_once('?').map_or(trimmed, |(head, _)| head);
    // Both separators are single-byte, so `pos + 1` is always a char boundary.
    let leaf = trimmed
        .rfind(['/', '\\'])
        .map_or(trimmed, |pos| &trimmed[pos + 1..]);

    leaf.rsplit_once('.').is_some_and(|(_, ext)| {
        ["tif", "tiff"]
            .iter()
            .any(|candidate| ext.eq_ignore_ascii_case(candidate))
    })
}

/// Divides `n` by `d` rounding up; a zero divisor yields `0` instead of panicking.
fn div_ceil_u32(n: u32, d: u32) -> u32 {
    match d {
        0 => 0,
        _ => n.div_ceil(d),
    }
}
list_geotiffs(file_path.to_str().unwrap(), true).unwrap(); + assert_eq!(files.len(), 1); + assert_eq!(files[0], file_path.to_string_lossy().to_string()); + } + + #[test] + fn list_geotiffs_file_input_non_tiff_errors() { + let tmp = tempdir().unwrap(); + let base = tmp.path(); + let file_path = base.join("single.txt"); + std::fs::write(&file_path, b"not a real tiff").unwrap(); + + let err = list_geotiffs(file_path.to_str().unwrap(), false).unwrap_err(); + assert!(err + .to_string() + .contains("rs_geotiff_tiles(): path is not a GeoTIFF file")); + } + + #[test] + fn helper_join_vsi_path_and_extension_filtering() { + assert_eq!( + join_vsi_path("/vsis3/bucket/prefix", "/", "x.tif"), + "/vsis3/bucket/prefix/x.tif" + ); + assert_eq!( + join_vsi_path("/vsis3/bucket/prefix/", "/", "x.tif"), + "/vsis3/bucket/prefix/x.tif" + ); + + assert!(is_geotiff_path_str("/tmp/a.tif")); + assert!(is_geotiff_path_str("/tmp/a.TIFF")); + assert!(is_geotiff_path_str("https://host/data.tif?token=abc#f")); + assert!(!is_geotiff_path_str("/tmp/a.txt")); + assert!(!is_geotiff_path_str("/tmp/a")); + } + + #[tokio::test] + async fn provider_builds_rows_for_test_raster() { + let tmp = tempdir().unwrap(); + let base = tmp.path(); + + let src = test_raster("test4.tiff").unwrap(); + let dst = base.join("test4.tiff"); + std::fs::copy(&src, &dst).unwrap(); + + let provider = GeoTiffTilesProvider::try_new(base.to_string_lossy().to_string(), false) + .expect("provider created"); + + // Directly call the batch builder to validate schema + non-empty output. + let batch = build_batch_for_file(dst, provider.schema()) + .expect("build success") + .expect("batch present"); + + assert_eq!(batch.schema().fields().len(), 4); + assert_eq!(batch.num_columns(), 4); + // For a 10x10 raster, any reasonable tiling should produce at least one tile. 
+ assert!(batch.num_rows() >= 1); + } + + #[test] + fn rast_field_has_raster_metadata() { + let provider = GeoTiffTilesProvider::try_new("/tmp".to_string(), false).unwrap(); + let schema = provider.schema(); + let rast_field = schema.field_with_name("rast").unwrap(); + let sedona_type = sedona_schema::datatypes::SedonaType::from_storage_field(rast_field) + .expect("sedona type"); + assert_eq!(sedona_type, sedona_schema::datatypes::RASTER); + assert_eq!( + rast_field + .metadata() + .get("ARROW:extension:name") + .map(|s| s.as_str()), + Some("sedona.raster") + ); + } +} diff --git a/rust/sedona-raster-gdal/src/rs_map_algebra.rs b/rust/sedona-raster-gdal/src/rs_map_algebra.rs new file mode 100644 index 000000000..e69877565 --- /dev/null +++ b/rust/sedona-raster-gdal/src/rs_map_algebra.rs @@ -0,0 +1,796 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! RS_MapAlgebra UDF - Apply a map algebra expression on raster(s) +//! +//! This function evaluates a mathematical expression for each pixel in the input raster(s) +//! and produces an output raster. The expression can reference input raster bands using +//! `rast[band_index]` syntax (or `rast0[band_index]` and `rast1[band_index]` for two-raster +//! operations). 
+//! +//! # Expression Syntax +//! +//! The expression evaluator supports standard mathematical operations: +//! - Arithmetic: `+`, `-`, `*`, `/`, `%` (modulo), `^` (power) +//! - Comparison: `==`, `!=`, `<`, `<=`, `>`, `>=` +//! - Logic: `&&`, `||`, `!` +//! - Functions: `min`, `max`, `abs`, `sqrt`, `sin`, `cos`, `tan`, `ln`, `log`, `exp`, `floor`, `ceil`, `round` +//! - Conditionals: `if(condition, true_value, false_value)` +//! +//! # Variables +//! +//! For single-raster operations: +//! - `rast` or `rast0`, `rast1`, ..., `rastN`: Band values (where N is band index, 0-based) +//! +//! For two-raster operations: +//! - `rast0_0`, `rast0_1`, ...: First raster's band values +//! - `rast1_0`, `rast1_1`, ...: Second raster's band values +//! +//! Additional variables: +//! - `x`: Current pixel column (0-based) +//! - `y`: Current pixel row (0-based) +//! - `width`: Raster width +//! - `height`: Raster height + +use std::sync::Arc; + +use datafusion_common::cast::{as_float64_array, as_int32_array, as_string_array}; +use datafusion_common::config::ConfigOptions; +use datafusion_common::error::Result; +use datafusion_common::{exec_datafusion_err, exec_err, ScalarValue}; +use datafusion_expr::{ColumnarValue, Volatility}; +use evalexpr::{build_operator_tree, ContextWithMutableVariables, HashMapContext, Value}; +use sedona_gdal::gdal::Gdal; + +use arrow_schema::DataType; +use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF}; +use sedona_raster::array::RasterRefImpl; +use sedona_raster::builder::RasterBuilder; +use sedona_raster::traits::{BandMetadata, RasterMetadata, RasterRef}; +use sedona_raster_functions::RasterExecutor; +use sedona_schema::datatypes::{SedonaType, RASTER}; +use sedona_schema::matchers::ArgMatcher; +use sedona_schema::raster::{BandDataType, StorageType}; + +use crate::gdal_common::nodata_f64_to_bytes; +use crate::gdal_common::with_gdal; +use crate::gdal_dataset_provider::configure_thread_local_options; +use 
crate::raster_band_reader::RasterBandReader; + +/// RS_MapAlgebra() scalar UDF implementation +/// +/// Apply a map algebra expression on raster(s) +pub fn rs_map_algebra_udf() -> SedonaScalarUDF { + SedonaScalarUDF::new( + "rs_mapalgebra", + vec![ + // Single raster variants + Arc::new(RsMapAlgebra { + two_raster: false, + with_nodata: false, + with_num_bands: false, + }), + Arc::new(RsMapAlgebra { + two_raster: false, + with_nodata: true, + with_num_bands: false, + }), + Arc::new(RsMapAlgebra { + two_raster: false, + with_nodata: true, + with_num_bands: true, + }), + // Two raster variants + Arc::new(RsMapAlgebra { + two_raster: true, + with_nodata: false, + with_num_bands: false, + }), + Arc::new(RsMapAlgebra { + two_raster: true, + with_nodata: true, + with_num_bands: false, + }), + Arc::new(RsMapAlgebra { + two_raster: true, + with_nodata: true, + with_num_bands: true, + }), + ], + Volatility::Immutable, + ) +} + +/// Kernel implementation for RS_MapAlgebra +#[derive(Debug)] +struct RsMapAlgebra { + two_raster: bool, + with_nodata: bool, + with_num_bands: bool, +} + +impl SedonaScalarKernel for RsMapAlgebra { + fn return_type(&self, args: &[SedonaType]) -> Result> { + let matchers = if self.two_raster { + if self.with_num_bands { + vec![ + ArgMatcher::is_raster(), + ArgMatcher::is_raster(), + ArgMatcher::is_string(), + ArgMatcher::is_string(), + ArgMatcher::is_numeric(), + ArgMatcher::is_integer(), + ] + } else if self.with_nodata { + vec![ + ArgMatcher::is_raster(), + ArgMatcher::is_raster(), + ArgMatcher::is_string(), + ArgMatcher::is_string(), + ArgMatcher::is_numeric(), + ] + } else { + vec![ + ArgMatcher::is_raster(), + ArgMatcher::is_raster(), + ArgMatcher::is_string(), + ArgMatcher::is_string(), + ] + } + } else if self.with_num_bands { + vec![ + ArgMatcher::is_raster(), + ArgMatcher::is_string(), + ArgMatcher::is_string(), + ArgMatcher::is_numeric(), + ArgMatcher::is_integer(), + ] + } else if self.with_nodata { + vec![ + ArgMatcher::is_raster(), + 
ArgMatcher::is_string(), + ArgMatcher::is_string(), + ArgMatcher::is_numeric(), + ] + } else { + vec![ + ArgMatcher::is_raster(), + ArgMatcher::is_string(), + ArgMatcher::is_string(), + ] + }; + + let matcher = ArgMatcher::new(matchers, RASTER); + matcher.match_args(args) + } + + fn invoke_batch( + &self, + arg_types: &[SedonaType], + args: &[ColumnarValue], + ) -> Result { + self.invoke_batch_from_args(arg_types, args, &SedonaType::Arrow(DataType::Null), 0, None) + } + + fn invoke_batch_from_args( + &self, + arg_types: &[SedonaType], + args: &[ColumnarValue], + _return_type: &SedonaType, + _num_rows: usize, + config_options: Option<&ConfigOptions>, + ) -> Result { + let executor = RasterExecutor::new(arg_types, args); + let num_iterations = executor.num_iterations(); + + // Parse arguments based on signature + let (pixel_type_idx, script_idx, nodata_idx, num_bands_idx) = if self.two_raster { + if self.with_num_bands { + (2, 3, Some(4), Some(5)) + } else if self.with_nodata { + (2, 3, Some(4), None) + } else { + (2, 3, None, None) + } + } else if self.with_num_bands { + (1, 2, Some(3), Some(4)) + } else if self.with_nodata { + (1, 2, Some(3), None) + } else { + (1, 2, None, None) + }; + + // Convert all non-raster args to arrays upfront via into_array + let pixel_type_array = args[pixel_type_idx] + .clone() + .cast_to(&DataType::Utf8, None)? + .into_array(num_iterations)?; + let pixel_type_array = as_string_array(&pixel_type_array)?; + + let script_array = args[script_idx] + .clone() + .cast_to(&DataType::Utf8, None)? + .into_array(num_iterations)?; + let script_array = as_string_array(&script_array)?; + + let nodata_array = match nodata_idx { + Some(idx) => args[idx] + .clone() + .cast_to(&DataType::Float64, None)? 
+ .into_array(num_iterations)?, + None => ScalarValue::Float64(None).to_array_of_size(num_iterations)?, + }; + let nodata_array = as_float64_array(&nodata_array)?; + + let num_bands_array = match num_bands_idx { + Some(idx) => args[idx] + .clone() + .cast_to(&DataType::Int32, None)? + .into_array(num_iterations)?, + None => ScalarValue::Int32(Some(1)).to_array_of_size(num_iterations)?, + }; + let num_bands_array = as_int32_array(&num_bands_array)?; + + let mut pixel_type_iter = pixel_type_array.iter(); + let mut script_iter = script_array.iter(); + let mut nodata_iter = nodata_array.iter(); + let mut num_bands_iter = num_bands_array.iter(); + + let mut builder = RasterBuilder::new(num_iterations); + + with_gdal(|gdal| { + configure_thread_local_options(gdal, config_options)?; + if self.two_raster { + executor.execute_raster_raster_void(|_i, raster0_opt, raster1_opt| { + let pixel_type_opt = pixel_type_iter.next().unwrap(); + let script_opt = script_iter.next().unwrap(); + let nodata_opt = nodata_iter.next().unwrap(); + let num_bands_opt = num_bands_iter.next().unwrap(); + + let raster0 = match raster0_opt { + Some(r) => r, + None => { + builder.append_null()?; + return Ok(()); + } + }; + let raster1 = match raster1_opt { + Some(r) => r, + None => { + builder.append_null()?; + return Ok(()); + } + }; + + process_map_algebra_row( + &mut builder, + gdal, + raster0, + Some(raster1), + pixel_type_opt, + script_opt, + nodata_opt, + num_bands_opt, + ) + })?; + } else { + executor.execute_raster_void(|_i, raster0_opt| { + let pixel_type_opt = pixel_type_iter.next().unwrap(); + let script_opt = script_iter.next().unwrap(); + let nodata_opt = nodata_iter.next().unwrap(); + let num_bands_opt = num_bands_iter.next().unwrap(); + + let raster0 = match raster0_opt { + Some(r) => r, + None => { + builder.append_null()?; + return Ok(()); + } + }; + + process_map_algebra_row( + &mut builder, + gdal, + raster0, + None, + pixel_type_opt, + script_opt, + nodata_opt, + num_bands_opt, + 
) + })?; + } + + executor.finish(Arc::new(builder.finish()?)) + }) + } +} + +/// Process a single row of map algebra (shared between single- and two-raster paths) +#[allow(clippy::too_many_arguments)] +fn process_map_algebra_row( + builder: &mut RasterBuilder, + gdal: &Gdal, + raster0: &RasterRefImpl<'_>, + raster1: Option<&RasterRefImpl<'_>>, + pixel_type_opt: Option<&str>, + script_opt: Option<&str>, + nodata_opt: Option, + num_bands_opt: Option, +) -> Result<()> { + let pixel_type_str = match pixel_type_opt { + Some(s) => s, + None => { + builder.append_null()?; + return Ok(()); + } + }; + let script = match script_opt { + Some(s) => s, + None => { + builder.append_null()?; + return Ok(()); + } + }; + + let output_type = parse_pixel_type(pixel_type_str)?; + let compiled_expr = build_operator_tree(script) + .map_err(|e| exec_datafusion_err!("Failed to parse expression '{}': {}", script, e))?; + let nodata = nodata_opt; + let num_bands = num_bands_opt.map(|v| v.max(1) as usize).unwrap_or(1); + + match apply_map_algebra( + gdal, + raster0, + raster1, + &compiled_expr, + &output_type, + nodata, + num_bands, + ) { + Ok(result_data) => { + build_result_raster(builder, raster0, &result_data)?; + } + Err(e) => { + eprintln!("RS_MapAlgebra error: {}", e); + builder.append_null()?; + } + } + Ok(()) +} + +/// Output data from map algebra operation +struct MapAlgebraResult { + band_data: Vec>, + band_metadata: Vec, +} + +/// Parse pixel type string to BandDataType +fn parse_pixel_type(pixel_type: &str) -> Result { + match pixel_type.to_uppercase().as_str() { + "B" | "BYTE" | "UINT8" => Ok(BandDataType::UInt8), + "I8" | "INT8" => Ok(BandDataType::Int8), + "S" | "SHORT" | "INT16" => Ok(BandDataType::Int16), + "US" | "USHORT" | "UINT16" => Ok(BandDataType::UInt16), + "I" | "INT" | "INT32" => Ok(BandDataType::Int32), + "UI" | "UINT" | "UINT32" => Ok(BandDataType::UInt32), + "U64" | "UINT64" => Ok(BandDataType::UInt64), + "I64" | "INT64" => Ok(BandDataType::Int64), + "F" | 
"FLOAT" | "FLOAT32" => Ok(BandDataType::Float32), + "D" | "DOUBLE" | "FLOAT64" => Ok(BandDataType::Float64), + _ => exec_err!( + "Unknown pixel type '{}'. Use: B(yte), I8, S(hort), I(nt), U64, I64, F(loat), D(ouble)", + pixel_type + ), + } +} + +/// Apply map algebra expression to raster(s) +fn apply_map_algebra( + gdal: &Gdal, + raster0: &RasterRefImpl<'_>, + raster1: Option<&RasterRefImpl<'_>>, + expr: &evalexpr::Node, + output_type: &BandDataType, + nodata: Option, + num_bands: usize, +) -> Result { + let metadata = raster0.metadata(); + let width = metadata.width() as usize; + let height = metadata.height() as usize; + let pixel_count = width * height; + + // Read all band data from first raster + let bands0 = raster0.bands(); + let mut reader0 = RasterBandReader::new(gdal, raster0); + let band_data0: Vec> = (1..=bands0.len()) + .map(|i| reader0.read_band_f64(i)) + .collect::>>()?; + + // Read all band data from second raster (if present) + let band_data1: Option>> = if let Some(r1) = raster1 { + // Validate dimensions match + let m1 = r1.metadata(); + if m1.width() != metadata.width() || m1.height() != metadata.height() { + return exec_err!("Raster dimensions must match for two-raster map algebra"); + } + let bands1 = r1.bands(); + let mut reader1 = RasterBandReader::new(gdal, r1); + Some( + (1..=bands1.len()) + .map(|i| reader1.read_band_f64(i)) + .collect::>>()?, + ) + } else { + None + }; + + // Allocate output band data + let byte_size = data_type_byte_size(output_type); + let mut output_bands: Vec> = (0..num_bands) + .map(|_| vec![0u8; pixel_count * byte_size]) + .collect(); + + // Determine nodata value + let nodata_val = nodata.unwrap_or(0.0); + + // Create evaluation context + let mut context = HashMapContext::new(); + + // Set constant variables + context + .set_value("width".to_string(), Value::Float(width as f64)) + .map_err(|e| exec_datafusion_err!("Failed to set width: {}", e))?; + context + .set_value("height".to_string(), Value::Float(height as 
f64)) + .map_err(|e| exec_datafusion_err!("Failed to set height: {}", e))?; + + // Evaluate expression for each pixel + for pixel_idx in 0..pixel_count { + let x = pixel_idx % width; + let y = pixel_idx / width; + + // Set position variables + context + .set_value("x".to_string(), Value::Float(x as f64)) + .map_err(|e| exec_datafusion_err!("Failed to set x: {}", e))?; + context + .set_value("y".to_string(), Value::Float(y as f64)) + .map_err(|e| exec_datafusion_err!("Failed to set y: {}", e))?; + + // Set band values for first raster + // Support both rast0, rast1, ... and rast0_0, rast0_1, ... syntax + for (band_idx, band_values) in band_data0.iter().enumerate() { + let value = band_values[pixel_idx]; + // rast0, rast1, rast2, ... (single raster syntax) + context + .set_value(format!("rast{}", band_idx), Value::Float(value)) + .map_err(|e| exec_datafusion_err!("Failed to set rast{}: {}", band_idx, e))?; + // rast0_0, rast0_1, ... (two-raster syntax, first raster) + context + .set_value(format!("rast0_{}", band_idx), Value::Float(value)) + .map_err(|e| exec_datafusion_err!("Failed to set rast0_{}: {}", band_idx, e))?; + } + + // Set band values for second raster (if present) + if let Some(ref bands1) = band_data1 { + for (band_idx, band_values) in bands1.iter().enumerate() { + let value = band_values[pixel_idx]; + context + .set_value(format!("rast1_{}", band_idx), Value::Float(value)) + .map_err(|e| exec_datafusion_err!("Failed to set rast1_{}: {}", band_idx, e))?; + } + } + + // Evaluate expression + let result = expr.eval_with_context(&context).map_err(|e| { + exec_datafusion_err!( + "Expression evaluation failed at pixel ({}, {}): {}", + x, + y, + e + ) + })?; + + // Handle the result based on number of output bands + if num_bands == 1 { + // Single output band - use the result directly + let value = value_to_f64(&result)?; + write_pixel_value(&mut output_bands[0], pixel_idx, output_type, value); + } else { + // Multiple output bands - expect a tuple result or 
set all bands to same value + match result { + Value::Tuple(values) => { + for (band_idx, val) in values.iter().enumerate().take(num_bands) { + let value = value_to_f64(val)?; + write_pixel_value( + &mut output_bands[band_idx], + pixel_idx, + output_type, + value, + ); + } + // If tuple has fewer values than bands, fill remaining with nodata + for band in output_bands.iter_mut().take(num_bands).skip(values.len()) { + write_pixel_value(band, pixel_idx, output_type, nodata_val); + } + } + _ => { + // Single value - apply to first band, nodata for rest + let value = value_to_f64(&result)?; + write_pixel_value(&mut output_bands[0], pixel_idx, output_type, value); + for band in output_bands.iter_mut().take(num_bands).skip(1) { + write_pixel_value(band, pixel_idx, output_type, nodata_val); + } + } + } + } + } + + // Build band metadata + let band_metadata: Vec = (0..num_bands) + .map(|_| BandMetadata { + nodata_value: nodata.map(|v| nodata_f64_to_bytes(v, output_type)), + storage_type: StorageType::InDb, + datatype: *output_type, + outdb_url: None, + outdb_band_id: None, + }) + .collect(); + + Ok(MapAlgebraResult { + band_data: output_bands, + band_metadata, + }) +} + +/// Convert evalexpr Value to f64 +fn value_to_f64(value: &Value) -> Result { + match value { + Value::Float(f) => Ok(*f), + Value::Int(i) => Ok(*i as f64), + Value::Boolean(b) => Ok(if *b { 1.0 } else { 0.0 }), + _ => exec_err!("Cannot convert {:?} to numeric value", value), + } +} + +/// Write a pixel value to band data +fn write_pixel_value(data: &mut [u8], offset: usize, data_type: &BandDataType, value: f64) { + let byte_size = data_type_byte_size(data_type); + let byte_offset = offset * byte_size; + + match data_type { + BandDataType::UInt8 => { + data[byte_offset] = value.clamp(0.0, 255.0) as u8; + } + BandDataType::Int8 => { + let v = value.clamp(i8::MIN as f64, i8::MAX as f64) as i8; + data[byte_offset] = v as u8; + } + BandDataType::UInt16 => { + let v = value.clamp(0.0, u16::MAX as f64) as u16; + 
data[byte_offset..byte_offset + 2].copy_from_slice(&v.to_le_bytes()); + } + BandDataType::Int16 => { + let v = value.clamp(i16::MIN as f64, i16::MAX as f64) as i16; + data[byte_offset..byte_offset + 2].copy_from_slice(&v.to_le_bytes()); + } + BandDataType::UInt32 => { + let v = value.clamp(0.0, u32::MAX as f64) as u32; + data[byte_offset..byte_offset + 4].copy_from_slice(&v.to_le_bytes()); + } + BandDataType::Int32 => { + let v = value.clamp(i32::MIN as f64, i32::MAX as f64) as i32; + data[byte_offset..byte_offset + 4].copy_from_slice(&v.to_le_bytes()); + } + BandDataType::UInt64 => { + let v = value.clamp(0.0, u64::MAX as f64) as u64; + data[byte_offset..byte_offset + 8].copy_from_slice(&v.to_le_bytes()); + } + BandDataType::Int64 => { + let v = value.clamp(i64::MIN as f64, i64::MAX as f64) as i64; + data[byte_offset..byte_offset + 8].copy_from_slice(&v.to_le_bytes()); + } + BandDataType::Float32 => { + let v = value as f32; + data[byte_offset..byte_offset + 4].copy_from_slice(&v.to_le_bytes()); + } + BandDataType::Float64 => { + data[byte_offset..byte_offset + 8].copy_from_slice(&value.to_le_bytes()); + } + } +} + +/// Get byte size of data type +fn data_type_byte_size(data_type: &BandDataType) -> usize { + match data_type { + BandDataType::UInt8 => 1, + BandDataType::Int8 => 1, + BandDataType::UInt16 | BandDataType::Int16 => 2, + BandDataType::UInt32 | BandDataType::Int32 | BandDataType::Float32 => 4, + BandDataType::UInt64 | BandDataType::Int64 => 8, + BandDataType::Float64 => 8, + } +} + +/// Build result raster using RasterBuilder +fn build_result_raster( + builder: &mut RasterBuilder, + original_raster: &RasterRefImpl<'_>, + result: &MapAlgebraResult, +) -> Result<()> { + let original_metadata = original_raster.metadata(); + + let metadata = RasterMetadata { + width: original_metadata.width(), + height: original_metadata.height(), + upperleft_x: original_metadata.upper_left_x(), + upperleft_y: original_metadata.upper_left_y(), + scale_x: 
original_metadata.scale_x(), + scale_y: original_metadata.scale_y(), + skew_x: original_metadata.skew_x(), + skew_y: original_metadata.skew_y(), + }; + + builder + .start_raster(&metadata, original_raster.crs()) + .map_err(|e| exec_datafusion_err!("Failed to start raster: {}", e))?; + + for (band_data, band_metadata) in result.band_data.iter().zip(result.band_metadata.iter()) { + builder + .start_band(band_metadata.clone()) + .map_err(|e| exec_datafusion_err!("Failed to start band: {}", e))?; + builder.band_data_writer().append_value(band_data); + builder + .finish_band() + .map_err(|e| exec_datafusion_err!("Failed to finish band: {}", e))?; + } + + builder + .finish_raster() + .map_err(|e| exec_datafusion_err!("Failed to finish raster: {}", e))?; + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use sedona_raster::array::RasterStructArray; + + #[test] + fn test_parse_pixel_type() { + assert_eq!(parse_pixel_type("B").unwrap(), BandDataType::UInt8); + assert_eq!(parse_pixel_type("byte").unwrap(), BandDataType::UInt8); + assert_eq!(parse_pixel_type("I8").unwrap(), BandDataType::Int8); + assert_eq!(parse_pixel_type("S").unwrap(), BandDataType::Int16); + assert_eq!(parse_pixel_type("I").unwrap(), BandDataType::Int32); + assert_eq!(parse_pixel_type("U64").unwrap(), BandDataType::UInt64); + assert_eq!(parse_pixel_type("I64").unwrap(), BandDataType::Int64); + assert_eq!(parse_pixel_type("F").unwrap(), BandDataType::Float32); + assert_eq!(parse_pixel_type("D").unwrap(), BandDataType::Float64); + assert_eq!(parse_pixel_type("FLOAT64").unwrap(), BandDataType::Float64); + assert!(parse_pixel_type("X").is_err()); + } + + #[test] + fn test_value_to_f64() { + let pi = std::f64::consts::PI; + assert!((value_to_f64(&Value::Float(pi)).unwrap() - pi).abs() < f64::EPSILON); + assert_eq!(value_to_f64(&Value::Int(42)).unwrap(), 42.0); + assert_eq!(value_to_f64(&Value::Boolean(true)).unwrap(), 1.0); + assert_eq!(value_to_f64(&Value::Boolean(false)).unwrap(), 0.0); + } + + 
#[test]
fn test_write_pixel_value() {
    // UInt8: an in-range value is stored unchanged.
    let mut data = vec![0u8; 8];
    write_pixel_value(&mut data, 0, &BandDataType::UInt8, 128.0);
    assert_eq!(data[0], 128);

    // UInt8: out-of-range values saturate at the type bounds, and a non-zero
    // pixel offset addresses the matching byte.
    write_pixel_value(&mut data, 1, &BandDataType::UInt8, 300.0);
    write_pixel_value(&mut data, 2, &BandDataType::UInt8, -5.0);
    assert_eq!(&data[..3], &[128u8, 255, 0]);

    // Int16: the pixel offset is scaled by the element size and the value is
    // stored little-endian (-2 == 0xFFFE).
    let mut data16 = vec![0u8; 4];
    write_pixel_value(&mut data16, 1, &BandDataType::Int16, -2.0);
    assert_eq!(data16, [0u8, 0, 0xFE, 0xFF]);

    // Float64: the value round-trips bit-for-bit through its byte encoding.
    let mut data64 = vec![0u8; 8];
    let pi = std::f64::consts::PI;
    write_pixel_value(&mut data64, 0, &BandDataType::Float64, pi);
    let mut raw = [0u8; 8];
    raw.copy_from_slice(&data64);
    assert_eq!(f64::from_le_bytes(raw).to_bits(), pi.to_bits());
}
#[test]
fn test_map_algebra_multi_band_output() {
    let test_file = sedona_testing::data::test_raster("test4.tiff").unwrap();
    let result = with_gdal(|gdal| {
        let raster_array = crate::utils::load_as_indb_raster(gdal, &test_file)?;
        let raster_struct = RasterStructArray::new(&raster_array);
        let raster = raster_struct.get(0).unwrap();
        let expr = build_operator_tree("rast0 + rast0").unwrap();
        apply_map_algebra(
            gdal,
            &raster,
            None,
            &expr,
            &BandDataType::Float32,
            Some(0.0),
            2,
        )
    });
    let output = result.expect("multi-band map algebra should succeed");

    assert_eq!(output.band_data.len(), 2);
    assert_eq!(output.band_metadata.len(), 2);

    // The expression yields a scalar, so only the first band receives computed
    // values. The second band has the same size and is filled with the nodata
    // value 0.0, whose Float32 encoding is all-zero bytes.
    assert_eq!(output.band_data[1].len(), output.band_data[0].len());
    assert!(output.band_data[1].iter().all(|byte| *byte == 0));
}
agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{sync::Arc, vec}; + +use arrow_array::builder::{Float64Builder, Int32Builder, UInt64Builder}; +use arrow_array::StructArray; +use arrow_schema::{DataType, Field, Fields}; +use datafusion_common::config::ConfigOptions; +use datafusion_common::error::Result; +use datafusion_common::exec_datafusion_err; +use datafusion_expr::{ColumnarValue, Volatility}; +use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF}; +use sedona_raster::traits::RasterRef; +use sedona_schema::crs::deserialize_crs; +use sedona_schema::{datatypes::SedonaType, matchers::ArgMatcher}; + +use crate::gdal_common::with_gdal; +use crate::gdal_dataset_provider::{configure_thread_local_options, thread_local_provider}; + +/// RS_MetaData() scalar UDF implementation (GDAL-backed) +pub fn rs_metadata_udf() -> SedonaScalarUDF { + SedonaScalarUDF::new( + "rs_metadata", + vec![Arc::new(RsMetaData {})], + Volatility::Immutable, + ) +} + +fn metadata_struct_fields() -> Fields { + Fields::from(vec![ + Field::new("upperLeftX", DataType::Float64, true), + Field::new("upperLeftY", DataType::Float64, true), + Field::new("gridWidth", DataType::UInt64, true), + Field::new("gridHeight", DataType::UInt64, true), + Field::new("scaleX", DataType::Float64, true), + Field::new("scaleY", 
DataType::Float64, true), + Field::new("skewX", DataType::Float64, true), + Field::new("skewY", DataType::Float64, true), + Field::new("srid", DataType::Int32, true), + Field::new("numSampleDimensions", DataType::UInt64, true), + Field::new("tileWidth", DataType::UInt64, true), + Field::new("tileHeight", DataType::UInt64, true), + ]) +} + +#[derive(Debug)] +struct RsMetaData {} + +impl SedonaScalarKernel for RsMetaData { + fn return_type(&self, args: &[SedonaType]) -> Result> { + let matcher = ArgMatcher::new( + vec![ArgMatcher::is_raster()], + SedonaType::Arrow(DataType::Struct(metadata_struct_fields())), + ); + + matcher.match_args(args) + } + + fn invoke_batch( + &self, + arg_types: &[SedonaType], + args: &[ColumnarValue], + ) -> Result { + self.invoke_batch_from_args(arg_types, args, &SedonaType::Arrow(DataType::Null), 0, None) + } + + fn invoke_batch_from_args( + &self, + arg_types: &[SedonaType], + args: &[ColumnarValue], + _return_type: &SedonaType, + _num_rows: usize, + config_options: Option<&ConfigOptions>, + ) -> Result { + let executor = sedona_raster_functions::RasterExecutor::new(arg_types, args); + let capacity = executor.num_iterations(); + + let mut upper_left_x_builder = Float64Builder::with_capacity(capacity); + let mut upper_left_y_builder = Float64Builder::with_capacity(capacity); + let mut grid_width_builder = UInt64Builder::with_capacity(capacity); + let mut grid_height_builder = UInt64Builder::with_capacity(capacity); + let mut scale_x_builder = Float64Builder::with_capacity(capacity); + let mut scale_y_builder = Float64Builder::with_capacity(capacity); + let mut skew_x_builder = Float64Builder::with_capacity(capacity); + let mut skew_y_builder = Float64Builder::with_capacity(capacity); + let mut srid_builder = Int32Builder::with_capacity(capacity); + let mut num_bands_builder = UInt64Builder::with_capacity(capacity); + let mut tile_width_builder = UInt64Builder::with_capacity(capacity); + let mut tile_height_builder = 
UInt64Builder::with_capacity(capacity); + + with_gdal(|gdal| { + configure_thread_local_options(gdal, config_options)?; + let provider = thread_local_provider(gdal) + .map_err(|e| exec_datafusion_err!("Failed to init GDAL provider: {e}"))?; + + executor.execute_raster_void(|_i, raster_opt| { + match raster_opt { + None => { + upper_left_x_builder.append_null(); + upper_left_y_builder.append_null(); + grid_width_builder.append_null(); + grid_height_builder.append_null(); + scale_x_builder.append_null(); + scale_y_builder.append_null(); + skew_x_builder.append_null(); + skew_y_builder.append_null(); + srid_builder.append_null(); + num_bands_builder.append_null(); + tile_width_builder.append_null(); + tile_height_builder.append_null(); + } + Some(raster) => { + let metadata = raster.metadata(); + + upper_left_x_builder.append_value(metadata.upper_left_x()); + upper_left_y_builder.append_value(metadata.upper_left_y()); + grid_width_builder.append_value(metadata.width()); + grid_height_builder.append_value(metadata.height()); + scale_x_builder.append_value(metadata.scale_x()); + scale_y_builder.append_value(metadata.scale_y()); + skew_x_builder.append_value(metadata.skew_x()); + skew_y_builder.append_value(metadata.skew_y()); + + let srid = match raster.crs() { + None => 0i32, + Some(crs_str) => match deserialize_crs(crs_str) { + Ok(Some(crs_ref)) => { + crs_ref.srid().ok().flatten().map(|s| s as i32).unwrap_or(0) + } + _ => 0i32, + }, + }; + srid_builder.append_value(srid); + + num_bands_builder.append_value(raster.bands().len() as u64); + + let dataset = provider.raster_ref_to_gdal(raster).map_err(|e| { + exec_datafusion_err!("Failed to create GDAL dataset: {e}") + })?; + let band1 = dataset + .as_dataset() + .rasterband(1) + .map_err(|e| exec_datafusion_err!("Failed to get band 1: {e}"))?; + let (block_x, block_y) = band1.block_size(); + tile_width_builder.append_value(block_x.max(1) as u64); + tile_height_builder.append_value(block_y.max(1) as u64); + } + } + Ok(()) 
+ }) + })?; + + let struct_array = StructArray::new( + metadata_struct_fields(), + vec![ + Arc::new(upper_left_x_builder.finish()), + Arc::new(upper_left_y_builder.finish()), + Arc::new(grid_width_builder.finish()), + Arc::new(grid_height_builder.finish()), + Arc::new(scale_x_builder.finish()), + Arc::new(scale_y_builder.finish()), + Arc::new(skew_x_builder.finish()), + Arc::new(skew_y_builder.finish()), + Arc::new(srid_builder.finish()), + Arc::new(num_bands_builder.finish()), + Arc::new(tile_width_builder.finish()), + Arc::new(tile_height_builder.finish()), + ], + None, + ); + + executor.finish(Arc::new(struct_array)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::cast::AsArray; + use datafusion_expr::ScalarUDF; + use sedona_raster::array::RasterStructArray; + use sedona_schema::datatypes::RASTER; + use sedona_testing::testers::ScalarUdfTester; + + #[test] + fn rs_metadata_udf_docs() { + let udf: ScalarUDF = rs_metadata_udf().into(); + assert_eq!(udf.name(), "rs_metadata"); + } + + #[test] + fn rs_metadata_tile_dimensions_from_gdal() { + let test_file = sedona_testing::data::test_raster("test4.tiff").unwrap(); + let (raster_array, block_x, block_y) = with_gdal(|gdal| { + let raster_array = crate::utils::load_as_indb_raster(gdal, &test_file)?; + let raster_struct = RasterStructArray::new(&raster_array); + let raster = raster_struct.get(0).unwrap(); + let provider = thread_local_provider(gdal).unwrap(); + let dataset = provider.raster_ref_to_gdal(&raster).unwrap(); + let band1 = dataset.as_dataset().rasterband(1).unwrap(); + let (block_x, block_y) = band1.block_size(); + Ok::<_, datafusion_common::DataFusionError>((raster_array, block_x, block_y)) + }) + .unwrap(); + + let udf: ScalarUDF = rs_metadata_udf().into(); + let tester = ScalarUdfTester::new(udf, vec![RASTER]); + let result = tester.invoke_array(Arc::new(raster_array)).unwrap(); + let struct_array = result.as_struct(); + + let tile_width = struct_array + .column(10) + 
// Checks the UDF's tile size for `test5.tiff` against the block size GDAL
// reports for band 1 of the same raster. The expected values are read from
// GDAL at test time; they are not hard-coded constants.
#[test]
fn rs_metadata_tile_dimensions_golden() {
    let test_file = sedona_testing::data::test_raster("test5.tiff").unwrap();
    let (raster_array, block_x, block_y) = with_gdal(|gdal| {
        let raster_array = crate::utils::load_as_indb_raster(gdal, &test_file)?;
        let raster_struct = RasterStructArray::new(&raster_array);
        let raster = raster_struct.get(0).unwrap();
        let provider = thread_local_provider(gdal).unwrap();
        let dataset = provider.raster_ref_to_gdal(&raster).unwrap();
        let band1 = dataset.as_dataset().rasterband(1).unwrap();
        let (block_x, block_y) = band1.block_size();
        Ok::<_, datafusion_common::DataFusionError>((raster_array, block_x, block_y))
    })
    .unwrap();

    let udf: ScalarUDF = rs_metadata_udf().into();
    let tester = ScalarUdfTester::new(udf, vec![RASTER]);
    let result = tester.invoke_array(Arc::new(raster_array)).unwrap();
    let struct_array = result.as_struct();

    // Columns 10 and 11 are `tileWidth` and `tileHeight`, both UInt64.
    let tile_width = struct_array
        .column(10)
        .as_primitive::<arrow_array::types::UInt64Type>()
        .value(0);
    let tile_height = struct_array
        .column(11)
        .as_primitive::<arrow_array::types::UInt64Type>()
        .value(0);

    assert_eq!(tile_width, block_x.max(1) as u64);
    assert_eq!(tile_height, block_y.max(1) as u64);
}
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! RS_Polygonize UDF - Convert raster band to vector polygons +//! +//! Returns a list of polygons for all connected regions of pixels with the same +//! value in the specified band. +use std::convert::TryInto; +use std::sync::Arc; + +use arrow_array::builder::{BinaryBuilder, Float64Builder, ListBuilder, StructBuilder}; +use arrow_schema::{DataType, Field, Fields}; +use datafusion_common::cast::as_int32_array; +use datafusion_common::config::ConfigOptions; +use datafusion_common::error::Result; +use datafusion_common::{exec_datafusion_err, exec_err}; +use datafusion_expr::{ColumnarValue, Volatility}; +use sedona_gdal::gdal::Gdal; +use sedona_gdal::gdal_dyn_bindgen::{OGRFieldType, OGRwkbGeometryType}; +use sedona_gdal::mem::MemDatasetBuilder; + +use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF}; +use sedona_raster::array::RasterRefImpl; +use sedona_raster::traits::RasterRef; +use sedona_raster_functions::RasterExecutor; +use sedona_schema::datatypes::SedonaType; +use sedona_schema::matchers::ArgMatcher; + +// `dataset` removed; the provider is used instead when creating GDAL datasets. 
+use crate::gdal_common::with_gdal; +use crate::gdal_dataset_provider::configure_thread_local_options; + +/// RS_Polygonize() scalar UDF implementation +/// +/// Returns a list of polygons for connected regions of pixels with the same value +pub fn rs_polygonize_udf() -> SedonaScalarUDF { + SedonaScalarUDF::new( + "rs_polygonize", + vec![Arc::new(RsPolygonize)], + Volatility::Immutable, + ) +} + +/// Kernel implementation for RS_Polygonize +#[derive(Debug)] +struct RsPolygonize; + +impl SedonaScalarKernel for RsPolygonize { + fn return_type(&self, args: &[SedonaType]) -> Result> { + let matcher = ArgMatcher::new( + vec![ArgMatcher::is_raster(), ArgMatcher::is_integer()], + // Return type is List> + SedonaType::Arrow(polygon_value_list_type()), + ); + matcher.match_args(args) + } + + fn invoke_batch( + &self, + arg_types: &[SedonaType], + args: &[ColumnarValue], + ) -> Result { + self.invoke_batch_from_args(arg_types, args, &SedonaType::Arrow(DataType::Null), 0, None) + } + + fn invoke_batch_from_args( + &self, + arg_types: &[SedonaType], + args: &[ColumnarValue], + _return_type: &SedonaType, + _num_rows: usize, + config_options: Option<&ConfigOptions>, + ) -> Result { + let executor = RasterExecutor::new(arg_types, args); + let num_iterations = executor.num_iterations(); + + // Get the band number array + let band_array = args[1] + .clone() + .cast_to(&DataType::Int32, None)? 
+ .into_array(num_iterations)?; + let band_array = as_int32_array(&band_array)?; + let mut band_iter = band_array.iter(); + + // Build result as List> + let struct_fields = polygon_value_struct_fields(); + let mut list_builder = ListBuilder::new(StructBuilder::from_fields(struct_fields, 16)); + + with_gdal(|gdal| { + configure_thread_local_options(gdal, config_options)?; + executor.execute_raster_void(|_i, raster_opt| { + let band_opt = band_iter.next().unwrap(); + + let raster = match (raster_opt, band_opt) { + (Some(raster), Some(_)) => raster, + _ => { + list_builder.append_null(); + return Ok(()); + } + }; + + let band_num: usize = band_opt.unwrap().max(1).try_into().unwrap_or(1); + + match polygonize_raster(gdal, raster, band_num) { + Ok(polygon_values) => { + let struct_builder = list_builder.values(); + + for (wkb, value) in polygon_values { + // Get field builders + let geom_builder = struct_builder + .field_builder::(0) + .expect("Expected BinaryBuilder for geom field"); + geom_builder.append_value(&wkb); + + let value_builder = struct_builder + .field_builder::(1) + .expect("Expected Float64Builder for value field"); + value_builder.append_value(value); + + struct_builder.append(true); + } + list_builder.append(true); + } + Err(e) => { + // Log error but append null + eprintln!("Polygonize error: {}", e); + list_builder.append_null(); + } + } + + Ok(()) + })?; + + executor.finish(Arc::new(list_builder.finish())) + }) + } +} + +/// Return type for the list of polygon/value structs +fn polygon_value_list_type() -> DataType { + DataType::List(Arc::new(Field::new( + "item", + DataType::Struct(polygon_value_struct_fields()), + true, + ))) +} + +/// Struct fields for polygon/value pairs +fn polygon_value_struct_fields() -> Fields { + Fields::from(vec![ + Field::new("geom", DataType::Binary, false), + Field::new("value", DataType::Float64, false), + ]) +} + +/// Polygonize a raster band using GDAL +fn polygonize_raster( + gdal: &Gdal, + raster: 
&RasterRefImpl<'_>, + band_num: usize, +) -> Result, f64)>> { + let bands = raster.bands(); + if band_num == 0 || band_num > bands.len() { + return exec_err!("Band {} is out of range (1-{})", band_num, bands.len()); + } + + // Create GDAL dataset from raster (thread-local provider) + let provider = crate::gdal_dataset_provider::thread_local_provider(gdal) + .map_err(|e| exec_datafusion_err!("Failed to init GDAL provider: {}", e))?; + let raster_ds = provider + .raster_ref_to_gdal(raster) + .map_err(|e| exec_datafusion_err!("Failed to create GDAL dataset: {}", e))?; + let gdal_dataset = raster_ds.as_dataset(); + + // Get the raster band + let raster_band = gdal_dataset + .rasterband(band_num) + .map_err(|e| exec_datafusion_err!("Failed to get band {}: {}", band_num, e))?; + + // Create memory datasource for output polygons + let mem_ds = unsafe { + MemDatasetBuilder::new(0, 0) + .build(gdal) + .map_err(|e| exec_datafusion_err!("Failed to create memory dataset: {}", e))? + }; + + // Create layer with geometry field and value field + let spatial_ref = gdal_dataset.spatial_ref().ok(); + let mut layer = mem_ds + .create_layer(sedona_gdal::dataset::LayerOptions { + name: "polygons", + srs: spatial_ref.as_ref(), + ty: OGRwkbGeometryType::wkbPolygon, + options: None, + }) + .map_err(|e| exec_datafusion_err!("Failed to create layer: {}", e))?; + + // Add pixel value field + let field_defn = gdal + .create_field_defn("value", OGRFieldType::OFTReal) + .map_err(|e| exec_datafusion_err!("Failed to create field definition: {}", e))?; + layer + .create_field(&field_defn) + .map_err(|e| exec_datafusion_err!("Failed to add field to layer: {}", e))?; + + // Call GDAL Polygonize + gdal.polygonize( + &raster_band, + None, + &layer, + 0, + &sedona_gdal::raster::polygonize::PolygonizeOptions::default(), + ) + .map_err(|e| exec_datafusion_err!("GDAL polygonize failed: {e}"))?; + + // Extract polygons from layer + let mut polygon_values = Vec::new(); + + let mut value_field_idx: Option = 
None; + for feature in layer.features() { + let geom = feature + .geometry() + .ok_or_else(|| exec_datafusion_err!("Polygonize output feature missing geometry"))?; + let wkb = geom + .wkb() + .map_err(|e| exec_datafusion_err!("Failed to export geometry to WKB: {e}"))?; + + let idx = match value_field_idx { + Some(idx) => idx, + None => { + let idx = feature + .field_index("value") + .map_err(|e| exec_datafusion_err!("Missing 'value' field: {e}"))?; + value_field_idx = Some(idx); + idx + } + }; + + let value = feature.field_as_double(idx); + + polygon_values.push((wkb, value)); + } + + Ok(polygon_values) +} + +#[cfg(test)] +mod tests { + use super::*; + use datafusion_common::ScalarValue; + use sedona_raster::array::RasterStructArray; + + #[test] + fn test_polygon_value_list_type() { + let dt = polygon_value_list_type(); + match dt { + DataType::List(field) => { + assert_eq!(field.name(), "item"); + match field.data_type() { + DataType::Struct(fields) => { + assert_eq!(fields.len(), 2); + assert_eq!(fields[0].name(), "geom"); + assert_eq!(fields[1].name(), "value"); + } + _ => panic!("Expected Struct data type"), + } + } + _ => panic!("Expected List data type"), + } + } + + #[test] + fn test_polygonize_raster() { + // Load test raster and polygonize it + let test_file = sedona_testing::data::test_raster("test4.tiff").unwrap(); + let result = with_gdal(|gdal| { + let raster_array = crate::utils::load_as_indb_raster(gdal, &test_file)?; + let raster_struct = RasterStructArray::new(&raster_array); + let raster = raster_struct.get(0).unwrap(); + polygonize_raster(gdal, &raster, 1) + }) + .unwrap(); + + // Should return at least one polygon + assert!( + !result.is_empty(), + "Polygonize should return at least one polygon" + ); + + // Each result should have a valid WKB and value + for (wkb, value) in &result { + // WKB should be at least 5 bytes (header) + assert!(wkb.len() >= 5, "WKB should be at least 5 bytes"); + // Value should be finite + assert!(value.is_finite(), 
"Value should be a finite number"); + } + } + + #[test] + fn test_polygonize_kernel_return_type() { + use arrow_schema::DataType; + use sedona_expr::scalar_udf::SedonaScalarKernel; + use sedona_schema::datatypes::RASTER; + + let kernel = RsPolygonize; + + let arg_types = vec![RASTER, SedonaType::Arrow(DataType::Int32)]; + let return_type = kernel.return_type(&arg_types).unwrap(); + assert!(return_type.is_some()); + + // Return type should be List> + match return_type.unwrap() { + SedonaType::Arrow(DataType::List(_)) => (), + _ => panic!("Expected List return type"), + } + } + + #[test] + fn test_polygonize_invoke_batch() { + use arrow_schema::DataType; + use sedona_expr::scalar_udf::SedonaScalarKernel; + use sedona_schema::datatypes::RASTER; + + let test_file = sedona_testing::data::test_raster("test4.tiff").unwrap(); + let raster_array = + with_gdal(|gdal| crate::utils::load_as_indb_raster(gdal, &test_file)).unwrap(); + + let kernel = RsPolygonize; + + let arg_types = vec![RASTER, SedonaType::Arrow(DataType::Int32)]; + let args = vec![ + ColumnarValue::Scalar(ScalarValue::Struct(Arc::new(raster_array))), + ColumnarValue::Scalar(ScalarValue::Int32(Some(1))), // band + ]; + + let result = kernel + .invoke_batch_from_args( + &arg_types, + &args, + &SedonaType::Arrow(DataType::Null), + 0, + None, + ) + .unwrap(); + + // Result should be a scalar (since input was scalar) + match result { + ColumnarValue::Scalar(_) => (), + _ => panic!("Expected Scalar result"), + } + } +} diff --git a/rust/sedona-raster-gdal/src/rs_value.rs b/rust/sedona-raster-gdal/src/rs_value.rs new file mode 100644 index 000000000..31d53b35e --- /dev/null +++ b/rust/sedona-raster-gdal/src/rs_value.rs @@ -0,0 +1,658 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! RS_Value UDF - Get raster pixel value at a point or grid coordinate +//! +//! Returns the value at the given point in the raster. If no band number is specified, +//! it defaults to 1. If the CRS of the input point differs from the raster CRS, +//! the point will be transformed to match the raster CRS. + +use std::convert::TryInto; +use std::sync::Arc; + +use arrow_array::builder::Float64Builder; +use arrow_schema::DataType; +use datafusion_common::cast::as_int32_array; +use datafusion_common::config::ConfigOptions; +use datafusion_common::error::Result; +use datafusion_common::{exec_datafusion_err, exec_err, ScalarValue}; +use datafusion_expr::{ColumnarValue, Volatility}; +use sedona_gdal::gdal::Gdal; +use sedona_proj::transform::with_global_proj_engine; + +use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF}; +use sedona_raster::affine_transformation::to_raster_coordinate; +use sedona_raster::array::RasterRefImpl; +use sedona_raster::traits::RasterRef; +use sedona_raster_functions::crs_utils::{crs_transform_wkb, resolve_crs}; +use sedona_raster_functions::RasterExecutor; +use sedona_schema::datatypes::SedonaType; +use sedona_schema::matchers::ArgMatcher; +use sedona_schema::raster::BandDataType; + +use crate::gdal_common::with_gdal; +use crate::gdal_dataset_provider::configure_thread_local_options; +use crate::raster_band_reader::RasterBandReader; + +/// 
RS_Value() scalar UDF implementation +/// +/// Returns the value at the given point in the raster +pub fn rs_value_udf() -> SedonaScalarUDF { + SedonaScalarUDF::new( + "rs_value", + vec![ + Arc::new(RsValuePoint { with_band: false }), + Arc::new(RsValuePoint { with_band: true }), + Arc::new(RsValueGrid), + ], + Volatility::Immutable, + ) +} + +/// Kernel for RS_Value with point geometry argument +#[derive(Debug)] +struct RsValuePoint { + with_band: bool, +} + +impl SedonaScalarKernel for RsValuePoint { + fn return_type(&self, args: &[SedonaType]) -> Result> { + let matchers = if self.with_band { + vec![ + ArgMatcher::is_raster(), + ArgMatcher::is_geometry_or_geography(), + ArgMatcher::is_integer(), + ] + } else { + vec![ + ArgMatcher::is_raster(), + ArgMatcher::is_geometry_or_geography(), + ] + }; + + let matcher = ArgMatcher::new(matchers, SedonaType::Arrow(DataType::Float64)); + matcher.match_args(args) + } + + fn invoke_batch( + &self, + arg_types: &[SedonaType], + args: &[ColumnarValue], + ) -> Result { + self.invoke_batch_from_args(arg_types, args, &SedonaType::Arrow(DataType::Null), 0, None) + } + + fn invoke_batch_from_args( + &self, + arg_types: &[SedonaType], + args: &[ColumnarValue], + _return_type: &SedonaType, + _num_rows: usize, + config_options: Option<&ConfigOptions>, + ) -> Result { + // Use a full executor to compute num_iterations from all args + let full_executor = RasterExecutor::new(arg_types, args); + let num_iterations = full_executor.num_iterations(); + let mut builder = Float64Builder::with_capacity(num_iterations); + + let band_array = if self.with_band { + args[2] + .clone() + .cast_to(&DataType::Int32, None)? + .into_array(num_iterations)? + } else { + ScalarValue::Int32(Some(1)).to_array_of_size(num_iterations)? 
+ }; + let band_array = as_int32_array(&band_array)?.clone(); + let mut band_iter = band_array.iter(); + + let exec_arg_types = vec![arg_types[0].clone(), arg_types[1].clone()]; + let exec_args = vec![args[0].clone(), args[1].clone()]; + let executor = + RasterExecutor::new_with_num_iterations(&exec_arg_types, &exec_args, num_iterations); + + with_gdal(|gdal| { + configure_thread_local_options(gdal, config_options)?; + with_global_proj_engine(|engine| { + executor.execute_raster_wkb_crs_void(|raster_opt, wkb_opt, maybe_point_crs| { + let band_num = band_iter + .next() + .flatten() + .unwrap_or(1) + .max(1) + .try_into() + .unwrap_or(1); + let (raster, point_wkb) = match (raster_opt, wkb_opt) { + (Some(raster), Some(point_wkb)) => (raster, point_wkb), + _ => { + builder.append_null(); + return Ok(()); + } + }; + + let raster_crs = resolve_crs(raster.crs())?; + + let point_wkb = match (maybe_point_crs, raster_crs.as_deref()) { + (Some(point_crs), Some(raster_crs)) => { + crs_transform_wkb(point_wkb, point_crs, raster_crs, engine)? 
+ } + (None, None) => point_wkb.to_vec(), + (Some(_), None) => { + return exec_err!( + "Cannot operate on point and raster: raster has no CRS but point does" + ) + } + (None, Some(_)) => { + return exec_err!( + "Cannot operate on point and raster: point has no CRS but raster does" + ) + } + }; + + match get_value_at_point(gdal, raster, &point_wkb, band_num) { + Ok(Some(value)) => builder.append_value(value), + Ok(None) => builder.append_null(), + Err(_) => builder.append_null(), + } + + Ok(()) + }) + }) + })?; + + executor.finish(Arc::new(builder.finish())) + } +} + +/// Kernel for RS_Value with grid coordinates +#[derive(Debug)] +struct RsValueGrid; + +impl SedonaScalarKernel for RsValueGrid { + fn return_type(&self, args: &[SedonaType]) -> Result> { + let matcher = ArgMatcher::new( + vec![ + ArgMatcher::is_raster(), + ArgMatcher::is_integer(), + ArgMatcher::is_integer(), + ArgMatcher::is_integer(), + ], + SedonaType::Arrow(DataType::Float64), + ); + matcher.match_args(args) + } + + fn invoke_batch( + &self, + arg_types: &[SedonaType], + args: &[ColumnarValue], + ) -> Result { + self.invoke_batch_from_args(arg_types, args, &SedonaType::Arrow(DataType::Null), 0, None) + } + + fn invoke_batch_from_args( + &self, + arg_types: &[SedonaType], + args: &[ColumnarValue], + _return_type: &SedonaType, + _num_rows: usize, + config_options: Option<&ConfigOptions>, + ) -> Result { + let executor = RasterExecutor::new(arg_types, args); + let num_iterations = executor.num_iterations(); + let mut builder = Float64Builder::with_capacity(num_iterations); + + // Convert col_x, row_y, band to arrays + let col_x_array = args[1] + .clone() + .cast_to(&DataType::Int32, None)? + .into_array(num_iterations)?; + let col_x_array = as_int32_array(&col_x_array)?; + let row_y_array = args[2] + .clone() + .cast_to(&DataType::Int32, None)? + .into_array(num_iterations)?; + let row_y_array = as_int32_array(&row_y_array)?; + let band_array = args[3] + .clone() + .cast_to(&DataType::Int32, None)? 
+ .into_array(num_iterations)?; + let band_array = as_int32_array(&band_array)?; + + let mut col_x_iter = col_x_array.iter(); + let mut row_y_iter = row_y_array.iter(); + let mut band_iter = band_array.iter(); + + with_gdal(|gdal| { + configure_thread_local_options(gdal, config_options)?; + executor.execute_raster_void(|_i, raster_opt| { + let col_x_opt = col_x_iter.next().unwrap(); + let row_y_opt = row_y_iter.next().unwrap(); + let band_opt = band_iter.next().unwrap(); + + let raster = match (raster_opt, col_x_opt, row_y_opt, band_opt) { + (Some(raster), Some(_), Some(_), Some(_)) => raster, + _ => { + builder.append_null(); + return Ok(()); + } + }; + + let x = col_x_opt.unwrap() as i64; + let y = row_y_opt.unwrap() as i64; + let band_num: usize = band_opt.unwrap().max(1).try_into().unwrap_or(1); + + match get_value_at_grid(gdal, raster, x, y, band_num) { + Ok(Some(value)) => builder.append_value(value), + Ok(None) => builder.append_null(), + Err(_) => builder.append_null(), + } + + Ok(()) + }) + })?; + + executor.finish(Arc::new(builder.finish())) + } +} + +/// Get pixel value at a point geometry +fn get_value_at_point( + gdal: &Gdal, + raster: &RasterRefImpl<'_>, + point_wkb: &[u8], + band_num: usize, +) -> Result> { + // Parse point from WKB + let (x, y) = parse_point_from_wkb(point_wkb)?; + + // Convert world coordinates to raster coordinates + let (col, row) = to_raster_coordinate(raster, x, y) + .map_err(|e| exec_datafusion_err!("Failed to convert coordinates: {}", e))?; + + get_value_at_grid(gdal, raster, col, row, band_num) +} + +/// Get pixel value at grid coordinates +fn get_value_at_grid( + gdal: &Gdal, + raster: &RasterRefImpl<'_>, + col: i64, + row: i64, + band_num: usize, +) -> Result> { + let metadata = raster.metadata(); + let width = metadata.width() as i64; + let height = metadata.height() as i64; + + // Check bounds + if col < 0 || col >= width || row < 0 || row >= height { + return Ok(None); + } + + let bands = raster.bands(); + if band_num 
== 0 || band_num > bands.len() { + return exec_err!("Band {} is out of range (1-{})", band_num, bands.len()); + } + + let band = bands + .band(band_num) + .map_err(|e| exec_datafusion_err!("Failed to get band {}: {}", band_num, e))?; + + let band_metadata = band.metadata(); + let mut band_reader = RasterBandReader::new(gdal, raster); + let value = band_reader.read_pixel_f64(band_num, col as usize, row as usize)?; + + // Check for nodata + if let Some(nodata_bytes) = band_metadata.nodata_value() { + let nodata = read_nodata_value(nodata_bytes, band_metadata.data_type()?)?; + if (value - nodata).abs() < f64::EPSILON { + return Ok(None); + } + } + + Ok(Some(value)) +} + +/// Parse point coordinates from WKB +fn parse_point_from_wkb(wkb: &[u8]) -> Result<(f64, f64)> { + // WKB Point structure: + // - 1 byte: byte order (01 = little endian, 00 = big endian) + // - 4 bytes: geometry type (1 = Point) + // - 8 bytes: X coordinate (f64) + // - 8 bytes: Y coordinate (f64) + + if wkb.len() < 21 { + return exec_err!("Invalid WKB: too short for Point geometry"); + } + + let byte_order = wkb[0]; + let geom_type = if byte_order == 0x01 { + // Little endian + u32::from_le_bytes([wkb[1], wkb[2], wkb[3], wkb[4]]) + } else { + // Big endian + u32::from_be_bytes([wkb[1], wkb[2], wkb[3], wkb[4]]) + }; + + // Check geometry type (1 = Point, may have Z/M flags in higher bits) + let base_type = geom_type & 0xFF; + if base_type != 1 { + return exec_err!("Expected Point geometry (type 1), got type {}", base_type); + } + + let (x, y) = if byte_order == 0x01 { + // Little endian + let x = f64::from_le_bytes([ + wkb[5], wkb[6], wkb[7], wkb[8], wkb[9], wkb[10], wkb[11], wkb[12], + ]); + let y = f64::from_le_bytes([ + wkb[13], wkb[14], wkb[15], wkb[16], wkb[17], wkb[18], wkb[19], wkb[20], + ]); + (x, y) + } else { + // Big endian + let x = f64::from_be_bytes([ + wkb[5], wkb[6], wkb[7], wkb[8], wkb[9], wkb[10], wkb[11], wkb[12], + ]); + let y = f64::from_be_bytes([ + wkb[13], wkb[14], wkb[15], 
wkb[16], wkb[17], wkb[18], wkb[19], wkb[20], + ]); + (x, y) + }; + + Ok((x, y)) +} + +/// Read nodata value from bytes +fn read_nodata_value(bytes: &[u8], data_type: BandDataType) -> Result { + match data_type { + BandDataType::UInt8 => { + if !bytes.is_empty() { + Ok(bytes[0] as f64) + } else { + exec_err!("Invalid nodata bytes") + } + } + BandDataType::Int8 => { + if !bytes.is_empty() { + Ok(bytes[0] as i8 as f64) + } else { + exec_err!("Invalid nodata bytes") + } + } + BandDataType::UInt16 => { + if bytes.len() >= 2 { + Ok(u16::from_le_bytes([bytes[0], bytes[1]]) as f64) + } else { + exec_err!("Invalid nodata bytes") + } + } + BandDataType::Int16 => { + if bytes.len() >= 2 { + Ok(i16::from_le_bytes([bytes[0], bytes[1]]) as f64) + } else { + exec_err!("Invalid nodata bytes") + } + } + BandDataType::UInt32 => { + if bytes.len() >= 4 { + Ok(u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) as f64) + } else { + exec_err!("Invalid nodata bytes") + } + } + BandDataType::Int32 => { + if bytes.len() >= 4 { + Ok(i32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) as f64) + } else { + exec_err!("Invalid nodata bytes") + } + } + BandDataType::UInt64 => { + if bytes.len() >= 8 { + Ok(u64::from_le_bytes([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7], + ]) as f64) + } else { + exec_err!("Invalid nodata bytes") + } + } + BandDataType::Int64 => { + if bytes.len() >= 8 { + Ok(i64::from_le_bytes([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7], + ]) as f64) + } else { + exec_err!("Invalid nodata bytes") + } + } + BandDataType::Float32 => { + if bytes.len() >= 4 { + Ok(f32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) as f64) + } else { + exec_err!("Invalid nodata bytes") + } + } + BandDataType::Float64 => { + if bytes.len() >= 8 { + Ok(f64::from_le_bytes([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7], + ])) + } else { + exec_err!("Invalid nodata 
bytes") + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::gdal_common::with_gdal; + use sedona_raster::affine_transformation::to_world_coordinate; + use sedona_raster::array::RasterStructArray; + use sedona_raster_functions::crs_utils::crs_transform_coord; + use sedona_schema::crs::deserialize_crs; + use sedona_schema::datatypes::{Edges, RASTER}; + use sedona_schema::raster::BandDataType; + use sedona_testing::create::make_wkb; + + #[test] + fn test_parse_point_from_wkb() { + // Little-endian WKB for POINT(1.0, 2.0) + let wkb = [ + 0x01, // Little endian + 0x01, 0x00, 0x00, 0x00, // Point type + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x3F, // X = 1.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, // Y = 2.0 + ]; + + let (x, y) = parse_point_from_wkb(&wkb).unwrap(); + assert!((x - 1.0).abs() < f64::EPSILON); + assert!((y - 2.0).abs() < f64::EPSILON); + } + + #[test] + fn test_read_pixel_value_uint8() { + let data = vec![42u8, 100, 200]; + let raster_array = sedona_testing::rasters::raster_from_single_band( + 3, + 1, + BandDataType::UInt8, + &data, + None, + ); + let raster_struct = RasterStructArray::new(&raster_array); + let raster = raster_struct.get(0).unwrap(); + let value = with_gdal(|gdal| { + let mut reader = RasterBandReader::new(gdal, &raster); + reader.read_pixel_f64(1, 1, 0) + }) + .unwrap(); + assert!((value - 100.0).abs() < f64::EPSILON); + } + + #[test] + fn test_read_pixel_value_float32() { + let mut data = vec![0u8; 12]; // 3 float32 values + let values: [f32; 3] = [1.5, 2.5, 3.5]; + for (i, &v) in values.iter().enumerate() { + data[i * 4..(i + 1) * 4].copy_from_slice(&v.to_le_bytes()); + } + let raster_array = sedona_testing::rasters::raster_from_single_band( + 3, + 1, + BandDataType::Float32, + &data, + None, + ); + let raster_struct = RasterStructArray::new(&raster_array); + let raster = raster_struct.get(0).unwrap(); + let value = with_gdal(|gdal| { + let mut reader = RasterBandReader::new(gdal, &raster); + 
reader.read_pixel_f64(1, 1, 0) + }) + .unwrap(); + assert!((value - 2.5).abs() < f64::EPSILON); + } + + #[test] + fn test_rs_value_grid_with_test_raster() { + // Load test raster and read value at grid coordinates + let test_file = sedona_testing::data::test_raster("test4.tiff").unwrap(); + let (value, center_value, out_of_bounds) = with_gdal(|gdal| { + let raster_array = crate::utils::load_as_indb_raster(gdal, &test_file)?; + let raster_struct = RasterStructArray::new(&raster_array); + let raster = raster_struct.get(0).unwrap(); + + let value = get_value_at_grid(gdal, &raster, 0, 0, 1)?; + let center_value = get_value_at_grid(gdal, &raster, 5, 5, 1)?; + let out_of_bounds = get_value_at_grid(gdal, &raster, 100, 100, 1)?; + Ok::<_, datafusion_common::DataFusionError>((value, center_value, out_of_bounds)) + }) + .unwrap(); + assert!(value.is_some()); + assert!(center_value.is_some()); + assert!(out_of_bounds.is_none()); + } + + #[test] + fn test_rs_value_invoke_grid() { + // Test invoking RS_Value with grid coordinates + use arrow_schema::DataType; + use sedona_expr::scalar_udf::SedonaScalarKernel; + use sedona_schema::datatypes::RASTER; + + let test_file = sedona_testing::data::test_raster("test4.tiff").unwrap(); + let raster_array = + with_gdal(|gdal| crate::utils::load_as_indb_raster(gdal, &test_file)).unwrap(); + + let kernel = RsValueGrid; + + // Test return type + let arg_types = vec![ + RASTER, + SedonaType::Arrow(DataType::Int32), + SedonaType::Arrow(DataType::Int32), + SedonaType::Arrow(DataType::Int32), + ]; + let return_type = kernel.return_type(&arg_types).unwrap(); + assert!(return_type.is_some()); + + // Test invoke_batch + let args = vec![ + ColumnarValue::Scalar(ScalarValue::Struct(Arc::new(raster_array))), + ColumnarValue::Scalar(ScalarValue::Int32(Some(0))), // col_x + ColumnarValue::Scalar(ScalarValue::Int32(Some(0))), // row_y + ColumnarValue::Scalar(ScalarValue::Int32(Some(1))), // band + ]; + + let result = kernel.invoke_batch(&arg_types, 
&args).unwrap(); + + match result { + ColumnarValue::Scalar(ScalarValue::Float64(Some(value))) => { + // Value should be a valid pixel value + assert!(value.is_finite()); + } + _ => panic!("Expected Float64 scalar result"), + } + } + + #[test] + fn test_rs_value_point_crs_transform() { + let test_file = sedona_testing::data::test_raster("test4.tiff").unwrap(); + let raster_array = + with_gdal(|gdal| crate::utils::load_as_indb_raster(gdal, &test_file)).unwrap(); + + let raster_struct = RasterStructArray::new(&raster_array); + let raster = raster_struct.get(0).unwrap(); + let width = raster.metadata().width() as i64; + let height = raster.metadata().height() as i64; + let col = width / 2; + let row = height / 2; + let (lon, lat) = to_world_coordinate(&raster, col, row); + + let point_wkt = format!("POINT ({} {})", lon, lat); + let point_wkb = make_wkb(&point_wkt); + let (x_merc, y_merc) = with_global_proj_engine(|engine| { + crs_transform_coord(engine, (lon, lat), "OGC:CRS84", "EPSG:3857") + }) + .unwrap(); + let point_merc_wkt = format!("POINT ({} {})", x_merc, y_merc); + let point_merc_wkb = make_wkb(&point_merc_wkt); + + let raster_scalar = ColumnarValue::Scalar(ScalarValue::Struct(Arc::new(raster_array))); + + let geom_type_4326 = SedonaType::Wkb(Edges::Planar, deserialize_crs("EPSG:4326").unwrap()); + let geom_type_3857 = SedonaType::Wkb(Edges::Planar, deserialize_crs("EPSG:3857").unwrap()); + + let kernel = RsValuePoint { with_band: false }; + + let result_4326 = kernel + .invoke_batch( + &[RASTER, geom_type_4326], + &[ + raster_scalar.clone(), + ColumnarValue::Scalar(ScalarValue::Binary(Some(point_wkb))), + ], + ) + .unwrap(); + + let value_4326 = match result_4326 { + ColumnarValue::Scalar(ScalarValue::Float64(Some(value))) => value, + _ => panic!("Expected Float64 scalar result"), + }; + + let result_3857 = kernel + .invoke_batch( + &[RASTER, geom_type_3857], + &[ + raster_scalar, + ColumnarValue::Scalar(ScalarValue::Binary(Some(point_merc_wkb))), + ], + ) + 
.unwrap(); + + let value_3857 = match result_3857 { + ColumnarValue::Scalar(ScalarValue::Float64(Some(value))) => value, + _ => panic!("Expected Float64 scalar result"), + }; + + assert_eq!(value_4326, value_3857); + } +} diff --git a/rust/sedona-raster-gdal/src/rs_zonal_stats.rs b/rust/sedona-raster-gdal/src/rs_zonal_stats.rs new file mode 100644 index 000000000..b05393441 --- /dev/null +++ b/rust/sedona-raster-gdal/src/rs_zonal_stats.rs @@ -0,0 +1,1475 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! RS_ZonalStats and RS_ZonalStatsAll UDFs - Compute statistics for pixels within a geometry +//! +//! RS_ZonalStats computes a single statistic (count, sum, mean, median, mode, stddev, variance, min, max) +//! for all pixels within a geometry boundary. +//! +//! RS_ZonalStatsAll computes all statistics and returns them as a struct. +//! +//! Signatures (matching Apache Sedona docs): +//! +//! ## RS_ZonalStats +//! - `RS_ZonalStats(raster, zone, statType)` — 3 args +//! - `RS_ZonalStats(raster, zone, statType, allTouched)` — 4 args +//! - `RS_ZonalStats(raster, zone, band, statType, allTouched)` — 5 args +//! - `RS_ZonalStats(raster, zone, band, statType, allTouched, excludeNoData)` — 6 args +//! 
- `RS_ZonalStats(raster, zone, band, statType, allTouched, excludeNoData, lenient)` — 7 args +//! +//! ## RS_ZonalStatsAll +//! - `RS_ZonalStatsAll(raster, zone)` — 2 args +//! - `RS_ZonalStatsAll(raster, zone, band)` — 3 args +//! - `RS_ZonalStatsAll(raster, zone, band, allTouched)` — 4 args +//! - `RS_ZonalStatsAll(raster, zone, band, allTouched, excludeNoData)` — 5 args +//! - `RS_ZonalStatsAll(raster, zone, band, allTouched, excludeNoData, lenient)` — 6 args + +use std::collections::HashMap; +use std::convert::TryInto; +use std::sync::Arc; + +use arrow_array::builder::{Float64Builder, Int64Builder, StructBuilder}; +use arrow_array::{Array, ArrayRef}; +use arrow_array::{BooleanArray, Int32Array}; +use arrow_schema::{DataType, Field, Fields}; +use datafusion_common::cast::{as_boolean_array, as_int32_array, as_string_array}; +use datafusion_common::config::ConfigOptions; +use datafusion_common::error::Result; +use datafusion_common::{exec_datafusion_err, exec_err, ScalarValue}; +use datafusion_expr::{ColumnarValue, Volatility}; +use sedona_common::sedona_internal_err; +use sedona_gdal::gdal::Gdal; +use sedona_gdal::mem::MemDatasetBuilder; +use sedona_gdal::raster::types::Buffer; +use sedona_gdal::raster::types::GdalDataType; +use sedona_proj::transform::with_global_proj_engine; + +use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF}; +use sedona_raster::affine_transformation::to_world_coordinate; +use sedona_raster::array::RasterRefImpl; +use sedona_raster::traits::RasterRef; +use sedona_raster_functions::crs_utils::{crs_transform_wkb, resolve_crs}; +use sedona_raster_functions::RasterExecutor; +use sedona_schema::datatypes::SedonaType; +use sedona_schema::matchers::ArgMatcher; + +use crate::gdal_common::{nodata_bytes_to_f64, with_gdal}; +use crate::gdal_dataset_provider::configure_thread_local_options; +use crate::raster_band_reader::RasterBandReader; + +/// Statistics types supported by RS_ZonalStats +#[derive(Debug, Clone, Copy, PartialEq, Eq)] 
+pub enum StatType { + Count, + Sum, + Mean, + Median, + Mode, + StdDev, + Variance, + Min, + Max, +} + +impl StatType { + /// Parse stat type from string (case-insensitive) + fn from_str(s: &str) -> Option { + match s.to_lowercase().as_str() { + "count" => Some(StatType::Count), + "sum" => Some(StatType::Sum), + "mean" | "avg" | "average" => Some(StatType::Mean), + "median" => Some(StatType::Median), + "mode" => Some(StatType::Mode), + "stddev" | "std" | "standarddeviation" => Some(StatType::StdDev), + "variance" | "var" => Some(StatType::Variance), + "min" | "minimum" => Some(StatType::Min), + "max" | "maximum" => Some(StatType::Max), + _ => None, + } + } +} + +/// Computed statistics for a zone +#[derive(Debug, Default)] +pub struct ZonalStatistics { + pub count: i64, + pub sum: f64, + pub mean: f64, + pub median: f64, + pub mode: f64, + pub stddev: f64, + pub variance: f64, + pub min: f64, + pub max: f64, +} + +impl ZonalStatistics { + /// Get a specific statistic value + fn get(&self, stat_type: StatType) -> f64 { + match stat_type { + StatType::Count => self.count as f64, + StatType::Sum => self.sum, + StatType::Mean => self.mean, + StatType::Median => self.median, + StatType::Mode => self.mode, + StatType::StdDev => self.stddev, + StatType::Variance => self.variance, + StatType::Min => self.min, + StatType::Max => self.max, + } + } +} + +// ============================================================================= +// RS_ZonalStats UDF +// ============================================================================= + +/// RS_ZonalStats() scalar UDF implementation +/// +/// Computes a single statistic for pixels within a geometry. 
+/// +/// Signatures: +/// - `RS_ZonalStats(raster, zone, statType)` — 3 args +/// - `RS_ZonalStats(raster, zone, statType, allTouched)` — 4 args +/// - `RS_ZonalStats(raster, zone, band, statType, allTouched)` — 5 args +/// - `RS_ZonalStats(raster, zone, band, statType, allTouched, excludeNoData)` — 6 args +/// - `RS_ZonalStats(raster, zone, band, statType, allTouched, excludeNoData, lenient)` — 7 args +pub fn rs_zonal_stats_udf() -> SedonaScalarUDF { + SedonaScalarUDF::new( + "rs_zonalstats", + vec![ + Arc::new(RsZonalStats { arg_count: 3 }), + Arc::new(RsZonalStats { arg_count: 4 }), + Arc::new(RsZonalStats { arg_count: 5 }), + Arc::new(RsZonalStats { arg_count: 6 }), + Arc::new(RsZonalStats { arg_count: 7 }), + ], + Volatility::Immutable, + ) +} + +/// Kernel implementation for RS_ZonalStats +#[derive(Debug)] +struct RsZonalStats { + /// Number of arguments in the matched signature (3..=7) + arg_count: usize, +} + +impl SedonaScalarKernel for RsZonalStats { + fn return_type(&self, args: &[SedonaType]) -> Result> { + let matchers = match self.arg_count { + 3 => vec![ + // RS_ZonalStats(raster, zone, statType) + ArgMatcher::is_raster(), + ArgMatcher::is_geometry_or_geography(), + ArgMatcher::is_string(), + ], + 4 => vec![ + // RS_ZonalStats(raster, zone, statType, allTouched) + ArgMatcher::is_raster(), + ArgMatcher::is_geometry_or_geography(), + ArgMatcher::is_string(), + ArgMatcher::is_boolean(), + ], + 5 => vec![ + // RS_ZonalStats(raster, zone, band, statType, allTouched) + ArgMatcher::is_raster(), + ArgMatcher::is_geometry_or_geography(), + ArgMatcher::is_integer(), + ArgMatcher::is_string(), + ArgMatcher::is_boolean(), + ], + 6 => vec![ + // RS_ZonalStats(raster, zone, band, statType, allTouched, excludeNoData) + ArgMatcher::is_raster(), + ArgMatcher::is_geometry_or_geography(), + ArgMatcher::is_integer(), + ArgMatcher::is_string(), + ArgMatcher::is_boolean(), + ArgMatcher::is_boolean(), + ], + 7 => vec![ + // RS_ZonalStats(raster, zone, band, statType, 
allTouched, excludeNoData, lenient) + ArgMatcher::is_raster(), + ArgMatcher::is_geometry_or_geography(), + ArgMatcher::is_integer(), + ArgMatcher::is_string(), + ArgMatcher::is_boolean(), + ArgMatcher::is_boolean(), + ArgMatcher::is_boolean(), + ], + _ => { + return sedona_internal_err!( + "RS_ZonalStats: unexpected arg_count {}", + self.arg_count + ); + } + }; + + let matcher = ArgMatcher::new(matchers, SedonaType::Arrow(DataType::Float64)); + matcher.match_args(args) + } + + fn invoke_batch( + &self, + arg_types: &[SedonaType], + args: &[ColumnarValue], + ) -> Result { + self.invoke_batch_from_args(arg_types, args, &SedonaType::Arrow(DataType::Null), 0, None) + } + + fn invoke_batch_from_args( + &self, + arg_types: &[SedonaType], + args: &[ColumnarValue], + _return_type: &SedonaType, + _num_rows: usize, + config_options: Option<&ConfigOptions>, + ) -> Result { + let num_iterations = calc_num_iterations(args); + + // Geometry is always at index 1 (zone). + let geom_arg_idx: usize = 1; + + // Determine arg indices based on arg_count. + // 3-arg: (raster, zone, statType) → stat=2 + // 4-arg: (raster, zone, statType, allTouched) → stat=2, allTouched=3 + // 5-arg: (raster, zone, band, statType, allTouched) → band=2, stat=3, allTouched=4 + // 6-arg: (raster, zone, band, statType, allTouched, excludeNoData) → band=2, stat=3, allTouched=4, excludeNoData=5 + // 7-arg: + lenient=6 + let ( + stat_arg_idx, + band_arg_idx, + all_touched_arg_idx, + exclude_nodata_arg_idx, + lenient_arg_idx, + ) = match self.arg_count { + 3 => (2, None, None, None, None), + 4 => (2, None, Some(3), None, None), + 5 => (3, Some(2), Some(4), None, None), + 6 => (3, Some(2), Some(4), Some(5), None), + 7 => (3, Some(2), Some(4), Some(5), Some(6)), + _ => unreachable!(), + }; + + // Get stat type array + let stat_array = args[stat_arg_idx] + .clone() + .cast_to(&DataType::Utf8, None)? 
+ .into_array(num_iterations)?; + let stat_array = as_string_array(&stat_array)?.clone(); + let mut stat_iter = stat_array.iter(); + + let mut builder = Float64Builder::with_capacity(num_iterations); + + // Expand option args to arrays so they can vary row-by-row. + let band_array = expand_int32_arg(args, band_arg_idx, 1, num_iterations)?; + let all_touched_array = + expand_boolean_arg(args, all_touched_arg_idx, false, num_iterations)?; + let exclude_nodata_array = + expand_boolean_arg(args, exclude_nodata_arg_idx, true, num_iterations)?; + let lenient_array = expand_boolean_arg(args, lenient_arg_idx, true, num_iterations)?; + + let mut band_iter = band_array.iter(); + let mut all_touched_iter = all_touched_array.iter(); + let mut exclude_nodata_iter = exclude_nodata_array.iter(); + let mut lenient_iter = lenient_array.iter(); + + let exec_arg_types = vec![arg_types[0].clone(), arg_types[geom_arg_idx].clone()]; + let exec_args = vec![args[0].clone(), args[geom_arg_idx].clone()]; + let executor = + RasterExecutor::new_with_num_iterations(&exec_arg_types, &exec_args, num_iterations); + + with_gdal(|gdal| { + configure_thread_local_options(gdal, config_options)?; + with_global_proj_engine(|engine| { + executor.execute_raster_wkb_crs_void(|raster_opt, wkb_opt, geom_crs| { + let stat_str = stat_iter + .next() + .flatten() + .ok_or_else(|| exec_datafusion_err!("Stat type is required"))?; + let stat_type = StatType::from_str(stat_str) + .ok_or_else(|| exec_datafusion_err!("Unknown stat type: {}", stat_str))?; + let band = band_iter + .next() + .flatten() + .unwrap_or(1) + .max(1) + .try_into() + .unwrap_or(1); + let all_touched = all_touched_iter.next().flatten().unwrap_or(false); + let exclude_nodata = exclude_nodata_iter.next().flatten().unwrap_or(true); + let lenient = lenient_iter.next().flatten().unwrap_or(true); + + let (raster, geom_wkb) = match (raster_opt, wkb_opt) { + (Some(r), Some(w)) => (r, w), + _ => { + builder.append_null(); + return Ok(()); + } + }; + + 
let raster_crs = resolve_crs(raster.crs())?; + + let geom_wkb = match (geom_crs, raster_crs.as_deref()) { + (Some(geom_crs), Some(raster_crs)) => { + crs_transform_wkb(geom_wkb, geom_crs, raster_crs, engine)? + } + (None, None) => geom_wkb.to_vec(), + (Some(_), None) => { + return exec_err!( + "Cannot operate on geometry and raster: raster has no CRS but geometry does" + ) + } + (None, Some(_)) => { + return exec_err!( + "Cannot operate on geometry and raster: geometry has no CRS but raster does" + ) + } + }; + + match compute_zonal_stats( + gdal, + raster, + &geom_wkb, + band, + all_touched, + exclude_nodata, + ) { + Ok(stats) => builder.append_value(stats.get(stat_type)), + Err(e) => { + if lenient { + eprintln!("RS_ZonalStats error: {}", e); + builder.append_null(); + } else { + return Err(e); + } + } + } + + Ok(()) + }) + })?; + + executor.finish(Arc::new(builder.finish())) + }) + } +} + +// ============================================================================= +// RS_ZonalStatsAll UDF +// ============================================================================= + +/// RS_ZonalStatsAll() scalar UDF implementation +/// +/// Computes all statistics for pixels within a geometry and returns a struct. 
+/// +/// Signatures: +/// - `RS_ZonalStatsAll(raster, zone)` — 2 args +/// - `RS_ZonalStatsAll(raster, zone, band)` — 3 args +/// - `RS_ZonalStatsAll(raster, zone, band, allTouched)` — 4 args +/// - `RS_ZonalStatsAll(raster, zone, band, allTouched, excludeNoData)` — 5 args +/// - `RS_ZonalStatsAll(raster, zone, band, allTouched, excludeNoData, lenient)` — 6 args +pub fn rs_zonal_stats_all_udf() -> SedonaScalarUDF { + SedonaScalarUDF::new( + "rs_zonalstatsall", + vec![ + Arc::new(RsZonalStatsAll { arg_count: 2 }), + Arc::new(RsZonalStatsAll { arg_count: 3 }), + Arc::new(RsZonalStatsAll { arg_count: 4 }), + Arc::new(RsZonalStatsAll { arg_count: 5 }), + Arc::new(RsZonalStatsAll { arg_count: 6 }), + ], + Volatility::Immutable, + ) +} + +/// Kernel implementation for RS_ZonalStatsAll +#[derive(Debug)] +struct RsZonalStatsAll { + /// Number of arguments in the matched signature (2..=6) + arg_count: usize, +} + +impl SedonaScalarKernel for RsZonalStatsAll { + fn return_type(&self, args: &[SedonaType]) -> Result> { + let matchers = match self.arg_count { + 2 => vec![ + // RS_ZonalStatsAll(raster, zone) + ArgMatcher::is_raster(), + ArgMatcher::is_geometry_or_geography(), + ], + 3 => vec![ + // RS_ZonalStatsAll(raster, zone, band) + ArgMatcher::is_raster(), + ArgMatcher::is_geometry_or_geography(), + ArgMatcher::is_integer(), + ], + 4 => vec![ + // RS_ZonalStatsAll(raster, zone, band, allTouched) + ArgMatcher::is_raster(), + ArgMatcher::is_geometry_or_geography(), + ArgMatcher::is_integer(), + ArgMatcher::is_boolean(), + ], + 5 => vec![ + // RS_ZonalStatsAll(raster, zone, band, allTouched, excludeNoData) + ArgMatcher::is_raster(), + ArgMatcher::is_geometry_or_geography(), + ArgMatcher::is_integer(), + ArgMatcher::is_boolean(), + ArgMatcher::is_boolean(), + ], + 6 => vec![ + // RS_ZonalStatsAll(raster, zone, band, allTouched, excludeNoData, lenient) + ArgMatcher::is_raster(), + ArgMatcher::is_geometry_or_geography(), + ArgMatcher::is_integer(), + ArgMatcher::is_boolean(), + 
ArgMatcher::is_boolean(), + ArgMatcher::is_boolean(), + ], + _ => { + return sedona_internal_err!( + "RS_ZonalStatsAll: unexpected arg_count {}", + self.arg_count + ); + } + }; + + let matcher = ArgMatcher::new(matchers, SedonaType::Arrow(zonal_stats_struct_type())); + matcher.match_args(args) + } + + fn invoke_batch( + &self, + arg_types: &[SedonaType], + args: &[ColumnarValue], + ) -> Result { + self.invoke_batch_from_args(arg_types, args, &SedonaType::Arrow(DataType::Null), 0, None) + } + + fn invoke_batch_from_args( + &self, + arg_types: &[SedonaType], + args: &[ColumnarValue], + _return_type: &SedonaType, + _num_rows: usize, + config_options: Option<&ConfigOptions>, + ) -> Result { + let num_iterations = calc_num_iterations(args); + + // Geometry is always at index 1 (zone). + let geom_arg_idx: usize = 1; + + // Determine arg indices based on arg_count. + // 2-arg: (raster, zone) + // 3-arg: (raster, zone, band) → band=2 + // 4-arg: (raster, zone, band, allTouched) → band=2, allTouched=3 + // 5-arg: (raster, zone, band, allTouched, excludeNoData) → band=2, allTouched=3, excludeNoData=4 + // 6-arg: + lenient=5 + let (band_arg_idx, all_touched_arg_idx, exclude_nodata_arg_idx, lenient_arg_idx) = + match self.arg_count { + 2 => (None, None, None, None), + 3 => (Some(2), None, None, None), + 4 => (Some(2), Some(3), None, None), + 5 => (Some(2), Some(3), Some(4), None), + 6 => (Some(2), Some(3), Some(4), Some(5)), + _ => unreachable!(), + }; + + // Build struct result + let fields = zonal_stats_struct_fields(); + let mut builder = StructBuilder::from_fields(fields, num_iterations); + + // Expand option args to arrays so they can vary row-by-row. 
+ let band_array = expand_int32_arg(args, band_arg_idx, 1, num_iterations)?; + let all_touched_array = + expand_boolean_arg(args, all_touched_arg_idx, false, num_iterations)?; + let exclude_nodata_array = + expand_boolean_arg(args, exclude_nodata_arg_idx, true, num_iterations)?; + let lenient_array = expand_boolean_arg(args, lenient_arg_idx, true, num_iterations)?; + + let mut band_iter = band_array.iter(); + let mut all_touched_iter = all_touched_array.iter(); + let mut exclude_nodata_iter = exclude_nodata_array.iter(); + let mut lenient_iter = lenient_array.iter(); + + let append_null = |builder: &mut StructBuilder| { + builder + .field_builder::(0) + .unwrap() + .append_null(); + for j in 1..9 { + builder + .field_builder::(j) + .unwrap() + .append_null(); + } + builder.append_null(); + }; + + let append_stats = |builder: &mut StructBuilder, stats: ZonalStatistics| { + builder + .field_builder::(0) + .unwrap() + .append_value(stats.count); + builder + .field_builder::(1) + .unwrap() + .append_value(stats.sum); + builder + .field_builder::(2) + .unwrap() + .append_value(stats.mean); + builder + .field_builder::(3) + .unwrap() + .append_value(stats.median); + builder + .field_builder::(4) + .unwrap() + .append_value(stats.mode); + builder + .field_builder::(5) + .unwrap() + .append_value(stats.stddev); + builder + .field_builder::(6) + .unwrap() + .append_value(stats.variance); + builder + .field_builder::(7) + .unwrap() + .append_value(stats.min); + builder + .field_builder::(8) + .unwrap() + .append_value(stats.max); + builder.append(true); + }; + + let exec_arg_types = vec![arg_types[0].clone(), arg_types[geom_arg_idx].clone()]; + let exec_args = vec![args[0].clone(), args[geom_arg_idx].clone()]; + let executor = + RasterExecutor::new_with_num_iterations(&exec_arg_types, &exec_args, num_iterations); + + with_gdal(|gdal| { + configure_thread_local_options(gdal, config_options)?; + with_global_proj_engine(|engine| { + 
executor.execute_raster_wkb_crs_void(|raster_opt, wkb_opt, geom_crs| { + let band = band_iter + .next() + .flatten() + .unwrap_or(1) + .max(1) + .try_into() + .unwrap_or(1); + let all_touched = all_touched_iter.next().flatten().unwrap_or(false); + let exclude_nodata = exclude_nodata_iter.next().flatten().unwrap_or(true); + let lenient = lenient_iter.next().flatten().unwrap_or(true); + + let (raster, geom_wkb) = match (raster_opt, wkb_opt) { + (Some(r), Some(w)) => (r, w), + _ => { + append_null(&mut builder); + return Ok(()); + } + }; + + let raster_crs = resolve_crs(raster.crs())?; + + let geom_wkb = match (geom_crs, raster_crs.as_deref()) { + (Some(geom_crs), Some(raster_crs)) => { + crs_transform_wkb(geom_wkb, geom_crs, raster_crs, engine)? + } + (None, None) => geom_wkb.to_vec(), + (Some(_), None) => { + return exec_err!( + "Cannot operate on geometry and raster: raster has no CRS but geometry does" + ) + } + (None, Some(_)) => { + return exec_err!( + "Cannot operate on geometry and raster: geometry has no CRS but raster does" + ) + } + }; + + match compute_zonal_stats( + gdal, + raster, + &geom_wkb, + band, + all_touched, + exclude_nodata, + ) { + Ok(stats) => append_stats(&mut builder, stats), + Err(e) => { + if lenient { + eprintln!("RS_ZonalStatsAll error: {}", e); + append_null(&mut builder); + } else { + return Err(e); + } + } + } + + Ok(()) + }) + })?; + + executor.finish(Arc::new(builder.finish()) as ArrayRef) + }) + } +} + +/// Return type for ZonalStatsAll struct +fn zonal_stats_struct_type() -> DataType { + DataType::Struct(zonal_stats_struct_fields()) +} + +/// Fields for the ZonalStatsAll struct +fn zonal_stats_struct_fields() -> Fields { + Fields::from(vec![ + Field::new("count", DataType::Int64, true), + Field::new("sum", DataType::Float64, true), + Field::new("mean", DataType::Float64, true), + Field::new("median", DataType::Float64, true), + Field::new("mode", DataType::Float64, true), + Field::new("stddev", DataType::Float64, true), + 
Field::new("variance", DataType::Float64, true), + Field::new("min", DataType::Float64, true), + Field::new("max", DataType::Float64, true), + ]) +} + +// ============================================================================= +// Core Statistics Computation +// ============================================================================= + +/// Compute zonal statistics for a raster within a geometry +fn compute_zonal_stats( + gdal: &Gdal, + raster: &RasterRefImpl<'_>, + geom_wkb: &[u8], + band_num: usize, + all_touched: bool, + exclude_nodata: bool, +) -> Result { + let metadata = raster.metadata(); + + let mut band_reader = RasterBandReader::new(gdal, raster); + + // Parse geometry from WKB + let geometry = gdal + .geometry_from_wkb(geom_wkb) + .map_err(|e| exec_datafusion_err!("Failed to parse geometry from WKB: {}", e))?; + + let geom_bounds = bounds_from_envelope(geometry.envelope()); + let raster_bounds = raster_bounds(raster); + let intersection = match geom_bounds.intersection(raster_bounds) { + Some(bounds) => bounds, + None => return compute_statistics(&[]), + }; + + let window = match bounds_to_window(raster, intersection)? 
{ + Some(window) => window, + None => return compute_statistics(&[]), + }; + + // Create a mask raster + let mask_dataset = + MemDatasetBuilder::create(gdal, window.width, window.height, 1, GdalDataType::UInt8) + .map_err(|e| exec_datafusion_err!("Failed to create mask dataset: {}", e))?; + + // Set geotransform + let start_col = window.xoff as f64; + let start_row = window.yoff as f64; + let geotransform = [ + metadata.upper_left_x() + start_col * metadata.scale_x() + start_row * metadata.skew_x(), + metadata.scale_x(), + metadata.skew_x(), + metadata.upper_left_y() + start_col * metadata.skew_y() + start_row * metadata.scale_y(), + metadata.skew_y(), + metadata.scale_y(), + ]; + mask_dataset + .set_geo_transform(&geotransform) + .map_err(|e| exec_datafusion_err!("Failed to set geotransform: {}", e))?; + + // Initialize mask to 0 + let mask_band = mask_dataset + .rasterband(1) + .map_err(|e| exec_datafusion_err!("Failed to get mask band: {}", e))?; + let zeros = vec![0u8; window.width * window.height]; + let mut buffer = Buffer::new((window.width, window.height), zeros); + mask_band + .write((0, 0), (window.width, window.height), &mut buffer) + .map_err(|e| exec_datafusion_err!("Failed to initialize mask: {}", e))?; + + gdal.rasterize_affine(&mask_dataset, &[1], &[geometry], &[1.0], all_touched) + .map_err(|e| exec_datafusion_err!("Failed to rasterize geometry: {}", e))?; + + // Read mask + let mask_band = mask_dataset + .rasterband(1) + .map_err(|e| exec_datafusion_err!("Failed to get mask band: {}", e))?; + let mask_buffer = mask_band + .read_as::( + (0, 0), + (window.width, window.height), + (window.width, window.height), + None, + ) + .map_err(|e| exec_datafusion_err!("Failed to read mask: {}", e))?; + let mask = mask_buffer.data(); + + let band = raster + .bands() + .band(band_num) + .map_err(|e| exec_datafusion_err!("Failed to get band: {}", e))?; + let band_metadata = band.metadata(); + let data_type = band_metadata.data_type()?; + let nodata = 
nodata_bytes_to_f64(band_metadata.nodata_value(), &data_type); + + // Collect pixel values within the geometry + let mut values: Vec = Vec::new(); + + let band_values = band_reader.read_window_f64( + band_num, + (window.xoff, window.yoff), + (window.width, window.height), + )?; + + for (pixel_idx, &mask_val) in mask.iter().enumerate().take(window.width * window.height) { + if mask_val == 1 { + let value = band_values[pixel_idx]; + + // Check for nodata + if exclude_nodata { + if let Some(no_data) = nodata { + if (value - no_data).abs() < f64::EPSILON || value.is_nan() { + continue; + } + } + } + + values.push(value); + } + } + + // Compute statistics + compute_statistics(&values) +} + +#[derive(Clone, Copy, Debug)] +struct Bounds { + min_x: f64, + max_x: f64, + min_y: f64, + max_y: f64, +} + +impl Bounds { + fn intersection(self, other: Bounds) -> Option { + let min_x = self.min_x.max(other.min_x); + let max_x = self.max_x.min(other.max_x); + let min_y = self.min_y.max(other.min_y); + let max_y = self.max_y.min(other.max_y); + + if min_x > max_x || min_y > max_y { + None + } else { + Some(Bounds { + min_x, + max_x, + min_y, + max_y, + }) + } + } +} + +#[derive(Clone, Copy, Debug)] +struct RasterWindow { + xoff: usize, + yoff: usize, + width: usize, + height: usize, +} + +fn bounds_from_envelope(env: sedona_gdal::vector::geometry::Envelope) -> Bounds { + Bounds { + min_x: env.MinX, + max_x: env.MaxX, + min_y: env.MinY, + max_y: env.MaxY, + } +} + +fn raster_bounds(raster: &RasterRefImpl<'_>) -> Bounds { + let metadata = raster.metadata(); + let width = metadata.width() as i64; + let height = metadata.height() as i64; + let corners = [ + to_world_coordinate(raster, 0, 0), + to_world_coordinate(raster, width, 0), + to_world_coordinate(raster, 0, height), + to_world_coordinate(raster, width, height), + ]; + + let mut min_x = f64::INFINITY; + let mut max_x = f64::NEG_INFINITY; + let mut min_y = f64::INFINITY; + let mut max_y = f64::NEG_INFINITY; + + for (x, y) in 
corners { + min_x = min_x.min(x); + max_x = max_x.max(x); + min_y = min_y.min(y); + max_y = max_y.max(y); + } + + Bounds { + min_x, + max_x, + min_y, + max_y, + } +} + +fn world_to_pixel_f64( + raster: &RasterRefImpl<'_>, + world_x: f64, + world_y: f64, +) -> Result<(f64, f64)> { + let metadata = raster.metadata(); + let det = metadata.scale_x() * metadata.scale_y() - metadata.skew_x() * metadata.skew_y(); + + if det.abs() < f64::EPSILON { + return exec_err!("Cannot compute coordinate: determinant is zero."); + } + + let inv_scale_x = metadata.scale_y() / det; + let inv_scale_y = metadata.scale_x() / det; + let inv_skew_x = -metadata.skew_x() / det; + let inv_skew_y = -metadata.skew_y() / det; + + let dx = world_x - metadata.upper_left_x(); + let dy = world_y - metadata.upper_left_y(); + + let col = inv_scale_x * dx + inv_skew_x * dy; + let row = inv_skew_y * dx + inv_scale_y * dy; + + Ok((col, row)) +} + +fn bounds_to_window(raster: &RasterRefImpl<'_>, bounds: Bounds) -> Result> { + let metadata = raster.metadata(); + let raster_w = metadata.width() as isize; + let raster_h = metadata.height() as isize; + + let corners = [ + (bounds.min_x, bounds.min_y), + (bounds.min_x, bounds.max_y), + (bounds.max_x, bounds.min_y), + (bounds.max_x, bounds.max_y), + ]; + + let mut min_col = f64::INFINITY; + let mut max_col = f64::NEG_INFINITY; + let mut min_row = f64::INFINITY; + let mut max_row = f64::NEG_INFINITY; + + for (x, y) in corners { + let (col, row) = world_to_pixel_f64(raster, x, y)?; + min_col = min_col.min(col); + max_col = max_col.max(col); + min_row = min_row.min(row); + max_row = max_row.max(row); + } + + let mut start_col = min_col.floor() as isize - 1; + let mut end_col = max_col.ceil() as isize + 1; + let mut start_row = min_row.floor() as isize - 1; + let mut end_row = max_row.ceil() as isize + 1; + + start_col = start_col.max(0).min(raster_w); + end_col = end_col.max(0).min(raster_w); + start_row = start_row.max(0).min(raster_h); + end_row = 
end_row.max(0).min(raster_h); + + if end_col <= start_col || end_row <= start_row { + return Ok(None); + } + + Ok(Some(RasterWindow { + xoff: start_col as usize, + yoff: start_row as usize, + width: (end_col - start_col) as usize, + height: (end_row - start_row) as usize, + })) +} + +/// Compute all statistics from a vector of values +fn compute_statistics(values: &[f64]) -> Result { + if values.is_empty() { + return Ok(ZonalStatistics { + count: 0, + sum: 0.0, + mean: f64::NAN, + median: f64::NAN, + mode: f64::NAN, + stddev: f64::NAN, + variance: f64::NAN, + min: f64::NAN, + max: f64::NAN, + }); + } + + let count = values.len() as i64; + let sum: f64 = values.iter().sum(); + let mean = sum / count as f64; + + // Min and max + let min = values.iter().cloned().fold(f64::INFINITY, f64::min); + let max = values.iter().cloned().fold(f64::NEG_INFINITY, f64::max); + + // Variance and standard deviation + let variance = if count > 1 { + let sum_sq_diff: f64 = values.iter().map(|&v| (v - mean).powi(2)).sum(); + sum_sq_diff / (count as f64 - 1.0) // Sample variance + } else { + 0.0 + }; + let stddev = variance.sqrt(); + + // Median + let median = { + let mut sorted = values.to_vec(); + sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)); + let mid = sorted.len() / 2; + if sorted.len().is_multiple_of(2) { + (sorted[mid - 1] + sorted[mid]) / 2.0 + } else { + sorted[mid] + } + }; + + // Mode (most frequent value) + let mode = { + let mut counts: HashMap = HashMap::new(); + for &v in values { + // Quantize to avoid floating point comparison issues + let key = (v * 1_000_000.0).round() as i64; + *counts.entry(key).or_insert(0) += 1; + } + let (mode_key, _) = counts + .into_iter() + .max_by_key(|(_, count)| *count) + .unwrap_or((0, 0)); + mode_key as f64 / 1_000_000.0 + }; + + Ok(ZonalStatistics { + count, + sum, + mean, + median, + mode, + stddev, + variance, + min, + max, + }) +} + +// 
============================================================================= +// Helper Functions +// ============================================================================= + +/// Expand an optional Int32 argument to an array. If `arg_idx` is `None`, returns +/// a constant array filled with `default_val`. +fn expand_int32_arg( + args: &[ColumnarValue], + arg_idx: Option, + default_val: i32, + num_iterations: usize, +) -> Result { + let array = match arg_idx { + Some(idx) => args[idx] + .clone() + .cast_to(&DataType::Int32, None)? + .into_array(num_iterations)?, + None => ScalarValue::Int32(Some(default_val)).to_array_of_size(num_iterations)?, + }; + Ok(as_int32_array(&array)?.clone()) +} + +/// Expand an optional Boolean argument to an array. If `arg_idx` is `None`, returns +/// a constant array filled with `default_val`. +fn expand_boolean_arg( + args: &[ColumnarValue], + arg_idx: Option, + default_val: bool, + num_iterations: usize, +) -> Result { + let array = match arg_idx { + Some(idx) => args[idx] + .clone() + .cast_to(&DataType::Boolean, None)? 
+ .into_array(num_iterations)?, + None => ScalarValue::Boolean(Some(default_val)).to_array_of_size(num_iterations)?, + }; + Ok(as_boolean_array(&array)?.clone()) +} + +/// Calculate number of iterations +fn calc_num_iterations(args: &[ColumnarValue]) -> usize { + for arg in args { + if let ColumnarValue::Array(array) = arg { + return array.len(); + } + } + 1 +} + +#[cfg(test)] +mod tests { + use super::*; + use datafusion_common::cast::as_struct_array; + use sedona_gdal::mem::MemDatasetBuilder; + use sedona_gdal::raster::types::GdalDataType; + use sedona_raster::affine_transformation::to_world_coordinate; + use sedona_raster::array::RasterStructArray; + use sedona_raster_functions::crs_utils::crs_transform_coord; + use sedona_schema::crs::deserialize_crs; + use sedona_schema::datatypes::Edges; + use sedona_schema::datatypes::RASTER; + use sedona_testing::create::make_wkb; + + #[test] + fn test_stat_type_from_str() { + assert_eq!(StatType::from_str("count"), Some(StatType::Count)); + assert_eq!(StatType::from_str("COUNT"), Some(StatType::Count)); + assert_eq!(StatType::from_str("mean"), Some(StatType::Mean)); + assert_eq!(StatType::from_str("avg"), Some(StatType::Mean)); + assert_eq!(StatType::from_str("stddev"), Some(StatType::StdDev)); + assert_eq!(StatType::from_str("invalid"), None); + } + + #[test] + fn test_compute_statistics() { + let values = vec![1.0, 2.0, 3.0, 4.0, 5.0]; + let stats = compute_statistics(&values).unwrap(); + + assert_eq!(stats.count, 5); + assert!((stats.sum - 15.0).abs() < f64::EPSILON); + assert!((stats.mean - 3.0).abs() < f64::EPSILON); + assert!((stats.median - 3.0).abs() < f64::EPSILON); + assert!((stats.min - 1.0).abs() < f64::EPSILON); + assert!((stats.max - 5.0).abs() < f64::EPSILON); + // Variance = ((1-3)^2 + (2-3)^2 + (3-3)^2 + (4-3)^2 + (5-3)^2) / 4 = 10/4 = 2.5 + assert!((stats.variance - 2.5).abs() < 0.001); + assert!((stats.stddev - 2.5_f64.sqrt()).abs() < 0.001); + } + + #[test] + fn test_compute_statistics_empty() { + let 
values: Vec = vec![]; + let stats = compute_statistics(&values).unwrap(); + + assert_eq!(stats.count, 0); + assert_eq!(stats.sum, 0.0); + assert!(stats.mean.is_nan()); + assert!(stats.min.is_nan()); + assert!(stats.max.is_nan()); + } + + #[test] + fn test_compute_statistics_single() { + let values = vec![42.0]; + let stats = compute_statistics(&values).unwrap(); + + assert_eq!(stats.count, 1); + assert!((stats.sum - 42.0).abs() < f64::EPSILON); + assert!((stats.mean - 42.0).abs() < f64::EPSILON); + assert!((stats.median - 42.0).abs() < f64::EPSILON); + assert!((stats.min - 42.0).abs() < f64::EPSILON); + assert!((stats.max - 42.0).abs() < f64::EPSILON); + } + + #[test] + fn test_rs_zonal_stats_with_test_raster() { + use sedona_raster::array::RasterStructArray; + + let test_file = sedona_testing::data::test_raster("test4.tiff").unwrap(); + with_gdal(|gdal| { + let raster_array = crate::utils::load_as_indb_raster(gdal, &test_file)?; + let raster_struct = RasterStructArray::new(&raster_array); + let raster = raster_struct.get(0).unwrap(); + + let metadata = raster.metadata(); + let min_x = metadata.upper_left_x(); + let max_y = metadata.upper_left_y(); + let max_x = min_x + (metadata.width() as f64 * metadata.scale_x()); + let min_y = max_y + (metadata.height() as f64 * metadata.scale_y()); + + let wkt = format!( + "POLYGON(({} {}, {} {}, {} {}, {} {}, {} {}))", + min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y, min_x, min_y + ); + + let geometry = gdal.geometry_from_wkt(&wkt).unwrap(); + let geom_wkb = geometry.wkb().map_err(|e| exec_datafusion_err!("{e}"))?; + let stats = compute_zonal_stats(gdal, &raster, &geom_wkb, 1, false, true)?; + assert!(stats.count > 0, "Should have some pixels"); + assert!(stats.min <= stats.max, "Min should be <= max"); + assert!( + stats.min <= stats.mean && stats.mean <= stats.max, + "Mean should be between min and max" + ); + Ok::<_, datafusion_common::DataFusionError>(()) + }) + .unwrap(); + } + + #[test] + fn 
test_rs_zonal_stats_crs_mismatch() { + use sedona_expr::scalar_udf::SedonaScalarKernel; + + let test_file = sedona_testing::data::test_raster("test4.tiff").unwrap(); + let raster_array = + with_gdal(|gdal| crate::utils::load_as_indb_raster(gdal, &test_file)).unwrap(); + + let raster_struct = RasterStructArray::new(&raster_array); + let raster = raster_struct.get(0).unwrap(); + let width = raster.metadata().width() as i64; + let height = raster.metadata().height() as i64; + let col = width / 2; + let row = height / 2; + let (lon, lat) = to_world_coordinate(&raster, col, row); + + let point_wkt = format!("POINT ({} {})", lon, lat); + let point_wkb = make_wkb(&point_wkt); + let (x_merc, y_merc) = with_global_proj_engine(|engine| { + crs_transform_coord(engine, (lon, lat), "OGC:CRS84", "EPSG:3857") + }) + .unwrap(); + let point_merc_wkt = format!("POINT ({} {})", x_merc, y_merc); + let point_merc_wkb = make_wkb(&point_merc_wkt); + + let raster_scalar = ColumnarValue::Scalar(ScalarValue::Struct(Arc::new(raster_array))); + let geom_type_4326 = SedonaType::Wkb(Edges::Planar, deserialize_crs("EPSG:4326").unwrap()); + let geom_type_3857 = SedonaType::Wkb(Edges::Planar, deserialize_crs("EPSG:3857").unwrap()); + + let zonal_kernel = RsZonalStats { arg_count: 3 }; + + let stat_type = ColumnarValue::Scalar(ScalarValue::Utf8(Some("count".to_string()))); + + let result_4326 = match zonal_kernel.invoke_batch( + &[RASTER, geom_type_4326, SedonaType::Arrow(DataType::Utf8)], + &[ + raster_scalar.clone(), + ColumnarValue::Scalar(ScalarValue::Binary(Some(point_wkb))), + stat_type.clone(), + ], + ) { + Ok(value) => value, + Err(err) => { + let message = err.to_string(); + if message.contains("proj-sys") { + return; + } + panic!("Unexpected RS_ZonalStats error: {message}"); + } + }; + + let result_3857 = match zonal_kernel.invoke_batch( + &[RASTER, geom_type_3857, SedonaType::Arrow(DataType::Utf8)], + &[ + raster_scalar, + ColumnarValue::Scalar(ScalarValue::Binary(Some(point_merc_wkb))), 
+ stat_type, + ], + ) { + Ok(value) => value, + Err(err) => { + let message = err.to_string(); + if message.contains("proj-sys") { + return; + } + panic!("Unexpected RS_ZonalStats error: {message}"); + } + }; + + let value_4326 = match result_4326 { + ColumnarValue::Scalar(ScalarValue::Float64(Some(value))) => value, + _ => panic!("Expected Float64 scalar result"), + }; + let value_3857 = match result_3857 { + ColumnarValue::Scalar(ScalarValue::Float64(Some(value))) => value, + _ => panic!("Expected Float64 scalar result"), + }; + + assert_eq!(value_4326, value_3857); + } + + #[test] + fn test_rs_zonal_stats_outdb_raster() { + use arrow_schema::DataType; + use sedona_expr::scalar_udf::SedonaScalarKernel; + use sedona_schema::datatypes::SedonaType; + use sedona_testing::create::make_wkb; + + let test_file = sedona_testing::data::test_raster("test4.tiff").unwrap(); + let in_db_array = + with_gdal(|gdal| crate::utils::load_as_indb_raster(gdal, &test_file)).unwrap(); + + let outdb_kernel = crate::rs_from_path::RsFromPath::new(false); + let outdb_value = outdb_kernel + .invoke_batch( + &[SedonaType::Arrow(DataType::Utf8)], + &[ColumnarValue::Scalar(ScalarValue::Utf8(Some( + test_file.clone(), + )))], + ) + .unwrap(); + + let raster_struct = RasterStructArray::new(&in_db_array); + let raster = raster_struct.get(0).unwrap(); + let metadata = raster.metadata(); + let min_x = metadata.upper_left_x(); + let max_y = metadata.upper_left_y(); + let max_x = min_x + (metadata.width() as f64 * metadata.scale_x()); + let min_y = max_y + (metadata.height() as f64 * metadata.scale_y()); + + let wkt = format!( + "POLYGON(({} {}, {} {}, {} {}, {} {}, {} {}))", + min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y, min_x, min_y + ); + let geom_wkb = make_wkb(&wkt); + + let zonal_kernel = RsZonalStats { arg_count: 3 }; + let geom_type = SedonaType::Wkb(Edges::Planar, deserialize_crs("EPSG:4326").unwrap()); + + let args = vec![ + 
ColumnarValue::Scalar(ScalarValue::Struct(Arc::new(in_db_array.clone()))), + ColumnarValue::Scalar(ScalarValue::Binary(Some(geom_wkb.clone()))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("count".to_string()))), + ]; + let outdb_args = vec![ + outdb_value, + ColumnarValue::Scalar(ScalarValue::Binary(Some(geom_wkb))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("count".to_string()))), + ]; + + let in_db_result = match zonal_kernel.invoke_batch( + &[RASTER, geom_type.clone(), SedonaType::Arrow(DataType::Utf8)], + &args, + ) { + Ok(value) => value, + Err(err) => { + let message = err.to_string(); + if message.contains("proj-sys") { + return; + } + panic!("Unexpected RS_ZonalStats error: {message}"); + } + }; + let outdb_result = match zonal_kernel.invoke_batch( + &[RASTER, geom_type, SedonaType::Arrow(DataType::Utf8)], + &outdb_args, + ) { + Ok(value) => value, + Err(err) => { + let message = err.to_string(); + if message.contains("proj-sys") { + return; + } + panic!("Unexpected RS_ZonalStats error: {message}"); + } + }; + + let in_db_value = match in_db_result { + ColumnarValue::Scalar(ScalarValue::Float64(Some(value))) => value, + _ => panic!("Expected Float64 scalar result"), + }; + let outdb_value = match outdb_result { + ColumnarValue::Scalar(ScalarValue::Float64(Some(value))) => value, + _ => panic!("Expected Float64 scalar result"), + }; + + assert_eq!(in_db_value, outdb_value); + } + + #[test] + fn test_rs_zonal_stats_outdb_tile_from_rs_geotiff_tiles() { + use arrow_schema::{DataType, Field, Schema, SchemaRef}; + use sedona_raster::array::RasterStructArray; + use tempfile::tempdir; + + let tmp = tempdir().unwrap(); + let dst = tmp.path().join("test4.tiff"); + let src = sedona_testing::data::test_raster("test4.tiff").unwrap(); + std::fs::copy(&src, &dst).unwrap(); + + // Build a record batch the same way rs_geotiff_tiles does. 
+ let rast_field = sedona_schema::datatypes::RASTER + .to_storage_field("rast", false) + .unwrap(); + let schema: SchemaRef = Arc::new(Schema::new(vec![ + Field::new("path", DataType::Utf8, false), + Field::new("x", DataType::UInt32, false), + Field::new("y", DataType::UInt32, false), + rast_field, + ])); + + let batch = crate::rs_geotiff_tiles::build_batch_for_file(dst, schema) + .unwrap() + .unwrap(); + assert!(batch.num_rows() > 0); + + let rast_array = batch.column(3).clone(); + let rast_struct_array = + as_struct_array(&rast_array).expect("rast column should be a StructArray"); + let rast_struct = RasterStructArray::new(rast_struct_array); + let raster = rast_struct.get(0).unwrap(); + + // Polygon covering the whole tile. + let metadata = raster.metadata(); + let min_x = metadata.upper_left_x(); + let max_y = metadata.upper_left_y(); + let max_x = min_x + (metadata.width() as f64 * metadata.scale_x()); + let min_y = max_y + (metadata.height() as f64 * metadata.scale_y()); + let wkt = format!( + "POLYGON(({} {}, {} {}, {} {}, {} {}, {} {}))", + min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y, min_x, min_y + ); + let geom_wkb = make_wkb(&wkt); + + let result = + with_gdal(|gdal| compute_zonal_stats(gdal, &raster, &geom_wkb, 1, false, true)); + assert!( + result.is_ok(), + "Zonal stats should succeed on out-db tiles: {:?}", + result.err() + ); + + let stats = result.unwrap(); + assert!(stats.count > 0); + assert!(stats.min <= stats.max); + assert!(stats.min <= stats.mean && stats.mean <= stats.max); + } + + #[test] + fn test_rs_zonal_stats_outdb_tile_exclude_nodata() { + use arrow_schema::{Schema, SchemaRef}; + use std::path::Path; + use tempfile::tempdir; + + fn write_tiled_geotiff_f32( + gdal: &sedona_gdal::gdal::Gdal, + path: &Path, + w: usize, + h: usize, + block: u32, + nodata: f64, + ) { + let mem_ds = MemDatasetBuilder::create(gdal, w, h, 1, GdalDataType::Float32).unwrap(); + mem_ds + .set_geo_transform(&[0.0, 1.0, 0.0, 0.0, 0.0, -1.0]) + .unwrap(); 
+ + let band = mem_ds.rasterband(1).unwrap(); + band.set_no_data_value(Some(nodata)).unwrap(); + + let mut data: Vec = (0..(w * h)).map(|v| v as f32).collect(); + // Put a few nodata pixels in the upper-left block so tile (0,0) contains them. + for (col, row) in [(0usize, 0usize), (1, 2), (3, 3)] { + data[row * w + col] = nodata as f32; + } + let mut buffer = Buffer::new((w, h), data); + band.write((0, 0), (w, h), &mut buffer).unwrap(); + + let gtiff_driver = gdal.get_driver_by_name("GTiff").unwrap(); + let options_list = [ + "TILED=YES".to_string(), + format!("BLOCKXSIZE={}", block), + format!("BLOCKYSIZE={}", block), + ]; + let options_refs: Vec<&str> = options_list.iter().map(|s| s.as_str()).collect(); + let _out = mem_ds + .create_copy(>iff_driver, path.to_str().unwrap(), &options_refs) + .unwrap(); + } + + with_gdal(|gdal| { + let tmp = tempdir().unwrap(); + let dst = tmp.path().join("nodata_tiles.tif"); + let nodata = -9999.0; + write_tiled_geotiff_f32(gdal, &dst, 32, 32, 16, nodata); + + let rast_field = sedona_schema::datatypes::RASTER + .to_storage_field("rast", false) + .unwrap(); + let schema: SchemaRef = Arc::new(Schema::new(vec![ + Field::new("path", DataType::Utf8, false), + Field::new("x", DataType::UInt32, false), + Field::new("y", DataType::UInt32, false), + rast_field, + ])); + + let batch = crate::rs_geotiff_tiles::build_batch_for_file(dst, schema)? 
+ .expect("expected at least one tile"); + assert!(batch.num_rows() > 0); + + let rast_array = batch.column(3).clone(); + let rast_struct_array = + as_struct_array(&rast_array).expect("rast column should be a StructArray"); + let rast_struct = RasterStructArray::new(rast_struct_array); + let raster = rast_struct.get(0).unwrap(); + + let band = raster.bands().band(1).unwrap(); + let band_meta = band.metadata(); + let nodata_meta = + nodata_bytes_to_f64(band_meta.nodata_value(), &band_meta.data_type().unwrap()); + + let metadata = raster.metadata(); + let min_x = metadata.upper_left_x(); + let max_y = metadata.upper_left_y(); + let max_x = min_x + (metadata.width() as f64 * metadata.scale_x()); + let min_y = max_y + (metadata.height() as f64 * metadata.scale_y()); + let wkt = format!( + "POLYGON(({} {}, {} {}, {} {}, {} {}, {} {}))", + min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y, min_x, min_y + ); + let geom_wkb = make_wkb(&wkt); + + let include = compute_zonal_stats(gdal, &raster, &geom_wkb, 1, false, false)?; + let exclude = compute_zonal_stats(gdal, &raster, &geom_wkb, 1, false, true)?; + assert_eq!(include.count, 256); + assert_eq!(exclude.count, 253); + assert_eq!(nodata_meta, Some(-9999.0)); + assert!(exclude.min >= 0.0); + assert!((include.sum - exclude.sum - 3.0 * nodata).abs() < 1e-6); + Ok::<_, datafusion_common::DataFusionError>(()) + }) + .unwrap(); + } +} diff --git a/rust/sedona-raster-gdal/src/utils.rs b/rust/sedona-raster-gdal/src/utils.rs index 30f543071..b12b78c0a 100644 --- a/rust/sedona-raster-gdal/src/utils.rs +++ b/rust/sedona-raster-gdal/src/utils.rs @@ -23,6 +23,7 @@ use datafusion_common::error::Result; use datafusion_common::exec_datafusion_err; use sedona_gdal::dataset::Dataset; use sedona_gdal::spatial_ref::SpatialRef; +use sedona_gdal::{gdal::Gdal, raster::types::DatasetOptions}; use sedona_raster::builder::RasterBuilder; use sedona_raster::traits::BandMetadata; @@ -119,17 +120,32 @@ pub fn dataset_to_indb_raster(dataset: 
&Dataset) -> Result { .map_err(|e| exec_datafusion_err!("Failed to build raster: {}", e)) } +/// Load a GDAL dataset from the specified path and materialize it as an in-db raster `StructArray`. +pub fn load_as_indb_raster(gdal: &Gdal, path: &str) -> datafusion_common::Result { + let dataset = open_dataset(gdal, path).map_err(crate::gdal_common::convert_gdal_err)?; + dataset_to_indb_raster(&dataset) +} + +fn open_dataset(gdal: &Gdal, path: &str) -> sedona_gdal::errors::Result { + use sedona_gdal::gdal_dyn_bindgen::{GDAL_OF_RASTER, GDAL_OF_READONLY}; + + gdal.open_ex_with_options( + path, + DatasetOptions { + open_flags: GDAL_OF_RASTER | GDAL_OF_READONLY, + ..Default::default() + }, + ) +} + #[cfg(test)] mod tests { - use super::{append_as_indb_raster, dataset_to_indb_raster}; + use super::{append_as_indb_raster, dataset_to_indb_raster, load_as_indb_raster, open_dataset}; - use arrow_array::StructArray; use datafusion_common::exec_datafusion_err; use sedona_gdal::dataset::Dataset; use sedona_gdal::gdal::Gdal; - use sedona_gdal::gdal_dyn_bindgen::{GDAL_OF_RASTER, GDAL_OF_READONLY}; use sedona_gdal::raster::types::Buffer; - use sedona_gdal::raster::types::DatasetOptions; use sedona_raster::array::RasterStructArray; use sedona_raster::builder::RasterBuilder; use sedona_raster::traits::RasterRef; @@ -138,21 +154,6 @@ mod tests { use crate::gdal_common::with_gdal; - fn open_dataset(gdal: &Gdal, path: &str) -> sedona_gdal::errors::Result { - gdal.open_ex_with_options( - path, - DatasetOptions { - open_flags: GDAL_OF_RASTER | GDAL_OF_READONLY, - ..Default::default() - }, - ) - } - - fn load_as_indb_raster(gdal: &Gdal, path: &str) -> datafusion_common::Result { - let dataset = open_dataset(gdal, path).map_err(crate::gdal_common::convert_gdal_err)?; - dataset_to_indb_raster(&dataset) - } - fn write_uint64_tiff(gdal: &Gdal, path: &str, nodata: u64, data: Vec) { let driver = gdal.get_driver_by_name("GTiff").unwrap(); let dataset = driver.create_with_band_type::(path, 2, 2, 
1).unwrap(); diff --git a/rust/sedona/src/context.rs b/rust/sedona/src/context.rs index 7cd945911..c851d6c83 100644 --- a/rust/sedona/src/context.rs +++ b/rust/sedona/src/context.rs @@ -221,6 +221,16 @@ impl SedonaContext { Arc::new(RandomGeometryFunction::default()), ); + // Register GDAL-backed raster table functions + out.ctx.register_udtf( + "rs_geotiff_tiles", + sedona_raster_gdal::rs_geotiff_tiles_udtf(), + ); + + for udf in sedona_raster_gdal::all_gdal_udfs() { + out.ctx.register_udf(udf); + } + // Always register default function set out.register_function_set(sedona_functions::register::default_function_set()); diff --git a/sedona-cli/Cargo.toml b/sedona-cli/Cargo.toml index 9b49da89e..63e6c00c6 100644 --- a/sedona-cli/Cargo.toml +++ b/sedona-cli/Cargo.toml @@ -61,7 +61,7 @@ mimalloc = { workspace = true, optional = true } libmimalloc-sys = { workspace = true, optional = true } regex = { workspace = true } rustyline = "15.0" -sedona = { workspace = true, features = ["aws", "gcp", "http", "proj"] } +sedona = { workspace = true, features = ["aws", "gcp", "http", "proj", "gdal"] } sedona-common = { workspace = true } sedona-functions = { workspace = true } sedona-raster-functions = { workspace = true }