diff --git a/Cargo.lock b/Cargo.lock index 04c768a85..4b9eff88c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5812,6 +5812,7 @@ version = "0.4.0" dependencies = [ "arrow-array", "arrow-buffer", + "arrow-schema", "criterion", "datafusion-common", "lru", diff --git a/rust/sedona-raster-functions/src/rs_setsrid.rs b/rust/sedona-raster-functions/src/rs_setsrid.rs index 2ff6134e4..ad4fc718a 100644 --- a/rust/sedona-raster-functions/src/rs_setsrid.rs +++ b/rust/sedona-raster-functions/src/rs_setsrid.rs @@ -531,10 +531,15 @@ mod tests { let orig_bands = original.bands(); let mod_bands = modified.bands(); assert_eq!(orig_bands.len(), mod_bands.len()); + let mut orig_scratch = Vec::new(); + let mut mod_scratch = Vec::new(); for band_idx in 0..orig_bands.len() { let orig_band = orig_bands.band(band_idx + 1).unwrap(); let mod_band = mod_bands.band(band_idx + 1).unwrap(); - assert_eq!(orig_band.data(), mod_band.data()); + assert_eq!( + orig_band.contiguous_data(&mut orig_scratch).unwrap(), + mod_band.contiguous_data(&mut mod_scratch).unwrap() + ); assert_eq!( orig_band.metadata().data_type().unwrap(), mod_band.metadata().data_type().unwrap() diff --git a/rust/sedona-raster-gdal/Cargo.toml b/rust/sedona-raster-gdal/Cargo.toml index 5dba31c98..f0460ba19 100644 --- a/rust/sedona-raster-gdal/Cargo.toml +++ b/rust/sedona-raster-gdal/Cargo.toml @@ -33,6 +33,7 @@ result_large_err = "allow" [dependencies] arrow-array = { workspace = true } arrow-buffer = { workspace = true } +arrow-schema = { workspace = true } datafusion-common = { workspace = true } lru = { workspace = true } sedona-common = { workspace = true } diff --git a/rust/sedona-raster-gdal/src/gdal_common.rs b/rust/sedona-raster-gdal/src/gdal_common.rs index 2a6fad688..5d0c9bc0b 100644 --- a/rust/sedona-raster-gdal/src/gdal_common.rs +++ b/rust/sedona-raster-gdal/src/gdal_common.rs @@ -217,6 +217,16 @@ pub unsafe fn raster_ref_to_gdal_mem( // Note: GDALAddBand always appends a new band, so the destination band index // is sequential (1..=band_indices.len()), even if the source band indices are // sparse (e.g. [1, 3]). + // + // Safety: this loop rejects non-InDb bands above, so `contiguous_data` must + // early-return with a zero-copy slice of the Arrow column and leave `scratch` + // untouched. The Arrow column outlives the returned dataset (the caller owns + // the StructArray driving this loop), so the raw pointer captured into the + // builder remains valid for the dataset's lifetime. We assert `scratch` was + // not written after each iteration — if a future change relaxes the OutDb + // rejection above, the assert fires loudly instead of silently aliasing the + // captured pointer into a `scratch` that the next iteration will overwrite. + let mut scratch: Vec = Vec::new(); for &src_band_index in band_indices.iter() { let band = bands .band(src_band_index) @@ -238,11 +248,24 @@ pub unsafe fn raster_ref_to_gdal_mem( let band_metadata = band.metadata(); let band_type = band_metadata.data_type()?; let gdal_type = band_data_type_to_gdal(&band_type); - let band_data = band.data(); + let band_data = band + .contiguous_data(&mut scratch) + .map_err(|e| arrow_datafusion_err!(e))?; let data_ptr = band_data.as_ptr(); unsafe { mem_ds_builder = mem_ds_builder.add_band(gdal_type, data_ptr as *mut u8); } + if !scratch.is_empty() { + return Err(DataFusionError::Internal(format!( + "raster_ref_to_gdal_mem: contiguous_data wrote {} bytes into scratch \ + for band {src_band_index}; this path captures the slice's pointer and \ + reuses scratch across iterations, so the band MUST be schema-InDb \ + (zero-copy from the Arrow column). Either an InDb band hit the OutDb \ + loader path, or a future caller relaxed the OutDb rejection above \ + without restructuring to per-band buffers.", + scratch.len() + ))); + } } let dataset = unsafe { diff --git a/rust/sedona-raster-gdal/src/gdal_dataset_provider.rs b/rust/sedona-raster-gdal/src/gdal_dataset_provider.rs index a9d1013c5..b872d8d84 100644 --- a/rust/sedona-raster-gdal/src/gdal_dataset_provider.rs +++ b/rust/sedona-raster-gdal/src/gdal_dataset_provider.rs @@ -187,7 +187,7 @@ impl GDALDatasetCache { }) } - fn get_or_create_outdb_source( + pub(crate) fn get_or_create_outdb_source( &self, gdal: &Gdal, path: &str, diff --git a/rust/sedona-raster-gdal/src/lib.rs b/rust/sedona-raster-gdal/src/lib.rs index 8e8c871fb..dc4683492 100644 --- a/rust/sedona-raster-gdal/src/lib.rs +++ b/rust/sedona-raster-gdal/src/lib.rs @@ -31,15 +31,14 @@ mod gdal_common; // Temporary until https://github.com/apache/sedona-db/issues/804 is resolved. #[allow(dead_code)] mod gdal_dataset_provider; - -mod utils; - -#[cfg(test)] +mod outdb_loader; mod source_uri; +mod utils; // Re-export main dataset conversion functions pub use gdal_common::{ band_data_type_to_gdal, bytes_to_f64, gdal_to_band_data_type, gdal_type_byte_size, nodata_bytes_to_f64, nodata_f64_to_bytes, }; +pub use outdb_loader::register_outdb_loader; pub use utils::{append_as_indb_raster, dataset_to_indb_raster}; diff --git a/rust/sedona-raster-gdal/src/outdb_loader.rs b/rust/sedona-raster-gdal/src/outdb_loader.rs new file mode 100644 index 000000000..89a679406 --- /dev/null +++ b/rust/sedona-raster-gdal/src/outdb_loader.rs @@ -0,0 +1,379 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! GDAL-backed implementation of the OutDb band loader. +//! +//! Installs the process-wide loader registered with +//! [`sedona_raster::set_outdb_band_loader`]. Reads the band identified by +//! a `#band=N` URI fragment via the thread-local `GDALDatasetCache` so +//! the underlying GDAL dataset is opened at most once per (thread, URI). + +use arrow_schema::ArrowError; +use sedona_raster::{OutDbBandLoader, OutDbLoadRequest}; + +use crate::gdal_common::{convert_gdal_err, gdal_to_band_data_type, with_gdal}; +use crate::gdal_dataset_provider::thread_local_cache; +use crate::source_uri::parse_outdb_source; + +/// Install the GDAL-backed OutDb loader as the process-wide handler. +/// +/// Idempotent — multiple `SedonaContext::new()` calls per process are +/// safe; subsequent registrations are silently ignored by the underlying +/// `OnceLock` in `sedona-raster`. +pub fn register_outdb_loader() { + let loader: OutDbBandLoader = gdal_load; + sedona_raster::set_outdb_band_loader(loader); +} + +fn gdal_load(req: &OutDbLoadRequest<'_>, scratch: &mut Vec) -> Result<(), ArrowError> { + // 2-D `[y, x]` only for v1; higher-rank or transposed OutDb reads + // need MDArray support and explicit axis mapping, both tracked + // separately. Matches `raster_ref_to_gdal_mem`'s 2-D constraint in + // `gdal_common.rs`. + if req.source_shape.len() != 2 { + return Err(ArrowError::NotYetImplemented(format!( + "OutDb GDAL loader only supports 2-D bands; got source_shape with {} dims", + req.source_shape.len() + ))); + } + if req.dim_names != ["y", "x"] { + return Err(ArrowError::InvalidArgumentError(format!( + "OutDb GDAL loader requires dim_names=[\"y\", \"x\"]; got {:?}", + req.dim_names + ))); + } + // u64 → usize narrowing. Only the 32-bit target ever exercises the + // error branch — on 64-bit targets the conversion is infallible. + let height = usize::try_from(req.source_shape[0]).map_err(|_| { + ArrowError::InvalidArgumentError(format!( + "OutDb source_shape[0]={} exceeds usize::MAX", + req.source_shape[0] + )) + })?; + let width = usize::try_from(req.source_shape[1]).map_err(|_| { + ArrowError::InvalidArgumentError(format!( + "OutDb source_shape[1]={} exceeds usize::MAX", + req.source_shape[1] + )) + })?; + + let uri = req.uri; + + with_gdal(|gdal| { + // `#band=N` fragment, with N defaulting to 1 if absent. The + // returned path is what GDAL itself will open (VSI translation + // happens inside `open_gdal_dataset` via the cache key). + let (path, band_num) = parse_outdb_source(uri)?; + let cache = thread_local_cache()?; + let dataset = cache.get_or_create_outdb_source(gdal, &path, None)?; + let band = dataset + .rasterband(band_num as usize) + .map_err(convert_gdal_err)?; + // Verify the file's pixel type matches the band metadata's + // claim BEFORE reading. `read_as_bytes` returns bytes in the + // file's native type with no conversion — a mismatch would + // produce a 2x-or-N/2 byte count and the size check in + // `source_bytes()` would mis-blame the loader for size rather + // than naming the dtype mismatch. Catch it cleanly here. + let file_dtype = gdal_to_band_data_type(band.band_type())?; + if file_dtype != req.data_type { + return Err(sedona_common::sedona_internal_datafusion_err!( + "OutDb band metadata claims {:?} but file {} band {} is {:?}", + req.data_type, + uri, + band_num, + file_dtype + )); + } + // Caller (`load_outdb`) cleared scratch on entry; we append the + // band's bytes into it. `read_as_bytes` allocates a fresh Vec, so + // this is one heap operation per band — when GDAL exposes a + // `read_into_bytes(&mut [u8])` we can write directly into scratch + // and eliminate it. Tracked separately. + let bytes = band + .read_as_bytes((0, 0), (width, height), (width, height), None) + .map_err(convert_gdal_err)?; + scratch.extend_from_slice(&bytes); + Ok(()) + }) + .map_err(|e| ArrowError::ExternalError(Box::new(e))) +} + +#[cfg(test)] +mod tests { + use super::*; + use sedona_gdal::gdal::Gdal; + use sedona_gdal::raster::types::Buffer; + use sedona_raster::array::RasterStructArray; + use sedona_raster::builder::RasterBuilder; + use sedona_raster::traits::RasterRef; + use sedona_schema::raster::BandDataType; + use std::sync::Once; + use tempfile::TempDir; + + static REGISTER: Once = Once::new(); + fn ensure_registered() { + REGISTER.call_once(register_outdb_loader); + } + + /// 4x3 UInt8 GeoTIFF with deterministic byte pattern. Returns the + /// raw row-major bytes the file holds so callers can compare. + fn write_test_tiff(gdal: &Gdal, path: &str, width: usize, height: usize) -> Vec { + let driver = gdal.get_driver_by_name("GTiff").unwrap(); + let dataset = driver + .create_with_band_type::(path, width, height, 1) + .unwrap(); + dataset + .set_geo_transform(&[0.0, 1.0, 0.0, 0.0, 0.0, -1.0]) + .unwrap(); + let data: Vec = (0..(width * height)).map(|i| (i & 0xFF) as u8).collect(); + { + let band = dataset.rasterband(1).unwrap(); + let mut buffer = Buffer::new((width, height), data.clone()); + band.write((0, 0), (width, height), &mut buffer).unwrap(); + } + // Force a flush so reads via a separate dataset see the bytes. + drop(dataset); + data + } + + fn build_outdb_band_array(uri: &str, source_shape: &[u64]) -> arrow_array::StructArray { + build_outdb_band_array_with(uri, &["y", "x"], source_shape, BandDataType::UInt8) + } + + fn build_outdb_band_array_with( + uri: &str, + dim_names: &[&str], + source_shape: &[u64], + data_type: BandDataType, + ) -> arrow_array::StructArray { + let mut b = RasterBuilder::new(1); + let spatial: Vec = source_shape.iter().map(|&v| v as i64).collect(); + b.start_raster_nd(&[0.0, 1.0, 0.0, 0.0, 0.0, -1.0], dim_names, &spatial, None) + .unwrap(); + b.start_band_nd( + None, + dim_names, + source_shape, + data_type, + None, + Some(uri), + Some("geotiff"), + ) + .unwrap(); + b.band_data_writer().append_value([0u8; 0]); // schema-OutDb + b.finish_band().unwrap(); + b.finish_raster().unwrap(); + b.finish().unwrap() + } + + #[test] + fn loads_outdb_band_bytes_from_geotiff() { + ensure_registered(); + let tmp = TempDir::new().unwrap(); + let tif_path = tmp.path().join("single.tif"); + let tif_str = tif_path.to_str().unwrap(); + + let expected = with_gdal(|gdal| Ok(write_test_tiff(gdal, tif_str, 4, 3))).unwrap(); + + let arr = build_outdb_band_array(tif_str, &[3, 4]); // [height, width] + let rasters = RasterStructArray::new(&arr); + let raster = rasters.get(0).unwrap(); + let band = raster.band(0).unwrap(); + + let mut scratch = Vec::new(); + let buf = band.nd_buffer(&mut scratch).unwrap(); + assert_eq!(buf.buffer, expected.as_slice()); + assert_eq!(buf.shape, vec![3, 4]); + assert!( + !band.is_indb(), + "is_indb must remain false; OutDb is a schema property" + ); + } + + #[test] + fn repeated_calls_on_same_band_reuse_thread_local_dataset_cache() { + ensure_registered(); + let tmp = TempDir::new().unwrap(); + let tif_path = tmp.path().join("cached.tif"); + let tif_str = tif_path.to_str().unwrap(); + let expected = with_gdal(|gdal| Ok(write_test_tiff(gdal, tif_str, 4, 3))).unwrap(); + + let arr = build_outdb_band_array(tif_str, &[3, 4]); + let rasters = RasterStructArray::new(&arr); + let raster = rasters.get(0).unwrap(); + let band = raster.band(0).unwrap(); + + // Both calls go through GDAL. Cross-call cheapness comes from the + // `GDALDatasetCache` thread-local LRU keeping the source dataset + // open across calls; the per-band scratch is owned here and reused + // for both reads (same Vec, same allocation reused). The + // returned bytes must match across calls. + let mut scratch = Vec::new(); + let a = band.nd_buffer(&mut scratch).unwrap().buffer.to_vec(); + let b = band.nd_buffer(&mut scratch).unwrap().buffer.to_vec(); + assert_eq!(a, expected); + assert_eq!(a, b); + } + + fn write_two_band_tiff(gdal: &Gdal, path: &str) -> (Vec, Vec) { + let driver = gdal.get_driver_by_name("GTiff").unwrap(); + let dataset = driver.create_with_band_type::(path, 2, 2, 2).unwrap(); + dataset + .set_geo_transform(&[0.0, 1.0, 0.0, 0.0, 0.0, -1.0]) + .unwrap(); + let band1_data = vec![10u8, 20, 30, 40]; + let band2_data = vec![100u8, 110, 120, 130]; + { + let b1 = dataset.rasterband(1).unwrap(); + let mut buf1 = Buffer::new((2, 2), band1_data.clone()); + b1.write((0, 0), (2, 2), &mut buf1).unwrap(); + } + { + let b2 = dataset.rasterband(2).unwrap(); + let mut buf2 = Buffer::new((2, 2), band2_data.clone()); + b2.write((0, 0), (2, 2), &mut buf2).unwrap(); + } + drop(dataset); + (band1_data, band2_data) + } + + #[test] + fn band_fragment_selects_correct_band() { + ensure_registered(); + let tmp = TempDir::new().unwrap(); + let tif_path = tmp.path().join("multi.tif"); + let tif_str = tif_path.to_str().unwrap(); + let (band1, band2) = with_gdal(|gdal| Ok(write_two_band_tiff(gdal, tif_str))).unwrap(); + + let uri_b1 = format!("{tif_str}#band=1"); + let uri_b2 = format!("{tif_str}#band=2"); + + let mut scratch = Vec::new(); + let arr1 = build_outdb_band_array(&uri_b1, &[2, 2]); + let r1 = RasterStructArray::new(&arr1); + let raster_one = r1.get(0).unwrap(); + let band_one = raster_one.band(0).unwrap(); + assert_eq!( + band_one.nd_buffer(&mut scratch).unwrap().buffer, + band1.as_slice() + ); + + let arr2 = build_outdb_band_array(&uri_b2, &[2, 2]); + let r2 = RasterStructArray::new(&arr2); + let raster_two = r2.get(0).unwrap(); + let band_two = raster_two.band(0).unwrap(); + assert_eq!( + band_two.nd_buffer(&mut scratch).unwrap().buffer, + band2.as_slice() + ); + } + + #[test] + fn rejects_non_yx_dim_names() { + ensure_registered(); + let tmp = TempDir::new().unwrap(); + let tif_path = tmp.path().join("xy.tif"); + let tif_str = tif_path.to_str().unwrap(); + let _ = with_gdal(|gdal| Ok(write_test_tiff(gdal, tif_str, 4, 3))).unwrap(); + + // Band metadata says dim order is [x, y] instead of the + // GDAL-expected [y, x]. Loader must refuse rather than reading + // with transposed dimensions. + let arr = build_outdb_band_array_with(tif_str, &["x", "y"], &[4, 3], BandDataType::UInt8); + let rasters = RasterStructArray::new(&arr); + let raster = rasters.get(0).unwrap(); + let band = raster.band(0).unwrap(); + let mut scratch = Vec::new(); + let err = band.nd_buffer(&mut scratch).unwrap_err().to_string(); + assert!( + err.contains("dim_names") && err.contains("\"x\""), + "expected dim_names rejection naming the offending axes, got: {err}" + ); + } + + #[test] + fn rejects_pixel_type_mismatch() { + ensure_registered(); + let tmp = TempDir::new().unwrap(); + let tif_path = tmp.path().join("u16.tif"); + let tif_str = tif_path.to_str().unwrap(); + // Write a UInt16 GeoTIFF but claim UInt8 in the band metadata. + with_gdal(|gdal| { + let driver = gdal.get_driver_by_name("GTiff").unwrap(); + let dataset = driver + .create_with_band_type::(tif_str, 2, 2, 1) + .unwrap(); + dataset + .set_geo_transform(&[0.0, 1.0, 0.0, 0.0, 0.0, -1.0]) + .unwrap(); + { + let band = dataset.rasterband(1).unwrap(); + let mut buf = Buffer::new((2, 2), vec![1u16, 2, 3, 4]); + band.write((0, 0), (2, 2), &mut buf).unwrap(); + } + drop(dataset); + Ok(()) + }) + .unwrap(); + + let arr = build_outdb_band_array_with(tif_str, &["y", "x"], &[2, 2], BandDataType::UInt8); + let rasters = RasterStructArray::new(&arr); + let raster = rasters.get(0).unwrap(); + let band = raster.band(0).unwrap(); + let mut scratch = Vec::new(); + let err = band.nd_buffer(&mut scratch).unwrap_err().to_string(); + assert!( + err.contains("UInt8") && err.contains("UInt16"), + "expected dtype mismatch naming both types, got: {err}" + ); + } + + #[test] + fn errors_on_missing_file() { + ensure_registered(); + let tmp = TempDir::new().unwrap(); + let missing = tmp.path().join("does_not_exist.tif"); + let arr = build_outdb_band_array(missing.to_str().unwrap(), &[2, 2]); + let rasters = RasterStructArray::new(&arr); + let raster = rasters.get(0).unwrap(); + let band = raster.band(0).unwrap(); + // Loader propagates GDAL's open-failure as an ArrowError; the + // exact message is GDAL/version dependent, so just assert that + // *some* error surfaces rather than a panic or silent empty buffer. + let mut scratch = Vec::new(); + assert!(band.nd_buffer(&mut scratch).is_err()); + } + + #[test] + fn errors_on_band_index_out_of_range() { + ensure_registered(); + let tmp = TempDir::new().unwrap(); + let tif_path = tmp.path().join("oneband.tif"); + let tif_str = tif_path.to_str().unwrap(); + let _ = with_gdal(|gdal| Ok(write_test_tiff(gdal, tif_str, 2, 2))).unwrap(); + + // File has 1 band; request band 5. + let uri = format!("{tif_str}#band=5"); + let arr = build_outdb_band_array(&uri, &[2, 2]); + let rasters = RasterStructArray::new(&arr); + let raster = rasters.get(0).unwrap(); + let band = raster.band(0).unwrap(); + let mut scratch = Vec::new(); + assert!(band.nd_buffer(&mut scratch).is_err()); + } +} diff --git a/rust/sedona-raster-gdal/src/utils.rs b/rust/sedona-raster-gdal/src/utils.rs index 30f543071..1a5cbc4d6 100644 --- a/rust/sedona-raster-gdal/src/utils.rs +++ b/rust/sedona-raster-gdal/src/utils.rs @@ -270,7 +270,10 @@ mod tests { assert_eq!(band.metadata().storage_type().unwrap(), StorageType::InDb); assert_eq!(band.metadata().data_type().unwrap(), BandDataType::UInt8); assert_eq!(band.metadata().nodata_value().unwrap(), [255u8]); - assert_eq!(band.data(), [1u8, 2, 3, 4, 5, 6]); + assert_eq!( + band.contiguous_data(&mut Vec::new()).unwrap(), + [1u8, 2, 3, 4, 5, 6] + ); } #[test] @@ -301,8 +304,10 @@ mod tests { &nodata.to_le_bytes() ); + let mut scratch = Vec::new(); let pixels: Vec = band - .data() + .contiguous_data(&mut scratch) + .unwrap() .chunks_exact(8) .map(|chunk| u64::from_le_bytes(chunk.try_into().unwrap())) .collect(); @@ -333,8 +338,10 @@ mod tests { &nodata.to_le_bytes() ); + let mut scratch = Vec::new(); let pixels: Vec = band - .data() + .contiguous_data(&mut scratch) + .unwrap() .chunks_exact(8) .map(|chunk| i64::from_le_bytes(chunk.try_into().unwrap())) .collect(); @@ -365,8 +372,10 @@ mod tests { &nodata.to_le_bytes() ); + let mut scratch = Vec::new(); let pixels: Vec = band - .data() + .contiguous_data(&mut scratch) + .unwrap() .chunks_exact(2) .map(|chunk| u16::from_le_bytes(chunk.try_into().unwrap())) .collect(); @@ -395,12 +404,18 @@ mod tests { assert_eq!(band1.metadata().storage_type().unwrap(), StorageType::InDb); assert_eq!(band1.metadata().data_type().unwrap(), BandDataType::UInt8); assert_eq!(band1.metadata().nodata_value().unwrap(), [255u8]); - assert_eq!(band1.data(), [10u8, 11, 12, 13]); + assert_eq!( + band1.contiguous_data(&mut Vec::new()).unwrap(), + [10u8, 11, 12, 13] + ); assert_eq!(band2.metadata().storage_type().unwrap(), StorageType::InDb); assert_eq!(band2.metadata().data_type().unwrap(), BandDataType::UInt8); assert_eq!(band2.metadata().nodata_value().unwrap(), [255u8]); - assert_eq!(band2.data(), [100u8, 0, 200, 0]); + assert_eq!( + band2.contiguous_data(&mut Vec::new()).unwrap(), + [100u8, 0, 200, 0] + ); } #[test] @@ -420,12 +435,18 @@ mod tests { assert_eq!(band1.metadata().storage_type().unwrap(), StorageType::InDb); assert_eq!(band1.metadata().data_type().unwrap(), BandDataType::UInt8); assert_eq!(band1.metadata().nodata_value().unwrap(), [0u8]); - assert_eq!(band1.data(), [10u8, 11, 12, 13]); + assert_eq!( + band1.contiguous_data(&mut Vec::new()).unwrap(), + [10u8, 11, 12, 13] + ); assert_eq!(band2.metadata().storage_type().unwrap(), StorageType::InDb); assert_eq!(band2.metadata().data_type().unwrap(), BandDataType::UInt8); assert_eq!(band2.metadata().nodata_value().unwrap(), [255u8]); - assert_eq!(band2.data(), [100u8, 0, 200, 0]); + assert_eq!( + band2.contiguous_data(&mut Vec::new()).unwrap(), + [100u8, 0, 200, 0] + ); } #[test] diff --git a/rust/sedona-raster/src/array.rs b/rust/sedona-raster/src/array.rs index bc7c2317b..187e21957 100644 --- a/rust/sedona-raster/src/array.rs +++ b/rust/sedona-raster/src/array.rs @@ -15,14 +15,13 @@ // specific language governing permissions and limitations // under the License. -use std::borrow::Cow; - use arrow_array::{ Array, BinaryArray, BinaryViewArray, Float64Array, Int64Array, ListArray, StringArray, StringViewArray, StructArray, UInt32Array, UInt64Array, }; use arrow_schema::ArrowError; +use crate::outdb_loader::{load_outdb, OutDbLoadRequest}; use crate::traits::{BandRef, Bands, NdBuffer, RasterRef, ViewEntry}; use sedona_schema::raster::{band_indices, raster_indices, BandDataType}; @@ -54,6 +53,86 @@ struct BandRefImpl<'a> { byte_offset: u64, } +impl<'a> BandRefImpl<'a> { + /// Overflow-safe expected size of the band's raw byte buffer. + /// `Π raw_source_shape × data_type.byte_size()`. A malicious or merely + /// huge `source_shape` could wrap silently in unchecked arithmetic; we + /// route every step through `checked_mul`/`try_into` and surface a + /// clean error. + fn expected_byte_count(&self) -> Result { + let elements = self + .raw_source_shape() + .iter() + .try_fold(1u64, |acc, &d| acc.checked_mul(d)) + .ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "band {} has source_shape whose product overflows u64", + self.band_row + )) + })?; + let elements_usize: usize = elements.try_into().map_err(|_| { + ArrowError::InvalidArgumentError(format!( + "band {} has too many elements for usize on this platform", + self.band_row + )) + })?; + elements_usize + .checked_mul(self.data_type.byte_size()) + .ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "band {} byte count overflows usize", + self.band_row + )) + }) + } + + /// Resolve the band's bytes through the uniform InDb/OutDb path. + /// + /// For schema-InDb bands (`data` column non-empty), returns a zero-copy + /// slice of the Arrow column and `scratch` is untouched. For schema-OutDb + /// bands (`data` column empty), invokes the process-wide OutDb loader, + /// which writes into `scratch`; returns a slice of `scratch`. + /// + /// Callers should pass the same `Vec` for every band in a loop — + /// scratch capacity grows to the largest band's source size and then + /// stabilises. The InDb path will leave the scratch contents unchanged + /// from the previous call; that's harmless because the returned slice + /// points into the Arrow column, not the scratch. + fn source_bytes<'s>(&'s self, scratch: &'s mut Vec) -> Result<&'s [u8], ArrowError> { + let column = self.data_array.value(self.band_row); + if !column.is_empty() { + return Ok(column); + } + let uri = self.outdb_uri().ok_or_else(|| { + ArrowError::ExternalError(Box::new(sedona_common::sedona_internal_datafusion_err!( + "OutDb band has empty data column but no outdb_uri" + ))) + })?; + let dim_names = self.dim_names(); + let req = OutDbLoadRequest { + uri, + format: self.outdb_format(), + dim_names: &dim_names, + source_shape: self.raw_source_shape(), + data_type: self.data_type, + }; + let expected = self.expected_byte_count()?; + load_outdb(&req, scratch)?; + if scratch.len() != expected { + return Err(ArrowError::ExternalError(Box::new( + sedona_common::sedona_internal_datafusion_err!( + "OutDb loader wrote {} bytes for band {} (uri={}), expected {}", + scratch.len(), + self.band_row, + uri, + expected + ), + ))); + } + Ok(scratch.as_slice()) + } +} + impl<'a> BandRef for BandRefImpl<'a> { fn ndim(&self) -> usize { self.view_entries.len() @@ -85,17 +164,6 @@ impl<'a> BandRef for BandRefImpl<'a> { self.data_type } - fn data(&self) -> &[u8] { - // Pre-N-D compatibility surface. Identity-view InDb bands → the - // row-major in-line buffer (zero-copy borrow into the StructArray), - // matching the pre-N-D behavior exactly. OutDb → `&[]` from the - // empty `data` column, no panic. Non-identity views never reach - // here — `RasterRefImpl::band()` rejects them upstream so the - // raw column bytes always equal the visible bytes for any band - // this reader produces. - self.data_array.value(self.band_row) - } - fn nodata(&self) -> Option<&[u8]> { if self.nodata_array.is_null(self.band_row) { None @@ -124,18 +192,13 @@ impl<'a> BandRef for BandRefImpl<'a> { !self.data_array.value(self.band_row).is_empty() } - fn nd_buffer(&self) -> Result, ArrowError> { - if !self.is_indb() { - return Err(ArrowError::NotYetImplemented( - "OutDb byte access via nd_buffer() is not yet implemented; \ - backend-specific OutDb resolvers are tracked separately" - .to_string(), - )); - } + fn nd_buffer<'s>(&'s self, scratch: &'s mut Vec) -> Result, ArrowError> { // shape and strides are owned by NdBuffer (see its doc comment). // Cloning here is cheap — both vecs are O(ndim), a handful of values. + // `source_bytes()` resolves to either the Arrow column (InDb, + // zero-copy) or `scratch` (OutDb, written by the loader on this call). Ok(NdBuffer { - buffer: self.data_array.value(self.band_row), + buffer: self.source_bytes(scratch)?, shape: self.visible_shape.clone(), strides: self.byte_strides.clone(), offset: self.byte_offset, @@ -143,17 +206,11 @@ impl<'a> BandRef for BandRefImpl<'a> { }) } - fn contiguous_data(&self) -> Result, ArrowError> { - if !self.is_indb() { - return Err(ArrowError::NotYetImplemented( - "OutDb byte access via contiguous_data() is not yet implemented; \ - backend-specific OutDb resolvers are tracked separately" - .to_string(), - )); - } - // Identity-view only today, so the data buffer is already row-major + fn contiguous_data<'s>(&'s self, scratch: &'s mut Vec) -> Result<&'s [u8], ArrowError> { + // Identity-view only today, so the source bytes (Arrow column for + // InDb, scratch-resident buffer for OutDb) are already row-major // over the visible region. - Ok(Cow::Borrowed(self.data_array.value(self.band_row))) + self.source_bytes(scratch) } } @@ -672,8 +729,10 @@ mod tests { // Access band with 1-based band_number let band = bands.band(1).unwrap(); - assert_eq!(band.data().len(), 100); - assert_eq!(band.data()[0], 1u8); + let mut scratch = Vec::new(); + let data = band.contiguous_data(&mut scratch).unwrap(); + assert_eq!(data.len(), 100); + assert_eq!(data[0], 1u8); let band_meta = band.metadata(); assert_eq!(band_meta.storage_type().unwrap(), StorageType::InDb); @@ -733,11 +792,13 @@ mod tests { // Test each band has different data // Use 1-based band numbers + let mut scratch = Vec::new(); for i in 0..3 { // Access band with 1-based band_number let band = bands.band(i + 1).unwrap(); let expected_value = i as u8; - assert!(band.data().iter().all(|&x| x == expected_value)); + let data = band.contiguous_data(&mut scratch).unwrap(); + assert!(data.iter().all(|&x| x == expected_value)); } // Test array @@ -746,8 +807,9 @@ mod tests { .enumerate() .map(|(i, band)| { let band = band.unwrap(); - assert_eq!(band.data()[0], i as u8); - band.data()[0] + let data = band.contiguous_data(&mut scratch).unwrap(); + assert_eq!(data[0], i as u8); + data[0] }) .collect(); @@ -1037,17 +1099,26 @@ mod tests { assert_eq!(r0.num_bands(), 3); assert_eq!(r0.band(0).unwrap().shape(), &[3]); assert_eq!( - &*r0.band(0).unwrap().contiguous_data().unwrap(), + r0.band(0) + .unwrap() + .contiguous_data(&mut Vec::new()) + .unwrap(), &[10u8, 20, 30] ); assert_eq!(r0.band(1).unwrap().shape(), &[3]); assert_eq!( - &*r0.band(1).unwrap().contiguous_data().unwrap(), + r0.band(1) + .unwrap() + .contiguous_data(&mut Vec::new()) + .unwrap(), &[40u8, 50, 60] ); assert_eq!(r0.band(2).unwrap().shape(), &[3]); assert_eq!( - &*r0.band(2).unwrap().contiguous_data().unwrap(), + r0.band(2) + .unwrap() + .contiguous_data(&mut Vec::new()) + .unwrap(), &[100u8, 101, 102] ); @@ -1055,12 +1126,18 @@ mod tests { assert_eq!(r1.num_bands(), 2); assert_eq!(r1.band(0).unwrap().shape(), &[4]); assert_eq!( - &*r1.band(0).unwrap().contiguous_data().unwrap(), + r1.band(0) + .unwrap() + .contiguous_data(&mut Vec::new()) + .unwrap(), &[42u8, 43, 44, 45] ); assert_eq!(r1.band(1).unwrap().shape(), &[4]); assert_eq!( - &*r1.band(1).unwrap().contiguous_data().unwrap(), + r1.band(1) + .unwrap() + .contiguous_data(&mut Vec::new()) + .unwrap(), &[1u8, 2, 3, 4] ); diff --git a/rust/sedona-raster/src/builder.rs b/rust/sedona-raster/src/builder.rs index 31d018de4..d2d8718f8 100644 --- a/rust/sedona-raster/src/builder.rs +++ b/rust/sedona-raster/src/builder.rs @@ -687,7 +687,6 @@ mod tests { use arrow_ipc::writer::StreamWriter; use arrow_schema::Schema; use sedona_schema::raster::StorageType; - use std::borrow::Cow; use std::io::Cursor; #[test] @@ -747,8 +746,10 @@ mod tests { // Access band with 1-based band_number let band = bands.band(1).unwrap(); - assert_eq!(band.data().len(), 100); - assert_eq!(band.data()[0], 1u8); + let mut scratch = Vec::new(); + let data = band.contiguous_data(&mut scratch).unwrap(); + assert_eq!(data.len(), 100); + assert_eq!(data[0], 1u8); let band_meta = band.metadata(); assert_eq!(band_meta.storage_type().unwrap(), StorageType::InDb); @@ -808,11 +809,13 @@ mod tests { // Test each band has different data // Use 1-based band numbers + let mut scratch = Vec::new(); for i in 0..3 { // Access band with 1-based band_number let band = bands.band(i + 1).unwrap(); let expected_value = i as u8; - assert!(band.data().iter().all(|&x| x == expected_value)); + let data = band.contiguous_data(&mut scratch).unwrap(); + assert!(data.iter().all(|&x| x == expected_value)); } // Test iterator @@ -821,8 +824,9 @@ mod tests { .enumerate() .map(|(i, band)| { let band = band.unwrap(); - assert_eq!(band.data()[0], i as u8); - band.data()[0] + let data = band.contiguous_data(&mut scratch).unwrap(); + assert_eq!(data[0], i as u8); + data[0] }) .collect(); @@ -913,7 +917,10 @@ mod tests { let target_band_meta = target_band.metadata(); assert_eq!(target_band_meta.data_type().unwrap(), BandDataType::UInt16); assert!(target_band_meta.nodata_value().is_none()); - assert_eq!(target_band.data().len(), 2016); // 1008 * 2 bytes per u16 + assert_eq!( + target_band.contiguous_data(&mut Vec::new()).unwrap().len(), + 2016 + ); // 1008 * 2 bytes per u16 let result = target_raster.bands().band(0); assert!(result.is_err(), "Band number 0 should be invalid"); @@ -1112,7 +1119,10 @@ mod tests { assert_eq!(indb_metadata.data_type().unwrap(), BandDataType::UInt8); assert!(indb_metadata.outdb_url().is_none()); assert!(indb_metadata.outdb_band_id().is_none()); - assert_eq!(indb_band.data().len(), 100); + assert_eq!( + indb_band.contiguous_data(&mut Vec::new()).unwrap().len(), + 100 + ); // Test OutDbRef band let outdb_band = bands.band(2).unwrap(); @@ -1127,7 +1137,9 @@ mod tests { "s3://mybucket/satellite_image.tif" ); assert_eq!(outdb_metadata.outdb_band_id().unwrap(), 2); - assert_eq!(outdb_band.data().len(), 0); // Empty data for OutDbRef + // Schema-OutDb: data column is empty (is_indb() is the discriminator); + // attempting to materialise here would require a registered loader. + assert!(!outdb_band.is_indb()); } #[test] @@ -1182,7 +1194,7 @@ mod tests { let result = bands.band(1); assert!(result.is_ok()); let band = result.unwrap(); - assert_eq!(band.data().len(), 100); + assert_eq!(band.contiguous_data(&mut Vec::new()).unwrap().len(), 100); } #[test] @@ -1227,7 +1239,7 @@ mod tests { assert_eq!(band.shape(), &[20, 10]); assert_eq!(band.data_type(), BandDataType::UInt8); assert_eq!(band.nodata(), Some(&[255u8][..])); - assert_eq!(band.contiguous_data().unwrap().len(), 200); + assert_eq!(band.contiguous_data(&mut Vec::new()).unwrap().len(), 200); } #[test] @@ -1329,7 +1341,8 @@ mod tests { assert_eq!(band.dim_size("z"), None); // Verify strides are standard C-order: [4*5*4, 5*4, 4] = [80, 20, 4] - let buf = band.nd_buffer().unwrap(); + let mut scratch = Vec::new(); + let buf = band.nd_buffer(&mut scratch).unwrap(); assert_eq!(buf.strides, &[80, 20, 4]); assert_eq!(buf.offset, 0); } @@ -1493,10 +1506,12 @@ mod tests { let r = rasters.get(0).unwrap(); let band = r.band(0).unwrap(); - let data = band.contiguous_data().unwrap(); - // Identity-view bands are always contiguous, so should be Cow::Borrowed - assert!(matches!(data, Cow::Borrowed(_))); + // Identity-view InDb bands borrow the Arrow column zero-copy; the + // scratch buffer is unused (capacity stays at 0). + let mut scratch = Vec::new(); + let data = band.contiguous_data(&mut scratch).unwrap(); assert_eq!(data.len(), 16); + assert_eq!(scratch.len(), 0, "InDb path must not touch scratch"); } #[test] @@ -1562,17 +1577,18 @@ mod tests { let array = builder.finish().unwrap(); let rasters = RasterStructArray::new(&array); + let mut scratch = Vec::new(); let r0 = rasters.get(0).unwrap(); let b0 = r0.band(0).unwrap(); - assert_eq!(b0.nd_buffer().unwrap().strides, &[4, 1]); // UInt8 [3, 4] + assert_eq!(b0.nd_buffer(&mut scratch).unwrap().strides, &[4, 1]); // UInt8 [3, 4] let r1 = rasters.get(1).unwrap(); let b1 = r1.band(0).unwrap(); - assert_eq!(b1.nd_buffer().unwrap().strides, &[120, 40, 8]); // Float64 [2, 3, 5] + assert_eq!(b1.nd_buffer(&mut scratch).unwrap().strides, &[120, 40, 8]); // Float64 [2, 3, 5] let r2 = rasters.get(2).unwrap(); let b2 = r2.band(0).unwrap(); - assert_eq!(b2.nd_buffer().unwrap().strides, &[2]); // UInt16 [10] + assert_eq!(b2.nd_buffer(&mut scratch).unwrap().strides, &[2]); // UInt16 [10] } #[test] @@ -1768,13 +1784,13 @@ mod tests { assert_eq!(band.shape(), &[2, 3]); assert_eq!(band.raw_source_shape(), &[2, 3]); - let buf = band.nd_buffer().unwrap(); + let mut scratch = Vec::new(); + let buf = band.nd_buffer(&mut scratch).unwrap(); assert_eq!(buf.strides, &[3, 1]); assert_eq!(buf.offset, 0); - let bytes = band.contiguous_data().unwrap(); - assert!(matches!(bytes, Cow::Borrowed(_))); - assert_eq!(&*bytes, pixels.as_slice()); + let bytes = band.contiguous_data(&mut scratch).unwrap(); + assert_eq!(bytes, pixels.as_slice()); } #[test] diff --git a/rust/sedona-raster/src/lib.rs b/rust/sedona-raster/src/lib.rs index 77db0c0dd..d43528084 100644 --- a/rust/sedona-raster/src/lib.rs +++ b/rust/sedona-raster/src/lib.rs @@ -19,4 +19,7 @@ pub mod affine_transformation; pub mod array; pub mod builder; pub mod display; +pub mod outdb_loader; pub mod traits; + +pub use outdb_loader::{set_outdb_band_loader, OutDbBandLoader, OutDbLoadRequest}; diff --git a/rust/sedona-raster/src/outdb_loader.rs b/rust/sedona-raster/src/outdb_loader.rs new file mode 100644 index 000000000..1c1a70205 --- /dev/null +++ b/rust/sedona-raster/src/outdb_loader.rs @@ -0,0 +1,298 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Process-wide hook that resolves the bytes of a schema-OutDb raster band. +//! +//! `sedona-raster` deliberately knows nothing about GDAL, Zarr, or any +//! other backend. When a session needs to read pixels from a band whose +//! Arrow `data` column is empty, [`BandRefImpl`](crate::array) calls into +//! [`load_outdb`], which dispatches to whatever loader was installed at +//! bootstrap via [`set_outdb_band_loader`]. `sedona-raster-gdal` installs +//! a GDAL-backed loader from `SedonaContext::new()`; other backends (e.g. +//! a future Zarr crate) would install their own. +//! +//! Existing GDAL kernels (`raster_ref_to_gdal_mem`, VRT builder) short- +//! circuit OutDb bands long before reaching the byte-access surface, so +//! the loader is consulted only by non-GDAL consumers (Arrow FFI, numeric +//! kernels, Python bindings). + +use std::sync::OnceLock; + +use arrow_schema::ArrowError; +use sedona_schema::raster::BandDataType; + +/// Everything a backend needs to materialise a single OutDb band's bytes. +/// +/// `source_shape` is the **raw** source shape in `dim_names` order — the +/// shape of the buffer the loader must produce — *not* the visible shape. +/// Loaders write into a caller-provided scratch `Vec` rather than +/// allocating; the band then borrows from that scratch for the lifetime +/// of the byte-access call. This lets consumers in scalar-function loops +/// reuse a single scratch buffer across rasters and (eventually) reserve +/// memory through DataFusion's memory pool. +pub struct OutDbLoadRequest<'a> { + /// External URI (e.g. `file:///tmp/foo.tif#band=1`, + /// `s3://bucket/cube.zarr#path=/a&slice=0`). Bare paths are also + /// allowed; backends are responsible for parsing. + pub uri: &'a str, + /// Optional format hint set on the band (`"geotiff"`, `"zarr"`, …). + /// `None` means "let the backend infer from the URI". + pub format: Option<&'a str>, + /// Per-axis names, parallel to `source_shape`. Backends use this to + /// map their native axis order onto the band's. The GDAL backend + /// today requires 2-D `["y", "x"]`; other shapes return an error. + pub dim_names: &'a [&'a str], + /// Raw source shape in `dim_names` order. The loader returns bytes + /// sized to exactly `prod(source_shape) * data_type.byte_size()`. + pub source_shape: &'a [u64], + /// Pixel type. Loaders must return bytes encoding pixels in this type. + pub data_type: BandDataType, +} + +/// Function-pointer signature for OutDb loaders. +/// +/// The loader appends exactly `Π source_shape × data_type.byte_size()` bytes +/// to `scratch`. Callers (`load_outdb`) clear the scratch before calling and +/// validate the written length on return, so loader implementations only +/// need to write; they need not check pre-call state nor zero the buffer. +pub type OutDbBandLoader = fn(&OutDbLoadRequest<'_>, &mut Vec) -> Result<(), ArrowError>; + +static OUTDB_BAND_LOADER: OnceLock = OnceLock::new(); + +/// Install the process-wide OutDb loader. Idempotent — first writer wins. +/// +/// Multiple `SedonaContext::new()` calls within one process must not panic; +/// subsequent calls are silently ignored. This is the only way to register +/// a loader; tests dispatch internally on `req.uri` / `req.data_type`. +pub fn set_outdb_band_loader(f: OutDbBandLoader) { + let _ = OUTDB_BAND_LOADER.set(f); +} + +/// Dispatch a load request through the installed loader. Clears `scratch`, +/// invokes the loader, then verifies the loader wrote the expected number +/// of bytes. Returns `NotYetImplemented` with a clear message if no loader +/// was registered. +pub(crate) fn load_outdb( + req: &OutDbLoadRequest<'_>, + scratch: &mut Vec, +) -> Result<(), ArrowError> { + let loader = OUTDB_BAND_LOADER.get().ok_or_else(|| { + ArrowError::NotYetImplemented( + "no OutDb loader registered; sedona-raster-gdal (or another \ + backend) must be initialised before reading OutDb bands" + .to_string(), + ) + })?; + scratch.clear(); + loader(req, scratch) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::array::RasterStructArray; + use crate::builder::RasterBuilder; + use crate::traits::RasterRef; + use std::sync::Once; + + /// Single process-wide mock loader, dispatched by URI scheme. + /// `OUTDB_BAND_LOADER` is a `OnceLock` — exactly one registration per + /// process, so every test that needs OutDb behavior routes through + /// here and selects a case via the URI it constructs the band with. + fn mock_load(req: &OutDbLoadRequest<'_>, scratch: &mut Vec) -> Result<(), ArrowError> { + let expected: usize = + req.source_shape.iter().product::() as usize * req.data_type.byte_size(); + match req.uri { + "mock://zeros" => { + scratch.resize(expected, 0); + Ok(()) + } + "mock://pattern" => { + scratch.extend((0..expected).map(|i| (i & 0xFF) as u8)); + Ok(()) + } + "mock://too-small" => { + scratch.resize(expected.saturating_sub(1), 0); + Ok(()) + } + "mock://error" => Err(ArrowError::ExternalError(Box::new(std::io::Error::other( + "mock loader simulated failure", + )))), + other => Err(ArrowError::NotYetImplemented(format!( + "mock loader: unrecognised uri `{other}`" + ))), + } + } + + static MOCK_INSTALL: Once = Once::new(); + fn install_mock_loader() { + MOCK_INSTALL.call_once(|| set_outdb_band_loader(mock_load)); + } + + /// Build a single-raster, single-band StructArray whose band is + /// schema-OutDb (empty data column) and points at `uri`. + fn build_outdb_band( + uri: Option<&str>, + data_type: BandDataType, + source_shape: &[u64], + ) -> arrow_array::StructArray { + let dim_names: Vec<&str> = ["y", "x", "t", "u", "v"] + .iter() + .take(source_shape.len()) + .copied() + .collect(); + let spatial_shape: Vec = source_shape.iter().map(|&v| v as i64).collect(); + + let mut b = RasterBuilder::new(1); + b.start_raster_nd( + &[0.0, 1.0, 0.0, 0.0, 0.0, -1.0], + &dim_names, + &spatial_shape, + None, + ) + .unwrap(); + b.start_band_nd( + None, + &dim_names, + source_shape, + data_type, + None, + uri, + uri.map(|_| "geotiff"), + ) + .unwrap(); + b.band_data_writer().append_value([0u8; 0]); // empty → schema-OutDb + b.finish_band().unwrap(); + b.finish_raster().unwrap(); + b.finish().unwrap() + } + + #[test] + fn nd_buffer_returns_loader_bytes_for_outdb_band() { + install_mock_loader(); + let arr = build_outdb_band(Some("mock://pattern"), BandDataType::UInt8, &[2, 3]); + let rasters = RasterStructArray::new(&arr); + let raster = rasters.get(0).unwrap(); + let band = raster.band(0).unwrap(); + let mut scratch = Vec::new(); + let buf = band.nd_buffer(&mut scratch).unwrap(); + assert_eq!(buf.buffer, &[0u8, 1, 2, 3, 4, 5]); + assert_eq!(buf.shape, vec![2, 3]); + assert_eq!(buf.data_type, BandDataType::UInt8); + } + + #[test] + fn contiguous_data_returns_loader_bytes_for_outdb_band() { + install_mock_loader(); + let arr = build_outdb_band(Some("mock://pattern"), BandDataType::UInt8, &[2, 3]); + let rasters = RasterStructArray::new(&arr); + let raster = rasters.get(0).unwrap(); + let band = raster.band(0).unwrap(); + let mut scratch = Vec::new(); + let data = band.contiguous_data(&mut scratch).unwrap(); + assert_eq!(data, &[0u8, 1, 2, 3, 4, 5]); + } + + #[test] + fn is_indb_stays_false_after_loader_runs() { + // `is_indb()` is the schema discriminator (Arrow column emptiness), + // not a runtime byte-availability check. Must stay false after a + // successful byte-access call. + install_mock_loader(); + let arr = build_outdb_band(Some("mock://zeros"), BandDataType::UInt8, &[2, 3]); + let rasters = RasterStructArray::new(&arr); + let raster = rasters.get(0).unwrap(); + let band = raster.band(0).unwrap(); + assert!(!band.is_indb()); + let mut scratch = Vec::new(); + let _ = band.nd_buffer(&mut scratch).unwrap(); + assert!( + !band.is_indb(), + "is_indb must remain a schema discriminator after the loader runs" + ); + } + + #[test] + fn scratch_is_reusable_across_bands() { + // Two consecutive OutDb byte-access calls against different bands + // must each produce the right bytes through the same scratch — + // load_outdb clears scratch on entry so stale data from the + // previous band cannot leak into the next. + install_mock_loader(); + let zeros = build_outdb_band(Some("mock://zeros"), BandDataType::UInt8, &[2, 3]); + let pattern = build_outdb_band(Some("mock://pattern"), BandDataType::UInt8, &[2, 3]); + let rasters_z = RasterStructArray::new(&zeros); + let rasters_p = RasterStructArray::new(&pattern); + let mut scratch = Vec::new(); + { + let r = rasters_z.get(0).unwrap(); + let band = r.band(0).unwrap(); + let buf = band.nd_buffer(&mut scratch).unwrap(); + assert_eq!(buf.buffer, &[0u8; 6]); + } + { + let r = rasters_p.get(0).unwrap(); + let band = r.band(0).unwrap(); + let buf = band.nd_buffer(&mut scratch).unwrap(); + assert_eq!(buf.buffer, &[0u8, 1, 2, 3, 4, 5]); + } + } + + #[test] + fn nd_buffer_errors_when_outdb_uri_is_missing() { + install_mock_loader(); + // OutDb-shaped band (empty data column) but no outdb_uri set. + let arr = build_outdb_band(None, BandDataType::UInt8, &[2, 3]); + let rasters = RasterStructArray::new(&arr); + let raster = rasters.get(0).unwrap(); + let band = raster.band(0).unwrap(); + let mut scratch = Vec::new(); + let err = band.nd_buffer(&mut scratch).unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("outdb_uri"), + "error message should reference the missing uri: {msg}" + ); + } + + #[test] + fn nd_buffer_surfaces_loader_failure() { + install_mock_loader(); + let arr = build_outdb_band(Some("mock://error"), BandDataType::UInt8, &[2, 3]); + let rasters = RasterStructArray::new(&arr); + let raster = rasters.get(0).unwrap(); + let band = raster.band(0).unwrap(); + let mut scratch = Vec::new(); + assert!(band.nd_buffer(&mut scratch).is_err()); + } + + #[test] + fn nd_buffer_rejects_undersized_loader_output() { + install_mock_loader(); + let arr = build_outdb_band(Some("mock://too-small"), BandDataType::UInt8, &[2, 3]); + let rasters = RasterStructArray::new(&arr); + let raster = rasters.get(0).unwrap(); + let band = raster.band(0).unwrap(); + let mut scratch = Vec::new(); + let err = band.nd_buffer(&mut scratch).unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("bytes") || msg.contains("expected"), + "undersized loader output should trip the size check: {msg}" + ); + } +} diff --git a/rust/sedona-raster/src/traits.rs b/rust/sedona-raster/src/traits.rs index 381c3c154..ed53d0236 100644 --- a/rust/sedona-raster/src/traits.rs +++ b/rust/sedona-raster/src/traits.rs @@ -15,8 +15,6 @@ // specific language governing permissions and limitations // under the License. -use std::borrow::Cow; - use arrow_schema::ArrowError; use sedona_schema::raster::BandDataType; @@ -558,14 +556,19 @@ pub trait BandRef { /// True if this band's bytes live in the `data` buffer (in-database). /// False if the bytes must be fetched from `outdb_uri` (out-of-database). /// - /// The discriminator is whether the `data` buffer is non-empty — - /// `outdb_uri` and `outdb_format` are orthogonal location/format hints - /// that may be set on either kind of band. - fn is_indb(&self) -> bool { - // Default: materialize via nd_buffer and check buffer emptiness. - // Concrete impls should override with a direct buffer check. - self.nd_buffer().is_ok_and(|b| !b.buffer.is_empty()) - } + /// This is a **schema-level** discriminator: "does the Arrow `data` + /// column carry these bytes inline?". It is a property of the encoded + /// payload, not of runtime byte availability; serialization, FFI + /// consumers, and view-construction rejection rely on it. `outdb_uri` + /// and `outdb_format` are orthogonal location/format hints that may + /// be set on either kind of band. + /// + /// Concrete impls must check the `data` column's emptiness directly. + /// Deriving from `nd_buffer().is_ok_and(...)` is wrong once an OutDb + /// loader is installed (it returns loader-resolved bytes for OutDb + /// bands, which would misclassify them as InDb), so this method + /// intentionally has no default impl. + fn is_indb(&self) -> bool; /// Eagerly-computed concrete band metadata. Mirrors the pre-N-D /// `BandRef::metadata()` accessor. @@ -604,46 +607,26 @@ pub trait BandRef { // -- Data access -- - /// Raw backing buffer + visible-region layout. Triggers load for lazy - /// impls. The returned `NdBuffer` describes the band's view in - /// byte-stride terms — `shape` is the visible shape, `strides` and - /// `offset` are computed by composing the view with the source's - /// natural C-order byte strides. Strides may be zero (broadcast) or - /// negative (reverse iteration). - fn nd_buffer(&self) -> Result, ArrowError>; - - /// Contiguous row-major bytes covering the *visible* region. Zero-copy - /// (`Cow::Borrowed`) when the view is full identity over a C-order - /// source buffer; copies into a new buffer when the view slices, - /// broadcasts, or permutes. Most RS_* functions use this. - fn contiguous_data(&self) -> Result, ArrowError>; - - /// Pre-N-D compatibility shim: raw row-major bytes for InDb, - /// identity-view bands. Panics on anything else (OutDb, non-identity - /// view, or a `contiguous_data` error) — corresponds to main's - /// infallible `BandRef::data() -> &[u8]` which only ever ran against - /// identity-view InDb bands. - fn data(&self) -> &[u8] { - // Compatibility shim: returns the same bytes pre-N-D callers expect - // from `BandRef::data() -> &[u8]`. Delegates to `contiguous_data()` - // so identity-view bands surface the borrowed in-line bytes, - // matching the pre-N-D behavior exactly. View-materialized - // (`Cow::Owned`) bands can't be returned through `&[u8]` because - // the owned `Vec` would die at the end of this call — implementers - // that need view-materialized bytes via `data()` must override and - // anchor the materialized buffer on `Self`; other consumers should - // reach for `contiguous_data()` directly. - match self - .contiguous_data() - .expect("BandRef::data() requires an in-db band with bytes") - { - Cow::Borrowed(b) => b, - Cow::Owned(_) => panic!( - "BandRef::data() can't return view-materialized bytes; \ - use contiguous_data() for sliced/permuted bands" - ), - } - } + /// Raw backing buffer + visible-region layout. + /// + /// `scratch` is a caller-owned `Vec` that the band uses if it has + /// to materialise bytes from outside the Arrow column (e.g. an OutDb + /// load). InDb identity-view bands ignore `scratch` and borrow the + /// column directly. The returned `NdBuffer` borrows from whichever + /// source applies — both share the `'s` lifetime, so the caller can + /// reuse the same scratch across iterations of a loop without leaking + /// stale data between bands. Callers in scratch-managed loops should + /// pass the same `Vec` for every band; its capacity will grow to + /// the largest band's source size and then stabilise. + fn nd_buffer<'s>(&'s self, scratch: &'s mut Vec) -> Result, ArrowError>; + + /// Contiguous row-major bytes covering the *visible* region. + /// + /// Returns a slice borrowing from either the Arrow column (InDb, + /// zero-copy) or the caller-provided `scratch` (OutDb, written by + /// the loader, or any future view-materialised path). Most RS_* + /// kernels use this. See `nd_buffer` for the scratch contract. + fn contiguous_data<'s>(&'s self, scratch: &'s mut Vec) -> Result<&'s [u8], ArrowError>; /// Nodata value interpreted as f64. /// @@ -909,10 +892,18 @@ mod tests { fn nodata(&self) -> Option<&[u8]> { None } - fn nd_buffer(&self) -> Result, ArrowError> { + fn is_indb(&self) -> bool { + // The is_2d tests do not exercise data provenance; any + // concrete answer satisfies the (now-required) trait method. + false + } + fn nd_buffer<'s>(&'s self, _scratch: &'s mut Vec) -> Result, ArrowError> { unimplemented!("not used in is_2d tests") } - fn contiguous_data(&self) -> Result, ArrowError> { + fn contiguous_data<'s>( + &'s self, + _scratch: &'s mut Vec, + ) -> Result<&'s [u8], ArrowError> { unimplemented!("not used in is_2d tests") } } diff --git a/rust/sedona-raster/tests/no_loader_registered.rs b/rust/sedona-raster/tests/no_loader_registered.rs new file mode 100644 index 000000000..9a3c0722b --- /dev/null +++ b/rust/sedona-raster/tests/no_loader_registered.rs @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Process-isolated assertion for the "no OutDb loader registered" error +//! message. Lives in a separate integration binary so the global +//! `OUTDB_BAND_LOADER` `OnceLock` stays unset — the in-crate unit tests +//! all install a mock and would mask this path. + +use sedona_raster::array::RasterStructArray; +use sedona_raster::builder::RasterBuilder; +use sedona_raster::traits::RasterRef; +use sedona_schema::raster::BandDataType; + +fn build_outdb_band() -> arrow_array::StructArray { + let mut b = RasterBuilder::new(1); + b.start_raster_nd(&[0.0, 1.0, 0.0, 0.0, 0.0, -1.0], &["y", "x"], &[2, 3], None) + .unwrap(); + b.start_band_nd( + None, + &["y", "x"], + &[2, 3], + BandDataType::UInt8, + None, + Some("file:///nonexistent.tif"), + Some("geotiff"), + ) + .unwrap(); + b.band_data_writer().append_value([0u8; 0]); // empty → schema-OutDb + b.finish_band().unwrap(); + b.finish_raster().unwrap(); + b.finish().unwrap() +} + +#[test] +fn nd_buffer_errors_with_clear_message_when_no_loader_registered() { + let arr = build_outdb_band(); + let rasters = RasterStructArray::new(&arr); + let raster = rasters.get(0).unwrap(); + let band = raster.band(0).unwrap(); + let mut scratch = Vec::new(); + let err = band.nd_buffer(&mut scratch).unwrap_err().to_string(); + assert!( + err.contains("no OutDb loader registered"), + "expected the no-loader diagnostic, got: {err}" + ); +} diff --git a/rust/sedona-testing/src/rasters.rs b/rust/sedona-testing/src/rasters.rs index a29b72f9e..10daa4a56 100644 --- a/rust/sedona-testing/src/rasters.rs +++ b/rust/sedona-testing/src/rasters.rs @@ -475,7 +475,13 @@ pub fn assert_raster_equal(raster1: &impl RasterRef, raster2: &impl RasterRef) { "Band outdb band IDs do not match" ); - assert_eq!(band1.data(), band2.data(), "Band data does not match"); + let mut scratch1 = Vec::new(); + let mut scratch2 = Vec::new(); + assert_eq!( + band1.contiguous_data(&mut scratch1).unwrap(), + band2.contiguous_data(&mut scratch2).unwrap(), + "Band data does not match", + ); } } @@ -513,7 +519,8 @@ mod tests { assert_eq!(band_metadata.outdb_url(), None); assert_eq!(band_metadata.outdb_band_id(), None); - let band_data = band.data(); + let mut scratch = Vec::new(); + let band_data = band.contiguous_data(&mut scratch).unwrap(); let expected_pixel_count = (i + 1) * (i + 2); // width * height // Convert raw bytes back to u16 values for comparison @@ -550,7 +557,8 @@ mod tests { let band_metadata = band.metadata(); assert_eq!(band_metadata.data_type().unwrap(), BandDataType::UInt8); assert_eq!(band_metadata.storage_type().unwrap(), StorageType::InDb); - let band_data = band.data(); + let mut scratch = Vec::new(); + let band_data = band.contiguous_data(&mut scratch).unwrap(); assert_eq!(band_data.len(), 64 * 64); // 4096 pixels } } @@ -619,7 +627,10 @@ mod tests { let b1 = bands.band(1).unwrap(); assert_eq!(b1.metadata().data_type().unwrap(), BandDataType::UInt8); assert_eq!(b1.metadata().nodata_value(), Some(&[255u8][..])); - assert_eq!(b1.data(), &[1u8, 2, 3, 4]); + assert_eq!( + b1.contiguous_data(&mut Vec::new()).unwrap(), + &[1u8, 2, 3, 4] + ); // Band 2: UInt16, nodata=0 let b2 = bands.band(2).unwrap(); diff --git a/rust/sedona/src/context.rs b/rust/sedona/src/context.rs index 156b5b355..4a6ba9b3f 100644 --- a/rust/sedona/src/context.rs +++ b/rust/sedona/src/context.rs @@ -267,6 +267,12 @@ impl SedonaContext { // Always register raster functions out.register_function_set(sedona_raster_functions::register::default_function_set()); + // Install the GDAL-backed OutDb byte loader so non-GDAL consumers + // (FFI, future numeric kernels) can read pixels out of OutDb + // bands via `BandRef::nd_buffer()`. Idempotent across repeated + // `SedonaContext::new()` calls in one process. + sedona_raster_gdal::register_outdb_loader(); + Ok(out) }