diff --git a/rust/sedona-raster-functions/src/rs_setsrid.rs b/rust/sedona-raster-functions/src/rs_setsrid.rs index 2ff6134e4..ec4dbcabe 100644 --- a/rust/sedona-raster-functions/src/rs_setsrid.rs +++ b/rust/sedona-raster-functions/src/rs_setsrid.rs @@ -534,7 +534,12 @@ mod tests { for band_idx in 0..orig_bands.len() { let orig_band = orig_bands.band(band_idx + 1).unwrap(); let mod_band = mod_bands.band(band_idx + 1).unwrap(); - assert_eq!(orig_band.data(), mod_band.data()); + let orig_nd = orig_band.nd_buffer().unwrap(); + let mod_nd = mod_band.nd_buffer().unwrap(); + assert_eq!( + orig_nd.as_contiguous().unwrap(), + mod_nd.as_contiguous().unwrap() + ); assert_eq!( orig_band.metadata().data_type().unwrap(), mod_band.metadata().data_type().unwrap() diff --git a/rust/sedona-raster-gdal/src/gdal_common.rs b/rust/sedona-raster-gdal/src/gdal_common.rs index 2a6fad688..1e86dc32d 100644 --- a/rust/sedona-raster-gdal/src/gdal_common.rs +++ b/rust/sedona-raster-gdal/src/gdal_common.rs @@ -182,19 +182,27 @@ pub(crate) fn convert_gdal_err(e: GdalError) -> DataFusionError { DataFusionError::External(Box::new(e)) } -/// This function creates a GDAL dataset backed by the MEM driver that directly -/// references the band data stored in the [RasterRef]. No data copying occurs - -/// the GDAL bands point to the same memory as the data buffer held by [RasterRef]. +/// Build a GDAL MEM dataset whose bands point at the bytes held by `raster`. +/// +/// Each band's bytes are borrowed zero-copy via `NdBuffer::as_contiguous()`: +/// the GDAL band points directly at the StructArray's backing buffer, so the +/// caller must keep `raster` alive for the dataset's lifetime. Only 2-D +/// identity-view bands are accepted (enforced by `is_2d()` below), so the +/// bytes are always already packed row-major and no materialization happens +/// here — a strided view would be rejected, directing the user to +/// `RS_EnsureContiguous`. /// /// # Arguments /// * `raster` - The RasterRef value /// * `band_indices` - The indices of the bands to include in the GDAL dataset (1-based) /// /// # Returns -/// A [`Dataset`] that provides access to the GDAL dataset. +/// The `Dataset`, whose band pointers borrow into `raster` (which must +/// outlive it). /// /// # Errors /// Returns an error if: +/// - Any band is N-D (not the legacy `["y","x"]` 2-D shape with identity view) /// - Any band uses OutDb storage /// - GDAL driver operations fail /// - Accessing RasterRef fails @@ -212,7 +220,7 @@ pub unsafe fn raster_ref_to_gdal_mem( // Create internal MEM dataset via sedona-gdal shim to avoid open dataset list contention. let mut mem_ds_builder = MemDatasetBuilder::new(width, height); - // Add bands with DATAPOINTER option (zero-copy) + // Add bands with DATAPOINTER option. // // Note: GDALAddBand always appends a new band, so the destination band index // is sequential (1..=band_indices.len()), even if the source band indices are @@ -238,8 +246,16 @@ pub unsafe fn raster_ref_to_gdal_mem( let band_metadata = band.metadata(); let band_type = band_metadata.data_type()?; let gdal_type = band_data_type_to_gdal(&band_type); - let band_data = band.data(); - let data_ptr = band_data.as_ptr(); + // `is_2d()` above guarantees an identity view, so the band's bytes + // are already packed row-major and `as_contiguous()` borrows them + // zero-copy from the StructArray (never materializes). GDAL holds the + // resulting pointer for the dataset's lifetime; the caller keeps + // `raster` alive to back it. A strided view would surface here as an + // error directing the user to RS_EnsureContiguous, but `is_2d()` + // rejects those first. + let buf = band.nd_buffer().map_err(|e| arrow_datafusion_err!(e))?; + let band_bytes = buf.as_contiguous().map_err(|e| arrow_datafusion_err!(e))?; + let data_ptr: *const u8 = band_bytes.as_ptr(); unsafe { mem_ds_builder = mem_ds_builder.add_band(gdal_type, data_ptr as *mut u8); } @@ -307,8 +323,9 @@ pub unsafe fn raster_ref_to_gdal_mem( pub fn raster_ref_to_gdal_empty(gdal: &Gdal, raster: &R) -> Result { unsafe { - // SAFETY: raster_ref_to_gdal_mem is safe to call with an empty band list. The - // returned dataset will have zero bands and references no external memory. + // SAFETY: raster_ref_to_gdal_mem is safe to call with an empty band + // list. The returned dataset has zero bands and references no + // external memory. raster_ref_to_gdal_mem(gdal, raster, &[]) } } diff --git a/rust/sedona-raster-gdal/src/gdal_dataset_provider.rs b/rust/sedona-raster-gdal/src/gdal_dataset_provider.rs index a9d1013c5..98778a7b6 100644 --- a/rust/sedona-raster-gdal/src/gdal_dataset_provider.rs +++ b/rust/sedona-raster-gdal/src/gdal_dataset_provider.rs @@ -425,9 +425,8 @@ impl<'a> GDALDatasetProvider<'a> { } let mut gdal_mem_source = if !indb_band_indices.is_empty() { - Some(Rc::new(unsafe { - raster_ref_to_gdal_mem(self.gdal, raster, &indb_band_indices)? - })) + let mem_ds = unsafe { raster_ref_to_gdal_mem(self.gdal, raster, &indb_band_indices)? }; + Some(Rc::new(mem_ds)) } else { None }; diff --git a/rust/sedona-raster-gdal/src/utils.rs b/rust/sedona-raster-gdal/src/utils.rs index 30f543071..55549870b 100644 --- a/rust/sedona-raster-gdal/src/utils.rs +++ b/rust/sedona-raster-gdal/src/utils.rs @@ -270,7 +270,8 @@ mod tests { assert_eq!(band.metadata().storage_type().unwrap(), StorageType::InDb); assert_eq!(band.metadata().data_type().unwrap(), BandDataType::UInt8); assert_eq!(band.metadata().nodata_value().unwrap(), [255u8]); - assert_eq!(band.data(), [1u8, 2, 3, 4, 5, 6]); + let ndb = band.nd_buffer().unwrap(); + assert_eq!(ndb.as_contiguous().unwrap(), [1u8, 2, 3, 4, 5, 6]); } #[test] @@ -301,8 +302,10 @@ mod tests { &nodata.to_le_bytes() ); - let pixels: Vec = band - .data() + let ndb = band.nd_buffer().unwrap(); + let pixels: Vec = ndb + .as_contiguous() + .unwrap() .chunks_exact(8) .map(|chunk| u64::from_le_bytes(chunk.try_into().unwrap())) .collect(); @@ -333,8 +336,10 @@ mod tests { &nodata.to_le_bytes() ); - let pixels: Vec = band - .data() + let ndb = band.nd_buffer().unwrap(); + let pixels: Vec = ndb + .as_contiguous() + .unwrap() .chunks_exact(8) .map(|chunk| i64::from_le_bytes(chunk.try_into().unwrap())) .collect(); @@ -365,8 +370,10 @@ mod tests { &nodata.to_le_bytes() ); - let pixels: Vec = band - .data() + let ndb = band.nd_buffer().unwrap(); + let pixels: Vec = ndb + .as_contiguous() + .unwrap() .chunks_exact(2) .map(|chunk| u16::from_le_bytes(chunk.try_into().unwrap())) .collect(); @@ -395,12 +402,14 @@ mod tests { assert_eq!(band1.metadata().storage_type().unwrap(), StorageType::InDb); assert_eq!(band1.metadata().data_type().unwrap(), BandDataType::UInt8); assert_eq!(band1.metadata().nodata_value().unwrap(), [255u8]); - assert_eq!(band1.data(), [10u8, 11, 12, 13]); + let nd1 = band1.nd_buffer().unwrap(); + assert_eq!(nd1.as_contiguous().unwrap(), [10u8, 11, 12, 13]); assert_eq!(band2.metadata().storage_type().unwrap(), StorageType::InDb); assert_eq!(band2.metadata().data_type().unwrap(), BandDataType::UInt8); assert_eq!(band2.metadata().nodata_value().unwrap(), [255u8]); - assert_eq!(band2.data(), [100u8, 0, 200, 0]); + let nd2 = band2.nd_buffer().unwrap(); + assert_eq!(nd2.as_contiguous().unwrap(), [100u8, 0, 200, 0]); } #[test] @@ -420,12 +429,14 @@ mod tests { assert_eq!(band1.metadata().storage_type().unwrap(), StorageType::InDb); assert_eq!(band1.metadata().data_type().unwrap(), BandDataType::UInt8); assert_eq!(band1.metadata().nodata_value().unwrap(), [0u8]); - assert_eq!(band1.data(), [10u8, 11, 12, 13]); + let nd1 = band1.nd_buffer().unwrap(); + assert_eq!(nd1.as_contiguous().unwrap(), [10u8, 11, 12, 13]); assert_eq!(band2.metadata().storage_type().unwrap(), StorageType::InDb); assert_eq!(band2.metadata().data_type().unwrap(), BandDataType::UInt8); assert_eq!(band2.metadata().nodata_value().unwrap(), [255u8]); - assert_eq!(band2.data(), [100u8, 0, 200, 0]); + let nd2 = band2.nd_buffer().unwrap(); + assert_eq!(nd2.as_contiguous().unwrap(), [100u8, 0, 200, 0]); } #[test] diff --git a/rust/sedona-raster/src/array.rs b/rust/sedona-raster/src/array.rs index bc7c2317b..8c2736103 100644 --- a/rust/sedona-raster/src/array.rs +++ b/rust/sedona-raster/src/array.rs @@ -15,8 +15,6 @@ // specific language governing permissions and limitations // under the License. -use std::borrow::Cow; - use arrow_array::{ Array, BinaryArray, BinaryViewArray, Float64Array, Int64Array, ListArray, StringArray, StringViewArray, StructArray, UInt32Array, UInt64Array, @@ -24,13 +22,15 @@ use arrow_array::{ use arrow_schema::ArrowError; use crate::traits::{BandRef, Bands, NdBuffer, RasterRef, ViewEntry}; -use sedona_schema::raster::{band_indices, raster_indices, BandDataType}; +use crate::view_entries::ViewEntries; +use sedona_schema::raster::{band_indices, band_view_indices, raster_indices, BandDataType}; /// Arrow-backed implementation of BandRef for a single band within a raster. /// -/// Today this handles only the canonical identity view: `view_entries` is -/// synthesised from `source_shape`, `visible_shape == source_shape`, -/// and `byte_strides` are plain C-order strides with `byte_offset = 0`. +/// View-derived layout (`visible_shape`, `byte_strides`, `byte_offset`, +/// `is_identity_view`) is computed once at construction and reused by every +/// accessor. Source-shape and dim-name slices are borrowed directly from +/// the underlying Arrow buffers. struct BandRefImpl<'a> { dim_names_list: &'a ListArray, dim_names_values: &'a StringArray, @@ -44,14 +44,20 @@ struct BandRefImpl<'a> { band_row: usize, /// Resolved at construction so accessors don't re-decode the discriminant. data_type: BandDataType, - /// Per-visible-axis view, length = ndim. Always identity today. - view_entries: Vec, - /// Visible shape, length = ndim. Equals `source_shape` today. - visible_shape: Vec, - /// Byte strides per visible axis. C-order over `source_shape` today. + /// Per-visible-axis view, length = ndim + view_entries: ViewEntries, + /// Visible shape (== `[v.steps for v in view_entries]`), length = ndim. + /// `i64` to match `BandRef::shape()`'s return type and the surrounding + /// view-machinery arithmetic (strides, offsets). `validate_view` + /// guarantees entries are non-negative. + visible_shape: Vec, + /// Byte strides per visible axis. May be 0 (broadcast) or negative. byte_strides: Vec, /// Byte offset into `data` of the visible region's `[0,...,0]` element. - byte_offset: u64, + /// Typed `i64` to match the surrounding stride arithmetic + /// (`byte_strides` are `i64` to allow negative steps). Always non-negative + /// by construction — `RasterRefImpl::band` asserts `>= 0` before storing. + byte_offset: i64, } impl<'a> BandRef for BandRefImpl<'a> { @@ -67,7 +73,7 @@ impl<'a> BandRef for BandRefImpl<'a> { .collect() } - fn shape(&self) -> &[u64] { + fn shape(&self) -> &[i64] { &self.visible_shape } @@ -78,24 +84,13 @@ impl<'a> BandRef for BandRefImpl<'a> { } fn view(&self) -> &[ViewEntry] { - &self.view_entries + self.view_entries.as_slice() } fn data_type(&self) -> BandDataType { self.data_type } - fn data(&self) -> &[u8] { - // Pre-N-D compatibility surface. Identity-view InDb bands → the - // row-major in-line buffer (zero-copy borrow into the StructArray), - // matching the pre-N-D behavior exactly. OutDb → `&[]` from the - // empty `data` column, no panic. Non-identity views never reach - // here — `RasterRefImpl::band()` rejects them upstream so the - // raw column bytes always equal the visible bytes for any band - // this reader produces. - self.data_array.value(self.band_row) - } - fn nodata(&self) -> Option<&[u8]> { if self.nodata_array.is_null(self.band_row) { None @@ -142,19 +137,80 @@ impl<'a> BandRef for BandRefImpl<'a> { data_type: self.data_type, }) } +} - fn contiguous_data(&self) -> Result, ArrowError> { - if !self.is_indb() { - return Err(ArrowError::NotYetImplemented( - "OutDb byte access via contiguous_data() is not yet implemented; \ - backend-specific OutDb resolvers are tracked separately" - .to_string(), - )); +/// Verify that every byte the view can address lies within `buffer_len` +/// and that every stride × index product (and their accumulations) fits +/// in i64. +/// +/// **Load-bearing**: this is the *only* bound check between the view's +/// byte-stride description and the data buffer. Stride-aware consumers walk +/// the buffer with plain-arithmetic indexing and rely on this precheck +/// having proven every addressed byte is in range. Two corruption modes it +/// catches: +/// +/// 1. A writer that lies about `source_shape` (Arrow column shorter +/// than the view promises). +/// 2. A composed view whose stride × index product or accumulated +/// offset overflows i64 even though `validate_view` accepted the +/// per-entry bounds. +/// +/// Empty visible regions (any axis with `steps == 0`) address no bytes +/// and skip the check. +fn check_view_buffer_bounds( + buffer_len: usize, + visible_shape: &[i64], + byte_strides: &[i64], + byte_offset: i64, + dtype_size: usize, +) -> Result<(), ArrowError> { + if visible_shape.contains(&0) { + return Ok(()); + } + let mut min_offset = byte_offset; + let mut max_offset = byte_offset; + for (k, &stride) in byte_strides.iter().enumerate() { + // `validate_view` guarantees `steps >= 0`, so `visible_shape[k] >= 0` + // and `visible_shape[k] - 1` is in-range for any non-empty axis. + let last_idx = visible_shape[k] - 1; + let contribution = last_idx.checked_mul(stride).ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "max addressable offset on axis {k} overflows i64" + )) + })?; + if contribution > 0 { + max_offset = max_offset.checked_add(contribution).ok_or_else(|| { + ArrowError::InvalidArgumentError( + "max addressable offset accumulation overflows i64".to_string(), + ) + })?; + } else if contribution < 0 { + min_offset = min_offset.checked_add(contribution).ok_or_else(|| { + ArrowError::InvalidArgumentError( + "min addressable offset accumulation overflows i64".to_string(), + ) + })?; } - // Identity-view only today, so the data buffer is already row-major - // over the visible region. - Ok(Cow::Borrowed(self.data_array.value(self.band_row))) } + let last_byte = max_offset + .checked_add(dtype_size as i64 - 1) + .ok_or_else(|| { + ArrowError::InvalidArgumentError("max addressable byte overflows i64".to_string()) + })?; + if min_offset < 0 { + return Err(ArrowError::InvalidArgumentError(format!( + "view addresses out-of-bounds negative byte offset {min_offset}" + ))); + } + let buffer_len_i64 = i64::try_from(buffer_len).map_err(|_| { + ArrowError::InvalidArgumentError(format!("buffer length {buffer_len} exceeds i64::MAX")) + })?; + if last_byte >= buffer_len_i64 { + return Err(ArrowError::InvalidArgumentError(format!( + "view addresses byte {last_byte} but buffer is only {buffer_len} bytes" + ))); + } + Ok(()) } /// Arrow-backed implementation of RasterRef for a single raster row. @@ -181,6 +237,10 @@ pub struct RasterRefImpl<'a> { band_datatype_array: &'a UInt32Array, band_nodata_array: &'a BinaryArray, band_view_list: &'a ListArray, + band_view_source_axis: &'a Int64Array, + band_view_start: &'a Int64Array, + band_view_step: &'a Int64Array, + band_view_steps: &'a Int64Array, band_outdb_uri_array: &'a StringArray, band_outdb_format_array: &'a StringViewArray, band_data_array: &'a BinaryViewArray, @@ -196,6 +256,135 @@ impl<'a> RasterRefImpl<'a> { Some(self.crs_array.value(self.raster_index)) } } + + /// Read the band's source_shape and convert u64 → i64 with overflow check. + /// + /// Rejects 0-D bands (empty source_shape) at the read boundary: the schema + /// doesn't forbid them outright but every consumer assumes ndim >= 1. Every + /// downstream consumer in the view machinery wants i64 (matches ViewEntry's + /// signed fields and the stride arithmetic); converting once here keeps the + /// rest of band() free of mixed-signedness gymnastics. + fn read_band_source_shape(&self, band_row: usize) -> Result, ArrowError> { + let ss_start = self.band_source_shape_list.value_offsets()[band_row] as usize; + let ss_end = self.band_source_shape_list.value_offsets()[band_row + 1] as usize; + let source_shape_u64: &[u64] = &self.band_source_shape_values.values()[ss_start..ss_end]; + + if source_shape_u64.is_empty() { + return Err(ArrowError::ExternalError(Box::new( + sedona_common::sedona_internal_datafusion_err!( + "band {band_row} has empty source_shape; ndim must be >= 1" + ), + ))); + } + + source_shape_u64 + .iter() + .map(|&s| { + i64::try_from(s).map_err(|_| { + ArrowError::ExternalError(Box::new( + sedona_common::sedona_internal_datafusion_err!( + "band {band_row}: source_shape axis {s} exceeds i64::MAX" + ), + )) + }) + }) + .collect() + } + + /// Resolve the band's data-type discriminant or fail. An unknown + /// discriminant is schema-corruption, not user data. + fn read_band_data_type_or_err(&self, band_row: usize) -> Result { + let data_type_value = self.band_datatype_array.value(band_row); + BandDataType::try_from_u32(data_type_value).ok_or_else(|| { + ArrowError::ExternalError(Box::new(sedona_common::sedona_internal_datafusion_err!( + "band {band_row} has unknown data_type discriminant {data_type_value}" + ))) + }) + } + + /// Read the band's view-entry list. Identity is encoded exclusively as a + /// NULL row — an empty (non-null, zero-length) list is malformed and + /// rejected later by [`ViewEntries::validate`]. The schema (see + /// `RasterSchema::view_type`) documents this contract. + fn read_band_view_entries( + &self, + band_row: usize, + source_shape: &[i64], + ) -> Result { + if self.band_view_list.is_null(band_row) { + return Ok(ViewEntries::identity_for_shape(source_shape)); + } + let v_start = self.band_view_list.value_offsets()[band_row] as usize; + let v_end = self.band_view_list.value_offsets()[band_row + 1] as usize; + Ok(ViewEntries::new( + (v_start..v_end) + .map(|i| ViewEntry { + source_axis: self.band_view_source_axis.value(i), + start: self.band_view_start.value(i), + step: self.band_view_step.value(i), + steps: self.band_view_steps.value(i), + }) + .collect(), + )) + } +} + +/// Compose a validated view against a source shape into C-order byte strides +/// and a byte offset. +/// +/// C-order source strides are dtype-scaled cumulative products of `source_shape`, +/// then each visible axis's stride/offset is composed as `view.step * +/// src_stride` / `view.start * src_stride`. All arithmetic is checked: even +/// after `ViewEntries::validate`, the cumulative byte product can overflow +/// `i64` for cosmically large shapes, and a corrupt source_shape whose product +/// wraps would otherwise silently pass downstream bound checks. The returned +/// `byte_offset` is non-negative by construction (start >= 0, src_stride > 0); +/// the defensive sign check guards future refactors that might break that +/// invariant before we cross the i64 → u64 boundary in `nd_buffer()`. +fn compose_byte_strides( + band_row: usize, + source_shape: &[i64], + view_entries: &ViewEntries, + dtype_byte_size: usize, +) -> Result<(Vec, i64), ArrowError> { + let overflow_err = |msg: &str| { + ArrowError::ExternalError(Box::new(sedona_common::sedona_internal_datafusion_err!( + "band {band_row}: {msg}" + ))) + }; + + let dtype_size = dtype_byte_size as i64; + + let mut source_strides_bytes = vec![0i64; source_shape.len()]; + source_strides_bytes[source_shape.len() - 1] = dtype_size; + for k in (0..source_shape.len() - 1).rev() { + source_strides_bytes[k] = source_strides_bytes[k + 1] + .checked_mul(source_shape[k + 1]) + .ok_or_else(|| overflow_err("source-stride product overflows i64"))?; + } + + let mut byte_strides = vec![0i64; view_entries.len()]; + let mut byte_offset: i64 = 0; + for (k, v) in view_entries.iter().enumerate() { + let src_stride = source_strides_bytes[v.source_axis as usize]; + byte_strides[k] = v + .step + .checked_mul(src_stride) + .ok_or_else(|| overflow_err("view step × source-stride overflows i64"))?; + let start_off = v + .start + .checked_mul(src_stride) + .ok_or_else(|| overflow_err("view start × source-stride overflows i64"))?; + byte_offset = byte_offset + .checked_add(start_off) + .ok_or_else(|| overflow_err("view offset accumulation overflows i64"))?; + } + + if byte_offset < 0 { + return Err(overflow_err("composed byte_offset is negative")); + } + + Ok((byte_strides, byte_offset)) } impl<'a> RasterRef for RasterRefImpl<'a> { @@ -217,61 +406,44 @@ impl<'a> RasterRef for RasterRefImpl<'a> { let start = self.bands_list.value_offsets()[self.raster_index] as usize; let band_row = start + index; - // Read source shape slice. - let ss_start = self.band_source_shape_list.value_offsets()[band_row] as usize; - let ss_end = self.band_source_shape_list.value_offsets()[band_row + 1] as usize; - let source_shape: &[u64] = &self.band_source_shape_values.values()[ss_start..ss_end]; - - // Reject 0-D bands at the read boundary. Schema doesn't forbid them - // outright but every consumer assumes ndim >= 1. - if source_shape.is_empty() { - return Err(ArrowError::ExternalError(Box::new( - sedona_common::sedona_internal_datafusion_err!( - "band {band_row} has empty source_shape; ndim must be >= 1" - ), - ))); - } - - // Resolve data type up front; an unknown discriminant is a - // schema-corruption bug, not user data, so failing the band loudly - // here is appropriate. - let data_type_value = self.band_datatype_array.value(band_row); - let data_type = BandDataType::try_from_u32(data_type_value).ok_or_else(|| { + let source_shape = self.read_band_source_shape(band_row)?; + let data_type = self.read_band_data_type_or_err(band_row)?; + let view_entries = self.read_band_view_entries(band_row, &source_shape)?; + view_entries.validate(&source_shape).map_err(|e| { ArrowError::ExternalError(Box::new(sedona_common::sedona_internal_datafusion_err!( - "band {band_row} has unknown data_type discriminant {data_type_value}" + "band {band_row} has malformed view: {e}" ))) })?; - // Only the canonical identity view (null view row) is written today. - // A non-null view row would require the view → byte-stride composition - // path, which is not yet implemented. Surface it loudly here rather - // than silently rejecting the band, so callers see the standardised - // SedonaDB-internal-error framing. - if !self.band_view_list.is_null(band_row) { - return Err(ArrowError::ExternalError(Box::new( - sedona_common::sedona_internal_datafusion_err!( - "non-null view row at band {band_row}: view composition is not yet implemented" - ), - ))); - } - let view_entries: Vec = source_shape - .iter() - .enumerate() - .map(|(i, &s)| ViewEntry { - source_axis: i as i64, - start: 0, - step: 1, - steps: s as i64, - }) - .collect(); - - let visible_shape: Vec = source_shape.to_vec(); - - let dtype_size = data_type.byte_size() as i64; - let mut byte_strides = vec![0i64; source_shape.len()]; - byte_strides[source_shape.len() - 1] = dtype_size; - for k in (0..source_shape.len() - 1).rev() { - byte_strides[k] = byte_strides[k + 1] * (source_shape[k + 1] as i64); + let visible_shape = view_entries.visible_shape(); + let (byte_strides, byte_offset) = compose_byte_strides( + band_row, + &source_shape, + &view_entries, + data_type.byte_size(), + )?; + + // For InDb bands, verify the underlying data column is long enough + // to cover every byte the view can address. The view-machinery + // validation above doesn't know the actual `data` BinaryView + // length — a writer that lies about source_shape vs the bytes + // written would otherwise slip through and panic later when a + // consumer walks the strided buffer. OutDb bands skip this check: + // their data column is empty by design. + let data_bytes = self.band_data_array.value(band_row); + if !data_bytes.is_empty() { + check_view_buffer_bounds( + data_bytes.len(), + &visible_shape, + &byte_strides, + byte_offset, + data_type.byte_size(), + ) + .map_err(|e| { + ArrowError::ExternalError(Box::new(sedona_common::sedona_internal_datafusion_err!( + "band {band_row}: view-buffer bounds check failed: {e}" + ))) + })?; } Ok(Box::new(BandRefImpl { @@ -288,7 +460,7 @@ impl<'a> RasterRef for RasterRefImpl<'a> { view_entries, visible_shape, byte_strides, - byte_offset: 0, + byte_offset, })) } @@ -410,6 +582,10 @@ pub struct RasterStructArray<'a> { band_datatype_array: &'a UInt32Array, band_nodata_array: &'a BinaryArray, band_view_list: &'a ListArray, + band_view_source_axis: &'a Int64Array, + band_view_start: &'a Int64Array, + band_view_step: &'a Int64Array, + band_view_steps: &'a Int64Array, band_outdb_uri_array: &'a StringArray, band_outdb_format_array: &'a StringViewArray, band_data_array: &'a BinaryViewArray, @@ -509,6 +685,31 @@ impl<'a> RasterStructArray<'a> { .as_any() .downcast_ref::() .unwrap(); + let band_view_struct = band_view_list + .values() + .as_any() + .downcast_ref::() + .unwrap(); + let band_view_source_axis = band_view_struct + .column(band_view_indices::SOURCE_AXIS) + .as_any() + .downcast_ref::() + .unwrap(); + let band_view_start = band_view_struct + .column(band_view_indices::START) + .as_any() + .downcast_ref::() + .unwrap(); + let band_view_step = band_view_struct + .column(band_view_indices::STEP) + .as_any() + .downcast_ref::() + .unwrap(); + let band_view_steps = band_view_struct + .column(band_view_indices::STEPS) + .as_any() + .downcast_ref::() + .unwrap(); let band_outdb_uri_array = bands_struct .column(band_indices::OUTDB_URI) .as_any() @@ -543,6 +744,10 @@ impl<'a> RasterStructArray<'a> { band_datatype_array, band_nodata_array, band_view_list, + band_view_source_axis, + band_view_start, + band_view_step, + band_view_steps, band_outdb_uri_array, band_outdb_format_array, band_data_array, @@ -586,6 +791,10 @@ impl<'a> RasterStructArray<'a> { band_datatype_array: self.band_datatype_array, band_nodata_array: self.band_nodata_array, band_view_list: self.band_view_list, + band_view_source_axis: self.band_view_source_axis, + band_view_start: self.band_view_start, + band_view_step: self.band_view_step, + band_view_steps: self.band_view_steps, band_outdb_uri_array: self.band_outdb_uri_array, band_outdb_format_array: self.band_outdb_format_array, band_data_array: self.band_data_array, @@ -603,11 +812,13 @@ impl<'a> RasterStructArray<'a> { #[cfg(test)] mod tests { use super::*; - use crate::builder::RasterBuilder; - use crate::traits::{BandMetadata, RasterMetadata}; - use arrow_array::{ArrayRef, ListArray, StructArray, UInt32Array, UInt64Array}; - use arrow_buffer::{OffsetBuffer, ScalarBuffer}; - use arrow_schema::{DataType, Fields}; + use crate::builder::{RasterBuilder, StartBandWithViewArgs}; + use crate::traits::{BandMetadata, RasterMetadata, ViewEntry}; + use arrow_array::{ + types::Int64Type, ArrayRef, ListArray, StructArray, UInt32Array, UInt64Array, + }; + use arrow_buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; + use arrow_schema::{DataType, Field, Fields}; use sedona_schema::raster::{ band_indices, raster_indices, BandDataType, RasterSchema, StorageType, }; @@ -672,8 +883,10 @@ mod tests { // Access band with 1-based band_number let band = bands.band(1).unwrap(); - assert_eq!(band.data().len(), 100); - assert_eq!(band.data()[0], 1u8); + let ndb = band.nd_buffer().unwrap(); + let band_data = ndb.as_contiguous().unwrap(); + assert_eq!(band_data.len(), 100); + assert_eq!(band_data[0], 1u8); let band_meta = band.metadata(); assert_eq!(band_meta.storage_type().unwrap(), StorageType::InDb); @@ -737,7 +950,12 @@ mod tests { // Access band with 1-based band_number let band = bands.band(i + 1).unwrap(); let expected_value = i as u8; - assert!(band.data().iter().all(|&x| x == expected_value)); + let ndb = band.nd_buffer().unwrap(); + assert!(ndb + .as_contiguous() + .unwrap() + .iter() + .all(|&x| x == expected_value)); } // Test array @@ -746,8 +964,10 @@ mod tests { .enumerate() .map(|(i, band)| { let band = band.unwrap(); - assert_eq!(band.data()[0], i as u8); - band.data()[0] + let ndb = band.nd_buffer().unwrap(); + let band_data = ndb.as_contiguous().unwrap(); + assert_eq!(band_data[0], i as u8); + band_data[0] }) .collect(); @@ -763,20 +983,36 @@ mod tests { assert!(rasters.is_null(1)); } - /// Build a single-raster, single-band raster StructArray with the - /// canonical identity view. Used as the baseline input to the surgery - /// helpers below; callers replace one band-level column to simulate - /// schema corruption on non-view fields. - fn build_identity_raster() -> StructArray { + /// Build a single-raster, single-band raster StructArray with an explicit + /// view. Used as the input to the surgery helpers below; callers replace + /// one band-level column to simulate schema corruption. + fn build_explicit_view_raster() -> StructArray { let mut builder = RasterBuilder::new(1); let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; builder .start_raster_nd(&transform, &["x"], &[3], None) .unwrap(); + let view = [ViewEntry { + source_axis: 0, + start: 1, + step: 2, + steps: 3, + }]; builder - .start_band_nd(None, &["x"], &[3], BandDataType::UInt8, None, None, None) + .start_band_with_view(StartBandWithViewArgs { + name: None, + dim_names: &["x"], + source_shape: &[8], + view: &view, + data_type: BandDataType::UInt8, + nodata: None, + outdb_uri: None, + outdb_format: None, + }) .unwrap(); - builder.band_data_writer().append_value(vec![0u8, 1, 2]); + builder + .band_data_writer() + .append_value(vec![0u8, 1, 2, 3, 4, 5, 6, 7]); builder.finish_band().unwrap(); builder.finish_raster().unwrap(); builder.finish().unwrap() @@ -829,11 +1065,73 @@ mod tests { ) } - // bad data_type discriminant + /// Rebuild the band view list with hand-rolled entries. `entries[i]` + /// supplies all four `(source_axis, start, step, steps)` Int64 values + /// for band-row `i`. `nulls` controls per-row validity bits — `None` + /// means every row is non-null. + fn make_band_view_list( + entries: Vec>, + nulls: Option>, + ) -> ArrayRef { + let mut offsets: Vec = vec![0]; + let mut sa: Vec = vec![]; + let mut start: Vec = vec![]; + let mut step: Vec = vec![]; + let mut steps: Vec = vec![]; + for row in &entries { + for &(a, s, k, n) in row { + sa.push(a); + start.push(s); + step.push(k); + steps.push(n); + } + offsets.push(sa.len() as i32); + } + let view_struct_fields = Fields::from(vec![ + Field::new("source_axis", DataType::Int64, false), + Field::new("start", DataType::Int64, false), + Field::new("step", DataType::Int64, false), + Field::new("steps", DataType::Int64, false), + ]); + let view_struct = StructArray::new( + view_struct_fields, + vec![ + Arc::new(arrow_array::PrimitiveArray::::from(sa)) as ArrayRef, + Arc::new(arrow_array::PrimitiveArray::::from(start)) as ArrayRef, + Arc::new(arrow_array::PrimitiveArray::::from(step)) as ArrayRef, + Arc::new(arrow_array::PrimitiveArray::::from(steps)) as ArrayRef, + ], + None, + ); + let DataType::List(view_field) = RasterSchema::view_type() else { + unreachable!() + }; + let null_buf = nulls.map(NullBuffer::from); + Arc::new(ListArray::new( + view_field, + OffsetBuffer::new(ScalarBuffer::from(offsets)), + Arc::new(view_struct), + null_buf, + )) + } + + // ---- Critical #1: malformed view entries ---- + + #[test] + fn band_returns_none_when_view_length_mismatches_source_shape() { + // source_shape has 1 dim but view encodes 2 entries. + let array = build_explicit_view_raster(); + let bad_view = make_band_view_list(vec![vec![(0, 0, 1, 3), (0, 0, 1, 3)]], None); + let mutated = replace_band_column(&array, band_indices::VIEW, bad_view); + let rasters = RasterStructArray::new(&mutated); + assert!(rasters.get(0).unwrap().band(0).is_err()); + } + + // ---- Critical #2: bad data_type discriminant ---- #[test] fn band_and_band_data_type_surface_corruption_for_unknown_discriminant() { - let array = build_identity_raster(); + let array = build_explicit_view_raster(); let bad_dtype: ArrayRef = Arc::new(UInt32Array::from(vec![0xFFu32])); let mutated = replace_band_column(&array, band_indices::DATA_TYPE, bad_dtype); let rasters = RasterStructArray::new(&mutated); @@ -855,7 +1153,7 @@ mod tests { #[test] fn band_surfaces_internal_error_when_source_shape_is_empty() { - let array = build_identity_raster(); + let array = build_explicit_view_raster(); // Replace source_shape with a single empty list row. let DataType::List(ss_field) = RasterSchema::source_shape_type() else { unreachable!() @@ -877,7 +1175,115 @@ mod tests { assert!(err.to_string().contains("empty source_shape")); } - // direct fast-path tests + #[test] + fn band_surfaces_internal_error_when_data_column_shorter_than_view() { + // build_explicit_view_raster writes 8 UInt8 source bytes with view + // (start=1, step=2, steps=3) which addresses bytes 1, 3, 5. + // Inflate source_shape to [16] and the view to cover steps=10 along + // the (now nominally-larger) source axis: the byte range jumps past + // the actual 8-byte data column and the precheck must fire. + let array = build_explicit_view_raster(); + // source_shape := [16] + let new_source_shape = make_band_source_shape_list(vec![vec![16u64]]); + let mutated_ss = replace_band_column(&array, band_indices::SOURCE_SHAPE, new_source_shape); + // view := (source_axis=0, start=0, step=1, steps=10) — addresses + // bytes 0..10 but the underlying data column only has 8 bytes. + let new_view = make_band_view_list(vec![vec![(0, 0, 1, 10)]], None); + let mutated = replace_band_column(&mutated_ss, band_indices::VIEW, new_view); + let rasters = RasterStructArray::new(&mutated); + let err = rasters.get(0).unwrap().band(0).err().unwrap(); + assert!(err.to_string().contains("SedonaDB internal error")); + assert!(err.to_string().contains("view-buffer bounds check failed")); + } + + #[test] + fn band_rejects_empty_non_null_view_row() { + // The identity view is encoded exclusively as a NULL row; a + // non-null zero-length list is malformed and must error rather + // than silently fall back to identity. (Pre-rev behaviour + // accepted it — see `RasterSchema::view_type` for the contract.) + let array = build_explicit_view_raster(); + let empty_non_null_view = make_band_view_list(vec![vec![]], Some(vec![true])); + let mutated = replace_band_column(&array, band_indices::VIEW, empty_non_null_view); + let rasters = RasterStructArray::new(&mutated); + let err = rasters.get(0).unwrap().band(0).err().unwrap(); + assert!(err.to_string().contains("view length"), "got: {err}"); + } + + // ---- Stride composition overflow guards ---- + + /// Build a band source_shape list with hand-rolled u64 entries so tests + /// can inject values that the builder's writer-side checks would refuse. + fn make_band_source_shape_list(rows: Vec>) -> ArrayRef { + let mut offsets: Vec = vec![0]; + let mut values: Vec = vec![]; + for row in &rows { + values.extend_from_slice(row); + offsets.push(values.len() as i32); + } + let DataType::List(field) = RasterSchema::source_shape_type() else { + unreachable!() + }; + Arc::new(ListArray::new( + field, + OffsetBuffer::new(ScalarBuffer::from(offsets)), + Arc::new(UInt64Array::from(values)), + None, + )) + } + + #[test] + fn band_returns_none_when_source_shape_value_exceeds_i64_max() { + // A u64 value > i64::MAX must not silently wrap to a negative i64 + // during stride composition. The 1-D fixture is replaced with a + // 2-D `[1, u64::MAX]`, view shaped to match, with steps=0 on the + // pathological axis so validate_view accepts it. + let array = build_explicit_view_raster(); + let new_source_shape = make_band_source_shape_list(vec![vec![1u64, u64::MAX]]); + let mutated_ss = replace_band_column(&array, band_indices::SOURCE_SHAPE, new_source_shape); + let new_view = make_band_view_list(vec![vec![(0, 0, 1, 1), (1, 0, 1, 0)]], None); + let mutated = replace_band_column(&mutated_ss, band_indices::VIEW, new_view); + let rasters = RasterStructArray::new(&mutated); + assert!(rasters.get(0).unwrap().band(0).is_err()); + } + + #[test] + fn band_returns_none_when_source_strides_product_overflows() { + // dtype_size × Π source_shape[j>k] must not silently wrap. With a + // 3-D source_shape of `[1, 1<<32, 1<<32]` the product (1<<32) × + // (1<<32) = 1<<64 overflows i64 in the source-stride build. + let array = build_explicit_view_raster(); + let new_source_shape = + make_band_source_shape_list(vec![vec![1u64, 1u64 << 32, 1u64 << 32]]); + let mutated_ss = replace_band_column(&array, band_indices::SOURCE_SHAPE, new_source_shape); + // Pad the view to 3 entries; steps=0 on the giant axes keeps + // validate_view's start/last checks out of the casts-from-u64 path. + let new_view = + make_band_view_list(vec![vec![(0, 0, 1, 1), (1, 0, 1, 0), (2, 0, 1, 0)]], None); + let mutated = replace_band_column(&mutated_ss, band_indices::VIEW, new_view); + let rasters = RasterStructArray::new(&mutated); + assert!(rasters.get(0).unwrap().band(0).is_err()); + } + + #[test] + fn band_returns_none_when_view_step_times_source_stride_overflows() { + // `validate_view` bounds (steps-1)*step + start on the SOURCE axis + // but doesn't bound v.step × cumulative_byte_stride. A view with a + // small visible region but a step large enough to wrap the byte + // stride must be rejected at construction. + // + // Source `[3, 1<<60]`, dtype_size=1 (UInt8) → src_stride[0] = 1<<60. + // View on axis 0 with step=8 makes byte_strides[0] = 8 × (1<<60) = + // 1<<63 which overflows i64. The view itself only walks 1 step on + // that axis so validate_view's (steps-1)*step bound holds. + let array = build_explicit_view_raster(); + let new_source_shape = make_band_source_shape_list(vec![vec![3u64, 1u64 << 60]]); + let mutated_ss = replace_band_column(&array, band_indices::SOURCE_SHAPE, new_source_shape); + let new_view = make_band_view_list(vec![vec![(0, 0, 8, 1), (1, 0, 1, 1)]], None); + let mutated = replace_band_column(&mutated_ss, band_indices::VIEW, new_view); + let rasters = RasterStructArray::new(&mutated); + assert!(rasters.get(0).unwrap().band(0).is_err()); + } #[test] fn raster_ref_fast_paths_return_expected_values() { @@ -978,18 +1384,17 @@ mod tests { assert!(bm0.outdb_band_id().is_none()); } - // multi-band, multi-raster identity + // ---- Important #9: multi-band, multi-raster mixed identity/explicit ---- #[test] - fn multi_raster_identity_views() { - // Two rasters with multiple identity bands each. Exercises the - // `bands_list.value_offsets()` routing for every per-band lookup — - // a naive reader that forgets to add the per-raster offset would - // hand back data from the wrong band. + fn multi_raster_mixed_identity_and_explicit_views() { + // Two rasters. Raster 0 has 3 bands (identity, explicit slice, + // identity). Raster 1 has 2 bands (explicit broadcast, identity). + // bands_list.value_offsets() must correctly route each band. let mut builder = RasterBuilder::new(2); let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; - // Raster 0: three identity bands. + // Raster 0 builder .start_raster_nd(&transform, &["x"], &[3], None) .unwrap(); @@ -999,9 +1404,25 @@ mod tests { builder.band_data_writer().append_value(vec![10u8, 20, 30]); builder.finish_band().unwrap(); builder - .start_band_nd(None, &["x"], &[3], BandDataType::UInt8, None, None, None) + .start_band_with_view(StartBandWithViewArgs { + name: None, + dim_names: &["x"], + source_shape: &[8], + view: &[ViewEntry { + source_axis: 0, + start: 1, + step: 2, + steps: 3, + }], + data_type: BandDataType::UInt8, + nodata: None, + outdb_uri: None, + outdb_format: None, + }) .unwrap(); - builder.band_data_writer().append_value(vec![40u8, 50, 60]); + builder + .band_data_writer() + .append_value(vec![0u8, 1, 2, 3, 4, 5, 6, 7]); builder.finish_band().unwrap(); builder .start_band_nd(None, &["x"], &[3], BandDataType::UInt8, None, None, None) @@ -1012,16 +1433,28 @@ mod tests { builder.finish_band().unwrap(); builder.finish_raster().unwrap(); - // Raster 1: two identity bands of a different shape. + // Raster 1 builder .start_raster_nd(&transform, &["x"], &[4], None) .unwrap(); builder - .start_band_nd(None, &["x"], &[4], BandDataType::UInt8, None, None, None) + .start_band_with_view(StartBandWithViewArgs { + name: None, + dim_names: &["x"], + source_shape: &[1], + view: &[ViewEntry { + source_axis: 0, + start: 0, + step: 0, + steps: 4, + }], + data_type: BandDataType::UInt8, + nodata: None, + outdb_uri: None, + outdb_format: None, + }) .unwrap(); - builder - .band_data_writer() - .append_value(vec![42u8, 43, 44, 45]); + builder.band_data_writer().append_value(vec![42u8]); builder.finish_band().unwrap(); builder .start_band_nd(None, &["x"], &[4], BandDataType::UInt8, None, None, None) @@ -1033,36 +1466,39 @@ mod tests { let array = builder.finish().unwrap(); let rasters = RasterStructArray::new(&array); + // Raster 0 bands: identity (3), slice (3), identity (3). The identity + // bands are contiguous and borrow zero-copy; the step=2 slice is + // strided so `as_contiguous` rejects it. let r0 = rasters.get(0).unwrap(); assert_eq!(r0.num_bands(), 3); assert_eq!(r0.band(0).unwrap().shape(), &[3]); - assert_eq!( - &*r0.band(0).unwrap().contiguous_data().unwrap(), - &[10u8, 20, 30] - ); + let b0 = r0.band(0).unwrap(); + let nd0 = b0.nd_buffer().unwrap(); + assert_eq!(nd0.as_contiguous().unwrap(), &[10u8, 20, 30]); assert_eq!(r0.band(1).unwrap().shape(), &[3]); - assert_eq!( - &*r0.band(1).unwrap().contiguous_data().unwrap(), - &[40u8, 50, 60] - ); + let b1 = r0.band(1).unwrap(); + let nd1 = b1.nd_buffer().unwrap(); + assert!(!nd1.is_contiguous()); + assert!(nd1.as_contiguous().is_err()); assert_eq!(r0.band(2).unwrap().shape(), &[3]); - assert_eq!( - &*r0.band(2).unwrap().contiguous_data().unwrap(), - &[100u8, 101, 102] - ); + let b2 = r0.band(2).unwrap(); + let nd2 = b2.nd_buffer().unwrap(); + assert_eq!(nd2.as_contiguous().unwrap(), &[100u8, 101, 102]); + // Raster 1 bands: broadcast (4 copies of 42), identity (4). The + // broadcast band has a zero stride so it is non-contiguous and + // rejected; the identity band borrows zero-copy. let r1 = rasters.get(1).unwrap(); assert_eq!(r1.num_bands(), 2); assert_eq!(r1.band(0).unwrap().shape(), &[4]); - assert_eq!( - &*r1.band(0).unwrap().contiguous_data().unwrap(), - &[42u8, 43, 44, 45] - ); + let r1b0 = r1.band(0).unwrap(); + let r1nd0 = r1b0.nd_buffer().unwrap(); + assert!(!r1nd0.is_contiguous()); + assert!(r1nd0.as_contiguous().is_err()); assert_eq!(r1.band(1).unwrap().shape(), &[4]); - assert_eq!( - &*r1.band(1).unwrap().contiguous_data().unwrap(), - &[1u8, 2, 3, 4] - ); + let r1b1 = r1.band(1).unwrap(); + let r1nd1 = r1b1.nd_buffer().unwrap(); + assert_eq!(r1nd1.as_contiguous().unwrap(), &[1u8, 2, 3, 4]); // Fast paths must honour the same offsets. assert_eq!(r0.band_data_type(1), Some(BandDataType::UInt8)); @@ -1119,4 +1555,58 @@ mod tests { assert!(r1.band_outdb_format(0).is_none()); assert!(r1.band_nodata(0).is_none()); } + + // ---- Fast-path / band(i) divergence on a corrupt view ---- + + #[test] + fn fast_paths_return_columnar_values_when_band_is_corrupt() { + // band(i) goes through validate_view and returns None for a + // malformed view; the columnar fast paths read their fields + // directly without consulting the view at all. Pin down that + // contract so a future reader doesn't accidentally couple them + // (or "fix" the divergence in either direction without us + // noticing). Also catches a regression where a fast path would + // panic instead of returning the underlying value. + let mut builder = RasterBuilder::new(1); + let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + builder + .start_raster_nd(&transform, &["x"], &[3], None) + .unwrap(); + builder + .start_band_with_view(StartBandWithViewArgs { + name: Some("a"), + dim_names: &["x"], + source_shape: &[8], + view: &[ViewEntry { + source_axis: 0, + start: 1, + step: 2, + steps: 3, + }], + data_type: BandDataType::UInt32, + nodata: Some(&[0u8, 0, 0, 0]), + outdb_uri: Some("s3://bucket/a.tif"), + outdb_format: Some("GTiff"), + }) + .unwrap(); + builder.band_data_writer().append_value(vec![0u8; 32]); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + let array = builder.finish().unwrap(); + + let bad_view = make_band_view_list(vec![vec![(0, 0, 1, -1)]], None); + let mutated = replace_band_column(&array, band_indices::VIEW, bad_view); + let rasters = RasterStructArray::new(&mutated); + let r = rasters.get(0).unwrap(); + + // band(i) rejects on validate_view. + assert!(r.band(0).is_err()); + + // Fast paths still surface the underlying columnar values — + // they don't validate the view, by design. Locking that in. + assert_eq!(r.band_data_type(0), Some(BandDataType::UInt32)); + assert_eq!(r.band_outdb_uri(0), Some("s3://bucket/a.tif")); + assert_eq!(r.band_outdb_format(0), Some("GTiff")); + assert_eq!(r.band_nodata(0), Some(&[0u8, 0, 0, 0][..])); + } } diff --git a/rust/sedona-raster/src/builder.rs b/rust/sedona-raster/src/builder.rs index 31d018de4..0d44fbaff 100644 --- a/rust/sedona-raster/src/builder.rs +++ b/rust/sedona-raster/src/builder.rs @@ -28,7 +28,8 @@ use std::sync::Arc; use sedona_schema::raster::{BandDataType, RasterSchema}; -use crate::traits::{BandMetadata, MetadataRef}; +use crate::traits::{BandMetadata, BandRef, MetadataRef, ViewEntry}; +use crate::view_entries::ViewEntries; /// Builder for constructing raster arrays with zero-copy band data writing /// @@ -78,6 +79,36 @@ use crate::traits::{BandMetadata, MetadataRef}; /// // Get the final StructArray /// let raster_array = builder.finish().unwrap(); /// ``` +/// Arguments to [`RasterBuilder::start_band_with_view`]. Bundled into a +/// struct to keep the call site readable — eight slots is enough that +/// positional args invite mis-ordering bugs. +pub(crate) struct StartBandWithViewArgs<'a> { + pub name: Option<&'a str>, + pub dim_names: &'a [&'a str], + pub source_shape: &'a [u64], + pub view: &'a [ViewEntry], + pub data_type: BandDataType, + pub nodata: Option<&'a [u8]>, + pub outdb_uri: Option<&'a str>, + pub outdb_format: Option<&'a str>, +} + +/// Arguments to [`RasterBuilder::with_view`]. Mirrors +/// [`StartBandWithViewArgs`] minus the two fields `with_view` derives from +/// `input` (`source_shape` from `input.raw_source_shape()`, `data_type` from +/// `input.data_type()`) — accepting those from the caller would let them +/// contradict `input`. `view` here is a *delta* composed against +/// `input.view()`, not the absolute view stored on the band. +pub struct WithViewArgs<'a> { + pub name: Option<&'a str>, + pub dim_names: &'a [&'a str], + pub input: &'a dyn BandRef, + pub view: &'a [ViewEntry], + pub nodata: Option<&'a [u8]>, + pub outdb_uri: Option<&'a str>, + pub outdb_format: Option<&'a str>, +} + pub struct RasterBuilder { // Top-level raster fields crs: StringViewBuilder, @@ -374,6 +405,209 @@ impl RasterBuilder { Ok(()) } + /// Internal: write a band with an explicit view over a raw source + /// shape. Public callers should use [`Self::with_view`] instead, + /// which derives `source_shape`, validates view composition, and + /// inherits the input band's source bytes — `with_view` calls this + /// helper after composing. + /// + /// Each `ViewEntry` describes one *visible* axis in `dim_names` order: + /// `(source_axis, start, step, steps)`. Validates that: + /// - `dim_names`, `source_shape`, and `view` have equal length. + /// - Across `view`, `source_axis` values form a permutation of + /// `0..ndim` (no axis duplicated, none missing). + /// - For each entry with `steps > 0`: `start` and (when `step != 0`) + /// `start + (steps - 1) * step` are in `[0, source_shape[source_axis])`. + /// - `steps >= 0`. + pub(crate) fn start_band_with_view( + &mut self, + args: StartBandWithViewArgs<'_>, + ) -> Result<(), ArrowError> { + let StartBandWithViewArgs { + name, + dim_names, + source_shape, + view, + data_type, + nodata, + outdb_uri, + outdb_format, + } = args; + let ndim = dim_names.len(); + if ndim == 0 { + return Err(ArrowError::InvalidArgumentError( + "start_band_with_view: 0-dimensional bands are not supported".into(), + )); + } + if source_shape.len() != ndim || view.len() != ndim { + return Err(ArrowError::InvalidArgumentError(format!( + "start_band_with_view: dim_names ({}), source_shape ({}), and view ({}) \ + must all have the same length", + ndim, + source_shape.len(), + view.len() + ))); + } + + // Convert source_shape u64 → i64 for validation. `start_band_with_view` + // is a builder entry point so producers can express a source_shape + // that doesn't fit in i64 — reject it here rather than letting it + // wrap inside the view-machinery validation. + let source_shape_i64: Vec = source_shape + .iter() + .map(|&s| { + i64::try_from(s).map_err(|_| { + ArrowError::InvalidArgumentError(format!( + "start_band_with_view: source_shape axis {s} exceeds i64::MAX" + )) + }) + }) + .collect::>()?; + let view_entries = ViewEntries::new(view.to_vec()); + view_entries.validate(&source_shape_i64)?; + + // Write fields. + match name { + Some(n) => self.band_name.append_value(n), + None => self.band_name.append_null(), + } + + for dn in dim_names { + self.band_dim_names_values.append_value(dn); + } + let next = *self.band_dim_names_offsets.last().unwrap() + ndim as i32; + self.band_dim_names_offsets.push(next); + + for &s in source_shape { + self.band_shape_values.append_value(s); + } + let next = *self.band_shape_offsets.last().unwrap() + ndim as i32; + self.band_shape_offsets.push(next); + + self.band_datatype.append_value(data_type as u32); + + match nodata { + Some(b) => self.band_nodata.append_value(b), + None => self.band_nodata.append_null(), + } + + for v in view { + self.band_view_source_axis_values + .append_value(v.source_axis); + self.band_view_start_values.append_value(v.start); + self.band_view_step_values.append_value(v.step); + self.band_view_steps_values.append_value(v.steps); + } + let next = *self.band_view_offsets.last().unwrap() + ndim as i32; + self.band_view_offsets.push(next); + self.band_view_validity.push(true); + + match outdb_uri { + Some(uri) => self.band_outdb_uri.append_value(uri), + None => self.band_outdb_uri.append_null(), + } + match outdb_format { + Some(format) => self.band_outdb_format.append_value(format), + None => self.band_outdb_format.append_null(), + } + + self.current_band_count += 1; + self.band_data_count_at_start = self.band_data.len(); + + // finish_raster compares visible shape against spatial_shape. + // `visible_shape()` returns Vec; entries are guaranteed + // non-negative by `validate` above, so the cast to u64 is + // lossless. + let visible_shape_u64: Vec = view_entries + .visible_shape() + .into_iter() + .map(|v| v as u64) + .collect(); + self.current_raster_bands.push(( + dim_names.iter().map(|s| s.to_string()).collect(), + visible_shape_u64, + )); + + Ok(()) + } + + /// Build a band that is a new view into an existing band. + /// + /// The output band stores a view that is the composition of `input`'s + /// existing view with the supplied `view`. The supplied `view`'s + /// `source_axis` entries refer to `input`'s *visible* axes, not its + /// source axes — composition with `input.view()` translates them. + /// + /// `dim_names` names the output's *visible* axes (length == view.len()). + /// + /// Storage: + /// - **InDb input** → output is InDb. The input's source bytes are + /// copied verbatim into the output's `data` column (today's + /// simple-share strategy; buffer-sharing via Arrow `BinaryView` is a + /// future optimisation). + /// - **OutDb input** → output is OutDb. The data column stays empty; + /// the input's `outdb_uri` and `outdb_format` are inherited (unless + /// overridden via the explicit `outdb_uri` / `outdb_format` args). + /// The composed view lives alongside the same external pointer — + /// loading is deferred to whoever reads the visible bytes. + /// + /// Identity-input shortcut: when `input` carries identity view, the + /// composed view equals `view` verbatim. + pub fn with_view(&mut self, args: WithViewArgs) -> Result<(), ArrowError> { + let WithViewArgs { + name, + dim_names, + input, + view, + nodata, + outdb_uri, + outdb_format, + } = args; + let source_shape: Vec = input.raw_source_shape().to_vec(); + let composed = + ViewEntries::new(input.view().to_vec()).compose(&ViewEntries::new(view.to_vec()))?; + + // Inherit storage metadata from the input unless the caller has + // explicitly overridden it. For OutDb inputs this propagates the + // external pointer to the output; for InDb inputs the input's + // outdb_uri/outdb_format are typically None anyway. + let final_outdb_uri = outdb_uri.or_else(|| input.outdb_uri()); + let final_outdb_format = outdb_format.or_else(|| input.outdb_format()); + + // Reuse the internal start_band_with_view helper to perform + // validation + write the schema fields. + self.start_band_with_view(StartBandWithViewArgs { + name, + dim_names, + source_shape: &source_shape, + view: composed.as_slice(), + data_type: input.data_type(), + nodata, + outdb_uri: final_outdb_uri, + outdb_format: final_outdb_format, + })?; + + if input.is_indb() { + // InDb: nd_buffer().buffer is the source bytes — borrow them + // directly into `append_value` so the only copy is the one + // BinaryViewBuilder makes into its block. This is still + // a full source-bytes copy per `with_view` call, which + // undermines the "lazy slice" framing for large rasters. + // + // The principled fix is Arrow `BinaryView` buffer-sharing: + // the output's data row references the input's existing + // backing `Buffer` instead of copying. Tracked separately + // in the Raster Clean Up project. + let buf = input.nd_buffer()?; + self.band_data_writer().append_value(buf.buffer); + } else { + // OutDb: data column stays empty; the source bytes live at the + // inherited outdb_uri and are fetched lazily on read. + self.band_data_writer().append_value([]); + } + Ok(()) + } + /// Convenience: start a 2D band with `dim_names=["y","x"]` and `shape=[height, width]`. /// /// Must be called after `start_raster_2d` / `start_raster_2d` which sets @@ -687,7 +921,6 @@ mod tests { use arrow_ipc::writer::StreamWriter; use arrow_schema::Schema; use sedona_schema::raster::StorageType; - use std::borrow::Cow; use std::io::Cursor; #[test] @@ -747,8 +980,10 @@ mod tests { // Access band with 1-based band_number let band = bands.band(1).unwrap(); - assert_eq!(band.data().len(), 100); - assert_eq!(band.data()[0], 1u8); + let ndb = band.nd_buffer().unwrap(); + let band_data = ndb.as_contiguous().unwrap(); + assert_eq!(band_data.len(), 100); + assert_eq!(band_data[0], 1u8); let band_meta = band.metadata(); assert_eq!(band_meta.storage_type().unwrap(), StorageType::InDb); @@ -812,7 +1047,12 @@ mod tests { // Access band with 1-based band_number let band = bands.band(i + 1).unwrap(); let expected_value = i as u8; - assert!(band.data().iter().all(|&x| x == expected_value)); + let ndb = band.nd_buffer().unwrap(); + assert!(ndb + .as_contiguous() + .unwrap() + .iter() + .all(|&x| x == expected_value)); } // Test iterator @@ -821,8 +1061,10 @@ mod tests { .enumerate() .map(|(i, band)| { let band = band.unwrap(); - assert_eq!(band.data()[0], i as u8); - band.data()[0] + let ndb = band.nd_buffer().unwrap(); + let band_data = ndb.as_contiguous().unwrap(); + assert_eq!(band_data[0], i as u8); + band_data[0] }) .collect(); @@ -913,7 +1155,8 @@ mod tests { let target_band_meta = target_band.metadata(); assert_eq!(target_band_meta.data_type().unwrap(), BandDataType::UInt16); assert!(target_band_meta.nodata_value().is_none()); - assert_eq!(target_band.data().len(), 2016); // 1008 * 2 bytes per u16 + let target_nd = target_band.nd_buffer().unwrap(); + assert_eq!(target_nd.as_contiguous().unwrap().len(), 2016); // 1008 * 2 bytes per u16 let result = target_raster.bands().band(0); assert!(result.is_err(), "Band number 0 should be invalid"); @@ -1054,9 +1297,13 @@ mod tests { // Test creating raster with OutDb reference metadata let mut builder = RasterBuilder::new(10); + // 10x10 raster of UInt8 = 100 visible bytes, matching the data buffer + // written below. `RasterRef::band()` now verifies the data column is + // long enough to cover the visible region, so the dimensions and the + // byte count must agree. let metadata = RasterMetadata { - width: 1024, - height: 1024, + width: 10, + height: 10, upperleft_x: 0.0, upperleft_y: 0.0, scale_x: 1.0, @@ -1112,7 +1359,9 @@ mod tests { assert_eq!(indb_metadata.data_type().unwrap(), BandDataType::UInt8); assert!(indb_metadata.outdb_url().is_none()); assert!(indb_metadata.outdb_band_id().is_none()); - assert_eq!(indb_band.data().len(), 100); + assert!(indb_band.is_indb()); + let indb_nd = indb_band.nd_buffer().unwrap(); + assert_eq!(indb_nd.as_contiguous().unwrap().len(), 100); // Test OutDbRef band let outdb_band = bands.band(2).unwrap(); @@ -1127,7 +1376,10 @@ mod tests { "s3://mybucket/satellite_image.tif" ); assert_eq!(outdb_metadata.outdb_band_id().unwrap(), 2); - assert_eq!(outdb_band.data().len(), 0); // Empty data for OutDbRef + // OutDbRef bands carry no in-band bytes; byte access via nd_buffer() + // is not supported (backend-specific resolvers are tracked separately). + assert!(!outdb_band.is_indb()); + assert!(outdb_band.nd_buffer().is_err()); } #[test] @@ -1182,7 +1434,8 @@ mod tests { let result = bands.band(1); assert!(result.is_ok()); let band = result.unwrap(); - assert_eq!(band.data().len(), 100); + let ndb = band.nd_buffer().unwrap(); + assert_eq!(ndb.as_contiguous().unwrap().len(), 100); } #[test] @@ -1227,7 +1480,8 @@ mod tests { assert_eq!(band.shape(), &[20, 10]); assert_eq!(band.data_type(), BandDataType::UInt8); assert_eq!(band.nodata(), Some(&[255u8][..])); - assert_eq!(band.contiguous_data().unwrap().len(), 200); + let ndb = band.nd_buffer().unwrap(); + assert_eq!(ndb.as_contiguous().unwrap().len(), 200); } #[test] @@ -1478,7 +1732,7 @@ mod tests { } #[test] - fn test_contiguous_data_is_borrowed() { + fn test_as_contiguous_borrows_identity_view() { let mut builder = RasterBuilder::new(1); builder .start_raster_2d(4, 4, 0.0, 0.0, 1.0, -1.0, 0.0, 0.0, None) @@ -1493,9 +1747,11 @@ mod tests { let r = rasters.get(0).unwrap(); let band = r.band(0).unwrap(); - let data = band.contiguous_data().unwrap(); - // Identity-view bands are always contiguous, so should be Cow::Borrowed - assert!(matches!(data, Cow::Borrowed(_))); + let ndb = band.nd_buffer().unwrap(); + // Identity-view bands are always contiguous, so as_contiguous borrows + // the underlying bytes zero-copy rather than erroring. + assert!(ndb.is_contiguous()); + let data = ndb.as_contiguous().unwrap(); assert_eq!(data.len(), 16); } @@ -1716,6 +1972,44 @@ mod tests { ); } + #[test] + fn test_start_band_with_view_identity_matches_start_band() { + // Identity view through start_band_with_view should produce the same + // visible shape and byte strides as the convenience start_band path. + let mut builder = RasterBuilder::new(1); + let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + builder + .start_raster_nd(&transform, &["x", "y"], &[5, 4], None) + .unwrap(); + + let view = crate::view_entries![0:4, 0:5]; + builder + .start_band_with_view(StartBandWithViewArgs { + name: None, + dim_names: &["y", "x"], + source_shape: &[4, 5], + view: view.as_slice(), + data_type: BandDataType::UInt8, + nodata: None, + outdb_uri: None, + outdb_format: None, + }) + .unwrap(); + builder.band_data_writer().append_value(vec![0u8; 20]); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + + let array = builder.finish().unwrap(); + let rasters = RasterStructArray::new(&array); + let r = rasters.get(0).unwrap(); + let band = r.band(0).unwrap(); + assert_eq!(band.shape(), &[4, 5]); + assert_eq!(band.raw_source_shape(), &[4, 5]); + let buf = band.nd_buffer().unwrap(); + assert_eq!(buf.strides, &[5, 1]); + assert_eq!(buf.offset, 0); + } + #[test] fn test_start_band_rejects_zero_dim() { // 0-D bands carry no spatial extent and no caller has a use for @@ -1734,7 +2028,64 @@ mod tests { } #[test] - fn test_contiguous_data_identity_via_start_band_is_borrowed() { + fn test_start_band_with_view_rejects_zero_dim() { + // start_band_with_view must apply the same 0-D guard as start_band + // — accepting empty dim_names would otherwise bypass it via the + // explicit-view path. + let mut builder = RasterBuilder::new(1); + let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + builder.start_raster_nd(&transform, &[], &[], None).unwrap(); + let err = builder + .start_band_with_view(StartBandWithViewArgs { + name: None, + dim_names: &[], + source_shape: &[], + view: &[], + data_type: BandDataType::UInt8, + nodata: None, + outdb_uri: None, + outdb_format: None, + }) + .unwrap_err(); + assert!( + err.to_string().contains("0-dimensional"), + "unexpected error: {err}" + ); + } + + #[test] + fn test_view_validation_rejects_step_overrun() { + let mut builder = RasterBuilder::new(1); + let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + builder.start_raster_nd(&transform, &[], &[], None).unwrap(); + // start=1, step=2, steps=4 → addresses element 1+(4-1)*2 = 7 which is + // out of range for a source size of 7. + let view = [ViewEntry { + source_axis: 0, + start: 1, + step: 2, + steps: 4, + }]; + let err = builder + .start_band_with_view(StartBandWithViewArgs { + name: None, + dim_names: &["x"], + source_shape: &[7], + view: &view, + data_type: BandDataType::UInt8, + nodata: None, + outdb_uri: None, + outdb_format: None, + }) + .unwrap_err(); + assert!( + err.to_string().contains("out of range"), + "unexpected error: {err}" + ); + } + + #[test] + fn test_as_contiguous_identity_via_start_band_borrows() { // Canonical identity: the row's view list is null, and the read path // synthesises the identity view. Should still hand the underlying // bytes back without copying. @@ -1771,10 +2122,151 @@ mod tests { let buf = band.nd_buffer().unwrap(); assert_eq!(buf.strides, &[3, 1]); assert_eq!(buf.offset, 0); + assert!(buf.is_contiguous()); + assert_eq!(buf.as_contiguous().unwrap(), pixels.as_slice()); + } + + #[test] + fn test_as_contiguous_explicit_identity_view_borrows() { + // Identity expressed *explicitly* through start_band_with_view must be + // indistinguishable to consumers from the null-row identity above — + // same visible shape, same byte strides, same zero-copy borrow. + let mut builder = RasterBuilder::new(1); + let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + builder + .start_raster_nd(&transform, &["x", "y"], &[3, 2], None) + .unwrap(); + let view = crate::view_entries![0:2, 0:3]; + builder + .start_band_with_view(StartBandWithViewArgs { + name: None, + dim_names: &["y", "x"], + source_shape: &[2, 3], + view: view.as_slice(), + data_type: BandDataType::UInt8, + nodata: None, + outdb_uri: None, + outdb_format: None, + }) + .unwrap(); + let pixels: Vec = (0..6).collect(); + builder.band_data_writer().append_value(pixels.clone()); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + + let array = builder.finish().unwrap(); + let rasters = RasterStructArray::new(&array); + let r = rasters.get(0).unwrap(); + let band = r.band(0).unwrap(); - let bytes = band.contiguous_data().unwrap(); - assert!(matches!(bytes, Cow::Borrowed(_))); - assert_eq!(&*bytes, pixels.as_slice()); + assert_eq!(band.shape(), &[2, 3]); + let buf = band.nd_buffer().unwrap(); + assert_eq!(buf.strides, &[3, 1]); + assert_eq!(buf.offset, 0); + assert!(buf.is_contiguous()); + assert_eq!(buf.as_contiguous().unwrap(), pixels.as_slice()); + } + + #[test] + fn test_zero_step_broadcast_2d_is_strided_and_rejected() { + // 2D broadcast: source shape [1, 3], view broadcasts axis 0 four + // times so the visible region is 4×3. Each visible row must equal the + // source's only row. + let mut builder = RasterBuilder::new(1); + let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + builder.start_raster_nd(&transform, &[], &[], None).unwrap(); + let view = [ + ViewEntry { + source_axis: 0, + start: 0, + step: 0, + steps: 4, + }, + ViewEntry { + source_axis: 1, + start: 0, + step: 1, + steps: 3, + }, + ]; + builder + .start_band_with_view(StartBandWithViewArgs { + name: None, + dim_names: &["row", "col"], + source_shape: &[1, 3], + view: &view, + data_type: BandDataType::UInt8, + nodata: None, + outdb_uri: None, + outdb_format: None, + }) + .unwrap(); + builder.band_data_writer().append_value(vec![10u8, 20, 30]); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + + let array = builder.finish().unwrap(); + let rasters = RasterStructArray::new(&array); + let r = rasters.get(0).unwrap(); + let band = r.band(0).unwrap(); + + let buf = band.nd_buffer().unwrap(); + assert_eq!(buf.shape, &[4, 3]); + // Broadcast row stride is 0; column stride is 1 byte per UInt8. + assert_eq!(buf.strides, &[0, 1]); + assert_eq!(buf.offset, 0); + + // A zero stride is not C-order packed, so the buffer is non-contiguous + // and as_contiguous rejects it (repacking lives behind + // RS_EnsureContiguous, https://github.com/apache/sedona-db/issues/899). + assert!(!buf.is_contiguous()); + assert!(buf.as_contiguous().is_err()); + } + + #[test] + fn test_negative_step_strided_reverse_is_rejected() { + // 1D source [0..8] with start=6, step=-2, steps=3 picks every other + // element walking backwards: {6, 4, 2}. + let mut builder = RasterBuilder::new(1); + let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + builder.start_raster_nd(&transform, &[], &[], None).unwrap(); + let view = [ViewEntry { + source_axis: 0, + start: 6, + step: -2, + steps: 3, + }]; + builder + .start_band_with_view(StartBandWithViewArgs { + name: None, + dim_names: &["x"], + source_shape: &[8], + view: &view, + data_type: BandDataType::UInt8, + nodata: None, + outdb_uri: None, + outdb_format: None, + }) + .unwrap(); + builder + .band_data_writer() + .append_value(vec![0u8, 1, 2, 3, 4, 5, 6, 7]); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + + let array = builder.finish().unwrap(); + let rasters = RasterStructArray::new(&array); + let r = rasters.get(0).unwrap(); + let band = r.band(0).unwrap(); + + let buf = band.nd_buffer().unwrap(); + assert_eq!(buf.shape, &[3]); + assert_eq!(buf.strides, &[-2]); + assert_eq!(buf.offset, 6); + + // A negative stride is not C-order packed → non-contiguous, rejected. + assert!(!buf.is_contiguous()); + assert!(buf.as_contiguous().is_err()); } #[test] @@ -1857,6 +2349,187 @@ mod tests { ); } + #[test] + fn test_outer_axis_slice_float32_is_contiguous() { + // Multi-byte dtype outer-axis slice: a 2D view over Float32 that + // takes the leading rows from offset 0 is contiguous-but-not-identity, + // so as_contiguous borrows the source prefix zero-copy. Catches a + // regression where contiguity assumed dtype_size == 1. + let mut builder = RasterBuilder::new(1); + let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + builder + .start_raster_nd(&transform, &["x", "y"], &[3, 2], None) + .unwrap(); + // Slice the outer axis: take rows 0 and 1 of a 3-row source. With + // start=0, step=1, steps=2 over an axis of size 3, the view is not + // identity, but its byte strides are still C-order packed from + // offset 0, so the buffer is contiguous and borrows zero-copy. + let view = crate::view_entries![0:2, 0:3]; + builder + .start_band_with_view(StartBandWithViewArgs { + name: None, + dim_names: &["y", "x"], + source_shape: &[3, 3], // 3x3 source + view: view.as_slice(), + data_type: BandDataType::Float32, + nodata: None, + outdb_uri: None, + outdb_format: None, + }) + .unwrap(); + let source: Vec = (0..9).map(|i| i as f32).collect(); + let source_bytes: Vec = source.iter().flat_map(|f| f.to_le_bytes()).collect(); + builder + .band_data_writer() + .append_value(source_bytes.clone()); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + let array = builder.finish().unwrap(); + let rasters = RasterStructArray::new(&array); + let r = rasters.get(0).unwrap(); + let band = r.band(0).unwrap(); + + // Visible shape is [2, 3]; the first 6 source floats (rows 0,1) are + // exactly the visible pixels — i.e. the first 24 source bytes. + let buf = band.nd_buffer().unwrap(); + assert!(buf.is_contiguous()); + assert_eq!(buf.as_contiguous().unwrap(), &source_bytes[0..24]); + } + + #[test] + fn test_outer_axis_slice_3d_is_contiguous() { + // 3D source [T=3, Y=2, X=3] of UInt8. View slices T to T=1..3 + // (start=1, step=1, steps=2), keeps Y and X identity. The visible + // region is a contiguous source sub-range (offset 6, C-order packed + // strides), so as_contiguous borrows it zero-copy. + let mut builder = RasterBuilder::new(1); + let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + builder + .start_raster_nd(&transform, &["x", "y"], &[3, 2], None) + .unwrap(); + let view = crate::view_entries![1:3, 0:2, 0:3]; + builder + .start_band_with_view(StartBandWithViewArgs { + name: None, + dim_names: &["t", "y", "x"], + source_shape: &[3, 2, 3], + view: view.as_slice(), + data_type: BandDataType::UInt8, + nodata: None, + outdb_uri: None, + outdb_format: None, + }) + .unwrap(); + let source: Vec = (0..18).collect(); + builder.band_data_writer().append_value(source.clone()); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + let array = builder.finish().unwrap(); + let rasters = RasterStructArray::new(&array); + let r = rasters.get(0).unwrap(); + let band = r.band(0).unwrap(); + + // Visible region = source[6..18] (T=1 and T=2 planes). + assert_eq!(band.shape(), &[2, 2, 3]); + let buf = band.nd_buffer().unwrap(); + assert!(buf.is_contiguous()); + assert_eq!(buf.as_contiguous().unwrap(), &source[6..18]); + } + + #[test] + fn test_nd_buffer_permutation_and_slice_combined() { + // 2D source [Y=4, X=3]. View permutes (visible order [X, Y]) and + // slices Y from 1, step 2, steps 2. Expected: + // visible_shape = [3, 2] + // byte_strides = [step_X * stride_X_src, step_Y * stride_Y_src] + // = [1 * 1, 2 * 3] = [1, 6] + // byte_offset = start_X * stride_X_src + start_Y * stride_Y_src + // = 0 * 1 + 1 * 3 = 3 + let mut builder = RasterBuilder::new(1); + let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + builder.start_raster_nd(&transform, &[], &[], None).unwrap(); + let view = [ + ViewEntry { + source_axis: 1, + start: 0, + step: 1, + steps: 3, + }, // X + ViewEntry { + source_axis: 0, + start: 1, + step: 2, + steps: 2, + }, // Y + ]; + builder + .start_band_with_view(StartBandWithViewArgs { + name: None, + dim_names: &["x", "y"], + source_shape: &[4, 3], + view: &view, + data_type: BandDataType::UInt8, + nodata: None, + outdb_uri: None, + outdb_format: None, + }) + .unwrap(); + builder + .band_data_writer() + .append_value((0u8..12).collect::>()); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + let array = builder.finish().unwrap(); + let rasters = RasterStructArray::new(&array); + let r = rasters.get(0).unwrap(); + let band = r.band(0).unwrap(); + let buf = band.nd_buffer().unwrap(); + assert_eq!(buf.shape, &[3, 2]); + assert_eq!(buf.strides, &[1, 6]); + assert_eq!(buf.offset, 3); + + // The permuted+strided layout (strides [1, 6]) is not C-order packed, + // so the buffer is non-contiguous and as_contiguous rejects it. + assert!(!buf.is_contiguous()); + assert!(buf.as_contiguous().is_err()); + } + + #[test] + fn test_nd_buffer_multidim_with_zero_axis() { + // visible_shape contains a zero axis somewhere in the middle. The + // buffer spans zero bytes, so it is trivially contiguous and + // as_contiguous borrows an empty slice; nd_buffer returns the + // zero-element shape. + let mut builder = RasterBuilder::new(1); + let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + builder.start_raster_nd(&transform, &[], &[], None).unwrap(); + let view = crate::view_entries![0:3, 0:0, 0:5]; + builder + .start_band_with_view(StartBandWithViewArgs { + name: None, + dim_names: &["a", "b", "c"], + source_shape: &[3, 4, 5], + view: view.as_slice(), + data_type: BandDataType::UInt8, + nodata: None, + outdb_uri: None, + outdb_format: None, + }) + .unwrap(); + builder.band_data_writer().append_value(vec![0u8; 60]); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); + let array = builder.finish().unwrap(); + let rasters = RasterStructArray::new(&array); + let r = rasters.get(0).unwrap(); + let band = r.band(0).unwrap(); + assert_eq!(band.shape(), &[3, 0, 5]); + let buf = band.nd_buffer().unwrap(); + assert_eq!(buf.shape, &[3, 0, 5]); + assert!(buf.is_contiguous()); + assert!(buf.as_contiguous().unwrap().is_empty()); + } + #[test] fn test_view_null_round_trips_through_arrow_ipc() { // Schema invariant: a band built via start_band_nd serialises with a @@ -1865,8 +2538,9 @@ mod tests { // instead, downstream readers (DuckDB, PyArrow, sedona-py) will // disagree about whether the view is identity. - let mut builder = RasterBuilder::new(1); + let mut builder = RasterBuilder::new(2); let transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0]; + // Raster 0: identity-view band → null view row. builder .start_raster_nd(&transform, &["x", "y"], &[3, 2], None) .unwrap(); @@ -1884,6 +2558,33 @@ mod tests { builder.band_data_writer().append_value(vec![0u8; 6]); builder.finish_band().unwrap(); builder.finish_raster().unwrap(); + // Raster 1: explicit non-identity view → non-null view row. + builder + .start_raster_nd(&transform, &["x"], &[3], None) + .unwrap(); + let view = [ViewEntry { + source_axis: 0, + start: 1, + step: 2, + steps: 3, + }]; + builder + .start_band_with_view(StartBandWithViewArgs { + name: None, + dim_names: &["x"], + source_shape: &[8], + view: &view, + data_type: BandDataType::UInt8, + nodata: None, + outdb_uri: None, + outdb_format: None, + }) + .unwrap(); + builder + .band_data_writer() + .append_value(vec![0u8, 1, 2, 3, 4, 5, 6, 7]); + builder.finish_band().unwrap(); + builder.finish_raster().unwrap(); let array = builder.finish().unwrap(); let schema = Arc::new(Schema::new(vec![Arc::new(arrow_schema::Field::new( @@ -1910,6 +2611,8 @@ mod tests { .downcast_ref::() .unwrap(); + // Reach into the restored bands list and confirm the view list + // preserves null/non-null per row. let bands_list = restored_struct .column(sedona_schema::raster::raster_indices::BANDS) .as_any() @@ -1925,14 +2628,256 @@ mod tests { .as_any() .downcast_ref::() .unwrap(); - assert_eq!(view_list.len(), 1); + assert_eq!(view_list.len(), 2); assert!( view_list.is_null(0), "identity-view band must remain a null view row after IPC round-trip" ); + assert!( + !view_list.is_null(1), + "explicit-view band must remain non-null after IPC round-trip" + ); + // Sanity: read paths still produce the expected visible shapes. let rasters = RasterStructArray::new(restored_struct); let r0 = rasters.get(0).unwrap(); assert_eq!(r0.band(0).unwrap().shape(), &[2, 3]); + let r1 = rasters.get(1).unwrap(); + assert_eq!(r1.band(0).unwrap().shape(), &[3]); + } + + // ---- with_view: public "create a new view into an existing band" ---- + + /// Build a 1-D UInt8 raster with `source_shape=[8]` and bytes + /// `[0, 1, ..., 7]`. Identity-view; used as input to with_view tests. + fn build_1d_identity_raster() -> StructArray { + let mut b = RasterBuilder::new(1); + b.start_raster_nd(&[0.0, 1.0, 0.0, 0.0, 0.0, -1.0], &["x"], &[8], None) + .unwrap(); + b.start_band_nd(None, &["x"], &[8], BandDataType::UInt8, None, None, None) + .unwrap(); + b.band_data_writer() + .append_value((0u8..8).collect::>()); + b.finish_band().unwrap(); + b.finish_raster().unwrap(); + b.finish().unwrap() + } + + #[test] + fn with_view_over_identity_input_produces_expected_visible_bytes() { + // Input is identity over [0..8]. with_view layers a slice + // (start=1, step=2, steps=3) producing visible bytes [1, 3, 5]. + let input_array = build_1d_identity_raster(); + let input_rasters = RasterStructArray::new(&input_array); + let input_raster = input_rasters.get(0).unwrap(); + let input_band = input_raster.band(0).unwrap(); + + let mut b = RasterBuilder::new(1); + b.start_raster_nd(&[0.0, 1.0, 0.0, 0.0, 0.0, -1.0], &["x"], &[3], None) + .unwrap(); + let view = [ViewEntry { + source_axis: 0, + start: 1, + step: 2, + steps: 3, + }]; + b.with_view(WithViewArgs { + name: None, + dim_names: &["x"], + input: input_band.as_ref(), + view: &view, + nodata: None, + outdb_uri: None, + outdb_format: None, + }) + .unwrap(); + b.finish_band().unwrap(); + b.finish_raster().unwrap(); + let out_array = b.finish().unwrap(); + + let out_rasters = RasterStructArray::new(&out_array); + let out_raster = out_rasters.get(0).unwrap(); + let out_band = out_raster.band(0).unwrap(); + assert_eq!(out_band.shape(), &[3]); + // The composed view is a strided slice (start=1, step=2) over the + // original source: byte stride 2, offset 1. Non-contiguous, so + // as_contiguous rejects it; the visible bytes [1, 3, 5] would be + // produced by RS_EnsureContiguous + // (https://github.com/apache/sedona-db/issues/899). + let buf = out_band.nd_buffer().unwrap(); + assert_eq!(buf.strides, &[2]); + assert_eq!(buf.offset, 1); + assert!(!buf.is_contiguous()); + assert!(buf.as_contiguous().is_err()); + // The output's source_shape is inherited from the input. + assert_eq!(out_band.raw_source_shape(), &[8]); + } + + #[test] + fn with_view_chained_composes_into_single_view() { + // Round 1: with_view layers (start=1, step=2, steps=4) → visible + // bytes [1, 3, 5, 7] over source [0..8]. + // Round 2: with_view on that, layering (start=1, step=1, steps=2) → + // visible bytes [3, 5] (input visible indices 1 and 2). + // + // After Round 2 the output's view, when composed against the + // ORIGINAL source [0..8], must give bytes [3, 5] from indices + // 3 and 5. compose_view collapses the chain into one source-space + // view; the test verifies the bytes round-trip end-to-end. + let input_array = build_1d_identity_raster(); + let input_rasters = RasterStructArray::new(&input_array); + let input_raster = input_rasters.get(0).unwrap(); + let input_band = input_raster.band(0).unwrap(); + + // Round 1. + let mut b1 = RasterBuilder::new(1); + b1.start_raster_nd(&[0.0, 1.0, 0.0, 0.0, 0.0, -1.0], &["x"], &[4], None) + .unwrap(); + let v1 = [ViewEntry { + source_axis: 0, + start: 1, + step: 2, + steps: 4, + }]; + b1.with_view(WithViewArgs { + name: None, + dim_names: &["x"], + input: input_band.as_ref(), + view: &v1, + nodata: None, + outdb_uri: None, + outdb_format: None, + }) + .unwrap(); + b1.finish_band().unwrap(); + b1.finish_raster().unwrap(); + let mid_array = b1.finish().unwrap(); + + // Sanity: Round 1 alone produces [1, 3, 5, 7]. + let mid_rasters = RasterStructArray::new(&mid_array); + let mid_raster = mid_rasters.get(0).unwrap(); + let mid_band = mid_raster.band(0).unwrap(); + assert_eq!(mid_band.shape(), &[4]); + // Round 1 view: start=1, step=2 over source [0..8] → strides [2], + // offset 1 (visible bytes would be [1, 3, 5, 7]). Strided, so the + // buffer is non-contiguous. + let mid_buf = mid_band.nd_buffer().unwrap(); + assert_eq!(mid_buf.strides, &[2]); + assert_eq!(mid_buf.offset, 1); + assert!(!mid_buf.is_contiguous()); + + // Round 2: with_view applied on the view-bearing mid_band. + let mut b2 = RasterBuilder::new(1); + b2.start_raster_nd(&[0.0, 1.0, 0.0, 0.0, 0.0, -1.0], &["x"], &[2], None) + .unwrap(); + let v2 = crate::view_entries![1:3]; + b2.with_view(WithViewArgs { + name: None, + dim_names: &["x"], + input: mid_band.as_ref(), + view: v2.as_slice(), + nodata: None, + outdb_uri: None, + outdb_format: None, + }) + .unwrap(); + b2.finish_band().unwrap(); + b2.finish_raster().unwrap(); + let final_array = b2.finish().unwrap(); + + let final_rasters = RasterStructArray::new(&final_array); + let final_raster = final_rasters.get(0).unwrap(); + let final_band = final_raster.band(0).unwrap(); + assert_eq!(final_band.shape(), &[2]); + // compose_view collapses both rounds into a single source-space view: + // visible bytes [3, 5] map to source indices 3 and 5 → strides [2], + // offset 3. Strided, so the composed buffer is non-contiguous. + let final_buf = final_band.nd_buffer().unwrap(); + assert_eq!(final_buf.strides, &[2]); + assert_eq!(final_buf.offset, 3); + assert!(!final_buf.is_contiguous()); + // The composed view still references the original 8-byte source. + assert_eq!(final_band.raw_source_shape(), &[8]); + } + + #[test] + fn with_view_on_outdb_input_produces_outdb_output_with_composed_view() { + // Viewing an OutDb band doesn't need the source bytes — the output + // band is itself OutDb, pointing at the same external resource via + // an inherited outdb_uri, with the composed view describing the + // slice. Loading is deferred to whoever reads the visible bytes. + let mut b = RasterBuilder::new(1); + b.start_raster_nd(&[0.0, 1.0, 0.0, 0.0, 0.0, -1.0], &["x"], &[8], None) + .unwrap(); + b.start_band_nd( + None, + &["x"], + &[8], + BandDataType::UInt8, + None, + Some("s3://bucket/file.tif#band=1"), + Some("geotiff"), + ) + .unwrap(); + b.band_data_writer().append_value([0u8; 0]); // empty → OutDb + b.finish_band().unwrap(); + b.finish_raster().unwrap(); + let input_array = b.finish().unwrap(); + + let input_rasters = RasterStructArray::new(&input_array); + let input_raster = input_rasters.get(0).unwrap(); + let input_band = input_raster.band(0).unwrap(); + assert!(!input_band.is_indb(), "fixture must be OutDb"); + + // Slice the OutDb band's visible axis: start=1, step=2, steps=3. + let mut b2 = RasterBuilder::new(1); + b2.start_raster_nd(&[0.0, 1.0, 0.0, 0.0, 0.0, -1.0], &["x"], &[3], None) + .unwrap(); + let view = [ViewEntry { + source_axis: 0, + start: 1, + step: 2, + steps: 3, + }]; + b2.with_view(WithViewArgs { + name: None, + dim_names: &["x"], + input: input_band.as_ref(), + view: &view, + nodata: None, + outdb_uri: None, + outdb_format: None, + }) + .unwrap(); + b2.finish_band().unwrap(); + b2.finish_raster().unwrap(); + let out_array = b2.finish().unwrap(); + + let out_rasters = RasterStructArray::new(&out_array); + let out_raster = out_rasters.get(0).unwrap(); + let out_band = out_raster.band(0).unwrap(); + + // Output remains OutDb (data column stayed empty). + assert!( + !out_band.is_indb(), + "output of OutDb-input with_view must be OutDb" + ); + // Storage metadata inherited from the input. + assert_eq!(out_band.outdb_uri(), Some("s3://bucket/file.tif#band=1")); + assert_eq!(out_band.outdb_format(), Some("geotiff")); + // Composed view: input had identity view, so composed == supplied + // view entry verbatim. + assert_eq!( + out_band.view(), + &[ViewEntry { + source_axis: 0, + start: 1, + step: 2, + steps: 3, + }] + ); + assert_eq!(out_band.raw_source_shape(), &[8]); + // Visible shape derived from composed view. + assert_eq!(out_band.shape(), &[3]); } } diff --git a/rust/sedona-raster/src/lib.rs b/rust/sedona-raster/src/lib.rs index 77db0c0dd..ebfa0bdcb 100644 --- a/rust/sedona-raster/src/lib.rs +++ b/rust/sedona-raster/src/lib.rs @@ -20,3 +20,4 @@ pub mod array; pub mod builder; pub mod display; pub mod traits; +pub mod view_entries; diff --git a/rust/sedona-raster/src/traits.rs b/rust/sedona-raster/src/traits.rs index 381c3c154..0563c6415 100644 --- a/rust/sedona-raster/src/traits.rs +++ b/rust/sedona-raster/src/traits.rs @@ -15,8 +15,6 @@ // specific language governing permissions and limitations // under the License. -use std::borrow::Cow; - use arrow_schema::ArrowError; use sedona_schema::raster::BandDataType; @@ -27,8 +25,14 @@ use sedona_schema::raster::BandDataType; /// `source_shape` (the natural extent of `buffer`) with its `view` /// (the per-axis `(source_axis, start, step, steps)` slice spec). Stride /// can be zero (broadcast) or negative (reverse iteration), and may not be -/// C-order. Consumers that need a flat row-major buffer should use -/// `BandRef::contiguous_data()` instead. +/// C-order. Consumers that need a flat row-major buffer should check +/// `NdBuffer::is_contiguous()` and borrow via `NdBuffer::as_contiguous()`, +/// which errors on strided layouts rather than allocating. +/// +/// `shape` and `offset` are `i64` (not `u64`) to match the surrounding +/// stride arithmetic (`strides: Vec` to allow negative steps); +/// shape elements are always non-negative — `validate_view` enforces +/// this — and `offset` is non-negative by construction. /// /// Only `buffer` is tied to the producer's lifetime `'a` (it can be tens of /// MBs of pixel data and must not be copied). `shape` and `strides` are @@ -38,33 +42,85 @@ use sedona_schema::raster::BandDataType; #[derive(Debug)] pub struct NdBuffer<'a> { pub buffer: &'a [u8], - pub shape: Vec, + pub shape: Vec, pub strides: Vec, - pub offset: u64, + pub offset: i64, pub data_type: BandDataType, } -/// One per-dimension entry of a band's logical view. Describes how a -/// visible axis maps onto an axis of the underlying source buffer. -/// -/// - `source_axis`: index into the band's `source_shape` that this visible -/// axis reads from. Across a band's full view, `source_axis` values must -/// form a permutation of `0..ndim` — axis-dropping and axis-introducing -/// views are not supported today. -/// - `start`: starting index along the source axis (in elements, not bytes). -/// - `step`: stride between consecutive visible elements along the source -/// axis. `step == 0` means broadcast (the same source element is -/// exposed `steps` times); negative `step` means reverse iteration. -/// - `steps`: number of visible elements along this axis. `steps == 0` is -/// allowed (empty axis). -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub struct ViewEntry { - pub source_axis: i64, - pub start: i64, - pub step: i64, - pub steps: i64, +impl<'a> NdBuffer<'a> { + /// True iff the visible region is packed row-major (C-order) over + /// `buffer`: each axis's byte stride equals the C-order stride implied + /// by `shape` and `data_type` — innermost stride `== byte_size`, each + /// outer stride `== inner stride × inner extent`. + /// + /// `offset` is irrelevant: a contiguous sub-range that starts partway + /// into the buffer (e.g. an outer-axis slice) is still contiguous, so a + /// non-identity view can be contiguous. Broadcast (stride 0), reverse + /// (negative stride), permuted, or gapped layouts are *not* contiguous — + /// the C-order stride equality rejects them. A zero-extent axis (empty + /// visible region) is trivially contiguous. + /// + /// Pure function of `(shape, strides, data_type)`; no I/O, no allocation. + pub fn is_contiguous(&self) -> bool { + if self.shape.len() != self.strides.len() { + return false; + } + // An empty visible region is trivially packed. + if self.shape.contains(&0) { + return true; + } + // Expected C-order byte strides, innermost first: + // stride[i] == byte_size × Π_{j>i} shape[j]. + let mut expected = self.data_type.byte_size() as i64; + for (&dim, &stride) in self.shape.iter().zip(self.strides.iter()).rev() { + if stride != expected { + return false; + } + expected = expected.saturating_mul(dim); + } + true + } + + /// The visible bytes as a packed row-major slice, borrowed zero-copy + /// from `buffer`. `Ok` iff [`is_contiguous`](Self::is_contiguous); + /// otherwise an error directing the caller to materialize via + /// `RS_EnsureContiguous` + /// (). Never copies or + /// allocates — a strided layout returns an error, it is not materialized + /// here. + pub fn as_contiguous(&self) -> Result<&'a [u8], ArrowError> { + if !self.is_contiguous() { + return Err(ArrowError::InvalidArgumentError(format!( + "band view is not contiguous (shape {:?}, strides {:?}); \ + materialize it with RS_EnsureContiguous before contiguous \ + byte access — see https://github.com/apache/sedona-db/issues/899", + self.shape, self.strides + ))); + } + let elements: i64 = self.shape.iter().product(); + let len = elements as usize * self.data_type.byte_size(); + let start = self.offset as usize; + let buffer: &'a [u8] = self.buffer; + match start.checked_add(len) { + Some(end) if end <= buffer.len() => Ok(&buffer[start..end]), + _ => Err(ArrowError::ExternalError(Box::new( + sedona_common::sedona_internal_datafusion_err!( + "contiguous region [{start}, {start}+{len}) is out of bounds \ + for buffer of length {}", + buffer.len() + ), + ))), + } + } } +// `ViewEntry` and `ViewEntries` (with their associated machinery — +// `validate`, `is_identity`, `visible_shape`, `compose`) live in +// `view_entries.rs`. Re-exported here so the existing `crate::traits` +// import path keeps working. +pub use crate::view_entries::ViewEntry; + /// Concrete raster metadata returned by `RasterRef::metadata()`. /// /// Restored from the pre-N-D schema to keep callers that pattern-match on @@ -458,11 +514,15 @@ pub trait RasterRef { /// Trait for accessing a single band/variable within an N-D raster. /// /// This is the consumer interface. Implementations handle storage details -/// Two data access paths: -/// - `contiguous_data()` — flat row-major bytes for consumers that don't need -/// stride awareness (most RS_* functions, GDAL boundary, serialization). -/// - `nd_buffer()` — raw buffer + shape + strides + offset for stride-aware -/// consumers (numpy zero-copy views, Arrow FFI) that want to avoid copies. +/// and view composition transparently. +/// +/// `nd_buffer()` is the sole zero-copy byte accessor: it returns the raw +/// buffer plus shape + strides + offset describing the visible region. +/// Stride-aware consumers (numpy zero-copy views, Arrow FFI) read it +/// directly; consumers that need flat row-major bytes borrow via +/// `NdBuffer::as_contiguous()`, which errors on strided layouts rather than +/// allocating. The trait never materializes a strided view behind the +/// caller's back — repacking is an explicit plan-node concern. pub trait BandRef { // -- Dimension metadata -- @@ -476,7 +536,11 @@ pub trait BandRef { /// `dim_names` order. Derived from `view`: `[v.steps for v in view]`. /// This is what almost all consumers want; use `raw_source_shape()` only /// when you need to address into the raw `data` buffer (e.g. FFI). - fn shape(&self) -> &[u64]; + /// + /// `i64` (not `u64`) to match the surrounding view-machinery + /// arithmetic (strides, offsets); shape elements are always + /// non-negative — `validate_view` enforces this. + fn shape(&self) -> &[i64]; /// **Internal/FFI-only.** Natural C-order extent of the band's /// underlying `data` buffer, indexed by *source* axis (not visible @@ -489,6 +553,10 @@ pub trait BandRef { /// Use this only when you need to index directly into the raw `data` /// bytes (e.g. Arrow C Data Interface, numpy zero-copy views) and you /// also handle `view()` and the byte-stride layout from `nd_buffer()`. + /// + /// Returns `&[u64]` because this is a direct slice into the underlying + /// Arrow `UInt64Array` column; converting would require an allocation + /// per call. Callers that want i64 should cast at the call site. fn raw_source_shape(&self) -> &[u64]; /// Per-visible-dimension view entries describing how the band's @@ -496,10 +564,12 @@ pub trait BandRef { /// See `ViewEntry` for per-entry semantics. fn view(&self) -> &[ViewEntry]; - /// Size of a named dimension (None if doesn't exist) + /// Size of a named dimension (None if doesn't exist). + /// Cast i64 → u64 is lossless: `shape()` entries are non-negative + /// (`validate_view` enforces it). fn dim_size(&self, name: &str) -> Option { let idx = self.dim_index(name)?; - Some(self.shape()[idx]) + Some(self.shape()[idx] as u64) } /// Index of a named dimension (None if doesn't exist) @@ -550,7 +620,7 @@ pub trait BandRef { /// OutDb format — how to interpret the bytes at `outdb_uri` /// (e.g. `"geotiff"`, `"zarr"`). None means in-memory — the band's - /// `contiguous_data()` / `nd_buffer()` is authoritative. + /// `nd_buffer()` is authoritative. fn outdb_format(&self) -> Option<&str> { None } @@ -610,41 +680,20 @@ pub trait BandRef { /// `offset` are computed by composing the view with the source's /// natural C-order byte strides. Strides may be zero (broadcast) or /// negative (reverse iteration). + /// + /// The returned `shape`, `strides`, and `offset` are guaranteed + /// in-bounds for `buffer`: `RasterRef::band()` rejects malformed views, + /// overflowing stride composition, and source-shape/data-column length + /// mismatches at construction. Stride-aware consumers can walk the + /// returned layout without further bound checks against `buffer.len()`. + /// + /// This is the **sole** byte-access method on the trait — it is + /// zero-copy and never allocates. Contiguous-byte consumers call + /// [`NdBuffer::as_contiguous`] on the result (borrow-or-error); + /// materialization of a strided view is an explicit `RS_EnsureContiguous` + /// step, never a transparent allocation behind this interface. fn nd_buffer(&self) -> Result, ArrowError>; - /// Contiguous row-major bytes covering the *visible* region. Zero-copy - /// (`Cow::Borrowed`) when the view is full identity over a C-order - /// source buffer; copies into a new buffer when the view slices, - /// broadcasts, or permutes. Most RS_* functions use this. - fn contiguous_data(&self) -> Result, ArrowError>; - - /// Pre-N-D compatibility shim: raw row-major bytes for InDb, - /// identity-view bands. Panics on anything else (OutDb, non-identity - /// view, or a `contiguous_data` error) — corresponds to main's - /// infallible `BandRef::data() -> &[u8]` which only ever ran against - /// identity-view InDb bands. - fn data(&self) -> &[u8] { - // Compatibility shim: returns the same bytes pre-N-D callers expect - // from `BandRef::data() -> &[u8]`. Delegates to `contiguous_data()` - // so identity-view bands surface the borrowed in-line bytes, - // matching the pre-N-D behavior exactly. View-materialized - // (`Cow::Owned`) bands can't be returned through `&[u8]` because - // the owned `Vec` would die at the end of this call — implementers - // that need view-materialized bytes via `data()` must override and - // anchor the materialized buffer on `Self`; other consumers should - // reach for `contiguous_data()` directly. - match self - .contiguous_data() - .expect("BandRef::data() requires an in-db band with bytes") - { - Cow::Borrowed(b) => b, - Cow::Owned(_) => panic!( - "BandRef::data() can't return view-materialized bytes; \ - use contiguous_data() for sliced/permuted bands" - ), - } - } - /// Nodata value interpreted as f64. /// /// Returns `Ok(None)` when no nodata value is defined, `Ok(Some(f64))` on @@ -883,7 +932,7 @@ mod tests { struct StubBand { dim_names: Vec, source_shape: Vec, - shape: Vec, + shape: Vec, view: Vec, } @@ -894,7 +943,7 @@ mod tests { fn dim_names(&self) -> Vec<&str> { self.dim_names.iter().map(String::as_str).collect() } - fn shape(&self) -> &[u64] { + fn shape(&self) -> &[i64] { &self.shape } fn raw_source_shape(&self) -> &[u64] { @@ -912,17 +961,14 @@ mod tests { fn nd_buffer(&self) -> Result, ArrowError> { unimplemented!("not used in is_2d tests") } - fn contiguous_data(&self) -> Result, ArrowError> { - unimplemented!("not used in is_2d tests") - } } fn band(dims: &[&str], source_shape: &[u64], view: &[ViewEntry]) -> StubBand { - let shape = view.iter().map(|v| v.steps as u64).collect(); + let entries = crate::view_entries::ViewEntries::new(view.to_vec()); StubBand { dim_names: dims.iter().map(|s| (*s).to_string()).collect(), source_shape: source_shape.to_vec(), - shape, + shape: entries.visible_shape(), view: view.to_vec(), } } @@ -989,4 +1035,117 @@ mod tests { let b = band(&["lat", "lon"], &[4, 5], &[ve(0, 0, 1, 4), ve(1, 0, 1, 5)]); assert!(!b.is_2d()); } + + // ---- NdBuffer::is_contiguous / as_contiguous ---- + // + // The contiguity predicate is a pure function of (shape, strides, + // data_type), so it is exercised here directly on NdBuffer literals + // rather than through the full RasterBuilder → reader path. The + // builder/reader tests in builder.rs and array.rs cover that the view + // composition produces these strides/offsets in the first place. + + fn ndbuf<'a>( + buffer: &'a [u8], + shape: &[i64], + strides: &[i64], + offset: i64, + data_type: BandDataType, + ) -> NdBuffer<'a> { + NdBuffer { + buffer, + shape: shape.to_vec(), + strides: strides.to_vec(), + offset, + data_type, + } + } + + #[test] + fn is_contiguous_identity_2d_uint8_borrows_full_buffer() { + let bytes: Vec = (0..6).collect(); + let b = ndbuf(&bytes, &[2, 3], &[3, 1], 0, BandDataType::UInt8); + assert!(b.is_contiguous()); + assert_eq!(b.as_contiguous().unwrap(), &bytes[..]); + } + + #[test] + fn is_contiguous_identity_multibyte_float32() { + // shape [2, 3] Float32 → C-order byte strides [12, 4]. + let bytes = vec![7u8; 24]; + let b = ndbuf(&bytes, &[2, 3], &[12, 4], 0, BandDataType::Float32); + assert!(b.is_contiguous()); + assert_eq!(b.as_contiguous().unwrap().len(), 24); + } + + #[test] + fn is_contiguous_is_offset_agnostic_for_outer_axis_slice() { + // Take rows 1..3 of a [3, 3] UInt8 source: offset 3, packed strides. + // Contiguous-but-not-identity → borrows the sub-range zero-copy. + let bytes: Vec = (0..9).collect(); + let b = ndbuf(&bytes, &[2, 3], &[3, 1], 3, BandDataType::UInt8); + assert!(b.is_contiguous()); + assert_eq!(b.as_contiguous().unwrap(), &bytes[3..9]); + } + + #[test] + fn is_contiguous_false_for_broadcast_zero_stride() { + let bytes = vec![0u8; 3]; + let b = ndbuf(&bytes, &[4, 3], &[0, 1], 0, BandDataType::UInt8); + assert!(!b.is_contiguous()); + assert!(b.as_contiguous().is_err()); + } + + #[test] + fn is_contiguous_false_for_negative_stride() { + let bytes: Vec = (0..8).collect(); + let b = ndbuf(&bytes, &[3], &[-2], 6, BandDataType::UInt8); + assert!(!b.is_contiguous()); + assert!(b.as_contiguous().is_err()); + } + + #[test] + fn is_contiguous_false_for_permuted_inner_stride() { + // shape [3, 2], strides [1, 6] — inner stride 6 != dtype_size. + let bytes = vec![0u8; 12]; + let b = ndbuf(&bytes, &[3, 2], &[1, 6], 0, BandDataType::UInt8); + assert!(!b.is_contiguous()); + assert!(b.as_contiguous().is_err()); + } + + #[test] + fn is_contiguous_false_for_inner_strided_multibyte() { + // UInt16 is 2 bytes; a step-2 view gives byte stride 4 != 2. + let bytes = vec![0u8; 12]; + let b = ndbuf(&bytes, &[3], &[4], 0, BandDataType::UInt16); + assert!(!b.is_contiguous()); + assert!(b.as_contiguous().is_err()); + } + + #[test] + fn is_contiguous_true_for_zero_extent_axis_borrows_empty() { + // A zero-extent axis addresses no bytes — trivially contiguous, + // regardless of the surrounding strides. + let bytes = vec![0u8; 8]; + let b = ndbuf(&bytes, &[3, 0, 5], &[0, 5, 1], 0, BandDataType::UInt8); + assert!(b.is_contiguous()); + assert!(b.as_contiguous().unwrap().is_empty()); + } + + #[test] + fn is_contiguous_false_for_shape_strides_length_mismatch() { + let bytes = vec![0u8; 6]; + let b = ndbuf(&bytes, &[2, 3], &[3], 0, BandDataType::UInt8); + assert!(!b.is_contiguous()); + assert!(b.as_contiguous().is_err()); + } + + #[test] + fn as_contiguous_errors_when_region_exceeds_buffer() { + // Layout is C-order packed, but offset pushes the region past the + // buffer end — the defensive bounds check must reject it. + let bytes = vec![0u8; 4]; + let b = ndbuf(&bytes, &[4], &[1], 2, BandDataType::UInt8); + assert!(b.is_contiguous()); + assert!(b.as_contiguous().is_err()); + } } diff --git a/rust/sedona-raster/src/view_entries.rs b/rust/sedona-raster/src/view_entries.rs new file mode 100644 index 000000000..8411704f6 --- /dev/null +++ b/rust/sedona-raster/src/view_entries.rs @@ -0,0 +1,594 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! View entries — per-axis slice / broadcast / permutation specs for an +//! N-D raster band. The [`ViewEntries`] newtype owns a `Vec` +//! and is the entry point for all view-machinery operations +//! (validation, identity check, visible-shape derivation, composition). + +use arrow_schema::ArrowError; + +/// One per-dimension entry of a band's logical view. Describes how a +/// visible axis maps onto an axis of the underlying source buffer. +/// +/// - `source_axis`: index into the band's `source_shape` that this visible +/// axis reads from. Across a band's full view, `source_axis` values must +/// form a permutation of `0..ndim` — axis-dropping and axis-introducing +/// views are not supported today. +/// - `start`: starting index along the source axis (in elements, not bytes). +/// - `step`: stride between consecutive visible elements along the source +/// axis. `step == 0` means broadcast (the same source element is +/// exposed `steps` times); negative `step` means reverse iteration. +/// - `steps`: number of visible elements along this axis. `steps == 0` is +/// allowed (empty axis). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct ViewEntry { + pub source_axis: i64, + pub start: i64, + pub step: i64, + pub steps: i64, +} + +/// A band's full view, one [`ViewEntry`] per visible axis. +/// +/// Use [`ViewEntries::new`] to wrap an existing `Vec` or +/// [`ViewEntries::identity_for_shape`] to build the canonical +/// no-op view over a given source shape. Operations on the view live +/// as methods on this type — `validate`, `visible_shape`, +/// `is_identity`, `compose`. The newtype gives Vec a name +/// and a single place to attach helpers and tests. +#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub struct ViewEntries(Vec); + +impl ViewEntries { + /// Wrap a pre-built vector of entries. The view is not validated + /// here — call [`Self::validate`] before relying on it. + pub fn new(inner: Vec) -> Self { + Self(inner) + } + + /// Build the canonical identity view over `source_shape`: + /// `[(source_axis=k, start=0, step=1, steps=source_shape[k])]` for + /// each k. Always self-validates. + pub fn identity_for_shape(source_shape: &[i64]) -> Self { + Self( + source_shape + .iter() + .enumerate() + .map(|(k, &s)| ViewEntry { + source_axis: k as i64, + start: 0, + step: 1, + steps: s, + }) + .collect(), + ) + } + + pub fn len(&self) -> usize { + self.0.len() + } + + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + pub fn as_slice(&self) -> &[ViewEntry] { + &self.0 + } + + pub fn iter(&self) -> std::slice::Iter<'_, ViewEntry> { + self.0.iter() + } + + /// Visible shape derived from `[v.steps for v in self]`. `validate` + /// guarantees `steps >= 0`, so callers can treat the result as + /// non-negative after a successful validation. + pub fn visible_shape(&self) -> Vec { + self.0.iter().map(|v| v.steps).collect() + } + + /// True iff this view is the canonical identity over a C-order + /// source buffer: every visible axis `k` is a no-op (no + /// permutation, no slice offset, no stride/reverse, full coverage + /// of the corresponding source axis). + /// + /// Identity views are always contiguous, so the reader borrows the + /// underlying `data` column directly through `NdBuffer::as_contiguous()` + /// with no allocation or copy. Non-identity views may still be + /// contiguous (e.g. an outer-axis slice); strided ones are rejected by + /// `as_contiguous()` and must be repacked via an explicit plan node. + pub fn is_identity(&self, source_shape: &[i64]) -> bool { + if self.0.len() != source_shape.len() { + return false; + } + self.0.iter().enumerate().all(|(k, v)| { + v.source_axis == k as i64 && v.start == 0 && v.step == 1 && v.steps == source_shape[k] + }) + } + + /// Validate against a band's `source_shape`. `Ok(())` iff: + /// + /// - `self.len() == source_shape.len()`. + /// - `source_axis` values across `self` form a permutation of + /// `0..source_shape.len()` (no axis duplicated, none missing). + /// - Every `source_shape[k] >= 0`. + /// - Every `steps >= 0`. + /// - When `steps > 0`: `start ∈ [0, source_shape[source_axis])`, + /// and when `step != 0` the last addressed element + /// `start + (steps - 1) * step` is also in that range. + pub fn validate(&self, source_shape: &[i64]) -> Result<(), ArrowError> { + let ndim = source_shape.len(); + if self.0.len() != ndim { + return Err(ArrowError::InvalidArgumentError(format!( + "view length ({}) must equal source_shape length ({ndim})", + self.0.len() + ))); + } + let mut seen = vec![false; ndim]; + for (k, v) in self.0.iter().enumerate() { + if v.source_axis < 0 || (v.source_axis as usize) >= ndim { + return Err(ArrowError::InvalidArgumentError(format!( + "view[{k}].source_axis = {} is out of range [0, {ndim})", + v.source_axis + ))); + } + let sa = v.source_axis as usize; + if seen[sa] { + return Err(ArrowError::InvalidArgumentError(format!( + "view source_axis values must be a permutation of 0..{ndim}; \ + axis {sa} appears more than once" + ))); + } + seen[sa] = true; + + if v.steps < 0 { + return Err(ArrowError::InvalidArgumentError(format!( + "view[{k}].steps = {} must be >= 0", + v.steps + ))); + } + if source_shape[sa] < 0 { + return Err(ArrowError::InvalidArgumentError(format!( + "source_shape[{sa}] = {} must be >= 0", + source_shape[sa] + ))); + } + if v.steps > 0 { + let s = source_shape[sa]; + if v.start < 0 || v.start >= s { + return Err(ArrowError::InvalidArgumentError(format!( + "view[{k}].start = {} is out of range [0, {s}) for source axis {sa}", + v.start + ))); + } + if v.step != 0 { + // Checked arithmetic: a malicious or corrupted view can't + // wrap (steps-1)*step or start+… into an in-range value and + // bypass the bound check. Any overflow is a validation + // error. + let last = (v.steps - 1) + .checked_mul(v.step) + .and_then(|d| v.start.checked_add(d)) + .ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "view[{k}] last-element index overflows i64 for \ + start={}, step={}, steps={} on source axis {sa}", + v.start, v.step, v.steps + )) + })?; + if last < 0 || last >= s { + return Err(ArrowError::InvalidArgumentError(format!( + "view[{k}] addresses element {last} which is out of range \ + [0, {s}) for source axis {sa}" + ))); + } + } + } + } + Ok(()) + } + + /// Compose `next` (a view spec over `self`'s visible axes) with + /// `self` (a view spec over a band's `source_shape`), producing a + /// single view over the same `source_shape` that represents the + /// result of viewing through `next`. + /// + /// `next[k].source_axis` indexes into `self`'s visible axes + /// (`0..self.len()`), NOT into the underlying source axes. This + /// lets callers (e.g. lazy slicing) describe the new view in terms + /// of what they see, without knowing the input's internal layout. + /// + /// Math: for each output axis `k`, walking through `self` resolves + /// the source axis and translates start/step: + /// - `out.source_axis = self[next.source_axis].source_axis` + /// - `out.start = self.start + next.start * self.step` + /// - `out.step = next.step * self.step` + /// - `out.steps = next.steps` + /// + /// Uses checked arithmetic at every step. The caller must + /// [`validate`] the result against the source shape before use. + /// + /// [`validate`]: Self::validate + pub fn compose(&self, next: &Self) -> Result { + let mut out = Vec::with_capacity(next.0.len()); + for (k, next_entry) in next.0.iter().enumerate() { + if next_entry.source_axis < 0 || (next_entry.source_axis as usize) >= self.0.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "compose: next[{k}].source_axis ({}) is out of range \ + for input with {} visible axes", + next_entry.source_axis, + self.0.len() + ))); + } + let input = &self.0[next_entry.source_axis as usize]; + let step = next_entry.step.checked_mul(input.step).ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "compose: step product overflows i64 at axis {k} \ + (next.step={}, input.step={})", + next_entry.step, input.step + )) + })?; + let start_offset = next_entry.start.checked_mul(input.step).ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "compose: next.start * input.step overflows i64 at axis {k} \ + (next.start={}, input.step={})", + next_entry.start, input.step + )) + })?; + let start = input.start.checked_add(start_offset).ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "compose: composed start overflows i64 at axis {k} \ + (input.start={}, offset={})", + input.start, start_offset + )) + })?; + out.push(ViewEntry { + source_axis: input.source_axis, + start, + step, + steps: next_entry.steps, + }); + } + Ok(Self(out)) + } +} + +impl From> for ViewEntries { + fn from(inner: Vec) -> Self { + Self(inner) + } +} + +impl AsRef<[ViewEntry]> for ViewEntries { + fn as_ref(&self) -> &[ViewEntry] { + &self.0 + } +} + +impl std::ops::Index for ViewEntries { + type Output = ViewEntry; + fn index(&self, i: usize) -> &ViewEntry { + &self.0[i] + } +} + +impl<'a> IntoIterator for &'a ViewEntries { + type Item = &'a ViewEntry; + type IntoIter = std::slice::Iter<'a, ViewEntry>; + fn into_iter(self) -> Self::IntoIter { + self.0.iter() + } +} + +/// Build a [`ViewEntries`] from numpy-style slice syntax. Each `a:b` +/// becomes a `ViewEntry { source_axis: k, start: a, step: 1, steps: b - a }` +/// on the next axis. Step ≠ 1 isn't supported by the macro — use +/// the struct constructor directly for that. +/// +/// ```ignore +/// let v = view_entries![0:4, 1:5]; +/// // → axis 0: start=0, step=1, steps=4 +/// // → axis 1: start=1, step=1, steps=4 +/// ``` +#[macro_export] +macro_rules! view_entries { + ($($start:literal : $stop:literal),* $(,)?) => { + $crate::view_entries::ViewEntries::new({ + #[allow(unused_mut)] + let mut entries: Vec<$crate::view_entries::ViewEntry> = Vec::new(); + // Pre-increment so the last write into `axis` is always read by + // the following push — no dangling assignment on the final entry, + // and no warning on single-entry uses. + #[allow(unused_mut)] + let mut axis: i64 = -1; + $( + axis += 1; + entries.push($crate::view_entries::ViewEntry { + source_axis: axis, + start: $start, + step: 1, + steps: ($stop as i64) - ($start as i64), + }); + )* + entries + }) + }; +} + +#[cfg(test)] +mod tests { + use super::*; + + fn ve(source_axis: i64, start: i64, step: i64, steps: i64) -> ViewEntry { + ViewEntry { + source_axis, + start, + step, + steps, + } + } + + fn entries(v: &[ViewEntry]) -> ViewEntries { + ViewEntries::new(v.to_vec()) + } + + // ---- macro ---- + + #[test] + fn macro_expands_python_style_slice_to_view_entries() { + let v = view_entries![0:4, 1:5]; + assert_eq!(v.len(), 2); + assert_eq!(v[0], ve(0, 0, 1, 4)); + assert_eq!(v[1], ve(1, 1, 1, 4)); + } + + // ---- visible_shape ---- + + #[test] + fn visible_shape_pulls_steps_from_each_entry() { + let v = entries(&[ve(0, 0, 1, 4), ve(1, 0, 1, 5)]); + assert_eq!(v.visible_shape(), vec![4, 5]); + } + + // ---- identity_for_shape ---- + + #[test] + fn identity_for_shape_round_trips_through_is_identity() { + let v = ViewEntries::identity_for_shape(&[4, 5, 6]); + assert!(v.is_identity(&[4, 5, 6])); + } + + // ---- validate ---- + + #[test] + fn validate_accepts_identity() { + let v = entries(&[ve(0, 0, 1, 4), ve(1, 0, 1, 5)]); + v.validate(&[4, 5]).unwrap(); + } + + #[test] + fn validate_rejects_length_mismatch() { + let v = entries(&[ve(0, 0, 1, 4)]); + let err = v.validate(&[4, 5]).unwrap_err(); + assert!(err.to_string().contains("must equal"), "got {err}"); + } + + #[test] + fn validate_rejects_negative_source_axis() { + let v = entries(&[ve(-1, 0, 1, 4)]); + let err = v.validate(&[4]).unwrap_err(); + assert!(err.to_string().contains("source_axis"), "got {err}"); + } + + #[test] + fn validate_rejects_oob_source_axis() { + let v = entries(&[ve(2, 0, 1, 4)]); + let err = v.validate(&[4]).unwrap_err(); + assert!(err.to_string().contains("source_axis"), "got {err}"); + } + + #[test] + fn validate_rejects_duplicate_source_axis() { + let v = entries(&[ve(0, 0, 1, 2), ve(0, 0, 1, 2)]); + let err = v.validate(&[2, 3]).unwrap_err(); + assert!(err.to_string().contains("permutation"), "got {err}"); + } + + #[test] + fn validate_rejects_negative_steps() { + let v = entries(&[ve(0, 0, 1, -1)]); + let err = v.validate(&[4]).unwrap_err(); + assert!(err.to_string().contains("steps"), "got {err}"); + } + + #[test] + fn validate_rejects_negative_start() { + let v = entries(&[ve(0, -1, 1, 1)]); + let err = v.validate(&[4]).unwrap_err(); + assert!(err.to_string().contains("start"), "got {err}"); + } + + #[test] + fn validate_rejects_start_at_source_size() { + let v = entries(&[ve(0, 4, 1, 1)]); + let err = v.validate(&[4]).unwrap_err(); + assert!(err.to_string().contains("start"), "got {err}"); + } + + #[test] + fn validate_rejects_negative_step_underrun() { + // start=0, step=-1, steps=2 addresses element 0 then -1 → underrun. + let v = entries(&[ve(0, 0, -1, 2)]); + let err = v.validate(&[4]).unwrap_err(); + assert!(err.to_string().contains("out of range"), "got {err}"); + } + + #[test] + fn validate_accepts_negative_step_full_reverse() { + // start=3, step=-1, steps=4 addresses 3,2,1,0 — all in range. + let v = entries(&[ve(0, 3, -1, 4)]); + v.validate(&[4]).unwrap(); + } + + #[test] + fn validate_accepts_steps_zero_with_unconstrained_start() { + let v = entries(&[ve(0, 999, 1, 0)]); + v.validate(&[4]).unwrap(); + } + + #[test] + fn validate_steps_one_only_checks_start() { + let v = entries(&[ve(0, 3, 999, 1)]); + v.validate(&[4]).unwrap(); + } + + #[test] + fn validate_step_zero_broadcast_within_bounds() { + let v_ok = entries(&[ve(0, 3, 0, 100)]); + v_ok.validate(&[4]).unwrap(); + let v_bad = entries(&[ve(0, 4, 0, 1)]); + let err = v_bad.validate(&[4]).unwrap_err(); + assert!(err.to_string().contains("start"), "got {err}"); + } + + #[test] + fn validate_permutation_with_slice_ok() { + let v = entries(&[ve(1, 0, 1, 3), ve(0, 1, 1, 1)]); + v.validate(&[2, 3]).unwrap(); + } + + #[test] + fn validate_rejects_i64_overflow_in_last_element() { + let v = entries(&[ve(0, 10, i64::MAX, 3)]); + let err = v.validate(&[100]).unwrap_err(); + assert!(err.to_string().contains("overflow"), "got: {err}"); + } + + #[test] + fn validate_rejects_i64_overflow_in_start_plus_offset() { + let v = entries(&[ve(0, 2, 1, i64::MAX)]); + let err = v.validate(&[100]).unwrap_err(); + assert!(err.to_string().contains("overflow"), "got: {err}"); + } + + // ---- is_identity ---- + + #[test] + fn is_identity_accepts_canonical_identity() { + let v = entries(&[ve(0, 0, 1, 4), ve(1, 0, 1, 5)]); + assert!(v.is_identity(&[4, 5])); + } + + #[test] + fn is_identity_rejects_axis_permutation_even_with_identity_step() { + // start/step/steps all look identity-shaped, but source_axis is + // permuted. Indistinguishable from identity in start/step terms — + // the source_axis check is the only thing that catches it. + let v = entries(&[ve(1, 0, 1, 5), ve(0, 0, 1, 4)]); + assert!(!v.is_identity(&[4, 5])); + } + + #[test] + fn is_identity_rejects_partial_axis_coverage() { + let v = entries(&[ve(0, 0, 1, 3), ve(1, 0, 1, 5)]); + assert!(!v.is_identity(&[4, 5])); + } + + #[test] + fn is_identity_rejects_non_zero_start() { + let v = entries(&[ve(0, 1, 1, 3), ve(1, 0, 1, 5)]); + assert!(!v.is_identity(&[4, 5])); + } + + #[test] + fn is_identity_rejects_non_unit_step() { + let v = entries(&[ve(0, 0, 2, 2), ve(1, 0, 1, 5)]); + assert!(!v.is_identity(&[4, 5])); + } + + #[test] + fn is_identity_rejects_length_mismatch() { + let v = entries(&[ve(0, 0, 1, 4)]); + assert!(!v.is_identity(&[4, 5])); + } + + // ---- compose ---- + + #[test] + fn compose_identity_input_returns_next_unchanged() { + let identity = entries(&[ve(0, 0, 1, 8)]); + let next = entries(&[ve(0, 2, 3, 2)]); + let composed = identity.compose(&next).unwrap(); + assert_eq!(composed.len(), 1); + assert_eq!(composed[0], ve(0, 2, 3, 2)); + } + + #[test] + fn compose_slice_of_slice_collapses_into_one_view() { + // Input view samples source[0..8] at indices 1, 3, 5. + // Next view samples that visible region at index 1 only. + // Composed: samples source axis 0 at index 3 (= 1 + 1*2), step=2, steps=1. + let input = entries(&[ve(0, 1, 2, 3)]); + let next = entries(&[ve(0, 1, 1, 1)]); + let composed = input.compose(&next).unwrap(); + assert_eq!(composed.as_slice(), &[ve(0, 3, 2, 1)]); + } + + #[test] + fn compose_preserves_source_axis_through_permutation() { + // Input transposes axes: visible 0 = source 1, visible 1 = source 0. + // Next picks axis 1 (which is source axis 0), so the composed + // view's sole entry must have source_axis = 0. + let input = entries(&[ve(1, 0, 1, 5), ve(0, 0, 1, 4)]); + let next = entries(&[ve(1, 2, 1, 2)]); + let composed = input.compose(&next).unwrap(); + assert_eq!(composed.len(), 1); + assert_eq!(composed[0].source_axis, 0); + assert_eq!(composed[0].start, 2); + assert_eq!(composed[0].step, 1); + assert_eq!(composed[0].steps, 2); + } + + #[test] + fn compose_step_multiplies() { + // Input step=3, next step=2 → composed step=6. + let input = entries(&[ve(0, 0, 3, 5)]); + let next = entries(&[ve(0, 1, 2, 2)]); + let composed = input.compose(&next).unwrap(); + assert_eq!(composed.as_slice(), &[ve(0, 3, 6, 2)]); + } + + #[test] + fn compose_rejects_next_source_axis_out_of_range() { + let input = entries(&[ve(0, 0, 1, 4)]); + let next = entries(&[ve(2, 0, 1, 1)]); + let err = input.compose(&next).unwrap_err(); + assert!(err.to_string().contains("out of range"), "got {err}"); + } + + #[test] + fn compose_rejects_step_product_overflow() { + let input = entries(&[ve(0, 0, i64::MAX, 2)]); + let next = entries(&[ve(0, 0, 2, 1)]); + let err = input.compose(&next).unwrap_err(); + assert!( + err.to_string().contains("step product overflows"), + "got {err}" + ); + } +} diff --git a/rust/sedona-schema/src/raster.rs b/rust/sedona-schema/src/raster.rs index 436baf8dd..16b3f50b5 100644 --- a/rust/sedona-schema/src/raster.rs +++ b/rust/sedona-schema/src/raster.rs @@ -113,9 +113,11 @@ impl RasterSchema { /// View list type — one entry per dimension in the band's *visible* /// order. Each entry is a `(source_axis, start, step, steps)` quadruple /// describing how the visible axis maps onto the band's source shape. - /// The field is nullable: a null view denotes the identity view + /// The field is nullable: a NULL view denotes the identity view /// `[(i, 0, 1, source_shape[i]) for i in 0..ndim]` and is the canonical - /// representation for any band whose data has not been sliced. See + /// representation for any band whose data has not been sliced. A + /// non-NULL list must have length equal to the band's `source_shape` + /// — empty (zero-length, non-NULL) lists are malformed. See /// `RasterSchema` doc for full semantics. pub fn view_type() -> DataType { DataType::List(FieldRef::new(Field::new( diff --git a/rust/sedona-testing/src/rasters.rs b/rust/sedona-testing/src/rasters.rs index a29b72f9e..99af28ae6 100644 --- a/rust/sedona-testing/src/rasters.rs +++ b/rust/sedona-testing/src/rasters.rs @@ -475,7 +475,17 @@ pub fn assert_raster_equal(raster1: &impl RasterRef, raster2: &impl RasterRef) { "Band outdb band IDs do not match" ); - assert_eq!(band1.data(), band2.data(), "Band data does not match"); + assert_eq!( + band1.is_indb(), + band2.is_indb(), + "Band storage (in/out-db) does not match" + ); + if band1.is_indb() { + // Identity-view InDb fixtures: compare the packed visible bytes. + let b1 = band1.nd_buffer().unwrap().as_contiguous().unwrap(); + let b2 = band2.nd_buffer().unwrap().as_contiguous().unwrap(); + assert_eq!(b1, b2, "Band data does not match"); + } } } @@ -513,7 +523,8 @@ mod tests { assert_eq!(band_metadata.outdb_url(), None); assert_eq!(band_metadata.outdb_band_id(), None); - let band_data = band.data(); + let ndb = band.nd_buffer().unwrap(); + let band_data = ndb.as_contiguous().unwrap(); let expected_pixel_count = (i + 1) * (i + 2); // width * height // Convert raw bytes back to u16 values for comparison @@ -550,7 +561,8 @@ mod tests { let band_metadata = band.metadata(); assert_eq!(band_metadata.data_type().unwrap(), BandDataType::UInt8); assert_eq!(band_metadata.storage_type().unwrap(), StorageType::InDb); - let band_data = band.data(); + let ndb = band.nd_buffer().unwrap(); + let band_data = ndb.as_contiguous().unwrap(); assert_eq!(band_data.len(), 64 * 64); // 4096 pixels } } @@ -619,7 +631,10 @@ mod tests { let b1 = bands.band(1).unwrap(); assert_eq!(b1.metadata().data_type().unwrap(), BandDataType::UInt8); assert_eq!(b1.metadata().nodata_value(), Some(&[255u8][..])); - assert_eq!(b1.data(), &[1u8, 2, 3, 4]); + assert_eq!( + b1.nd_buffer().unwrap().as_contiguous().unwrap(), + &[1u8, 2, 3, 4] + ); // Band 2: UInt16, nodata=0 let b2 = bands.band(2).unwrap();