Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
d27e8f4
feat(raster-zarr): sedona-raster-zarr crate + sd_read_zarr UDTF
james-willis May 19, 2026
3fd6420
feat(python/sedonadb): add sd_read_zarr Python wrapper
james-willis May 19, 2026
ca47786
feat(raster-zarr): review fixes for sd_read_zarr
james-willis May 19, 2026
1ad69cf
test(raster-zarr): migrate fixtures off deprecated store_chunk_elements
james-willis May 19, 2026
ccbb742
refactor(raster-zarr): rename sd_read_zarr `indb` option to `load_eager`
james-willis May 20, 2026
7a391cb
fix(raster-zarr): review-round small fixes from PR #858
james-willis May 20, 2026
ecf12a8
refactor(raster-zarr): collapse loader to a single OutDb path
james-willis May 20, 2026
52dc4ce
refactor(raster-zarr): plugin architecture — streaming reader + Exter…
james-willis May 20, 2026
8b1d180
feat(sedonadb-zarr): new Python plugin package wiring zarr support
james-willis May 20, 2026
490591a
feat(sedonadb-zarr): expose ExternalFormatSpec via Python ZarrFormatSpec
james-willis May 20, 2026
a637399
test(sedonadb-zarr): inspect raster cell as Python dict via as_py()
james-willis May 20, 2026
06e479b
ci: fix three CI failures from the plugin refactor
james-willis May 20, 2026
b681acc
fix: gate sedonadb's pymodule behind `extension-module` feature
james-willis May 20, 2026
940cedf
docs(sedonadb-zarr): README accurately reflects the shipped surface
james-willis May 20, 2026
a0c7454
refactor(sedona-raster-zarr): drop the `zarr` feature gate
james-willis May 20, 2026
ce0bdea
chore(sedona): drop redundant comment about zarr plugin
james-willis May 21, 2026
4423128
style(sedonadb): collapse cfg_attr to one line per rustfmt
james-willis May 21, 2026
4462b94
ci: don't leak sedonadb's s2geography feature into the plugin build
james-willis May 21, 2026
21c6383
fix: move sedonadb's extension-module out of default features
james-willis May 21, 2026
4e97bd4
ci: pass sedonadb's `extension-module` feature in maturin builds
james-willis May 21, 2026
b441a04
fix: drop self-referential sedonadb workspace dep, inline the path
james-willis May 21, 2026
ea3a540
fix: rename sedonadb-zarr's pymodule to `_zarr_lib`
james-willis May 21, 2026
61387ad
fix(sedonadb-zarr): cross-extension UDTF handoff via PyCapsule
james-willis May 21, 2026
cf2fe80
fix(sedonadb): drop mimalloc from defaults to coexist with plugins
james-willis May 21, 2026
44b0db6
test(sedonadb-zarr): drop premature read_format tests
james-willis May 21, 2026
3621555
feat(sedona-datasource): single-object table provider for directory f…
james-willis May 21, 2026
3b63088
fix(sedonadb): harden plugin extension surface from review
james-willis May 21, 2026
1673608
refactor(sedona-datasource): resolve ObjectStore for single-object pa…
james-willis May 21, 2026
171e4a3
chore(sedona-raster-zarr): trim scope and dependencies from PR review
james-willis May 21, 2026
583abde
refactor(sedonadb): UDTF handoff via datafusion-ffi; restore mimalloc
james-willis May 21, 2026
5a45265
fix(sedonadb): keep FFI codec's TaskContextProvider alive for session…
james-willis May 21, 2026
e678bbc
chore(sedonadb-zarr): pin sedonadb>=0.4.0
james-willis May 21, 2026
3289cdf
chore(sedona-raster-zarr): drop Windows MinGW blosc gate
james-willis May 21, 2026
008a332
test(sedonadb-zarr): parameterise smoke test over numpy dtypes
james-willis May 21, 2026
ee77599
chore(sedonadb): drop extension-module feature and `_zarr_lib` naming
james-willis May 21, 2026
8e44f57
chore(sedonadb,plugin): trim dead scaffolding and over-verbose comments
james-willis May 21, 2026
16eafd4
chore(sedonadb-zarr): defer `sd_read_zarr` SQL UDTF to a follow-up PR
james-willis May 21, 2026
b0a160b
fix(ci): unblock docs job + drop stray sedona-raster dev-dep
james-willis May 22, 2026
df700f8
fix(ci): mark both internal Python cdylibs as `doc = false`
james-willis May 22, 2026
01f4d88
chore(ci): trim stale `MATURIN_PEP517_ARGS` comment in python.yml
james-willis May 22, 2026
38d8465
feat(sedonadb): add `SedonaContext.read_format()` for plugin formats
james-willis May 22, 2026
03dc461
chore: remove unreachable code surfaces from post-strip PR
james-willis May 22, 2026
d809871
chore(sedonadb): drop dead hasattr fallback for list_single_object
james-willis May 22, 2026
0cf9d64
fix(ci): add TYPE_CHECKING import for `ExternalFormatSpec`
james-willis May 22, 2026
1432842
chore: post-merge audit sweep (validate options, trim public surface,…
james-willis May 22, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
587 changes: 584 additions & 3 deletions Cargo.lock

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ members = [
"rust/sedona-raster",
"rust/sedona-raster-functions",
"rust/sedona-raster-gdal",
"rust/sedona-raster-zarr",
"rust/sedona-schema",
"rust/sedona-spatial-join",
"rust/sedona-spatial-join-geography",
Expand Down Expand Up @@ -133,6 +134,9 @@ tokio = { version = "1.52", features = ["macros", "rt", "sync"] }
url = "2.5.7"
wkb = "0.9.2"
wkt = "0.14.0"
zarrs = { version = "0.23", default-features = false }
zarrs_filesystem = "0.3"
zarrs_object_store = "0.6"

# Workspace path dependencies for internal crates
sedona = { version = "0.4.0", path = "rust/sedona" }
Expand All @@ -150,6 +154,7 @@ sedona-pointcloud = { version = "0.4.0", path = "rust/sedona-pointcloud" }
sedona-raster = { version = "0.4.0", path = "rust/sedona-raster" }
sedona-raster-functions = { version = "0.4.0", path = "rust/sedona-raster-functions" }
sedona-raster-gdal = { version = "0.4.0", path = "rust/sedona-raster-gdal" }
sedona-raster-zarr = { version = "0.4.0", path = "rust/sedona-raster-zarr" }
sedona-schema = { version = "0.4.0", path = "rust/sedona-schema" }
sedona-spatial-join = { version = "0.4.0", path = "rust/sedona-spatial-join" }
sedona-spatial-join-gpu = { version = "0.4.0", path = "rust/sedona-spatial-join-gpu" }
Expand Down
67 changes: 66 additions & 1 deletion python/sedonadb/python/sedonadb/functions/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
# under the License.

import json
from typing import Optional, Literal, Union, Tuple, Iterable
from typing import Optional, Literal, Union, Tuple, Iterable, List

from sedonadb.dataframe import DataFrame
from sedonadb.utility import sedona # noqa: F401
Expand Down Expand Up @@ -112,3 +112,68 @@ def sd_random_geometry(
args = {k: v for k, v in args.items() if v is not None}

return self._ctx.sql(f"SELECT * FROM sd_random_geometry('{json.dumps(args)}')")

def sd_read_zarr(
    self,
    uri: str,
    *,
    indb: Optional[bool] = None,
    rows_per_batch: Optional[int] = None,
    num_partitions: Optional[int] = None,
    arrays: Optional[List[str]] = None,
) -> DataFrame:
    """
    Read a Zarr group as a DataFrame of N-D rasters.

    Returns a single-column DataFrame ``raster: Raster`` with one row per
    chunk position in the Zarr group's chunk grid. Each row's bands are
    the corresponding chunks of each array in the group. All ``RS_*``
    UDFs operate on the column unchanged.

    Only local filesystem stores are supported (``file://`` URIs or
    bare paths).

    Parameters
    ----------
    uri : str
        Zarr group URI. ``file:///path/to/foo.zarr`` or a bare local path.
        Single quotes in the value are escaped before it is embedded in
        the generated SQL.
    indb : bool, optional
        ``True`` (default) materializes every chunk's bytes into the
        Arrow ``data`` column eagerly. ``False`` emits chunk-anchor
        URIs only; byte resolution depends on the OutDb resolver
        being registered (follow-up PR).
    rows_per_batch : int, optional
        Chunks per ``RecordBatch`` (default 1024).
    num_partitions : int, optional
        Scan partitions. Only ``1`` is supported; ``> 1`` errors.
    arrays : list of str, optional
        Names of arrays in the group to read. By default every
        multi-dimensional array is read; 1-D arrays (typical xarray
        coord variables) are auto-skipped. Passing an explicit list
        reads exactly those arrays. 1-D arrays are always rejected
        (a raster band needs at least 2 dimensions); naming one
        explicitly errors. Unknown names also error.

    Returns
    -------
    DataFrame
        The result of ``SELECT * FROM sd_read_zarr(...)`` on this context.

    Examples
    --------
    >>> sd = sedona.db.connect()
    >>> sd.funcs.table.sd_read_zarr("file:///path/to/datacube.zarr")  # doctest: +SKIP
    >>> sd.funcs.table.sd_read_zarr(
    ...     "file:///path/to/datacube.zarr",
    ...     arrays=["temperature", "pressure"],
    ... )  # doctest: +SKIP
    """

    def _sql_literal(text: str) -> str:
        # A single quote inside a SQL string literal is escaped by doubling
        # it. Without this, a path or array name containing "'" would end
        # the literal early and corrupt (or inject into) the statement.
        return "'" + text.replace("'", "''") + "'"

    options = {
        "indb": indb,
        "rows_per_batch": rows_per_batch,
        "num_partitions": num_partitions,
        "arrays": arrays,
    }
    # Only forward options the caller actually set, so the engine applies
    # its own defaults for the rest.
    options = {k: v for k, v in options.items() if v is not None}

    # str() keeps path-like objects working, as the original f-string did.
    sql_args = [_sql_literal(str(uri))]
    if options:
        sql_args.append(_sql_literal(json.dumps(options)))

    return self._ctx.sql(f"SELECT * FROM sd_read_zarr({', '.join(sql_args)})")
32 changes: 32 additions & 0 deletions python/sedonadb/tests/test_funcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,35 @@ def test_random_geometry(con):

# Ensure the output is reproducible
assert df.to_arrow_table() == df.to_arrow_table()


def test_read_zarr(con, tmp_path):
    """Check that the Python ``sd_read_zarr`` wrapper threads its arguments through.

    The binding itself is exercised by the Rust-side integration tests, so
    this test skips cleanly when the optional ``zarr`` (or ``numpy``)
    Python library is not installed.
    """
    import pytest

    zarr = pytest.importorskip("zarr")
    np = pytest.importorskip("numpy")

    # Fixture: a 2x2 UInt8 array split into two chunks, with
    # dim_names=["y", "x"], inside a group rooted at the temp path. This is
    # the minimal shape sd_read_zarr expects (2-D array with [y, x] suffix).
    group = zarr.open_group(str(tmp_path), mode="w")
    temperature = group.create_array(
        "temperature",
        shape=(2, 2),
        chunks=(1, 2),
        dtype="uint8",
        dimension_names=["y", "x"],
    )
    temperature[:] = np.array([[10, 11], [20, 21]], dtype=np.uint8)

    uri = f"file://{tmp_path}"

    # First the default-mode (InDb) read, where every chunk is materialized
    # into the Arrow `data` column; then a read with rows_per_batch=1 to
    # confirm options thread through. Both must yield one row per chunk.
    for options in ({}, {"rows_per_batch": 1}):
        df = con.funcs.table.sd_read_zarr(uri, **options)
        assert df.count() == 2
Comment thread
james-willis marked this conversation as resolved.
Outdated
53 changes: 53 additions & 0 deletions rust/sedona-raster-zarr/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
[package]
name = "sedona-raster-zarr"
version.workspace = true
license.workspace = true
keywords.workspace = true
categories.workspace = true
homepage.workspace = true
repository.workspace = true
description.workspace = true
readme.workspace = true
edition.workspace = true
rust-version.workspace = true

# NOTE(review): presumably allowed because the error types returned by this
# crate's `Result`s (e.g. `ArrowError`) exceed clippy's size threshold — confirm.
[lints.clippy]
result_large_err = "allow"

[dependencies]
arrow-array = { workspace = true }
arrow-schema = { workspace = true }
datafusion-common = { workspace = true }
log = { workspace = true }
sedona-common = { workspace = true }
sedona-raster = { workspace = true }
sedona-schema = { workspace = true }
serde_json = { workspace = true }
# The workspace declares `zarrs` with `default-features = false`, so every
# feature this crate relies on has to be opted into explicitly here.
zarrs = { workspace = true, features = ["filesystem", "gzip", "zstd", "crc32c", "sharding", "transpose"] }
zarrs_filesystem = { workspace = true }

# `blosc` is gated off Windows: c-blosc (statically linked) bundles its own
# `pthread_create` / `pthread_cond_*` symbols, which conflict with rtools45's
# `libpthread.a` during the MinGW link of the R `sedonadb.dll`. Non-Windows
# targets get the full blosc-compressed Zarr reading capability.
[target.'cfg(not(target_os = "windows"))'.dependencies]
Comment thread
james-willis marked this conversation as resolved.
Outdated
zarrs = { workspace = true, features = ["blosc"] }

[dev-dependencies]
tempfile = { workspace = true }
66 changes: 66 additions & 0 deletions rust/sedona-raster-zarr/src/dtype.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Mapping between Zarr datatypes and SedonaDB's `BandDataType`.
//!
//! zarrs 0.23 models `DataType` as a wrapper around `Arc<dyn DataTypeTraits>`,
//! so we discriminate via the type-erased `is::<T>()` check rather than
//! pattern-matching an enum.

use arrow_schema::ArrowError;
use sedona_schema::raster::BandDataType;
use zarrs::array::data_type::{
BoolDataType, Float32DataType, Float64DataType, Int16DataType, Int32DataType, Int64DataType,
Int8DataType, UInt16DataType, UInt32DataType, UInt64DataType, UInt8DataType,
};
use zarrs::array::DataType as ZarrDataType;

/// Translate a Zarr `DataType` into the corresponding SedonaDB `BandDataType`.
///
/// Each fixed-width integer and float type maps to the band type of the same
/// name. Bool is mapped to `UInt8`, which is lossless because Zarr stores
/// bools as one byte each, matching our representation.
///
/// # Errors
///
/// Returns `ArrowError::NotYetImplemented` for every other Zarr datatype
/// (variable-length, complex, extended-precision, ...): these have no numeric
/// counterpart in `BandDataType` today.
pub fn zarr_to_band_data_type(dt: &ZarrDataType) -> Result<BandDataType, ArrowError> {
    // `DataType` is type-erased, so each candidate is probed with `is::<T>()`
    // rather than matched as an enum variant.
    let band = if dt.is::<BoolDataType>() || dt.is::<UInt8DataType>() {
        // Bool shares the one-byte unsigned representation.
        BandDataType::UInt8
    } else if dt.is::<Int8DataType>() {
        BandDataType::Int8
    } else if dt.is::<Int16DataType>() {
        BandDataType::Int16
    } else if dt.is::<UInt16DataType>() {
        BandDataType::UInt16
    } else if dt.is::<Int32DataType>() {
        BandDataType::Int32
    } else if dt.is::<UInt32DataType>() {
        BandDataType::UInt32
    } else if dt.is::<Int64DataType>() {
        BandDataType::Int64
    } else if dt.is::<UInt64DataType>() {
        BandDataType::UInt64
    } else if dt.is::<Float32DataType>() {
        BandDataType::Float32
    } else if dt.is::<Float64DataType>() {
        BandDataType::Float64
    } else {
        return Err(ArrowError::NotYetImplemented(format!(
            "Zarr datatype {dt:?} has no BandDataType mapping yet"
        )));
    };

    Ok(band)
}
Loading
Loading