Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
- [issue/405](https://github.com/podaac/l2ss-py/issues/405) Chunk logic now handles data without dimensions in root node.
- Optimize performance of ScanTime variable computation
- [issue/435](https://github.com/podaac/l2ss-py/issues/435) Extra unneeded dimension scales are no longer included in output
Comment thread
ocsmit marked this conversation as resolved.
### Security
- Updated dependency libraries

Expand Down
86 changes: 0 additions & 86 deletions podaac/subsetter/datatree_subset.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,6 @@ def process_node(
and (indexers.keys() - dataset[variable_name].dims)
and set(indexers.keys()).intersection(dataset[variable_name].dims)
):

missing_dim = sorted(indexers.keys() - dataset[variable_name].dims)[0]
var_indexers = {
dim_name: dim_value
Expand Down Expand Up @@ -800,91 +799,6 @@ def tree_get_spatial_bounds(
return np.array([[min(min_lons), max(max_lons)], [min(min_lats), max(max_lats)]])


def get_vars_with_paths(tree: DataTree) -> list[str]:
    """
    List the full path of every data variable held in a DataTree.

    Every node of the tree is visited and the names found in that node's
    ``ds.data_vars`` are prefixed with the path of the node. Only data
    variables are collected.

    Parameters
    ----------
    tree : DataTree
        The input DataTree

    Returns
    -------
    List[str]
        Sorted list of variable paths in format '/group/var', or '/var'
        for variables that live in the root node.
    """
    found = []

    # Work list of (node, path prefix) pairs. The root carries an empty
    # prefix so that its variables come out as '/var'.
    pending = [(tree, "")]
    while pending:
        node, prefix = pending.pop()

        # Data variables owned by this node
        found.extend(f"{prefix}/{var_name}" for var_name in node.ds.data_vars)

        # Queue the children, extending the prefix with the child name
        for child_name in node.children:
            pending.append((node[child_name], f"{prefix}/{child_name}"))

    # Sorting makes the result independent of the traversal order
    return sorted(found)


def drop_vars_by_path(tree: DataTree, var_paths: str | list[str]) -> DataTree:
    """
    Remove variables from a DataTree, addressing each one by its path.

    The tree is modified in place: the dataset of every affected node is
    replaced by a copy without the requested variable. Variables or groups
    that do not exist are silently skipped.

    Parameters
    ----------
    tree : DataTree
        The input DataTree
    var_paths : str or List[str]
        One path, or a list of paths, of the variables to drop.
        Examples:
        - '/var1'  # root level variable
        - '/group1/var1'  # variable in group1
        - '/group1/subgroup/var1'  # variable in nested group

    Returns
    -------
    DataTree
        The same ``tree`` object that was passed in, after the drops
    """
    # Accept a bare string as a one-element list
    paths = [var_paths] if isinstance(var_paths, str) else var_paths

    for path in paths:
        # Last component is the variable name, anything before it is the group
        *group_parts, var_name = path.strip("/").split("/")

        if not group_parts:
            # No group component: the variable lives in the root node
            tree.ds = tree.ds.drop_vars([var_name], errors="ignore")
            continue

        try:
            node = tree["/".join(group_parts)]
            node.ds = node.ds.drop_vars([var_name], errors="ignore")
        except KeyError:
            # Best effort: a path that cannot be resolved is not an error
            pass

    return tree


def prepare_basic_encoding(datasets: DataTree, time_encoding) -> dict:
"""
Prepare basic encoding dictionary for DataTree organized by groups.
Expand Down
24 changes: 10 additions & 14 deletions podaac/subsetter/subset.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,6 @@ def subset_with_bbox(
iterator = zip(lat_var_names, lon_var_names, time_var_names)

for lat_var_name, lon_var_name, time_var_name in iterator:

lat_path = file_utils.get_path(lat_var_name)
lon_path = file_utils.get_path(lon_var_name)

Expand Down Expand Up @@ -376,13 +375,11 @@ def subset(

if args["decode_times"]:
with xr.open_datatree(file_to_subset, decode_times=False) as dataset:

lat_var_names, lon_var_names, time_var_names = coordinate_utils.get_coordinate_variable_names(
dataset=dataset, lat_var_names=lat_var_names, lon_var_names=lon_var_names, time_var_names=time_var_names
)

for time in time_var_names:

time_var = dataset[time]
var_name = os.path.basename(time)
group_path = os.path.dirname(time)
Expand All @@ -408,7 +405,6 @@ def subset(
args["decode_times"] = False

with xr.open_datatree(file_to_subset, **args) as dataset:

if hdf_type:
dataset = hdf_utils.rename_phony_dims(dataset)

Expand Down Expand Up @@ -439,21 +435,21 @@ def subset(
if hdf_type and (min_time or max_time):
dataset, _ = tree_time_converting.convert_to_datetime(dataset, time_var_names, hdf_type)

all_vars = variables_utils.get_all_variable_names_from_dtree(dataset)
all_vars = variables_utils.get_vars_with_paths(dataset)
if variables:
# Drop variables that aren't explicitly requested, except lat_var_name and
# lon_var_name which are needed for subsetting
normalized_variables = [f"/{s.replace('__', '/').lstrip('/')}".upper() for s in variables]
# add the root "/" to each variable path if it is not already present so
# that matching against `all_vars` works correctly
normalized_variables = variables_utils.normalize_candidate_paths_against_dtree(variables, all_vars)

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we are not flattening data anymore, I don't think there is a need to check for __ in place of /.

keep_variables = normalized_variables + lon_var_names + lat_var_names + time_var_names
keep_variables = variables_utils.normalize_candidate_paths_against_dtree(keep_variables, all_vars)

all_data_variables = datatree_subset.get_vars_with_paths(dataset)
drop_variables = [
var for var in all_data_variables if var not in keep_variables and var.upper() not in keep_variables
]
keep_coords = coordinate_utils.collect_coordinate_variables(dataset, keep_variables)

keep_set = set(keep_variables) | keep_coords

drop_variables = all_vars - keep_set

dataset = datatree_subset.drop_vars_by_path(dataset, drop_variables)
variables_utils.drop_vars_by_path(dataset, drop_variables)

lon_var_names = variables_utils.normalize_candidate_paths_against_dtree(lon_var_names, all_vars)
lat_var_names = variables_utils.normalize_candidate_paths_against_dtree(lat_var_names, all_vars)
Expand Down
83 changes: 83 additions & 0 deletions podaac/subsetter/utils/coordinate_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,89 @@ def get_coordinate_variable_names(
return lat_var_names, lon_var_names, time_var_names


def find_coordinate_origin_node(
    tree: xr.DataTree,
    node_path: str,
    coord_name: str,
) -> str | None:
    """
    Locate the DataTree node in which a coordinate is actually defined.

    Starting at ``node_path`` and moving up through its parents, each
    node's own (non-inherited) coordinates are checked for ``coord_name``.
    The path of the first node that holds the coordinate is returned.

    Parameters
    ----------
    tree : DataTree
        The root DataTree.
    node_path : str
        The path of the node to start searching from (e.g. "/group/subgroup").
    coord_name : str
        The name of the coordinate to locate.

    Returns
    -------
    str or None
        The path string of the node that defines the coordinate, or
        ``None`` if neither the start node nor any of its parents defines it.

    Examples
    --------
    >>> origin = find_coordinate_origin_node(dt, "/group/subgroup", "time")
    >>> # Returns "/" if time is defined at the root
    """
    start = tree[node_path]

    # The start node itself comes first, followed by its parents from the
    # closest one up to the root.
    lineage = (start, *start.parents)

    # to_dataset(inherit=False) is required so that coordinates inherited
    # from a parent are not reported as belonging to the child.
    return next(
        (
            candidate.path
            for candidate in lineage
            if coord_name in candidate.to_dataset(inherit=False).coords
        ),
        None,
    )
Comment thread
ocsmit marked this conversation as resolved.


def collect_coordinate_variables(tree: xr.DataTree, variables: list[str]) -> set[str]:
    """
    Collect and construct the full set of paths to coordinate
    variables (if any) which each variable depends on.

    Parameters
    ----------
    tree : DataTree
        The root DataTree.
    variables : list[str]
        Paths of the variables whose coordinates should be collected
        (e.g. "/group/var"). Paths that cannot be found in ``tree``
        are skipped.

    Returns
    -------
    set[str]
        A set containing the paths to the coordinate variables, each in
        the format '/group/coord' or '/coord' for root level.
    """
    keep_coords: set[str] = set()
    for var in variables:
        try:
            var_node = tree[var]
        except KeyError:
            # unknown variable: nothing to collect for it
            continue

        # Get the prefix (group) path. rpartition yields "" when the name
        # contains no "/", whereas rsplit would hand back the variable name
        # itself and the lookup below would start at the variable rather
        # than at its group.
        node_path = var.rpartition("/")[0]
        for leaf in var_node.coords:
            # want to find where the coordinate variable
            # actually lives, continuing if none present
            owning_node = find_coordinate_origin_node(tree, node_path, leaf)
            if not owning_node:
                continue
            # strip root "/", otherwise we end up with something like "//corner"
            prefix = "" if owning_node == "/" else owning_node
            keep_coords.add(f"{prefix}/{leaf}")
    return keep_coords
Comment thread
ocsmit marked this conversation as resolved.


def _compute_utc_name(dataset: xr.Dataset) -> str | None:
"""
Get the name of the UTC variable, if it is present, to determine the origin time
Expand Down
70 changes: 49 additions & 21 deletions podaac/subsetter/utils/variables_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,35 +9,63 @@
import xarray as xr


def get_vars_with_paths(tree: xr.DataTree) -> set[str]:
    """
    Get all variables and coordinates with their full paths from a DataTree

    Every node in ``tree.subtree`` contributes the names of its data
    variables and of the coordinates it defines itself; coordinates that
    are only inherited from a parent are not repeated under the child.

    Parameters
    ----------
    tree : DataTree
        The input DataTree

    Returns
    -------
    set[str]
        Unordered set of variable and coordinate paths in format
        '/group/var' or '/var' for root level.

    Examples
    --------
    >>> paths = get_vars_with_paths(tree)
    >>> sorted(paths)  # a set has no stable order, sort before displaying
    ['/group1/var1', '/group1/var2', '/time', '/var1', '/var2']
    """
    paths: set[str] = set()
    for node in tree.subtree:
        # The root path is "/" and group paths carry no trailing slash;
        # normalising to exactly one trailing "/" avoids "//var" at the root.
        prefix = node.path.rstrip("/") + "/"
        # inherit=False restricts the coords to those defined on this node
        own_coords = node.to_dataset(inherit=False).coords
        paths.update(f"{prefix}{name}" for name in set(node.data_vars) | set(own_coords))
    return paths


def drop_vars_by_path(tree: xr.DataTree, var_paths: str | list[str] | set[str]) -> None:
    """
    Drop variables *in place* from a DataTree using paths in the
    format '/group/var' or '/var' for root level.

    Paths that do not match any variable in the tree are ignored.

    Parameters
    ----------
    tree : DataTree
        The input DataTree
    var_paths : str or list[str] or set[str]
        Paths to variables to drop in format '/group/var' or '/var' for root level
        Examples:
        - '/var1'  # root level variable
        - '/group1/var1'  # variable in group1
        - '/group1/subgroup/var1'  # variable in nested group

    Returns
    -------
    None
        The tree is modified in place; nothing is returned.
    """
    # guard for single string being passed (set("abc") would split it
    # into individual characters)
    drop: set[str] = {var_paths} if isinstance(var_paths, str) else set(var_paths)
    if not drop:
        # nothing requested, leave the tree untouched
        return

    for node in tree.subtree:
        # same prefix convention as the path listing: "/" for the root,
        # "/group/" for any other node
        prefix = node.path.rstrip("/") + "/"
        to_drop = [name for name in node.variables if f"{prefix}{name}" in drop]
        if to_drop:
            # only reassign the node's dataset when something is actually removed
            node.dataset = node.dataset.drop_vars(to_drop, errors="ignore")


def _normalize_for_matching(path: str) -> str:
Expand Down
Loading