Skip to content
Open
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
- [issue/405](https://github.com/podaac/l2ss-py/issues/405) Chunk logic now handles data without dimensions in root node.
- Optimize performance of ScanTime variable computation
- [issue/435](https://github.com/podaac/l2ss-py/issues/435) Extra unneeded dimension scales included in output
Comment thread
ocsmit marked this conversation as resolved.
### Security
- Updated dependency libraries

Expand Down
73 changes: 23 additions & 50 deletions podaac/subsetter/datatree_subset.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,6 @@ def process_node(
and (indexers.keys() - dataset[variable_name].dims)
and set(indexers.keys()).intersection(dataset[variable_name].dims)
):

missing_dim = sorted(indexers.keys() - dataset[variable_name].dims)[0]
var_indexers = {
dim_name: dim_value
Expand Down Expand Up @@ -800,7 +799,7 @@ def tree_get_spatial_bounds(
return np.array([[min(min_lons), max(max_lons)], [min(min_lats), max(max_lats)]])


def get_vars_with_paths(tree: DataTree) -> list[str]:
def get_vars_with_paths(tree: DataTree) -> set[str]:
"""
Get all variables and coordinates with their full paths from a DataTree

Expand All @@ -811,9 +810,9 @@ def get_vars_with_paths(tree: DataTree) -> list[str]:

Returns
-------
List[str]
List of variable paths in format '/group/var' or '/var' for root level,
including coordinate variables at root level
set[str]
Unordered set of variable and coordinate paths in format
'/group/var' or '/var' for root level.

Examples
--------
Expand All @@ -822,67 +821,41 @@ def get_vars_with_paths(tree: DataTree) -> list[str]:
>>> tree['group1'] = DataTree(data=ds.copy())
>>> paths = get_vars_with_paths(tree)
>>> print(paths)
['/time', '/var1', '/var2', '/group1/var1', '/group1/var2']
{'/time', '/var1', '/var2', '/group1/var1', '/group1/var2'}
"""
paths = []

def collect_vars(node: DataTree, current_path: str = "") -> None:
# Add data variables from current node
for var_name in node.ds.data_vars:
paths.append(f"{current_path}/{var_name}")

# Recursively process child nodes
for child_name in node.children:
new_path = f"{current_path}/{child_name}" if current_path else f"/{child_name}"
collect_vars(node[child_name], new_path)

collect_vars(tree)
return sorted(paths) # Sort for consistent ordering
paths: set[str] = set()
for node in tree.subtree:
prefix = node.path.rstrip("/") + "/"
for name in set(node.data_vars) | set(node.to_dataset(inherit=False).coords):

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

to get coordinate variables at each node and NOT include inherited coordinates we have to directly coerce to a dataset and specify no inheritance. This was the simplest (only?) way I could find to accomplish this. Otherwise we get coordinate variables at every single node instead of where they are originally.

paths.add(f"{prefix}{name}")
return paths


def drop_vars_by_path(tree: DataTree, var_paths: str | list[str]) -> DataTree:
def drop_vars_by_path(tree: DataTree, var_paths: str | list[str] | set[str]) -> None:
"""
Drop variables from a DataTree using paths in the format '/group/var' or '/var' for root level
Drop variables *in place* from a DataTree using paths in the
format '/group/var' or '/var' for root level.

Parameters
----------
tree : DataTree
The input DataTree
var_paths : str or List[str]
var_paths : str or list[str] or set[str]
Paths to variables to drop in format '/group/var' or '/var' for root level
Examples:
- '/var1' # root level variable
- '/group1/var1' # variable in group1
- '/group1/subgroup/var1' # variable in nested group

Returns
-------
DataTree
Modified DataTree with variables dropped
"""
if isinstance(var_paths, str):
var_paths = [var_paths]

for path in var_paths:
# Split the path into group path and variable name
parts = path.strip("/").split("/")

if len(parts) == 1:
# Root level variable
var_name = parts[0]
# Modify the dataset in-place using xarray's drop_vars
tree.ds = tree.ds.drop_vars([var_name], errors="ignore")
else:
# Group variable
group_path = "/".join(parts[:-1])
var_name = parts[-1]
try:
node = tree[group_path]
node.ds = node.ds.drop_vars([var_name], errors="ignore")
except KeyError:
pass

return tree
# guard for single string being passed
drop: set[str] = {var_paths} if isinstance(var_paths, str) else set(var_paths)

for node in tree.subtree:
prefix = node.path.rstrip("/") + "/"
to_drop = [name for name in node.variables if f"{prefix}{name}" in drop]
if to_drop:
node.dataset = node.dataset.drop_vars(to_drop, errors="ignore")

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function simplified greatly. The return value is dropped because the function now works in place on a reference to the tree, overriding its underlying datasets.



def prepare_basic_encoding(datasets: DataTree, time_encoding) -> dict:
Expand Down
22 changes: 10 additions & 12 deletions podaac/subsetter/subset.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,6 @@ def subset_with_bbox(
iterator = zip(lat_var_names, lon_var_names, time_var_names)

for lat_var_name, lon_var_name, time_var_name in iterator:

lat_path = file_utils.get_path(lat_var_name)
lon_path = file_utils.get_path(lon_var_name)

Expand Down Expand Up @@ -376,13 +375,11 @@ def subset(

if args["decode_times"]:
with xr.open_datatree(file_to_subset, decode_times=False) as dataset:

lat_var_names, lon_var_names, time_var_names = coordinate_utils.get_coordinate_variable_names(
dataset=dataset, lat_var_names=lat_var_names, lon_var_names=lon_var_names, time_var_names=time_var_names
)

for time in time_var_names:

time_var = dataset[time]
var_name = os.path.basename(time)
group_path = os.path.dirname(time)
Expand All @@ -408,7 +405,6 @@ def subset(
args["decode_times"] = False

with xr.open_datatree(file_to_subset, **args) as dataset:

if hdf_type:
dataset = hdf_utils.rename_phony_dims(dataset)

Expand Down Expand Up @@ -441,19 +437,21 @@ def subset(

all_vars = variables_utils.get_all_variable_names_from_dtree(dataset)
if variables:
# Drop variables that aren't explicitly requested, except lat_var_name and
# lon_var_name which are needed for subsetting
normalized_variables = [f"/{s.replace('__', '/').lstrip('/')}".upper() for s in variables]
# add in root "/" to variable path if not present so that
# matching with `all_data_variables` works correctly
normalized_variables = [item if item.startswith("/") else "/" + item for item in variables]

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we are not flattening data anymore, I don't think there is a need to check for __ in place of /.

keep_variables = normalized_variables + lon_var_names + lat_var_names + time_var_names
keep_variables = variables_utils.normalize_candidate_paths_against_dtree(keep_variables, all_vars)

all_data_variables = datatree_subset.get_vars_with_paths(dataset)
drop_variables = [
var for var in all_data_variables if var not in keep_variables and var.upper() not in keep_variables
]

dataset = datatree_subset.drop_vars_by_path(dataset, drop_variables)
keep_coords = coordinate_utils.collect_coordinate_variables(dataset, keep_variables)

keep_set = set(keep_variables) | keep_coords

drop_variables = all_data_variables - keep_set

datatree_subset.drop_vars_by_path(dataset, drop_variables)

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Core logic for constructing set of variables to keep and drop which now accounts for coordinate variables.

Comment thread
ocsmit marked this conversation as resolved.
Outdated

lon_var_names = variables_utils.normalize_candidate_paths_against_dtree(lon_var_names, all_vars)
lat_var_names = variables_utils.normalize_candidate_paths_against_dtree(lat_var_names, all_vars)
Expand Down
79 changes: 79 additions & 0 deletions podaac/subsetter/utils/coordinate_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,85 @@ def get_coordinate_variable_names(
return lat_var_names, lon_var_names, time_var_names


def find_coordinate_origin_node(
    tree: xr.DataTree,
    node_path: str,
    coord_name: str,
) -> str | None:
    """
    Find the path of the DataTree node where a coordinate is actually defined.

    Starts at the node addressed by ``node_path`` and walks up through its
    parents (closest first), checking each node's own, non-inherited
    coordinates. Returns the path of the first node that owns the
    coordinate, or ``None`` if no node in that chain defines it.

    Parameters
    ----------
    tree : DataTree
        The root DataTree.
    node_path : str
        The path of the node to start searching from (e.g. "/group/subgroup").
        An empty string is treated as ``tree`` itself.
    coord_name : str
        The name of the coordinate to locate.

    Returns
    -------
    str or None
        The path string of the node that defines the coordinate, or ``None``.

    Examples
    --------
    >>> origin = find_coordinate_origin_node(dt, "/group/subgroup", "time")
    >>> # Returns "/" if time is defined at the root
    """
    start: xr.DataTree = tree[node_path] if node_path else tree

    # ``parents`` only yields ancestors, so the starting node is prepended
    # explicitly; otherwise a coordinate defined on the starting node
    # itself would never be found.
    for candidate in (start, *start.parents):
        # to_dataset(inherit=False) is used so that coordinates inherited
        # from ancestor nodes are *not* reported as belonging to this node
        if coord_name in candidate.to_dataset(inherit=False).coords:
            return candidate.path

    return None
Comment thread
ocsmit marked this conversation as resolved.


def collect_coordinate_variables(tree: xr.DataTree, variables: list[str]) -> set[str]:
    """
    Build the set of full paths to the coordinate variables that the
    given variables depend on.

    For every coordinate attached to a requested variable, the node that
    actually defines it is looked up with ``find_coordinate_origin_node``;
    coordinates for which no defining node is found are skipped.

    Parameters
    ----------
    tree : DataTree
        The root DataTree.
    variables : list[str]
        Full paths of the variables whose coordinates should be
        collected (e.g. "/group/var").

    Returns
    -------
    set[str]
        Paths to the coordinate variables, in the format '/group/coord'
        or '/coord' for root level.
    """
    coord_paths: set[str] = set()
    for var_path in variables:
        # everything before the final "/" is the group holding the variable
        group_path = var_path.rsplit("/", 1)[0]

        for coord_name in tree[var_path].coords:
            origin = find_coordinate_origin_node(tree, group_path, coord_name)
            if not origin:
                continue
            # the root path is "/" itself; blank it out so the result
            # reads "/coord" rather than "//coord"
            prefix = "" if origin == "/" else origin
            coord_paths.add(f"{prefix}/{coord_name}")
    return coord_paths
Comment thread
ocsmit marked this conversation as resolved.


def _compute_utc_name(dataset: xr.Dataset) -> str | None:
"""
Get the name of the utc variable if it is there to determine origin time
Expand Down
Loading
Loading