-
Notifications
You must be signed in to change notification settings - Fork 16
Coordinate cleanup #436
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Coordinate cleanup #436
Changes from 8 commits
c94bd9a
731b9f3
5a4ceef
e50752b
442305a
d18c715
fdf3a8d
1103569
e84ba0f
4fc047a
27cb8a7
1bc7640
06e117b
36aefd3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -295,7 +295,6 @@ def process_node( | |
| and (indexers.keys() - dataset[variable_name].dims) | ||
| and set(indexers.keys()).intersection(dataset[variable_name].dims) | ||
| ): | ||
|
|
||
| missing_dim = sorted(indexers.keys() - dataset[variable_name].dims)[0] | ||
| var_indexers = { | ||
| dim_name: dim_value | ||
|
|
@@ -800,7 +799,7 @@ def tree_get_spatial_bounds( | |
| return np.array([[min(min_lons), max(max_lons)], [min(min_lats), max(max_lats)]]) | ||
|
|
||
|
|
||
| def get_vars_with_paths(tree: DataTree) -> list[str]: | ||
| def get_vars_with_paths(tree: DataTree) -> set[str]: | ||
| """ | ||
| Get all variables and coordinates with their full paths from a DataTree | ||
|
|
||
|
|
@@ -811,9 +810,9 @@ def get_vars_with_paths(tree: DataTree) -> list[str]: | |
|
|
||
| Returns | ||
| ------- | ||
| List[str] | ||
| List of variable paths in format '/group/var' or '/var' for root level, | ||
| including coordinate variables at root level | ||
| set[str] | ||
| Unordered set of variable and coordinate paths in format | ||
| '/group/var' or '/var' for root level. | ||
|
|
||
| Examples | ||
| -------- | ||
|
|
@@ -822,67 +821,41 @@ def get_vars_with_paths(tree: DataTree) -> list[str]: | |
| >>> tree['group1'] = DataTree(data=ds.copy()) | ||
| >>> paths = get_vars_with_paths(tree) | ||
| >>> print(paths) | ||
| ['/time', '/var1', '/var2', '/group1/var1', '/group1/var2'] | ||
| {'/time', '/var1', '/var2', '/group1/var1', '/group1/var2'} | ||
| """ | ||
| paths = [] | ||
|
|
||
| def collect_vars(node: DataTree, current_path: str = "") -> None: | ||
| # Add data variables from current node | ||
| for var_name in node.ds.data_vars: | ||
| paths.append(f"{current_path}/{var_name}") | ||
|
|
||
| # Recursively process child nodes | ||
| for child_name in node.children: | ||
| new_path = f"{current_path}/{child_name}" if current_path else f"/{child_name}" | ||
| collect_vars(node[child_name], new_path) | ||
|
|
||
| collect_vars(tree) | ||
| return sorted(paths) # Sort for consistent ordering | ||
| paths: set[str] = set() | ||
| for node in tree.subtree: | ||
| prefix = node.path.rstrip("/") + "/" | ||
| for name in set(node.data_vars) | set(node.to_dataset(inherit=False).coords): | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. to get coordinate variables at each node and NOT include inherited coordinates we have to directly coerce to a dataset and specify no inheritance. This was the simplest (only?) way I could find to accomplish this. Otherwise we get coordinate variables at every single node instead of where they are originally. |
||
| paths.add(f"{prefix}{name}") | ||
| return paths | ||
|
|
||
|
|
||
| def drop_vars_by_path(tree: DataTree, var_paths: str | list[str]) -> DataTree: | ||
| def drop_vars_by_path(tree: DataTree, var_paths: str | list[str] | set[str]) -> None: | ||
| """ | ||
| Drop variables from a DataTree using paths in the format '/group/var' or '/var' for root level | ||
| Drop variables *in place* from a DataTree using paths in the | ||
| format '/group/var' or '/var' for root level. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| tree : DataTree | ||
| The input DataTree | ||
| var_paths : str or List[str] | ||
| var_paths : str or list[str] or set[str] | ||
| Paths to variables to drop in format '/group/var' or '/var' for root level | ||
| Examples: | ||
| - '/var1' # root level variable | ||
| - '/group1/var1' # variable in group1 | ||
| - '/group1/subgroup/var1' # variable in nested group | ||
|
|
||
| Returns | ||
| ------- | ||
| DataTree | ||
| Modified DataTree with variables dropped | ||
| """ | ||
| if isinstance(var_paths, str): | ||
| var_paths = [var_paths] | ||
|
|
||
| for path in var_paths: | ||
| # Split the path into group path and variable name | ||
| parts = path.strip("/").split("/") | ||
|
|
||
| if len(parts) == 1: | ||
| # Root level variable | ||
| var_name = parts[0] | ||
| # Modify the dataset in-place using xarray's drop_vars | ||
| tree.ds = tree.ds.drop_vars([var_name], errors="ignore") | ||
| else: | ||
| # Group variable | ||
| group_path = "/".join(parts[:-1]) | ||
| var_name = parts[-1] | ||
| try: | ||
| node = tree[group_path] | ||
| node.ds = node.ds.drop_vars([var_name], errors="ignore") | ||
| except KeyError: | ||
| pass | ||
|
|
||
| return tree | ||
| # guard for single string being passed | ||
| drop: set[str] = {var_paths} if isinstance(var_paths, str) else set(var_paths) | ||
|
|
||
| for node in tree.subtree: | ||
| prefix = node.path.rstrip("/") + "/" | ||
| to_drop = [name for name in node.variables if f"{prefix}{name}" in drop] | ||
| if to_drop: | ||
| node.dataset = node.dataset.drop_vars(to_drop, errors="ignore") | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Function simplified greatly. Return value is dropped because it is working in place on reference to the tree and its underlying datasets are being overridden |
||
|
|
||
|
|
||
| def prepare_basic_encoding(datasets: DataTree, time_encoding) -> dict: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -184,7 +184,6 @@ def subset_with_bbox( | |
| iterator = zip(lat_var_names, lon_var_names, time_var_names) | ||
|
|
||
| for lat_var_name, lon_var_name, time_var_name in iterator: | ||
|
|
||
| lat_path = file_utils.get_path(lat_var_name) | ||
| lon_path = file_utils.get_path(lon_var_name) | ||
|
|
||
|
|
@@ -376,13 +375,11 @@ def subset( | |
|
|
||
| if args["decode_times"]: | ||
| with xr.open_datatree(file_to_subset, decode_times=False) as dataset: | ||
|
|
||
| lat_var_names, lon_var_names, time_var_names = coordinate_utils.get_coordinate_variable_names( | ||
| dataset=dataset, lat_var_names=lat_var_names, lon_var_names=lon_var_names, time_var_names=time_var_names | ||
| ) | ||
|
|
||
| for time in time_var_names: | ||
|
|
||
| time_var = dataset[time] | ||
| var_name = os.path.basename(time) | ||
| group_path = os.path.dirname(time) | ||
|
|
@@ -408,7 +405,6 @@ def subset( | |
| args["decode_times"] = False | ||
|
|
||
| with xr.open_datatree(file_to_subset, **args) as dataset: | ||
|
|
||
| if hdf_type: | ||
| dataset = hdf_utils.rename_phony_dims(dataset) | ||
|
|
||
|
|
@@ -441,19 +437,21 @@ def subset( | |
|
|
||
| all_vars = variables_utils.get_all_variable_names_from_dtree(dataset) | ||
| if variables: | ||
| # Drop variables that aren't explicitly requested, except lat_var_name and | ||
| # lon_var_name which are needed for subsetting | ||
| normalized_variables = [f"/{s.replace('__', '/').lstrip('/')}".upper() for s in variables] | ||
| # add in root "/" to variable path if not present so that | ||
| # matching with `all_data_variables` is works correctly | ||
| normalized_variables = [item if item.startswith("/") else "/" + item for item in variables] | ||
|
|
||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since we are not flattening data anymore, I don't think there is a need to check for |
||
| keep_variables = normalized_variables + lon_var_names + lat_var_names + time_var_names | ||
| keep_variables = variables_utils.normalize_candidate_paths_against_dtree(keep_variables, all_vars) | ||
|
|
||
| all_data_variables = datatree_subset.get_vars_with_paths(dataset) | ||
| drop_variables = [ | ||
| var for var in all_data_variables if var not in keep_variables and var.upper() not in keep_variables | ||
| ] | ||
|
|
||
| dataset = datatree_subset.drop_vars_by_path(dataset, drop_variables) | ||
| keep_coords = coordinate_utils.collect_coordinate_variables(dataset, keep_variables) | ||
|
|
||
| keep_set = set(keep_variables) | keep_coords | ||
|
|
||
| drop_variables = all_data_variables - keep_set | ||
|
|
||
| datatree_subset.drop_vars_by_path(dataset, drop_variables) | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Core logic for constructing set of variables to keep and drop which now accounts for coordinate variables.
ocsmit marked this conversation as resolved.
Outdated
|
||
|
|
||
| lon_var_names = variables_utils.normalize_candidate_paths_against_dtree(lon_var_names, all_vars) | ||
| lat_var_names = variables_utils.normalize_candidate_paths_against_dtree(lat_var_names, all_vars) | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.