Split utils key and JSON helpers

vmoens · vmoens · commit 706f5d3590ff · 2026-04-29T21:07:09.000+01:00
diff --git a/tensordict/_utils_key_json.py b/tensordict/_utils_key_json.py
@@ -0,0 +1,139 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from __future__ import annotations
+
+import importlib.util
+import warnings
+
+from pyvers import get_backend, implement_for, register_backend, set_backend
+
+__all__ = [
+    "_decode_key_from_filesystem",
+    "_encode_key_for_filesystem",
+    "_get_robust_key_setting",
+    "_get_robust_key_setting_with_warning",
+    "_json_dumps",
+    "get_json_backend",
+    "json_dumps",
+    "set_json_backend",
+]
+
+
+def _encode_key_for_filesystem(key: str, *, robust: bool = True) -> str:
+    """Encode a TensorDict key to be safe for filesystem paths."""
+    if not robust:
+        return key
+
+    unsafe_chars = set('/<>:"|?*\\ \0%')
+    unsafe_chars.update(chr(i) for i in range(32))
+    unsafe_chars.add(chr(127))
+
+    encoded_parts = []
+    for char in key:
+        if char in unsafe_chars:
+            encoded_parts.append(f"%{ord(char):02X}")
+        else:
+            encoded_parts.append(char)
+
+    return "".join(encoded_parts)
+
+
+def _get_robust_key_setting_with_warning(key: str, robust_key) -> bool:
+    """Handle the robust_key parameter with smart deprecation warning."""
+    if robust_key is not None:
+        return robust_key
+
+    robust_encoded = _encode_key_for_filesystem(key, robust=True)
+    legacy_encoded = _encode_key_for_filesystem(key, robust=False)
+
+    if robust_encoded != legacy_encoded:
+        warnings.warn(
+            f"The key '{key}' contains characters that will be handled differently "
+            f"in TensorDict v0.12 for better cross-platform support. "
+            f"To opt into the new behavior now, use `robust_key=True`. "
+            f"To suppress this warning and keep the current behavior, use `robust_key=False`. "
+            f"See https://github.com/pytorch/tensordict/issues/1440 for details.",
+            FutureWarning,
+            stacklevel=3,
+        )
+
+    return False
+
+
+def _get_robust_key_setting(robust_key) -> bool:
+    """Handle the robust_key parameter without key-specific logic."""
+    if robust_key is None:
+        return False
+    return robust_key
+
+
+def _decode_key_from_filesystem(encoded_key: str) -> str:
+    """Decode a filesystem-safe key back to the original TensorDict key."""
+    decoded_parts = []
+    i = 0
+    while i < len(encoded_key):
+        if encoded_key[i] == "%" and i + 2 < len(encoded_key):
+            try:
+                hex_str = encoded_key[i + 1 : i + 3]
+                char_code = int(hex_str, 16)
+                decoded_parts.append(chr(char_code))
+                i += 3
+            except ValueError:
+                decoded_parts.append(encoded_key[i])
+                i += 1
+        else:
+            decoded_parts.append(encoded_key[i])
+            i += 1
+
+    return "".join(decoded_parts)
+
+
+register_backend(group="json", backends={"json": "json", "orjson": "orjson"})
+
+
+@implement_for("json")
+def _json_dumps(data, **kwargs):
+    """JSON serialization using standard json module."""
+    import json
+
+    return json.dumps(data, **kwargs)
+
+
+@implement_for("orjson")
+def _json_dumps(data, **kwargs):  # noqa: F811
+    """JSON serialization using orjson module."""
+    import orjson
+
+    if "separators" in kwargs:
+        kwargs.pop("separators")
+    return orjson.dumps(data, **kwargs)
+
+
+def json_dumps(data, **kwargs):
+    """Unified JSON serialization function that works with both json and orjson backends."""
+    return _json_dumps(data, **kwargs)
+
+
+def set_json_backend(backend):
+    """Set the JSON backend to use (either 'json' or 'orjson')."""
+    if backend not in ["json", "orjson"]:
+        raise ValueError("Backend must be either 'json' or 'orjson'")
+    set_backend("json", backend)
+
+
+def get_json_backend():
+    """Get the current JSON backend."""
+    return get_backend("json")
+
+
+if importlib.util.find_spec("orjson") is not None:
+    set_json_backend("orjson")
+else:
+    set_json_backend("json")
+
+
+for _name in __all__:
+    globals()[_name].__module__ = "tensordict.utils"
diff --git a/tensordict/utils.py b/tensordict/utils.py
@@ -7,7 +7,6 @@
 import collections
 import concurrent.futures
 import functools
-import importlib.util
 import itertools
 import logging
 
@@ -39,7 +38,7 @@
 
 import numpy as np
 import torch
-from pyvers import get_backend, implement_for, register_backend, set_backend
+from pyvers import implement_for
 
 from tensordict._C import (  # noqa: F401  # @manual=//pytorch/tensordict:_C
     _unravel_key_to_tuple as _unravel_key_to_tuple_cpp,
@@ -2730,145 +2729,12 @@ def unravel_key_list(keys):
     return [unravel_key(key) for key in keys]
 
 
-def _encode_key_for_filesystem(key: str, *, robust: bool = True) -> str:
-    """Encode a TensorDict key to be safe for filesystem paths.
-
-    This function provides a bijective mapping from TensorDict keys to
-    filesystem-safe filenames by percent-encoding problematic characters.
-
-    Args:
-        key (str): The original TensorDict key
-        robust (bool): If True, uses the new robust encoding that percent-encodes
-            problematic characters. If False, returns the key unchanged (legacy
-            behavior). Defaults to True.
-
-    Returns:
-        str: A filesystem-safe encoded key if robust=True, otherwise the original key
-
-    Examples:
-        >>> _encode_key_for_filesystem("a/b/c")
-        "a%2Fb%2Fc"
-        >>> _encode_key_for_filesystem("a/b/c", robust=False)
-        "a/b/c"
-        >>> _encode_key_for_filesystem("normal_key")
-        "normal_key"
-    """
-    if not robust:
-        # Legacy behavior: return key unchanged
-        return key
-
-    # Characters that are problematic across filesystems:
-    # - Unix/Linux: / (path separator), null
-    # - Windows: < > : " | ? * \ / (path separator), null, control chars
-    # - macOS: : (path separator in HFS), null
-    # - General: space, percent (for our encoding)
-    unsafe_chars = set('/<>:"|?*\\ \0%')
-
-    # Also encode control characters (0-31) and DEL (127)
-    unsafe_chars.update(chr(i) for i in range(32))
-    unsafe_chars.add(chr(127))
-
-    encoded_parts = []
-    for char in key:
-        if char in unsafe_chars:
-            # Percent-encode using uppercase hex for consistency
-            encoded_parts.append(f"%{ord(char):02X}")
-        else:
-            encoded_parts.append(char)
-
-    return "".join(encoded_parts)
-
-
-def _get_robust_key_setting_with_warning(key: str, robust_key) -> bool:
-    """Handle the robust_key parameter with smart deprecation warning.
-
-    Only warns when there's actually a difference between robust and legacy encoding.
-
-    Args:
-        key: The TensorDict key to check
-        robust_key: None (auto-detect with warning), False (legacy), True (robust)
-
-    Returns:
-        bool: The effective robust setting to use
-    """
-    if robust_key is not None:
-        return robust_key
-
-    # Check if robust encoding would produce a different filename
-    robust_encoded = _encode_key_for_filesystem(key, robust=True)
-    # Keep this in case we need some futher mapping
-    legacy_encoded = _encode_key_for_filesystem(key, robust=False)
-
-    if robust_encoded != legacy_encoded:
-        # Only warn when there's actually a difference
-        import warnings
-
-        warnings.warn(
-            f"The key '{key}' contains characters that will be handled differently "
-            f"in TensorDict v0.12 for better cross-platform support. "
-            f"To opt into the new behavior now, use `robust_key=True`. "
-            f"To suppress this warning and keep the current behavior, use `robust_key=False`. "
-            f"See https://github.com/pytorch/tensordict/issues/1440 for details.",
-            FutureWarning,
-            stacklevel=3,
-        )
-
-    # Always use legacy behavior when robust_key=None
-    return False
-
-
-def _get_robust_key_setting(robust_key) -> bool:
-    """Handle the robust_key parameter without key-specific logic.
-
-    Args:
-        robust_key: None (fallback to False), False (legacy), True (robust)
-
-    Returns:
-        bool: The effective robust setting to use
-    """
-    if robust_key is None:
-        return False
-    return robust_key
-
-
-def _decode_key_from_filesystem(encoded_key: str) -> str:
-    """Decode a filesystem-safe key back to the original TensorDict key.
-
-    This is the reverse of _encode_key_for_filesystem.
-
-    Args:
-        encoded_key (str): A filesystem-safe encoded key
-
-    Returns:
-        str: The original TensorDict key
-
-    Examples:
-        >>> _decode_key_from_filesystem("a%2Fb%2Fc")
-        "a/b/c"
-        >>> _decode_key_from_filesystem("key%20with%20spaces")
-        "key with spaces"
-        >>> _decode_key_from_filesystem("normal_key")
-        "normal_key"
-    """
-    decoded_parts = []
-    i = 0
-    while i < len(encoded_key):
-        if encoded_key[i] == "%" and i + 2 < len(encoded_key):
-            try:
-                # Decode the hex value
-                hex_str = encoded_key[i + 1 : i + 3]
-                char_code = int(hex_str, 16)
-                decoded_parts.append(chr(char_code))
-                i += 3
-            except ValueError:
-                # Invalid hex sequence, treat as literal %
-                decoded_parts.append(encoded_key[i])
-                i += 1
-        else:
-            decoded_parts.append(encoded_key[i])
-            i += 1
-
-    return "".join(decoded_parts)
+from tensordict._utils_key_json import (  # noqa: F401
+    _decode_key_from_filesystem,
+    _encode_key_for_filesystem,
+    _get_robust_key_setting,
+    _get_robust_key_setting_with_warning,
+)
 
 
 def _slice_indices(index: slice, len: int):
@@ -3302,51 +3168,12 @@ def _create_segments_from_list(
     return splits
 
 
-# Register JSON backends
-register_backend(group="json", backends={"json": "json", "orjson": "orjson"})
-
-
-@implement_for("json")
-def _json_dumps(data, **kwargs):
-    """JSON serialization using standard json module."""
-    import json
-
-    return json.dumps(data, **kwargs)
-
-
-@implement_for("orjson")
-def _json_dumps(data, **kwargs):  # noqa: F811
-    """JSON serialization using orjson module."""
-    import orjson
-
-    # orjson doesn't support separators parameter, so we need to handle it differently
-    if "separators" in kwargs:
-        # Remove separators for orjson and use default compact format
-        kwargs.pop("separators")
-    return orjson.dumps(data, **kwargs)
-
-
-def json_dumps(data, **kwargs):
-    """Unified JSON serialization function that works with both json and orjson backends."""
-    return _json_dumps(data, **kwargs)
-
-
-def set_json_backend(backend):
-    """Set the JSON backend to use (either 'json' or 'orjson')."""
-    if backend not in ["json", "orjson"]:
-        raise ValueError("Backend must be either 'json' or 'orjson'")
-    set_backend("json", backend)
-
-
-def get_json_backend():
-    """Get the current JSON backend."""
-    return get_backend("json")
-
-
-if importlib.util.find_spec("orjson") is not None:
-    set_json_backend("orjson")
-else:
-    set_json_backend("json")
+from tensordict._utils_key_json import (  # noqa: F401
+    _json_dumps,
+    get_json_backend,
+    json_dumps,
+    set_json_backend,
+)
 
 
 class LinkedList(list):
diff --git a/test/test_utils_key_json_split.py b/test/test_utils_key_json_split.py