-
Notifications
You must be signed in to change notification settings - Fork 5
feat: Add optional lz4 compression support for arrays passed via base64 or binref encoding
#579
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
2d22710
1d988b8
e14a94c
ab9b03b
22f6d11
920e15f
a789f48
a601030
ddc8dd8
8757567
a11eb0f
dafd5de
52c0766
00b7b8f
828524d
f902448
eb3bb21
48e0236
b9ed79a
bf7a5a9
937daa6
98d9df7
1f7ac9d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -140,3 +140,13 @@ $ curl \ | |
| The `.bin` file references are relative to the `--output-path`. | ||
| ::: | ||
| :::: | ||
|
|
||
| ### binref + lz4 compression | ||
|
|
||
| Set `TESSERACT_BINREF_COMPRESSION=lz4` to compress arrays in `.bin` files. Each array is compressed individually, preserving offset-based random access. The compressed size is embedded directly in the buffer path (`<file>:<offset>:<compressed_size>`). | ||
|
Comment on lines +144 to +146
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This now also applies to |
||
|
|
||
| ```bash | ||
| $ TESSERACT_BINREF_COMPRESSION=lz4 tesseract run vectoradd apply -f "json+binref" -o /tmp/output @examples/vectoradd/example_inputs.json | ||
| $ cat /tmp/output/results.json | ||
| {"result":{"object_type":"array","shape":[3],"dtype":"float64","data":{"buffer":"....bin:0:35","encoding":"binref","compression":"lz4"}}} | ||
| ``` | ||
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -64,6 +64,28 @@ class ArrayDict(TypedDict): | |
| MAX_BINREF_BUFFER_SIZE = 100 * 1024 * 1024 # 100 MB | ||
|
|
||
|
|
||
| def _lz4_frame(): | ||
| import lz4.frame | ||
|
|
||
| return lz4.frame | ||
|
Comment on lines +67 to +70
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can live in global scope since the dep is now mandatory |
||
|
|
||
|
|
||
| def _compress(data: bytes, compression: str | None) -> bytes: | ||
| if compression is None: | ||
| return data | ||
| if compression == "lz4": | ||
| return _lz4_frame().compress(data) | ||
| raise ValueError(f"Unknown compression: {compression}") | ||
|
|
||
|
|
||
| def _decompress(data: bytes, compression: str | None) -> bytes: | ||
| if compression is None: | ||
| return data | ||
| if compression == "lz4": | ||
| return _lz4_frame().decompress(data) | ||
| raise ValueError(f"Unknown compression: {compression}") | ||
|
|
||
|
|
||
| # Base classes for the different array encodings | ||
| # The actual models are created dynamically based on the expected shape and dtype by get_array_model | ||
|
|
||
|
|
@@ -79,14 +101,22 @@ class Base64ArrayData(BaseModel): | |
| ), | ||
| ] | ||
| encoding: Literal["base64"] | ||
| compression: Literal["lz4"] | None = None | ||
| model_config = ConfigDict(extra="forbid") | ||
|
|
||
|
|
||
| class BinrefArrayData(BaseModel): | ||
| """Data structure that dumps array data to binary file.""" | ||
| """Data structure that dumps array data to binary file. | ||
|
|
||
| The buffer field format is ``<path>[:<offset>[:<compressed_size>]]``. | ||
| When compression is set, the buffer must include ``:<compressed_size>`` | ||
| so readers know how many compressed bytes to read. | ||
| """ | ||
|
|
||
| buffer: StrictStr = Field(pattern=r"^.+?(\:\d+)?$") | ||
| buffer: StrictStr = Field(pattern=r"^.+?(\:\d+(\:\d+)?)?$") | ||
| encoding: Literal["binref"] | ||
| compression: Literal["lz4"] | None = None | ||
|
|
||
| model_config = ConfigDict(extra="forbid") | ||
|
|
||
|
|
||
|
|
@@ -223,6 +253,7 @@ def _dump_binref_arraydict( | |
| subdir: Path | str | None, | ||
| current_binref_uuid: str, | ||
| max_file_size: int = MAX_BINREF_BUFFER_SIZE, | ||
| compression: str | None = None, | ||
| ) -> tuple[ArrayDict, str]: | ||
| """Dump array to json+binref encoded array dict.""" | ||
| target_name = f"{current_binref_uuid}.bin" | ||
|
|
@@ -241,28 +272,41 @@ def _dump_binref_arraydict( | |
| target_name = join_paths(subdir, target_name) | ||
| target_path = join_paths(base_dir, target_name) | ||
|
|
||
| write_to_path(_fast_tobytes(arr), target_path, append=True) | ||
| blob = _compress(_fast_tobytes(arr), compression) | ||
| write_to_path(blob, target_path, append=True) | ||
| offset = current_size | ||
|
|
||
| if compression is not None: | ||
| data = { | ||
| "buffer": f"{target_name}:{offset}:{len(blob)}", | ||
| "encoding": "binref", | ||
| "compression": compression, | ||
| } | ||
| else: | ||
| data = {"buffer": f"{target_name}:{offset}", "encoding": "binref"} | ||
| arraydict = { | ||
| "object_type": "array", | ||
| "shape": list(arr.shape), | ||
| "dtype": arr.dtype.name, | ||
| "data": {"buffer": f"{target_name}:{offset}", "encoding": "binref"}, | ||
| "data": data, | ||
| } | ||
| return arraydict, current_binref_uuid | ||
|
|
||
|
|
||
| def _dump_base64_arraydict(arr: ArrayLike) -> ArrayDict: | ||
| def _dump_base64_arraydict(arr: ArrayLike, compression: str | None = None) -> ArrayDict: | ||
| """Dump array to json+base64 encoded array dict (plain dict, no Pydantic models).""" | ||
| blob = _compress(_fast_tobytes(arr), compression) | ||
| data: dict[str, Any] = { | ||
| "buffer": pybase64.b64encode_as_string(blob), | ||
| "encoding": "base64", | ||
| } | ||
| if compression is not None: | ||
| data["compression"] = compression | ||
| return { | ||
| "object_type": "array", | ||
| "shape": list(arr.shape), | ||
| "dtype": arr.dtype.name, | ||
| "data": { | ||
| "buffer": pybase64.b64encode_as_string(_fast_tobytes(arr)), | ||
| "encoding": "base64", | ||
| }, | ||
| "data": data, | ||
| } | ||
|
|
||
|
|
||
|
|
@@ -279,22 +323,27 @@ def _dump_json_arraydict(arr: ArrayLike) -> ArrayDict: | |
| def _load_base64_arraydict(val: ArrayDict) -> np.ndarray: | ||
| """Load array from json+base64 encoded array dict.""" | ||
| buffer = pybase64.b64decode(val["data"]["buffer"], validate=True) | ||
| buffer = _decompress(buffer, val["data"].get("compression")) | ||
| return np.frombuffer(buffer, dtype=val["dtype"]).reshape(val["shape"]) | ||
|
|
||
|
|
||
| def _load_binref_arraydict(val: ArrayDict, base_dir: str | Path | None) -> np.ndarray: | ||
| """Load array from json+binref encoded array dict.""" | ||
| path_match = re.match(r"^(?P<path>.+?)(\:(?P<offset>\d+))?$", val["data"]["buffer"]) | ||
| path_match = re.match( | ||
| r"^(?P<path>.+?)(\:(?P<offset>\d+)(\:(?P<compressed_size>\d+))?)?$", | ||
| val["data"]["buffer"], | ||
| ) | ||
| if not path_match: | ||
| raise ValueError( | ||
| f"Invalid binref path format: {val['data']['buffer']}. " | ||
| "Expected format is '<path>[:<offset>]'." | ||
| "Expected format is '<path>[:<offset>[:<compressed_size>]]'." | ||
| ) | ||
| bufferpath = path_match.group("path") | ||
| if path_match.group("offset") is None: | ||
| offset = 0 | ||
| else: | ||
| offset = int(path_match.group("offset")) | ||
| compressed_size_str = path_match.group("compressed_size") | ||
|
|
||
| uses_relative_path = not is_absolute_path(bufferpath) and not is_url(bufferpath) | ||
| if uses_relative_path and base_dir is None: | ||
|
|
@@ -308,10 +357,23 @@ def _load_binref_arraydict(val: ArrayDict, base_dir: str | Path | None) -> np.nd | |
| size = 1 if len(shape) == 0 else np.prod(shape) | ||
| num_bytes = int(size * dtype.itemsize) | ||
|
|
||
| compression = val["data"].get("compression") | ||
|
|
||
| if base_dir is not None: | ||
| bufferpath = join_paths(base_dir, bufferpath) | ||
|
|
||
| buffer = read_from_path(bufferpath, offset=offset, length=num_bytes) | ||
| if compression is None: | ||
| buffer = read_from_path(bufferpath, offset=offset, length=num_bytes) | ||
| else: | ||
| if compressed_size_str is None: | ||
| raise ValueError( | ||
| "compressed_size is required in buffer spec when compression is set " | ||
| "(expected format: '<path>:<offset>:<compressed_size>')" | ||
| ) | ||
| buffer = _decompress( | ||
| read_from_path(bufferpath, offset=offset, length=int(compressed_size_str)), | ||
| compression, | ||
| ) | ||
| return np.frombuffer(buffer, dtype=dtype).reshape(shape) | ||
|
|
||
|
|
||
|
|
@@ -520,7 +582,9 @@ def encode_array( | |
|
|
||
| array_encoding = context.get("array_encoding", "json") | ||
| if array_encoding == "base64": | ||
| return _dump_base64_arraydict(arr) | ||
| return _dump_base64_arraydict( | ||
| arr, compression=context.get("base64_compression") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I suggest we use a single |
||
| ) | ||
| elif array_encoding == "binref": | ||
| base_dir = context.get("base_dir", get_config().output_path) | ||
| subdir = context.get("binref_dir", None) | ||
|
|
@@ -530,6 +594,7 @@ def encode_array( | |
| subdir=subdir, | ||
| current_binref_uuid=context.get("__binref_uuid", str(uuid4())), | ||
| max_file_size=context.get("max_file_size", MAX_BINREF_BUFFER_SIZE), | ||
| compression=context.get("binref_compression"), | ||
| ) | ||
| context["__binref_uuid"] = new_binref_uuid | ||
| return data | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,7 +4,7 @@ | |
| import ast | ||
| import os | ||
| from pathlib import Path | ||
| from typing import Annotated, Any | ||
| from typing import Annotated, Any, Literal | ||
|
|
||
| from pydantic import ( | ||
| BaseModel, | ||
|
|
@@ -40,6 +40,7 @@ class RuntimeConfig(BaseModel): | |
| output_path: str = "." | ||
| output_format: supported_format_type = "json" | ||
| output_file: str = "" | ||
| binref_compression: Literal["lz4"] | None = None | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Only binref? |
||
| mlflow_tracking_uri: str = "" | ||
| mlflow_run_extra_args: Annotated[dict[str, Any], BeforeValidator(_eval_str)] = ( | ||
| Field(default_factory=dict) | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.