Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 30 additions & 11 deletions benchmarks/test_array_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,24 @@ class ArrayModel(BaseModel):
data: Array[(None,), Float64]


ENCODINGS = ["json", "base64", "binref"]
ENCODINGS = ["json", "base64", "binref", "base64+lz4", "binref+lz4"]

# Maps short encoding name to the format string used by output_to_bytes
_ENCODING_TO_FORMAT: dict[str, supported_format_type] = {
"json": "json",
"base64": "json+base64",
"binref": "json+binref",
"base64+lz4": "json+base64",
"binref+lz4": "json+binref",
}

# Maps short encoding name to extra kwargs passed to output_to_bytes
_ENCODING_TO_KWARGS: dict[str, dict] = {
"json": {},
"base64": {},
"binref": {},
"base64+lz4": {"base64_compression": "lz4"},
"binref+lz4": {"binref_compression": "lz4"},
}


Expand Down Expand Up @@ -78,44 +89,50 @@ def test_encoding(benchmark, encoding_and_size):
encoding, size = encoding_and_size
model = ArrayModel(data=create_test_array(size))
fmt = _ENCODING_TO_FORMAT[encoding]
extra_kwargs = _ENCODING_TO_KWARGS[encoding]
uses_binref = "binref" in encoding

with tempfile.TemporaryDirectory() as tmpdir:
if encoding == "binref":
if uses_binref:

def setup():
_clear_dir(tmpdir)

benchmark.pedantic(
output_to_bytes,
args=(model, fmt),
kwargs={"base_dir": tmpdir},
kwargs={"base_dir": tmpdir, **extra_kwargs},
setup=setup,
rounds=_binref_rounds(size),
)
else:
benchmark(output_to_bytes, model, fmt)
benchmark(output_to_bytes, model, fmt, **extra_kwargs)


def test_decoding(benchmark, encoding_and_size):
encoding, size = encoding_and_size
model = ArrayModel(data=create_test_array(size))
fmt = _ENCODING_TO_FORMAT[encoding]
extra_kwargs = _ENCODING_TO_KWARGS[encoding]
uses_binref = "binref" in encoding

with tempfile.TemporaryDirectory() as tmpdir:
ctx: dict[str, str] = {}
if encoding == "binref":
if uses_binref:
ctx["base_dir"] = tmpdir

encoded = output_to_bytes(model, fmt, base_dir=tmpdir)
encoded = output_to_bytes(model, fmt, base_dir=tmpdir, **extra_kwargs)

if encoding == "binref":
if uses_binref:
# binref filenames are random UUIDs, so we must re-encode in setup
# and pass the fresh payload to the decode call via a mutable wrapper.
payload = [encoded]

def setup():
_clear_dir(tmpdir)
payload[0] = output_to_bytes(model, fmt, base_dir=tmpdir)
payload[0] = output_to_bytes(
model, fmt, base_dir=tmpdir, **extra_kwargs
)

def decode():
ArrayModel.model_validate_json(payload[0], context=ctx)
Expand All @@ -129,17 +146,19 @@ def test_roundtrip(benchmark, encoding_and_size):
encoding, size = encoding_and_size
model = ArrayModel(data=create_test_array(size))
fmt = _ENCODING_TO_FORMAT[encoding]
extra_kwargs = _ENCODING_TO_KWARGS[encoding]
uses_binref = "binref" in encoding

with tempfile.TemporaryDirectory() as tmpdir:
ctx: dict[str, str] = {}
if encoding == "binref":
if uses_binref:
ctx["base_dir"] = tmpdir

def roundtrip():
enc = output_to_bytes(model, fmt, base_dir=tmpdir)
enc = output_to_bytes(model, fmt, base_dir=tmpdir, **extra_kwargs)
ArrayModel.model_validate_json(enc, context=ctx)

if encoding == "binref":
if uses_binref:

def setup():
_clear_dir(tmpdir)
Expand Down
10 changes: 10 additions & 0 deletions docs/content/using-tesseracts/array-encodings.md
Comment thread
angela-ko marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,13 @@ $ curl \
The `.bin` file references are relative to the `--output-path`.
:::
::::

### binref + lz4 compression

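Set the environment variable `TESSERACT_BINREF_COMPRESSION=lz4` to compress the arrays written to `.bin` files with LZ4. Each array is compressed individually, so any single array can still be located and read by its byte offset without decompressing the rest of the file. The compressed size is appended to the buffer path (`<file>:<offset>:<compressed_size>`), and the array's `data` object additionally carries `"compression": "lz4"`.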

```bash
$ TESSERACT_BINREF_COMPRESSION=lz4 tesseract run vectoradd apply -f "json+binref" -o /tmp/output @examples/vectoradd/example_inputs.json
$ cat /tmp/output/results.json
{"result":{"object_type":"array","shape":[3],"dtype":"float64","data":{"buffer":"....bin:0:35","encoding":"binref","compression":"lz4"}}}
```
7,034 changes: 3,425 additions & 3,609 deletions production.uv.lock

Large diffs are not rendered by default.

25 changes: 15 additions & 10 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ requires-python = ">=3.10,<3.15"
dependencies = [
"jinja2",
"rich",
"typer>=0.16",
"typer",
"pyyaml",
"pydantic",
"numpy",
Expand Down Expand Up @@ -40,16 +40,17 @@ tesseract-runtime = "tesseract_core.runtime.cli:main"
# do not edit manually. To add constraints, use other operators (e.g. <, >=, ~=, ==) as needed.
runtime = [
"pydantic<=2.13.4,>=2.10",
"fastapi<=0.138.0,>=0.115",
"fastapi<=0.136.1,>=0.115",
"requests<=2.34.2,>=2.32.4",
"uvicorn<=0.49.0,>=0.34",
"typer<=0.26.7,>=0.16",
"fsspec[http,s3]<=2026.6.0,>=2024.12",
"uvicorn<=0.47.0,>=0.34",
"click<=8.4.0,>=8.1",
"typer<=0.25.1,>=0.15",
"fsspec[http,s3]<=2026.4.0,>=2024.12",
"pybase64<=1.4.3,>=1.4",
"orjson<=3.11.9,>=3.10",
"numpy<=2.5.0,>=1.26",
"debugpy<=1.8.21,>=1.8.14",
"mlflow-skinny<=3.14.0,>=3.7.0",
"numpy<=2.4.5,>=1.26",
"debugpy<=1.8.20,>=1.8.14",
"mlflow-skinny<=3.12.0,>=3.7.0",
]
# END RUNTIME DEPENDENCIES

Expand All @@ -68,8 +69,7 @@ docs = [
]
dev = [
"docker",
"httpx", # required by fastapi older test client
"httpx2", # required by fastapi newer test client
"httpx", # required by fastapi test client
"pre-commit",
"pytest",
"pytest-cov",
Expand All @@ -85,6 +85,11 @@ dev = [
# also add all other extras here
"tesseract-core[runtime]",
"tesseract-core[docs]",
"tesseract-core[compression]",
]

compression = [
"lz4",
]

[project.urls]
Expand Down
16 changes: 8 additions & 8 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@ annotated-doc==0.0.4
# via typer
annotated-types==0.7.0
# via pydantic
certifi==2026.6.17
certifi==2026.4.22
# via requests
charset-normalizer==3.4.7
# via requests
colorama==0.4.6 ; sys_platform == 'win32'
click==8.4.0
# via typer
idna==3.18
colorama==0.4.6 ; sys_platform == 'win32'
# via click
idna==3.15
# via requests
jinja2==3.1.6
# via tesseract-core
Expand All @@ -24,15 +26,13 @@ mdurl==0.1.2
# via markdown-it-py
numpy==2.2.6 ; python_full_version < '3.11'
# via tesseract-core
numpy==2.4.6 ; python_full_version == '3.11.*'
# via tesseract-core
numpy==2.5.0 ; python_full_version >= '3.12'
numpy==2.4.5 ; python_full_version >= '3.11'
# via tesseract-core
orjson==3.11.9
# via tesseract-core
packaging==26.2
# via tesseract-core
pip==26.1.2
pip==26.1.1
# via tesseract-core
pybase64==1.4.3
# via tesseract-core
Expand All @@ -52,7 +52,7 @@ rich==15.0.0
# typer
shellingham==1.5.4
# via typer
typer==0.26.7
typer==0.25.1
# via tesseract-core
typing-extensions==4.15.0
# via
Expand Down
Loading
Loading