From 0d19aac93c31faa824228e352e20cd70f0594dc9 Mon Sep 17 00:00:00 2001
From: Helmut Januschka <helmut@januschka.com>
Date: Mon, 8 Jun 2026 10:44:43 +0200
Subject: [PATCH 1/4] Avoid storing unused partial renders

Only retain progressive render snapshots in the CLI when both an output format and a render interval are given, i.e. when the snapshots can actually be written to an output.
---
 jxl_cli/benches/decode.rs | 1 +
 jxl_cli/src/dec/mod.rs    | 5 +++--
 jxl_cli/src/lib.rs        | 2 ++
 jxl_cli/src/main.rs       | 2 ++
 4 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/jxl_cli/benches/decode.rs b/jxl_cli/benches/decode.rs
index a066c146e..f15a4bccd 100644
--- a/jxl_cli/benches/decode.rs
+++ b/jxl_cli/benches/decode.rs
@@ -68,6 +68,7 @@ fn decode_benches(c: &mut Criterion) {
                         false,
                         None,
                         false,
+                        false,
                     )
                     .unwrap();
                 })
diff --git a/jxl_cli/src/dec/mod.rs b/jxl_cli/src/dec/mod.rs
index d8e5e737f..b1d6b4f09 100644
--- a/jxl_cli/src/dec/mod.rs
+++ b/jxl_cli/src/dec/mod.rs
@@ -139,6 +139,7 @@ pub fn decode_frames<In: JxlBitstreamInputExt>(
     linear_output: bool,
     render_interval: Option<usize>,
     allow_partial_files: bool,
+    store_partial_renders: bool,
 ) -> Result<(DecodeOutput, Duration)> {
     let start = Instant::now();
 
@@ -282,7 +283,7 @@ pub fn decode_frames<In: JxlBitstreamInputExt>(
                     // render and retry.
                     if render_interval.is_some() && input.available_bytes()? > 0 {
                         has_rendered_data |= fallback.flush_pixels(&mut output_bufs)?;
-                        if has_rendered_data {
+                        if has_rendered_data && store_partial_renders {
                             partial_renders.push(
                                 outputs
                                     .iter()
@@ -332,7 +333,7 @@ pub fn decode_frames<In: JxlBitstreamInputExt>(
                     // render and retry.
                     if render_interval.is_some() && input.available_bytes()? > 0 {
                         has_rendered_data |= fallback.flush_pixels(&mut output_bufs)?;
-                        if has_rendered_data {
+                        if has_rendered_data && store_partial_renders {
                             partial_renders.push(
                                 outputs
                                     .iter()
diff --git a/jxl_cli/src/lib.rs b/jxl_cli/src/lib.rs
index 4f4b627ba..bf71087e0 100644
--- a/jxl_cli/src/lib.rs
+++ b/jxl_cli/src/lib.rs
@@ -77,6 +77,7 @@ mod tests {
             false,
             None,
             false,
+            false,
         )
         .unwrap()
         .0
@@ -188,6 +189,7 @@ mod tests {
                 false,
                 None,
                 false,
+                false,
             )
             .unwrap();
         }
diff --git a/jxl_cli/src/main.rs b/jxl_cli/src/main.rs
index d8abf491c..64faf4e94 100644
--- a/jxl_cli/src/main.rs
+++ b/jxl_cli/src/main.rs
@@ -164,6 +164,7 @@ fn main() -> Result<()> {
             let linear_output = matches!(output_format, Some(OutputFormat::Exr));
             #[cfg(not(feature = "exr"))]
             let linear_output = false;
+            let store_partial_renders = output_format.is_some() && opt.render_interval.is_some();
             let (mut output, duration) = dec::decode_frames(
                 $input,
                 options(skip_preview),
@@ -176,6 +177,7 @@ fn main() -> Result<()> {
                 linear_output,
                 opt.render_interval,
                 opt.allow_partial_files,
+                store_partial_renders,
             )?;
             if opt.preview {
                 output.frames.truncate(1);

From ea3af6eab7d52b61240adcf361e9786cab16fdc6 Mon Sep 17 00:00:00 2001
From: Helmut Januschka <helmut@januschka.com>
Date: Mon, 8 Jun 2026 11:07:51 +0200
Subject: [PATCH 2/4] Free squeeze neighbor buffers after final render

Inverse squeeze steps read two neighbor grids: the next grid of the input average buffer and the previous grid of the output buffer. The transform graph counts these reads as buffer uses, but the per-step code never released them, so those intermediate modular buffers stayed allocated for the whole frame. Also call mark_used(is_final) on these neighbor grids so they are freed once the final render has consumed them.
---
 jxl/src/frame/modular/transforms/apply.rs | 28 +++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/jxl/src/frame/modular/transforms/apply.rs b/jxl/src/frame/modular/transforms/apply.rs
index 33fe5bd2a..101fc92df 100644
--- a/jxl/src/frame/modular/transforms/apply.rs
+++ b/jxl/src/frame/modular/transforms/apply.rs
@@ -382,6 +382,20 @@ impl TransformStepChunk {
                 }
                 buffers[buf_in[0]].buffer_grid[in_grid].mark_used(is_final);
                 buffers[buf_in[1]].buffer_grid[res_grid].mark_used(is_final);
+                // Release the weak neighbor grids read above (next average and previous
+                // decoded), which are counted as uses in the transform graph.
+                let (gx, gy) = self.grid_pos;
+                if gx + 1 < buffers[*buf_out].grid_shape.0 {
+                    let next_avg_grid =
+                        buffers[buf_in[0]].get_grid_idx(out_grid_kind, (gx + 1, gy));
+                    if next_avg_grid != in_grid {
+                        buffers[buf_in[0]].buffer_grid[next_avg_grid].mark_used(is_final);
+                    }
+                }
+                if gx > 0 {
+                    let prev_out_grid = buffers[*buf_out].get_grid_idx(out_grid_kind, (gx - 1, gy));
+                    buffers[*buf_out].buffer_grid[prev_out_grid].mark_used(is_final);
+                }
             }
             TransformStep::VSqueeze {
                 buf_in,
@@ -491,6 +505,20 @@ impl TransformStepChunk {
                 }
                 buffers[buf_in[0]].buffer_grid[in_grid].mark_used(is_final);
                 buffers[buf_in[1]].buffer_grid[res_grid].mark_used(is_final);
+                // Release the weak neighbor grids read above (next average and previous
+                // decoded), which are counted as uses in the transform graph.
+                let (gx, gy) = self.grid_pos;
+                if gy + 1 < buffers[*buf_out].grid_shape.1 {
+                    let next_avg_grid =
+                        buffers[buf_in[0]].get_grid_idx(out_grid_kind, (gx, gy + 1));
+                    if next_avg_grid != in_grid {
+                        buffers[buf_in[0]].buffer_grid[next_avg_grid].mark_used(is_final);
+                    }
+                }
+                if gy > 0 {
+                    let prev_out_grid = buffers[*buf_out].get_grid_idx(out_grid_kind, (gx, gy - 1));
+                    buffers[*buf_out].buffer_grid[prev_out_grid].mark_used(is_final);
+                }
             }
         };
 

From 6489571551e49f1a56d6760c2e8f861c55b92e80 Mon Sep 17 00:00:00 2001
From: Helmut Januschka <helmut@januschka.com>
Date: Mon, 8 Jun 2026 11:33:05 +0200
Subject: [PATCH 3/4] Bound render pipeline scratch buffer pool

Modular frames never reclaim center group buffers via get_buffer, so the scratch pool grew to a full-frame copy that was retained for the pipeline's lifetime. Cap it to the few buffers sequential rendering can actually reuse.
---
 jxl/src/render/low_memory_pipeline/group_scheduler.rs | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/jxl/src/render/low_memory_pipeline/group_scheduler.rs b/jxl/src/render/low_memory_pipeline/group_scheduler.rs
index abc810ef8..e3213282b 100644
--- a/jxl/src/render/low_memory_pipeline/group_scheduler.rs
+++ b/jxl/src/render/low_memory_pipeline/group_scheduler.rs
@@ -113,7 +113,15 @@ impl LowMemoryRenderPipeline {
     }
 
     fn store_scratch_buffer(&mut self, channel: usize, kind: usize, image: OwnedRawImage) {
-        self.scratch_channel_buffers[channel * 3 + kind].push(image)
+        // The scratch pool only exists to recycle buffers for upcoming groups. Sequential
+        // rendering never needs more than a couple of buffers per (channel, kind) in flight, so
+        // bound the pool; otherwise pure-modular frames (which never reclaim center buffers via
+        // `get_buffer`) would retain a full-frame copy for the pipeline's lifetime.
+        const MAX_SCRATCH_BUFFERS: usize = 4;
+        let pool = &mut self.scratch_channel_buffers[channel * 3 + kind];
+        if pool.len() < MAX_SCRATCH_BUFFERS {
+            pool.push(image);
+        }
     }
 
     pub(super) fn render_with_new_group(

From 4446cea80e5d325f2c1bd8842207770fe3583126 Mon Sep 17 00:00:00 2001
From: Helmut Januschka <helmut@januschka.com>
Date: Mon, 8 Jun 2026 14:13:59 +0200
Subject: [PATCH 4/4] Revert "Bound render pipeline scratch buffer pool"

This reverts commit 6489571551e49f1a56d6760c2e8f861c55b92e80.
---
 jxl/src/render/low_memory_pipeline/group_scheduler.rs | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/jxl/src/render/low_memory_pipeline/group_scheduler.rs b/jxl/src/render/low_memory_pipeline/group_scheduler.rs
index e3213282b..abc810ef8 100644
--- a/jxl/src/render/low_memory_pipeline/group_scheduler.rs
+++ b/jxl/src/render/low_memory_pipeline/group_scheduler.rs
@@ -113,15 +113,7 @@ impl LowMemoryRenderPipeline {
     }
 
     fn store_scratch_buffer(&mut self, channel: usize, kind: usize, image: OwnedRawImage) {
-        // The scratch pool only exists to recycle buffers for upcoming groups. Sequential
-        // rendering never needs more than a couple of buffers per (channel, kind) in flight, so
-        // bound the pool; otherwise pure-modular frames (which never reclaim center buffers via
-        // `get_buffer`) would retain a full-frame copy for the pipeline's lifetime.
-        const MAX_SCRATCH_BUFFERS: usize = 4;
-        let pool = &mut self.scratch_channel_buffers[channel * 3 + kind];
-        if pool.len() < MAX_SCRATCH_BUFFERS {
-            pool.push(image);
-        }
+        self.scratch_channel_buffers[channel * 3 + kind].push(image)
     }
 
     pub(super) fn render_with_new_group(