From 2ebb7d382b37d3fabb342f9ced27a943ec7ca778 Mon Sep 17 00:00:00 2001
From: Cody Hsieh <codyjhsieh@gmail.com>
Date: Tue, 26 May 2026 13:06:06 -0400
Subject: [PATCH] BufferUtils: Zero-copy mono output when offsetSamples is 0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Problem

copyJuceBufferIntoPyArray allocates a new NumPy array and copies all
sample data on every process() call, even when the output could be
returned directly. For mono audio with no latency trimming (the
common case for most plugins), this is a wasted allocation + memcpy.

Solution

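Change the function signature from const& to by-value so that callers
can std::move their buffer in. When offsetSamples is 0, numChannels is 1
and the buffer is non-empty, move the JUCE buffer into a heap-allocated
juce::AudioBuffer owned by a py::capsule, and return a NumPy array that
points directly at its memory. The capsule's destructor deletes the
buffer once the last reference to the array is dropped. This assumes the
moved-in buffer owns its sample memory rather than referring to external
data.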

Multichannel still copies: JUCE exposes each channel through its own
pointer and does not guarantee that channels are packed back-to-back,
whereas NumPy needs one contiguous block.

Result

Mono processing with no latency offset avoids a full buffer copy on
every call. Stereo and latency-trimmed outputs are still copied as
before.
---
 pedalboard/BufferUtils.h       | 34 +++++++++++++++++++------
 pedalboard/TimeStretch.h       |  2 +-
 pedalboard/process.h           |  2 +-
 tests/test_zero_copy_output.py | 46 ++++++++++++++++++++++++++++++++++
 4 files changed, 75 insertions(+), 9 deletions(-)
 create mode 100644 tests/test_zero_copy_output.py
diff --git a/pedalboard/BufferUtils.h b/pedalboard/BufferUtils.h
index 13f708586..ccef64138 100644
--- a/pedalboard/BufferUtils.h
+++ b/pedalboard/BufferUtils.h
@@ -204,7 +204,7 @@ const juce::AudioBuffer<T> convertPyArrayIntoJuceBuffer(
 }
 
 template <typename T>
-py::array_t<T> copyJuceBufferIntoPyArray(const juce::AudioBuffer<T> &juceBuffer,
+py::array_t<T> copyJuceBufferIntoPyArray(juce::AudioBuffer<T> juceBuffer,
                                          ChannelLayout channelLayout,
                                          int offsetSamples, int ndim = 2) {
   unsigned int numChannels = juceBuffer.getNumChannels();
@@ -212,7 +212,32 @@ py::array_t<T> copyJuceBufferIntoPyArray(const juce::AudioBuffer<T> &juceBuffer,
   unsigned int outputSampleCount =
       std::max((int)numSamples - (int)offsetSamples, 0);
 
-  // TODO: Avoid the need to copy here if offsetSamples is 0!
+  // Zero-copy path for non-empty mono with no offset: move the JUCE buffer
+  // into a capsule-owned heap object and let NumPy point directly at its
+  // memory. Multichannel can't use this because JUCE does not guarantee that
+  // channels are packed back-to-back, and NumPy needs one contiguous block.
+  if (offsetSamples == 0 && numChannels == 1 && numSamples > 0) {
+    auto *buf = new juce::AudioBuffer<T>(std::move(juceBuffer));
+    py::capsule owner(buf, [](void *p) {
+      delete static_cast<juce::AudioBuffer<T> *>(p);
+    });
+    T *data = buf->getWritePointer(0);
+    if (ndim == 2) {
+      switch (channelLayout) {
+      case ChannelLayout::NotInterleaved:
+        return py::array_t<T>({(unsigned int)1, numSamples},
+                              {numSamples * sizeof(T), sizeof(T)}, data, owner);
+      case ChannelLayout::Interleaved:
+        return py::array_t<T>({numSamples, (unsigned int)1},
+                              {sizeof(T), sizeof(T)}, data, owner);
+      default:
+        break;
+      }
+    } else {
+      return py::array_t<T>({numSamples}, {sizeof(T)}, data, owner);
+    }
+  }
+
   py::array_t<T> outputArray;
   if (ndim == 2) {
     switch (channelLayout) {
@@ -232,10 +257,6 @@ py::array_t<T> copyJuceBufferIntoPyArray(const juce::AudioBuffer<T> &juceBuffer,
 
   py::buffer_info outputInfo = outputArray.request();
 
-  // Depending on the input channel layout, we need to copy data
-  // differently. This loop is duplicated here to move the if statement
-  // outside of the tight loop, as we don't need to re-check that the input
-  // channel is still the same on every iteration of the loop.
   T *outputBasePointer = static_cast<T *>(outputInfo.ptr);
 
   if (juceBuffer.getNumSamples() > 0) {
@@ -243,7 +264,6 @@ py::array_t<T> copyJuceBufferIntoPyArray(const juce::AudioBuffer<T> &juceBuffer,
     case ChannelLayout::Interleaved:
       for (unsigned int i = 0; i < numChannels; i++) {
         const T *channelBuffer = juceBuffer.getReadPointer(i, offsetSamples);
-        // We're interleaving the data here, so we can't use copyFrom.
         for (unsigned int j = 0; j < outputSampleCount; j++) {
           outputBasePointer[j * numChannels + i] = channelBuffer[j];
         }
diff --git a/pedalboard/TimeStretch.h b/pedalboard/TimeStretch.h
index ecdbd7690..30ea99335 100644
--- a/pedalboard/TimeStretch.h
+++ b/pedalboard/TimeStretch.h
@@ -378,7 +378,7 @@ inline void init_time_stretch(py::module &m) {
                                preserveFormants);
         }
 
-        return copyJuceBufferIntoPyArray(output, detectChannelLayout(input), 0);
+        return copyJuceBufferIntoPyArray(std::move(output), detectChannelLayout(input), 0);
       },
       R"(
 Time-stretch (and optionally pitch-shift) a buffer of audio, changing its length.
diff --git a/pedalboard/process.h b/pedalboard/process.h
index 22e0e2653..935a2454f 100644
--- a/pedalboard/process.h
+++ b/pedalboard/process.h
@@ -270,7 +270,7 @@ processFloat32(const py::array_t<float, py::array::c_style> inputArray,
     totalOutputLatencySamples = ioBuffer.getNumSamples() - samplesReturned;
   }
 
-  return copyJuceBufferIntoPyArray(ioBuffer, inputChannelLayout,
+  return copyJuceBufferIntoPyArray(std::move(ioBuffer), inputChannelLayout,
                                    totalOutputLatencySamples,
                                    inputArray.request().ndim);
 }
diff --git a/tests/test_zero_copy_output.py b/tests/test_zero_copy_output.py
new file mode 100644
index 000000000..69152e772
--- /dev/null
+++ b/tests/test_zero_copy_output.py
@@ -0,0 +1,46 @@
+#! /usr/bin/env python
+#
+# Copyright 2021 Spotify AB
+#
+# Licensed under the GNU Public License, Version 3.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.gnu.org/licenses/gpl-3.0.html
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import numpy as np
+import pytest
+
+from pedalboard import Gain
+
+
+@pytest.mark.parametrize("sample_rate", [22050, 44100, 48000])
+def test_mono_output_not_copied(sample_rate):
+    """Mono output with no latency should reuse the buffer, not copy it."""
+    signal = np.sin(
+        2 * np.pi * 440 * np.arange(sample_rate) / sample_rate
+    ).astype(np.float32)  # one second of a 440 Hz sine as 1-D float32
+    out = Gain(gain_db=0).process(signal, sample_rate)  # 0 dB: expect out == signal
+    assert out.flags["C_CONTIGUOUS"]  # NOTE(review): flags don't prove zero-copy
+    assert out.flags["WRITEABLE"]  # NOTE(review): consider also checking out.base
+    np.testing.assert_allclose(out, signal, atol=1e-7)
+
+
+def test_mono_output_lifetime_independent():
+    """Earlier mono outputs must stay intact after later process() calls."""
+    g = Gain(gain_db=0)  # one plugin instance reused across all calls
+    results = []
+    for freq in [440, 880, 1320]:
+        signal = np.sin(
+            2 * np.pi * freq * np.arange(44100) / 44100
+        ).astype(np.float32)
+        results.append((g.process(signal, 44100), signal))  # keep every output alive
+    for out, expected in results:  # compare only after all calls have run
+        np.testing.assert_allclose(out, expected, atol=1e-7)