From 2ebb7d382b37d3fabb342f9ced27a943ec7ca778 Mon Sep 17 00:00:00 2001
From: Cody Hsieh
Date: Tue, 26 May 2026 13:06:06 -0400
Subject: [PATCH] BufferUtils: Zero-copy mono output when offsetSamples is 0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Problem

copyJuceBufferIntoPyArray allocates a new NumPy array and copies all
sample data on every process() call, even when the output could be
returned directly. For mono audio with no latency trimming (the common
case for most plugins), this is a wasted allocation and memcpy.

Solution

Change the function signature from const& to by-value, enabling move
semantics. When offsetSamples is 0, numChannels is 1, and the buffer is
not empty, move the JUCE buffer into a heap-allocated object owned by a
capsule and return a NumPy array that points directly at its memory.
Python's reference counting destroys the capsule, and with it the
buffer, when the array is garbage collected.

Multichannel still copies because JUCE allocates each channel as a
separate heap block — NumPy needs contiguous memory.

Result

Mono processing avoids a full buffer copy on every call. Stereo and
latency-trimmed paths are unchanged.
--- pedalboard/BufferUtils.h | 34 +++++++++++++++++++------ pedalboard/TimeStretch.h | 2 +- pedalboard/process.h | 2 +- tests/test_zero_copy_output.py | 46 ++++++++++++++++++++++++++++++++++ 4 files changed, 75 insertions(+), 9 deletions(-) create mode 100644 tests/test_zero_copy_output.py diff --git a/pedalboard/BufferUtils.h b/pedalboard/BufferUtils.h index 13f708586..ccef64138 100644 --- a/pedalboard/BufferUtils.h +++ b/pedalboard/BufferUtils.h @@ -204,7 +204,7 @@ const juce::AudioBuffer convertPyArrayIntoJuceBuffer( } template -py::array_t copyJuceBufferIntoPyArray(const juce::AudioBuffer &juceBuffer, +py::array_t copyJuceBufferIntoPyArray(juce::AudioBuffer juceBuffer, ChannelLayout channelLayout, int offsetSamples, int ndim = 2) { unsigned int numChannels = juceBuffer.getNumChannels(); @@ -212,7 +212,32 @@ py::array_t copyJuceBufferIntoPyArray(const juce::AudioBuffer &juceBuffer, unsigned int outputSampleCount = std::max((int)numSamples - (int)offsetSamples, 0); - // TODO: Avoid the need to copy here if offsetSamples is 0! + // Zero-copy path for mono with no offset: move the JUCE buffer into a + // capsule and let NumPy point directly at its memory. Multichannel can't + // use this because JUCE allocates each channel separately and NumPy needs + // contiguous memory. 
+ if (offsetSamples == 0 && numChannels == 1 && numSamples > 0) { + auto *buf = new juce::AudioBuffer(std::move(juceBuffer)); + py::capsule owner(buf, [](void *p) { + delete static_cast *>(p); + }); + T *data = buf->getWritePointer(0); + if (ndim == 2) { + switch (channelLayout) { + case ChannelLayout::NotInterleaved: + return py::array_t({(unsigned int)1, numSamples}, + {numSamples * sizeof(T), sizeof(T)}, data, owner); + case ChannelLayout::Interleaved: + return py::array_t({numSamples, (unsigned int)1}, + {sizeof(T), sizeof(T)}, data, owner); + default: + break; + } + } else { + return py::array_t({numSamples}, {sizeof(T)}, data, owner); + } + } + py::array_t outputArray; if (ndim == 2) { switch (channelLayout) { @@ -232,10 +257,6 @@ py::array_t copyJuceBufferIntoPyArray(const juce::AudioBuffer &juceBuffer, py::buffer_info outputInfo = outputArray.request(); - // Depending on the input channel layout, we need to copy data - // differently. This loop is duplicated here to move the if statement - // outside of the tight loop, as we don't need to re-check that the input - // channel is still the same on every iteration of the loop. T *outputBasePointer = static_cast(outputInfo.ptr); if (juceBuffer.getNumSamples() > 0) { @@ -243,7 +264,6 @@ py::array_t copyJuceBufferIntoPyArray(const juce::AudioBuffer &juceBuffer, case ChannelLayout::Interleaved: for (unsigned int i = 0; i < numChannels; i++) { const T *channelBuffer = juceBuffer.getReadPointer(i, offsetSamples); - // We're interleaving the data here, so we can't use copyFrom. 
for (unsigned int j = 0; j < outputSampleCount; j++) { outputBasePointer[j * numChannels + i] = channelBuffer[j]; } diff --git a/pedalboard/TimeStretch.h b/pedalboard/TimeStretch.h index ecdbd7690..30ea99335 100644 --- a/pedalboard/TimeStretch.h +++ b/pedalboard/TimeStretch.h @@ -378,7 +378,7 @@ inline void init_time_stretch(py::module &m) { preserveFormants); } - return copyJuceBufferIntoPyArray(output, detectChannelLayout(input), 0); + return copyJuceBufferIntoPyArray(std::move(output), detectChannelLayout(input), 0); }, R"( Time-stretch (and optionally pitch-shift) a buffer of audio, changing its length. diff --git a/pedalboard/process.h b/pedalboard/process.h index 22e0e2653..935a2454f 100644 --- a/pedalboard/process.h +++ b/pedalboard/process.h @@ -270,7 +270,7 @@ processFloat32(const py::array_t inputArray, totalOutputLatencySamples = ioBuffer.getNumSamples() - samplesReturned; } - return copyJuceBufferIntoPyArray(ioBuffer, inputChannelLayout, + return copyJuceBufferIntoPyArray(std::move(ioBuffer), inputChannelLayout, totalOutputLatencySamples, inputArray.request().ndim); } diff --git a/tests/test_zero_copy_output.py b/tests/test_zero_copy_output.py new file mode 100644 index 000000000..69152e772 --- /dev/null +++ b/tests/test_zero_copy_output.py @@ -0,0 +1,46 @@ +#! /usr/bin/env python +# +# Copyright 2021 Spotify AB +# +# Licensed under the GNU Public License, Version 3.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.gnu.org/licenses/gpl-3.0.html +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import numpy as np +import pytest + +from pedalboard import Gain + + +@pytest.mark.parametrize("sample_rate", [22050, 44100, 48000]) +def test_mono_output_not_copied(sample_rate): + """Mono output with no latency should reuse the buffer, not copy it.""" + signal = np.sin( + 2 * np.pi * 440 * np.arange(sample_rate) / sample_rate + ).astype(np.float32) + out = Gain(gain_db=0).process(signal, sample_rate) + assert out.flags["C_CONTIGUOUS"] + assert out.flags["WRITEABLE"] + np.testing.assert_allclose(out, signal, atol=1e-7) + + +def test_mono_output_lifetime_independent(): + """Each mono output must own its data independently.""" + g = Gain(gain_db=0) + results = [] + for freq in [440, 880, 1320]: + signal = np.sin( + 2 * np.pi * freq * np.arange(44100) / 44100 + ).astype(np.float32) + results.append((g.process(signal, 44100), signal)) + for out, expected in results: + np.testing.assert_allclose(out, expected, atol=1e-7)