diff --git a/pedalboard/BufferUtils.h b/pedalboard/BufferUtils.h
index 13f70858..ccef6413 100644
--- a/pedalboard/BufferUtils.h
+++ b/pedalboard/BufferUtils.h
@@ -204,7 +204,7 @@ const juce::AudioBuffer<T> convertPyArrayIntoJuceBuffer(
 }
 
 template <typename T>
-py::array_t<T> copyJuceBufferIntoPyArray(const juce::AudioBuffer<T> &juceBuffer,
+py::array_t<T> copyJuceBufferIntoPyArray(juce::AudioBuffer<T> juceBuffer,
                                          ChannelLayout channelLayout,
                                          int offsetSamples, int ndim = 2) {
   unsigned int numChannels = juceBuffer.getNumChannels();
@@ -212,7 +212,32 @@ py::array_t<T> copyJuceBufferIntoPyArray(const juce::AudioBuffer<T> &juceBuffer,
   unsigned int outputSampleCount =
       std::max((int)numSamples - (int)offsetSamples, 0);
 
-  // TODO: Avoid the need to copy here if offsetSamples is 0!
+  // Zero-copy path for mono with no offset: move the JUCE buffer into a
+  // capsule and let NumPy point directly at its memory. Multichannel still
+  // copies: channel data is only reachable through per-channel pointers, so
+  // a single fixed-stride (or interleaved) NumPy view is not guaranteed.
+  if (offsetSamples == 0 && numChannels == 1 && numSamples > 0) {
+    auto *buf = new juce::AudioBuffer<T>(std::move(juceBuffer));
+    py::capsule owner(buf, [](void *p) {
+      delete static_cast<juce::AudioBuffer<T> *>(p);
+    });
+    T *data = buf->getWritePointer(0);
+    if (ndim == 2) {
+      switch (channelLayout) {
+      case ChannelLayout::NotInterleaved:
+        return py::array_t<T>({(unsigned int)1, numSamples},
+                              {numSamples * sizeof(T), sizeof(T)}, data, owner);
+      case ChannelLayout::Interleaved:
+        return py::array_t<T>({numSamples, (unsigned int)1},
+                              {sizeof(T), sizeof(T)}, data, owner);
+      default:
+        break;
+      }
+    } else {
+      return py::array_t<T>({numSamples}, {sizeof(T)}, data, owner);
+    }
+  }
+
   py::array_t<T> outputArray;
   if (ndim == 2) {
     switch (channelLayout) {
@@ -232,10 +257,6 @@ py::array_t<T> copyJuceBufferIntoPyArray(const juce::AudioBuffer<T> &juceBuffer,
 
   py::buffer_info outputInfo = outputArray.request();
 
-  // Depending on the input channel layout, we need to copy data
-  // differently. This loop is duplicated here to move the if statement
-  // outside of the tight loop, as we don't need to re-check that the input
-  // channel is still the same on every iteration of the loop.
   T *outputBasePointer = static_cast<T *>(outputInfo.ptr);
 
   if (juceBuffer.getNumSamples() > 0) {
@@ -243,7 +264,6 @@ py::array_t<T> copyJuceBufferIntoPyArray(const juce::AudioBuffer<T> &juceBuffer,
     case ChannelLayout::Interleaved:
       for (unsigned int i = 0; i < numChannels; i++) {
         const T *channelBuffer = juceBuffer.getReadPointer(i, offsetSamples);
-        // We're interleaving the data here, so we can't use copyFrom.
         for (unsigned int j = 0; j < outputSampleCount; j++) {
           outputBasePointer[j * numChannels + i] = channelBuffer[j];
         }
diff --git a/pedalboard/TimeStretch.h b/pedalboard/TimeStretch.h
index ecdbd769..30ea9933 100644
--- a/pedalboard/TimeStretch.h
+++ b/pedalboard/TimeStretch.h
@@ -378,7 +378,7 @@ inline void init_time_stretch(py::module &m) {
                                preserveFormants);
         }
 
-        return copyJuceBufferIntoPyArray(output, detectChannelLayout(input), 0);
+        return copyJuceBufferIntoPyArray(std::move(output), detectChannelLayout(input), 0);
       },
       R"(
 Time-stretch (and optionally pitch-shift) a buffer of audio, changing its length.
diff --git a/pedalboard/process.h b/pedalboard/process.h
index 22e0e265..935a2454 100644
--- a/pedalboard/process.h
+++ b/pedalboard/process.h
@@ -270,7 +270,7 @@ processFloat32(const py::array_t<float, py::array::c_style> inputArray,
     totalOutputLatencySamples = ioBuffer.getNumSamples() - samplesReturned;
   }
 
-  return copyJuceBufferIntoPyArray(ioBuffer, inputChannelLayout,
+  return copyJuceBufferIntoPyArray(std::move(ioBuffer), inputChannelLayout,
                                    totalOutputLatencySamples,
                                    inputArray.request().ndim);
 }
diff --git a/tests/test_zero_copy_output.py b/tests/test_zero_copy_output.py
new file mode 100644
index 00000000..69152e77
--- /dev/null
+++ b/tests/test_zero_copy_output.py
@@ -0,0 +1,46 @@
+#! /usr/bin/env python
+#
+# Copyright 2021 Spotify AB
+#
+# Licensed under the GNU Public License, Version 3.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.gnu.org/licenses/gpl-3.0.html
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import numpy as np
+import pytest
+
+from pedalboard import Gain
+
+
+@pytest.mark.parametrize("sample_rate", [22050, 44100, 48000])
+def test_mono_output_not_copied(sample_rate):
+    """Mono output with no latency should reuse the buffer, not copy it."""
+    ramp = np.arange(sample_rate)
+    signal = np.sin(2 * np.pi * 440 * ramp / sample_rate).astype(np.float32)
+    out = Gain(gain_db=0).process(signal, sample_rate)
+    assert not out.flags["OWNDATA"]  # borrowed from the capsule, not copied
+    assert out.flags["C_CONTIGUOUS"]
+    assert out.flags["WRITEABLE"]
+    np.testing.assert_allclose(out, signal, atol=1e-7)
+
+
+def test_mono_output_lifetime_independent():
+    """Earlier outputs stay intact after the same plugin processes more audio."""
+    plugin = Gain(gain_db=0)
+    ramp = np.arange(44100)
+    pairs = []
+    for hz in [440, 880, 1320]:
+        tone = np.sin(2 * np.pi * hz * ramp / 44100).astype(np.float32)
+        pairs.append((plugin.process(tone, 44100), tone))
+    # Compare only after every call has run, so a shared buffer would show up.
+    for processed, tone in pairs:
+        np.testing.assert_allclose(processed, tone, atol=1e-7)