NVIDIA
diff --git a/‎CITATION.cff‎
Lines changed: 2 additions & 2 deletions b/‎CITATION.cff‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion b/‎CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎README.md‎
Lines changed: 11 additions & 9 deletions b/‎README.md‎
Lines changed: 11 additions & 9 deletions
diff --git a/‎docs/_sources/api/creation/tensors/make.rst‎
Lines changed: 0 additions & 3 deletions b/‎docs/_sources/api/creation/tensors/make.rst‎
Lines changed: 0 additions & 3 deletions
diff --git a/‎docs/_sources/api/dft/fft/fft.rst‎
Lines changed: 4 additions & 4 deletions b/‎docs/_sources/api/dft/fft/fft.rst‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎docs/_sources/api/dft/fft/ifft.rst‎
Lines changed: 2 additions & 2 deletions b/‎docs/_sources/api/dft/fft/ifft.rst‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎docs/_sources/api/logic/comparison/isclose.rst‎
Lines changed: 21 additions & 0 deletions b/‎docs/_sources/api/logic/comparison/isclose.rst‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎docs/_sources/api/logic/truth/allclose.rst‎
Lines changed: 20 additions & 0 deletions b/‎docs/_sources/api/logic/truth/allclose.rst‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎docs/_sources/api/manipulation/rearranging/overlap.rst‎
Lines changed: 34 additions & 0 deletions b/‎docs/_sources/api/manipulation/rearranging/overlap.rst‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎docs/_sources/api/manipulation/selecting/at.rst‎
Lines changed: 31 additions & 0 deletions b/‎docs/_sources/api/manipulation/selecting/at.rst‎
Lines changed: 31 additions & 0 deletions
@@ -11,6 +11,6 @@ authors:
   given-names: "Adam"
   orcid: "https://orcid.org/0000-0001-9690-6357"
 title: "MatX Primitives Library for GPU-Accelerated Numerical Computing in C++"
-version: 0.1.0
-date-released: 2021-10-26
+version: 0.6.0
+date-released: 2023-10-02
 url: "https://github.com/NVIDIA/matx"
@@ -55,7 +55,7 @@ endif()
 project(MATX
         LANGUAGES CUDA CXX
         DESCRIPTION "A modern and efficient header-only C++ library for numerical computing on GPU"
-        VERSION 0.5.0
+        VERSION 0.6.0
         HOMEPAGE_URL "https://github.com/NVIDIA/MatX")
 
 if (NOT CMAKE_CUDA_ARCHITECTURES)
 
@@ -193,6 +193,17 @@ We provide a variety of training materials and examples to quickly learn the Mat
 - Finally, for new MatX developers, browsing the [example applications](examples) can provide familiarity with the API and best practices.
 
 ## Release Major Features
+*v0.6.0*:
+- Breaking changes
+    * This marks the first release of using "transforms as operators". This allows transforms to be used in any operator expression, whereas the previous release required them to be on separate lines. For an example, please see: https://nvidia.github.io/MatX/basics/fusion.html. This also causes a breaking change with transform usage. Converting to the new format is as simple as moving the function parameters. For example: `matmul(C, A, B, stream);` becomes `(C = matmul(A,B)).run(stream);`. 
+- Features
+    * Polyphase channelizer
+    * Many new operators, including upsample, downsample, pwelch, overlap, at, etc.
+    * Added more lvalue semantics for operators based on view manipulation
+- Bug fixes
+    * Fixed cache issues
+    * Fixed stride = 0 in matmul
+
 *v0.5.0*:
 * Polyphase resampler
 * Documentation overhaul with examples for each function
@@ -205,15 +216,6 @@ We provide a variety of training materials and examples to quickly learn the Mat
 * 16-bit float reductions
 * Output iterator support in CUB
 
-*v0.3.0*:
-* Many new operators, including `flatten`, `remap`, `lcollapse`. `rcollapse`, `fmod`, `clone`, `slice`
-* Extended N-D tensor support to more functions
-* Allow operators on reduction inputs
-* g++11 support
-* NVTX support
-* Many, many bug fixes
-
-
 ## Discussions
 We have an open discussions board [here](https://github.com/NVIDIA/MatX/discussions). We encourage any questions about the library to be posted here for other users to learn from and read through.
 
 
@@ -16,13 +16,11 @@ Return by Value
 .. doxygenfunction:: make_tensor( TensorType &tensor, const index_t (&shape)[TensorType::Rank()], matxMemorySpace_t space = MATX_MANAGED_MEMORY, cudaStream_t stream = 0)
 .. doxygenfunction:: make_tensor( ShapeType &&shape, matxMemorySpace_t space = MATX_MANAGED_MEMORY, cudaStream_t stream = 0)
 .. doxygenfunction:: make_tensor( TensorType &tensor, ShapeType &&shape,  matxMemorySpace_t space = MATX_MANAGED_MEMORY, cudaStream_t stream = 0)
-.. doxygenfunction:: make_tensor( matxMemorySpace_t space = MATX_MANAGED_MEMORY, cudaStream_t stream = 0)
 .. doxygenfunction:: make_tensor( TensorType &tensor, matxMemorySpace_t space = MATX_MANAGED_MEMORY, cudaStream_t stream = 0)
 .. doxygenfunction:: make_tensor( T *data, const index_t (&shape)[RANK], bool owning = false)
 .. doxygenfunction:: make_tensor( TensorType &tensor, typename TensorType::scalar_type *data, const index_t (&shape)[TensorType::Rank()], bool owning = false)
 .. doxygenfunction:: make_tensor( T *data, ShapeType &&shape, bool owning = false)
 .. doxygenfunction:: make_tensor( TensorType &tensor, typename TensorType::scalar_type *data, typename TensorType::shape_container &&shape, bool owning = false)
-.. doxygenfunction:: make_tensor( T *ptr, bool owning = false)
 .. doxygenfunction:: make_tensor( TensorType &tensor, typename TensorType::scalar_type *ptr, bool owning = false)
 .. doxygenfunction:: make_tensor( Storage &&s, ShapeType &&shape)
 .. doxygenfunction:: make_tensor( TensorType &tensor, typename TensorType::storage_type &&s, typename TensorType::shape_container &&shape)
@@ -38,5 +36,4 @@ Return by Pointer
 .. doxygenfunction:: make_tensor_p( const index_t (&shape)[RANK],  matxMemorySpace_t space = MATX_MANAGED_MEMORY, cudaStream_t stream = 0)
 .. doxygenfunction:: make_tensor_p( ShapeType &&shape, matxMemorySpace_t space = MATX_MANAGED_MEMORY, cudaStream_t stream = 0)
 .. doxygenfunction:: make_tensor_p( TensorType &tensor, typename TensorType::shape_container &&shape, matxMemorySpace_t space = MATX_MANAGED_MEMORY, cudaStream_t stream = 0)
-.. doxygenfunction:: make_tensor_p( matxMemorySpace_t space = MATX_MANAGED_MEMORY, cudaStream_t stream = 0)
 .. doxygenfunction:: make_tensor_p( T *const data, ShapeType &&shape, bool owning = false)
@@ -9,8 +9,8 @@ Perform a 1D FFT
    These functions are currently not supported with host-based executors (CPU)
 
 
-.. doxygenfunction:: fft(OpA &&a, uint64_t fft_size = 0)
-.. doxygenfunction:: fft(OpA &&a, const int32_t (&axis)[1], uint64_t fft_size = 0)
+.. doxygenfunction:: fft(OpA &&a, uint64_t fft_size = 0, FFTNorm norm = FFTNorm::BACKWARD)
+.. doxygenfunction:: fft(OpA &&a, const int32_t (&axis)[1], uint64_t fft_size = 0, FFTNorm norm = FFTNorm::BACKWARD)
 
 Examples
 ~~~~~~~~
@@ -25,7 +25,7 @@ Examples
   :language: cpp
   :start-after: example-begin fft-2
   :end-before: example-end fft-2
-  :dedent:  
+  :dedent:
 
 .. literalinclude:: ../../../../test/00_transform/FFT.cu
   :language: cpp
@@ -43,4 +43,4 @@ Examples
   :language: cpp
   :start-after: example-begin fft-5
   :end-before: example-end fft-5
-  :dedent:  
+  :dedent:
@@ -9,8 +9,8 @@ Perform a 1D inverse FFT
    These functions are currently not supported with host-based executors (CPU)
 
 
-.. doxygenfunction:: ifft(OpA &&a, uint64_t fft_size = 0)
-.. doxygenfunction:: ifft(OpA &&a, const int32_t (&axis)[1], uint64_t fft_size = 0)
+.. doxygenfunction:: ifft(OpA &&a, uint64_t fft_size = 0, FFTNorm norm = FFTNorm::BACKWARD)
+.. doxygenfunction:: ifft(OpA &&a, const int32_t (&axis)[1], uint64_t fft_size = 0, FFTNorm norm = FFTNorm::BACKWARD)
 
 Examples
 ~~~~~~~~
 
@@ -0,0 +1,21 @@
+.. _isclose_func:
+
+isclose
+=======
+
+Determine the closeness of values across two operators using absolute and relative tolerances. The output
+from isclose is an ``int`` value since it's commonly used for reductions and ``bool`` reductions using
+atomics are not available in hardware.
+
+
+.. doxygenfunction:: isclose
+
+Examples
+~~~~~~~~
+
+.. literalinclude:: ../../../../test/00_operators/OperatorTests.cu
+   :language: cpp
+   :start-after: example-begin isclose-test-1
+   :end-before: example-end isclose-test-1
+   :dedent:
+
@@ -0,0 +1,20 @@
+.. _allclose_func:
+
+allclose
+========
+
+Reduce the closeness of two operators to a single scalar (0D) output. The output
+from allclose is an ``int`` value since boolean reductions are not available in hardware.
+
+
+.. doxygenfunction:: allclose(OutType dest, const InType1 &in1, const InType2 &in2, double rtol, double atol, SingleThreadHostExecutor exec)
+.. doxygenfunction:: allclose(OutType dest, const InType1 &in1, const InType2 &in2, double rtol, double atol, cudaExecutor exec = 0)
+
+Examples
+~~~~~~~~
+
+.. literalinclude:: ../../../../test/00_operators/ReductionTests.cu
+   :language: cpp
+   :start-after: example-begin allclose-test-1
+   :end-before: example-end allclose-test-1
+   :dedent:
@@ -0,0 +1,34 @@
+.. _overlap_func:
+
+overlap
+#######
+
+Create an overlapping view of an input operator, giving a higher-rank view of the input.
+
+For example, the following 1D tensor [1 2 3 4 5] could be cloned into a 2D tensor with a
+window size of 2 and overlap of 1, resulting in::
+
+  [1 2
+   2 3
+   3 4
+   4 5]
+
+Currently this only works on 1D tensors going to 2D, but may be expanded
+for higher dimensions in the future. Note that if the window size does not
+divide evenly into the existing column dimension, the view may chop off the
+end of the data to make the tensor rectangular.
+
+.. note::
+    Only 1D input operators are accepted at this time
+
+.. doxygenfunction:: overlap( const OpType &op, const index_t (&windows)[N], const index_t (&strides)[N])
+.. doxygenfunction:: overlap( const OpType &op, const std::array<index_t, N> &windows, const std::array<index_t, N> &strides)
+
+Examples
+~~~~~~~~
+
+.. literalinclude:: ../../../../test/00_operators/OperatorTests.cu
+   :language: cpp
+   :start-after: example-begin overlap-test-1
+   :end-before: example-end overlap-test-1
+   :dedent:
@@ -0,0 +1,31 @@
+.. _at_func:
+
+at
+==
+
+Selects a single value from an operator. Since `at` is a lazily-evaluated operator, it should be used
+in situations where `operator()` cannot be used. For instance:
+
+.. code-block:: cpp
+
+    (a = b(5)).run();
+
+The code above creates a race condition where `b(5)` is evaluated on the host before launch, but the value may
+not yet have been computed by a previous operation. Instead, the `at()` operator can be used to defer the load until
+the operation is launched:
+
+.. code-block:: cpp
+
+    (a = at(b, 5)).run();
+
+.. doxygenfunction:: at(const Op op, Is... indices)
+
+Examples
+~~~~~~~~
+
+.. literalinclude:: ../../../../test/00_operators/OperatorTests.cu
+   :language: cpp
+   :start-after: example-begin at-test-1
+   :end-before: example-end at-test-1
+   :dedent:
+