Enabled tensor reduction kernels.
This commit is contained in:
parent abdf16a4d9
commit 746b7b3247

258  include/cutlass/reduction/device/tensor_reduce.h  Normal file

@@ -0,0 +1,258 @@
/***************************************************************************************************
|
||||
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
* provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of
|
||||
* conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
* conditions and the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution.
|
||||
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
* to endorse or promote products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Kernel performing a reduction over one or more ranks of an affine tensor
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cutlass/array.h"
|
||||
#include "cutlass/fast_math.h"
|
||||
#include "cutlass/numeric_types.h"
|
||||
#include "cutlass/numeric_conversion.h"
|
||||
#include "cutlass/device_kernel.h"
|
||||
|
||||
#include "cutlass/reduction/device/tensor_reduce_affine_strided.h"
|
||||
#include "cutlass/reduction/device/tensor_reduce_affine_contiguous.h"
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace cutlass {
|
||||
namespace reduction {
|
||||
namespace device {
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Tensor reduction operator on specific CUTLASS layouts over exactly one index
|
||||
template <
|
||||
typename ElementOutput_,
|
||||
typename ElementSource_,
|
||||
typename Layout_,
|
||||
typename ReductionOp_,
|
||||
int VectorLength_ = 1,
|
||||
typename ElementCompute_ = ElementOutput_
|
||||
>
|
||||
struct TensorReduction {
|
||||
|
||||
using ElementOutput = ElementOutput_;
|
||||
using ElementSource = ElementSource_;
|
||||
using Layout = Layout_;
|
||||
using ReductionOp = ReductionOp_;
|
||||
static int const kVectorLength = VectorLength_;
|
||||
using ElementCompute = ElementCompute_;
|
||||
|
||||
using TensorCoord = typename Layout::TensorCoord;
|
||||
|
||||
/// Reduction operator
|
||||
using ReductionDeviceStridedOperator = TensorReductionAffineStrided<
|
||||
4, 3, ElementOutput, ElementSource, ReductionOp, kVectorLength, ElementCompute
|
||||
>;
|
||||
|
||||
using ReductionDeviceContiguousOperator = TensorReductionAffineContiguous<
|
||||
4, 3, ElementOutput, ElementSource, ReductionOp, kVectorLength, ElementCompute
|
||||
>;
|
||||
|
||||
//
|
||||
// Data members
|
||||
//
|
||||
|
||||
ReductionDeviceStridedOperator reduction_strided;
|
||||
ReductionDeviceContiguousOperator reduction_contiguous;
|
||||
int reduction_index;
|
||||
|
||||
//
|
||||
// Methods
|
||||
//
|
||||
|
||||
///
|
||||
TensorReduction(
|
||||
TensorCoord extent,
|
||||
int reduction_index_
|
||||
):
|
||||
reduction_index(reduction_index_) {
|
||||
|
||||
Coord<4> extent_affine;
|
||||
|
||||
switch (reduction_index) {
|
||||
case 0:
|
||||
extent_affine[0] = extent[1];
|
||||
extent_affine[1] = extent[2];
|
||||
extent_affine[2] = extent[0];
|
||||
extent_affine[3] = extent[3];
|
||||
break;
|
||||
case 1:
|
||||
extent_affine[0] = extent[0];
|
||||
extent_affine[1] = extent[2];
|
||||
extent_affine[2] = extent[1];
|
||||
extent_affine[3] = extent[3];
|
||||
break;
|
||||
case 2:
|
||||
extent_affine[0] = extent[0];
|
||||
extent_affine[1] = extent[1];
|
||||
extent_affine[2] = extent[2];
|
||||
extent_affine[3] = extent[3];
|
||||
break;
|
||||
case 3:
|
||||
extent_affine[0] = extent[0];
|
||||
extent_affine[1] = extent[1];
|
||||
extent_affine[2] = extent[2];
|
||||
extent_affine[3] = extent[3];
|
||||
break;
|
||||
default: break;
|
||||
}
|
||||
|
||||
if (reduction_index == 3) {
|
||||
reduction_contiguous = ReductionDeviceContiguousOperator(extent_affine);
|
||||
}
|
||||
else {
|
||||
reduction_strided = ReductionDeviceStridedOperator(extent_affine);
|
||||
}
|
||||
}
|
||||
|
||||
/// Simple check to verify the object is initialized correctly
|
||||
bool good() const {
|
||||
if (reduction_index == 3) {
|
||||
return reduction_contiguous.good();
|
||||
}
|
||||
return reduction_strided.good();
|
||||
}
|
||||
|
||||
/// Size of one workspace
|
||||
int64_t workspace_stride() const {
|
||||
if (reduction_index == 3) {
|
||||
return reduction_contiguous.workspace_stride();
|
||||
}
|
||||
else {
|
||||
return reduction_strided.workspace_stride();
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the size (in bytes) of a temporary workspace needed for reduction across CTAs
|
||||
int64_t workspace_size() const {
|
||||
if (reduction_index == 3) {
|
||||
return reduction_contiguous.workspace_size();
|
||||
}
|
||||
else {
|
||||
return reduction_strided.workspace_size();
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper to use overloaded function call operator
|
||||
Status reduce(
|
||||
TensorRef<ElementOutput, Layout> dst_ref,
|
||||
TensorRef<ElementSource, Layout> src_ref,
|
||||
void *device_workspace_ptr = nullptr,
|
||||
ElementCompute reduction_identity = ElementCompute(),
|
||||
ReductionOp reduction_op = ReductionOp(),
|
||||
cudaStream_t stream = nullptr) {
|
||||
|
||||
int64_t src_stride[3];
|
||||
int64_t dst_stride[3];   // three destination strides are needed for the contiguous (reduction_index == 3) case
|
||||
|
||||
switch (reduction_index) {
|
||||
case 0:
|
||||
src_stride[0] = src_ref.stride()[1];
|
||||
src_stride[1] = src_ref.stride()[0];
|
||||
src_stride[2] = src_ref.stride()[2];
|
||||
dst_stride[0] = dst_ref.stride()[1];
|
||||
dst_stride[1] = dst_ref.stride()[0];
|
||||
break;
|
||||
case 1:
|
||||
src_stride[0] = src_ref.stride()[2];
|
||||
src_stride[1] = src_ref.stride()[0];
|
||||
src_stride[2] = src_ref.stride()[1];
|
||||
dst_stride[0] = dst_ref.stride()[2];
|
||||
dst_stride[1] = dst_ref.stride()[0];
|
||||
break;
|
||||
case 2:
|
||||
src_stride[0] = src_ref.stride()[2];
|
||||
src_stride[1] = src_ref.stride()[1];
|
||||
src_stride[2] = src_ref.stride()[0];
|
||||
dst_stride[0] = dst_ref.stride()[2];
|
||||
dst_stride[1] = dst_ref.stride()[1];
|
||||
break;
|
||||
case 3:
|
||||
src_stride[0] = src_ref.stride()[2];
|
||||
src_stride[1] = src_ref.stride()[1];
|
||||
src_stride[2] = src_ref.stride()[0];
|
||||
|
||||
dst_stride[0] = dst_ref.stride()[2];
|
||||
dst_stride[1] = dst_ref.stride()[1];
|
||||
dst_stride[2] = dst_ref.stride()[0];
break;
|
||||
|
||||
default: break;
|
||||
}
|
||||
|
||||
if (reduction_index == 3) {
|
||||
return reduction_contiguous(
|
||||
dst_ref.data(),
|
||||
dst_stride,
|
||||
src_ref.data(),
|
||||
src_stride,
|
||||
device_workspace_ptr,
|
||||
reduction_identity,
|
||||
reduction_op,
|
||||
stream);
|
||||
}
|
||||
else {
|
||||
return reduction_strided(
|
||||
dst_ref.data(),
|
||||
dst_stride,
|
||||
src_ref.data(),
|
||||
src_stride,
|
||||
device_workspace_ptr,
|
||||
reduction_identity,
|
||||
reduction_op,
|
||||
stream);
|
||||
}
|
||||
}
|
||||
|
||||
Status operator()(
|
||||
TensorRef<ElementOutput, Layout> dst_ref,
|
||||
TensorRef<ElementSource, Layout> src_ref,
|
||||
void *device_workspace_ptr = nullptr,
|
||||
ElementCompute reduction_identity = ElementCompute(),
|
||||
ReductionOp reduction_op = ReductionOp(),
|
||||
cudaStream_t stream = nullptr) {
|
||||
|
||||
return reduce(
|
||||
dst_ref,
|
||||
src_ref,
|
||||
device_workspace_ptr,
|
||||
reduction_identity,
|
||||
reduction_op,
|
||||
stream);
|
||||
}
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
} // namespace device
|
||||
} // namespace reduction
|
||||
} // namespace cutlass
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
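// Usage sketch (illustrative; not part of this commit): sum-reduce the W rank of an
// NHWC tensor. HostTensor, DeviceAllocation, cutlass::plus, and the NHWC layout are
// existing CUTLASS utilities assumed to be available; the function name, extents, and
// data below are arbitrary.

#include "cutlass/functional.h"
#include "cutlass/tensor_coord.h"
#include "cutlass/layout/tensor.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/device_memory.h"
#include "cutlass/reduction/device/tensor_reduce.h"

cutlass::Status reduce_w_example() {

  using TensorReduction = cutlass::reduction::device::TensorReduction<
    float,                           // ElementOutput
    float,                           // ElementSource
    cutlass::layout::TensorNHWC,     // Layout
    cutlass::plus<float>             // ReductionOp
  >;

  cutlass::Tensor4DCoord src_extent(2, 8, 8, 32);
  cutlass::Tensor4DCoord dst_extent(2, 8, 1, 32);    // W rank collapses to 1

  cutlass::HostTensor<float, cutlass::layout::TensorNHWC> src(src_extent);
  cutlass::HostTensor<float, cutlass::layout::TensorNHWC> dst(dst_extent);

  // ... fill src on the host and call src.sync_device() before reducing ...

  // reduction_index == 2 selects the W rank of an NHWC tensor
  TensorReduction reduction(src_extent, 2);

  // Temporary workspace for the cross-CTA reduction, if the operator needs one
  cutlass::DeviceAllocation<uint8_t> workspace(reduction.workspace_size());

  return reduction.reduce(dst.device_ref(), src.device_ref(), workspace.get());
}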
|
||||
|
||||
@@ -0,0 +1,367 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
* provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of
|
||||
* conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
* conditions and the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution.
|
||||
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
* to endorse or promote products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Kernel performing a reduction over one or more ranks of an affine tensor
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cutlass/array.h"
|
||||
#include "cutlass/fast_math.h"
|
||||
#include "cutlass/numeric_types.h"
|
||||
#include "cutlass/numeric_conversion.h"
|
||||
#include "cutlass/device_kernel.h"
|
||||
|
||||
#include "cutlass/reduction/kernel/tensor_reduce_affine_contiguous.h"
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace cutlass {
|
||||
namespace reduction {
|
||||
namespace device {
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Tensor reduction operator on layouts which are affine
|
||||
template <
|
||||
int Rank, ///< Rank of source tensor (e.g. NDHWC => 5)
|
||||
int ReducedRank, ///< Rank of reduced tensor (e.g. ND => 2)
|
||||
typename ElementOutput_,
|
||||
typename ElementSource_,
|
||||
typename ReductionOp_,
|
||||
int VectorLength = 1,
|
||||
typename ElementCompute_ = ElementOutput_,
|
||||
int Threads = 256, ///< Number of participating threads
|
||||
int BatchSize = 4 ///< Number of elements to load per batch
|
||||
>
|
||||
struct TensorReductionAffineContiguous {
|
||||
|
||||
static int const kRank = Rank;
|
||||
static int const kReducedRank = ReducedRank;
|
||||
static int const kVectorLength = VectorLength;
|
||||
static int const kInnerRank = kRank - kReducedRank;
|
||||
static int const kThreads = Threads;
|
||||
static int const kBatchSize = BatchSize;
|
||||
|
||||
using ElementOutput = ElementOutput_;
|
||||
using ElementSource = ElementSource_;
|
||||
using ReductionOp = ReductionOp_;
|
||||
using ElementCompute = ElementCompute_;
|
||||
|
||||
//
|
||||
// Data members
|
||||
//
|
||||
|
||||
/// Internal status field
|
||||
Status status;
|
||||
|
||||
/// Extent of tensor in source layout
|
||||
Coord<kRank> extent;
|
||||
|
||||
/// Number of points in the outer index space
|
||||
int64_t outer_count;
|
||||
|
||||
/// Number of elements in the inner index space
|
||||
int64_t inner_count;
|
||||
|
||||
/// Number of workspaces needed
|
||||
int workspace_count;
|
||||
|
||||
/// CUDA Grid shape (.x => contiguous, .y => outer, .z => inner)
|
||||
dim3 grid_shape;
|
||||
|
||||
/// CUDA Threadblock shape (.x => contiguous, .y => outer, .z => inner)
|
||||
dim3 threadblock_shape;
|
||||
|
||||
/// CUDA grid shape for the final reduction step if needed
|
||||
dim3 grid_final;
|
||||
|
||||
/// CUDA threadblock shape for the final reduction step if needed
|
||||
dim3 threadblock_final;
|
||||
|
||||
private:
|
||||
//
|
||||
// Methods
|
||||
//
|
||||
|
||||
/// Helper to compute a power-of-two factor by which 'count' may be divided such that the quotient is less than 2 x 'ext'
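/// For example, reshape_pow2(24, 256) returns 8: halving 256 three times gives 32, the
/// first quotient below 2 x 24 = 48.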
|
||||
static int reshape_pow2(int ext, int count) {
|
||||
if (ext > count) {
|
||||
return 1;
|
||||
}
|
||||
int x = 1;
|
||||
for (; count >= ext * 2; ) {
|
||||
count >>= 1;
|
||||
x <<= 1;
|
||||
}
|
||||
return x;
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
/// Default ctor
|
||||
TensorReductionAffineContiguous():
|
||||
status(Status::kErrorInvalidProblem),
|
||||
extent(),
|
||||
outer_count(0),
|
||||
inner_count(0),
|
||||
workspace_count(0),
|
||||
grid_shape(0, 0, 0),
|
||||
threadblock_shape(0, 0, 0) { }
|
||||
|
||||
/// Constructor
|
||||
TensorReductionAffineContiguous(
|
||||
Coord<kRank> extent_,
|
||||
int target_threadblock_count = 128
|
||||
):
|
||||
status(Status::kSuccess),
|
||||
extent(extent_),
|
||||
outer_count(0),
|
||||
inner_count(0),
|
||||
workspace_count(0) {
|
||||
|
||||
//
|
||||
// Plan the parallel mapping strategy.
|
||||
//
|
||||
|
||||
outer_count = 1;
|
||||
inner_count = 1;
|
||||
|
||||
// Compute number of elements in strided ranks
|
||||
for (int p = 0; p < kReducedRank; ++p) {
|
||||
outer_count *= extent[p];
|
||||
}
|
||||
|
||||
for (int p = 0; p < kInnerRank; ++p) {
|
||||
inner_count *= extent[kReducedRank + p];
|
||||
}
|
||||
|
||||
int cta_count_x = 1;
|
||||
int cta_count_y = 1;
|
||||
int cta_count_z = 1;
|
||||
|
||||
int cta_threads_x = kThreads;
|
||||
int cta_threads_y = 1;
|
||||
int cta_threads_z = 1;
|
||||
|
||||
// Determine CTA shape
|
||||
int64_t inner_vector_count = inner_count / kVectorLength;
|
||||
|
||||
// Priority 1. Assign threadblocks to outer indices if possible
|
||||
if (outer_count > target_threadblock_count) {
|
||||
cta_count_x = 1;
|
||||
cta_count_y = target_threadblock_count;
|
||||
cta_count_z = 1;
|
||||
}
|
||||
else {
|
||||
|
||||
cta_count_y = int(outer_count);
|
||||
int remaining_ctas = target_threadblock_count / cta_count_y;
|
||||
|
||||
// Priority 2. Assign inner dimensions to one CTA
|
||||
if (inner_vector_count > cta_threads_x) {
|
||||
int64_t cta_z_bound = inner_vector_count / cta_threads_x;
|
||||
if (cta_z_bound > remaining_ctas) {
|
||||
cta_count_z = remaining_ctas;
|
||||
}
|
||||
else {
|
||||
cta_count_z = int(cta_z_bound);
|
||||
}
|
||||
}
|
||||
else {
|
||||
cta_threads_x = reshape_pow2(int(inner_vector_count), cta_threads_x);
|
||||
cta_count_z = 1;
|
||||
}
|
||||
}
|
||||
|
||||
grid_shape = dim3(cta_count_x, cta_count_y, cta_count_z);
|
||||
threadblock_shape = dim3(cta_threads_x, cta_threads_y, cta_threads_z);
|
||||
|
||||
workspace_count = (cta_count_z > 1 ? cta_count_z : 0);
|
||||
|
||||
// Determine shape of final reduction kernel if needed
|
||||
if (workspace_count) {
|
||||
|
||||
int final_threads = kThreads;
|
||||
int final_ctas = 1;
|
||||
|
||||
if (outer_count > kThreads) {
|
||||
final_ctas = int(outer_count + kThreads - 1) / kThreads;
|
||||
}
|
||||
else {
|
||||
final_threads = int(outer_count);
|
||||
}
|
||||
|
||||
grid_final = dim3(final_ctas, 1, 1);
|
||||
threadblock_final = dim3(final_threads, 1, 1);
|
||||
}
|
||||
else {
|
||||
grid_final = dim3(0, 0, 0);
|
||||
threadblock_final = dim3(0, 0, 0);
|
||||
}
|
||||
}
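// Illustrative planning example (extent assumed): with kThreads = 256, kVectorLength = 1,
// target_threadblock_count = 128, and extent (1, 2, 2, 65536), outer_count = 4 and
// inner_count = 65536. The outer space receives cta_count_y = 4, leaving 32 CTAs for the
// inner space, so grid_shape = (1, 4, 32) and threadblock_shape = (256, 1, 1). Because
// cta_count_z > 1, workspace_count = 32 and the final reduction step uses
// grid_final = (1, 1, 1) with threadblock_final = (4, 1, 1).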
|
||||
|
||||
/// Simple check to verify the object is initialized correctly
|
||||
bool good() const {
|
||||
return status == Status::kSuccess;
|
||||
}
|
||||
|
||||
/// Size (in bytes) of <outer_count> workspace elements which are densely packed together
|
||||
int64_t workspace_stride() const {
|
||||
|
||||
// Error condition
|
||||
if (!good()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return outer_count * sizeof_bits<ElementCompute>::value / 8;
|
||||
}
|
||||
|
||||
/// Returns the size (in bytes) of a temporary workspace needed for reduction across CTAs
|
||||
int64_t workspace_size() const {
|
||||
|
||||
// Error condition
|
||||
if (!good()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// No reduction across CTAs
|
||||
if (grid_shape.z == 1) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return workspace_stride() * grid_shape.z;
|
||||
}
|
||||
|
||||
/// Performs a reduction
|
||||
Status reduce(
|
||||
ElementOutput *dst_ptr, ///< Pointer to destination tensor
|
||||
int64_t dst_stride[],                         ///< Stride vector (of length kReducedRank)
|
||||
ElementSource const *src_ptr, ///< Pointer to source tensor
|
||||
int64_t src_stride[], ///< Stride vector (of length kRank - 1)
|
||||
void *device_workspace_ptr = nullptr, ///< Device workspace
|
||||
ElementCompute reduction_identity = ElementCompute(), ///< Reduction identity element
|
||||
ReductionOp reduction_op = ReductionOp(), ///< Reduction operator
|
||||
cudaStream_t stream = nullptr) { ///< CUDA Stream into which all kernels are launched
|
||||
|
||||
// Initial status check
|
||||
if (!good()) {
|
||||
return status;
|
||||
}
|
||||
|
||||
// Guard against null workspace
|
||||
if (workspace_count > 1 && device_workspace_ptr == nullptr) {
|
||||
return Status::kErrorWorkspaceNull;
|
||||
}
|
||||
|
||||
// Define reduction kernel
|
||||
using ReductionKernel = kernel::TensorReductionAffineContiguous<
|
||||
kRank,
|
||||
kReducedRank,
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
ReductionOp,
|
||||
kVectorLength,
|
||||
ElementCompute,
|
||||
kThreads>;
|
||||
|
||||
using FinalReductionKernel = kernel::TensorReductionAffineContiguousFinal<
|
||||
kRank,
|
||||
kReducedRank,
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
ReductionOp,
|
||||
kVectorLength,
|
||||
ElementCompute,
|
||||
kThreads>;
|
||||
|
||||
using Params = typename ReductionKernel::Params;
|
||||
|
||||
// Construct the parameters
|
||||
Params params(
|
||||
extent,
|
||||
dst_ptr,
|
||||
dst_stride,
|
||||
src_ptr,
|
||||
src_stride,
|
||||
static_cast<ElementCompute *>(device_workspace_ptr),
|
||||
workspace_stride(),
|
||||
workspace_count,
|
||||
reduction_op,
|
||||
reduction_identity);
|
||||
|
||||
// Shared memory size
|
||||
int shared_mem_bytes = sizeof(typename ReductionKernel::SharedStorage);
|
||||
|
||||
// Launch the kernel
|
||||
Kernel<ReductionKernel><<< grid_shape, threadblock_shape, shared_mem_bytes, stream >>>(params);
|
||||
|
||||
// Check error condition
|
||||
if (cudaPeekAtLastError() == cudaSuccess) {
|
||||
status = Status::kSuccess;
|
||||
}
|
||||
else {
|
||||
status = Status::kErrorInternal;
|
||||
}
|
||||
|
||||
// Final reduction kernel
|
||||
if (workspace_count) {
|
||||
Kernel<FinalReductionKernel><<< grid_final, threadblock_final, 0, stream >>>(params);
|
||||
}
|
||||
|
||||
// Check error condition
|
||||
if (cudaPeekAtLastError() == cudaSuccess) {
|
||||
status = Status::kSuccess;
|
||||
}
|
||||
else {
|
||||
status = Status::kErrorInternal;
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
/// Helper to use overloaded function call operator
|
||||
Status operator()(
|
||||
ElementOutput *dst_ptr, ///< Pointer to destination tensor
|
||||
int64_t dst_stride[],                         ///< Stride vector (of length kReducedRank)
|
||||
ElementSource const *src_ptr, ///< Pointer to source tensor
|
||||
int64_t src_stride[], ///< Stride vector (of length kRank - 1)
|
||||
void *device_workspace_ptr = nullptr, ///< Pointer to device workspace
|
||||
ElementCompute reduction_identity = ElementCompute(), ///< Reduction identity element
|
||||
ReductionOp reduction_op = ReductionOp(), ///< Reduction operator
|
||||
cudaStream_t stream = nullptr) { ///< CUDA Stream into which all kernels are launched
|
||||
|
||||
return reduce(dst_ptr, dst_stride, src_ptr, src_stride, device_workspace_ptr, reduction_identity, reduction_op, stream);
|
||||
}
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
} // namespace device
|
||||
} // namespace reduction
|
||||
} // namespace cutlass
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
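// Calling this operator directly (illustrative; names and extents assumed): for a packed
// NHWC source of extent (N, H, W, C) reduced over its contiguous C rank, the rank-ordered
// element strides are {H*W*C, W*C, C} for the source and {H*W, W, 1} for the packed NHW
// destination. The TensorReduction wrapper in tensor_reduce.h derives the same strides
// from TensorRef objects.
//
//   using Reduction = cutlass::reduction::device::TensorReductionAffineContiguous<
//     4, 3, float, float, cutlass::plus<float>>;
//
//   Reduction reduction(cutlass::make_Coord(N, H, W, C));
//
//   int64_t src_stride[] = { int64_t(H) * W * C, int64_t(W) * C, C };
//   int64_t dst_stride[] = { int64_t(H) * W, W, 1 };
//
//   cutlass::Status status = reduction.reduce(
//     dst_ptr, dst_stride, src_ptr, src_stride, workspace_ptr);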
|
||||
355
include/cutlass/reduction/device/tensor_reduce_affine_strided.h
Normal file
@@ -0,0 +1,355 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
* provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of
|
||||
* conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
* conditions and the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution.
|
||||
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
* to endorse or promote products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Kernel performing a reduction over one or more ranks of an affine tensor
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cutlass/array.h"
|
||||
#include "cutlass/fast_math.h"
|
||||
#include "cutlass/numeric_types.h"
|
||||
#include "cutlass/numeric_conversion.h"
|
||||
#include "cutlass/device_kernel.h"
|
||||
|
||||
#include "cutlass/reduction/kernel/tensor_reduce_affine_strided.h"
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace cutlass {
|
||||
namespace reduction {
|
||||
namespace device {
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Tensor reduction operator on layouts which are affine
|
||||
template <
|
||||
int Rank, ///< Rank of source tensor (e.g. NDHWC => 5)
|
||||
int ReducedRank, ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
|
||||
typename ElementOutput_,
|
||||
typename ElementSource_,
|
||||
typename ReductionOp_,
|
||||
int VectorLength = 1,
|
||||
typename ElementCompute_ = ElementOutput_,
|
||||
int Threads = 256, ///< Number of participating threads
|
||||
int BatchSize = 4 ///< Number of elements to load per batch
|
||||
>
|
||||
struct TensorReductionAffineStrided {
|
||||
|
||||
static int const kRank = Rank;
|
||||
static int const kReducedRank = ReducedRank;
|
||||
static int const kVectorLength = VectorLength;
|
||||
static int const kInnerRank = kRank - kReducedRank;
|
||||
static int const kThreads = Threads;
|
||||
static int const kBatchSize = BatchSize;
|
||||
|
||||
using ElementOutput = ElementOutput_;
|
||||
using ElementSource = ElementSource_;
|
||||
using ReductionOp = ReductionOp_;
|
||||
using ElementCompute = ElementCompute_;
|
||||
|
||||
//
|
||||
// Data members
|
||||
//
|
||||
|
||||
/// Internal status field
|
||||
Status status;
|
||||
|
||||
/// Extent of tensor in source layout
|
||||
Coord<kRank> extent;
|
||||
|
||||
/// Number of points in the outer index space
|
||||
int64_t outer_count;
|
||||
|
||||
/// Number of elements in the inner index space
|
||||
int64_t inner_count;
|
||||
|
||||
/// Number of workspaces needed
|
||||
int workspace_count;
|
||||
|
||||
/// CUDA Grid shape (.x => contiguous, .y => outer, .z => inner)
|
||||
dim3 grid_shape;
|
||||
|
||||
/// CUDA Threadblock shape (.x => contiguous, .y => outer, .z => inner)
|
||||
dim3 threadblock_shape;
|
||||
|
||||
/// CUDA grid shape for the final reduction step if needed
|
||||
dim3 grid_final;
|
||||
|
||||
/// CUDA threadblock shape for the final reduction step if needed
|
||||
dim3 threadblock_final;
|
||||
|
||||
private:
|
||||
//
|
||||
// Methods
|
||||
//
|
||||
|
||||
/// Helper to compute a power-of-two factor by which 'count' may be divided such that the quotient is less than 2 x 'ext'
|
||||
static int reshape_pow2(int ext, int count) {
|
||||
if (ext > count) {
|
||||
return 1;
|
||||
}
|
||||
int x = 1;
|
||||
for (; count >= ext * 2; ) {
|
||||
count >>= 1;
|
||||
x <<= 1;
|
||||
}
|
||||
return x;
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
/// Default ctor
|
||||
TensorReductionAffineStrided():
|
||||
status(Status::kErrorInvalidProblem),
|
||||
extent(),
|
||||
outer_count(0),
|
||||
inner_count(0),
|
||||
workspace_count(0),
|
||||
grid_shape(0, 0, 0),
|
||||
threadblock_shape(0, 0, 0) { }
|
||||
|
||||
/// Constructor
|
||||
TensorReductionAffineStrided(
|
||||
Coord<kRank> extent_,
|
||||
int target_threadblock_count = 128
|
||||
):
|
||||
status(Status::kSuccess),
|
||||
extent(extent_),
|
||||
outer_count(0),
|
||||
inner_count(0),
|
||||
workspace_count(0) {
|
||||
|
||||
//
|
||||
// Plan the parallel mapping strategy.
|
||||
//
|
||||
|
||||
outer_count = 1;
|
||||
inner_count = 1;
|
||||
|
||||
// Compute number of elements in strided ranks
|
||||
for (int p = 0; p < kReducedRank - 1; ++p) {
|
||||
outer_count *= extent[p];
|
||||
}
|
||||
|
||||
for (int p = 0; p < kInnerRank; ++p) {
|
||||
inner_count *= extent[kReducedRank + p - 1];
|
||||
}
|
||||
|
||||
// Compute plan for the reduction
|
||||
int extent_c = extent[kRank - 1];
|
||||
int vectors_c = (extent_c + kVectorLength - 1) / kVectorLength;
|
||||
|
||||
// Determine CTA shape
|
||||
int cta_width = kThreads * kVectorLength;
|
||||
int cta_ways = reshape_pow2(extent_c, cta_width);
|
||||
int cta_threads_x = kThreads / cta_ways;
|
||||
|
||||
threadblock_shape = dim3(cta_threads_x, 1, std::min(cta_ways, 64));
|
||||
|
||||
// This leads to an error.
|
||||
if (threadblock_shape.z > 1) {
|
||||
if (threadblock_shape.y != 1) {
|
||||
status = Status::kErrorInternal;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Determine grid shape
|
||||
int cta_count_x = (vectors_c + cta_threads_x - 1) / cta_threads_x;
|
||||
int cta_count_y = std::max(1, target_threadblock_count / cta_count_x);
|
||||
|
||||
// Limit the number of CTAs assigned to outer dimension
|
||||
if (int64_t(cta_count_y * threadblock_shape.y) > outer_count) {
|
||||
cta_count_y = int(outer_count + threadblock_shape.y - 1) / threadblock_shape.y;
|
||||
}
|
||||
|
||||
// Limit the number of CTAs assigned to inner dimension
|
||||
int cta_count_z = std::max(1, target_threadblock_count / cta_count_y);
|
||||
if (int64_t(cta_count_z * threadblock_shape.z) > inner_count) {
|
||||
cta_count_z = int(inner_count + threadblock_shape.z - 1) / threadblock_shape.z;
|
||||
}
|
||||
|
||||
grid_shape = dim3(cta_count_x, cta_count_y, cta_count_z);
|
||||
workspace_count = (cta_count_z > 1 ? cta_count_z : 0);
|
||||
|
||||
// Determine shape of final reduction kernel if needed
|
||||
grid_final = dim3(cta_count_x, int(outer_count));
|
||||
threadblock_final = dim3(cta_threads_x, 1, 1);
|
||||
}
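// Illustrative planning example (extent assumed): with kThreads = 256, kVectorLength = 1,
// target_threadblock_count = 128, and extent (2, 8, 8, 64), outer_count = 16,
// inner_count = 8, and extent_c = 64. reshape_pow2(64, 256) gives cta_ways = 4, so
// threadblock_shape = (64, 1, 4). The CTA limits then yield grid_shape = (1, 16, 2);
// since grid_shape.z > 1, workspace_count = 2 and the final step runs with
// grid_final = (1, 16) and threadblock_final = (64, 1, 1).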
|
||||
|
||||
/// Simple check to verify the object is initialized correctly
|
||||
bool good() const {
|
||||
return status == Status::kSuccess;
|
||||
}
|
||||
|
||||
/// Size of one CTA's workspace
|
||||
int64_t workspace_stride() const {
|
||||
|
||||
// Error condition
|
||||
if (!good()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int vector_size_bytes = kVectorLength * sizeof_bits<ElementCompute>::value / 8;
|
||||
|
||||
return extent[kRank - 1] * vector_size_bytes;
|
||||
}
|
||||
|
||||
/// Returns the size (in bytes) of a temporary workspace needed for reduction across CTAs
|
||||
int64_t workspace_size() const {
|
||||
|
||||
// Error condition
|
||||
if (!good()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// No reduction across CTAs
|
||||
if (grid_shape.z == 1) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return workspace_stride() * outer_count * grid_shape.z;
|
||||
}
|
||||
|
||||
/// Performs a reduction
|
||||
Status reduce(
|
||||
ElementOutput *dst_ptr, ///< Pointer to destination tensor
|
||||
int64_t dst_stride[], ///< Stride vector (of length kReducedRank - 1)
|
||||
ElementSource const *src_ptr, ///< Pointer to source tensor
|
||||
int64_t src_stride[], ///< Stride vector (of length kRank - 1)
|
||||
void *device_workspace_ptr = nullptr, ///< Device workspace
|
||||
ElementCompute reduction_identity = ElementCompute(),   ///< Reduction identity
|
||||
ReductionOp reduction_op = ReductionOp(), ///< Reduction operator
|
||||
cudaStream_t stream = nullptr) { ///< CUDA Stream into which all kernels are launched
|
||||
|
||||
// Initial status check
|
||||
if (!good()) {
|
||||
return status;
|
||||
}
|
||||
|
||||
// Guard against null workspace
|
||||
if (workspace_count > 1 && device_workspace_ptr == nullptr) {
|
||||
return Status::kErrorWorkspaceNull;
|
||||
}
|
||||
|
||||
// Define reduction kernel
|
||||
using ReductionKernel = kernel::TensorReductionAffineStrided<
|
||||
kRank,
|
||||
kReducedRank,
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
ReductionOp,
|
||||
kVectorLength,
|
||||
ElementCompute,
|
||||
kThreads>;
|
||||
|
||||
using FinalReductionKernel = kernel::TensorReductionAffineStridedFinal<
|
||||
kRank,
|
||||
kReducedRank,
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
ReductionOp,
|
||||
kVectorLength,
|
||||
ElementCompute,
|
||||
kThreads>;
|
||||
|
||||
using Params = typename ReductionKernel::Params;
|
||||
|
||||
// Construct the parameters
|
||||
Params params(
|
||||
extent,
|
||||
dst_ptr,
|
||||
dst_stride,
|
||||
src_ptr,
|
||||
src_stride,
|
||||
static_cast<ElementCompute *>(device_workspace_ptr),
|
||||
workspace_stride(),
|
||||
workspace_count,
|
||||
reduction_op,
|
||||
reduction_identity);
|
||||
|
||||
// Shared memory size
|
||||
int shared_mem_bytes = sizeof(typename ReductionKernel::SharedStorage);
|
||||
|
||||
// Launch the kernel
|
||||
Kernel<ReductionKernel><<< grid_shape, threadblock_shape, shared_mem_bytes, stream >>>(params);
|
||||
|
||||
// Check error condition
|
||||
if (cudaPeekAtLastError() == cudaSuccess) {
|
||||
status = Status::kSuccess;
|
||||
}
|
||||
else {
|
||||
status = Status::kErrorInternal;
|
||||
}
|
||||
|
||||
// Final reduction kernel
|
||||
if (workspace_count) {
|
||||
|
||||
Kernel<FinalReductionKernel><<< grid_final, threadblock_final, 0, stream >>>(params);
|
||||
|
||||
// Check error condition
|
||||
if (cudaPeekAtLastError() == cudaSuccess) {
|
||||
status = Status::kSuccess;
|
||||
}
|
||||
else {
|
||||
status = Status::kErrorInternal;
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
/// Helper to use overloaded function call operator
|
||||
Status operator()(
|
||||
ElementOutput *dst_ptr, ///< Pointer to destination tensor
|
||||
int64_t dst_stride[], ///< Stride vector (of length kReducedRank - 1)
|
||||
ElementSource const *src_ptr, ///< Pointer to source tensor
|
||||
int64_t src_stride[], ///< Stride vector (of length kRank - 1)
|
||||
void *device_workspace_ptr = nullptr, ///< Pointer to device workspace
|
||||
ElementCompute reduction_identity = ElementCompute(),   ///< Reduction identity
|
||||
ReductionOp reduction_op = ReductionOp(), ///< Reduction operator
|
||||
cudaStream_t stream = nullptr) { ///< CUDA Stream into which all kernels are launched
|
||||
|
||||
return reduce(
|
||||
dst_ptr,
|
||||
dst_stride,
|
||||
src_ptr,
|
||||
src_stride,
|
||||
device_workspace_ptr,
|
||||
reduction_identity,
|
||||
reduction_op,
|
||||
stream);
|
||||
}
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
} // namespace device
|
||||
} // namespace reduction
|
||||
} // namespace cutlass
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@@ -0,0 +1,600 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
* provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of
|
||||
* conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
* conditions and the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution.
|
||||
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
* to endorse or promote products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Kernel performing a reduction over one or more ranks of an affine tensor
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cutlass/array.h"
|
||||
#include "cutlass/fast_math.h"
|
||||
#include "cutlass/numeric_types.h"
|
||||
#include "cutlass/numeric_conversion.h"
|
||||
#include "cutlass/device_kernel.h"
|
||||
|
||||
#include "cutlass/reduction/thread/reduction_operators.h"
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace cutlass {
|
||||
namespace reduction {
|
||||
namespace kernel {
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Parameters structure
|
||||
template <
|
||||
int Rank, ///< Rank of source tensor (e.g. NDHWC => 5)
|
||||
int ReducedRank, ///< Rank of reduced tensor (i.e. number of outer ranks)
|
||||
typename ElementOutput, ///< Data type of output tensor
|
||||
typename ElementSource, ///< Data type of source tensor
|
||||
typename ReductionOp, ///< Reduction operator
|
||||
int VectorLength = 1, ///< Vector length for memory
|
||||
typename ElementCompute = ElementOutput, ///< Internal compute type - input type of reduction operation
|
||||
int Threads = 256, ///< Number of participating threads
|
||||
int BatchSize = 4 ///< Number of elements to load per batch
|
||||
>
|
||||
struct TensorReductionAffineContiguousParams {
|
||||
|
||||
static int const kRank = Rank;
|
||||
static int const kReducedRank = ReducedRank;
|
||||
static int const kVectorLength = VectorLength;
|
||||
static int const kInnerRank = kRank - kReducedRank;
|
||||
static int const kThreads = Threads;
|
||||
static int const kBatchSize = BatchSize;
|
||||
|
||||
Coord<kRank> extent; /// Extent of source tensor
|
||||
FastDivmodU64 divmod[kRank - 1]; /// FastDivmod by each strided rank
|
||||
int64_t dst_stride[kReducedRank]; /// stride (units of bytes) - I, J
|
||||
int64_t src_stride[kRank - 1]; /// stride (units of bytes) - I, J, K
|
||||
int64_t workspace_stride; /// stride (units of bytes) between workspaces
|
||||
int workspace_count; /// number of workspaces
|
||||
|
||||
uint64_t inner_count; /// Number of elements in reduced index space
|
||||
uint64_t outer_count; /// Number of elements in outer index space
|
||||
|
||||
ElementOutput * destination; /// Pointer to output tensor of rank kReducedRank
|
||||
ElementSource const * source; /// Pointer to source tensor of rank kRank
|
||||
ReductionOp reduction_op; /// Reduction operator
|
||||
ElementCompute reduction_identity; /// Identity element used by reduction operator
|
||||
ElementCompute *device_workspace; /// Pointer to device workspace for inter-CTA reductions
|
||||
|
||||
//
|
||||
// Methods
|
||||
//
|
||||
|
||||
/// Ctor
|
||||
CUTLASS_HOST_DEVICE
|
||||
TensorReductionAffineContiguousParams() {
|
||||
|
||||
}
|
||||
|
||||
/// Ctor
|
||||
TensorReductionAffineContiguousParams(
|
||||
Coord<kRank> extent_, ///< Extent of source tensor
|
||||
ElementOutput * dst_ptr_, ///< Output tensor data
|
||||
int64_t dst_stride_[], ///< Stride (units of elements)
|
||||
ElementSource const * src_ptr_, ///< Source tensor data
|
||||
int64_t src_stride_[], ///< Stride (units of elements)
|
||||
ElementCompute *device_workspace_, ///< Pointer to device workspace for inter-CTA reductions
|
||||
int64_t workspace_stride_, ///< Stride between workspaces
|
||||
int workspace_count_, ///< Number of workspaces
|
||||
ReductionOp reduction_op_, ///< Reduction operator
|
||||
ElementCompute reduction_identity_ = ElementCompute() ///< Identity element used by reduction operator
|
||||
):
|
||||
extent(extent_),
|
||||
inner_count(1),
|
||||
outer_count(1),
|
||||
destination(dst_ptr_),
|
||||
source(src_ptr_),
|
||||
device_workspace(device_workspace_),
|
||||
workspace_stride(workspace_stride_),
|
||||
workspace_count(workspace_count_),
|
||||
reduction_op(reduction_op_),
|
||||
reduction_identity(reduction_identity_) {
|
||||
|
||||
// Initialize divisors for fast div-mod
|
||||
for (int p = 1; p < kRank; ++p) {
|
||||
divmod[p - 1] = FastDivmodU64(uint64_t(extent[p]));
|
||||
}
|
||||
|
||||
int input_size_bits = sizeof_bits<ElementSource>::value;
|
||||
int output_size_bits = sizeof_bits<ElementOutput>::value;
|
||||
|
||||
// Compute strides in units of bytes
|
||||
for (int p = 0; p < kReducedRank; ++p) {
|
||||
dst_stride[p] = dst_stride_[p] * output_size_bits / 8;
|
||||
}
|
||||
|
||||
for (int p = 0; p < kRank - 1; ++p) {
|
||||
src_stride[p] = src_stride_[p] * input_size_bits / 8;
|
||||
}
|
||||
|
||||
// Compute number of elements in strided ranks
|
||||
for (int p = 0; p < kReducedRank; ++p) {
|
||||
outer_count *= uint64_t(extent[p]);
|
||||
}
|
||||
|
||||
for (int p = 0; p < kInnerRank; ++p) {
|
||||
inner_count *= uint64_t(extent[kRank - 1 - p]);
|
||||
}
|
||||
}
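// For example (element type and strides assumed): with ElementSource = half_t (16 bits)
// and rank-ordered element strides {H*W*C, W*C, C}, src_stride is stored as
// {2*H*W*C, 2*W*C, 2*C} bytes. The divmod array caches fast divisors for extent[1..kRank-1],
// which the CoordinateDecomposition calls in the kernels below rely on.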
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Kernel to reduce a tensor with affine layout over a set of ranks *INCLUDING* the contiguous
|
||||
/// rank. This leads to favorable vectorized memory accesses over the contiguous rank.
|
||||
template <
|
||||
int Rank, ///< Rank of source tensor (e.g. NDHWC => 5)
|
||||
int ReducedRank, ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
|
||||
typename ElementOutput, ///< Data type of output tensor
|
||||
typename ElementSource, ///< Data type of source tensor
|
||||
typename ReductionOp, ///< Reduction operator
|
||||
int VectorLength = 1, ///< Vector length for memory
|
||||
typename ElementCompute = ElementOutput, ///< Internal compute type - input type of reduction operation
|
||||
int Threads = 256, ///< Number of participating threads
|
||||
int BatchSize = 4 ///< Number of elements to load per batch
|
||||
>
|
||||
class TensorReductionAffineContiguous {
|
||||
public:
|
||||
|
||||
static int const kRank = Rank;
|
||||
static int const kReducedRank = ReducedRank;
|
||||
static int const kVectorLength = VectorLength;
|
||||
static int const kInnerRank = kRank - kReducedRank;
|
||||
static int const kThreads = Threads;
|
||||
static int const kBatchSize = BatchSize;
|
||||
using ComputeFragment = Array<ElementCompute, VectorLength>;
|
||||
using SourceFragment = AlignedArray<ElementSource, VectorLength>;
|
||||
using OutputFragment = AlignedArray<ElementOutput, VectorLength>;
|
||||
|
||||
/// Shared memory allocation used for reduction within the CTA
|
||||
struct SharedStorage {
|
||||
Array<ElementCompute, kThreads * kVectorLength> workspace;
|
||||
};
|
||||
|
||||
/// Parameters structure
|
||||
using Params = TensorReductionAffineContiguousParams<
|
||||
Rank,
|
||||
ReducedRank,
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
ReductionOp,
|
||||
VectorLength,
|
||||
ElementCompute,
|
||||
Threads,
|
||||
BatchSize
|
||||
>;
|
||||
|
||||
private:
|
||||
|
||||
/// Computes the coordinate and offset of a given linear index
|
||||
CUTLASS_DEVICE
|
||||
void compute_inner_coord_and_offset_(
|
||||
Params const ¶ms,
|
||||
Coord<kInnerRank> & coord,
|
||||
int64_t &src_offset,
|
||||
uint64_t linear_idx) const {
|
||||
|
||||
// Decompose into a coordinate of rank <kInnerRank>
|
||||
coord = CoordinateDecomposition<kInnerRank>(linear_idx, ¶ms.divmod[kRank - kInnerRank]);
|
||||
|
||||
// Compute an offset using the source stride
|
||||
src_offset = 0;
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 0; i < kInnerRank - 1; ++i) {
|
||||
src_offset += coord[i] * params.src_stride[kReducedRank + i];
|
||||
}
|
||||
src_offset += coord[kInnerRank - 1] * sizeof_bits<ElementSource>::value / 8;
|
||||
}
|
||||
|
||||
/// Computes the coordinate and offset of a given linear index
|
||||
CUTLASS_DEVICE
|
||||
void compute_outer_coord_and_offset_(
|
||||
Params const ¶ms,
|
||||
Coord<kReducedRank> & coord,
|
||||
int64_t &dst_offset,
|
||||
int64_t &src_offset,
|
||||
uint64_t linear_idx) const {
|
||||
|
||||
// Decompose into coordinate of rank <kReducedRank>
|
||||
coord = CoordinateDecomposition<kReducedRank>(linear_idx, params.divmod);
|
||||
|
||||
// Compute offsets using destination and source strides
|
||||
dst_offset = 0;
|
||||
src_offset = 0;
|
||||
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 0; i < kReducedRank; ++i) {
|
||||
dst_offset += params.dst_stride[i] * coord[i];
|
||||
src_offset += params.src_stride[i] * coord[i];
|
||||
}
|
||||
}
|
||||
|
||||
/// Reduces over the reduction indices yielding a single element
|
||||
CUTLASS_DEVICE
|
||||
ElementCompute reduce_indices_(
|
||||
Params const ¶ms,
|
||||
ElementCompute *threadblock_workspace,
|
||||
char const *src_byte_ptr,
|
||||
int coord_c) {
|
||||
|
||||
NumericArrayConverter<ElementCompute, ElementSource, VectorLength> convert_source;
|
||||
ReductionOp reduction_op(params.reduction_op);
|
||||
|
||||
//
|
||||
// Early exit or initialize to identity element
|
||||
//
|
||||
if (!params.inner_count) {
|
||||
return params.reduction_identity;
|
||||
}
|
||||
|
||||
ComputeFragment accumulator;
|
||||
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 0; i < accumulator.size(); ++i) {
|
||||
accumulator[i] = params.reduction_identity;
|
||||
}
|
||||
|
||||
// Compute the coordinate of the first access
|
||||
int64_t src_byte_offset = 0;
|
||||
Coord<kInnerRank> coord;
|
||||
|
||||
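// Linearized starting position of this thread in the inner (reduced) index space:
// threadIdx.x and threadIdx.z enumerate vectors within the CTA, and blockIdx.z spreads
// cooperating CTAs across the same space; each position covers kVectorLength elements.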
uint64_t linear_idx = (threadIdx.x + blockDim.x * threadIdx.z + blockDim.x * blockIdx.z * blockDim.z) * kVectorLength;
|
||||
compute_inner_coord_and_offset_(params, coord, src_byte_offset, linear_idx);
|
||||
|
||||
// Load the first vector
|
||||
SourceFragment source_fragment[kBatchSize];
|
||||
|
||||
bool not_done = true;
|
||||
|
||||
// Iterate over vectors in a linearized reduction index space
|
||||
while (not_done) {
|
||||
|
||||
bool guards[kBatchSize];
|
||||
|
||||
// Issue a batch of loads
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int b = 0; b < kBatchSize; ++b) {
|
||||
|
||||
if (linear_idx < params.inner_count) {
|
||||
source_fragment[b] = *reinterpret_cast<SourceFragment const *>(src_byte_ptr + src_byte_offset);
|
||||
guards[b] = true;
|
||||
}
|
||||
else {
|
||||
guards[b] = false;
|
||||
not_done = false;
|
||||
}
|
||||
|
||||
linear_idx += (blockDim.z * gridDim.z * blockDim.x) * kVectorLength;
|
||||
compute_inner_coord_and_offset_(params, coord, src_byte_offset, linear_idx);
|
||||
}
|
||||
|
||||
// Perform a batch of reduction operations
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int b = 0; b < kBatchSize; ++b) {
|
||||
if (guards[b]) {
|
||||
auto cvt = convert_source(source_fragment[b]);
|
||||
|
||||
accumulator = cutlass::reduction::thread::detail::ApplyArrayOperator(
|
||||
reduction_op,
|
||||
accumulator,
|
||||
cvt);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Reduction of vectors to scalar
|
||||
//
|
||||
|
||||
ElementCompute reduced_accumulator = accumulator[0];
|
||||
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 1; i < kVectorLength; ++i) {
|
||||
reduced_accumulator = reduction_op(reduced_accumulator, accumulator[i]);
|
||||
}
|
||||
|
||||
//
|
||||
// Reduction within CTA across threadIdx.xz => threadIdx{.x = 0, .z = 0}
|
||||
//
|
||||
// This re-arranges data so threadIdx.y is effectively a row index and threadIdx.xz is a column
|
||||
//
|
||||
|
||||
int thread_count = blockDim.x * blockDim.z;
|
||||
int thread_j = threadIdx.x + blockDim.x * threadIdx.z;
|
||||
int thread_i = threadIdx.y;
|
||||
|
||||
ElementCompute *frag_ptr = reinterpret_cast<ElementCompute *>(threadblock_workspace) + thread_i * thread_count;
|
||||
|
||||
frag_ptr[thread_j] = reduced_accumulator;
|
||||
|
||||
//
|
||||
// Reduce
|
||||
//
|
||||
CUTLASS_PRAGMA_NO_UNROLL
|
||||
while (thread_count > 1) {
|
||||
thread_count /= 2;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (thread_j < thread_count) {
|
||||
ElementCompute other = frag_ptr[thread_j + thread_count];
|
||||
|
||||
reduced_accumulator = reduction_op(reduced_accumulator, other);
|
||||
|
||||
frag_ptr[thread_j] = reduced_accumulator;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
|
||||
return reduced_accumulator;
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
/// Perform a reduction
|
||||
CUTLASS_DEVICE
|
||||
void operator()(Params const ¶ms, SharedStorage &shared_storage) {
|
||||
|
||||
int coord_c = (blockIdx.x * blockDim.x + threadIdx.x) * kVectorLength;
|
||||
|
||||
char const * src_byte_ptr = reinterpret_cast<char const *>(params.source);
|
||||
char * dst_byte_ptr = nullptr;
|
||||
|
||||
// If performing a reduction across CTAs, redirect output to device workspace
|
||||
if (gridDim.z == 1) {
|
||||
dst_byte_ptr = reinterpret_cast<char *>(params.destination);
|
||||
}
|
||||
else {
|
||||
dst_byte_ptr = reinterpret_cast<char *>(params.device_workspace);
|
||||
}
|
||||
|
||||
uint64_t idx_linear = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
// Use modulo division to compute location
|
||||
Coord<kReducedRank> outer_coord;
|
||||
int64_t dst_byte_offset;
|
||||
int64_t src_byte_offset;
|
||||
|
||||
compute_outer_coord_and_offset_(
|
||||
params,
|
||||
outer_coord,
|
||||
dst_byte_offset,
|
||||
src_byte_offset,
|
||||
idx_linear);
|
||||
|
||||
if (gridDim.z == 1) {
|
||||
|
||||
/// Complete the reduction with no workspace
|
||||
while (idx_linear < params.outer_count) {
|
||||
|
||||
ElementCompute result = reduce_indices_(
|
||||
params,
|
||||
shared_storage.workspace.data(),
|
||||
src_byte_ptr + src_byte_offset,
|
||||
coord_c);
|
||||
|
||||
// Store the result after possible final reduction within the CTA
|
||||
if (threadIdx.z == 0 && threadIdx.x == 0) {
|
||||
|
||||
// Convert to output type and store
|
||||
NumericConverter<ElementOutput, ElementCompute> convert_output;
|
||||
ElementOutput cvt = convert_output(result);
|
||||
|
||||
*reinterpret_cast<ElementOutput *>(dst_byte_ptr + dst_byte_offset) = cvt;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Update indices and pointers
|
||||
idx_linear += gridDim.y * blockDim.y;
|
||||
|
||||
compute_outer_coord_and_offset_(
|
||||
params,
|
||||
outer_coord,
|
||||
dst_byte_offset,
|
||||
src_byte_offset,
|
||||
idx_linear);
|
||||
|
||||
} // while
|
||||
}
|
||||
else {
|
||||
|
||||
/// Complete the reduction with workspace
|
||||
while (idx_linear < params.outer_count) {
|
||||
|
||||
ElementCompute result = reduce_indices_(
|
||||
params,
|
||||
shared_storage.workspace.data(),
|
||||
src_byte_ptr + src_byte_offset,
|
||||
coord_c);
|
||||
|
||||
int64_t byte_offset =
|
||||
blockIdx.z * params.workspace_stride + idx_linear * sizeof_bits<ElementCompute>::value / 8;
|
||||
|
||||
// Store the result for final reduction
|
||||
if (threadIdx.z == 0 && threadIdx.x == 0) {
|
||||
*reinterpret_cast<ElementCompute *>(dst_byte_ptr + byte_offset) = result;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Update indices and pointers
|
||||
idx_linear += gridDim.y * blockDim.y;
|
||||
|
||||
compute_outer_coord_and_offset_(
|
||||
params,
|
||||
outer_coord,
|
||||
dst_byte_offset,
|
||||
src_byte_offset,
|
||||
idx_linear);
|
||||
} // while
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Kernel to perform final reduction
|
||||
template <
|
||||
int Rank, ///< Rank of source tensor (e.g. NDHWC => 5)
|
||||
int ReducedRank, ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
|
||||
typename ElementOutput, ///< Data type of output tensor
|
||||
typename ElementSource, ///< Data type of source tensor
|
||||
typename ReductionOp, ///< Reduction operator
|
||||
int VectorLength = 1, ///< Vector length for memory
|
||||
typename ElementCompute = ElementOutput, ///< Internal compute type - input type of reduction operation
|
||||
int Threads = 256, ///< Number of participating threads
|
||||
int BatchSize = 4 ///< Number of elements to load per batch
|
||||
>
|
||||
class TensorReductionAffineContiguousFinal {
|
||||
public:
|
||||
|
||||
static int const kRank = Rank;
|
||||
static int const kReducedRank = ReducedRank;
|
||||
static int const kVectorLength = VectorLength;
|
||||
static int const kInnerRank = kRank - kReducedRank;
|
||||
static int const kThreads = Threads;
|
||||
static int const kBatchSize = BatchSize;
|
||||
|
||||
/// Shared memory
|
||||
struct SharedStorage { };
|
||||
|
||||
/// Parameters structure
|
||||
using Params = TensorReductionAffineContiguousParams<
|
||||
Rank,
|
||||
ReducedRank,
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
ReductionOp,
|
||||
VectorLength,
|
||||
ElementCompute,
|
||||
Threads,
|
||||
BatchSize
|
||||
>;
|
||||
|
||||
private:
|
||||
|
||||
/// Computes the coordinate and offset of a given linear index
|
||||
CUTLASS_DEVICE
|
||||
void compute_outer_coord_and_offset_(
|
||||
Params const ¶ms,
|
||||
Coord<kReducedRank> & coord,
|
||||
int64_t &dst_offset,
|
||||
uint64_t linear_idx) const {
|
||||
|
||||
// Decompose into coordinate of rank <kReducedRank>
|
||||
coord = CoordinateDecomposition<kReducedRank>(linear_idx, params.divmod);
|
||||
|
||||
// Compute offsets using destination and source strides
|
||||
dst_offset = 0;
|
||||
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 0; i < kReducedRank; ++i) {
|
||||
dst_offset += params.dst_stride[i] * coord[i];
|
||||
}
|
||||
}
|
||||
|
||||
/// Reduces over the reduction indices
|
||||
CUTLASS_DEVICE
|
||||
ElementCompute reduce_indices_(
|
||||
Params const ¶ms,
|
||||
ElementCompute const *device_workspace) {
|
||||
|
||||
ReductionOp reduction_op(params.reduction_op);
|
||||
char const *src_byte_ptr = reinterpret_cast<char const *>(device_workspace);
|
||||
|
||||
// Accumulated output
|
||||
ElementCompute accumulator = params.reduction_identity;
|
||||
|
||||
for (int iter = 0; iter < params.workspace_count; ++iter) {
|
||||
ElementCompute workspace_item = *reinterpret_cast<ElementCompute const *>(src_byte_ptr);
|
||||
|
||||
accumulator = reduction_op(accumulator, workspace_item);
|
||||
|
||||
src_byte_ptr += params.workspace_stride;
|
||||
}
|
||||
|
||||
return accumulator;
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
//
|
||||
// Methods
|
||||
//
|
||||
|
||||
/// Perform a reduction
|
||||
CUTLASS_DEVICE
|
||||
void operator()(Params const ¶ms, SharedStorage &shared_storage) {
|
||||
|
||||
uint64_t idx_linear = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
char * dst_byte_ptr = reinterpret_cast<char *>(params.destination);
|
||||
|
||||
// Use modulo division to compute location
|
||||
Coord<kReducedRank> outer_coord;
|
||||
int64_t dst_byte_offset;
|
||||
|
||||
compute_outer_coord_and_offset_(
|
||||
params,
|
||||
outer_coord,
|
||||
dst_byte_offset,
|
||||
idx_linear);
|
||||
|
||||
/// Complete the reduction
|
||||
while (idx_linear < params.outer_count) {
|
||||
|
||||
ElementCompute result = reduce_indices_(params, params.device_workspace + idx_linear);
|
||||
|
||||
// Convert to output type and store
|
||||
NumericConverter<ElementOutput, ElementCompute> convert_output;
|
||||
|
||||
*reinterpret_cast<ElementOutput *>(dst_byte_ptr + dst_byte_offset) = convert_output(result);
|
||||
|
||||
// Update indices and pointers
|
||||
idx_linear += gridDim.x * blockDim.x;
|
||||
|
||||
compute_outer_coord_and_offset_(
|
||||
params,
|
||||
outer_coord,
|
||||
dst_byte_offset,
|
||||
idx_linear);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
} // namespace kernel
|
||||
} // namespace reduction
|
||||
} // namespace cutlass
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
635
include/cutlass/reduction/kernel/tensor_reduce_affine_strided.h
Normal file
@@ -0,0 +1,635 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
* provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of
|
||||
* conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
* conditions and the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution.
|
||||
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
* to endorse or promote products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Kernel performing a reduction over one or more ranks of an affine tensor
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cutlass/array.h"
|
||||
#include "cutlass/fast_math.h"
|
||||
#include "cutlass/numeric_types.h"
|
||||
#include "cutlass/numeric_conversion.h"
|
||||
#include "cutlass/device_kernel.h"
|
||||
|
||||
#include "cutlass/reduction/thread/reduction_operators.h"
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace cutlass {
|
||||
namespace reduction {
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace kernel {
|
||||
|
||||
/// Parameters structure
|
||||
template <
|
||||
int Rank, ///< Rank of source tensor (e.g. NDHWC => 5)
|
||||
int ReducedRank, ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
|
||||
typename ElementOutput, ///< Data type of output tensor
|
||||
typename ElementSource, ///< Data type of source tensor
|
||||
typename ReductionOp, ///< Reduction operator
|
||||
int VectorLength = 1, ///< Vector length for memory
|
||||
typename ElementCompute = ElementOutput, ///< Internal compute type - input type of reduction operation
|
||||
int Threads = 256, ///< Number of participating threads
|
||||
int BatchSize = 4 ///< Number of elements to load per batch
|
||||
>
|
||||
struct TensorReductionAffineStridedParams {
|
||||
|
||||
static int const kRank = Rank;
|
||||
static int const kReducedRank = ReducedRank;
|
||||
static int const kVectorLength = VectorLength;
|
||||
static int const kInnerRank = kRank - kReducedRank;
|
||||
static int const kThreads = Threads;
|
||||
static int const kBatchSize = BatchSize;
|
||||
|
||||
Coord<kRank> extent; /// Extent of source tensor
|
||||
FastDivmodU64 divmod[kRank - 2]; /// FastDivmod by each strided rank
|
||||
int64_t dst_stride[kReducedRank - 1]; /// stride (units of bytes) - I, J
|
||||
int64_t src_stride[kRank - 1]; /// stride (units of bytes) - I, J, K
|
||||
int64_t workspace_stride; /// stride (units of bytes) between workspace
|
||||
int64_t workspace_outer_stride; /// stride (units of bytes) between 'rows' of the workspace
|
||||
int workspace_count; /// number of workspaces
|
||||
|
||||
uint64_t inner_count; /// Number of elements in reduced index space
|
||||
uint64_t outer_count; /// Number of elements in outer index space
|
||||
|
||||
ElementOutput * destination; /// Pointer to output tensor of rank kReducedRank
|
||||
ElementSource const * source;         /// Pointer to source tensor of rank kRank
|
||||
ReductionOp reduction_op; /// Reduction operator
|
||||
ElementCompute reduction_identity; /// Identity element for reduction operator
|
||||
ElementCompute *device_workspace; /// Pointer to device workspace for inter-CTA reductions
|
||||
|
||||
//
|
||||
// Methods
|
||||
//
|
||||
|
||||
/// Ctor
|
||||
CUTLASS_HOST_DEVICE
|
||||
TensorReductionAffineStridedParams() {
|
||||
|
||||
}
|
||||
|
||||
/// Ctor
|
||||
TensorReductionAffineStridedParams(
|
||||
Coord<kRank> extent_, ///< Extent of source tensor
|
||||
ElementOutput * dst_ptr_, ///< Output tensor data
|
||||
int64_t dst_stride_[], ///< Stride (units of elements)
|
||||
ElementSource const * src_ptr_, ///< Source tensor data
|
||||
int64_t src_stride_[], ///< Stride (units of elements)
|
||||
ElementCompute *device_workspace_, ///< Pointer to device workspace for inter-CTA reductions
|
||||
int64_t workspace_stride_, ///< Stride between workspaces
|
||||
int workspace_count_, ///< Number of workspaces
|
||||
ReductionOp reduction_op_, ///< Reduction operator
|
||||
ElementCompute reduction_identity_ = ElementCompute() ///< Identity element for reduction operator
|
||||
):
|
||||
extent(extent_),
|
||||
inner_count(1),
|
||||
outer_count(1),
|
||||
destination(dst_ptr_),
|
||||
source(src_ptr_),
|
||||
device_workspace(device_workspace_),
|
||||
workspace_outer_stride(0),
|
||||
workspace_stride(workspace_stride_),
|
||||
workspace_count(workspace_count_),
|
||||
reduction_op(reduction_op_),
|
||||
reduction_identity(reduction_identity_) {
|
||||
|
||||
// Initialize divisors for fast div-mod
|
||||
for (int p = 1; p < kRank - 1; ++p) {
|
||||
divmod[p - 1] = FastDivmodU64(uint64_t(extent[p]));
|
||||
}
|
||||
|
||||
int input_size_bits = sizeof_bits<ElementSource>::value;
|
||||
int output_size_bits = sizeof_bits<ElementOutput>::value;
|
||||
|
||||
workspace_outer_stride = workspace_stride * workspace_count;
|
||||
|
||||
// Compute strides in units of bytes
|
||||
for (int p = 0; p < kReducedRank - 1; ++p) {
|
||||
dst_stride[p] = dst_stride_[p] * output_size_bits / 8;
|
||||
}
|
||||
|
||||
for (int p = 0; p < kRank - 1; ++p) {
|
||||
src_stride[p] = src_stride_[p] * input_size_bits / 8;
|
||||
}
|
||||
|
||||
// Compute number of elements in strided ranks
|
||||
for (int p = 0; p < kReducedRank - 1; ++p) {
|
||||
outer_count *= uint64_t(extent[p]);
|
||||
}
|
||||
|
||||
for (int p = 0; p < kInnerRank; ++p) {
|
||||
inner_count *= uint64_t(extent[kReducedRank + p - 1]);
|
||||
}
|
||||
}
|
||||
};
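
/////////////////////////////////////////////////////////////////////////////////////////////////

// Illustrative sketch (disabled): one plausible way to populate the parameters above for a
// rank-4 NHWC tensor reduced over H and W (kRank = 4, kReducedRank = 2) with a 'plus'
// reduction. The pointers, extents, and workspace quantities are assumptions chosen for the
// example and are not part of this header.
#if 0
  using ExampleReductionOp = plus<float>;
  using ExampleParams = TensorReductionAffineStridedParams<4, 2, float, float, ExampleReductionOp>;

  Coord<4> extent = make_Coord(8, 14, 14, 32);            // N, H, W, C
  int64_t dst_stride[] = { 32 };                          // output viewed as N x 1 x 1 x C
  int64_t src_stride[] = { 14 * 14 * 32, 14 * 32, 32 };   // NHWC strides (units of elements)

  ExampleParams params(
    extent,
    dst_ptr, dst_stride,        // dst_ptr: assumed device pointer to the reduced tensor
    src_ptr, src_stride,        // src_ptr: assumed device pointer to the source tensor
    workspace_ptr,              // assumed device workspace for inter-CTA partial reductions
    workspace_stride,           // bytes between consecutive partial-reduction workspaces
    workspace_count,            // number of partials contributing to each output element
    ExampleReductionOp(),
    float());                   // identity element of 'plus'
#endif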
|
||||
|
||||
/// Kernel to reduce a tensor with affine layout over a set of ranks *EXCLUDING* the contiguous
|
||||
/// rank. This leads to favorable vectorized memory accesses over the contiguous rank.
|
||||
template <
|
||||
int Rank, ///< Rank of source tensor (e.g. NDHWC => 5)
|
||||
int ReducedRank, ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
|
||||
typename ElementOutput, ///< Data type of output tensor
|
||||
typename ElementSource, ///< Data type of source tensor
|
||||
typename ReductionOp, ///< Reduction operator
|
||||
int VectorLength = 1, ///< Vector length for memory
|
||||
typename ElementCompute = ElementOutput, ///< Internal compute type - input type of reduction operation
|
||||
int Threads = 256, ///< Number of participating threads
|
||||
int BatchSize = 4 ///< Number of elements to load per batch
|
||||
>
|
||||
class TensorReductionAffineStrided {
|
||||
public:
|
||||
|
||||
static int const kRank = Rank;
|
||||
static int const kReducedRank = ReducedRank;
|
||||
static int const kVectorLength = VectorLength;
|
||||
static int const kInnerRank = kRank - kReducedRank;
|
||||
static int const kThreads = Threads;
|
||||
static int const kBatchSize = BatchSize;
|
||||
using ComputeFragment = Array<ElementCompute, VectorLength>;
|
||||
using SourceFragment = AlignedArray<ElementSource, VectorLength>;
|
||||
using OutputFragment = AlignedArray<ElementOutput, VectorLength>;
|
||||
|
||||
/// Shared memory allocation used for reduction within the CTA
|
||||
struct SharedStorage {
|
||||
Array<ElementCompute, kThreads * kVectorLength> workspace;
|
||||
};
|
||||
|
||||
/// Parameters structure
|
||||
using Params = TensorReductionAffineStridedParams<
|
||||
Rank,
|
||||
ReducedRank,
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
ReductionOp,
|
||||
VectorLength,
|
||||
ElementCompute,
|
||||
Threads,
|
||||
BatchSize
|
||||
>;
|
||||
|
||||
private:
|
||||
|
||||
/// Computes the coordinate and offset of a given linear index
|
||||
CUTLASS_DEVICE
|
||||
void compute_inner_coord_and_offset_(
|
||||
Params const ¶ms,
|
||||
Coord<kInnerRank> & coord,
|
||||
int64_t &src_offset,
|
||||
uint64_t linear_idx) const {
|
||||
|
||||
// Decompose into coordinate
|
||||
coord = CoordinateDecomposition<kInnerRank>(linear_idx, &params.divmod[kReducedRank - 1]);
|
||||
|
||||
// Compute linear offset
|
||||
src_offset = 0;
|
||||
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 0; i < kInnerRank; ++i) {
|
||||
src_offset += params.src_stride[kReducedRank + i - 1] * coord[i];
|
||||
}
|
||||
}
|
||||
|
||||
/// Computes the coordinate and offset of a given linear index
|
||||
CUTLASS_DEVICE
|
||||
void compute_outer_coord_and_offset_(
|
||||
Params const ¶ms,
|
||||
Coord<kReducedRank - 1> & coord,
|
||||
int64_t &dst_offset,
|
||||
int64_t &src_offset,
|
||||
uint64_t linear_idx) const {
|
||||
|
||||
// Decompose linear coordinate
|
||||
coord = CoordinateDecomposition<kReducedRank - 1>(linear_idx, params.divmod);
|
||||
|
||||
// Compute offset into tensors
|
||||
dst_offset = 0;
|
||||
src_offset = 0;
|
||||
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 0; i < kReducedRank - 1; ++i) {
|
||||
dst_offset += params.dst_stride[i] * coord[i];
|
||||
src_offset += params.src_stride[i] * coord[i];
|
||||
}
|
||||
}
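
//
// Worked example of the decompositions above (illustrative, assuming an NHWC problem with
// N=8, H=14, W=14, C=32 and kReducedRank = 2): params.divmod then holds {H, W} = {14, 14}.
// For an inner linear index of 30, compute_inner_coord_and_offset_ yields
// coord = (30 / 14, 30 % 14) = (2, 2), i.e. (h, w) = (2, 2), and
// src_offset = params.src_stride[1] * 2 + params.src_stride[2] * 2 bytes into the source.
// The outer decomposition is analogous but walks only the first kReducedRank - 1 ranks.
//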
|
||||
|
||||
/// Reduces over the reduction indices
|
||||
CUTLASS_DEVICE
|
||||
ComputeFragment reduce_indices_(
|
||||
Params const ¶ms,
|
||||
ElementCompute *threadblock_workspace,
|
||||
char const *src_byte_ptr) {
|
||||
|
||||
NumericArrayConverter<ElementCompute, ElementSource, VectorLength> convert_source;
|
||||
ReductionOp reduction_op(params.reduction_op);
|
||||
|
||||
// Accumulated output
|
||||
ComputeFragment identity_frag;
|
||||
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 0; i < identity_frag.size(); ++i) {
|
||||
identity_frag[i] = params.reduction_identity;
|
||||
}
|
||||
|
||||
if (!params.inner_count) {
|
||||
return identity_frag;
|
||||
}
|
||||
|
||||
ComputeFragment accumulator = identity_frag;
|
||||
|
||||
// Compute the coordinate of the first access
|
||||
int64_t src_byte_offset = 0;
|
||||
Coord<kInnerRank> coord;
|
||||
|
||||
uint64_t linear_idx = threadIdx.z + blockIdx.z * blockDim.z;
|
||||
compute_inner_coord_and_offset_(params, coord, src_byte_offset, linear_idx);
|
||||
|
||||
// Load the first vector
|
||||
SourceFragment source_fragment[kBatchSize];
|
||||
|
||||
bool not_done = true;
|
||||
|
||||
// Iterate over vectors in a linearized reduction index space
|
||||
while (not_done) {
|
||||
|
||||
bool guards[kBatchSize];
|
||||
|
||||
// Issue a batch of loads
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int b = 0; b < kBatchSize; ++b) {
|
||||
|
||||
if (linear_idx < params.inner_count) {
|
||||
source_fragment[b] = *reinterpret_cast<SourceFragment const *>(src_byte_ptr + src_byte_offset);
|
||||
guards[b] = true;
|
||||
}
|
||||
else {
|
||||
guards[b] = false;
|
||||
not_done = false;
|
||||
}
|
||||
|
||||
linear_idx += blockDim.z * gridDim.z;
|
||||
compute_inner_coord_and_offset_(params, coord, src_byte_offset, linear_idx);
|
||||
}
|
||||
|
||||
// Perform a batch of reduction operations
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int b = 0; b < kBatchSize; ++b) {
|
||||
if (guards[b]) {
|
||||
|
||||
auto cvt = convert_source(source_fragment[b]);
|
||||
|
||||
accumulator = cutlass::reduction::thread::detail::ApplyArrayOperator(
|
||||
reduction_op,
|
||||
accumulator,
|
||||
cvt);
|
||||
}
|
||||
}
|
||||
} // while (not_done)
|
||||
|
||||
// Optional reduction within a CTA
|
||||
if (blockDim.z > 1) {
|
||||
|
||||
// Linearized thread ID
|
||||
int thread_idx = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
|
||||
|
||||
// all threads store to workspace
|
||||
ComputeFragment *frag_ptr = reinterpret_cast<ComputeFragment *>(threadblock_workspace);
|
||||
|
||||
frag_ptr[thread_idx] = accumulator;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.z == 0) {
|
||||
// Load all additional block indices
|
||||
for (int z = 1; z < blockDim.z; ++z) {
|
||||
ComputeFragment frag = frag_ptr[thread_idx + z * blockDim.x * blockDim.y];
|
||||
|
||||
accumulator = cutlass::reduction::thread::detail::ApplyArrayOperator(
|
||||
reduction_op,
|
||||
accumulator,
|
||||
frag);
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
return accumulator;
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
/// Perform a reduction
|
||||
CUTLASS_DEVICE
|
||||
void operator()(Params const ¶ms, SharedStorage &shared_storage) {
|
||||
|
||||
int coord_c = (blockIdx.x * blockDim.x + threadIdx.x) * kVectorLength;
|
||||
|
||||
char const * src_byte_ptr = reinterpret_cast<char const *>(params.source + coord_c);
|
||||
char * dst_byte_ptr = nullptr;
|
||||
|
||||
// If performing a reduction across CTAs, redirect output to device workspace
|
||||
if (gridDim.z == 1) {
|
||||
dst_byte_ptr = reinterpret_cast<char *>(params.destination + coord_c);
|
||||
}
|
||||
else {
|
||||
dst_byte_ptr = reinterpret_cast<char *>(params.device_workspace + coord_c);
|
||||
}
|
||||
|
||||
// If the C index is out of bounds, exit
|
||||
if (coord_c >= params.extent[kRank - 1]) {
|
||||
return;
|
||||
}
|
||||
|
||||
int64_t idx_linear = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
// Use modulo division to compute location
|
||||
Coord<kReducedRank - 1> outer_coord;
|
||||
int64_t dst_byte_offset;
|
||||
int64_t src_byte_offset;
|
||||
|
||||
compute_outer_coord_and_offset_(
|
||||
params,
|
||||
outer_coord,
|
||||
dst_byte_offset,
|
||||
src_byte_offset,
|
||||
idx_linear);
|
||||
|
||||
if (gridDim.z == 1) {
|
||||
|
||||
/// Complete the reduction with no workspace
|
||||
while (idx_linear < params.outer_count) {
|
||||
|
||||
ComputeFragment result;
|
||||
|
||||
result = reduce_indices_(
|
||||
params,
|
||||
shared_storage.workspace.data(),
|
||||
src_byte_ptr + src_byte_offset);
|
||||
|
||||
// Store the result after possible final reduction within the CTA
|
||||
if (threadIdx.z == 0) {
|
||||
|
||||
// Convert to output type and store
|
||||
NumericArrayConverter<ElementOutput, ElementCompute, VectorLength> convert_output;
|
||||
auto cvt = convert_output(result);
|
||||
|
||||
*reinterpret_cast<OutputFragment *>(dst_byte_ptr + dst_byte_offset) =
|
||||
reinterpret_cast<OutputFragment const &>(cvt);
|
||||
}
|
||||
|
||||
// Update indices and pointers
|
||||
idx_linear += gridDim.y * blockDim.y;
|
||||
|
||||
compute_outer_coord_and_offset_(
|
||||
params,
|
||||
outer_coord,
|
||||
dst_byte_offset,
|
||||
src_byte_offset,
|
||||
idx_linear);
|
||||
|
||||
} // while
|
||||
}
|
||||
else {
|
||||
|
||||
/// Complete the reduction with a device workspace
|
||||
while (idx_linear < params.outer_count) {
|
||||
|
||||
ComputeFragment result;
|
||||
|
||||
result = reduce_indices_(
|
||||
params,
|
||||
shared_storage.workspace.data(),
|
||||
src_byte_ptr + src_byte_offset);
|
||||
|
||||
// Store the result after possible final reduction within the CTA
|
||||
if (threadIdx.z == 0) {
|
||||
|
||||
int64_t byte_offset =
|
||||
blockIdx.z * params.workspace_stride + idx_linear * params.workspace_outer_stride;
|
||||
|
||||
// No conversion - store in compute type
|
||||
*reinterpret_cast<ComputeFragment *>(dst_byte_ptr + byte_offset) =
|
||||
reinterpret_cast<ComputeFragment const &>(result);
|
||||
}
|
||||
|
||||
// Update indices and pointers
|
||||
idx_linear += gridDim.y * blockDim.y;
|
||||
|
||||
compute_outer_coord_and_offset_(
|
||||
params,
|
||||
outer_coord,
|
||||
dst_byte_offset,
|
||||
src_byte_offset,
|
||||
idx_linear);
|
||||
|
||||
} // while (outer index)
|
||||
} // if ()
|
||||
}
|
||||
};
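
/////////////////////////////////////////////////////////////////////////////////////////////////

// Illustrative launch sketch (disabled). The device-level wrapper in
// cutlass/reduction/device/tensor_reduce_affine_strided.h is the supported entry point; the
// shapes and the variables extent_c, outer_count, workspace_count, and params below are
// assumptions chosen to show how the thread hierarchy maps onto the problem
// (x: vectors of the contiguous rank, y: outer indices, z: inner reduction indices).
#if 0
  using ReductionKernel = TensorReductionAffineStrided<4, 2, float, float, plus<float>>;

  dim3 block(32, 4, 2);                  // 256 threads split across x, y, z
  dim3 grid(
    int((extent_c / ReductionKernel::kVectorLength + block.x - 1) / block.x),
    int((outer_count + block.y - 1) / block.y),
    workspace_count);                    // gridDim.z > 1 requires the 'Final' kernel below

  cutlass::Kernel<ReductionKernel><<<
    grid, block, sizeof(typename ReductionKernel::SharedStorage)
  >>>(params);
#endif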
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Kernel to perform final reduction
|
||||
template <
|
||||
int Rank, ///< Rank of source tensor (e.g. NDHWC => 5)
|
||||
int ReducedRank, ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
|
||||
typename ElementOutput, ///< Data type of output tensor
|
||||
typename ElementSource, ///< Data type of source tensor
|
||||
typename ReductionOp, ///< Reduction operator
|
||||
int VectorLength = 1, ///< Vector length for memory
|
||||
typename ElementCompute = ElementOutput, ///< Internal compute type - input type of reduction operation
|
||||
int Threads = 256, ///< Number of participating threads
|
||||
int BatchSize = 4 ///< Number of elements to load per batch
|
||||
>
|
||||
class TensorReductionAffineStridedFinal {
|
||||
public:
|
||||
|
||||
static int const kRank = Rank;
|
||||
static int const kReducedRank = ReducedRank;
|
||||
static int const kVectorLength = VectorLength;
|
||||
static int const kInnerRank = kRank - kReducedRank;
|
||||
static int const kThreads = Threads;
|
||||
static int const kBatchSize = BatchSize;
|
||||
using ComputeFragment = Array<ElementCompute, VectorLength>;
|
||||
using SourceFragment = AlignedArray<ElementSource, VectorLength>;
|
||||
using OutputFragment = AlignedArray<ElementOutput, VectorLength>;
|
||||
|
||||
/// Shared memory
|
||||
struct SharedStorage { };
|
||||
|
||||
/// Parameters structure
|
||||
using Params = TensorReductionAffineStridedParams<
|
||||
Rank,
|
||||
ReducedRank,
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
ReductionOp,
|
||||
VectorLength,
|
||||
ElementCompute,
|
||||
Threads,
|
||||
BatchSize
|
||||
>;
|
||||
|
||||
private:
|
||||
|
||||
/// Computes the coordinate and offset of a given linear index
|
||||
CUTLASS_DEVICE
|
||||
void compute_outer_coord_and_offset_(
|
||||
Params const ¶ms,
|
||||
Coord<kReducedRank - 1> & coord,
|
||||
int64_t &dst_offset,
|
||||
uint64_t linear_idx) const {
|
||||
|
||||
// Decompose linear index
|
||||
coord = CoordinateDecomposition<kReducedRank - 1>(linear_idx, params.divmod);
|
||||
|
||||
// Compute tensor offset
|
||||
dst_offset = 0;
|
||||
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 0; i < kReducedRank - 1; ++i) {
|
||||
dst_offset += params.dst_stride[i] * coord[i];
|
||||
}
|
||||
}
|
||||
|
||||
/// Reduces over the reduction indices
|
||||
CUTLASS_DEVICE
|
||||
ComputeFragment reduce_indices_(
|
||||
Params const ¶ms,
|
||||
char *src_byte_ptr) {
|
||||
|
||||
ReductionOp reduction_op(params.reduction_op);
|
||||
|
||||
// Accumulated output
|
||||
ComputeFragment identity_frag;
|
||||
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 0; i < identity_frag.size(); ++i) {
|
||||
identity_frag[i] = params.reduction_identity;
|
||||
}
|
||||
|
||||
ComputeFragment accumulator = identity_frag;
|
||||
ComputeFragment workspace_fragments[kBatchSize];
|
||||
|
||||
// Partially unrolled loop
|
||||
for (int idx = 0; idx < params.workspace_count; idx += kBatchSize) {
|
||||
|
||||
// Issue a batch of loads
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int b = 0; b < kBatchSize; ++b) {
|
||||
if (idx + b < params.workspace_count) {
|
||||
workspace_fragments[b] =
|
||||
*reinterpret_cast<ComputeFragment *>(src_byte_ptr);
|
||||
}
|
||||
else {
|
||||
workspace_fragments[b] = identity_frag;
|
||||
}
|
||||
src_byte_ptr += params.workspace_stride;
|
||||
}
|
||||
|
||||
// Perform a reduction
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int b = 0; b < kBatchSize; ++b) {
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 0; i < kVectorLength; ++i) {
|
||||
accumulator[i] = reduction_op(accumulator[i], workspace_fragments[b][i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return accumulator;
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
//
|
||||
// Methods
|
||||
//
|
||||
|
||||
/// Perform a reduction
|
||||
CUTLASS_DEVICE
|
||||
void operator()(Params const ¶ms, SharedStorage &shared_storage) {
|
||||
|
||||
int coord_c = (blockIdx.x * blockDim.x + threadIdx.x) * kVectorLength;
|
||||
|
||||
char * src_byte_ptr = reinterpret_cast<char *>(params.device_workspace + coord_c);
|
||||
char * dst_byte_ptr = reinterpret_cast<char *>(params.destination + coord_c);
|
||||
|
||||
// If the C index is out of bounds, exit
|
||||
if (coord_c >= params.extent[kRank - 1]) {
|
||||
return;
|
||||
}
|
||||
|
||||
int64_t idx_linear = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
// Use modulo division to compute location
|
||||
Coord<kReducedRank - 1> outer_coord;
|
||||
int64_t dst_byte_offset;
|
||||
|
||||
compute_outer_coord_and_offset_(
|
||||
params,
|
||||
outer_coord,
|
||||
dst_byte_offset,
|
||||
idx_linear);
|
||||
|
||||
/// Complete the reduction
|
||||
while (idx_linear < params.outer_count) {
|
||||
|
||||
int64_t src_byte_offset = idx_linear * params.workspace_outer_stride;
|
||||
|
||||
ComputeFragment result = reduce_indices_(
|
||||
params,
|
||||
src_byte_ptr + src_byte_offset);
|
||||
|
||||
// Convert to output type and store
|
||||
NumericArrayConverter<ElementOutput, ElementCompute, VectorLength> convert_output;
|
||||
auto cvt = convert_output(result);
|
||||
|
||||
*reinterpret_cast<OutputFragment *>(dst_byte_ptr + dst_byte_offset) =
|
||||
reinterpret_cast<OutputFragment const &>(cvt);
|
||||
|
||||
// Update indices and pointers
|
||||
idx_linear += gridDim.y * blockDim.y;
|
||||
|
||||
compute_outer_coord_and_offset_(
|
||||
params,
|
||||
outer_coord,
|
||||
dst_byte_offset,
|
||||
idx_linear);
|
||||
}
|
||||
}
|
||||
};
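
/////////////////////////////////////////////////////////////////////////////////////////////////

// Illustrative sketch (disabled). When the first pass runs with gridDim.z > 1, each output
// element is left with workspace_count partial results in device memory, and this 'Final'
// kernel collapses them into the destination tensor. The shapes and variables below are
// assumptions; the device-level wrapper derives them from the problem size.
#if 0
  using FinalKernel = TensorReductionAffineStridedFinal<4, 2, float, float, plus<float>>;

  dim3 block_final(32, 8, 1);
  dim3 grid_final(
    int((extent_c / FinalKernel::kVectorLength + block_final.x - 1) / block_final.x),
    int((outer_count + block_final.y - 1) / block_final.y),
    1);

  cutlass::Kernel<FinalKernel><<<
    grid_final, block_final, sizeof(typename FinalKernel::SharedStorage)
  >>>(params);
#endif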

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace kernel
} // namespace reduction
} // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
28
test/unit/reduction/device/CMakeLists.txt
Normal file
@ -0,0 +1,28 @@
|
||||
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
# provided that the following conditions are met:
|
||||
# * Redistributions of source code must retain the above copyright notice, this list of
|
||||
# conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
# conditions and the following disclaimer in the documentation and/or other materials
|
||||
# provided with the distribution.
|
||||
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
# to endorse or promote products derived from this software without specific prior written
|
||||
# permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
cutlass_test_unit_add_executable(
|
||||
cutlass_test_unit_reduction_device
|
||||
tensor_reduce_strided.cu
|
||||
tensor_reduce_contiguous.cu
|
||||
)
|
||||
|
||||
470
test/unit/reduction/device/tensor_reduce_contiguous.cu
Normal file
@ -0,0 +1,470 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
* provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of
|
||||
* conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
* conditions and the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution.
|
||||
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
* to endorse or promote products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for TensorReduce family of device-wide operators
|
||||
*/
|
||||
|
||||
#include <iostream>
#include <limits>
|
||||
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cutlass/complex.h"
|
||||
#include "cutlass/reduction/thread/reduction_operators.h"
|
||||
#include "cutlass/reduction/device/tensor_reduce.h"
|
||||
|
||||
#include "cutlass/functional.h"
|
||||
#include "cutlass/layout/tensor.h"
|
||||
|
||||
#include "cutlass/util/host_tensor.h"
|
||||
#include "cutlass/util/reference/host/gemm.h"
|
||||
#include "cutlass/util/reference/host/tensor_compare.h"
|
||||
#include "cutlass/util/reference/host/tensor_copy.h"
|
||||
#include "cutlass/util/reference/host/tensor_fill.h"
|
||||
#include "cutlass/util/reference/device/tensor_fill.h"
|
||||
#include "cutlass/util/reference/host/tensor_norm.h"
|
||||
#include "cutlass/util/tensor_view_io.h"
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// This reduces the C dimension, transforming an NHWC tensor into NHWC with C=1.
|
||||
template <typename TensorReduction, typename ElementCompute = typename TensorReduction::ElementCompute>
|
||||
bool TestAllReduction_NHWC_reduce_c(ElementCompute reduction_identity = ElementCompute()) {
|
||||
|
||||
using Layout = typename TensorReduction::Layout;
|
||||
using ElementOutput = typename TensorReduction::ElementOutput;
|
||||
using ElementSource = typename TensorReduction::ElementSource;
|
||||
|
||||
int const kV = TensorReduction::kVectorLength;
|
||||
|
||||
int const N_indices[] = {3, 13};
|
||||
int const H_indices[] = {5, 17};
|
||||
int const W_indices[] = {7, 19};
|
||||
int const C_indices[] = {2049, 2048, 2047, 384, 64, 48, 32, 24, 16, 12, 8, 6, 4, 3, 2, 1};
|
||||
|
||||
for (int N : N_indices) {
|
||||
for (int H : H_indices) {
|
||||
for (int W : W_indices) {
|
||||
for (int Cx : C_indices) {
|
||||
|
||||
int C = Cx * kV;
|
||||
|
||||
cutlass::HostTensor<ElementSource, Layout> src_tensor({N, H, W, C});
|
||||
cutlass::HostTensor<ElementOutput, Layout> dst_tensor({N, H, W, 1});
|
||||
|
||||
cutlass::reference::host::TensorFillRandomUniform(
|
||||
src_tensor.host_view(), 17, 10, -10, 0);
|
||||
|
||||
dst_tensor.sync_device();
|
||||
src_tensor.sync_device();
|
||||
|
||||
// Execute a tensor reduction over rank 3 (the 'C' dimension is reduced; NHWC => NHW)
|
||||
TensorReduction reduction(src_tensor.extent(), 3);
|
||||
|
||||
cutlass::DeviceAllocation<uint8_t> device_workspace(reduction.workspace_size());
|
||||
|
||||
cutlass::Status status = reduction.reduce(
|
||||
dst_tensor.device_ref(),
|
||||
src_tensor.device_ref(),
|
||||
device_workspace.get(),
|
||||
reduction_identity
|
||||
);
|
||||
|
||||
EXPECT_EQ(status, cutlass::Status::kSuccess);
|
||||
EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess);
|
||||
|
||||
dst_tensor.sync_host();
|
||||
|
||||
typename TensorReduction::ReductionOp reduction_op;
|
||||
|
||||
//
|
||||
// Reference check
|
||||
//
|
||||
for (int n = 0; n < src_tensor.extent().n(); ++n) {
|
||||
for (int h = 0; h < src_tensor.extent().h(); ++h) {
|
||||
for (int w = 0; w < src_tensor.extent().w(); ++w) {
|
||||
|
||||
ElementCompute c_accum = reduction_identity;
|
||||
|
||||
for (int c = 0; c < src_tensor.extent().c(); ++c) {
|
||||
c_accum = reduction_op(c_accum, ElementCompute(src_tensor.at({n, h, w, c})));
|
||||
}
|
||||
|
||||
ElementCompute got = ElementCompute(dst_tensor.at({n, h, w, 0}));
|
||||
|
||||
bool equal = (c_accum == got);
|
||||
|
||||
EXPECT_TRUE(equal);
|
||||
if (!equal) {
|
||||
|
||||
std::cerr
|
||||
<< "Error at location (" << n << ", " << h << ", " << w << ", 0)" << std::endl;
|
||||
|
||||
std::cerr
|
||||
<< " expected: " << c_accum << std::endl
|
||||
<< " got: " << got << std::endl;
|
||||
|
||||
std::cerr
|
||||
<< "Problem: " << src_tensor.extent() << " -> "
|
||||
<< dst_tensor.extent() << std::endl;
|
||||
|
||||
std::cerr
|
||||
<< " Grid: " << reduction.reduction_strided.grid_shape
|
||||
<< "\n Block: " << reduction.reduction_strided.threadblock_shape << std::endl
|
||||
<< " FInal: " << reduction.reduction_strided.grid_final
|
||||
<< "\n Block: " << reduction.reduction_strided.threadblock_final << "\n";
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
} //w
|
||||
} // h
|
||||
} // n
|
||||
|
||||
//
|
||||
// Next problem
|
||||
//
|
||||
|
||||
} // C
|
||||
} // W
|
||||
} // H
|
||||
} // N
|
||||
|
||||
return true;
|
||||
}
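
/////////////////////////////////////////////////////////////////////////////////////////////////

// Minimal standalone sketch (disabled) distilling the reduction call exercised by the test
// harness above; the tensor shape and element types are assumptions for illustration.
#if 0
  using TensorReduction = cutlass::reduction::device::TensorReduction<
    float, float, cutlass::layout::TensorNHWC, cutlass::plus<float>, 1, float>;

  cutlass::HostTensor<float, cutlass::layout::TensorNHWC> src({4, 7, 7, 64});
  cutlass::HostTensor<float, cutlass::layout::TensorNHWC> dst({4, 7, 7, 1});

  src.sync_device();
  dst.sync_device();

  TensorReduction reduction(src.extent(), 3);   // reduce rank 3: the 'C' dimension

  cutlass::DeviceAllocation<uint8_t> workspace(reduction.workspace_size());

  cutlass::Status status = reduction.reduce(
    dst.device_ref(), src.device_ref(), workspace.get(), /* reduction_identity = */ 0.0f);
#endif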
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHW
|
||||
TEST(Reduction_TensorReduce, nhwc_reduce_c_f32x1) {
|
||||
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = float;
|
||||
using ElementCompute = float;
|
||||
int const kV = 1;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::plus<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_c<TensorReduction>());
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHW
|
||||
TEST(Reduction_TensorReduce, nhwc_reduce_c_f32x1_f16x1) {
|
||||
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
int const kV = 1;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::plus<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_c<TensorReduction>());
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHW
|
||||
TEST(Reduction_TensorReduce, nhwc_reduce_c_f32x2) {
|
||||
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = float;
|
||||
using ElementCompute = float;
|
||||
int const kV = 2;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::plus<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_c<TensorReduction>());
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHW
|
||||
TEST(Reduction_TensorReduce, nhwc_reduce_c_f32x2_f16x2) {
|
||||
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
int const kV = 2;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::plus<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_c<TensorReduction>());
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHW
|
||||
TEST(Reduction_TensorReduce, nhwc_reduce_c_f32x4) {
|
||||
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = float;
|
||||
using ElementCompute = float;
|
||||
int const kV = 4;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::plus<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_c<TensorReduction>());
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHW
|
||||
TEST(Reduction_TensorReduce, nhwc_reduce_c_f32x4_f16x4) {
|
||||
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
int const kV = 4;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::plus<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_c<TensorReduction>());
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHW
|
||||
TEST(Reduction_TensorReduce, nhwc_maximum_c_f32x4) {
|
||||
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = float;
|
||||
using ElementCompute = float;
|
||||
int const kV = 4;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::maximum<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_c<TensorReduction>( -std::numeric_limits<float>::max() ));
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHW
|
||||
TEST(Reduction_TensorReduce, nhwc_minimum_c_f32x4) {
|
||||
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = float;
|
||||
using ElementCompute = float;
|
||||
int const kV = 4;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::minimum<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_c<TensorReduction>( std::numeric_limits<float>::max() ));
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHW
|
||||
TEST(Reduction_TensorReduce, nhwc_ANY_c_s32) {
|
||||
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
using ElementOutput = int;
|
||||
using ElementSource = int;
|
||||
using ElementCompute = int;
|
||||
int const kV = 1;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::logical_or<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_c<TensorReduction>( ElementCompute(0) ));
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHW
|
||||
TEST(Reduction_TensorReduce, nhwc_ALL_c_s32) {
|
||||
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
using ElementOutput = int;
|
||||
using ElementSource = int;
|
||||
using ElementCompute = int;
|
||||
int const kV = 1;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::logical_and<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_c<TensorReduction>( ElementCompute(1) ));
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHW
|
||||
TEST(Reduction_TensorReduce, nhwc_ANY_c_f32) {
|
||||
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = float;
|
||||
using ElementCompute = float;
|
||||
int const kV = 1;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::logical_or<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_c<TensorReduction>( ElementCompute(0) ));
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHW
|
||||
TEST(Reduction_TensorReduce, nhwc_ALL_c_f32) {
|
||||
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = float;
|
||||
using ElementCompute = float;
|
||||
int const kV = 1;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::logical_and<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_c<TensorReduction>( ElementCompute(1) ));
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
517
test/unit/reduction/device/tensor_reduce_strided.cu
Normal file
@ -0,0 +1,517 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
* provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of
|
||||
* conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
* conditions and the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution.
|
||||
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
* to endorse or promote products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for TensorReduce family of device-wide operators
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
#include <limits>
|
||||
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cutlass/complex.h"
|
||||
#include "cutlass/reduction/thread/reduction_operators.h"
|
||||
#include "cutlass/reduction/device/tensor_reduce.h"
|
||||
|
||||
#include "cutlass/functional.h"
|
||||
#include "cutlass/layout/tensor.h"
|
||||
|
||||
#include "cutlass/util/host_tensor.h"
|
||||
#include "cutlass/util/reference/host/gemm.h"
|
||||
#include "cutlass/util/reference/host/tensor_compare.h"
|
||||
#include "cutlass/util/reference/host/tensor_copy.h"
|
||||
#include "cutlass/util/reference/host/tensor_fill.h"
|
||||
#include "cutlass/util/reference/device/tensor_fill.h"
|
||||
#include "cutlass/util/reference/host/tensor_norm.h"
|
||||
#include "cutlass/util/tensor_view_io.h"
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// This reduces the W dimension, transforming an NHWC tensor into NHWC with W=1.
|
||||
template <
|
||||
typename TensorReduction,
|
||||
typename ElementCompute = typename TensorReduction::ElementCompute
|
||||
>
|
||||
bool TestAllReduction_NHWC_reduce_w(ElementCompute reduction_identity = ElementCompute()) {
|
||||
|
||||
using Layout = typename TensorReduction::Layout;
|
||||
using ElementOutput = typename TensorReduction::ElementOutput;
|
||||
using ElementSource = typename TensorReduction::ElementSource;
|
||||
|
||||
int const kV = TensorReduction::kVectorLength;
|
||||
|
||||
int const N_indices[] = {1, 2, 5, 10};
|
||||
int const H_indices[] = {1, 3, 9 };
|
||||
int const W_indices[] = {1, 5, 19, 40, 224};
|
||||
int const C_indices[] = {
|
||||
kV,
|
||||
2 * kV,
|
||||
5 * kV,
|
||||
9 * kV,
|
||||
17 * kV,
|
||||
39 * kV,
|
||||
257 * kV,
|
||||
kV * 760
|
||||
};
|
||||
|
||||
using Element = int;
|
||||
|
||||
for (int N : N_indices) {
|
||||
for (int H : H_indices) {
|
||||
for (int W : W_indices) {
|
||||
for (int C : C_indices) {
|
||||
|
||||
cutlass::HostTensor<ElementSource, Layout> src_tensor({N, H, W, C});
|
||||
cutlass::HostTensor<ElementOutput, Layout> dst_tensor({N, H, 1, C});
|
||||
|
||||
cutlass::reference::host::TensorFillRandomUniform(
|
||||
src_tensor.host_view(), 17, 10, -10, 0);
|
||||
|
||||
cutlass::reference::host::BlockFillSequential(
|
||||
dst_tensor.host_data(), dst_tensor.capacity());
|
||||
|
||||
dst_tensor.sync_device();
|
||||
src_tensor.sync_device();
|
||||
|
||||
// Execute a tensor reduction over rank 2 (the 'W' dimension is reduced; NHWC => NHC)
|
||||
TensorReduction reduction(src_tensor.extent(), 2);
|
||||
|
||||
cutlass::DeviceAllocation<uint8_t> device_workspace(reduction.workspace_size());
|
||||
|
||||
cutlass::Status status = reduction.reduce(
|
||||
dst_tensor.device_ref(),
|
||||
src_tensor.device_ref(),
|
||||
device_workspace.get(),
|
||||
reduction_identity
|
||||
);
|
||||
|
||||
EXPECT_EQ(status, cutlass::Status::kSuccess);
|
||||
EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess);
|
||||
// Reference check
|
||||
dst_tensor.sync_host();
|
||||
|
||||
typename TensorReduction::ReductionOp reduction_op;
|
||||
|
||||
for (int n = 0; n < src_tensor.extent().n(); ++n) {
|
||||
for (int h = 0; h < src_tensor.extent().h(); ++h) {
|
||||
for (int c = 0; c < src_tensor.extent().c(); ++c) {
|
||||
|
||||
ElementCompute w_accum = reduction_identity;
|
||||
|
||||
for (int w = 0; w < src_tensor.extent().w(); ++w) {
|
||||
w_accum = reduction_op(w_accum, ElementCompute(src_tensor.at({n, h, w, c})));
|
||||
}
|
||||
|
||||
ElementCompute got = ElementCompute(dst_tensor.at({n, h, 0, c}));
|
||||
|
||||
bool equal = (w_accum == got);
|
||||
|
||||
EXPECT_TRUE(equal);
|
||||
if (!equal) {
|
||||
|
||||
std::cerr
|
||||
<< "Error at location (" << n << ", " << h << ", 0, " << c << ")" << std::endl;
|
||||
|
||||
std::cerr
|
||||
<< " expected: " << w_accum << std::endl
|
||||
<< " got: " << got << std::endl;
|
||||
|
||||
std::cerr
|
||||
<< "Problem: " << src_tensor.extent() << " -> "
|
||||
<< dst_tensor.extent() << std::endl;
|
||||
|
||||
std::cerr
|
||||
<< " Grid: " << reduction.reduction_strided.grid_shape
|
||||
<< "\n Block: " << reduction.reduction_strided.threadblock_shape << std::endl
|
||||
<< " Final: " << reduction.reduction_strided.grid_final
|
||||
<< "\n Block: " << reduction.reduction_strided.threadblock_final << "\n";
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
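
/////////////////////////////////////////////////////////////////////////////////////////////////

// Note on the reduction_identity argument used by several tests below: for 'plus' the default
// ElementCompute() (zero) is the identity, but other functors need an explicit neutral element,
// e.g. -std::numeric_limits<float>::max() for 'maximum', +std::numeric_limits<float>::max() for
// 'minimum', 0xffffffff for 'bit_and', and 1 / 0 for logical AND / OR respectively.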
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHC
|
||||
TEST(Reduction_TensorReduce, nhwc_reduce_w_f32x8_f16x8) {
|
||||
|
||||
int const kV = 8;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::plus<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>());
|
||||
}
|
||||
|
||||
/// Test tensor reduction from NHWC to NHC
|
||||
TEST(Reduction_TensorReduce, nhwc_reduce_w_f32x2_f16x2) {
|
||||
|
||||
int const kV = 2;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::plus<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>());
|
||||
}
|
||||
|
||||
/// Test tensor reduction from NHWC to NHC
|
||||
TEST(Reduction_TensorReduce, nhwc_reduce_w_f32x1_f16x1) {
|
||||
|
||||
int const kV = 1;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::plus<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>());
|
||||
}
|
||||
|
||||
/// Test tensor reduction from NHWC to NHC
|
||||
TEST(Reduction_TensorReduce, nhwc_reduce_w_s32x4) {
|
||||
|
||||
int const kV = 4;
|
||||
using Element = int;
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::plus<Element>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
Element,
|
||||
Element,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
Element
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>());
|
||||
}
|
||||
|
||||
/// Test tensor reduction from NHWC to NHC
|
||||
TEST(Reduction_TensorReduce, nhwc_reduce_w_cf32) {
|
||||
|
||||
int const kV = 1;
|
||||
using ElementOutput = cutlass::complex<float>;
|
||||
using ElementSource = cutlass::complex<float>;
|
||||
using ElementCompute = cutlass::complex<float>;
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::plus<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>());
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHC
|
||||
TEST(Reduction_TensorReduce, nhwc_maximum_w_cf32) {
|
||||
|
||||
int const kV = 1;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = float;
|
||||
using ElementCompute = float;
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::maximum<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>( -std::numeric_limits<float>::max() ));
|
||||
}
|
||||
|
||||
/// Test tensor reduction from NHWC to NHC
|
||||
TEST(Reduction_TensorReduce, nhwc_minimum_w_cf32) {
|
||||
|
||||
int const kV = 1;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = float;
|
||||
using ElementCompute = float;
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::minimum<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>(std::numeric_limits<float>::max()));
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHC
|
||||
TEST(Reduction_TensorReduce, nhwc_XOR_w_u32) {
|
||||
|
||||
int const kV = 1;
|
||||
using ElementOutput = int;
|
||||
using ElementSource = int;
|
||||
using ElementCompute = int;
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::bit_xor<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>());
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHC
|
||||
TEST(Reduction_TensorReduce, nhwc_AND_w_s32) {
|
||||
|
||||
int const kV = 1;
|
||||
using ElementOutput = unsigned;
|
||||
using ElementSource = unsigned;
|
||||
using ElementCompute = unsigned;
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::bit_and<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>(0xffffffff));
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHC
|
||||
TEST(Reduction_TensorReduce, nhwc_OR_w_u32) {
|
||||
|
||||
int const kV = 1;
|
||||
using ElementOutput = int;
|
||||
using ElementSource = int;
|
||||
using ElementCompute = int;
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::bit_or<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>());
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHC
|
||||
TEST(Reduction_TensorReduce, nhwc_ANY_w_s32) {
|
||||
|
||||
int const kV = 1;
|
||||
using ElementOutput = int;
|
||||
using ElementSource = int;
|
||||
using ElementCompute = int;
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::logical_or<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>(ElementCompute(0)));
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHC
|
||||
TEST(Reduction_TensorReduce, nhwc_ALL_w_s32) {
|
||||
|
||||
int const kV = 1;
|
||||
using ElementOutput = int;
|
||||
using ElementSource = int;
|
||||
using ElementCompute = int;
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::logical_and<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>(ElementCompute(1)));
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHC
|
||||
TEST(Reduction_TensorReduce, nhwc_ANY_w_f32) {
|
||||
|
||||
int const kV = 1;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = float;
|
||||
using ElementCompute = float;
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::logical_or<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>(ElementCompute(0)));
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHC
|
||||
TEST(Reduction_TensorReduce, nhwc_ALL_w_f32) {
|
||||
|
||||
int const kV = 1;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = float;
|
||||
using ElementCompute = float;
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::logical_and<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>(ElementCompute(1)));
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////