Enabled tensor reduction kernels.
This commit is contained in:
parent abdf16a4d9
commit 746b7b3247

258  include/cutlass/reduction/device/tensor_reduce.h  Normal file

@@ -0,0 +1,258 @@
/***************************************************************************************************
|
||||
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
* provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of
|
||||
* conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
* conditions and the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution.
|
||||
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
* to endorse or promote products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Kernel performing a reduction over one or more ranks of an affine tensor
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cutlass/array.h"
|
||||
#include "cutlass/fast_math.h"
|
||||
#include "cutlass/numeric_types.h"
|
||||
#include "cutlass/numeric_conversion.h"
|
||||
#include "cutlass/device_kernel.h"
|
||||
|
||||
#include "cutlass/reduction/device/tensor_reduce_affine_strided.h"
|
||||
#include "cutlass/reduction/device/tensor_reduce_affine_contiguous.h"
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace cutlass {
|
||||
namespace reduction {
|
||||
namespace device {
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Tensor reduction operator on specific CUTLASS layouts over exactly one index
|
||||
template <
|
||||
typename ElementOutput_,
|
||||
typename ElementSource_,
|
||||
typename Layout_,
|
||||
typename ReductionOp_,
|
||||
int VectorLength_ = 1,
|
||||
typename ElementCompute_ = ElementOutput_
|
||||
>
|
||||
struct TensorReduction {
|
||||
|
||||
using ElementOutput = ElementOutput_;
|
||||
using ElementSource = ElementSource_;
|
||||
using Layout = Layout_;
|
||||
using ReductionOp = ReductionOp_;
|
||||
static int const kVectorLength = VectorLength_;
|
||||
using ElementCompute = ElementCompute_;
|
||||
|
||||
using TensorCoord = typename Layout::TensorCoord;
|
||||
|
||||
/// Reduction operator
|
||||
using ReductionDeviceStridedOperator = TensorReductionAffineStrided<
|
||||
4, 3, ElementOutput, ElementSource, ReductionOp, kVectorLength, ElementCompute
|
||||
>;
|
||||
|
||||
using ReductionDeviceContiguousOperator = TensorReductionAffineContiguous<
|
||||
4, 3, ElementOutput, ElementSource, ReductionOp, kVectorLength, ElementCompute
|
||||
>;
|
||||
|
||||
//
|
||||
// Data members
|
||||
//
|
||||
|
||||
ReductionDeviceStridedOperator reduction_strided;
|
||||
ReductionDeviceContiguousOperator reduction_contiguous;
|
||||
int reduction_index;
|
||||
|
||||
//
|
||||
// Methods
|
||||
//
|
||||
|
||||
///
|
||||
TensorReduction(
|
||||
TensorCoord extent,
|
||||
int reduction_index_
|
||||
):
|
||||
reduction_index(reduction_index_) {
|
||||
|
||||
Coord<4> extent_affine;
|
||||
|
||||
switch (reduction_index) {
|
||||
case 0:
|
||||
extent_affine[0] = extent[1];
|
||||
extent_affine[1] = extent[2];
|
||||
extent_affine[2] = extent[0];
|
||||
extent_affine[3] = extent[3];
|
||||
break;
|
||||
case 1:
|
||||
extent_affine[0] = extent[0];
|
||||
extent_affine[1] = extent[2];
|
||||
extent_affine[2] = extent[1];
|
||||
extent_affine[3] = extent[3];
|
||||
break;
|
||||
case 2:
|
||||
extent_affine[0] = extent[0];
|
||||
extent_affine[1] = extent[1];
|
||||
extent_affine[2] = extent[2];
|
||||
extent_affine[3] = extent[3];
|
||||
break;
|
||||
case 3:
|
||||
extent_affine[0] = extent[0];
|
||||
extent_affine[1] = extent[1];
|
||||
extent_affine[2] = extent[2];
|
||||
extent_affine[3] = extent[3];
|
||||
break;
|
||||
default: break;
|
||||
}
|
||||
|
||||
if (reduction_index == 3) {
|
||||
reduction_contiguous = ReductionDeviceContiguousOperator(extent_affine);
|
||||
}
|
||||
else {
|
||||
reduction_strided = ReductionDeviceStridedOperator(extent_affine);
|
||||
}
|
||||
}
|
||||
|
||||
/// Simple check to verify the object is initialized correctly
|
||||
bool good() const {
|
||||
if (reduction_index == 3) {
|
||||
return reduction_contiguous.good();
|
||||
}
|
||||
return reduction_strided.good();
|
||||
}
|
||||
|
||||
/// Size of one workspace
|
||||
int64_t workspace_stride() const {
|
||||
if (reduction_index == 3) {
|
||||
return reduction_contiguous.workspace_stride();
|
||||
}
|
||||
else {
|
||||
return reduction_strided.workspace_stride();
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the size (in bytes) of a temporary workspace needed for reduction across CTAs
|
||||
int64_t workspace_size() const {
|
||||
if (reduction_index == 3) {
|
||||
return reduction_contiguous.workspace_size();
|
||||
}
|
||||
else {
|
||||
return reduction_strided.workspace_size();
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper to use overloaded function call operator
|
||||
Status reduce(
|
||||
TensorRef<ElementOutput, Layout> dst_ref,
|
||||
TensorRef<ElementSource, Layout> src_ref,
|
||||
void *device_workspace_ptr = nullptr,
|
||||
ElementCompute reduction_identity = ElementCompute(),
|
||||
ReductionOp reduction_op = ReductionOp(),
|
||||
cudaStream_t stream = nullptr) {
|
||||
|
||||
int64_t src_stride[3];
|
||||
int64_t dst_stride[3];   // three destination strides are needed for the contiguous (reduction_index == 3) case
|
||||
|
||||
switch (reduction_index) {
|
||||
case 0:
|
||||
src_stride[0] = src_ref.stride()[1];
|
||||
src_stride[1] = src_ref.stride()[0];
|
||||
src_stride[2] = src_ref.stride()[2];
|
||||
dst_stride[0] = dst_ref.stride()[1];
|
||||
dst_stride[1] = dst_ref.stride()[0];
|
||||
break;
|
||||
case 1:
|
||||
src_stride[0] = src_ref.stride()[2];
|
||||
src_stride[1] = src_ref.stride()[0];
|
||||
src_stride[2] = src_ref.stride()[1];
|
||||
dst_stride[0] = dst_ref.stride()[2];
|
||||
dst_stride[1] = dst_ref.stride()[0];
|
||||
break;
|
||||
case 2:
|
||||
src_stride[0] = src_ref.stride()[2];
|
||||
src_stride[1] = src_ref.stride()[1];
|
||||
src_stride[2] = src_ref.stride()[0];
|
||||
dst_stride[0] = dst_ref.stride()[2];
|
||||
dst_stride[1] = dst_ref.stride()[1];
|
||||
break;
|
||||
case 3:
|
||||
src_stride[0] = src_ref.stride()[2];
|
||||
src_stride[1] = src_ref.stride()[1];
|
||||
src_stride[2] = src_ref.stride()[0];
|
||||
|
||||
dst_stride[0] = dst_ref.stride()[2];
|
||||
dst_stride[1] = dst_ref.stride()[1];
|
||||
dst_stride[2] = dst_ref.stride()[0];
break;
|
||||
|
||||
default: break;
|
||||
}
|
||||
|
||||
if (reduction_index == 3) {
|
||||
return reduction_contiguous(
|
||||
dst_ref.data(),
|
||||
dst_stride,
|
||||
src_ref.data(),
|
||||
src_stride,
|
||||
device_workspace_ptr,
|
||||
reduction_identity,
|
||||
reduction_op,
|
||||
stream);
|
||||
}
|
||||
else {
|
||||
return reduction_strided(
|
||||
dst_ref.data(),
|
||||
dst_stride,
|
||||
src_ref.data(),
|
||||
src_stride,
|
||||
device_workspace_ptr,
|
||||
reduction_identity,
|
||||
reduction_op,
|
||||
stream);
|
||||
}
|
||||
}
|
||||
|
||||
Status operator()(
|
||||
TensorRef<ElementOutput, Layout> dst_ref,
|
||||
TensorRef<ElementSource, Layout> src_ref,
|
||||
void *device_workspace_ptr = nullptr,
|
||||
ElementCompute reduction_identity = ElementCompute(),
|
||||
ReductionOp reduction_op = ReductionOp(),
|
||||
cudaStream_t stream = nullptr) {
|
||||
|
||||
return reduce(
|
||||
dst_ref,
|
||||
src_ref,
|
||||
device_workspace_ptr,
|
||||
reduction_identity,
|
||||
reduction_op,
|
||||
stream);
|
||||
}
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
} // namespace device
|
||||
} // namespace reduction
|
||||
} // namespace cutlass
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
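// Usage sketch (illustrative; not part of this commit): sum-reduce the W rank of an
// NHWC tensor. HostTensor, DeviceAllocation, cutlass::plus, and the NHWC layout are
// existing CUTLASS utilities assumed to be available; the function name, extents, and
// data below are arbitrary.

#include "cutlass/functional.h"
#include "cutlass/tensor_coord.h"
#include "cutlass/layout/tensor.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/device_memory.h"
#include "cutlass/reduction/device/tensor_reduce.h"

cutlass::Status reduce_w_example() {

  using TensorReduction = cutlass::reduction::device::TensorReduction<
    float,                           // ElementOutput
    float,                           // ElementSource
    cutlass::layout::TensorNHWC,     // Layout
    cutlass::plus<float>             // ReductionOp
  >;

  cutlass::Tensor4DCoord src_extent(2, 8, 8, 32);
  cutlass::Tensor4DCoord dst_extent(2, 8, 1, 32);    // W rank collapses to 1

  cutlass::HostTensor<float, cutlass::layout::TensorNHWC> src(src_extent);
  cutlass::HostTensor<float, cutlass::layout::TensorNHWC> dst(dst_extent);

  // ... fill src on the host and call src.sync_device() before reducing ...

  // reduction_index == 2 selects the W rank of an NHWC tensor
  TensorReduction reduction(src_extent, 2);

  // Temporary workspace for the cross-CTA reduction, if the operator needs one
  cutlass::DeviceAllocation<uint8_t> workspace(reduction.workspace_size());

  return reduction.reduce(dst.device_ref(), src.device_ref(), workspace.get());
}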
|
||||
|
||||
@@ -0,0 +1,367 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
* provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of
|
||||
* conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
* conditions and the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution.
|
||||
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
* to endorse or promote products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Kernel performing a reduction over one or more ranks of an affine tensor
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cutlass/array.h"
|
||||
#include "cutlass/fast_math.h"
|
||||
#include "cutlass/numeric_types.h"
|
||||
#include "cutlass/numeric_conversion.h"
|
||||
#include "cutlass/device_kernel.h"
|
||||
|
||||
#include "cutlass/reduction/kernel/tensor_reduce_affine_contiguous.h"
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace cutlass {
|
||||
namespace reduction {
|
||||
namespace device {
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Tensor reduction operator on layouts which are affine
|
||||
template <
|
||||
int Rank, ///< Rank of source tensor (e.g. NDHWC => 5)
|
||||
int ReducedRank, ///< Rank of reduced tensor (e.g. ND => 2)
|
||||
typename ElementOutput_,
|
||||
typename ElementSource_,
|
||||
typename ReductionOp_,
|
||||
int VectorLength = 1,
|
||||
typename ElementCompute_ = ElementOutput_,
|
||||
int Threads = 256, ///< Number of participating threads
|
||||
int BatchSize = 4 ///< Number of elements to load per batch
|
||||
>
|
||||
struct TensorReductionAffineContiguous {
|
||||
|
||||
static int const kRank = Rank;
|
||||
static int const kReducedRank = ReducedRank;
|
||||
static int const kVectorLength = VectorLength;
|
||||
static int const kInnerRank = kRank - kReducedRank;
|
||||
static int const kThreads = Threads;
|
||||
static int const kBatchSize = BatchSize;
|
||||
|
||||
using ElementOutput = ElementOutput_;
|
||||
using ElementSource = ElementSource_;
|
||||
using ReductionOp = ReductionOp_;
|
||||
using ElementCompute = ElementCompute_;
|
||||
|
||||
//
|
||||
// Data members
|
||||
//
|
||||
|
||||
/// Internal status field
|
||||
Status status;
|
||||
|
||||
/// Extent of tensor in source layout
|
||||
Coord<kRank> extent;
|
||||
|
||||
/// Number of points in the outer index space
|
||||
int64_t outer_count;
|
||||
|
||||
/// Number of elements in the inner index space
|
||||
int64_t inner_count;
|
||||
|
||||
/// Number of workspaces needed
|
||||
int workspace_count;
|
||||
|
||||
/// CUDA Grid shape (.x => contiguous, .y => outer, .z => inner)
|
||||
dim3 grid_shape;
|
||||
|
||||
/// CUDA Threadblock shape (.x => contiguous, .y => outer, .z => inner)
|
||||
dim3 threadblock_shape;
|
||||
|
||||
/// CUDA grid shape for the final reduction step if needed
|
||||
dim3 grid_final;
|
||||
|
||||
/// CUDA threadblock shape for the final reduction step if needed
|
||||
dim3 threadblock_final;
|
||||
|
||||
private:
|
||||
//
|
||||
// Methods
|
||||
//
|
||||
|
||||
/// Helper to compute a power-of-two factor by which 'count' may be divided such that the quotient is less than 2 x 'ext'
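/// For example, reshape_pow2(24, 256) returns 8: halving 256 three times gives 32, the
/// first quotient below 2 x 24 = 48.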
|
||||
static int reshape_pow2(int ext, int count) {
|
||||
if (ext > count) {
|
||||
return 1;
|
||||
}
|
||||
int x = 1;
|
||||
for (; count >= ext * 2; ) {
|
||||
count >>= 1;
|
||||
x <<= 1;
|
||||
}
|
||||
return x;
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
/// Default ctor
|
||||
TensorReductionAffineContiguous():
|
||||
status(Status::kErrorInvalidProblem),
|
||||
extent(),
|
||||
outer_count(0),
|
||||
inner_count(0),
|
||||
workspace_count(0),
|
||||
grid_shape(0, 0, 0),
|
||||
threadblock_shape(0, 0, 0) { }
|
||||
|
||||
/// Constructor
|
||||
TensorReductionAffineContiguous(
|
||||
Coord<kRank> extent_,
|
||||
int target_threadblock_count = 128
|
||||
):
|
||||
status(Status::kSuccess),
|
||||
extent(extent_),
|
||||
outer_count(0),
|
||||
inner_count(0),
|
||||
workspace_count(0) {
|
||||
|
||||
//
|
||||
// Plan the parallel mapping strategy.
|
||||
//
|
||||
|
||||
outer_count = 1;
|
||||
inner_count = 1;
|
||||
|
||||
// Compute number of elements in strided ranks
|
||||
for (int p = 0; p < kReducedRank; ++p) {
|
||||
outer_count *= extent[p];
|
||||
}
|
||||
|
||||
for (int p = 0; p < kInnerRank; ++p) {
|
||||
inner_count *= extent[kReducedRank + p];
|
||||
}
|
||||
|
||||
int cta_count_x = 1;
|
||||
int cta_count_y = 1;
|
||||
int cta_count_z = 1;
|
||||
|
||||
int cta_threads_x = kThreads;
|
||||
int cta_threads_y = 1;
|
||||
int cta_threads_z = 1;
|
||||
|
||||
// Determine CTA shape
|
||||
int64_t inner_vector_count = inner_count / kVectorLength;
|
||||
|
||||
// Priority 1. Assign threadblocks to outer indices if possible
|
||||
if (outer_count > target_threadblock_count) {
|
||||
cta_count_x = 1;
|
||||
cta_count_y = target_threadblock_count;
|
||||
cta_count_z = 1;
|
||||
}
|
||||
else {
|
||||
|
||||
cta_count_y = int(outer_count);
|
||||
int remaining_ctas = target_threadblock_count / cta_count_y;
|
||||
|
||||
// Priority 2. Assign inner dimensions to one CTA
|
||||
if (inner_vector_count > cta_threads_x) {
|
||||
int64_t cta_z_bound = inner_vector_count / cta_threads_x;
|
||||
if (cta_z_bound > remaining_ctas) {
|
||||
cta_count_z = remaining_ctas;
|
||||
}
|
||||
else {
|
||||
cta_count_z = int(cta_z_bound);
|
||||
}
|
||||
}
|
||||
else {
|
||||
cta_threads_x = reshape_pow2(int(inner_vector_count), cta_threads_x);
|
||||
cta_count_z = 1;
|
||||
}
|
||||
}
|
||||
|
||||
grid_shape = dim3(cta_count_x, cta_count_y, cta_count_z);
|
||||
threadblock_shape = dim3(cta_threads_x, cta_threads_y, cta_threads_z);
|
||||
|
||||
workspace_count = (cta_count_z > 1 ? cta_count_z : 0);
|
||||
|
||||
// Determine shape of final reduction kernel if needed
|
||||
if (workspace_count) {
|
||||
|
||||
int final_threads = kThreads;
|
||||
int final_ctas = 1;
|
||||
|
||||
if (outer_count > kThreads) {
|
||||
final_ctas = int(outer_count + kThreads - 1) / kThreads;
|
||||
}
|
||||
else {
|
||||
final_threads = int(outer_count);
|
||||
}
|
||||
|
||||
grid_final = dim3(final_ctas, 1, 1);
|
||||
threadblock_final = dim3(final_threads, 1, 1);
|
||||
}
|
||||
else {
|
||||
grid_final = dim3(0, 0, 0);
|
||||
threadblock_final = dim3(0, 0, 0);
|
||||
}
|
||||
}
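// Illustrative planning example (extent assumed): with kThreads = 256, kVectorLength = 1,
// target_threadblock_count = 128, and extent (1, 2, 2, 65536), outer_count = 4 and
// inner_count = 65536. The outer space receives cta_count_y = 4, leaving 32 CTAs for the
// inner space, so grid_shape = (1, 4, 32) and threadblock_shape = (256, 1, 1). Because
// cta_count_z > 1, workspace_count = 32 and the final reduction step uses
// grid_final = (1, 1, 1) with threadblock_final = (4, 1, 1).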
|
||||
|
||||
/// Simple check to verify the object is initialized correctly
|
||||
bool good() const {
|
||||
return status == Status::kSuccess;
|
||||
}
|
||||
|
||||
/// Size (in bytes) of <outer_count> workspace elements which are densely packed together
|
||||
int64_t workspace_stride() const {
|
||||
|
||||
// Error condition
|
||||
if (!good()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return outer_count * sizeof_bits<ElementCompute>::value / 8;
|
||||
}
|
||||
|
||||
/// Returns the size (in bytes) of a temporary workspace needed for reduction across CTAs
|
||||
int64_t workspace_size() const {
|
||||
|
||||
// Error condition
|
||||
if (!good()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// No reduction across CTAs
|
||||
if (grid_shape.z == 1) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return workspace_stride() * grid_shape.z;
|
||||
}
|
||||
|
||||
/// Performs a reduction
|
||||
Status reduce(
|
||||
ElementOutput *dst_ptr, ///< Pointer to destination tensor
|
||||
int64_t dst_stride[],                         ///< Stride vector (of length kReducedRank)
|
||||
ElementSource const *src_ptr, ///< Pointer to source tensor
|
||||
int64_t src_stride[], ///< Stride vector (of length kRank - 1)
|
||||
void *device_workspace_ptr = nullptr, ///< Device workspace
|
||||
ElementCompute reduction_identity = ElementCompute(), ///< Reduction identity element
|
||||
ReductionOp reduction_op = ReductionOp(), ///< Reduction operator
|
||||
cudaStream_t stream = nullptr) { ///< CUDA Stream into which all kernels are launched
|
||||
|
||||
// Initial status check
|
||||
if (!good()) {
|
||||
return status;
|
||||
}
|
||||
|
||||
// Guard against null workspace
|
||||
if (workspace_count > 1 && device_workspace_ptr == nullptr) {
|
||||
return Status::kErrorWorkspaceNull;
|
||||
}
|
||||
|
||||
// Define reduction kernel
|
||||
using ReductionKernel = kernel::TensorReductionAffineContiguous<
|
||||
kRank,
|
||||
kReducedRank,
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
ReductionOp,
|
||||
kVectorLength,
|
||||
ElementCompute,
|
||||
kThreads>;
|
||||
|
||||
using FinalReductionKernel = kernel::TensorReductionAffineContiguousFinal<
|
||||
kRank,
|
||||
kReducedRank,
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
ReductionOp,
|
||||
kVectorLength,
|
||||
ElementCompute,
|
||||
kThreads>;
|
||||
|
||||
using Params = typename ReductionKernel::Params;
|
||||
|
||||
// Construct the parameters
|
||||
Params params(
|
||||
extent,
|
||||
dst_ptr,
|
||||
dst_stride,
|
||||
src_ptr,
|
||||
src_stride,
|
||||
static_cast<ElementCompute *>(device_workspace_ptr),
|
||||
workspace_stride(),
|
||||
workspace_count,
|
||||
reduction_op,
|
||||
reduction_identity);
|
||||
|
||||
// Shared memory size
|
||||
int shared_mem_bytes = sizeof(typename ReductionKernel::SharedStorage);
|
||||
|
||||
// Launch the kernel
|
||||
Kernel<ReductionKernel><<< grid_shape, threadblock_shape, shared_mem_bytes, stream >>>(params);
|
||||
|
||||
// Check error condition
|
||||
if (cudaPeekAtLastError() == cudaSuccess) {
|
||||
status = Status::kSuccess;
|
||||
}
|
||||
else {
|
||||
status = Status::kErrorInternal;
|
||||
}
|
||||
|
||||
// Final reduction kernel
|
||||
if (workspace_count) {
|
||||
Kernel<FinalReductionKernel><<< grid_final, threadblock_final, 0, stream >>>(params);
|
||||
}
|
||||
|
||||
// Check error condition
|
||||
if (cudaPeekAtLastError() == cudaSuccess) {
|
||||
status = Status::kSuccess;
|
||||
}
|
||||
else {
|
||||
status = Status::kErrorInternal;
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
/// Helper to use overloaded function call operator
|
||||
Status operator()(
|
||||
ElementOutput *dst_ptr, ///< Pointer to destination tensor
|
||||
int64_t dst_stride[],                         ///< Stride vector (of length kReducedRank)
|
||||
ElementSource const *src_ptr, ///< Pointer to source tensor
|
||||
int64_t src_stride[], ///< Stride vector (of length kRank - 1)
|
||||
void *device_workspace_ptr = nullptr, ///< Pointer to device workspace
|
||||
ElementCompute reduction_identity = ElementCompute(), ///< Reduction identity element
|
||||
ReductionOp reduction_op = ReductionOp(), ///< Reduction operator
|
||||
cudaStream_t stream = nullptr) { ///< CUDA Stream into which all kernels are launched
|
||||
|
||||
return reduce(dst_ptr, dst_stride, src_ptr, src_stride, device_workspace_ptr, reduction_identity, reduction_op, stream);
|
||||
}
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
} // namespace device
|
||||
} // namespace reduction
|
||||
} // namespace cutlass
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
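// Calling this operator directly (illustrative; names and extents assumed): for a packed
// NHWC source of extent (N, H, W, C) reduced over its contiguous C rank, the rank-ordered
// element strides are {H*W*C, W*C, C} for the source and {H*W, W, 1} for the packed NHW
// destination. The TensorReduction wrapper in tensor_reduce.h derives the same strides
// from TensorRef objects.
//
//   using Reduction = cutlass::reduction::device::TensorReductionAffineContiguous<
//     4, 3, float, float, cutlass::plus<float>>;
//
//   Reduction reduction(cutlass::make_Coord(N, H, W, C));
//
//   int64_t src_stride[] = { int64_t(H) * W * C, int64_t(W) * C, C };
//   int64_t dst_stride[] = { int64_t(H) * W, W, 1 };
//
//   cutlass::Status status = reduction.reduce(
//     dst_ptr, dst_stride, src_ptr, src_stride, workspace_ptr);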
|
||||
355
include/cutlass/reduction/device/tensor_reduce_affine_strided.h
Normal file
@@ -0,0 +1,355 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
* provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of
|
||||
* conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
* conditions and the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution.
|
||||
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
* to endorse or promote products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Kernel performing a reduction over one or more ranks of an affine tensor
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cutlass/array.h"
|
||||
#include "cutlass/fast_math.h"
|
||||
#include "cutlass/numeric_types.h"
|
||||
#include "cutlass/numeric_conversion.h"
|
||||
#include "cutlass/device_kernel.h"
|
||||
|
||||
#include "cutlass/reduction/kernel/tensor_reduce_affine_strided.h"
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace cutlass {
|
||||
namespace reduction {
|
||||
namespace device {
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Tensor reduction operator on layouts which are affine
|
||||
template <
|
||||
int Rank, ///< Rank of source tensor (e.g. NDHWC => 5)
|
||||
int ReducedRank, ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
|
||||
typename ElementOutput_,
|
||||
typename ElementSource_,
|
||||
typename ReductionOp_,
|
||||
int VectorLength = 1,
|
||||
typename ElementCompute_ = ElementOutput_,
|
||||
int Threads = 256, ///< Number of participating threads
|
||||
int BatchSize = 4 ///< Number of elements to load per batch
|
||||
>
|
||||
struct TensorReductionAffineStrided {
|
||||
|
||||
static int const kRank = Rank;
|
||||
static int const kReducedRank = ReducedRank;
|
||||
static int const kVectorLength = VectorLength;
|
||||
static int const kInnerRank = kRank - kReducedRank;
|
||||
static int const kThreads = Threads;
|
||||
static int const kBatchSize = BatchSize;
|
||||
|
||||
using ElementOutput = ElementOutput_;
|
||||
using ElementSource = ElementSource_;
|
||||
using ReductionOp = ReductionOp_;
|
||||
using ElementCompute = ElementCompute_;
|
||||
|
||||
//
|
||||
// Data members
|
||||
//
|
||||
|
||||
/// Internal status field
|
||||
Status status;
|
||||
|
||||
/// Extent of tensor in source layout
|
||||
Coord<kRank> extent;
|
||||
|
||||
/// Number of points in the outer index space
|
||||
int64_t outer_count;
|
||||
|
||||
/// Number of elements in the inner index space
|
||||
int64_t inner_count;
|
||||
|
||||
/// Number of workspaces needed
|
||||
int workspace_count;
|
||||
|
||||
/// CUDA Grid shape (.x => contiguous, .y => outer, .z => inner)
|
||||
dim3 grid_shape;
|
||||
|
||||
/// CUDA Threadblock shape (.x => contiguous, .y => outer, .z => inner)
|
||||
dim3 threadblock_shape;
|
||||
|
||||
/// CUDA grid shape for the final reduction step if needed
|
||||
dim3 grid_final;
|
||||
|
||||
/// CUDA threadblock shape for the final reduction step if needed
|
||||
dim3 threadblock_final;
|
||||
|
||||
private:
|
||||
//
|
||||
// Methods
|
||||
//
|
||||
|
||||
/// Helper to compute a power-of-two factor by which 'count' may be divided such that the quotient is less than 2 x 'ext'
|
||||
static int reshape_pow2(int ext, int count) {
|
||||
if (ext > count) {
|
||||
return 1;
|
||||
}
|
||||
int x = 1;
|
||||
for (; count >= ext * 2; ) {
|
||||
count >>= 1;
|
||||
x <<= 1;
|
||||
}
|
||||
return x;
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
/// Default ctor
|
||||
TensorReductionAffineStrided():
|
||||
status(Status::kErrorInvalidProblem),
|
||||
extent(),
|
||||
outer_count(0),
|
||||
inner_count(0),
|
||||
workspace_count(0),
|
||||
grid_shape(0, 0, 0),
|
||||
threadblock_shape(0, 0, 0) { }
|
||||
|
||||
/// Constructor
|
||||
TensorReductionAffineStrided(
|
||||
Coord<kRank> extent_,
|
||||
int target_threadblock_count = 128
|
||||
):
|
||||
status(Status::kSuccess),
|
||||
extent(extent_),
|
||||
outer_count(0),
|
||||
inner_count(0),
|
||||
workspace_count(0) {
|
||||
|
||||
//
|
||||
// Plan the parallel mapping strategy.
|
||||
//
|
||||
|
||||
outer_count = 1;
|
||||
inner_count = 1;
|
||||
|
||||
// Compute number of elements in strided ranks
|
||||
for (int p = 0; p < kReducedRank - 1; ++p) {
|
||||
outer_count *= extent[p];
|
||||
}
|
||||
|
||||
for (int p = 0; p < kInnerRank; ++p) {
|
||||
inner_count *= extent[kReducedRank + p - 1];
|
||||
}
|
||||
|
||||
// Compute plan for the reduction
|
||||
int extent_c = extent[kRank - 1];
|
||||
int vectors_c = (extent_c + kVectorLength - 1) / kVectorLength;
|
||||
|
||||
// Determine CTA shape
|
||||
int cta_width = kThreads * kVectorLength;
|
||||
int cta_ways = reshape_pow2(extent_c, cta_width);
|
||||
int cta_threads_x = kThreads / cta_ways;
|
||||
|
||||
threadblock_shape = dim3(cta_threads_x, 1, std::min(cta_ways, 64));
|
||||
|
||||
// This leads to an error.
|
||||
if (threadblock_shape.z > 1) {
|
||||
if (threadblock_shape.y != 1) {
|
||||
status = Status::kErrorInternal;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Determine grid shape
|
||||
int cta_count_x = (vectors_c + cta_threads_x - 1) / cta_threads_x;
|
||||
int cta_count_y = std::max(1, target_threadblock_count / cta_count_x);
|
||||
|
||||
// Limit the number of CTAs assigned to outer dimension
|
||||
if (int64_t(cta_count_y * threadblock_shape.y) > outer_count) {
|
||||
cta_count_y = int(outer_count + threadblock_shape.y - 1) / threadblock_shape.y;
|
||||
}
|
||||
|
||||
// Limit the number of CTAs assigned to inner dimension
|
||||
int cta_count_z = std::max(1, target_threadblock_count / cta_count_y);
|
||||
if (int64_t(cta_count_z * threadblock_shape.z) > inner_count) {
|
||||
cta_count_z = int(inner_count + threadblock_shape.z - 1) / threadblock_shape.z;
|
||||
}
|
||||
|
||||
grid_shape = dim3(cta_count_x, cta_count_y, cta_count_z);
|
||||
workspace_count = (cta_count_z > 1 ? cta_count_z : 0);
|
||||
|
||||
// Determine shape of final reduction kernel if needed
|
||||
grid_final = dim3(cta_count_x, int(outer_count));
|
||||
threadblock_final = dim3(cta_threads_x, 1, 1);
|
||||
}
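// Illustrative planning example (extent assumed): with kThreads = 256, kVectorLength = 1,
// target_threadblock_count = 128, and extent (2, 8, 8, 64), outer_count = 16,
// inner_count = 8, and extent_c = 64. reshape_pow2(64, 256) gives cta_ways = 4, so
// threadblock_shape = (64, 1, 4). The CTA limits then yield grid_shape = (1, 16, 2);
// since grid_shape.z > 1, workspace_count = 2 and the final step runs with
// grid_final = (1, 16) and threadblock_final = (64, 1, 1).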
|
||||
|
||||
/// Simple check to verify the object is initialized correctly
|
||||
bool good() const {
|
||||
return status == Status::kSuccess;
|
||||
}
|
||||
|
||||
/// Size of one CTA's workspace
|
||||
int64_t workspace_stride() const {
|
||||
|
||||
// Error condition
|
||||
if (!good()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int vector_size_bytes = kVectorLength * sizeof_bits<ElementCompute>::value / 8;
|
||||
|
||||
return extent[kRank - 1] * vector_size_bytes;
|
||||
}
|
||||
|
||||
/// Returns the size (in bytes) of a temporary workspace needed for reduction across CTAs
|
||||
int64_t workspace_size() const {
|
||||
|
||||
// Error condition
|
||||
if (!good()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// No reduction across CTAs
|
||||
if (grid_shape.z == 1) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return workspace_stride() * outer_count * grid_shape.z;
|
||||
}
|
||||
|
||||
/// Performs a reduction
|
||||
Status reduce(
|
||||
ElementOutput *dst_ptr, ///< Pointer to destination tensor
|
||||
int64_t dst_stride[], ///< Stride vector (of length kReducedRank - 1)
|
||||
ElementSource const *src_ptr, ///< Pointer to source tensor
|
||||
int64_t src_stride[], ///< Stride vector (of length kRank - 1)
|
||||
void *device_workspace_ptr = nullptr, ///< Device workspace
|
||||
ElementCompute reduction_identity = ElementCompute(),   ///< Reduction identity
|
||||
ReductionOp reduction_op = ReductionOp(), ///< Reduction operator
|
||||
cudaStream_t stream = nullptr) { ///< CUDA Stream into which all kernels are launched
|
||||
|
||||
// Initial status check
|
||||
if (!good()) {
|
||||
return status;
|
||||
}
|
||||
|
||||
// Guard against null workspace
|
||||
if (workspace_count > 1 && device_workspace_ptr == nullptr) {
|
||||
return Status::kErrorWorkspaceNull;
|
||||
}
|
||||
|
||||
// Define reduction kernel
|
||||
using ReductionKernel = kernel::TensorReductionAffineStrided<
|
||||
kRank,
|
||||
kReducedRank,
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
ReductionOp,
|
||||
kVectorLength,
|
||||
ElementCompute,
|
||||
kThreads>;
|
||||
|
||||
using FinalReductionKernel = kernel::TensorReductionAffineStridedFinal<
|
||||
kRank,
|
||||
kReducedRank,
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
ReductionOp,
|
||||
kVectorLength,
|
||||
ElementCompute,
|
||||
kThreads>;
|
||||
|
||||
using Params = typename ReductionKernel::Params;
|
||||
|
||||
// Construct the parameters
|
||||
Params params(
|
||||
extent,
|
||||
dst_ptr,
|
||||
dst_stride,
|
||||
src_ptr,
|
||||
src_stride,
|
||||
static_cast<ElementCompute *>(device_workspace_ptr),
|
||||
workspace_stride(),
|
||||
workspace_count,
|
||||
reduction_op,
|
||||
reduction_identity);
|
||||
|
||||
// Shared memory size
|
||||
int shared_mem_bytes = sizeof(typename ReductionKernel::SharedStorage);
|
||||
|
||||
// Launch the kernel
|
||||
Kernel<ReductionKernel><<< grid_shape, threadblock_shape, shared_mem_bytes, stream >>>(params);
|
||||
|
||||
// Check error condition
|
||||
if (cudaPeekAtLastError() == cudaSuccess) {
|
||||
status = Status::kSuccess;
|
||||
}
|
||||
else {
|
||||
status = Status::kErrorInternal;
|
||||
}
|
||||
|
||||
// Final reduction kernel
|
||||
if (workspace_count) {
|
||||
|
||||
Kernel<FinalReductionKernel><<< grid_final, threadblock_final, 0, stream >>>(params);
|
||||
|
||||
// Check error condition
|
||||
if (cudaPeekAtLastError() == cudaSuccess) {
|
||||
status = Status::kSuccess;
|
||||
}
|
||||
else {
|
||||
status = Status::kErrorInternal;
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
/// Helper to use overloaded function call operator
|
||||
Status operator()(
|
||||
ElementOutput *dst_ptr, ///< Pointer to destination tensor
|
||||
int64_t dst_stride[], ///< Stride vector (of length kReducedRank - 1)
|
||||
ElementSource const *src_ptr, ///< Pointer to source tensor
|
||||
int64_t src_stride[], ///< Stride vector (of length kRank - 1)
|
||||
void *device_workspace_ptr = nullptr, ///< Pointer to device workspace
|
||||
ElementCompute reduction_identity = ElementCompute(),   ///< Reduction identity
|
||||
ReductionOp reduction_op = ReductionOp(), ///< Reduction operator
|
||||
cudaStream_t stream = nullptr) { ///< CUDA Stream into which all kernels are launched
|
||||
|
||||
return reduce(
|
||||
dst_ptr,
|
||||
dst_stride,
|
||||
src_ptr,
|
||||
src_stride,
|
||||
device_workspace_ptr,
|
||||
reduction_identity,
|
||||
reduction_op,
|
||||
stream);
|
||||
}
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
} // namespace device
|
||||
} // namespace reduction
|
||||
} // namespace cutlass
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@@ -0,0 +1,600 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
* provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of
|
||||
* conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
* conditions and the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution.
|
||||
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
* to endorse or promote products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Kernel performing a reduction over one or more ranks of an affine tensor
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cutlass/array.h"
|
||||
#include "cutlass/fast_math.h"
|
||||
#include "cutlass/numeric_types.h"
|
||||
#include "cutlass/numeric_conversion.h"
|
||||
#include "cutlass/device_kernel.h"
|
||||
|
||||
#include "cutlass/reduction/thread/reduction_operators.h"
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace cutlass {
|
||||
namespace reduction {
|
||||
namespace kernel {
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Parameters structure
|
||||
template <
|
||||
int Rank, ///< Rank of source tensor (e.g. NDHWC => 5)
|
||||
int ReducedRank, ///< Rank of reduced tensor (i.e. number of outer ranks)
|
||||
typename ElementOutput, ///< Data type of output tensor
|
||||
typename ElementSource, ///< Data type of source tensor
|
||||
typename ReductionOp, ///< Reduction operator
|
||||
int VectorLength = 1, ///< Vector length for memory
|
||||
typename ElementCompute = ElementOutput, ///< Internal compute type - input type of reduction operation
|
||||
int Threads = 256, ///< Number of participating threads
|
||||
int BatchSize = 4 ///< Number of elements to load per batch
|
||||
>
|
||||
struct TensorReductionAffineContiguousParams {
|
||||
|
||||
static int const kRank = Rank;
|
||||
static int const kReducedRank = ReducedRank;
|
||||
static int const kVectorLength = VectorLength;
|
||||
static int const kInnerRank = kRank - kReducedRank;
|
||||
static int const kThreads = Threads;
|
||||
static int const kBatchSize = BatchSize;
|
||||
|
||||
Coord<kRank> extent; /// Extent of source tensor
|
||||
FastDivmodU64 divmod[kRank - 1]; /// FastDivmod by each strided rank
|
||||
int64_t dst_stride[kReducedRank]; /// stride (units of bytes) - I, J
|
||||
int64_t src_stride[kRank - 1]; /// stride (units of bytes) - I, J, K
|
||||
int64_t workspace_stride; /// stride (units of bytes) between workspaces
|
||||
int workspace_count; /// number of workspaces
|
||||
|
||||
uint64_t inner_count; /// Number of elements in reduced index space
|
||||
uint64_t outer_count; /// Number of elements in outer index space
|
||||
|
||||
ElementOutput * destination; /// Pointer to output tensor of rank kReducedRank
|
||||
ElementSource const * source; /// Pointer to source tensor of rank kRank
|
||||
ReductionOp reduction_op; /// Reduction operator
|
||||
ElementCompute reduction_identity; /// Identity element used by reduction operator
|
||||
ElementCompute *device_workspace; /// Pointer to device workspace for inter-CTA reductions
|
||||
|
||||
//
|
||||
// Methods
|
||||
//
|
||||
|
||||
/// Ctor
|
||||
CUTLASS_HOST_DEVICE
|
||||
TensorReductionAffineContiguousParams() {
|
||||
|
||||
}
|
||||
|
||||
/// Ctor
|
||||
TensorReductionAffineContiguousParams(
|
||||
Coord<kRank> extent_, ///< Extent of source tensor
|
||||
ElementOutput * dst_ptr_, ///< Output tensor data
|
||||
int64_t dst_stride_[], ///< Stride (units of elements)
|
||||
ElementSource const * src_ptr_, ///< Source tensor data
|
||||
int64_t src_stride_[], ///< Stride (units of elements)
|
||||
ElementCompute *device_workspace_, ///< Pointer to device workspace for inter-CTA reductions
|
||||
int64_t workspace_stride_, ///< Stride between workspaces
|
||||
int workspace_count_, ///< Number of workspaces
|
||||
ReductionOp reduction_op_, ///< Reduction operator
|
||||
ElementCompute reduction_identity_ = ElementCompute() ///< Identity element used by reduction operator
|
||||
):
|
||||
extent(extent_),
|
||||
inner_count(1),
|
||||
outer_count(1),
|
||||
destination(dst_ptr_),
|
||||
source(src_ptr_),
|
||||
device_workspace(device_workspace_),
|
||||
workspace_stride(workspace_stride_),
|
||||
workspace_count(workspace_count_),
|
||||
reduction_op(reduction_op_),
|
||||
reduction_identity(reduction_identity_) {
|
||||
|
||||
// Initialize divisors for fast div-mod
|
||||
for (int p = 1; p < kRank; ++p) {
|
||||
divmod[p - 1] = FastDivmodU64(uint64_t(extent[p]));
|
||||
}
|
||||
|
||||
int input_size_bits = sizeof_bits<ElementSource>::value;
|
||||
int output_size_bits = sizeof_bits<ElementOutput>::value;
|
||||
|
||||
// Compute strides in units of bytes
|
||||
for (int p = 0; p < kReducedRank; ++p) {
|
||||
dst_stride[p] = dst_stride_[p] * output_size_bits / 8;
|
||||
}
|
||||
|
||||
for (int p = 0; p < kRank - 1; ++p) {
|
||||
src_stride[p] = src_stride_[p] * input_size_bits / 8;
|
||||
}
|
||||
|
||||
// Compute number of elements in strided ranks
|
||||
for (int p = 0; p < kReducedRank; ++p) {
|
||||
outer_count *= uint64_t(extent[p]);
|
||||
}
|
||||
|
||||
for (int p = 0; p < kInnerRank; ++p) {
|
||||
inner_count *= uint64_t(extent[kRank - 1 - p]);
|
||||
}
|
||||
}
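// For example (element type and strides assumed): with ElementSource = half_t (16 bits)
// and rank-ordered element strides {H*W*C, W*C, C}, src_stride is stored as
// {2*H*W*C, 2*W*C, 2*C} bytes. The divmod array caches fast divisors for extent[1..kRank-1],
// which the CoordinateDecomposition calls in the kernels below rely on.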
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Kernel to reduce a tensor with affine layout over a set of ranks *INCLUDING* the contiguous
|
||||
/// rank. This leads to favorable vectorized memory accesses over the contiguous rank.
|
||||
template <
|
||||
int Rank, ///< Rank of source tensor (e.g. NDHWC => 5)
|
||||
int ReducedRank, ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
|
||||
typename ElementOutput, ///< Data type of output tensor
|
||||
typename ElementSource, ///< Data type of source tensor
|
||||
typename ReductionOp, ///< Reduction operator
|
||||
int VectorLength = 1, ///< Vector length for memory
|
||||
typename ElementCompute = ElementOutput, ///< Internal compute type - input type of reduction operation
|
||||
int Threads = 256, ///< Number of participating threads
|
||||
int BatchSize = 4 ///< Number of elements to load per batch
|
||||
>
|
||||
class TensorReductionAffineContiguous {
|
||||
public:
|
||||
|
||||
static int const kRank = Rank;
|
||||
static int const kReducedRank = ReducedRank;
|
||||
static int const kVectorLength = VectorLength;
|
||||
static int const kInnerRank = kRank - kReducedRank;
|
||||
static int const kThreads = Threads;
|
||||
static int const kBatchSize = BatchSize;
|
||||
using ComputeFragment = Array<ElementCompute, VectorLength>;
|
||||
using SourceFragment = AlignedArray<ElementSource, VectorLength>;
|
||||
using OutputFragment = AlignedArray<ElementOutput, VectorLength>;
|
||||
|
||||
/// Shared memory allocation used for reduction within the CTA
|
||||
struct SharedStorage {
|
||||
Array<ElementCompute, kThreads * kVectorLength> workspace;
|
||||
};
|
||||
|
||||
/// Parameters structure
|
||||
using Params = TensorReductionAffineContiguousParams<
|
||||
Rank,
|
||||
ReducedRank,
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
ReductionOp,
|
||||
VectorLength,
|
||||
ElementCompute,
|
||||
Threads,
|
||||
BatchSize
|
||||
>;
|
||||
|
||||
private:
|
||||
|
||||
/// Computes the coordinate and offset of a given linear index
|
||||
CUTLASS_DEVICE
|
||||
void compute_inner_coord_and_offset_(
|
||||
Params const ¶ms,
|
||||
Coord<kInnerRank> & coord,
|
||||
int64_t &src_offset,
|
||||
uint64_t linear_idx) const {
|
||||
|
||||
// Decompose into a coordinate of rank <kInnerRank>
|
||||
coord = CoordinateDecomposition<kInnerRank>(linear_idx, ¶ms.divmod[kRank - kInnerRank]);
|
||||
|
||||
// Compute an offset using the source stride
|
||||
src_offset = 0;
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 0; i < kInnerRank - 1; ++i) {
|
||||
src_offset += coord[i] * params.src_stride[kReducedRank + i];
|
||||
}
|
||||
src_offset += coord[kInnerRank - 1] * sizeof_bits<ElementSource>::value / 8;
|
||||
}
|
||||
|
||||
/// Computes the coordinate and offset of a given linear index
|
||||
CUTLASS_DEVICE
|
||||
void compute_outer_coord_and_offset_(
|
||||
Params const ¶ms,
|
||||
Coord<kReducedRank> & coord,
|
||||
int64_t &dst_offset,
|
||||
int64_t &src_offset,
|
||||
uint64_t linear_idx) const {
|
||||
|
||||
// Decompose into coordinate of rank <kReducedRank>
|
||||
coord = CoordinateDecomposition<kReducedRank>(linear_idx, params.divmod);
|
||||
|
||||
// Compute offsets using destination and source strides
|
||||
dst_offset = 0;
|
||||
src_offset = 0;
|
||||
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 0; i < kReducedRank; ++i) {
|
||||
dst_offset += params.dst_stride[i] * coord[i];
|
||||
src_offset += params.src_stride[i] * coord[i];
|
||||
}
|
||||
}
|
||||
|
||||
/// Reduces over the reduction indices yielding a single element
|
||||
CUTLASS_DEVICE
|
||||
ElementCompute reduce_indices_(
|
||||
Params const ¶ms,
|
||||
ElementCompute *threadblock_workspace,
|
||||
char const *src_byte_ptr,
|
||||
int coord_c) {
|
||||
|
||||
NumericArrayConverter<ElementCompute, ElementSource, VectorLength> convert_source;
|
||||
ReductionOp reduction_op(params.reduction_op);
|
||||
|
||||
//
|
||||
// Early exit or initialize to identity element
|
||||
//
|
||||
if (!params.inner_count) {
|
||||
return params.reduction_identity;
|
||||
}
|
||||
|
||||
ComputeFragment accumulator;
|
||||
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 0; i < accumulator.size(); ++i) {
|
||||
accumulator[i] = params.reduction_identity;
|
||||
}
|
||||
|
||||
// Compute the coordinate of the first access
|
||||
int64_t src_byte_offset = 0;
|
||||
Coord<kInnerRank> coord;
|
||||
|
||||
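// Linearized starting position of this thread in the inner (reduced) index space:
// threadIdx.x and threadIdx.z enumerate vectors within the CTA, and blockIdx.z spreads
// cooperating CTAs across the same space; each position covers kVectorLength elements.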
uint64_t linear_idx = (threadIdx.x + blockDim.x * threadIdx.z + blockDim.x * blockIdx.z * blockDim.z) * kVectorLength;
|
||||
compute_inner_coord_and_offset_(params, coord, src_byte_offset, linear_idx);
|
||||
|
||||
// Load the first vector
|
||||
SourceFragment source_fragment[kBatchSize];
|
||||
|
||||
bool not_done = true;
|
||||
|
||||
// Iterate over vectors in a linearized reduction index space
|
||||
while (not_done) {
|
||||
|
||||
bool guards[kBatchSize];
|
||||
|
||||
// Issue a batch of loads
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int b = 0; b < kBatchSize; ++b) {
|
||||
|
||||
if (linear_idx < params.inner_count) {
|
||||
source_fragment[b] = *reinterpret_cast<SourceFragment const *>(src_byte_ptr + src_byte_offset);
|
||||
guards[b] = true;
|
||||
}
|
||||
else {
|
||||
guards[b] = false;
|
||||
not_done = false;
|
||||
}
|
||||
|
||||
linear_idx += (blockDim.z * gridDim.z * blockDim.x) * kVectorLength;
|
||||
compute_inner_coord_and_offset_(params, coord, src_byte_offset, linear_idx);
|
||||
}
|
||||
|
||||
// Perform a batch of reduction operations
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int b = 0; b < kBatchSize; ++b) {
|
||||
if (guards[b]) {
|
||||
auto cvt = convert_source(source_fragment[b]);
|
||||
|
||||
accumulator = cutlass::reduction::thread::detail::ApplyArrayOperator(
|
||||
reduction_op,
|
||||
accumulator,
|
||||
cvt);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Reduction of vectors to scalar
|
||||
//
|
||||
|
||||
ElementCompute reduced_accumulator = accumulator[0];
|
||||
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 1; i < kVectorLength; ++i) {
|
||||
reduced_accumulator = reduction_op(reduced_accumulator, accumulator[i]);
|
||||
}
|
||||
|
||||
//
|
||||
// Reduction within CTA across threadIdx.xz => threadIdx{.x = 0, .z = 0}
|
||||
//
|
||||
// This re-arranges data so threadIdx.y is effectively a row index and threadIdx.xz is a column
|
||||
//
|
||||
|
||||
int thread_count = blockDim.x * blockDim.z;
|
||||
int thread_j = threadIdx.x + blockDim.x * threadIdx.z;
|
||||
int thread_i = threadIdx.y;
|
||||
|
||||
ElementCompute *frag_ptr = reinterpret_cast<ElementCompute *>(threadblock_workspace) + thread_i * thread_count;
|
||||
|
||||
frag_ptr[thread_j] = reduced_accumulator;
|
||||
|
||||
//
|
||||
// Reduce
|
||||
//
|
||||
CUTLASS_PRAGMA_NO_UNROLL
|
||||
while (thread_count > 1) {
|
||||
thread_count /= 2;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (thread_j < thread_count) {
|
||||
ElementCompute other = frag_ptr[thread_j + thread_count];
|
||||
|
||||
reduced_accumulator = reduction_op(reduced_accumulator, other);
|
||||
|
||||
frag_ptr[thread_j] = reduced_accumulator;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
|
||||
return reduced_accumulator;
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
/// Perform a reduction
|
||||
CUTLASS_DEVICE
|
||||
void operator()(Params const ¶ms, SharedStorage &shared_storage) {
|
||||
|
||||
int coord_c = (blockIdx.x * blockDim.x + threadIdx.x) * kVectorLength;
|
||||
|
||||
char const * src_byte_ptr = reinterpret_cast<char const *>(params.source);
|
||||
char * dst_byte_ptr = nullptr;
|
||||
|
||||
// If performing a reduction across CTAs, redirect output to device workspace
|
||||
if (gridDim.z == 1) {
|
||||
dst_byte_ptr = reinterpret_cast<char *>(params.destination);
|
||||
}
|
||||
else {
|
||||
dst_byte_ptr = reinterpret_cast<char *>(params.device_workspace);
|
||||
}
|
||||
|
||||
uint64_t idx_linear = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
// Use modulo division to compute location
|
||||
Coord<kReducedRank> outer_coord;
|
||||
int64_t dst_byte_offset;
|
||||
int64_t src_byte_offset;
|
||||
|
||||
compute_outer_coord_and_offset_(
|
||||
params,
|
||||
outer_coord,
|
||||
dst_byte_offset,
|
||||
src_byte_offset,
|
||||
idx_linear);
|
||||
|
||||
if (gridDim.z == 1) {
|
||||
|
||||
/// Complete the reduction with no workspace
|
||||
while (idx_linear < params.outer_count) {
|
||||
|
||||
ElementCompute result = reduce_indices_(
|
||||
params,
|
||||
shared_storage.workspace.data(),
|
||||
src_byte_ptr + src_byte_offset,
|
||||
coord_c);
|
||||
|
||||
// Store the result after possible final reduction within the CTA
|
||||
if (threadIdx.z == 0 && threadIdx.x == 0) {
|
||||
|
||||
// Convert to output type and store
|
||||
NumericConverter<ElementOutput, ElementCompute> convert_output;
|
||||
ElementOutput cvt = convert_output(result);
|
||||
|
||||
*reinterpret_cast<ElementOutput *>(dst_byte_ptr + dst_byte_offset) = cvt;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Update indices and pointers
|
||||
idx_linear += gridDim.y * blockDim.y;
|
||||
|
||||
compute_outer_coord_and_offset_(
|
||||
params,
|
||||
outer_coord,
|
||||
dst_byte_offset,
|
||||
src_byte_offset,
|
||||
idx_linear);
|
||||
|
||||
} // while
|
||||
}
|
||||
else {
|
||||
|
||||
/// Complete the reduction with workspace
|
||||
while (idx_linear < params.outer_count) {
|
||||
|
||||
ElementCompute result = reduce_indices_(
|
||||
params,
|
||||
shared_storage.workspace.data(),
|
||||
src_byte_ptr + src_byte_offset,
|
||||
coord_c);
|
||||
|
||||
int64_t byte_offset =
|
||||
blockIdx.z * params.workspace_stride + idx_linear * sizeof_bits<ElementCompute>::value / 8;
|
||||
|
||||
// Store the result for final reduction
|
||||
if (threadIdx.z == 0 && threadIdx.x == 0) {
|
||||
*reinterpret_cast<ElementCompute *>(dst_byte_ptr + byte_offset) = result;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Update indices and pointers
|
||||
idx_linear += gridDim.y * blockDim.y;
|
||||
|
||||
compute_outer_coord_and_offset_(
|
||||
params,
|
||||
outer_coord,
|
||||
dst_byte_offset,
|
||||
src_byte_offset,
|
||||
idx_linear);
|
||||
} // while
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Kernel to perform final reduction
|
||||
template <
|
||||
int Rank, ///< Rank of source tensor (e.g. NDHWC => 5)
|
||||
int ReducedRank, ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
|
||||
typename ElementOutput, ///< Data type of output tensor
|
||||
typename ElementSource, ///< Data type of source tensor
|
||||
typename ReductionOp, ///< Reduction operator
|
||||
int VectorLength = 1, ///< Vector length for memory
|
||||
typename ElementCompute = ElementOutput, ///< Internal compute type - input type of reduction operation
|
||||
int Threads = 256, ///< Number of participating threads
|
||||
int BatchSize = 4 ///< Number of elements to load per batch
|
||||
>
|
||||
class TensorReductionAffineContiguousFinal {
|
||||
public:
|
||||
|
||||
static int const kRank = Rank;
|
||||
static int const kReducedRank = ReducedRank;
|
||||
static int const kVectorLength = VectorLength;
|
||||
static int const kInnerRank = kRank - kReducedRank;
|
||||
static int const kThreads = Threads;
|
||||
static int const kBatchSize = BatchSize;
|
||||
|
||||
/// Shared memory
|
||||
struct SharedStorage { };
|
||||
|
||||
/// Parameters structure
|
||||
using Params = TensorReductionAffineContiguousParams<
|
||||
Rank,
|
||||
ReducedRank,
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
ReductionOp,
|
||||
VectorLength,
|
||||
ElementCompute,
|
||||
Threads,
|
||||
BatchSize
|
||||
>;
|
||||
|
||||
private:
|
||||
|
||||
/// Computes the coordinate and offset of a given linear index
|
||||
CUTLASS_DEVICE
|
||||
void compute_outer_coord_and_offset_(
|
||||
Params const ¶ms,
|
||||
Coord<kReducedRank> & coord,
|
||||
int64_t &dst_offset,
|
||||
uint64_t linear_idx) const {
|
||||
|
||||
// Decompose into coordinate of rank <kReducedRank>
|
||||
coord = CoordinateDecomposition<kReducedRank>(linear_idx, params.divmod);
|
||||
|
||||
// Compute offsets using destination and source strides
|
||||
dst_offset = 0;
|
||||
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 0; i < kReducedRank; ++i) {
|
||||
dst_offset += params.dst_stride[i] * coord[i];
|
||||
}
|
||||
}
|
||||
|
||||
/// Reduces over the reduction indices
|
||||
CUTLASS_DEVICE
|
||||
ElementCompute reduce_indices_(
|
||||
Params const ¶ms,
|
||||
ElementCompute const *device_workspace) {
|
||||
|
||||
ReductionOp reduction_op(params.reduction_op);
|
||||
char const *src_byte_ptr = reinterpret_cast<char const *>(device_workspace);
|
||||
|
||||
// Accumulated output
|
||||
ElementCompute accumulator = params.reduction_identity;
|
||||
|
||||
for (int iter = 0; iter < params.workspace_count; ++iter) {
|
||||
ElementCompute workspace_item = *reinterpret_cast<ElementCompute const *>(src_byte_ptr);
|
||||
|
||||
accumulator = reduction_op(accumulator, workspace_item);
|
||||
|
||||
src_byte_ptr += params.workspace_stride;
|
||||
}
|
||||
|
||||
return accumulator;
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
//
|
||||
// Methods
|
||||
//
|
||||
|
||||
/// Perform a reduction
|
||||
CUTLASS_DEVICE
|
||||
void operator()(Params const ¶ms, SharedStorage &shared_storage) {
|
||||
|
||||
uint64_t idx_linear = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
char * dst_byte_ptr = reinterpret_cast<char *>(params.destination);
|
||||
|
||||
// Use modulo division to compute location
|
||||
Coord<kReducedRank> outer_coord;
|
||||
int64_t dst_byte_offset;
|
||||
|
||||
compute_outer_coord_and_offset_(
|
||||
params,
|
||||
outer_coord,
|
||||
dst_byte_offset,
|
||||
idx_linear);
|
||||
|
||||
/// Complete the reduction
|
||||
while (idx_linear < params.outer_count) {
|
||||
|
||||
ElementCompute result = reduce_indices_(params, params.device_workspace + idx_linear);
|
||||
|
||||
// Convert to output type and store
|
||||
NumericConverter<ElementOutput, ElementCompute> convert_output;
|
||||
|
||||
*reinterpret_cast<ElementOutput *>(dst_byte_ptr + dst_byte_offset) = convert_output(result);
|
||||
|
||||
// Update indices and pointers
|
||||
idx_linear += gridDim.x * blockDim.x;
|
||||
|
||||
compute_outer_coord_and_offset_(
|
||||
params,
|
||||
outer_coord,
|
||||
dst_byte_offset,
|
||||
idx_linear);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
} // namespace kernel
|
||||
} // namespace reduction
|
||||
} // namespace cutlass
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
635
include/cutlass/reduction/kernel/tensor_reduce_affine_strided.h
Normal file
@@ -0,0 +1,635 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
* provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of
|
||||
* conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
* conditions and the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution.
|
||||
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
* to endorse or promote products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Kernel performing a reduction over one or more ranks of an affine tensor
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cutlass/array.h"
|
||||
#include "cutlass/fast_math.h"
|
||||
#include "cutlass/numeric_types.h"
|
||||
#include "cutlass/numeric_conversion.h"
|
||||
#include "cutlass/device_kernel.h"
|
||||
|
||||
#include "cutlass/reduction/thread/reduction_operators.h"
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace cutlass {
|
||||
namespace reduction {
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace kernel {
|
||||
|
||||
/// Parameters structure
|
||||
template <
|
||||
int Rank, ///< Rank of source tensor (e.g. NDHWC => 5)
|
||||
int ReducedRank, ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
|
||||
typename ElementOutput, ///< Data type of output tensor
|
||||
typename ElementSource, ///< Data type of source tensor
|
||||
typename ReductionOp, ///< Reduction operator
|
||||
int VectorLength = 1, ///< Vector length for memory
|
||||
typename ElementCompute = ElementOutput, ///< Internal compute type - input type of reduction operation
|
||||
int Threads = 256, ///< Number of participating threads
|
||||
int BatchSize = 4 ///< Number of elements to load per batch
|
||||
>
|
||||
struct TensorReductionAffineStridedParams {
|
||||
|
||||
static int const kRank = Rank;
|
||||
static int const kReducedRank = ReducedRank;
|
||||
static int const kVectorLength = VectorLength;
|
||||
static int const kInnerRank = kRank - kReducedRank;
|
||||
static int const kThreads = Threads;
|
||||
static int const kBatchSize = BatchSize;
|
||||
|
||||
Coord<kRank> extent; /// Extent of source tensor
|
||||
FastDivmodU64 divmod[kRank - 2]; /// FastDivmod by each strided rank
|
||||
int64_t dst_stride[kReducedRank - 1]; /// stride (units of bytes) - I, J
|
||||
int64_t src_stride[kRank - 1]; /// stride (units of bytes) - I, J, K
|
||||
int64_t workspace_stride; /// stride (units of bytes) between workspace
|
||||
int64_t workspace_outer_stride; /// stride (units of bytes) between 'rows' of the workspace
|
||||
int workspace_count; /// number of workspaces
|
||||
|
||||
uint64_t inner_count; /// Number of elements in reduced index space
|
||||
uint64_t outer_count; /// Number of elements in outer index space
|
||||
|
||||
ElementOutput * destination; /// Pointer to output tensor of rank kReducedRank
|
||||
ElementSource const * source;         /// Pointer to source tensor of rank kRank
|
||||
ReductionOp reduction_op; /// Reduction operator
|
||||
ElementCompute reduction_identity; /// Identity element for reduction operator
|
||||
ElementCompute *device_workspace; /// Pointer to device workspace for inter-CTA reductions
|
||||
|
||||
//
|
||||
// Methods
|
||||
//
|
||||
|
||||
/// Ctor
|
||||
CUTLASS_HOST_DEVICE
|
||||
TensorReductionAffineStridedParams() {
|
||||
|
||||
}
|
||||
|
||||
/// Ctor
|
||||
TensorReductionAffineStridedParams(
|
||||
Coord<kRank> extent_, ///< Extent of source tensor
|
||||
ElementOutput * dst_ptr_, ///< Output tensor data
|
||||
int64_t dst_stride_[], ///< Stride (units of elements)
|
||||
ElementSource const * src_ptr_, ///< Source tensor data
|
||||
int64_t src_stride_[], ///< Stride (units of elements)
|
||||
ElementCompute *device_workspace_, ///< Pointer to device workspace for inter-CTA reductions
|
||||
int64_t workspace_stride_, ///< Stride between workspaces
|
||||
int workspace_count_, ///< Number of workspaces
|
||||
ReductionOp reduction_op_, ///< Reduction operator
|
||||
ElementCompute reduction_identity_ = ElementCompute() ///< Identity element for reduction operator
|
||||
):
|
||||
extent(extent_),
|
||||
inner_count(1),
|
||||
outer_count(1),
|
||||
destination(dst_ptr_),
|
||||
source(src_ptr_),
|
||||
device_workspace(device_workspace_),
|
||||
workspace_outer_stride(0),
|
||||
workspace_stride(workspace_stride_),
|
||||
workspace_count(workspace_count_),
|
||||
reduction_op(reduction_op_),
|
||||
reduction_identity(reduction_identity_) {
|
||||
|
||||
// Initialize divisors for fast div-mod
|
||||
for (int p = 1; p < kRank - 1; ++p) {
|
||||
divmod[p - 1] = FastDivmodU64(uint64_t(extent[p]));
|
||||
}
|
||||
|
||||
int input_size_bits = sizeof_bits<ElementSource>::value;
|
||||
int output_size_bits = sizeof_bits<ElementOutput>::value;
|
||||
|
||||
workspace_outer_stride = workspace_stride * workspace_count;
|
||||
|
||||
// Compute strides in units of bytes
|
||||
for (int p = 0; p < kReducedRank - 1; ++p) {
|
||||
dst_stride[p] = dst_stride_[p] * output_size_bits / 8;
|
||||
}
|
||||
|
||||
for (int p = 0; p < kRank - 1; ++p) {
|
||||
src_stride[p] = src_stride_[p] * input_size_bits / 8;
|
||||
}
|
||||
|
||||
// Compute number of elements in strided ranks
|
||||
for (int p = 0; p < kReducedRank - 1; ++p) {
|
||||
outer_count *= uint64_t(extent[p]);
|
||||
}
|
||||
|
||||
for (int p = 0; p < kInnerRank; ++p) {
|
||||
inner_count *= uint64_t(extent[kReducedRank + p - 1]);
|
||||
}
|
||||
}
|
||||
};
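
/////////////////////////////////////////////////////////////////////////////////////////////////

// Illustrative sketch (disabled): one plausible way to populate the parameters above for a
// rank-4 NHWC tensor reduced over H and W (kRank = 4, kReducedRank = 2) with a 'plus'
// reduction. The pointers, extents, and workspace quantities are assumptions chosen for the
// example and are not part of this header.
#if 0
  using ExampleReductionOp = plus<float>;
  using ExampleParams = TensorReductionAffineStridedParams<4, 2, float, float, ExampleReductionOp>;

  Coord<4> extent = make_Coord(8, 14, 14, 32);            // N, H, W, C
  int64_t dst_stride[] = { 32 };                          // output viewed as N x 1 x 1 x C
  int64_t src_stride[] = { 14 * 14 * 32, 14 * 32, 32 };   // NHWC strides (units of elements)

  ExampleParams params(
    extent,
    dst_ptr, dst_stride,        // dst_ptr: assumed device pointer to the reduced tensor
    src_ptr, src_stride,        // src_ptr: assumed device pointer to the source tensor
    workspace_ptr,              // assumed device workspace for inter-CTA partial reductions
    workspace_stride,           // bytes between consecutive partial-reduction workspaces
    workspace_count,            // number of partials contributing to each output element
    ExampleReductionOp(),
    float());                   // identity element of 'plus'
#endif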
|
||||
|
||||
/// Kernel to reduce a tensor with affine layout over a set of ranks *EXCLUDING* the contiguous
|
||||
/// rank. This leads to favorable vectorized memory accesses over the contiguous rank.
|
||||
template <
|
||||
int Rank, ///< Rank of source tensor (e.g. NDHWC => 5)
|
||||
int ReducedRank, ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
|
||||
typename ElementOutput, ///< Data type of output tensor
|
||||
typename ElementSource, ///< Data type of source tensor
|
||||
typename ReductionOp, ///< Reduction operator
|
||||
int VectorLength = 1, ///< Vector length for memory
|
||||
typename ElementCompute = ElementOutput, ///< Internal compute type - input type of reduction operation
|
||||
int Threads = 256, ///< Number of participating threads
|
||||
int BatchSize = 4 ///< Number of elements to load per batch
|
||||
>
|
||||
class TensorReductionAffineStrided {
|
||||
public:
|
||||
|
||||
static int const kRank = Rank;
|
||||
static int const kReducedRank = ReducedRank;
|
||||
static int const kVectorLength = VectorLength;
|
||||
static int const kInnerRank = kRank - kReducedRank;
|
||||
static int const kThreads = Threads;
|
||||
static int const kBatchSize = BatchSize;
|
||||
using ComputeFragment = Array<ElementCompute, VectorLength>;
|
||||
using SourceFragment = AlignedArray<ElementSource, VectorLength>;
|
||||
using OutputFragment = AlignedArray<ElementOutput, VectorLength>;
|
||||
|
||||
/// Shared memory allocation used for reduction within the CTA
|
||||
struct SharedStorage {
|
||||
Array<ElementCompute, kThreads * kVectorLength> workspace;
|
||||
};
|
||||
|
||||
/// Parameters structure
|
||||
using Params = TensorReductionAffineStridedParams<
|
||||
Rank,
|
||||
ReducedRank,
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
ReductionOp,
|
||||
VectorLength,
|
||||
ElementCompute,
|
||||
Threads,
|
||||
BatchSize
|
||||
>;
|
||||
|
||||
private:
|
||||
|
||||
/// Computes the coordinate and offset of a given linear index
|
||||
CUTLASS_DEVICE
|
||||
void compute_inner_coord_and_offset_(
|
||||
Params const ¶ms,
|
||||
Coord<kInnerRank> & coord,
|
||||
int64_t &src_offset,
|
||||
uint64_t linear_idx) const {
|
||||
|
||||
// Decompose into coordinate
|
||||
coord = CoordinateDecomposition<kInnerRank>(linear_idx, &params.divmod[kReducedRank - 1]);
|
||||
|
||||
// Compute linear offset
|
||||
src_offset = 0;
|
||||
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 0; i < kInnerRank; ++i) {
|
||||
src_offset += params.src_stride[kReducedRank + i - 1] * coord[i];
|
||||
}
|
||||
}
|
||||
|
||||
/// Computes the coordinate and offset of a given linear index
|
||||
CUTLASS_DEVICE
|
||||
void compute_outer_coord_and_offset_(
|
||||
Params const ¶ms,
|
||||
Coord<kReducedRank - 1> & coord,
|
||||
int64_t &dst_offset,
|
||||
int64_t &src_offset,
|
||||
uint64_t linear_idx) const {
|
||||
|
||||
// Decompose linear coordinate
|
||||
coord = CoordinateDecomposition<kReducedRank - 1>(linear_idx, params.divmod);
|
||||
|
||||
// Compute offset into tensors
|
||||
dst_offset = 0;
|
||||
src_offset = 0;
|
||||
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 0; i < kReducedRank - 1; ++i) {
|
||||
dst_offset += params.dst_stride[i] * coord[i];
|
||||
src_offset += params.src_stride[i] * coord[i];
|
||||
}
|
||||
}
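
//
// Worked example of the decompositions above (illustrative, assuming an NHWC problem with
// N=8, H=14, W=14, C=32 and kReducedRank = 2): params.divmod then holds {H, W} = {14, 14}.
// For an inner linear index of 30, compute_inner_coord_and_offset_ yields
// coord = (30 / 14, 30 % 14) = (2, 2), i.e. (h, w) = (2, 2), and
// src_offset = params.src_stride[1] * 2 + params.src_stride[2] * 2 bytes into the source.
// The outer decomposition is analogous but walks only the first kReducedRank - 1 ranks.
//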
|
||||
|
||||
/// Reduces over the reduction indices
|
||||
CUTLASS_DEVICE
|
||||
ComputeFragment reduce_indices_(
|
||||
Params const ¶ms,
|
||||
ElementCompute *threadblock_workspace,
|
||||
char const *src_byte_ptr) {
|
||||
|
||||
NumericArrayConverter<ElementCompute, ElementSource, VectorLength> convert_source;
|
||||
ReductionOp reduction_op(params.reduction_op);
|
||||
|
||||
// Accumulated output
|
||||
ComputeFragment identity_frag;
|
||||
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 0; i < identity_frag.size(); ++i) {
|
||||
identity_frag[i] = params.reduction_identity;
|
||||
}
|
||||
|
||||
if (!params.inner_count) {
|
||||
return identity_frag;
|
||||
}
|
||||
|
||||
ComputeFragment accumulator = identity_frag;
|
||||
|
||||
// Compute the coordinate of the first access
|
||||
int64_t src_byte_offset = 0;
|
||||
Coord<kInnerRank> coord;
|
||||
|
||||
uint64_t linear_idx = threadIdx.z + blockIdx.z * blockDim.z;
|
||||
compute_inner_coord_and_offset_(params, coord, src_byte_offset, linear_idx);
|
||||
|
||||
// Load the first vector
|
||||
SourceFragment source_fragment[kBatchSize];
|
||||
|
||||
bool not_done = true;
|
||||
|
||||
// Iterate over vectors in a linearized reduction index space
|
||||
while (not_done) {
|
||||
|
||||
bool guards[kBatchSize];
|
||||
|
||||
// Issue a batch of loads
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int b = 0; b < kBatchSize; ++b) {
|
||||
|
||||
if (linear_idx < params.inner_count) {
|
||||
source_fragment[b] = *reinterpret_cast<SourceFragment const *>(src_byte_ptr + src_byte_offset);
|
||||
guards[b] = true;
|
||||
}
|
||||
else {
|
||||
guards[b] = false;
|
||||
not_done = false;
|
||||
}
|
||||
|
||||
linear_idx += blockDim.z * gridDim.z;
|
||||
compute_inner_coord_and_offset_(params, coord, src_byte_offset, linear_idx);
|
||||
}
|
||||
|
||||
// Perform a batch of reduction operations
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int b = 0; b < kBatchSize; ++b) {
|
||||
if (guards[b]) {
|
||||
|
||||
auto cvt = convert_source(source_fragment[b]);
|
||||
|
||||
accumulator = cutlass::reduction::thread::detail::ApplyArrayOperator(
|
||||
reduction_op,
|
||||
accumulator,
|
||||
cvt);
|
||||
}
|
||||
}
|
||||
} // while (not_done)
|
||||
|
||||
// Optional reduction within a CTA
|
||||
if (blockDim.z > 1) {
|
||||
|
||||
// Linearized thread ID
|
||||
int thread_idx = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);
|
||||
|
||||
// all threads store to workspace
|
||||
ComputeFragment *frag_ptr = reinterpret_cast<ComputeFragment *>(threadblock_workspace);
|
||||
|
||||
frag_ptr[thread_idx] = accumulator;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.z == 0) {
|
||||
// Load all additional block indices
|
||||
for (int z = 1; z < blockDim.z; ++z) {
|
||||
ComputeFragment frag = frag_ptr[thread_idx + z * blockDim.x * blockDim.y];
|
||||
|
||||
accumulator = cutlass::reduction::thread::detail::ApplyArrayOperator(
|
||||
reduction_op,
|
||||
accumulator,
|
||||
frag);
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
return accumulator;
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
/// Perform a reduction
|
||||
CUTLASS_DEVICE
|
||||
void operator()(Params const ¶ms, SharedStorage &shared_storage) {
|
||||
|
||||
int coord_c = (blockIdx.x * blockDim.x + threadIdx.x) * kVectorLength;
|
||||
|
||||
char const * src_byte_ptr = reinterpret_cast<char const *>(params.source + coord_c);
|
||||
char * dst_byte_ptr = nullptr;
|
||||
|
||||
// If performing a reduction across CTAs, redirect output to device workspace
|
||||
if (gridDim.z == 1) {
|
||||
dst_byte_ptr = reinterpret_cast<char *>(params.destination + coord_c);
|
||||
}
|
||||
else {
|
||||
dst_byte_ptr = reinterpret_cast<char *>(params.device_workspace + coord_c);
|
||||
}
|
||||
|
||||
// If the C index is out of bounds, exit
|
||||
if (coord_c >= params.extent[kRank - 1]) {
|
||||
return;
|
||||
}
|
||||
|
||||
int64_t idx_linear = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
// Use modulo division to compute location
|
||||
Coord<kReducedRank - 1> outer_coord;
|
||||
int64_t dst_byte_offset;
|
||||
int64_t src_byte_offset;
|
||||
|
||||
compute_outer_coord_and_offset_(
|
||||
params,
|
||||
outer_coord,
|
||||
dst_byte_offset,
|
||||
src_byte_offset,
|
||||
idx_linear);
|
||||
|
||||
if (gridDim.z == 1) {
|
||||
|
||||
/// Complete the reduction with no workspace
|
||||
while (idx_linear < params.outer_count) {
|
||||
|
||||
ComputeFragment result;
|
||||
|
||||
result = reduce_indices_(
|
||||
params,
|
||||
shared_storage.workspace.data(),
|
||||
src_byte_ptr + src_byte_offset);
|
||||
|
||||
// Store the result after possible final reduction within the CTA
|
||||
if (threadIdx.z == 0) {
|
||||
|
||||
// Convert to output type and store
|
||||
NumericArrayConverter<ElementOutput, ElementCompute, VectorLength> convert_output;
|
||||
auto cvt = convert_output(result);
|
||||
|
||||
*reinterpret_cast<OutputFragment *>(dst_byte_ptr + dst_byte_offset) =
|
||||
reinterpret_cast<OutputFragment const &>(cvt);
|
||||
}
|
||||
|
||||
// Update indices and pointers
|
||||
idx_linear += gridDim.y * blockDim.y;
|
||||
|
||||
compute_outer_coord_and_offset_(
|
||||
params,
|
||||
outer_coord,
|
||||
dst_byte_offset,
|
||||
src_byte_offset,
|
||||
idx_linear);
|
||||
|
||||
} // while
|
||||
}
|
||||
else {
|
||||
|
||||
/// Complete the reduction with a device workspace
|
||||
while (idx_linear < params.outer_count) {
|
||||
|
||||
ComputeFragment result;
|
||||
|
||||
result = reduce_indices_(
|
||||
params,
|
||||
shared_storage.workspace.data(),
|
||||
src_byte_ptr + src_byte_offset);
|
||||
|
||||
// Store the result after possible final reduction within the CTA
|
||||
if (threadIdx.z == 0) {
|
||||
|
||||
int64_t byte_offset =
|
||||
blockIdx.z * params.workspace_stride + idx_linear * params.workspace_outer_stride;
|
||||
|
||||
// No conversion - store in compute type
|
||||
*reinterpret_cast<ComputeFragment *>(dst_byte_ptr + byte_offset) =
|
||||
reinterpret_cast<ComputeFragment const &>(result);
|
||||
}
|
||||
|
||||
// Update indices and pointers
|
||||
idx_linear += gridDim.y * blockDim.y;
|
||||
|
||||
compute_outer_coord_and_offset_(
|
||||
params,
|
||||
outer_coord,
|
||||
dst_byte_offset,
|
||||
src_byte_offset,
|
||||
idx_linear);
|
||||
|
||||
} // while (outer index)
|
||||
} // if ()
|
||||
}
|
||||
};
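
/////////////////////////////////////////////////////////////////////////////////////////////////

// Illustrative launch sketch (disabled). The device-level wrapper in
// cutlass/reduction/device/tensor_reduce_affine_strided.h is the supported entry point; the
// shapes and the variables extent_c, outer_count, workspace_count, and params below are
// assumptions chosen to show how the thread hierarchy maps onto the problem
// (x: vectors of the contiguous rank, y: outer indices, z: inner reduction indices).
#if 0
  using ReductionKernel = TensorReductionAffineStrided<4, 2, float, float, plus<float>>;

  dim3 block(32, 4, 2);                  // 256 threads split across x, y, z
  dim3 grid(
    int((extent_c / ReductionKernel::kVectorLength + block.x - 1) / block.x),
    int((outer_count + block.y - 1) / block.y),
    workspace_count);                    // gridDim.z > 1 requires the 'Final' kernel below

  cutlass::Kernel<ReductionKernel><<<
    grid, block, sizeof(typename ReductionKernel::SharedStorage)
  >>>(params);
#endif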
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Kernel to perform final reduction
|
||||
template <
|
||||
int Rank, ///< Rank of source tensor (e.g. NDHWC => 5)
|
||||
int ReducedRank, ///< Rank of reduced tensor (includes contiguous, e.g. NC => 2)
|
||||
typename ElementOutput, ///< Data type of output tensor
|
||||
typename ElementSource, ///< Data type of source tensor
|
||||
typename ReductionOp, ///< Reduction operator
|
||||
int VectorLength = 1, ///< Vector length for memory
|
||||
typename ElementCompute = ElementOutput, ///< Internal compute type - input type of reduction operation
|
||||
int Threads = 256, ///< Number of participating threads
|
||||
int BatchSize = 4 ///< Number of elements to load per batch
|
||||
>
|
||||
class TensorReductionAffineStridedFinal {
|
||||
public:
|
||||
|
||||
static int const kRank = Rank;
|
||||
static int const kReducedRank = ReducedRank;
|
||||
static int const kVectorLength = VectorLength;
|
||||
static int const kInnerRank = kRank - kReducedRank;
|
||||
static int const kThreads = Threads;
|
||||
static int const kBatchSize = BatchSize;
|
||||
using ComputeFragment = Array<ElementCompute, VectorLength>;
|
||||
using SourceFragment = AlignedArray<ElementSource, VectorLength>;
|
||||
using OutputFragment = AlignedArray<ElementOutput, VectorLength>;
|
||||
|
||||
/// Shared memory
|
||||
struct SharedStorage { };
|
||||
|
||||
/// Parameters structure
|
||||
using Params = TensorReductionAffineStridedParams<
|
||||
Rank,
|
||||
ReducedRank,
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
ReductionOp,
|
||||
VectorLength,
|
||||
ElementCompute,
|
||||
Threads,
|
||||
BatchSize
|
||||
>;
|
||||
|
||||
private:
|
||||
|
||||
/// Computes the coordinate and offset of a given linear index
|
||||
CUTLASS_DEVICE
|
||||
void compute_outer_coord_and_offset_(
|
||||
Params const ¶ms,
|
||||
Coord<kReducedRank - 1> & coord,
|
||||
int64_t &dst_offset,
|
||||
uint64_t linear_idx) const {
|
||||
|
||||
// Decompose linear index
|
||||
coord = CoordinateDecomposition<kReducedRank - 1>(linear_idx, params.divmod);
|
||||
|
||||
// Compute tensor offset
|
||||
dst_offset = 0;
|
||||
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 0; i < kReducedRank - 1; ++i) {
|
||||
dst_offset += params.dst_stride[i] * coord[i];
|
||||
}
|
||||
}
|
||||
|
||||
/// Reduces over the reduction indices
|
||||
CUTLASS_DEVICE
|
||||
ComputeFragment reduce_indices_(
|
||||
Params const ¶ms,
|
||||
char *src_byte_ptr) {
|
||||
|
||||
ReductionOp reduction_op(params.reduction_op);
|
||||
|
||||
// Accumulated output
|
||||
ComputeFragment identity_frag;
|
||||
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 0; i < identity_frag.size(); ++i) {
|
||||
identity_frag[i] = params.reduction_identity;
|
||||
}
|
||||
|
||||
ComputeFragment accumulator = identity_frag;
|
||||
ComputeFragment workspace_fragments[kBatchSize];
|
||||
|
||||
// Partially unrolled loop
|
||||
for (int idx = 0; idx < params.workspace_count; idx += kBatchSize) {
|
||||
|
||||
// Issue a batch of loads
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int b = 0; b < kBatchSize; ++b) {
|
||||
if (idx + b < params.workspace_count) {
|
||||
workspace_fragments[b] =
|
||||
*reinterpret_cast<ComputeFragment *>(src_byte_ptr);
|
||||
}
|
||||
else {
|
||||
workspace_fragments[b] = identity_frag;
|
||||
}
|
||||
src_byte_ptr += params.workspace_stride;
|
||||
}
|
||||
|
||||
// Perform a reduction
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int b = 0; b < kBatchSize; ++b) {
|
||||
CUTLASS_PRAGMA_UNROLL
|
||||
for (int i = 0; i < kVectorLength; ++i) {
|
||||
accumulator[i] = reduction_op(accumulator[i], workspace_fragments[b][i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return accumulator;
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
//
|
||||
// Methods
|
||||
//
|
||||
|
||||
/// Perform a reduction
|
||||
CUTLASS_DEVICE
|
||||
void operator()(Params const ¶ms, SharedStorage &shared_storage) {
|
||||
|
||||
int coord_c = (blockIdx.x * blockDim.x + threadIdx.x) * kVectorLength;
|
||||
|
||||
char * src_byte_ptr = reinterpret_cast<char *>(params.device_workspace + coord_c);
|
||||
char * dst_byte_ptr = reinterpret_cast<char *>(params.destination + coord_c);
|
||||
|
||||
// If the C index is out of bounds, exit
|
||||
if (coord_c >= params.extent[kRank - 1]) {
|
||||
return;
|
||||
}
|
||||
|
||||
int64_t idx_linear = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
// Use modulo division to compute location
|
||||
Coord<kReducedRank - 1> outer_coord;
|
||||
int64_t dst_byte_offset;
|
||||
|
||||
compute_outer_coord_and_offset_(
|
||||
params,
|
||||
outer_coord,
|
||||
dst_byte_offset,
|
||||
idx_linear);
|
||||
|
||||
/// Complete the reduction
|
||||
while (idx_linear < params.outer_count) {
|
||||
|
||||
int64_t src_byte_offset = idx_linear * params.workspace_outer_stride;
|
||||
|
||||
ComputeFragment result = reduce_indices_(
|
||||
params,
|
||||
src_byte_ptr + src_byte_offset);
|
||||
|
||||
// Convert to output type and store
|
||||
NumericArrayConverter<ElementOutput, ElementCompute, VectorLength> convert_output;
|
||||
auto cvt = convert_output(result);
|
||||
|
||||
*reinterpret_cast<OutputFragment *>(dst_byte_ptr + dst_byte_offset) =
|
||||
reinterpret_cast<OutputFragment const &>(cvt);
|
||||
|
||||
// Update indices and pointers
|
||||
idx_linear += gridDim.y * blockDim.y;
|
||||
|
||||
compute_outer_coord_and_offset_(
|
||||
params,
|
||||
outer_coord,
|
||||
dst_byte_offset,
|
||||
idx_linear);
|
||||
}
|
||||
}
|
||||
};
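
/////////////////////////////////////////////////////////////////////////////////////////////////

// Illustrative sketch (disabled). When the first pass runs with gridDim.z > 1, each output
// element is left with workspace_count partial results in device memory, and this 'Final'
// kernel collapses them into the destination tensor. The shapes and variables below are
// assumptions; the device-level wrapper derives them from the problem size.
#if 0
  using FinalKernel = TensorReductionAffineStridedFinal<4, 2, float, float, plus<float>>;

  dim3 block_final(32, 8, 1);
  dim3 grid_final(
    int((extent_c / FinalKernel::kVectorLength + block_final.x - 1) / block_final.x),
    int((outer_count + block_final.y - 1) / block_final.y),
    1);

  cutlass::Kernel<FinalKernel><<<
    grid_final, block_final, sizeof(typename FinalKernel::SharedStorage)
  >>>(params);
#endif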

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace kernel
} // namespace reduction
} // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
28
test/unit/reduction/device/CMakeLists.txt
Normal file
@ -0,0 +1,28 @@
|
||||
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
# provided that the following conditions are met:
|
||||
# * Redistributions of source code must retain the above copyright notice, this list of
|
||||
# conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
# conditions and the following disclaimer in the documentation and/or other materials
|
||||
# provided with the distribution.
|
||||
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
# to endorse or promote products derived from this software without specific prior written
|
||||
# permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
cutlass_test_unit_add_executable(
|
||||
cutlass_test_unit_reduction_device
|
||||
tensor_reduce_strided.cu
|
||||
tensor_reduce_contiguous.cu
|
||||
)
|
||||
|
||||
470
test/unit/reduction/device/tensor_reduce_contiguous.cu
Normal file
@ -0,0 +1,470 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
* provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of
|
||||
* conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
* conditions and the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution.
|
||||
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
* to endorse or promote products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for TensorReduce family of device-wide operators
|
||||
*/
|
||||
|
||||
#include <iostream>
#include <limits>
|
||||
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cutlass/complex.h"
|
||||
#include "cutlass/reduction/thread/reduction_operators.h"
|
||||
#include "cutlass/reduction/device/tensor_reduce.h"
|
||||
|
||||
#include "cutlass/functional.h"
|
||||
#include "cutlass/layout/tensor.h"
|
||||
|
||||
#include "cutlass/util/host_tensor.h"
|
||||
#include "cutlass/util/reference/host/gemm.h"
|
||||
#include "cutlass/util/reference/host/tensor_compare.h"
|
||||
#include "cutlass/util/reference/host/tensor_copy.h"
|
||||
#include "cutlass/util/reference/host/tensor_fill.h"
|
||||
#include "cutlass/util/reference/device/tensor_fill.h"
|
||||
#include "cutlass/util/reference/host/tensor_norm.h"
|
||||
#include "cutlass/util/tensor_view_io.h"
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// This reduces the C dimension, transforming an NHWC tensor into NHWC with C=1.
|
||||
template <typename TensorReduction, typename ElementCompute = typename TensorReduction::ElementCompute>
|
||||
bool TestAllReduction_NHWC_reduce_c(ElementCompute reduction_identity = ElementCompute()) {
|
||||
|
||||
using Layout = typename TensorReduction::Layout;
|
||||
using ElementOutput = typename TensorReduction::ElementOutput;
|
||||
using ElementSource = typename TensorReduction::ElementSource;
|
||||
|
||||
int const kV = TensorReduction::kVectorLength;
|
||||
|
||||
int const N_indices[] = {3, 13};
|
||||
int const H_indices[] = {5, 17};
|
||||
int const W_indices[] = {7, 19};
|
||||
int const C_indices[] = {2049, 2048, 2047, 384, 64, 48, 32, 24, 16, 12, 8, 6, 4, 3, 2, 1};
|
||||
|
||||
for (int N : N_indices) {
|
||||
for (int H : H_indices) {
|
||||
for (int W : W_indices) {
|
||||
for (int Cx : C_indices) {
|
||||
|
||||
int C = Cx * kV;
|
||||
|
||||
cutlass::HostTensor<ElementSource, Layout> src_tensor({N, H, W, C});
|
||||
cutlass::HostTensor<ElementOutput, Layout> dst_tensor({N, H, W, 1});
|
||||
|
||||
cutlass::reference::host::TensorFillRandomUniform(
|
||||
src_tensor.host_view(), 17, 10, -10, 0);
|
||||
|
||||
dst_tensor.sync_device();
|
||||
src_tensor.sync_device();
|
||||
|
||||
// Execute a tensor reduction over rank 3 (the 'C' dimension is reduced; NHWC => NHW)
|
||||
TensorReduction reduction(src_tensor.extent(), 3);
|
||||
|
||||
cutlass::DeviceAllocation<uint8_t> device_workspace(reduction.workspace_size());
|
||||
|
||||
cutlass::Status status = reduction.reduce(
|
||||
dst_tensor.device_ref(),
|
||||
src_tensor.device_ref(),
|
||||
device_workspace.get(),
|
||||
reduction_identity
|
||||
);
|
||||
|
||||
EXPECT_EQ(status, cutlass::Status::kSuccess);
|
||||
EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess);
|
||||
|
||||
dst_tensor.sync_host();
|
||||
|
||||
typename TensorReduction::ReductionOp reduction_op;
|
||||
|
||||
//
|
||||
// Reference check
|
||||
//
|
||||
for (int n = 0; n < src_tensor.extent().n(); ++n) {
|
||||
for (int h = 0; h < src_tensor.extent().h(); ++h) {
|
||||
for (int w = 0; w < src_tensor.extent().w(); ++w) {
|
||||
|
||||
ElementCompute c_accum = reduction_identity;
|
||||
|
||||
for (int c = 0; c < src_tensor.extent().c(); ++c) {
|
||||
c_accum = reduction_op(c_accum, ElementCompute(src_tensor.at({n, h, w, c})));
|
||||
}
|
||||
|
||||
ElementCompute got = ElementCompute(dst_tensor.at({n, h, w, 0}));
|
||||
|
||||
bool equal = (c_accum == got);
|
||||
|
||||
EXPECT_TRUE(equal);
|
||||
if (!equal) {
|
||||
|
||||
std::cerr
|
||||
<< "Error at location (" << n << ", " << h << ", " << w << ", 0)" << std::endl;
|
||||
|
||||
std::cerr
|
||||
<< " expected: " << c_accum << std::endl
|
||||
<< " got: " << got << std::endl;
|
||||
|
||||
std::cerr
|
||||
<< "Problem: " << src_tensor.extent() << " -> "
|
||||
<< dst_tensor.extent() << std::endl;
|
||||
|
||||
std::cerr
|
||||
<< " Grid: " << reduction.reduction_strided.grid_shape
|
||||
<< "\n Block: " << reduction.reduction_strided.threadblock_shape << std::endl
|
||||
<< " FInal: " << reduction.reduction_strided.grid_final
|
||||
<< "\n Block: " << reduction.reduction_strided.threadblock_final << "\n";
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
} //w
|
||||
} // h
|
||||
} // n
|
||||
|
||||
//
|
||||
// Next problem
|
||||
//
|
||||
|
||||
} // C
|
||||
} // W
|
||||
} // H
|
||||
} // N
|
||||
|
||||
return true;
|
||||
}
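
/////////////////////////////////////////////////////////////////////////////////////////////////

// Minimal standalone sketch (disabled) distilling the reduction call exercised by the test
// harness above; the tensor shape and element types are assumptions for illustration.
#if 0
  using TensorReduction = cutlass::reduction::device::TensorReduction<
    float, float, cutlass::layout::TensorNHWC, cutlass::plus<float>, 1, float>;

  cutlass::HostTensor<float, cutlass::layout::TensorNHWC> src({4, 7, 7, 64});
  cutlass::HostTensor<float, cutlass::layout::TensorNHWC> dst({4, 7, 7, 1});

  src.sync_device();
  dst.sync_device();

  TensorReduction reduction(src.extent(), 3);   // reduce rank 3: the 'C' dimension

  cutlass::DeviceAllocation<uint8_t> workspace(reduction.workspace_size());

  cutlass::Status status = reduction.reduce(
    dst.device_ref(), src.device_ref(), workspace.get(), /* reduction_identity = */ 0.0f);
#endif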
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHW
|
||||
TEST(Reduction_TensorReduce, nhwc_reduce_c_f32x1) {
|
||||
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = float;
|
||||
using ElementCompute = float;
|
||||
int const kV = 1;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::plus<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_c<TensorReduction>());
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHW
|
||||
TEST(Reduction_TensorReduce, nhwc_reduce_c_f32x1_f16x1) {
|
||||
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
int const kV = 1;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::plus<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_c<TensorReduction>());
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHW
|
||||
TEST(Reduction_TensorReduce, nhwc_reduce_c_f32x2) {
|
||||
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = float;
|
||||
using ElementCompute = float;
|
||||
int const kV = 2;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::plus<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_c<TensorReduction>());
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHW
|
||||
TEST(Reduction_TensorReduce, nhwc_reduce_c_f32x2_f16x2) {
|
||||
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
int const kV = 2;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::plus<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_c<TensorReduction>());
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHW
|
||||
TEST(Reduction_TensorReduce, nhwc_reduce_c_f32x4) {
|
||||
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = float;
|
||||
using ElementCompute = float;
|
||||
int const kV = 4;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::plus<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_c<TensorReduction>());
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHW
|
||||
TEST(Reduction_TensorReduce, nhwc_reduce_c_f32x4_f16x4) {
|
||||
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
int const kV = 4;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::plus<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_c<TensorReduction>());
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHW
|
||||
TEST(Reduction_TensorReduce, nhwc_maximum_c_f32x4) {
|
||||
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = float;
|
||||
using ElementCompute = float;
|
||||
int const kV = 4;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::maximum<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_c<TensorReduction>( -std::numeric_limits<float>::max() ));
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHW
|
||||
TEST(Reduction_TensorReduce, nhwc_minimum_c_f32x4) {
|
||||
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = float;
|
||||
using ElementCompute = float;
|
||||
int const kV = 4;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::minimum<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_c<TensorReduction>( std::numeric_limits<float>::max() ));
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHW
|
||||
TEST(Reduction_TensorReduce, nhwc_ANY_c_s32) {
|
||||
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
using ElementOutput = int;
|
||||
using ElementSource = int;
|
||||
using ElementCompute = int;
|
||||
int const kV = 1;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::logical_or<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_c<TensorReduction>( ElementCompute(0) ));
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHW
|
||||
TEST(Reduction_TensorReduce, nhwc_ALL_c_s32) {
|
||||
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
using ElementOutput = int;
|
||||
using ElementSource = int;
|
||||
using ElementCompute = int;
|
||||
int const kV = 1;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::logical_and<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_c<TensorReduction>( ElementCompute(1) ));
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHW
|
||||
TEST(Reduction_TensorReduce, nhwc_ANY_c_f32) {
|
||||
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = float;
|
||||
using ElementCompute = float;
|
||||
int const kV = 1;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::logical_or<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_c<TensorReduction>( ElementCompute(0) ));
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHW
|
||||
TEST(Reduction_TensorReduce, nhwc_ALL_c_f32) {
|
||||
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = float;
|
||||
using ElementCompute = float;
|
||||
int const kV = 1;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::logical_and<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_c<TensorReduction>( ElementCompute(1) ));
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
517
test/unit/reduction/device/tensor_reduce_strided.cu
Normal file
@ -0,0 +1,517 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
||||
* provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of
|
||||
* conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
||||
* conditions and the following disclaimer in the documentation and/or other materials
|
||||
* provided with the distribution.
|
||||
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
||||
* to endorse or promote products derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
||||
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
||||
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for TensorReduce family of device-wide operators
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
#include <limits>
|
||||
|
||||
#include "../../common/cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cutlass/complex.h"
|
||||
#include "cutlass/reduction/thread/reduction_operators.h"
|
||||
#include "cutlass/reduction/device/tensor_reduce.h"
|
||||
|
||||
#include "cutlass/functional.h"
|
||||
#include "cutlass/layout/tensor.h"
|
||||
|
||||
#include "cutlass/util/host_tensor.h"
|
||||
#include "cutlass/util/reference/host/gemm.h"
|
||||
#include "cutlass/util/reference/host/tensor_compare.h"
|
||||
#include "cutlass/util/reference/host/tensor_copy.h"
|
||||
#include "cutlass/util/reference/host/tensor_fill.h"
|
||||
#include "cutlass/util/reference/device/tensor_fill.h"
|
||||
#include "cutlass/util/reference/host/tensor_norm.h"
|
||||
#include "cutlass/util/tensor_view_io.h"
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// This reduces the W dimension, transforming an NHWC tensor into NHWC with W=1.
|
||||
template <
|
||||
typename TensorReduction,
|
||||
typename ElementCompute = typename TensorReduction::ElementCompute
|
||||
>
|
||||
bool TestAllReduction_NHWC_reduce_w(ElementCompute reduction_identity = ElementCompute()) {
|
||||
|
||||
using Layout = typename TensorReduction::Layout;
|
||||
using ElementOutput = typename TensorReduction::ElementOutput;
|
||||
using ElementSource = typename TensorReduction::ElementSource;
|
||||
|
||||
int const kV = TensorReduction::kVectorLength;
|
||||
|
||||
int const N_indices[] = {1, 2, 5, 10};
|
||||
int const H_indices[] = {1, 3, 9 };
|
||||
int const W_indices[] = {1, 5, 19, 40, 224};
|
||||
int const C_indices[] = {
|
||||
kV,
|
||||
2 * kV,
|
||||
5 * kV,
|
||||
9 * kV,
|
||||
17 * kV,
|
||||
39 * kV,
|
||||
257 * kV,
|
||||
kV * 760
|
||||
};
|
||||
|
||||
using Element = int;
|
||||
|
||||
for (int N : N_indices) {
|
||||
for (int H : H_indices) {
|
||||
for (int W : W_indices) {
|
||||
for (int C : C_indices) {
|
||||
|
||||
cutlass::HostTensor<ElementSource, Layout> src_tensor({N, H, W, C});
|
||||
cutlass::HostTensor<ElementOutput, Layout> dst_tensor({N, H, 1, C});
|
||||
|
||||
cutlass::reference::host::TensorFillRandomUniform(
|
||||
src_tensor.host_view(), 17, 10, -10, 0);
|
||||
|
||||
cutlass::reference::host::BlockFillSequential(
|
||||
dst_tensor.host_data(), dst_tensor.capacity());
|
||||
|
||||
dst_tensor.sync_device();
|
||||
src_tensor.sync_device();
|
||||
|
||||
// Execute a tensor reduction over rank 2 (the 'W' dimension is reduced; NHWC => NHC)
|
||||
TensorReduction reduction(src_tensor.extent(), 2);
|
||||
|
||||
cutlass::DeviceAllocation<uint8_t> device_workspace(reduction.workspace_size());
|
||||
|
||||
cutlass::Status status = reduction.reduce(
|
||||
dst_tensor.device_ref(),
|
||||
src_tensor.device_ref(),
|
||||
device_workspace.get(),
|
||||
reduction_identity
|
||||
);
|
||||
|
||||
EXPECT_EQ(status, cutlass::Status::kSuccess);
|
||||
EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess);
|
||||
// Reference check
|
||||
dst_tensor.sync_host();
|
||||
|
||||
typename TensorReduction::ReductionOp reduction_op;
|
||||
|
||||
for (int n = 0; n < src_tensor.extent().n(); ++n) {
|
||||
for (int h = 0; h < src_tensor.extent().h(); ++h) {
|
||||
for (int c = 0; c < src_tensor.extent().c(); ++c) {
|
||||
|
||||
ElementCompute w_accum = reduction_identity;
|
||||
|
||||
for (int w = 0; w < src_tensor.extent().w(); ++w) {
|
||||
w_accum = reduction_op(w_accum, ElementCompute(src_tensor.at({n, h, w, c})));
|
||||
}
|
||||
|
||||
ElementCompute got = ElementCompute(dst_tensor.at({n, h, 0, c}));
|
||||
|
||||
bool equal = (w_accum == got);
|
||||
|
||||
EXPECT_TRUE(equal);
|
||||
if (!equal) {
|
||||
|
||||
std::cerr
|
||||
<< "Error at location (" << n << ", " << h << ", 0, " << c << ")" << std::endl;
|
||||
|
||||
std::cerr
|
||||
<< " expected: " << w_accum << std::endl
|
||||
<< " got: " << got << std::endl;
|
||||
|
||||
std::cerr
|
||||
<< "Problem: " << src_tensor.extent() << " -> "
|
||||
<< dst_tensor.extent() << std::endl;
|
||||
|
||||
std::cerr
|
||||
<< " Grid: " << reduction.reduction_strided.grid_shape
|
||||
<< "\n Block: " << reduction.reduction_strided.threadblock_shape << std::endl
|
||||
<< " Final: " << reduction.reduction_strided.grid_final
|
||||
<< "\n Block: " << reduction.reduction_strided.threadblock_final << "\n";
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
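
/////////////////////////////////////////////////////////////////////////////////////////////////

// Note on the reduction_identity argument used by several tests below: for 'plus' the default
// ElementCompute() (zero) is the identity, but other functors need an explicit neutral element,
// e.g. -std::numeric_limits<float>::max() for 'maximum', +std::numeric_limits<float>::max() for
// 'minimum', 0xffffffff for 'bit_and', and 1 / 0 for logical AND / OR respectively.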
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHC
|
||||
TEST(Reduction_TensorReduce, nhwc_reduce_w_f32x8_f16x8) {
|
||||
|
||||
int const kV = 8;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::plus<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>());
|
||||
}
|
||||
|
||||
/// Test tensor reduction from NHWC to NHC
|
||||
TEST(Reduction_TensorReduce, nhwc_reduce_w_f32x2_f16x2) {
|
||||
|
||||
int const kV = 2;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::plus<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>());
|
||||
}
|
||||
|
||||
/// Test tensor reduction from NHWC to NHC
|
||||
TEST(Reduction_TensorReduce, nhwc_reduce_w_f32x1_f16x1) {
|
||||
|
||||
int const kV = 1;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::plus<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>());
|
||||
}
|
||||
|
||||
/// Test tensor reduction from NHWC to NHC
|
||||
TEST(Reduction_TensorReduce, nhwc_reduce_w_s32x4) {
|
||||
|
||||
int const kV = 4;
|
||||
using Element = int;
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::plus<Element>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
Element,
|
||||
Element,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
Element
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>());
|
||||
}
|
||||
|
||||
/// Test tensor reduction from NHWC to NHC
|
||||
TEST(Reduction_TensorReduce, nhwc_reduce_w_cf32) {
|
||||
|
||||
int const kV = 1;
|
||||
using ElementOutput = cutlass::complex<float>;
|
||||
using ElementSource = cutlass::complex<float>;
|
||||
using ElementCompute = cutlass::complex<float>;
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::plus<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>());
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHC
|
||||
TEST(Reduction_TensorReduce, nhwc_maximum_w_cf32) {
|
||||
|
||||
int const kV = 1;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = float;
|
||||
using ElementCompute = float;
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::maximum<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>( -std::numeric_limits<float>::max() ));
|
||||
}
|
||||
|
||||
/// Test tensor reduction from NHWC to NHC
|
||||
TEST(Reduction_TensorReduce, nhwc_minimum_w_cf32) {
|
||||
|
||||
int const kV = 1;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = float;
|
||||
using ElementCompute = float;
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::minimum<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>(std::numeric_limits<float>::max()));
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHC
|
||||
TEST(Reduction_TensorReduce, nhwc_XOR_w_u32) {
|
||||
|
||||
int const kV = 1;
|
||||
using ElementOutput = int;
|
||||
using ElementSource = int;
|
||||
using ElementCompute = int;
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::bit_xor<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>());
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHC
|
||||
TEST(Reduction_TensorReduce, nhwc_AND_w_s32) {
|
||||
|
||||
int const kV = 1;
|
||||
using ElementOutput = unsigned;
|
||||
using ElementSource = unsigned;
|
||||
using ElementCompute = unsigned;
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::bit_and<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>(0xffffffff));
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHC
|
||||
TEST(Reduction_TensorReduce, nhwc_OR_w_u32) {
|
||||
|
||||
int const kV = 1;
|
||||
using ElementOutput = int;
|
||||
using ElementSource = int;
|
||||
using ElementCompute = int;
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::bit_or<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>());
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHC
|
||||
TEST(Reduction_TensorReduce, nhwc_ANY_w_s32) {
|
||||
|
||||
int const kV = 1;
|
||||
using ElementOutput = int;
|
||||
using ElementSource = int;
|
||||
using ElementCompute = int;
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::logical_or<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>(ElementCompute(0)));
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHC
|
||||
TEST(Reduction_TensorReduce, nhwc_ALL_w_s32) {
|
||||
|
||||
int const kV = 1;
|
||||
using ElementOutput = int;
|
||||
using ElementSource = int;
|
||||
using ElementCompute = int;
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::logical_and<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>(ElementCompute(1)));
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHC
|
||||
TEST(Reduction_TensorReduce, nhwc_ANY_w_f32) {
|
||||
|
||||
int const kV = 1;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = float;
|
||||
using ElementCompute = float;
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::logical_or<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>(ElementCompute(0)));
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Test tensor reduction from NHWC to NHC
|
||||
TEST(Reduction_TensorReduce, nhwc_ALL_w_f32) {
|
||||
|
||||
int const kV = 1;
|
||||
using ElementOutput = float;
|
||||
using ElementSource = float;
|
||||
using ElementCompute = float;
|
||||
using Layout = cutlass::layout::TensorNHWC;
|
||||
|
||||
// Define the functor
|
||||
using Functor = cutlass::logical_and<ElementCompute>;
|
||||
|
||||
using TensorReduction = cutlass::reduction::device::TensorReduction<
|
||||
ElementOutput,
|
||||
ElementSource,
|
||||
Layout,
|
||||
Functor,
|
||||
kV,
|
||||
ElementCompute
|
||||
>;
|
||||
|
||||
EXPECT_TRUE(TestAllReduction_NHWC_reduce_w<TensorReduction>(ElementCompute(1)));
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////