cutlass/include/cutlass/conv/threadblock/conv2d_params.h
Manish Gupta 2e07c4cc2f
CUTLASS 2.7 (#318)
CUTLASS 2.7

Mainloop fusion for GEMM: summation over A or B
Strided DGRAD (optimized iterators)
Half-precision GELU_taylor activation functions
Use these when accumulation and epilogue compute types are all cutlass::half_t
Tuning and bug fixes to fused GEMM + GEMM example
Support for smaller than 128b aligned Convolutions: see examples
Caching of results to accelerate Convolution unit tests
Can be enabled or disabled by running cmake .. -DCUTLASS_TEST_ENABLE_CACHED_RESULTS=OFF
Corrections and bug fixes reported by the CUTLASS community
Thank you for filing these issues!

authored-by: Haicheng Wu haichengw@nvidia.com, Manish Gupta manigupta@nvidia.com, Dustyn Blasig dblasig@nvidia.com, Andrew Kerr akerr@nvidia.com
2021-09-20 11:02:22 -07:00

844 lines
28 KiB
C++

/***************************************************************************************************
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*!
\file
\brief Extracts the host-params objects into non-template code.
*/
#pragma once
#define TRACE_CONV_PARAMS_INITIALIZERS_ENABLED 0
#include "cutlass/cutlass.h"
#include "cutlass/fast_math.h"
#include "cutlass/layout/tensor.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/layout/pitch_linear.h"
#include "cutlass/conv/convolution.h"
#include "cutlass/conv/conv2d_problem_size.h"
#if TRACE_CONV_PARAMS_INITIALIZERS_ENABLED
#include <fstream>
#endif
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace conv {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Params structure shared by all Conv2d analytic tile iterators.
///
/// Only the tensor layout is stored. The problem size is accepted and ignored so that the
/// constructor signature lines up with the other Params interfaces.
template <typename Layout_ = layout::TensorNHWC>
struct Conv2dAnalyticParams {

  using Layout = Layout_;

  //
  // Data members
  //

  /// Layout object handed to the constructor
  Layout layout;

  //
  // Methods
  //

  /// Default constructor: `layout` is default-initialized.
  CUTLASS_HOST_DEVICE
  Conv2dAnalyticParams() { }

  /// Copies `layout`; the first argument is an unused placeholder.
  CUTLASS_HOST_DEVICE
  Conv2dAnalyticParams(
    Conv2dProblemSize const &,    // unused; placeholder to match other Params interfaces.
    Layout const &layout
  ):
    layout(layout) { }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Parameters structure for the analytic DGRAD output-gradient tile access iterator.
/// (The iterator itself is not defined in this file.)
struct Conv2dDgradOutputGradientTileAccessIteratorAnalyticParams {

  using Layout = layout::TensorNHWC;

  //
  // Data members
  //

  /// Layout object handed to the constructor
  Layout layout;

  /// strided_dgrad_tile_m_per_filter(problem_size, tile rows) multiplied by the tile rows
  int tiled_rows_per_filter;

  //
  // Methods
  //

  /// Default constructor: members are left default-initialized.
  CUTLASS_HOST_DEVICE
  Conv2dDgradOutputGradientTileAccessIteratorAnalyticParams() { }

  /// Builds the params from a problem size and the threadblock tile shape.
  /// `element_size_bits` is not used by this constructor.
  CUTLASS_HOST_DEVICE
  Conv2dDgradOutputGradientTileAccessIteratorAnalyticParams(
    Conv2dProblemSize const &problem_size,
    Layout const &layout,                            ///< layout object
    int element_size_bits,                           ///< size of each element in bits
    MatrixCoord threadblock_shape
  ):
    layout(layout),
    tiled_rows_per_filter(
      strided_dgrad_tile_m_per_filter(problem_size, threadblock_shape.row()) *
      threadblock_shape.row()) { }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
#if TRACE_CONV_PARAMS_INITIALIZERS_ENABLED

/// Appends one CSV record describing an iterator's construction arguments to
/// "conv_iterator_params.csv". A column header is written first when the file cannot be
/// opened for reading beforehand. Compiles to an empty body when __CUDA_ARCH__ is defined.
CUTLASS_HOST_DEVICE
void TraceIteratorParams(
  char const *conv_operator,
  char const *operand,
  int element_size_bits,
  MatrixCoord threadblock_shape,
  int thread_count,
  int access_size,
  layout::PitchLinearCoord threadmap_iterations,
  layout::PitchLinearCoord threadmap_delta
) {

#if !defined(__CUDA_ARCH__)

  char const *fname = "conv_iterator_params.csv";

  // Probe with a short-lived input stream; it is closed again before the append stream opens.
  bool const needs_header = !std::ifstream(fname).is_open();

  std::ofstream trace(fname, std::ofstream::app);

  if (needs_header) {
    trace
      << "Operator,Operand,ElementSize,CtaRows,CtaColumns,ThreadCount,AccessSize,"
      << "IterationsContiguous,IterationsStrided,DeltaContiguous,DeltaStrided\n";
  }

  // One record, fields in the same order as the header columns.
  trace << conv_operator << "," << operand << "," << element_size_bits;
  trace << "," << threadblock_shape.row() << "," << threadblock_shape.column();
  trace << "," << thread_count << "," << access_size;
  trace << "," << threadmap_iterations.contiguous() << "," << threadmap_iterations.strided();
  trace << "," << threadmap_delta.contiguous() << "," << threadmap_delta.strided() << "\n";

#endif
}

#define TRACE_CONV_INITIALIZERS(conv_op, operand, element_size, cta_shape, thread_count, access_size, iterations, delta) \
TraceIteratorParams(conv_op, operand, element_size, cta_shape, thread_count, access_size, iterations, delta);

#else

#define TRACE_CONV_INITIALIZERS(conv_op, operand, element_size, cta_shape, thread_count, access_size, iterations, delta) {}

#endif
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Parameters structure used for Conv2dFpropActivationTileIteratorOptimized
/// (primary template; only declared here)
template <typename Layout_ = layout::TensorNHWC>
struct Conv2dFpropActivationIteratorOptimizedParams;

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Parameters structure used for Conv2dFpropActivationTileIteratorOptimized,
/// specialized for layout::TensorNHWC.
template <>
struct Conv2dFpropActivationIteratorOptimizedParams<layout::TensorNHWC> {

  using Layout = layout::TensorNHWC;

  //
  // Data members
  //

  Layout layout;

  int64_t inc_next[3];    // {next S, next R, next C} - units are bytes
  int filter_c_delta;     // number of logical elements to add to filter_c_
  int PQ;                 // product of P*Q

  FastDivmod pq_divmod;   // divisor: P*Q
  FastDivmod q_divmod;    // divisor: Q

  //
  // Methods
  //

  /// Default constructor: members are left default-initialized.
  CUTLASS_HOST_DEVICE
  Conv2dFpropActivationIteratorOptimizedParams() { }

  /// Precomputes the pointer increments (in bytes) and the divmod helpers.
  ///
  /// thread_count, access_size, threadmap_iterations and threadmap_delta are only forwarded
  /// to TRACE_CONV_INITIALIZERS; they do not influence the computed values.
  CUTLASS_HOST_DEVICE
  Conv2dFpropActivationIteratorOptimizedParams(
    Conv2dProblemSize const &problem_size,
    Layout const &layout,                            ///< layout object
    int element_size_bits,                           ///< size of each element in bits
    MatrixCoord threadblock_shape,
    int thread_count,
    int access_size,
    layout::PitchLinearCoord threadmap_iterations,
    layout::PitchLinearCoord threadmap_delta
  ):
    layout(layout),
    PQ(problem_size.P * problem_size.Q),
    pq_divmod(PQ),
    q_divmod(problem_size.Q) {

    TRACE_CONV_INITIALIZERS("conv2d_fprop", "activation",
      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);

    // Filter-position increments are negated in convolution mode.
    int conv_sign = (problem_size.mode == Mode::kConvolution ? -1 : 1);

    // Every term is widened to 64 bits before multiplying so that no intermediate
    // product is evaluated in 32-bit int.
    int64_t const step_s = int64_t(layout.stride()[0]) * problem_size.dilation_w;  // one S step, in elements
    int64_t const step_r = int64_t(layout.stride()[1]) * problem_size.dilation_h;  // one R step, in elements
    int64_t const span_s = int64_t(problem_size.S - 1) * step_s;                   // S-1 steps along S
    int64_t const span_r = int64_t(problem_size.R - 1) * step_r;                   // R-1 steps along R

    // logical offset added to internal channel counter - units are elements, not bytes
    filter_c_delta = threadblock_shape.column() * problem_size.split_k_slices;

    // next S
    inc_next[0] = conv_sign * step_s * element_size_bits / 8;

    // next R: one R step, minus the S-1 steps already taken along S
    inc_next[1] = conv_sign * (step_r - span_s) * element_size_bits / 8;

    // next C: advance the channel offset and undo the accumulated R and S displacement
    inc_next[2] = (
        int64_t(filter_c_delta)
        - conv_sign * span_r
        - conv_sign * span_s
      ) * element_size_bits / 8;
  }

#if ENABLE_CONV2D_PARAMS_PRINT
  /// Prints internal state.
  CUTLASS_HOST_DEVICE
  void print() {
    auto stride = layout.stride();
    // int64_t values are cast to long long and printed with %lld: %ld is only correct on
    // platforms where long is 64 bits wide.
    printf(
      "Conv2dFpropActivationIteratorOptimizedParams:\n"
      "  layout(w: %d, h: %d, n: %d)\n"
      "  inc_next[%lld, %lld, %lld]\n"
      "  filter_c_delta(%d) - PQ(%d)\n"
      "  pq_divmod(divisor: %d, multiplier: %u, shift_right: %u)\n"
      "  q_divmod(divisor: %d, multiplier: %u, shift_right: %u)\n",
      stride[0], stride[1], stride[2],
      static_cast<long long>(inc_next[0]),
      static_cast<long long>(inc_next[1]),
      static_cast<long long>(inc_next[2]),
      filter_c_delta,
      PQ,
      pq_divmod.divisor,
      pq_divmod.multiplier,
      pq_divmod.shift_right,
      q_divmod.divisor,
      q_divmod.multiplier,
      q_divmod.shift_right
    );
  }
#endif
};
/// Parameters structure used for Conv2dFpropActivationTileIteratorOptimized,
/// specialized for interleaved layout::TensorNCxHWx<Interleaved_>.
template <int Interleaved_>
struct Conv2dFpropActivationIteratorOptimizedParams<layout::TensorNCxHWx<Interleaved_>> {

  static int const kInterleaved = Interleaved_;

  using Layout = layout::TensorNCxHWx<kInterleaved>;

  //
  // Data members
  //

  Layout layout;

  int64_t inc_next[3];    // {next S, next R, next C} - units are bytes
  int filter_c_delta;     // number of logical elements to add to filter_c_
  int PQ;                 // product of P*Q

  FastDivmod pq_divmod;   // divisor: P*Q
  FastDivmod q_divmod;    // divisor: Q

  //
  // Methods
  //

  /// Default constructor: members are left default-initialized.
  CUTLASS_HOST_DEVICE
  Conv2dFpropActivationIteratorOptimizedParams() { }

  /// Precomputes the pointer increments (in bytes) and the divmod helpers.
  /// The thread-map arguments are only forwarded to TRACE_CONV_INITIALIZERS.
  CUTLASS_HOST_DEVICE
  Conv2dFpropActivationIteratorOptimizedParams(
    Conv2dProblemSize const &problem_size,
    Layout const &layout,                            ///< layout object
    int element_size_bits,                           ///< size of each element in bits
    MatrixCoord threadblock_shape,
    int thread_count,
    int access_size,
    layout::PitchLinearCoord threadmap_iterations,
    layout::PitchLinearCoord threadmap_delta
  ):
    layout(layout),
    PQ(problem_size.P * problem_size.Q),
    pq_divmod(PQ),
    q_divmod(problem_size.Q) {

    TRACE_CONV_INITIALIZERS("conv2d_fprop", "activation",
      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);

    // Filter-position increments are negated in convolution mode.
    int const direction = (problem_size.mode == Mode::kConvolution) ? -1 : 1;

    // logical offset added to internal channel counter - units are elements, not bytes
    filter_c_delta = threadblock_shape.column() * problem_size.split_k_slices;

    // next S
    int const step_s = kInterleaved * problem_size.dilation_w;
    inc_next[0] = direction * step_s * element_size_bits / 8;

    // next R: one R step, minus the S-1 steps already taken along S
    int64_t const step_r = int64_t(layout.stride()[0]) * problem_size.dilation_h;
    int const rewind_s = (problem_size.S - 1) * kInterleaved * problem_size.dilation_w;
    inc_next[1] = direction * (step_r - rewind_s) * element_size_bits / 8;

    // next C: the channel delta is converted to whole interleaved groups (integer division)
    // before being scaled by stride()[1]; the R and S displacement is then undone.
    int64_t const advance_c = filter_c_delta / kInterleaved * int64_t(layout.stride()[1]);
    int64_t const undo_r =
      direction * int64_t(problem_size.R - 1) * layout.stride()[0] * problem_size.dilation_h;
    int64_t const undo_s =
      direction * int64_t(problem_size.S - 1) * kInterleaved * problem_size.dilation_w;
    inc_next[2] = (advance_c - undo_r - undo_s) * element_size_bits / 8;
  }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Parameters structure for the optimized Conv2d FPROP filter iterator
/// (primary template; only declared here)
template <typename Layout_ = layout::TensorNHWC>
struct Conv2dFpropFilterIteratorOptimizedParams;

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Parameters structure for the optimized Conv2d FPROP filter iterator,
/// specialized for layout::TensorNHWC.
template <>
struct Conv2dFpropFilterIteratorOptimizedParams<layout::TensorNHWC>
{

  using Layout = layout::TensorNHWC;

  //
  // Data members
  //

  Layout layout;
  int RS;                 // product of R*S
  int filter_c_delta;     // threadblock rows * split_k_slices, in logical elements

  int64_t inc_next_k;     // offset in units of bytes to next K position
  int64_t inc_next_rs;    // offset in units of bytes to next RS position
  int64_t inc_next_c;     // offset in units of bytes to next C position

  //
  // Methods
  //

  /// Default constructor: members are left default-initialized.
  CUTLASS_HOST_DEVICE
  Conv2dFpropFilterIteratorOptimizedParams() { }

  /// Precomputes the byte increments from the layout strides and the thread map.
  /// thread_count and access_size are only forwarded to TRACE_CONV_INITIALIZERS.
  CUTLASS_HOST_DEVICE
  Conv2dFpropFilterIteratorOptimizedParams(
    Conv2dProblemSize const &problem_size,
    Layout const &layout,
    int element_size_bits,                           ///< size of each element in bits
    MatrixCoord threadblock_shape,
    int thread_count,
    int access_size,
    layout::PitchLinearCoord threadmap_iterations,
    layout::PitchLinearCoord threadmap_delta
  ):
    layout(layout) {

    TRACE_CONV_INITIALIZERS("conv2d_fprop", "filter",
      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);

    RS = problem_size.R * problem_size.S;
    filter_c_delta = threadblock_shape.row() * problem_size.split_k_slices;

    // All terms are held in 64 bits so no intermediate product is evaluated in 32-bit int.
    int64_t const stride_rs = layout.stride()[0];    // used for the RS step
    int64_t const stride_k  = layout.stride()[2];    // used for the K step

    // Distance (in elements) between strided thread-map iterations, and the total
    // distance covered by all but the first of them.
    int64_t const k_step   = stride_k * threadmap_delta.strided();
    int64_t const k_rewind = k_step * (threadmap_iterations.strided() - 1);

    inc_next_k = k_step * element_size_bits / 8;

    // next RS: one RS step, minus the strided iterations already advanced
    inc_next_rs = (stride_rs - k_rewind) * element_size_bits / 8;

    // next C: advance the channel offset and undo the accumulated RS and K displacement
    inc_next_c = (
        int64_t(filter_c_delta)
        - int64_t(RS - 1) * stride_rs
        - k_rewind
      ) * element_size_bits / 8;
  }

#if ENABLE_CONV2D_PARAMS_PRINT
  /// Prints internal state.
  CUTLASS_HOST_DEVICE
  void print() {
    auto stride = layout.stride();
    // int64_t values are cast to long long and printed with %lld: %ld is only correct on
    // platforms where long is 64 bits wide.
    printf(
      "Conv2dFpropFilterIteratorOptimizedParams:\n"
      "  layout[%d, %d, %d]\n"
      "  RS(%d), filter_c_delta(%d), inc_next(k: %lld, rs: %lld, c: %lld)\n",
      stride[0], stride[1], stride[2],
      RS,
      filter_c_delta,
      static_cast<long long>(inc_next_k),
      static_cast<long long>(inc_next_rs),
      static_cast<long long>(inc_next_c)
    );
  }
#endif
};
/// Parameters structure for the optimized Conv2d FPROP filter iterator,
/// specialized for interleaved layout::TensorCxRSKx<Interleaved_>.
template <int Interleaved_>
struct Conv2dFpropFilterIteratorOptimizedParams<layout::TensorCxRSKx<Interleaved_>>
{

  static int const kInterleaved = Interleaved_;

  using Layout = layout::TensorCxRSKx<kInterleaved>;

  //
  // Data members
  //

  Layout layout;
  int RS;                 // product of R*S
  int filter_c_delta;     // threadblock rows * split_k_slices, in logical elements

  int64_t inc_next_k;     // offset in units of bytes to next K position
  int64_t inc_next_rs;    // offset in units of bytes to next RS position
  int64_t inc_next_c;     // offset in units of bytes to next C position

  //
  // Methods
  //

  /// Default constructor: members are left default-initialized.
  CUTLASS_HOST_DEVICE
  Conv2dFpropFilterIteratorOptimizedParams() { }

  /// Precomputes the byte increments from the layout strides and the thread map.
  /// thread_count and access_size are only forwarded to TRACE_CONV_INITIALIZERS.
  CUTLASS_HOST_DEVICE
  Conv2dFpropFilterIteratorOptimizedParams(
    Conv2dProblemSize const &problem_size,
    Layout const &layout,
    int element_size_bits,                           ///< size of each element in bits
    MatrixCoord threadblock_shape,
    int thread_count,
    int access_size,
    layout::PitchLinearCoord threadmap_iterations,
    layout::PitchLinearCoord threadmap_delta
  ):
    layout(layout) {

    TRACE_CONV_INITIALIZERS("conv2d_fprop", "filter",
      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);

    RS = problem_size.R * problem_size.S;
    filter_c_delta = threadblock_shape.row() * problem_size.split_k_slices;

    int const remaining_iterations = threadmap_iterations.strided() - 1;

    // next K
    inc_next_k = (kInterleaved * threadmap_delta.strided() * element_size_bits) / 8;

    // next RS: one RS step, minus the strided iterations already advanced
    int const k_rewind = kInterleaved * remaining_iterations * threadmap_delta.strided();
    inc_next_rs = (int64_t(layout.stride()[0]) - k_rewind) * element_size_bits / 8;

    // next C: the channel delta is converted to whole interleaved groups (integer division)
    // before being scaled by stride()[2]; the RS and K displacement is then undone.
    int64_t const advance_c = filter_c_delta / kInterleaved * int64_t(layout.stride()[2]);
    int64_t const undo_rs = int64_t(RS - 1) * layout.stride()[0];
    int64_t const undo_k =
      int64_t(remaining_iterations) * threadmap_delta.strided() * kInterleaved;
    inc_next_c = (advance_c - undo_rs - undo_k) * element_size_bits / 8;
  }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
// Dgrad Optimized Dy params (layout::TensorNHWC)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Parameters object for Conv2d DGRAD OutputGradient (dy) iterator
struct Conv2dDgradOutputGradientIteratorOptimizedParams {

  using Layout = layout::TensorNHWC;

  //
  // Data members
  //

  Layout layout;

  int64_t inc_next[3];    // {next S, next R, next K} - units are bytes
  int filter_k_delta;     // number of logical elements to add to filter_k_
  int HW;                 // product of H*W

  FastDivmod hw_divmod;   // divisor: H*W
  FastDivmod w_divmod;    // divisor: W

  //
  // Methods
  //

  /// Default constructor: members are left default-initialized.
  CUTLASS_HOST_DEVICE
  Conv2dDgradOutputGradientIteratorOptimizedParams() { }

  /// Precomputes the pointer increments (in bytes) and the divmod helpers.
  /// The thread-map arguments are only forwarded to TRACE_CONV_INITIALIZERS.
  CUTLASS_HOST_DEVICE
  Conv2dDgradOutputGradientIteratorOptimizedParams(
    Conv2dProblemSize const &problem_size,
    Layout const &layout,
    int element_size_bits,                           ///< size of each element in bits
    MatrixCoord threadblock_shape,
    int thread_count,
    int access_size,
    layout::PitchLinearCoord threadmap_iterations,
    layout::PitchLinearCoord threadmap_delta
  ):
    layout(layout),
    HW(problem_size.H * problem_size.W),
    hw_divmod(HW),
    w_divmod(problem_size.W) {

    TRACE_CONV_INITIALIZERS("conv2d_dgrad", "output_gradient",
      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);

    // Filter-position increments are negated in cross-correlation mode.
    int conv_sign = (problem_size.mode == Mode::kConvolution ? 1 : -1);

    // Widen to 64 bits BEFORE multiplying: stride * dilation * element_size_bits evaluated
    // in 32-bit int can overflow before the result is stored in the int64_t increments.
    int64_t const step_s = int64_t(layout.stride()[0]) * problem_size.dilation_w;  // one S step, in elements
    int64_t const step_r = int64_t(layout.stride()[1]) * problem_size.dilation_h;  // one R step, in elements
    int64_t const span_s = int64_t(problem_size.S - 1) * step_s;                   // S-1 steps along S
    int64_t const span_r = int64_t(problem_size.R - 1) * step_r;                   // R-1 steps along R

    // logical offset added to internal channel counter - units are elements, not bytes
    filter_k_delta = threadblock_shape.column() * problem_size.split_k_slices;

    // next S
    inc_next[0] = conv_sign * step_s * element_size_bits / 8;

    // next R: one R step, minus the S-1 steps already taken along S
    inc_next[1] = conv_sign * (step_r - span_s) * element_size_bits / 8;

    // next K: advance the channel offset and undo the accumulated R and S displacement
    inc_next[2] = (
        int64_t(filter_k_delta)
        - conv_sign * span_r
        - conv_sign * span_s
      ) * element_size_bits / 8;
  }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
// Strided Dgrad Optimized Dy params (layout::TensorNHWC)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Parameters object for the optimized Conv2d strided DGRAD output gradient (dy) iterator
struct Conv2dStridedDgradOutputGradientIteratorOptimizedParams {

  using Layout = layout::TensorNHWC;

  //
  // Data members
  //

  Layout layout;

  int64_t inc_next[3];         // {next S, next R, next K} - units are bytes
  int filter_k_delta;          // number of logical elements to add to filter_k_
  int tiled_rows_per_filter;   // strided_dgrad_tile_m_per_filter(...) * threadblock rows
  int conv_sign;               // +1 in convolution mode, -1 otherwise

  //
  // Methods
  //

  /// Default constructor: members are left default-initialized.
  CUTLASS_HOST_DEVICE
  Conv2dStridedDgradOutputGradientIteratorOptimizedParams() { }

  /// Precomputes the pointer increments (in bytes) and the tiling information.
  CUTLASS_HOST_DEVICE
  Conv2dStridedDgradOutputGradientIteratorOptimizedParams(
    Conv2dProblemSize const &problem_size,
    Layout const &layout,                            ///< layout object
    int element_size_bits,                           ///< size of each element in bits
    MatrixCoord threadblock_shape
  ): layout(layout) {

    int tile_m_per_filter = strided_dgrad_tile_m_per_filter(problem_size, threadblock_shape.row());

    tiled_rows_per_filter = tile_m_per_filter * threadblock_shape.row();

    conv_sign = (problem_size.mode == Mode::kConvolution ? 1 : -1);

    // logical offset added to internal channel counter - units are elements, not bytes
    filter_k_delta = threadblock_shape.column() * problem_size.split_k_slices;

    // Widen to 64 bits BEFORE multiplying: stride * dilation * element_size_bits evaluated
    // in 32-bit int can overflow before the result is stored in the int64_t increments.

    // next S
    inc_next[0] = conv_sign * (
        int64_t(layout.stride()[0]) * problem_size.dilation_w
      ) * element_size_bits / 8;

    // next R
    inc_next[1] = conv_sign * (
        int64_t(layout.stride()[1]) * problem_size.dilation_h
      ) * element_size_bits / 8;

    // next K
    inc_next[2] = int64_t(filter_k_delta) * element_size_bits / 8;
  }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////
// Dgrad Optimized w params (layout::TensorNHWC)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Parameters object for the optimized Conv2d DGRAD filter (w) iterator
struct Conv2dDgradFilterIteratorOptimizedParams {

  using Layout = layout::TensorNHWC;

  //
  // Data members
  //

  Layout layout;
  int RS;                      // product of R*S
  int filter_k_delta;          // threadblock rows * split_k_slices, in logical elements

  int64_t inc_next_strided;    // offset in units of bytes to next K coordinate within tile
  int64_t inc_next_rs;         // offset in units of bytes to next RS position
  int64_t inc_next_k;          // offset in units of bytes to next K position in subsequent tile

  //
  // Methods
  //

  /// Default constructor: members are left default-initialized.
  CUTLASS_HOST_DEVICE
  Conv2dDgradFilterIteratorOptimizedParams() { }

  /// Precomputes the byte increments from the layout strides and the thread map.
  /// thread_count and access_size are only forwarded to TRACE_CONV_INITIALIZERS.
  CUTLASS_HOST_DEVICE
  Conv2dDgradFilterIteratorOptimizedParams(
    Conv2dProblemSize const &problem_size,
    Layout const &layout,
    int element_size_bits,                           ///< size of each element in bits
    MatrixCoord threadblock_shape,
    int thread_count,
    int access_size,
    layout::PitchLinearCoord threadmap_iterations,
    layout::PitchLinearCoord threadmap_delta
  ):
    layout(layout), RS(problem_size.R * problem_size.S) {

    TRACE_CONV_INITIALIZERS("conv2d_dgrad", "filter",
      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);

    filter_k_delta = threadblock_shape.row() * problem_size.split_k_slices;

    // Widen to 64 bits BEFORE multiplying: products such as
    // rows * split_k_slices * stride * element_size_bits evaluated in 32-bit int can overflow
    // before the result is stored in the int64_t increments.
    int64_t const stride_rs = layout.stride()[0];    // used for the RS step
    int64_t const stride_k  = layout.stride()[2];    // used for the K step

    // Distance (in elements) between strided thread-map iterations, and the total
    // distance covered by all but the first of them.
    int64_t const k_step   = stride_k * threadmap_delta.strided();
    int64_t const k_rewind = k_step * (threadmap_iterations.strided() - 1);

    inc_next_strided = k_step * element_size_bits / 8;

    // next RS: one RS step, minus the strided iterations already advanced
    inc_next_rs = (stride_rs - k_rewind) * element_size_bits / 8;

    // next K tile: advance by filter_k_delta rows and undo the accumulated RS and
    // strided-iteration displacement
    inc_next_k = (
        filter_k_delta * stride_k
        - int64_t(RS - 1) * stride_rs
        - k_rewind
      ) * element_size_bits / 8;
  }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////
// StridedDgrad Optimized w params (layout::TensorNHWC)
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Parameters object for the optimized Conv2d strided DGRAD filter (w) iterator
struct Conv2dStridedDgradFilterIteratorOptimizedParams {

  using Layout = layout::TensorNHWC;

  //
  // Data members
  //

  Layout layout;
  int RS;                      // product of R*S
  int filter_k_delta;          // threadblock rows * split_k_slices, in logical elements

  int64_t inc_next_strided;    // offset in units of bytes to next K coordinate within tile
  int64_t inc_next[3];         // {next S, next R, next K}
  int64_t reset_bytes;         // offset in units of bytes to move back the pointer

  //
  // Methods
  //

  /// Default constructor: members are left default-initialized.
  CUTLASS_HOST_DEVICE
  Conv2dStridedDgradFilterIteratorOptimizedParams() { }

  /// Precomputes the byte increments from the layout strides, the convolution strides
  /// and the thread map. thread_count and access_size are only forwarded to
  /// TRACE_CONV_INITIALIZERS.
  CUTLASS_HOST_DEVICE
  Conv2dStridedDgradFilterIteratorOptimizedParams(
    Conv2dProblemSize const &problem_size,
    Layout const &layout,
    int element_size_bits,                           ///< size of each element in bits
    MatrixCoord threadblock_shape,
    int thread_count,
    int access_size,
    layout::PitchLinearCoord threadmap_iterations,
    layout::PitchLinearCoord threadmap_delta
  ):
    layout(layout), RS(problem_size.R * problem_size.S) {

    TRACE_CONV_INITIALIZERS("conv2d_dgrad", "filter",
      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);

    filter_k_delta = threadblock_shape.row() * problem_size.split_k_slices;

    // Widen to 64 bits BEFORE multiplying: products such as
    // rows * split_k_slices * stride * element_size_bits evaluated in 32-bit int can overflow
    // before the result is stored in the int64_t members.
    int64_t const stride_k = layout.stride()[2];
    int64_t const k_step   = stride_k * threadmap_delta.strided();

    inc_next_strided = k_step * element_size_bits / 8;

    // next S: the layout stride is scaled by the convolution stride along W
    inc_next[0] = int64_t(layout.stride()[0]) * problem_size.stride_w * element_size_bits / 8;

    // next R: the layout stride is scaled by the convolution stride along H
    inc_next[1] = int64_t(layout.stride()[1]) * problem_size.stride_h * element_size_bits / 8;

    // next K
    inc_next[2] = filter_k_delta * stride_k * element_size_bits / 8;

    // offset in units of bytes to move the pointer in backward direction
    reset_bytes = (threadmap_iterations.strided() - 1) * k_step * element_size_bits / 8;
  }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Parameters object for Conv2d WGRAD Output Gradient (dy) iterator
struct Conv2dWgradOutputGradientIteratorOptimizedParams {

  using Layout = layout::TensorNHWC;

  //
  // Data members
  //

  Layout layout;

  int NPQ;                          // precomputed product of N*P*Q for clearing predicates

  FastDivmod pq_divmod;             // divisor: P*Q
  FastDivmod q_divmod;              // divisor: Q

  int64_t offset_next_strided;      // offset in units of bytes to next npq coordinate within tile
  int64_t offset_next_contiguous;   // offset in units of bytes to next k coordinate within tile
  int64_t inc_next_npq;             // offset in units of bytes to next npq position in subsequent tile

  //
  // Methods
  //

  /// Default constructor: members are left default-initialized.
  CUTLASS_HOST_DEVICE
  Conv2dWgradOutputGradientIteratorOptimizedParams() { }

  /// Precomputes the byte offsets and the divmod helpers.
  /// thread_count, access_size and threadmap_iterations are only forwarded to
  /// TRACE_CONV_INITIALIZERS.
  CUTLASS_HOST_DEVICE
  Conv2dWgradOutputGradientIteratorOptimizedParams(
    Conv2dProblemSize const &problem_size,
    Layout const &layout,
    int element_size_bits,                           ///< size of each element in bits
    MatrixCoord threadblock_shape,
    int thread_count,
    int access_size,
    layout::PitchLinearCoord threadmap_iterations,
    layout::PitchLinearCoord threadmap_delta
  ):
    layout(layout),
    NPQ(problem_size.N * problem_size.P * problem_size.Q),
    pq_divmod(problem_size.P * problem_size.Q),
    q_divmod(problem_size.Q) {

    TRACE_CONV_INITIALIZERS("conv2d_wgrad", "output_gradient",
      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);

    // Incremental offsets in units of bytes: (number of elements) * element_size_bits / 8.
    // Operands are widened to 64 bits BEFORE multiplying so that no intermediate product is
    // evaluated in 32-bit int.
    int64_t const stride0 = layout.stride()[0];

    offset_next_strided = threadmap_delta.strided() * stride0 * element_size_bits / 8;

    offset_next_contiguous = int64_t(threadmap_delta.contiguous()) * element_size_bits / 8;

    inc_next_npq =
      int64_t(threadblock_shape.column()) * problem_size.split_k_slices * stride0
        * element_size_bits / 8;
  }
};
/// Parameters object for the optimized Conv2d WGRAD activation iterator.
/// Holds the layout and FastDivmod helpers built from problem-size extents.
struct Conv2dWgradActivationIteratorOptimizedParams {

  using Layout = layout::TensorNHWC;

  //
  // Data members
  //

  Layout layout;

  FastDivmod sc_divmod;   // divisor: S*C
  FastDivmod pq_divmod;   // divisor: P*Q
  FastDivmod q_divmod;    // divisor: Q
  FastDivmod c_divmod;    // divisor: C

  //
  // Methods
  //

  /// Default constructor: members are left default-initialized.
  CUTLASS_HOST_DEVICE
  Conv2dWgradActivationIteratorOptimizedParams() { }

  /// Builds every divmod helper from the problem size and stores the layout.
  CUTLASS_HOST_DEVICE
  Conv2dWgradActivationIteratorOptimizedParams(
    Conv2dProblemSize const &problem_size,
    Layout const &layout
  ):
    layout(layout),
    sc_divmod(problem_size.S * problem_size.C),
    pq_divmod(problem_size.P * problem_size.Q),
    q_divmod(problem_size.Q),
    c_divmod(problem_size.C) { }

  /// Same as the two-argument constructor; the extra arguments are only passed to
  /// TRACE_CONV_INITIALIZERS and do not affect the stored state.
  CUTLASS_HOST_DEVICE
  Conv2dWgradActivationIteratorOptimizedParams(
    Conv2dProblemSize const &problem_size,
    Layout const &layout,
    int element_size_bits,                           ///< size of each element in bits
    MatrixCoord threadblock_shape,
    int thread_count,
    int access_size,
    layout::PitchLinearCoord threadmap_iterations,
    layout::PitchLinearCoord threadmap_delta
  ):
    Conv2dWgradActivationIteratorOptimizedParams(problem_size, layout) {

    TRACE_CONV_INITIALIZERS("conv2d_wgrad", "activation",
      element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta);
  }
};
/// Stateless parameters object named for a predicated scale/bias vector access iterator.
/// It has no data members; every constructor ignores its arguments.
struct PredicatedScaleBiasVectorAccessIteratorParams {

  /// Default ctor
  CUTLASS_HOST_DEVICE
  PredicatedScaleBiasVectorAccessIteratorParams() { }

  /// Construct from a problem size and a pitch-linear layout (both ignored)
  CUTLASS_HOST_DEVICE
  PredicatedScaleBiasVectorAccessIteratorParams(
    Conv2dProblemSize const &problem_size,
    layout::PitchLinear const &layout
  ) { }

  /// Construct from a problem size and a row-major layout (both ignored)
  CUTLASS_HOST_DEVICE
  PredicatedScaleBiasVectorAccessIteratorParams(
    Conv2dProblemSize const &problem_size,
    layout::RowMajor const &layout
  ) { }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace conv
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////