
CUTLASS 2.3 adds GEMMs targeting Sparse Tensor Cores on the NVIDIA Ampere Architecture, fast SGEMM, and small matrix classes, bug fixes, and performance enhancements.
183 lines
6.9 KiB
Plaintext
183 lines
6.9 KiB
Plaintext
/***************************************************************************************************
|
|
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
*modification, are permitted provided that the following conditions are met:
|
|
* * Redistributions of source code must retain the above copyright notice,
|
|
*this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
*notice, this list of conditions and the following disclaimer in the
|
|
*documentation and/or other materials provided with the distribution.
|
|
* * Neither the name of the NVIDIA CORPORATION nor the names of its
|
|
*contributors may be used to endorse or promote products derived from this
|
|
*software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
*AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
*IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
*DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT,
|
|
*INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
|
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
*DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
*OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TOR (INCLUDING
|
|
*NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
|
|
*EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
**************************************************************************************************/
|
|
|
|
/*! \file
|
|
\brief Demonstrate CUTLASS debugging tool for dumping fragments and shared
|
|
memory
|
|
*/
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
// Standard Library includes
|
|
|
|
#include <iostream>
|
|
|
|
//
|
|
// CUTLASS includes
|
|
//
|
|
|
|
#include "cutlass/aligned_buffer.h"
|
|
#include "cutlass/gemm/gemm.h"
|
|
#include "cutlass/layout/matrix.h"
|
|
#include "cutlass/matrix_shape.h"
|
|
#include "cutlass/numeric_types.h"
|
|
|
|
#include "cutlass/core_io.h"
|
|
#include "cutlass/util/host_tensor.h"
|
|
#include "cutlass/util/tensor_view_io.h"
|
|
|
|
#include "cutlass/util/reference/host/gemm.h"
|
|
#include "cutlass/util/reference/host/tensor_compare.h"
|
|
#include "cutlass/util/reference/host/tensor_fill.h"
|
|
|
|
#include "cutlass/transform/pitch_linear_thread_map.h"
|
|
#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
|
|
#include "cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h"
|
|
|
|
#include "cutlass/util/debug.h"
|
|
#include "cutlass/util/device_dump.h"
|
|
|
|
#define EXAMPLE_MATRIX_ROW 64
|
|
#define EXAMPLE_MATRIX_COL 32
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
template <typename Element, typename GmemIterator, typename SmemIterator>
|
|
__global__ void kernel_dump(typename GmemIterator::Params params,
|
|
typename GmemIterator::TensorRef ref) {
|
|
extern __shared__ Element shared_storage[];
|
|
|
|
// Construct the global iterator and load the data to the fragments.
|
|
int tb_thread_id = threadIdx.y * blockDim.x + threadIdx.x;
|
|
|
|
GmemIterator gmem_iterator(params, ref.data(),
|
|
{EXAMPLE_MATRIX_ROW, EXAMPLE_MATRIX_COL},
|
|
tb_thread_id);
|
|
|
|
typename GmemIterator::Fragment frag;
|
|
|
|
frag.clear();
|
|
gmem_iterator.load(frag);
|
|
|
|
// Call dump_fragment() with different parameters.
|
|
if (threadIdx.x == 0 && blockIdx.x == 0)
|
|
printf("\nAll threads dump all the elements:\n");
|
|
cutlass::debug::dump_fragment(frag);
|
|
|
|
if (threadIdx.x == 0 && blockIdx.x == 0)
|
|
printf("\nFirst thread dumps all the elements:\n");
|
|
cutlass::debug::dump_fragment(frag, /*N = */ 1);
|
|
|
|
if (threadIdx.x == 0 && blockIdx.x == 0)
|
|
printf("\nFirst thread dumps first 16 elements:\n");
|
|
cutlass::debug::dump_fragment(frag, /*N = */ 1, /*M = */ 16);
|
|
|
|
if (threadIdx.x == 0 && blockIdx.x == 0)
|
|
printf("\nFirst thread dumps first 16 elements with a stride of 8:\n");
|
|
cutlass::debug::dump_fragment(frag, /*N = */ 1, /*M = */ 16, /*S = */ 8);
|
|
|
|
// Construct the shared iterator and store the data to the shared memory.
|
|
SmemIterator smem_iterator(
|
|
typename SmemIterator::TensorRef(
|
|
{shared_storage, SmemIterator::Layout::packed(
|
|
{EXAMPLE_MATRIX_ROW, EXAMPLE_MATRIX_COL})}),
|
|
tb_thread_id);
|
|
|
|
smem_iterator.store(frag);
|
|
|
|
// Call dump_shmem() with different parameters.
|
|
if (threadIdx.x == 0 && blockIdx.x == 0) printf("\nDump all the elements:\n");
|
|
cutlass::debug::dump_shmem(shared_storage,
|
|
EXAMPLE_MATRIX_ROW * EXAMPLE_MATRIX_COL);
|
|
|
|
if (threadIdx.x == 0 && blockIdx.x == 0)
|
|
printf("\nDump all the elements with a stride of 8:\n");
|
|
cutlass::debug::dump_shmem(
|
|
shared_storage, EXAMPLE_MATRIX_ROW * EXAMPLE_MATRIX_COL, /*S = */ 8);
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
/// Entry point for dump_reg_shmem example.
|
|
//
|
|
// usage:
|
|
//
|
|
// 02_dump_reg_shmem
|
|
//
|
|
int main() {
|
|
// Initialize a 64x32 column major matrix with sequential data (1,2,3...).
|
|
using Element = cutlass::half_t;
|
|
using Layout = cutlass::layout::ColumnMajor;
|
|
|
|
cutlass::HostTensor<Element, Layout> matrix(
|
|
{EXAMPLE_MATRIX_ROW, EXAMPLE_MATRIX_COL});
|
|
cutlass::reference::host::BlockFillSequential(matrix.host_data(),
|
|
matrix.capacity());
|
|
|
|
// Dump the matrix.
|
|
std::cout << "Matrix:\n" << matrix.host_view() << "\n";
|
|
|
|
// Copy the matrix to the device.
|
|
matrix.sync_device();
|
|
|
|
// Define a global iterator, a shared iterator and their thread map.
|
|
using ThreadMap = cutlass::transform::PitchLinearWarpRakedThreadMap<
|
|
cutlass::layout::PitchLinearShape<EXAMPLE_MATRIX_ROW, EXAMPLE_MATRIX_COL>,
|
|
32, cutlass::layout::PitchLinearShape<8, 4>, 8>;
|
|
|
|
using GmemIterator =
|
|
cutlass::transform::threadblock::PredicatedTileIterator<
|
|
cutlass::MatrixShape<EXAMPLE_MATRIX_ROW, EXAMPLE_MATRIX_COL>, Element,
|
|
Layout, 1, ThreadMap>;
|
|
|
|
typename GmemIterator::Params params(matrix.layout());
|
|
|
|
using SmemIterator = cutlass::transform::threadblock::RegularTileIterator<
|
|
cutlass::MatrixShape<EXAMPLE_MATRIX_ROW, EXAMPLE_MATRIX_COL>, Element,
|
|
cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous<16, 64>, 1,
|
|
ThreadMap>;
|
|
|
|
dim3 grid(1, 1);
|
|
dim3 block(32, 1, 1);
|
|
|
|
int smem_size =
|
|
int(sizeof(Element) * EXAMPLE_MATRIX_ROW * EXAMPLE_MATRIX_COL);
|
|
|
|
kernel_dump<Element, GmemIterator, SmemIterator>
|
|
<<<grid, block, smem_size, 0>>>(params, matrix.device_ref());
|
|
|
|
cudaError_t result = cudaDeviceSynchronize();
|
|
|
|
if (result != cudaSuccess) {
|
|
std::cout << "Failed" << std::endl;
|
|
}
|
|
|
|
return (result == cudaSuccess ? 0 : -1);
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|