cutlass/examples/03_visualize_layout/register_layout.cu

/***************************************************************************************************
 * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

/*! \file
  \brief CUTLASS layout visualization example
*/

#include <map>
#include <memory>

#include "cutlass/layout/matrix.h"
#include "cutlass/layout/pitch_linear.h"
#include "cutlass/layout/tensor_op_multiplicand_sm70.h"
#include "cutlass/layout/tensor_op_multiplicand_sm75.h"
#include "cutlass/layout/tensor_op_multiplicand_sm80.h"

#include "visualize_layout.h"
#include "register_layout.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

void RegisterLayouts(std::map<std::string, std::unique_ptr<VisualizeLayoutBase> > &layouts) {

  struct {
    char const *name;
    VisualizeLayoutBase *ptr;
  } layout_pairs[] = {

      {"PitchLinear", new VisualizeLayout<cutlass::layout::PitchLinear>},
      {"ColumnMajor", new VisualizeLayout<cutlass::layout::ColumnMajor>},
      {"RowMajor", new VisualizeLayout<cutlass::layout::RowMajor>},
      {"ColumnMajorInterleaved<4>",
       new VisualizeLayout<cutlass::layout::ColumnMajorInterleaved<4>>},
      {"RowMajorInterleaved<4>",
       new VisualizeLayout<cutlass::layout::RowMajorInterleaved<4>>},
      // Integer matrix multiply.int4 8832  Interleaved-64
      {"TensorOpMultiplicand<4,64>",
       new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<4, 64>>},
      // Integer matrix multiply.int4 8832  TN kblock128
      {"TensorOpMultiplicand<4,128>",
       new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<4, 128>>},
      // Integer matrix multiply.int4 16864 TN kblock256
      {"TensorOpMultiplicand<4,256>",
       new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<4, 256>>},
      // Integer matrix multiply 8816  Interleaved-32
      {"TensorOpMultiplicand<8,32>",
       new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<8, 32>>},
      // Integer matrix multiply 8816  TN kblock64
      {"TensorOpMultiplicand<8,64>",
       new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<8, 64>>},
      {"TensorOpMultiplicand<8,128>",
       new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<8, 128>>},
      // Matrix Multiply 1688  TN kblock32
      {"TensorOpMultiplicand<16,32>",
       new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<16, 32>>},
      // Matrix multiply 1688  NT
      {"TensorOpMultiplicand<16,64>",
       new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<16, 64>>},
      // Matrix multiply 1688.TF32 TN kblock16
      {"TensorOpMultiplicand<32,16>",
       new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<32, 16>>},
      // Matrix multiply 1688.TF32 TN kblock32
      {"TensorOpMultiplicand<32,32>",
       new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<32, 32>>},
      // Matrix multiply 1688 NT
      {"TensorOpMultiplicandCongruous<32,32>",
       new VisualizeLayout<
           cutlass::layout::TensorOpMultiplicandCongruous<32, 32>>},
      // Matrix multiply 884 NT
      {"TensorOpMultiplicandCongruous<64,16>",
       new VisualizeLayout<
           cutlass::layout::TensorOpMultiplicandCongruous<64, 16>>},
      // Matrix multiply 884 TN
      {"TensorOpMultiplicand64bCrosswise",
       new VisualizeLayout<cutlass::layout::TensorOpMultiplicand64bCrosswise>},
      {"TensorOpMultiplicandCongruous<128,4>",
       new VisualizeLayout<
           cutlass::layout::TensorOpMultiplicandCongruous<128, 4>>},
      {"TensorOpMultiplicandCrosswise<128,4>",
       new VisualizeLayout<
           cutlass::layout::TensorOpMultiplicandCrosswise<128, 4>>},
      {"VoltaTensorOpMultiplicandCongruous<16>",
       new VisualizeLayout<
           cutlass::layout::VoltaTensorOpMultiplicandCongruous<16>>},
      {"VoltaTensorOpMultiplicandCrosswise<16,32>",
       new VisualizeLayout<
           cutlass::layout::VoltaTensorOpMultiplicandCrosswise<16, 32>>}
  };

  for (auto layout : layout_pairs) {
    layouts.emplace(std::string(layout.name), std::unique_ptr<VisualizeLayoutBase>(layout.ptr));
  }
}

/////////////////////////////////////////////////////////////////////////////////////////////////
CUTLASS 2.0 (#62) CUTLASS 2.0 Substantially refactored for - Better performance, particularly for native Turing Tensor Cores - Robust and durable templates spanning the design space - Encapsulated functionality embodying modern C++11 programming techniques - Optimized containers and data types for efficient, generic, portable device code Updates to: - Quick start guide - Documentation - Utilities - CUTLASS Profiler Native Turing Tensor Cores - Efficient GEMM kernels targeting Turing Tensor Cores - Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands Coverage of existing CUTLASS functionality: - GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs - Volta Tensor Cores through native mma.sync and through WMMA API - Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions - Batched GEMM operations - Complex-valued GEMMs Note: this commit and all that follow require a host compiler supporting C++11 or greater. 2019-11-20 08:55:34 +08:00			`/***************************************************************************************************`
CUTLASS 2.5 2021-02-26 22:58:26 +08:00			`* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.`
CUTLASS 2.0 (#62) CUTLASS 2.0 Substantially refactored for - Better performance, particularly for native Turing Tensor Cores - Robust and durable templates spanning the design space - Encapsulated functionality embodying modern C++11 programming techniques - Optimized containers and data types for efficient, generic, portable device code Updates to: - Quick start guide - Documentation - Utilities - CUTLASS Profiler Native Turing Tensor Cores - Efficient GEMM kernels targeting Turing Tensor Cores - Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands Coverage of existing CUTLASS functionality: - GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs - Volta Tensor Cores through native mma.sync and through WMMA API - Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions - Batched GEMM operations - Complex-valued GEMMs Note: this commit and all that follow require a host compiler supporting C++11 or greater. 2019-11-20 08:55:34 +08:00			`*`
			`* Redistribution and use in source and binary forms, with or without modification, are permitted`
			`* provided that the following conditions are met:`
			`* * Redistributions of source code must retain the above copyright notice, this list of`
			`* conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright notice, this list of`
			`* conditions and the following disclaimer in the documentation and/or other materials`
			`* provided with the distribution.`
			`* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used`
			`* to endorse or promote products derived from this software without specific prior written`
			`* permission.`
			`*`
			`* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR`
			`* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND`
			`* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE`
			`* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,`
			`* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;`
			`* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,`
			`* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*`
			`**************************************************************************************************/`

			`/*! \file`
			`\brief CUTLASS layout visualization example`
			`*/`

			`#include <map>`
			`#include <memory>`

			`#include "cutlass/layout/matrix.h"`
			`#include "cutlass/layout/pitch_linear.h"`
			`#include "cutlass/layout/tensor_op_multiplicand_sm70.h"`
			`#include "cutlass/layout/tensor_op_multiplicand_sm75.h"`
CUTLASS 2.2 (#96) Adds support for NVIDIA Ampere Architecture features. CUDA 11 Toolkit recommended. 2020-06-09 07:17:35 +08:00			`#include "cutlass/layout/tensor_op_multiplicand_sm80.h"`

CUTLASS 2.0 (#62) CUTLASS 2.0 Substantially refactored for - Better performance, particularly for native Turing Tensor Cores - Robust and durable templates spanning the design space - Encapsulated functionality embodying modern C++11 programming techniques - Optimized containers and data types for efficient, generic, portable device code Updates to: - Quick start guide - Documentation - Utilities - CUTLASS Profiler Native Turing Tensor Cores - Efficient GEMM kernels targeting Turing Tensor Cores - Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands Coverage of existing CUTLASS functionality: - GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs - Volta Tensor Cores through native mma.sync and through WMMA API - Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions - Batched GEMM operations - Complex-valued GEMMs Note: this commit and all that follow require a host compiler supporting C++11 or greater. 2019-11-20 08:55:34 +08:00			`#include "visualize_layout.h"`
			`#include "register_layout.h"`

			`/////////////////////////////////////////////////////////////////////////////////////////////////`

			`void RegisterLayouts(std::map<std::string, std::unique_ptr<VisualizeLayoutBase> > &layouts) {`

			`struct {`
			`char const *name;`
			`VisualizeLayoutBase *ptr;`
			`} layout_pairs[] = {`

			`{"PitchLinear", new VisualizeLayout<cutlass::layout::PitchLinear>},`
			`{"ColumnMajor", new VisualizeLayout<cutlass::layout::ColumnMajor>},`
			`{"RowMajor", new VisualizeLayout<cutlass::layout::RowMajor>},`
			`{"ColumnMajorInterleaved<4>",`
			`new VisualizeLayout<cutlass::layout::ColumnMajorInterleaved<4>>},`
			`{"RowMajorInterleaved<4>",`
			`new VisualizeLayout<cutlass::layout::RowMajorInterleaved<4>>},`
			`// Integer matrix multiply.int4 8832 Interleaved-64`
			`{"TensorOpMultiplicand<4,64>",`
			`new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<4, 64>>},`
			`// Integer matrix multiply.int4 8832 TN kblock128`
			`{"TensorOpMultiplicand<4,128>",`
			`new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<4, 128>>},`
CUTLASS 2.2 (#96) Adds support for NVIDIA Ampere Architecture features. CUDA 11 Toolkit recommended. 2020-06-09 07:17:35 +08:00			`// Integer matrix multiply.int4 16864 TN kblock256`
			`{"TensorOpMultiplicand<4,256>",`
			`new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<4, 256>>},`
CUTLASS 2.0 (#62) CUTLASS 2.0 Substantially refactored for - Better performance, particularly for native Turing Tensor Cores - Robust and durable templates spanning the design space - Encapsulated functionality embodying modern C++11 programming techniques - Optimized containers and data types for efficient, generic, portable device code Updates to: - Quick start guide - Documentation - Utilities - CUTLASS Profiler Native Turing Tensor Cores - Efficient GEMM kernels targeting Turing Tensor Cores - Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands Coverage of existing CUTLASS functionality: - GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs - Volta Tensor Cores through native mma.sync and through WMMA API - Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions - Batched GEMM operations - Complex-valued GEMMs Note: this commit and all that follow require a host compiler supporting C++11 or greater. 2019-11-20 08:55:34 +08:00			`// Integer matrix multiply 8816 Interleaved-32`
			`{"TensorOpMultiplicand<8,32>",`
			`new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<8, 32>>},`
			`// Integer matrix multiply 8816 TN kblock64`
			`{"TensorOpMultiplicand<8,64>",`
			`new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<8, 64>>},`
CUTLASS 2.2 (#96) Adds support for NVIDIA Ampere Architecture features. CUDA 11 Toolkit recommended. 2020-06-09 07:17:35 +08:00			`{"TensorOpMultiplicand<8,128>",`
			`new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<8, 128>>},`
CUTLASS 2.0 (#62) CUTLASS 2.0 Substantially refactored for - Better performance, particularly for native Turing Tensor Cores - Robust and durable templates spanning the design space - Encapsulated functionality embodying modern C++11 programming techniques - Optimized containers and data types for efficient, generic, portable device code Updates to: - Quick start guide - Documentation - Utilities - CUTLASS Profiler Native Turing Tensor Cores - Efficient GEMM kernels targeting Turing Tensor Cores - Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands Coverage of existing CUTLASS functionality: - GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs - Volta Tensor Cores through native mma.sync and through WMMA API - Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions - Batched GEMM operations - Complex-valued GEMMs Note: this commit and all that follow require a host compiler supporting C++11 or greater. 2019-11-20 08:55:34 +08:00			`// Matrix Multiply 1688 TN kblock32`
			`{"TensorOpMultiplicand<16,32>",`
			`new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<16, 32>>},`
			`// Matrix multiply 1688 NT`
			`{"TensorOpMultiplicand<16,64>",`
			`new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<16, 64>>},`
CUTLASS 2.2 (#96) Adds support for NVIDIA Ampere Architecture features. CUDA 11 Toolkit recommended. 2020-06-09 07:17:35 +08:00			`// Matrix multiply 1688.TF32 TN kblock16`
			`{"TensorOpMultiplicand<32,16>",`
			`new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<32, 16>>},`
			`// Matrix multiply 1688.TF32 TN kblock32`
			`{"TensorOpMultiplicand<32,32>",`
			`new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<32, 32>>},`
			`// Matrix multiply 1688 NT`
			`{"TensorOpMultiplicandCongruous<32,32>",`
			`new VisualizeLayout<`
			`cutlass::layout::TensorOpMultiplicandCongruous<32, 32>>},`
			`// Matrix multiply 884 NT`
			`{"TensorOpMultiplicandCongruous<64,16>",`
			`new VisualizeLayout<`
			`cutlass::layout::TensorOpMultiplicandCongruous<64, 16>>},`
			`// Matrix multiply 884 TN`
			`{"TensorOpMultiplicand64bCrosswise",`
			`new VisualizeLayout<cutlass::layout::TensorOpMultiplicand64bCrosswise>},`
CUTLASS 2.0 (#62) CUTLASS 2.0 Substantially refactored for - Better performance, particularly for native Turing Tensor Cores - Robust and durable templates spanning the design space - Encapsulated functionality embodying modern C++11 programming techniques - Optimized containers and data types for efficient, generic, portable device code Updates to: - Quick start guide - Documentation - Utilities - CUTLASS Profiler Native Turing Tensor Cores - Efficient GEMM kernels targeting Turing Tensor Cores - Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands Coverage of existing CUTLASS functionality: - GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs - Volta Tensor Cores through native mma.sync and through WMMA API - Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions - Batched GEMM operations - Complex-valued GEMMs Note: this commit and all that follow require a host compiler supporting C++11 or greater. 2019-11-20 08:55:34 +08:00			`{"TensorOpMultiplicandCongruous<128,4>",`
			`new VisualizeLayout<`
			`cutlass::layout::TensorOpMultiplicandCongruous<128, 4>>},`
			`{"TensorOpMultiplicandCrosswise<128,4>",`
			`new VisualizeLayout<`
			`cutlass::layout::TensorOpMultiplicandCrosswise<128, 4>>},`
			`{"VoltaTensorOpMultiplicandCongruous<16>",`
			`new VisualizeLayout<`
			`cutlass::layout::VoltaTensorOpMultiplicandCongruous<16>>},`
			`{"VoltaTensorOpMultiplicandCrosswise<16,32>",`
			`new VisualizeLayout<`
CUTLASS 2.2 (#96) Adds support for NVIDIA Ampere Architecture features. CUDA 11 Toolkit recommended. 2020-06-09 07:17:35 +08:00			`cutlass::layout::VoltaTensorOpMultiplicandCrosswise<16, 32>>}`
CUTLASS 2.0 (#62) CUTLASS 2.0 Substantially refactored for - Better performance, particularly for native Turing Tensor Cores - Robust and durable templates spanning the design space - Encapsulated functionality embodying modern C++11 programming techniques - Optimized containers and data types for efficient, generic, portable device code Updates to: - Quick start guide - Documentation - Utilities - CUTLASS Profiler Native Turing Tensor Cores - Efficient GEMM kernels targeting Turing Tensor Cores - Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands Coverage of existing CUTLASS functionality: - GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs - Volta Tensor Cores through native mma.sync and through WMMA API - Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions - Batched GEMM operations - Complex-valued GEMMs Note: this commit and all that follow require a host compiler supporting C++11 or greater. 2019-11-20 08:55:34 +08:00			`};`

			`for (auto layout : layout_pairs) {`
			`layouts.emplace(std::string(layout.name), std::unique_ptr<VisualizeLayoutBase>(layout.ptr));`
			`}`
			`}`

			`/////////////////////////////////////////////////////////////////////////////////////////////////`