cutlass/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu

/***************************************************************************************************
 * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
    \brief Unit tests for the threadblock-scoped SIMT epilogue (half precision)
*/

#include <fstream>

#include "../../common/cutlass_unit_test.h"

#include "cutlass/aligned_buffer.h"

#include "cutlass/gemm/warp/mma_simt.h"
#include "cutlass/gemm/warp/mma_simt_policy.h"

#include "cutlass/epilogue/thread/linear_combination.h"    
#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"

#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/host/tensor_fill.h"

#include "testbed.h"

/////////////////////////////////////////////////////////////////////////////////////////////////
//
// Real-valued half precision tests
//
/////////////////////////////////////////////////////////////////////////////////////////////////

/// Exercises the default SIMT threadblock epilogue with half-precision data, using a 32x64x8
/// threadblock tile and a 32x64x8 warp tile. The test passes when EpilogueTestbed::run_all()
/// returns true.
TEST(SM60_Epilogue_threadblock_epilogue, simt_f16_32x64_32x64x8) {

  //
  // Define the warp-level matrix multiply
  //

  // Every operand, output, accumulator, and compute type in this test is half precision.
  using Element = cutlass::half_t;
  using ElementOutput = Element;
  using ElementAccumulator = Element;
  using ElementCompute = Element;

  // Passed to both the output operator and the default epilogue below.
  int const kElementsPerAccess = 1;

  using Shape = cutlass::gemm::GemmShape<32, 64, 8>;
  using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
  using LayoutA = cutlass::layout::ColumnMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::RowMajor;

  using WarpMmaSimt = cutlass::gemm::warp::MmaSimt<
    WarpShape,
    Element,
    LayoutA,
    Element,
    LayoutB,
    Element,
    LayoutC,
    cutlass::gemm::warp::MmaSimtPolicy<
      cutlass::MatrixShape<4, 8>,
      cutlass::layout::RowMajorInterleaved<2>,
      cutlass::gemm::GemmShape<4, 4, 1>
    >
  >;

  //
  // Output operator
  //

  using OutputOp = cutlass::epilogue::thread::LinearCombination<
    ElementOutput,
    kElementsPerAccess,
    ElementAccumulator,
    ElementCompute
  >;

  //
  // Define the epilogue
  //

  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<
    Shape,
    WarpMmaSimt,
    OutputOp,
    kElementsPerAccess
  >::Epilogue;

  //
  // Instantiate epilogue
  //

  EpilogueTestbed<Epilogue> testbed;

  bool passed = testbed.run_all();

  EXPECT_TRUE(passed);
}

/// Exercises the default SIMT threadblock epilogue with half-precision data, using a 64x64x8
/// threadblock tile and a 64x64x8 warp tile. The test passes when EpilogueTestbed::run_all()
/// returns true.
TEST(SM60_Epilogue_threadblock_epilogue, simt_f16_64x64_64x64x8) {

  //
  // Define the warp-level matrix multiply
  //

  // Every operand, output, accumulator, and compute type in this test is half precision.
  using Element = cutlass::half_t;
  using ElementOutput = Element;
  using ElementAccumulator = Element;
  using ElementCompute = Element;

  // Passed to both the output operator and the default epilogue below.
  int const kElementsPerAccess = 1;

  using Shape = cutlass::gemm::GemmShape<64, 64, 8>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using LayoutA = cutlass::layout::ColumnMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::RowMajor;

  using WarpMmaSimt = cutlass::gemm::warp::MmaSimt<
    WarpShape,
    Element,
    LayoutA,
    Element,
    LayoutB,
    Element,
    LayoutC,
    cutlass::gemm::warp::MmaSimtPolicy<
      cutlass::MatrixShape<4, 8>,
      cutlass::layout::RowMajorInterleaved<2>,
      cutlass::gemm::GemmShape<8, 4, 1>
    >
  >;

  //
  // Output operator
  //

  using OutputOp = cutlass::epilogue::thread::LinearCombination<
    ElementOutput,
    kElementsPerAccess,
    ElementAccumulator,
    ElementCompute
  >;

  //
  // Define the epilogue
  //

  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<
    Shape,
    WarpMmaSimt,
    OutputOp,
    kElementsPerAccess
  >::Epilogue;

  //
  // Instantiate epilogue
  //

  EpilogueTestbed<Epilogue> testbed;

  bool passed = testbed.run_all();

  EXPECT_TRUE(passed);
}

/// Exercises the default SIMT threadblock epilogue with half-precision data, using a 64x128x8
/// threadblock tile and a 64x64x8 warp tile. The test passes when EpilogueTestbed::run_all()
/// returns true.
TEST(SM60_Epilogue_threadblock_epilogue, simt_f16_64x128_64x64x8) {

  //
  // Define the warp-level matrix multiply
  //

  // Every operand, output, accumulator, and compute type in this test is half precision.
  using Element = cutlass::half_t;
  using ElementOutput = Element;
  using ElementAccumulator = Element;
  using ElementCompute = Element;

  // Passed to both the output operator and the default epilogue below.
  int const kElementsPerAccess = 1;

  using Shape = cutlass::gemm::GemmShape<64, 128, 8>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using LayoutA = cutlass::layout::ColumnMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::RowMajor;

  using WarpMmaSimt = cutlass::gemm::warp::MmaSimt<
    WarpShape,
    Element,
    LayoutA,
    Element,
    LayoutB,
    Element,
    LayoutC,
    cutlass::gemm::warp::MmaSimtPolicy<
      cutlass::MatrixShape<4, 8>,
      cutlass::layout::RowMajorInterleaved<2>,
      cutlass::gemm::GemmShape<8, 4, 1>
    >
  >;

  //
  // Output operator
  //

  using OutputOp = cutlass::epilogue::thread::LinearCombination<
    ElementOutput,
    kElementsPerAccess,
    ElementAccumulator,
    ElementCompute
  >;

  //
  // Define the epilogue
  //

  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<
    Shape,
    WarpMmaSimt,
    OutputOp,
    kElementsPerAccess
  >::Epilogue;

  //
  // Instantiate epilogue
  //

  EpilogueTestbed<Epilogue> testbed;

  bool passed = testbed.run_all();

  EXPECT_TRUE(passed);
}

/// Exercises the default SIMT threadblock epilogue with half-precision data, using a 128x128x8
/// threadblock tile and a 64x64x8 warp tile. The test passes when EpilogueTestbed::run_all()
/// returns true.
TEST(SM60_Epilogue_threadblock_epilogue, simt_f16_128x128_64x64x8) {

  //
  // Define the warp-level matrix multiply
  //

  // Every operand, output, accumulator, and compute type in this test is half precision.
  using Element = cutlass::half_t;
  using ElementOutput = Element;
  using ElementAccumulator = Element;
  using ElementCompute = Element;

  // Passed to both the output operator and the default epilogue below.
  int const kElementsPerAccess = 1;

  using Shape = cutlass::gemm::GemmShape<128, 128, 8>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using LayoutA = cutlass::layout::ColumnMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::RowMajor;

  using WarpMmaSimt = cutlass::gemm::warp::MmaSimt<
    WarpShape,
    Element,
    LayoutA,
    Element,
    LayoutB,
    Element,
    LayoutC,
    cutlass::gemm::warp::MmaSimtPolicy<
      cutlass::MatrixShape<4, 8>,
      cutlass::layout::RowMajorInterleaved<2>,
      cutlass::gemm::GemmShape<8, 4, 1>
    >
  >;

  //
  // Output operator
  //

  using OutputOp = cutlass::epilogue::thread::LinearCombination<
    ElementOutput,
    kElementsPerAccess,
    ElementAccumulator,
    ElementCompute
  >;

  //
  // Define the epilogue
  //

  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<
    Shape,
    WarpMmaSimt,
    OutputOp,
    kElementsPerAccess
  >::Epilogue;

  //
  // Instantiate epilogue
  //

  EpilogueTestbed<Epilogue> testbed;

  bool passed = testbed.run_all();

  EXPECT_TRUE(passed);
}

/// Exercises the default SIMT threadblock epilogue with half-precision data, using a 128x256x8
/// threadblock tile and a 64x64x8 warp tile. The test passes when EpilogueTestbed::run_all()
/// returns true.
TEST(SM60_Epilogue_threadblock_epilogue, simt_f16_128x256_64x64x8) {

  //
  // Define the warp-level matrix multiply
  //

  // Every operand, output, accumulator, and compute type in this test is half precision.
  using Element = cutlass::half_t;
  using ElementOutput = Element;
  using ElementAccumulator = Element;
  using ElementCompute = Element;

  // Passed to both the output operator and the default epilogue below.
  int const kElementsPerAccess = 1;

  using Shape = cutlass::gemm::GemmShape<128, 256, 8>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using LayoutA = cutlass::layout::ColumnMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::RowMajor;

  using WarpMmaSimt = cutlass::gemm::warp::MmaSimt<
    WarpShape,
    Element,
    LayoutA,
    Element,
    LayoutB,
    Element,
    LayoutC,
    cutlass::gemm::warp::MmaSimtPolicy<
      cutlass::MatrixShape<4, 8>,
      cutlass::layout::RowMajorInterleaved<2>,
      cutlass::gemm::GemmShape<8, 4, 1>
    >
  >;

  //
  // Output operator
  //

  using OutputOp = cutlass::epilogue::thread::LinearCombination<
    ElementOutput,
    kElementsPerAccess,
    ElementAccumulator,
    ElementCompute
  >;

  //
  // Define the epilogue
  //

  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<
    Shape,
    WarpMmaSimt,
    OutputOp,
    kElementsPerAccess
  >::Epilogue;

  //
  // Instantiate epilogue
  //

  EpilogueTestbed<Epilogue> testbed;

  bool passed = testbed.run_all();

  EXPECT_TRUE(passed);
}

/// Exercises the default SIMT threadblock epilogue with half-precision data, using a 256x128x8
/// threadblock tile and a 64x64x8 warp tile. The test passes when EpilogueTestbed::run_all()
/// returns true.
TEST(SM60_Epilogue_threadblock_epilogue, simt_f16_256x128_64x64x8) {

  //
  // Define the warp-level matrix multiply
  //

  // Every operand, output, accumulator, and compute type in this test is half precision.
  using Element = cutlass::half_t;
  using ElementOutput = Element;
  using ElementAccumulator = Element;
  using ElementCompute = Element;

  // Passed to both the output operator and the default epilogue below.
  int const kElementsPerAccess = 1;

  using Shape = cutlass::gemm::GemmShape<256, 128, 8>;
  using WarpShape = cutlass::gemm::GemmShape<64, 64, 8>;
  using LayoutA = cutlass::layout::ColumnMajor;
  using LayoutB = cutlass::layout::RowMajor;
  using LayoutC = cutlass::layout::RowMajor;

  using WarpMmaSimt = cutlass::gemm::warp::MmaSimt<
    WarpShape,
    Element,
    LayoutA,
    Element,
    LayoutB,
    Element,
    LayoutC,
    cutlass::gemm::warp::MmaSimtPolicy<
      cutlass::MatrixShape<4, 8>,
      cutlass::layout::RowMajorInterleaved<2>,
      cutlass::gemm::GemmShape<8, 4, 1>
    >
  >;

  //
  // Output operator
  //

  using OutputOp = cutlass::epilogue::thread::LinearCombination<
    ElementOutput,
    kElementsPerAccess,
    ElementAccumulator,
    ElementCompute
  >;

  //
  // Define the epilogue
  //

  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<
    Shape,
    WarpMmaSimt,
    OutputOp,
    kElementsPerAccess
  >::Epilogue;

  //
  // Instantiate epilogue
  //

  EpilogueTestbed<Epilogue> testbed;

  bool passed = testbed.run_all();

  EXPECT_TRUE(passed);
}

///////////////////////////////////////////////////////////////////////////////////////////////////
CUTLASS 2.0 (#62) CUTLASS 2.0 Substantially refactored for - Better performance, particularly for native Turing Tensor Cores - Robust and durable templates spanning the design space - Encapsulated functionality embodying modern C++11 programming techniques - Optimized containers and data types for efficient, generic, portable device code Updates to: - Quick start guide - Documentation - Utilities - CUTLASS Profiler Native Turing Tensor Cores - Efficient GEMM kernels targeting Turing Tensor Cores - Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands Coverage of existing CUTLASS functionality: - GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs - Volta Tensor Cores through native mma.sync and through WMMA API - Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions - Batched GEMM operations - Complex-valued GEMMs Note: this commit and all that follow require a host compiler supporting C++11 or greater. 2019-11-20 08:55:34 +08:00			`/***************************************************************************************************`
CUTLASS 2.5 2021-02-26 22:58:26 +08:00			`* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.`
CUTLASS 2.0 (#62) CUTLASS 2.0 Substantially refactored for - Better performance, particularly for native Turing Tensor Cores - Robust and durable templates spanning the design space - Encapsulated functionality embodying modern C++11 programming techniques - Optimized containers and data types for efficient, generic, portable device code Updates to: - Quick start guide - Documentation - Utilities - CUTLASS Profiler Native Turing Tensor Cores - Efficient GEMM kernels targeting Turing Tensor Cores - Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands Coverage of existing CUTLASS functionality: - GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs - Volta Tensor Cores through native mma.sync and through WMMA API - Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions - Batched GEMM operations - Complex-valued GEMMs Note: this commit and all that follow require a host compiler supporting C++11 or greater. 2019-11-20 08:55:34 +08:00			`*`
			`* Redistribution and use in source and binary forms, with or without modification, are permitted`
			`* provided that the following conditions are met:`
			`* * Redistributions of source code must retain the above copyright notice, this list of`
			`* conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright notice, this list of`
			`* conditions and the following disclaimer in the documentation and/or other materials`
			`* provided with the distribution.`
			`* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used`
			`* to endorse or promote products derived from this software without specific prior written`
			`* permission.`
			`*`
			`* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR`
			`* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND`
			`* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE`
			`* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,`
			`* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;`
			`* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,`
Cutlass 2.6 Update 1 (#301) * cutlass 2.6 update * remove debug prints 2021-07-28 08:58:30 +08:00			`* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
CUTLASS 2.0 (#62) CUTLASS 2.0 Substantially refactored for - Better performance, particularly for native Turing Tensor Cores - Robust and durable templates spanning the design space - Encapsulated functionality embodying modern C++11 programming techniques - Optimized containers and data types for efficient, generic, portable device code Updates to: - Quick start guide - Documentation - Utilities - CUTLASS Profiler Native Turing Tensor Cores - Efficient GEMM kernels targeting Turing Tensor Cores - Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands Coverage of existing CUTLASS functionality: - GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs - Volta Tensor Cores through native mma.sync and through WMMA API - Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions - Batched GEMM operations - Complex-valued GEMMs Note: this commit and all that follow require a host compiler supporting C++11 or greater. 2019-11-20 08:55:34 +08:00			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*`
			`**************************************************************************************************/`
			`/*! \file`
			`\brief Unit tests for thread-level GEMM`
			`*/`

			`#include <fstream>`

			`#include "../../common/cutlass_unit_test.h"`

			`#include "cutlass/aligned_buffer.h"`

			`#include "cutlass/gemm/warp/mma_simt.h"`
			`#include "cutlass/gemm/warp/mma_simt_policy.h"`

			`#include "cutlass/epilogue/thread/linear_combination.h"`
			`#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"`

			`#include "cutlass/util/host_tensor.h"`
			`#include "cutlass/util/tensor_view_io.h"`
			`#include "cutlass/util/reference/host/tensor_fill.h"`

			`#include "testbed.h"`

			`/////////////////////////////////////////////////////////////////////////////////////////////////`
			`//`
			`// Real-valued half precision tests`
			`//`
			`/////////////////////////////////////////////////////////////////////////////////////////////////`

			`TEST(SM60_Epilogue_threadblock_epilogue, simt_f16_32x64_32x64x8) {`

			`//`
			`// Define the warp-level matrix multiply`
			`//`

			`using Element = cutlass::half_t;`
			`using ElementOutput = cutlass::half_t;`
			`using ElementAccumulator = cutlass::half_t;`
			`using ElementCompute = cutlass::half_t;`

			`int const kElementsPerAccess = 1;`

			`using Shape = cutlass::gemm::GemmShape<32, 64, 8>;`
			`using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;`
			`using ElementC = ElementAccumulator;`
			`using LayoutA = cutlass::layout::ColumnMajor;`
			`using LayoutB = cutlass::layout::RowMajor;`
			`using LayoutC = cutlass::layout::RowMajor;`

			`using ElementOutput = Element;`
			`using ElementAccumulator = Element;`
			`using ElementCompute = Element;`

			`using WarpMmaSimt = cutlass::gemm::warp::MmaSimt<`
			`WarpShape,`
			`Element,`
			`LayoutA,`
			`Element,`
			`LayoutB,`
			`Element,`
			`LayoutC,`
			`cutlass::gemm::warp::MmaSimtPolicy<`
			`cutlass::MatrixShape<4, 8>,`
			`cutlass::layout::RowMajorInterleaved<2>,`
			`cutlass::gemm::GemmShape<4, 4, 1>`
			`>`
			`>;`

			`//`
			`// Output operator`
			`//`

			`using OutputOp = cutlass::epilogue::thread::LinearCombination<`
			`ElementOutput,`
			`kElementsPerAccess,`
			`ElementAccumulator,`
			`ElementCompute`
			`>;`

			`//`
			`// Define the epilogue`
			`//`

			`using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<`
			`Shape,`
			`WarpMmaSimt,`
			`OutputOp,`
			`kElementsPerAccess`
			`>::Epilogue;`

			`//`
			`// Instantiate epilogue`
			`//`

			`EpilogueTestbed<Epilogue> testbed;`

			`bool passed = testbed.run_all();`

			`EXPECT_TRUE(passed);`
			`}`

			`TEST(SM60_Epilogue_threadblock_epilogue, simt_f16_64x64_64x64x8) {`

			`//`
			`// Define the warp-level matrix multiply`
			`//`

			`using Element = cutlass::half_t;`
			`using ElementOutput = cutlass::half_t;`
			`using ElementAccumulator = cutlass::half_t;`
			`using ElementCompute = cutlass::half_t;`

			`int const kElementsPerAccess = 1;`

			`using Shape = cutlass::gemm::GemmShape<64, 64, 8>;`
			`using WarpShape = cutlass::gemm::GemmShape<64, 64, 8>;`
			`using ElementC = ElementAccumulator;`
			`using LayoutA = cutlass::layout::ColumnMajor;`
			`using LayoutB = cutlass::layout::RowMajor;`
			`using LayoutC = cutlass::layout::RowMajor;`

			`using ElementOutput = Element;`
			`using ElementAccumulator = Element;`
			`using ElementCompute = Element;`

			`using WarpMmaSimt = cutlass::gemm::warp::MmaSimt<`
			`WarpShape,`
			`Element,`
			`LayoutA,`
			`Element,`
			`LayoutB,`
			`Element,`
			`LayoutC,`
			`cutlass::gemm::warp::MmaSimtPolicy<`
			`cutlass::MatrixShape<4, 8>,`
			`cutlass::layout::RowMajorInterleaved<2>,`
			`cutlass::gemm::GemmShape<8, 4, 1>`
			`>`
			`>;`

			`//`
			`// Output operator`
			`//`

			`using OutputOp = cutlass::epilogue::thread::LinearCombination<`
			`ElementOutput,`
			`kElementsPerAccess,`
			`ElementAccumulator,`
			`ElementCompute`
			`>;`

			`//`
			`// Define the epilogue`
			`//`

			`using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<`
			`Shape,`
			`WarpMmaSimt,`
			`OutputOp,`
			`kElementsPerAccess`
			`>::Epilogue;`

			`//`
			`// Instantiate epilogue`
			`//`

			`EpilogueTestbed<Epilogue> testbed;`

			`bool passed = testbed.run_all();`

			`EXPECT_TRUE(passed);`
			`}`

			`TEST(SM60_Epilogue_threadblock_epilogue, simt_f16_64x128_64x64x8) {`

			`//`
			`// Define the warp-level matrix multiply`
			`//`

			`using Element = cutlass::half_t;`
			`using ElementOutput = cutlass::half_t;`
			`using ElementAccumulator = cutlass::half_t;`
			`using ElementCompute = cutlass::half_t;`

			`int const kElementsPerAccess = 1;`

			`using Shape = cutlass::gemm::GemmShape<64, 128, 8>;`
			`using WarpShape = cutlass::gemm::GemmShape<64, 64, 8>;`
			`using ElementC = ElementAccumulator;`
			`using LayoutA = cutlass::layout::ColumnMajor;`
			`using LayoutB = cutlass::layout::RowMajor;`
			`using LayoutC = cutlass::layout::RowMajor;`

			`using ElementOutput = Element;`
			`using ElementAccumulator = Element;`
			`using ElementCompute = Element;`

			`using WarpMmaSimt = cutlass::gemm::warp::MmaSimt<`
			`WarpShape,`
			`Element,`
			`LayoutA,`
			`Element,`
			`LayoutB,`
			`Element,`
			`LayoutC,`
			`cutlass::gemm::warp::MmaSimtPolicy<`
			`cutlass::MatrixShape<4, 8>,`
			`cutlass::layout::RowMajorInterleaved<2>,`
			`cutlass::gemm::GemmShape<8, 4, 1>`
			`>`
			`>;`

			`//`
			`// Output operator`
			`//`

			`using OutputOp = cutlass::epilogue::thread::LinearCombination<`
			`ElementOutput,`
			`kElementsPerAccess,`
			`ElementAccumulator,`
			`ElementCompute`
			`>;`

			`//`
			`// Define the epilogue`
			`//`

			`using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<`
			`Shape,`
			`WarpMmaSimt,`
			`OutputOp,`
			`kElementsPerAccess`
			`>::Epilogue;`

			`//`
			`// Instantiate epilogue`
			`//`

			`EpilogueTestbed<Epilogue> testbed;`

			`bool passed = testbed.run_all();`

			`EXPECT_TRUE(passed);`
			`}`

			`TEST(SM60_Epilogue_threadblock_epilogue, simt_f16_128x128_64x64x8) {`

			`//`
			`// Define the warp-level matrix multiply`
			`//`

			`using Element = cutlass::half_t;`
			`using ElementOutput = cutlass::half_t;`
			`using ElementAccumulator = cutlass::half_t;`
			`using ElementCompute = cutlass::half_t;`

			`int const kElementsPerAccess = 1;`

			`using Shape = cutlass::gemm::GemmShape<128, 128, 8>;`
			`using WarpShape = cutlass::gemm::GemmShape<64, 64, 8>;`
			`using ElementC = ElementAccumulator;`
			`using LayoutA = cutlass::layout::ColumnMajor;`
			`using LayoutB = cutlass::layout::RowMajor;`
			`using LayoutC = cutlass::layout::RowMajor;`

			`using ElementOutput = Element;`
			`using ElementAccumulator = Element;`
			`using ElementCompute = Element;`

			`using WarpMmaSimt = cutlass::gemm::warp::MmaSimt<`
			`WarpShape,`
			`Element,`
			`LayoutA,`
			`Element,`
			`LayoutB,`
			`Element,`
			`LayoutC,`
			`cutlass::gemm::warp::MmaSimtPolicy<`
			`cutlass::MatrixShape<4, 8>,`
			`cutlass::layout::RowMajorInterleaved<2>,`
			`cutlass::gemm::GemmShape<8, 4, 1>`
			`>`
			`>;`

			`//`
			`// Output operator`
			`//`

			`using OutputOp = cutlass::epilogue::thread::LinearCombination<`
			`ElementOutput,`
			`kElementsPerAccess,`
			`ElementAccumulator,`
			`ElementCompute`
			`>;`

			`//`
			`// Define the epilogue`
			`//`

			`using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<`
			`Shape,`
			`WarpMmaSimt,`
			`OutputOp,`
			`kElementsPerAccess`
			`>::Epilogue;`

			`//`
			`// Instantiate epilogue`
			`//`

			`EpilogueTestbed<Epilogue> testbed;`

			`bool passed = testbed.run_all();`

			`EXPECT_TRUE(passed);`
			`}`

			`TEST(SM60_Epilogue_threadblock_epilogue, simt_f16_128x256_64x64x8) {`

			`//`
			`// Define the warp-level matrix multiply`
			`//`

			`using Element = cutlass::half_t;`
			`using ElementOutput = cutlass::half_t;`
			`using ElementAccumulator = cutlass::half_t;`
			`using ElementCompute = cutlass::half_t;`

			`int const kElementsPerAccess = 1;`

			`using Shape = cutlass::gemm::GemmShape<128, 256, 8>;`
			`using WarpShape = cutlass::gemm::GemmShape<64, 64, 8>;`
			`using ElementC = ElementAccumulator;`
			`using LayoutA = cutlass::layout::ColumnMajor;`
			`using LayoutB = cutlass::layout::RowMajor;`
			`using LayoutC = cutlass::layout::RowMajor;`

			`using ElementOutput = Element;`
			`using ElementAccumulator = Element;`
			`using ElementCompute = Element;`

			`using WarpMmaSimt = cutlass::gemm::warp::MmaSimt<`
			`WarpShape,`
			`Element,`
			`LayoutA,`
			`Element,`
			`LayoutB,`
			`Element,`
			`LayoutC,`
			`cutlass::gemm::warp::MmaSimtPolicy<`
			`cutlass::MatrixShape<4, 8>,`
			`cutlass::layout::RowMajorInterleaved<2>,`
			`cutlass::gemm::GemmShape<8, 4, 1>`
			`>`
			`>;`

			`//`
			`// Output operator`
			`//`

			`using OutputOp = cutlass::epilogue::thread::LinearCombination<`
			`ElementOutput,`
			`kElementsPerAccess,`
			`ElementAccumulator,`
			`ElementCompute`
			`>;`

			`//`
			`// Define the epilogue`
			`//`

			`using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<`
			`Shape,`
			`WarpMmaSimt,`
			`OutputOp,`
			`kElementsPerAccess`
			`>::Epilogue;`

			`//`
			`// Instantiate epilogue`
			`//`

			`EpilogueTestbed<Epilogue> testbed;`

			`bool passed = testbed.run_all();`

			`EXPECT_TRUE(passed);`
			`}`

			`TEST(SM60_Epilogue_threadblock_epilogue, simt_f16_256x128_64x64x8) {`

			`//`
			`// Define the warp-level matrix multiply`
			`//`

			`using Element = cutlass::half_t;`
			`using ElementOutput = cutlass::half_t;`
			`using ElementAccumulator = cutlass::half_t;`
			`using ElementCompute = cutlass::half_t;`

			`int const kElementsPerAccess = 1;`

			`using Shape = cutlass::gemm::GemmShape<256, 128, 8>;`
			`using WarpShape = cutlass::gemm::GemmShape<64, 64, 8>;`
			`using ElementC = ElementAccumulator;`
			`using LayoutA = cutlass::layout::ColumnMajor;`
			`using LayoutB = cutlass::layout::RowMajor;`
			`using LayoutC = cutlass::layout::RowMajor;`

			`using ElementOutput = Element;`
			`using ElementAccumulator = Element;`
			`using ElementCompute = Element;`

			`using WarpMmaSimt = cutlass::gemm::warp::MmaSimt<`
			`WarpShape,`
			`Element,`
			`LayoutA,`
			`Element,`
			`LayoutB,`
			`Element,`
			`LayoutC,`
			`cutlass::gemm::warp::MmaSimtPolicy<`
			`cutlass::MatrixShape<4, 8>,`
			`cutlass::layout::RowMajorInterleaved<2>,`
			`cutlass::gemm::GemmShape<8, 4, 1>`
			`>`
			`>;`

			`//`
			`// Output operator`
			`//`

			`using OutputOp = cutlass::epilogue::thread::LinearCombination<`
			`ElementOutput,`
			`kElementsPerAccess,`
			`ElementAccumulator,`
			`ElementCompute`
			`>;`

			`//`
			`// Define the epilogue`
			`//`

			`using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimt<`
			`Shape,`
			`WarpMmaSimt,`
			`OutputOp,`
			`kElementsPerAccess`
			`>::Epilogue;`

			`//`
			`// Instantiate epilogue`
			`//`

			`EpilogueTestbed<Epilogue> testbed;`

			`bool passed = testbed.run_all();`

			`EXPECT_TRUE(passed);`
			`}`

			`///////////////////////////////////////////////////////////////////////////////////////////////////`