cutlass/test/unit/epilogue/threadblock/testbed.h

/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
    \brief Unit tests for epilogues
*/
#pragma once

#include <fstream>
#include <cfenv>

#include "../../common/cutlass_unit_test.h"

#include "cutlass/aligned_buffer.h"
#include "cutlass/half.h"
#include "cutlass/complex.h"
#include "cutlass/quaternion.h"
#include "cutlass/epilogue/thread/linear_combination.h"

#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/host/tensor_fill.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace test {
namespace kernel {

template <typename Epilogue>
__global__ void epilogue_threadblock(
  typename Epilogue::OutputTileIterator::Params params_D,
  typename Epilogue::OutputTileIterator::Element *ptr_D,
  typename Epilogue::OutputTileIterator::Params params_C,
  typename Epilogue::OutputTileIterator::Element *ptr_C,
  typename Epilogue::OutputOp::Params params_output_op,
  cutlass::MatrixCoord problem_size,
  cutlass::TensorRef<
    typename Epilogue::WarpMmaOperator::ElementC, 
    typename Epilogue::WarpMmaOperator::LayoutC> accumulator_ref,
    int epilogue_count = 1) {

  __shared__ typename Epilogue::SharedStorage shared_storage;

  int thread_idx = threadIdx.x;
  int warp_idx = threadIdx.x / 32;
  int lane_idx = threadIdx.x % 32;

  //
  // Construct the epilogue
  //

  // Tile iterator writing to output tile
  typename Epilogue::OutputTileIterator iterator_D(
    params_D,
    ptr_D,
    problem_size,
    thread_idx
  );

  // Tile iterator writing to output tile
  typename Epilogue::OutputTileIterator iterator_C(
    params_C,
    ptr_C,
    problem_size,
    thread_idx
  );

  // Epilogue operator
  Epilogue epilogue(
    shared_storage, 
    thread_idx, 
    warp_idx, 
    lane_idx);

  //
  // Initialize the accumulators
  //

  int warp_mn = warp_idx % (Epilogue::WarpCount::kM * Epilogue::WarpCount::kN);
  int warp_m = warp_mn % Epilogue::WarpCount::kM;
  int warp_n = warp_mn / Epilogue::WarpCount::kM;

  accumulator_ref.add_coord_offset({
    warp_m * Epilogue::WarpMmaOperator::Shape::kM, 
    warp_n * Epilogue::WarpMmaOperator::Shape::kN});

  typename Epilogue::WarpMmaOperator::IteratorC accumulator_iterator(accumulator_ref, lane_idx);
  
  typename Epilogue::AccumulatorTile accumulators;

  accumulators.clear();
  accumulator_iterator.load(accumulators);

#if 0
  // For debugging, enable this block of code to fill each accumulator element with its
  // source thread ID.
  CUTLASS_PRAGMA_UNROLL
  for (int i = 0; i < accumulators.size(); ++i) {
    typename Epilogue::WarpMmaOperator::ElementC x(threadIdx.x);
    //typename Epilogue::WarpMmaOperator::ElementC x(i);
    accumulators[i] = x;
  }

  /*
  #pragma unroll 1
  for (int tid = 0; tid < 32; ++tid) {
    if (tid == thread_idx) {
      printf("\nT%d: ", thread_idx);
      CUTLASS_PRAGMA_UNROLL
      for (int i = 0; i < accumulators.size(); ++i) {
        printf("%d ", int(accumulators[i]));
      }  
    }
  }

  if (thread_idx == 0) {
    printf("\n\n");  
  }
  */

  __syncthreads();

#endif

  //
  // Perform the epilogue operation
  //

  typename Epilogue::OutputOp output_op(params_output_op);

  // Place the epilogue in a loop
  for (int iter = 0; iter < epilogue_count; ++iter) {
    epilogue(output_op, iterator_D, accumulators, iterator_C);
  }
}

} // namespace kernel
} // namespace test


/////////////////////////////////////////////////////////////////////////////////////////////////

template <
  typename Epilogue_
>
class EpilogueTestbed {
public:

  using Epilogue = Epilogue_;
  using ElementAccumulator = typename Epilogue::ElementAccumulator;
  using ElementCompute = typename Epilogue::OutputOp::ElementCompute;
  using ElementOutput = typename Epilogue::ElementOutput;
  using OutputOpParams = typename Epilogue::OutputOp::Params;

public:

  //
  // Data members
  //

  cutlass::MatrixCoord quantized_size;
  cutlass::HostTensor<ElementAccumulator, cutlass::layout::RowMajor> accumulator_tensor;
  cutlass::HostTensor<ElementOutput, cutlass::layout::RowMajor> source_tensor;
  cutlass::HostTensor<ElementOutput, cutlass::layout::RowMajor> output_tensor;

public:

  //
  // Methods
  //

  EpilogueTestbed(): 
    quantized_size(Epilogue::Shape::kM, Epilogue::Shape::kN),
    accumulator_tensor({Epilogue::Shape::kM, Epilogue::Shape::kN}),
    source_tensor({Epilogue::Shape::kM, Epilogue::Shape::kN}),
    output_tensor({Epilogue::Shape::kM, Epilogue::Shape::kN}) {

    //
    // Initialize problem space
    //

    uint64_t seed = 2019;

    cutlass::reference::host::TensorFillRandomUniform(
      accumulator_tensor.host_view(), 
      seed, 
      20, 
      -20, 
      0);

    cutlass::reference::host::TensorFillRandomUniform(
      source_tensor.host_view(),
      seed + 2018, 
      20, 
      -20, 
      0);
  }

  bool run_all() {
   
    double alpha_values[] = {1, 0, 2.25};
    double beta_values[] = {0, 1, -1.25};

    // Test runtime explodes if we tried to test every case exhaustively. This tests the full
    // output tile and several smaller sizes to stress predication.
    for (int m_idx = 0; m_idx < 3; ++m_idx) {
      for (int n_idx = 0; n_idx < 3; ++n_idx) {

        int m = quantized_size.row() - m_idx * 3;
        int n = quantized_size.column() - n_idx * Epilogue::kElementsPerAccess;

        for (double const &alpha : alpha_values) {
          for (double const &beta : beta_values) {

            bool passed = run({m, n}, {cutlass::from_real<ElementCompute>(alpha), cutlass::from_real<ElementCompute>(beta)});

            if (!passed) {
              return false;
            }
          }
        }
      }
    }

    return true;
  }

  /// Runs the test
  bool run(
    cutlass::MatrixCoord problem_size,
    OutputOpParams output_params) { 

    //
    // Initialize problem space
    //

    ElementOutput default_output = ElementOutput(-127);
    cutlass::reference::host::TensorFill(output_tensor.host_view(), default_output);

    accumulator_tensor.sync_device();
    output_tensor.sync_device();
    source_tensor.sync_device();

    //
    // Initialize epilogue parameters
    //

    typename Epilogue::OutputTileIterator::Params params_D(output_tensor.device_ref().layout());
    typename Epilogue::OutputTileIterator::Params params_C(source_tensor.device_ref().layout());

    //
    // Launch kernel
    //

    dim3 grid(1, 1);
    dim3 block(Epilogue::WarpCount::kCount * 32, 1);

    test::kernel::epilogue_threadblock<Epilogue><<< grid, block >>>(
      params_D,
      output_tensor.device_data(),
      params_C,
      source_tensor.device_data(),
      output_params,
      problem_size, 
      accumulator_tensor.device_view());

    cudaError_t result = cudaDeviceSynchronize();

    if (result != cudaSuccess) {
      std::cerr << "Kernel error: " << cudaGetErrorString(result) << std::endl;
      return false;
    }

    //
    // Verify results
    //
    output_tensor.sync_host();

    int errors = 0;
    int const kMaxErrors = 5;

    for (int r = 0; errors < kMaxErrors && r < quantized_size.row(); ++r) {
      for (int c = 0; errors < kMaxErrors && c < quantized_size.column(); ++c) {

        cutlass::MatrixCoord coord{r, c};
        ElementOutput got = output_tensor.at(coord);
        
        ElementOutput expected;
        if (coord.row() < problem_size.row() && coord.column() < problem_size.column()) {
          ElementCompute intermediate =
            output_params.alpha * ElementCompute(accumulator_tensor.at(coord)) + 
            output_params.beta * ElementCompute(source_tensor.at(coord));
          
          if (std::numeric_limits<ElementOutput>::is_integer
              && !std::numeric_limits<ElementCompute>::is_integer) {
            std::fesetround(FE_TONEAREST);
            expected = ElementOutput(std::nearbyint(float(cutlass::real(intermediate))));
          } else {
            expected = ElementOutput(intermediate);
          }
        } else {
          expected = default_output;
        }

        if (expected != got) {

          using OutputIO = cutlass::ScalarIO<ElementOutput>;

          EXPECT_TRUE(false)
            << "-------\n"
            << "Error - output element (" << coord << ") - expected: " 
            << OutputIO(expected) 
            << ",  got: " << OutputIO(got)
            << ",  accum: " << (accumulator_tensor.at(coord))
            << ",  source: " << OutputIO(source_tensor.at(coord))
            << ",  alpha: " << (output_params.alpha)
            << ",  beta: " << (output_params.beta) << "\n";

          ++errors;
        }
      }
    }

    //
    // Report results on error
    //

    if (errors) {
      std::stringstream ss;
      ss 
        << "output_tensor_op_" << Epilogue::Shape::kM << "x" << Epilogue::Shape::kN << "_" 
        << Epilogue::WarpTileIterator::WarpShape::kM << "x" 
        << Epilogue::WarpTileIterator::WarpShape::kN 
        << "_slice_" << Epilogue::WarpCount::kK << ".csv"; 

      std::ofstream output_file(ss.str()); 
      output_file << output_tensor.host_view(); 
    }

    return !errors;
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////
CUTLASS 2.0 (#62) CUTLASS 2.0 Substantially refactored for - Better performance, particularly for native Turing Tensor Cores - Robust and durable templates spanning the design space - Encapsulated functionality embodying modern C++11 programming techniques - Optimized containers and data types for efficient, generic, portable device code Updates to: - Quick start guide - Documentation - Utilities - CUTLASS Profiler Native Turing Tensor Cores - Efficient GEMM kernels targeting Turing Tensor Cores - Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands Coverage of existing CUTLASS functionality: - GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs - Volta Tensor Cores through native mma.sync and through WMMA API - Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions - Batched GEMM operations - Complex-valued GEMMs Note: this commit and all that follow require a host compiler supporting C++11 or greater. 2019-11-20 08:55:34 +08:00			`/***************************************************************************************************`
New updates for 2.11 (#775) * New updates. * Minor profiler updates Co-authored-by: Aniket Shivam <ashivam@nvidia.com> 2023-01-21 05:32:57 +08:00			`* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.`
CUTLASS 2.9 (#468) 2022-04-24 03:02:38 +08:00			`* SPDX-License-Identifier: BSD-3-Clause`
CUTLASS 2.0 (#62) CUTLASS 2.0 Substantially refactored for - Better performance, particularly for native Turing Tensor Cores - Robust and durable templates spanning the design space - Encapsulated functionality embodying modern C++11 programming techniques - Optimized containers and data types for efficient, generic, portable device code Updates to: - Quick start guide - Documentation - Utilities - CUTLASS Profiler Native Turing Tensor Cores - Efficient GEMM kernels targeting Turing Tensor Cores - Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands Coverage of existing CUTLASS functionality: - GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs - Volta Tensor Cores through native mma.sync and through WMMA API - Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions - Batched GEMM operations - Complex-valued GEMMs Note: this commit and all that follow require a host compiler supporting C++11 or greater. 2019-11-20 08:55:34 +08:00			`*`
CUTLASS 2.9 (#468) 2022-04-24 03:02:38 +08:00			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions are met:`
CUTLASS 2.0 (#62) CUTLASS 2.0 Substantially refactored for - Better performance, particularly for native Turing Tensor Cores - Robust and durable templates spanning the design space - Encapsulated functionality embodying modern C++11 programming techniques - Optimized containers and data types for efficient, generic, portable device code Updates to: - Quick start guide - Documentation - Utilities - CUTLASS Profiler Native Turing Tensor Cores - Efficient GEMM kernels targeting Turing Tensor Cores - Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands Coverage of existing CUTLASS functionality: - GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs - Volta Tensor Cores through native mma.sync and through WMMA API - Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions - Batched GEMM operations - Complex-valued GEMMs Note: this commit and all that follow require a host compiler supporting C++11 or greater. 2019-11-20 08:55:34 +08:00			`*`
CUTLASS 2.9 (#468) 2022-04-24 03:02:38 +08:00			`* 1. Redistributions of source code must retain the above copyright notice, this`
			`* list of conditions and the following disclaimer.`
			`*`
			`* 2. Redistributions in binary form must reproduce the above copyright notice,`
			`* this list of conditions and the following disclaimer in the documentation`
			`* and/or other materials provided with the distribution.`
			`*`
			`* 3. Neither the name of the copyright holder nor the names of its`
			`* contributors may be used to endorse or promote products derived from`
			`* this software without specific prior written permission.`
			`*`
			`* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"`
			`* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE`
			`* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE`
			`* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL`
			`* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR`
			`* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER`
			`* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,`
			`* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
CUTLASS 2.0 (#62) CUTLASS 2.0 Substantially refactored for - Better performance, particularly for native Turing Tensor Cores - Robust and durable templates spanning the design space - Encapsulated functionality embodying modern C++11 programming techniques - Optimized containers and data types for efficient, generic, portable device code Updates to: - Quick start guide - Documentation - Utilities - CUTLASS Profiler Native Turing Tensor Cores - Efficient GEMM kernels targeting Turing Tensor Cores - Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands Coverage of existing CUTLASS functionality: - GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs - Volta Tensor Cores through native mma.sync and through WMMA API - Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions - Batched GEMM operations - Complex-valued GEMMs Note: this commit and all that follow require a host compiler supporting C++11 or greater. 2019-11-20 08:55:34 +08:00			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*`
			`**************************************************************************************************/`
			`/*! \file`
			`\brief Unit tests for epilogues`
			`*/`
			`#pragma once`

			`#include <fstream>`
CUTLASS 2.6 (#298) CUTLASS 2.6 2021-07-23 12:40:53 +08:00			`#include <cfenv>`
CUTLASS 2.0 (#62) CUTLASS 2.0 Substantially refactored for - Better performance, particularly for native Turing Tensor Cores - Robust and durable templates spanning the design space - Encapsulated functionality embodying modern C++11 programming techniques - Optimized containers and data types for efficient, generic, portable device code Updates to: - Quick start guide - Documentation - Utilities - CUTLASS Profiler Native Turing Tensor Cores - Efficient GEMM kernels targeting Turing Tensor Cores - Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands Coverage of existing CUTLASS functionality: - GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs - Volta Tensor Cores through native mma.sync and through WMMA API - Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions - Batched GEMM operations - Complex-valued GEMMs Note: this commit and all that follow require a host compiler supporting C++11 or greater. 2019-11-20 08:55:34 +08:00
			`#include "../../common/cutlass_unit_test.h"`

			`#include "cutlass/aligned_buffer.h"`
			`#include "cutlass/half.h"`
			`#include "cutlass/complex.h"`
CUTLASS 2.6 (#298) CUTLASS 2.6 2021-07-23 12:40:53 +08:00			`#include "cutlass/quaternion.h"`
CUTLASS 2.0 (#62) CUTLASS 2.0 Substantially refactored for - Better performance, particularly for native Turing Tensor Cores - Robust and durable templates spanning the design space - Encapsulated functionality embodying modern C++11 programming techniques - Optimized containers and data types for efficient, generic, portable device code Updates to: - Quick start guide - Documentation - Utilities - CUTLASS Profiler Native Turing Tensor Cores - Efficient GEMM kernels targeting Turing Tensor Cores - Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands Coverage of existing CUTLASS functionality: - GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs - Volta Tensor Cores through native mma.sync and through WMMA API - Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions - Batched GEMM operations - Complex-valued GEMMs Note: this commit and all that follow require a host compiler supporting C++11 or greater. 2019-11-20 08:55:34 +08:00			`#include "cutlass/epilogue/thread/linear_combination.h"`

			`#include "cutlass/util/host_tensor.h"`
			`#include "cutlass/util/tensor_view_io.h"`
			`#include "cutlass/util/reference/host/tensor_fill.h"`

			`/////////////////////////////////////////////////////////////////////////////////////////////////`

			`namespace test {`
			`namespace kernel {`

			`template <typename Epilogue>`
			`__global__ void epilogue_threadblock(`
			`typename Epilogue::OutputTileIterator::Params params_D,`
			`typename Epilogue::OutputTileIterator::Element *ptr_D,`
			`typename Epilogue::OutputTileIterator::Params params_C,`
			`typename Epilogue::OutputTileIterator::Element *ptr_C,`
			`typename Epilogue::OutputOp::Params params_output_op,`
			`cutlass::MatrixCoord problem_size,`
			`cutlass::TensorRef<`
			`typename Epilogue::WarpMmaOperator::ElementC,`
			`typename Epilogue::WarpMmaOperator::LayoutC> accumulator_ref,`
			`int epilogue_count = 1) {`

			`__shared__ typename Epilogue::SharedStorage shared_storage;`

			`int thread_idx = threadIdx.x;`
			`int warp_idx = threadIdx.x / 32;`
			`int lane_idx = threadIdx.x % 32;`

			`//`
			`// Construct the epilogue`
			`//`

			`// Tile iterator writing to output tile`
			`typename Epilogue::OutputTileIterator iterator_D(`
			`params_D,`
			`ptr_D,`
			`problem_size,`
			`thread_idx`
			`);`

			`// Tile iterator writing to output tile`
			`typename Epilogue::OutputTileIterator iterator_C(`
			`params_C,`
			`ptr_C,`
			`problem_size,`
			`thread_idx`
			`);`

			`// Epilogue operator`
			`Epilogue epilogue(`
			`shared_storage,`
			`thread_idx,`
			`warp_idx,`
			`lane_idx);`

			`//`
			`// Initialize the accumulators`
			`//`

			`int warp_mn = warp_idx % (Epilogue::WarpCount::kM * Epilogue::WarpCount::kN);`
			`int warp_m = warp_mn % Epilogue::WarpCount::kM;`
			`int warp_n = warp_mn / Epilogue::WarpCount::kM;`

			`accumulator_ref.add_coord_offset({`
			`warp_m * Epilogue::WarpMmaOperator::Shape::kM,`
			`warp_n * Epilogue::WarpMmaOperator::Shape::kN});`

			`typename Epilogue::WarpMmaOperator::IteratorC accumulator_iterator(accumulator_ref, lane_idx);`

			`typename Epilogue::AccumulatorTile accumulators;`

			`accumulators.clear();`
			`accumulator_iterator.load(accumulators);`

			`#if 0`
			`// For debugging, enable this block of code to fill each accumulator element with its`
			`// source thread ID.`
			`CUTLASS_PRAGMA_UNROLL`
			`for (int i = 0; i < accumulators.size(); ++i) {`
			`typename Epilogue::WarpMmaOperator::ElementC x(threadIdx.x);`
			`//typename Epilogue::WarpMmaOperator::ElementC x(i);`
			`accumulators[i] = x;`
			`}`

			`/*`
			`#pragma unroll 1`
			`for (int tid = 0; tid < 32; ++tid) {`
			`if (tid == thread_idx) {`
			`printf("\nT%d: ", thread_idx);`
			`CUTLASS_PRAGMA_UNROLL`
			`for (int i = 0; i < accumulators.size(); ++i) {`
			`printf("%d ", int(accumulators[i]));`
			`}`
			`}`
			`}`

			`if (thread_idx == 0) {`
			`printf("\n\n");`
			`}`
			`*/`

			`__syncthreads();`

			`#endif`

			`//`
			`// Perform the epilogue operation`
			`//`

			`typename Epilogue::OutputOp output_op(params_output_op);`

			`// Place the epilogue in a loop`
			`for (int iter = 0; iter < epilogue_count; ++iter) {`
			`epilogue(output_op, iterator_D, accumulators, iterator_C);`
			`}`
			`}`

			`} // namespace kernel`
			`} // namespace test`


			`/////////////////////////////////////////////////////////////////////////////////////////////////`

			`template <`
			`typename Epilogue_`
			`>`
			`class EpilogueTestbed {`
			`public:`

			`using Epilogue = Epilogue_;`
			`using ElementAccumulator = typename Epilogue::ElementAccumulator;`
			`using ElementCompute = typename Epilogue::OutputOp::ElementCompute;`
			`using ElementOutput = typename Epilogue::ElementOutput;`
			`using OutputOpParams = typename Epilogue::OutputOp::Params;`

			`public:`

			`//`
			`// Data members`
			`//`

			`cutlass::MatrixCoord quantized_size;`
			`cutlass::HostTensor<ElementAccumulator, cutlass::layout::RowMajor> accumulator_tensor;`
			`cutlass::HostTensor<ElementOutput, cutlass::layout::RowMajor> source_tensor;`
			`cutlass::HostTensor<ElementOutput, cutlass::layout::RowMajor> output_tensor;`

			`public:`

			`//`
			`// Methods`
			`//`

			`EpilogueTestbed():`
			`quantized_size(Epilogue::Shape::kM, Epilogue::Shape::kN),`
			`accumulator_tensor({Epilogue::Shape::kM, Epilogue::Shape::kN}),`
			`source_tensor({Epilogue::Shape::kM, Epilogue::Shape::kN}),`
			`output_tensor({Epilogue::Shape::kM, Epilogue::Shape::kN}) {`

			`//`
			`// Initialize problem space`
			`//`

			`uint64_t seed = 2019;`

			`cutlass::reference::host::TensorFillRandomUniform(`
			`accumulator_tensor.host_view(),`
			`seed,`
			`20,`
			`-20,`
			`0);`

			`cutlass::reference::host::TensorFillRandomUniform(`
			`source_tensor.host_view(),`
			`seed + 2018,`
			`20,`
			`-20,`
			`0);`
			`}`

			`bool run_all() {`

			`double alpha_values[] = {1, 0, 2.25};`
			`double beta_values[] = {0, 1, -1.25};`

			`// Test runtime explodes if we tried to test every case exhaustively. This tests the full`
			`// output tile and several smaller sizes to stress predication.`
			`for (int m_idx = 0; m_idx < 3; ++m_idx) {`
			`for (int n_idx = 0; n_idx < 3; ++n_idx) {`

			`int m = quantized_size.row() - m_idx * 3;`
			`int n = quantized_size.column() - n_idx * Epilogue::kElementsPerAccess;`

			`for (double const &alpha : alpha_values) {`
			`for (double const &beta : beta_values) {`

			`bool passed = run({m, n}, {cutlass::from_real<ElementCompute>(alpha), cutlass::from_real<ElementCompute>(beta)});`

			`if (!passed) {`
			`return false;`
			`}`
			`}`
			`}`
			`}`
			`}`

			`return true;`
			`}`

			`/// Runs the test`
			`bool run(`
			`cutlass::MatrixCoord problem_size,`
			`OutputOpParams output_params) {`

			`//`
			`// Initialize problem space`
			`//`

			`ElementOutput default_output = ElementOutput(-127);`
			`cutlass::reference::host::TensorFill(output_tensor.host_view(), default_output);`

			`accumulator_tensor.sync_device();`
			`output_tensor.sync_device();`
			`source_tensor.sync_device();`

			`//`
			`// Initialize epilogue parameters`
			`//`

			`typename Epilogue::OutputTileIterator::Params params_D(output_tensor.device_ref().layout());`
			`typename Epilogue::OutputTileIterator::Params params_C(source_tensor.device_ref().layout());`

			`//`
			`// Launch kernel`
			`//`

			`dim3 grid(1, 1);`
			`dim3 block(Epilogue::WarpCount::kCount * 32, 1);`

			`test::kernel::epilogue_threadblock<Epilogue><<< grid, block >>>(`
			`params_D,`
			`output_tensor.device_data(),`
			`params_C,`
			`source_tensor.device_data(),`
			`output_params,`
			`problem_size,`
			`accumulator_tensor.device_view());`

			`cudaError_t result = cudaDeviceSynchronize();`

			`if (result != cudaSuccess) {`
			`std::cerr << "Kernel error: " << cudaGetErrorString(result) << std::endl;`
			`return false;`
			`}`

			`//`
			`// Verify results`
			`//`
			`output_tensor.sync_host();`

			`int errors = 0;`
			`int const kMaxErrors = 5;`

			`for (int r = 0; errors < kMaxErrors && r < quantized_size.row(); ++r) {`
			`for (int c = 0; errors < kMaxErrors && c < quantized_size.column(); ++c) {`

			`cutlass::MatrixCoord coord{r, c};`
			`ElementOutput got = output_tensor.at(coord);`

			`ElementOutput expected;`
			`if (coord.row() < problem_size.row() && coord.column() < problem_size.column()) {`
CUTLASS 2.6 (#298) CUTLASS 2.6 2021-07-23 12:40:53 +08:00			`ElementCompute intermediate =`
			`output_params.alpha * ElementCompute(accumulator_tensor.at(coord)) +`
			`output_params.beta * ElementCompute(source_tensor.at(coord));`

			`if (std::numeric_limits<ElementOutput>::is_integer`
			`&& !std::numeric_limits<ElementCompute>::is_integer) {`
			`std::fesetround(FE_TONEAREST);`
			`expected = ElementOutput(std::nearbyint(float(cutlass::real(intermediate))));`
			`} else {`
			`expected = ElementOutput(intermediate);`
			`}`
			`} else {`
CUTLASS 2.0 (#62) CUTLASS 2.0 Substantially refactored for - Better performance, particularly for native Turing Tensor Cores - Robust and durable templates spanning the design space - Encapsulated functionality embodying modern C++11 programming techniques - Optimized containers and data types for efficient, generic, portable device code Updates to: - Quick start guide - Documentation - Utilities - CUTLASS Profiler Native Turing Tensor Cores - Efficient GEMM kernels targeting Turing Tensor Cores - Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands Coverage of existing CUTLASS functionality: - GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs - Volta Tensor Cores through native mma.sync and through WMMA API - Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions - Batched GEMM operations - Complex-valued GEMMs Note: this commit and all that follow require a host compiler supporting C++11 or greater. 2019-11-20 08:55:34 +08:00			`expected = default_output;`
			`}`

			`if (expected != got) {`

			`using OutputIO = cutlass::ScalarIO<ElementOutput>;`

			`EXPECT_TRUE(false)`
			`<< "-------\n"`
			`<< "Error - output element (" << coord << ") - expected: "`
			`<< OutputIO(expected)`
CUTLASS 2.6 (#298) CUTLASS 2.6 2021-07-23 12:40:53 +08:00			`<< ", got: " << OutputIO(got)`
			`<< ", accum: " << (accumulator_tensor.at(coord))`
			`<< ", source: " << OutputIO(source_tensor.at(coord))`
			`<< ", alpha: " << (output_params.alpha)`
			`<< ", beta: " << (output_params.beta) << "\n";`
CUTLASS 2.0 (#62) CUTLASS 2.0 Substantially refactored for - Better performance, particularly for native Turing Tensor Cores - Robust and durable templates spanning the design space - Encapsulated functionality embodying modern C++11 programming techniques - Optimized containers and data types for efficient, generic, portable device code Updates to: - Quick start guide - Documentation - Utilities - CUTLASS Profiler Native Turing Tensor Cores - Efficient GEMM kernels targeting Turing Tensor Cores - Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands Coverage of existing CUTLASS functionality: - GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs - Volta Tensor Cores through native mma.sync and through WMMA API - Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions - Batched GEMM operations - Complex-valued GEMMs Note: this commit and all that follow require a host compiler supporting C++11 or greater. 2019-11-20 08:55:34 +08:00
			`++errors;`
			`}`
			`}`
			`}`

			`//`
			`// Report results on error`
			`//`

			`if (errors) {`
			`std::stringstream ss;`
			`ss`
			`<< "output_tensor_op_" << Epilogue::Shape::kM << "x" << Epilogue::Shape::kN << "_"`
			`<< Epilogue::WarpTileIterator::WarpShape::kM << "x"`
			`<< Epilogue::WarpTileIterator::WarpShape::kN`
			`<< "_slice_" << Epilogue::WarpCount::kK << ".csv";`

			`std::ofstream output_file(ss.str());`
			`output_file << output_tensor.host_view();`
			`}`

			`return !errors;`
			`}`
			`};`

			`/////////////////////////////////////////////////////////////////////////////////////////////////`