/***************************************************************************************************
 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
#pragma once

#include <algorithm>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

#include "cutlass/util/platform.h"

#if defined(CUTLASS_OS_WINDOWS)
#include <Windows.h>
#else
// needed for sleep
#include <unistd.h>
#endif

#include "tools/test/perf/gemm/gemm_perf_testbed.h"
#include "tools/test/perf/testbench_configs.h"
#include "tools/test/perf/testbench_options.h"
#include "tools/test/perf/testbench_output.h"
////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
namespace perf {
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
/// Performance measuring testbed
|
|
|
|
template <typename AType,
|
|
|
|
typename BType,
|
|
|
|
typename CType,
|
|
|
|
typename AccumulatorType,
|
|
|
|
typename ScalarType>
|
|
|
|
class GemmProfiler {
|
|
|
|
public:
|
|
|
|
/// Test environment
|
|
|
|
typedef GemmTestbed<AType, BType, CType, AccumulatorType, ScalarType> PerfTestbed;
|
|
|
|
|
|
|
|
private:
|
|
|
|
//
|
|
|
|
// Data members
|
|
|
|
//
|
|
|
|
|
|
|
|
/// Reference to TestbenchOutput instance
|
2018-09-19 07:58:03 +08:00
|
|
|
TestbenchOutput<GemmProblem> &output;
|
2018-05-17 02:44:56 +08:00
|
|
|
|
|
|
|
/// Reference to options object
|
|
|
|
TestbenchOptions const &options;
|
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
// Reference to config object
|
|
|
|
Config const &config;
|
|
|
|
|
2018-05-17 02:44:56 +08:00
|
|
|
/// Performance test environment
|
|
|
|
PerfTestbed testbed;
|
|
|
|
|
|
|
|
/// Kernel name
|
|
|
|
std::string kernel_name;
|
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
/// Cutlass algorithm
|
|
|
|
std::string cutlass_algo;
|
|
|
|
|
2018-05-17 02:44:56 +08:00
|
|
|
/// Timing events
|
|
|
|
cudaEvent_t events[2];
|
|
|
|
|
|
|
|
public:
|
|
|
|
/// Delays
|
|
|
|
static void pause(int seconds) {
|
|
|
|
#if defined(WIN32)
|
|
|
|
Sleep(1000 * seconds);
|
|
|
|
#else
|
|
|
|
sleep(seconds);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
public:
|
|
|
|
//
|
|
|
|
// Methods
|
|
|
|
//
|
|
|
|
|
|
|
|
/// Constructs performance testebed
|
2018-09-19 07:58:03 +08:00
|
|
|
GemmProfiler(TestbenchOutput<GemmProblem> &_output,
|
2018-05-17 02:44:56 +08:00
|
|
|
std::string const &_kernel_name,
|
2018-09-19 07:58:03 +08:00
|
|
|
std::string const &_cutlass_algo,
|
|
|
|
TestbenchOptions const &_options,
|
|
|
|
Config const &_config)
|
2018-05-17 02:44:56 +08:00
|
|
|
: output(_output),
|
|
|
|
options(_options),
|
2018-09-19 07:58:03 +08:00
|
|
|
config(_config),
|
2018-05-17 02:44:56 +08:00
|
|
|
kernel_name(_kernel_name),
|
2018-09-19 07:58:03 +08:00
|
|
|
cutlass_algo(_cutlass_algo),
|
2018-05-17 02:44:56 +08:00
|
|
|
testbed(_options.initial_distribution) {
|
|
|
|
for (int i = 0; i < 2; ++i) {
|
|
|
|
cudaError_t result = cudaEventCreate(&events[i]);
|
|
|
|
if (result != cudaSuccess) {
|
|
|
|
throw std::runtime_error("GemmPerfTestbed() failed to create CUDA events");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
~GemmProfiler() {}
|
|
|
|
|
|
|
|
/// Writes the workspace to text files
|
2018-09-19 07:58:03 +08:00
|
|
|
void write_problem(Provider::Kind provider, std::string const &kernel_name) {
|
|
|
|
std::stringstream base_filename;
|
2018-05-17 02:44:56 +08:00
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
base_filename << provider << "_" << kernel_name << "_" << testbed.M() << "x" << testbed.N()
|
|
|
|
<< "x" << testbed.K();
|
2018-05-17 02:44:56 +08:00
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
std::string results_name = base_filename.str() + "_results.txt";
|
|
|
|
std::string errors_name = base_filename.str() + "_errors.txt";
|
2018-05-17 02:44:56 +08:00
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
std::ofstream results(results_name.c_str());
|
|
|
|
std::ofstream errors(errors_name.c_str());
|
|
|
|
testbed.write_problem(results, errors);
|
2018-05-17 02:44:56 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Profiles Cutlass
|
|
|
|
template <typename CutlassDispatch>
|
2018-09-19 07:58:03 +08:00
|
|
|
PerformanceResult<GemmProblem> execute_cutlass(GemmProblem const &problem,
|
|
|
|
cublasGemmAlgo_t algorithm) {
|
|
|
|
PerformanceResult<GemmProblem> result(
|
|
|
|
Provider::Cutlass
|
|
|
|
, kernel_name
|
|
|
|
, problem
|
|
|
|
);
|
|
|
|
|
|
|
|
if (options.dry_run) {
|
|
|
|
result.disposition = Disposition::NotRun;
|
|
|
|
return result;
|
|
|
|
}
|
2018-05-17 02:44:56 +08:00
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
if (CutlassDispatch::kRunCuBLAS) {
|
|
|
|
testbed.compute_reference(algorithm);
|
2018-05-17 02:44:56 +08:00
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
if (cudaDeviceSynchronize() != cudaSuccess) {
|
|
|
|
result.disposition = Disposition::NotVerified;
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
result.disposition = Disposition::Passed;
|
2018-05-17 02:44:56 +08:00
|
|
|
}
|
|
|
|
|
2018-10-27 05:38:46 +08:00
|
|
|
CutlassDispatch *dispatch_ptr;
|
|
|
|
|
|
|
|
// check to see if we need to launch batched strided gemm
|
|
|
|
if (testbed.batch_count() == 1) {
|
|
|
|
dispatch_ptr = new CutlassDispatch(testbed.M(),
|
|
|
|
testbed.N(),
|
|
|
|
testbed.K(),
|
|
|
|
testbed.alpha(),
|
|
|
|
testbed.ptr_A(),
|
|
|
|
testbed.lda(),
|
|
|
|
testbed.ptr_B(),
|
|
|
|
testbed.ldb(),
|
|
|
|
testbed.beta(),
|
|
|
|
testbed.ptr_C_initial(),
|
|
|
|
testbed.ldc(),
|
|
|
|
testbed.ptr_experimental(),
|
|
|
|
testbed.ldc());
|
|
|
|
|
|
|
|
dispatch_ptr->operator()();
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
dispatch_ptr = new CutlassDispatch(testbed.M(),
|
|
|
|
testbed.N(),
|
|
|
|
testbed.K(),
|
|
|
|
testbed.alpha(),
|
|
|
|
testbed.ptr_A(),
|
|
|
|
testbed.lda(),
|
|
|
|
testbed.batch_stride_a(),
|
|
|
|
testbed.ptr_B(),
|
|
|
|
testbed.ldb(),
|
|
|
|
testbed.batch_stride_b(),
|
|
|
|
testbed.beta(),
|
|
|
|
testbed.ptr_C_initial(),
|
|
|
|
testbed.ldc(),
|
|
|
|
testbed.batch_stride_c(),
|
|
|
|
testbed.ptr_experimental(),
|
|
|
|
testbed.ldc(),
|
|
|
|
testbed.batch_stride_c(),
|
|
|
|
testbed.batch_count());
|
|
|
|
|
|
|
|
dispatch_ptr->operator()();
|
|
|
|
}
|
2018-05-17 02:44:56 +08:00
|
|
|
|
|
|
|
if (cudaDeviceSynchronize() != cudaSuccess) {
|
|
|
|
result.disposition = Disposition::Failed;
|
2018-10-27 05:38:46 +08:00
|
|
|
delete dispatch_ptr;
|
2018-05-17 02:44:56 +08:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
if (CutlassDispatch::kRunCuBLAS) {
|
|
|
|
if (testbed.verify_with_reference()) {
|
|
|
|
result.disposition = Disposition::Passed;
|
|
|
|
} else {
|
|
|
|
result.disposition = Disposition::Incorrect;
|
|
|
|
}
|
2018-05-17 02:44:56 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (options.save_workspace(result.disposition == Disposition::Passed)) {
|
2018-09-19 07:58:03 +08:00
|
|
|
write_problem(Provider::Cutlass, kernel_name);
|
2018-05-17 02:44:56 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (cudaDeviceSynchronize() != cudaSuccess) {
|
|
|
|
result.disposition = Disposition::Failed;
|
|
|
|
}
|
|
|
|
|
|
|
|
// warmup launch
|
2018-10-27 05:38:46 +08:00
|
|
|
dispatch_ptr->operator()();
|
2018-05-17 02:44:56 +08:00
|
|
|
|
|
|
|
if (cudaDeviceSynchronize() != cudaSuccess) {
|
|
|
|
result.disposition = Disposition::Failed;
|
2018-10-27 05:38:46 +08:00
|
|
|
delete dispatch_ptr;
|
2018-05-17 02:44:56 +08:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (cudaEventRecord(events[0]) != cudaSuccess) {
|
|
|
|
result.disposition = Disposition::Failed;
|
2018-10-27 05:38:46 +08:00
|
|
|
delete dispatch_ptr;
|
2018-05-17 02:44:56 +08:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (int iter = 0; iter < options.iterations; ++iter) {
|
2018-10-27 05:38:46 +08:00
|
|
|
dispatch_ptr->operator()();
|
2018-05-17 02:44:56 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (cudaEventRecord(events[1]) != cudaSuccess) {
|
|
|
|
result.disposition = Disposition::Failed;
|
2018-10-27 05:38:46 +08:00
|
|
|
delete dispatch_ptr;
|
2018-05-17 02:44:56 +08:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (cudaEventSynchronize(events[1]) != cudaSuccess) {
|
|
|
|
result.disposition = Disposition::Failed;
|
2018-10-27 05:38:46 +08:00
|
|
|
delete dispatch_ptr;
|
2018-05-17 02:44:56 +08:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
float average_ms = 0;
|
|
|
|
if (cudaEventElapsedTime(&average_ms, events[0], events[1]) != cudaSuccess) {
|
|
|
|
result.disposition = Disposition::Failed;
|
2018-10-27 05:38:46 +08:00
|
|
|
delete dispatch_ptr;
|
2018-05-17 02:44:56 +08:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
result.runtime = double(average_ms) / double(options.iterations);
|
|
|
|
result.gflops = testbed.GFLOPs_per_sec(result.runtime);
|
|
|
|
|
|
|
|
if (result.disposition != Disposition::Passed) {
|
2018-09-19 07:58:03 +08:00
|
|
|
std::cout << "[\033[1;31mFAILED\033[0m]: " << kernel_name
|
|
|
|
<< " failed with disposition: " << result.disposition << "\n";
|
2018-05-17 02:44:56 +08:00
|
|
|
}
|
|
|
|
|
2018-10-27 05:38:46 +08:00
|
|
|
delete dispatch_ptr;
|
2018-05-17 02:44:56 +08:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
template <typename T, typename F>
|
|
|
|
bool contains(T const &container, F const &val) {
|
|
|
|
return std::find(container.begin(), container.end(), val) != container.end();
|
|
|
|
}
|
|
|
|
|
2018-05-17 02:44:56 +08:00
|
|
|
/// Executes all kernels for this problem size
|
|
|
|
template <typename CutlassDispatch>
|
2018-09-19 07:58:03 +08:00
|
|
|
std::vector<PerformanceResult<GemmProblem> > execute(GemmProblem const &problem) {
|
2018-05-17 02:44:56 +08:00
|
|
|
|
|
|
|
// New problem size
|
|
|
|
output.begin_problem();
|
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
bool const tensor_op = !(CutlassDispatch::kThreadMultiplyAdd);
|
|
|
|
cublasGemmAlgo_t algorithm = tensor_op ?
|
|
|
|
CUBLAS_GEMM_DEFAULT_TENSOR_OP : CUBLAS_GEMM_DEFAULT;
|
2018-05-17 02:44:56 +08:00
|
|
|
|
|
|
|
testbed.resize(problem);
|
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
std::vector<PerformanceResult<GemmProblem> > results;
|
2018-05-17 02:44:56 +08:00
|
|
|
|
2018-10-27 05:38:46 +08:00
|
|
|
results.push_back(execute_cutlass<CutlassDispatch>(problem, algorithm));
|
2018-05-17 02:44:56 +08:00
|
|
|
// cool-down period
|
2018-09-19 07:58:03 +08:00
|
|
|
if (!options.dry_run) {
|
|
|
|
pause(options.sleep_time);
|
|
|
|
}
|
2018-05-17 02:44:56 +08:00
|
|
|
|
|
|
|
return results;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Runs the test and collects performance for all results
|
|
|
|
template <typename CutlassDispatch>
|
2018-10-27 05:38:46 +08:00
|
|
|
void schmoo(Range const &M, Range const &N, Range const &K, Range const &batch_count) {
|
|
|
|
for (int b = batch_count.start; b <= batch_count.end; b = batch_count.next(b)) {
|
|
|
|
for (int m = M.start; m <= M.end; m = M.next(m)) {
|
|
|
|
for (int n = N.start; n <= N.end; n = N.next(n)) {
|
|
|
|
for (int k = K.start; k <= K.end; k = K.next(k)) {
|
|
|
|
std::vector<PerformanceResult<GemmProblem> > results =
|
2018-05-17 02:44:56 +08:00
|
|
|
execute<CutlassDispatch>(GemmProblem(m,
|
2018-10-27 05:38:46 +08:00
|
|
|
n,
|
|
|
|
k,
|
|
|
|
CutlassDispatch::kLayoutA,
|
|
|
|
CutlassDispatch::kLayoutB,
|
|
|
|
config.alpha,
|
|
|
|
config.beta,
|
|
|
|
b));
|
|
|
|
|
|
|
|
for (std::vector<PerformanceResult<GemmProblem> >::const_iterator it = results.begin();
|
|
|
|
it != results.end();
|
|
|
|
++it) {
|
|
|
|
output.append(*it);
|
|
|
|
}
|
|
|
|
}//k
|
|
|
|
}//n
|
|
|
|
}//m
|
|
|
|
}//batch_count
|
2018-05-17 02:44:56 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Runs the test over the problem space and reports only the best performance
|
|
|
|
template <typename CutlassDispatch>
|
|
|
|
void peak(Range const &M, Range const &N, Range const &K) {
|
2018-09-19 07:58:03 +08:00
|
|
|
typedef std::map<Provider::Kind, PerformanceResult<GemmProblem> > ProviderPerformanceMap;
|
2018-05-17 02:44:56 +08:00
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
ProviderPerformanceMap max_perf;
|
2018-05-17 02:44:56 +08:00
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
for (int m = M.start; m <= M.end; m += M.next(m)) {
|
|
|
|
for (int n = N.start; n <= N.end; n += N.next(n)) {
|
|
|
|
for (int k = K.start; k <= K.end; k += K.next(k)) {
|
|
|
|
std::vector<PerformanceResult<GemmProblem> > results =
|
2018-05-17 02:44:56 +08:00
|
|
|
execute<CutlassDispatch>(GemmProblem(m,
|
|
|
|
n,
|
|
|
|
k,
|
|
|
|
CutlassDispatch::kLayoutA,
|
|
|
|
CutlassDispatch::kLayoutB,
|
2018-09-19 07:58:03 +08:00
|
|
|
config.alpha,
|
|
|
|
config.beta));
|
2018-05-17 02:44:56 +08:00
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
for (std::vector<PerformanceResult<GemmProblem> >::const_iterator it = results.begin();
|
2018-05-17 02:44:56 +08:00
|
|
|
it != results.end();
|
|
|
|
++it) {
|
|
|
|
/// Writes the output without appending it
|
|
|
|
output.pretty_print(*it);
|
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
if (it->disposition == Disposition::Passed) {
|
|
|
|
/// Updates maximum performing kernel
|
|
|
|
ProviderPerformanceMap::iterator max_perf_it = max_perf.find(it->provider);
|
|
|
|
|
|
|
|
if (max_perf_it == max_perf.end()) {
|
|
|
|
max_perf.insert(std::make_pair(it->provider, *it));
|
|
|
|
} else if (max_perf_it->second.gflops < it->gflops) {
|
|
|
|
max_perf_it->second = *it;
|
|
|
|
}
|
2018-05-17 02:44:56 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
Provider::Kind providers[] = {
|
|
|
|
Provider::Cutlass,
|
|
|
|
Provider::Invalid
|
|
|
|
};
|
|
|
|
for (int i = 0; providers[i] != Provider::Invalid; ++i) {
|
|
|
|
ProviderPerformanceMap::const_iterator it = max_perf.find(providers[i]);
|
|
|
|
if (it != max_perf.end()) {
|
|
|
|
output.append(it->second);
|
|
|
|
}
|
|
|
|
}
|
2018-05-17 02:44:56 +08:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
/// Dispatches to GEMM performance profiler
|
|
|
|
template <typename Dispatch, typename GemmProfiler>
|
2018-09-19 07:58:03 +08:00
|
|
|
int profile_gemm(TestbenchOutput<GemmProblem> &output,
|
2018-05-17 02:44:56 +08:00
|
|
|
std::string const &kernel,
|
2018-09-19 07:58:03 +08:00
|
|
|
TestbenchOptions const &options,
|
|
|
|
Config const &config,
|
|
|
|
std::string const &cutlass_algo = "") {
|
|
|
|
if (config.kernel_enabled(kernel)) {
|
|
|
|
GemmProfiler perf(output, kernel, cutlass_algo, options, config);
|
2018-05-17 02:44:56 +08:00
|
|
|
if (options.peak_performance) {
|
|
|
|
perf.template peak<Dispatch>(
|
2018-09-19 07:58:03 +08:00
|
|
|
config.problem_range.M, config.problem_range.N, config.problem_range.K);
|
2018-05-17 02:44:56 +08:00
|
|
|
} else {
|
|
|
|
perf.template schmoo<Dispatch>(
|
2018-10-27 05:38:46 +08:00
|
|
|
config.problem_range.M, config.problem_range.N, config.problem_range.K, config.problem_range.batch_count);
|
2018-05-17 02:44:56 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
} // namespace perf
|