/***************************************************************************************************
 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
#pragma once

#include <algorithm>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

#include "cutlass/util/platform.h"

#if defined(CUTLASS_OS_WINDOWS)
#include <Windows.h>
#else
// needed for sleep
#include <unistd.h>
#endif

#include "tools/test/perf/gemm/gemm_perf_testbed.h"
#include "tools/test/perf/testbench_configs.h"
#include "tools/test/perf/testbench_options.h"
#include "tools/test/perf/testbench_output.h"
////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
namespace perf {
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
/// Performance measuring testbed
|
|
|
|
template <typename AType,
|
|
|
|
typename BType,
|
|
|
|
typename CType,
|
|
|
|
typename AccumulatorType,
|
|
|
|
typename ScalarType>
|
|
|
|
class GemmProfiler {
|
|
|
|
public:
|
|
|
|
/// Test environment
|
|
|
|
typedef GemmTestbed<AType, BType, CType, AccumulatorType, ScalarType> PerfTestbed;
|
|
|
|
|
|
|
|
private:
|
|
|
|
//
|
|
|
|
// Data members
|
|
|
|
//
|
|
|
|
|
|
|
|
/// Reference to TestbenchOutput instance
|
2018-09-19 07:58:03 +08:00
|
|
|
TestbenchOutput<GemmProblem> &output;
|
2018-05-17 02:44:56 +08:00
|
|
|
|
|
|
|
/// Reference to options object
|
|
|
|
TestbenchOptions const &options;
|
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
// Reference to config object
|
|
|
|
Config const &config;
|
|
|
|
|
2018-05-17 02:44:56 +08:00
|
|
|
/// Performance test environment
|
|
|
|
PerfTestbed testbed;
|
|
|
|
|
|
|
|
/// Kernel name
|
|
|
|
std::string kernel_name;
|
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
/// Cutlass algorithm
|
|
|
|
std::string cutlass_algo;
|
|
|
|
|
2018-05-17 02:44:56 +08:00
|
|
|
/// Timing events
|
|
|
|
cudaEvent_t events[2];
|
|
|
|
|
|
|
|
public:
|
|
|
|
/// Delays
|
|
|
|
static void pause(int seconds) {
|
|
|
|
#if defined(WIN32)
|
|
|
|
Sleep(1000 * seconds);
|
|
|
|
#else
|
|
|
|
sleep(seconds);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
public:
|
|
|
|
//
|
|
|
|
// Methods
|
|
|
|
//
|
|
|
|
|
|
|
|
/// Constructs performance testebed
|
2018-09-19 07:58:03 +08:00
|
|
|
GemmProfiler(TestbenchOutput<GemmProblem> &_output,
|
2018-05-17 02:44:56 +08:00
|
|
|
std::string const &_kernel_name,
|
2018-09-19 07:58:03 +08:00
|
|
|
std::string const &_cutlass_algo,
|
|
|
|
TestbenchOptions const &_options,
|
|
|
|
Config const &_config)
|
2018-05-17 02:44:56 +08:00
|
|
|
: output(_output),
|
|
|
|
options(_options),
|
2018-09-19 07:58:03 +08:00
|
|
|
config(_config),
|
2018-05-17 02:44:56 +08:00
|
|
|
kernel_name(_kernel_name),
|
2018-09-19 07:58:03 +08:00
|
|
|
cutlass_algo(_cutlass_algo),
|
2018-05-17 02:44:56 +08:00
|
|
|
testbed(_options.initial_distribution) {
|
|
|
|
for (int i = 0; i < 2; ++i) {
|
|
|
|
cudaError_t result = cudaEventCreate(&events[i]);
|
|
|
|
if (result != cudaSuccess) {
|
|
|
|
throw std::runtime_error("GemmPerfTestbed() failed to create CUDA events");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
~GemmProfiler() {}
|
|
|
|
|
|
|
|
/// Writes the workspace to text files
|
2018-09-19 07:58:03 +08:00
|
|
|
void write_problem(Provider::Kind provider, std::string const &kernel_name) {
|
|
|
|
std::stringstream base_filename;
|
2018-05-17 02:44:56 +08:00
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
base_filename << provider << "_" << kernel_name << "_" << testbed.M() << "x" << testbed.N()
|
|
|
|
<< "x" << testbed.K();
|
2018-05-17 02:44:56 +08:00
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
std::string results_name = base_filename.str() + "_results.txt";
|
|
|
|
std::string errors_name = base_filename.str() + "_errors.txt";
|
2018-05-17 02:44:56 +08:00
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
std::ofstream results(results_name.c_str());
|
|
|
|
std::ofstream errors(errors_name.c_str());
|
|
|
|
testbed.write_problem(results, errors);
|
2018-05-17 02:44:56 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Profiles Cutlass
|
|
|
|
template <typename CutlassDispatch>
|
2018-09-19 07:58:03 +08:00
|
|
|
PerformanceResult<GemmProblem> execute_cutlass(GemmProblem const &problem,
|
|
|
|
cublasGemmAlgo_t algorithm) {
|
|
|
|
PerformanceResult<GemmProblem> result(
|
|
|
|
Provider::Cutlass
|
|
|
|
, kernel_name
|
|
|
|
, problem
|
|
|
|
);
|
|
|
|
|
|
|
|
if (options.dry_run) {
|
|
|
|
result.disposition = Disposition::NotRun;
|
|
|
|
return result;
|
|
|
|
}
|
2018-05-17 02:44:56 +08:00
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
if (CutlassDispatch::kRunCuBLAS) {
|
|
|
|
testbed.compute_reference(algorithm);
|
2018-05-17 02:44:56 +08:00
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
if (cudaDeviceSynchronize() != cudaSuccess) {
|
|
|
|
result.disposition = Disposition::NotVerified;
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
result.disposition = Disposition::Passed;
|
2018-05-17 02:44:56 +08:00
|
|
|
}
|
|
|
|
|
2018-10-27 05:38:46 +08:00
|
|
|
CutlassDispatch *dispatch_ptr;
|
|
|
|
|
|
|
|
// check to see if we need to launch batched strided gemm
|
|
|
|
if (testbed.batch_count() == 1) {
|
|
|
|
dispatch_ptr = new CutlassDispatch(testbed.M(),
|
|
|
|
testbed.N(),
|
|
|
|
testbed.K(),
|
|
|
|
testbed.alpha(),
|
|
|
|
testbed.ptr_A(),
|
|
|
|
testbed.lda(),
|
|
|
|
testbed.ptr_B(),
|
|
|
|
testbed.ldb(),
|
|
|
|
testbed.beta(),
|
|
|
|
testbed.ptr_C_initial(),
|
|
|
|
testbed.ldc(),
|
|
|
|
testbed.ptr_experimental(),
|
|
|
|
testbed.ldc());
|
|
|
|
|
|
|
|
dispatch_ptr->operator()();
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
dispatch_ptr = new CutlassDispatch(testbed.M(),
|
|
|
|
testbed.N(),
|
|
|
|
testbed.K(),
|
|
|
|
testbed.alpha(),
|
|
|
|
testbed.ptr_A(),
|
|
|
|
testbed.lda(),
|
|
|
|
testbed.batch_stride_a(),
|
|
|
|
testbed.ptr_B(),
|
|
|
|
testbed.ldb(),
|
|
|
|
testbed.batch_stride_b(),
|
|
|
|
testbed.beta(),
|
|
|
|
testbed.ptr_C_initial(),
|
|
|
|
testbed.ldc(),
|
|
|
|
testbed.batch_stride_c(),
|
|
|
|
testbed.ptr_experimental(),
|
|
|
|
testbed.ldc(),
|
|
|
|
testbed.batch_stride_c(),
|
|
|
|
testbed.batch_count());
|
|
|
|
|
|
|
|
dispatch_ptr->operator()();
|
|
|
|
}
|
2018-05-17 02:44:56 +08:00
|
|
|
|
|
|
|
if (cudaDeviceSynchronize() != cudaSuccess) {
|
|
|
|
result.disposition = Disposition::Failed;
|
2018-10-27 05:38:46 +08:00
|
|
|
delete dispatch_ptr;
|
2018-05-17 02:44:56 +08:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
if (CutlassDispatch::kRunCuBLAS) {
|
|
|
|
if (testbed.verify_with_reference()) {
|
|
|
|
result.disposition = Disposition::Passed;
|
|
|
|
} else {
|
|
|
|
result.disposition = Disposition::Incorrect;
|
|
|
|
}
|
2018-05-17 02:44:56 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (options.save_workspace(result.disposition == Disposition::Passed)) {
|
2018-09-19 07:58:03 +08:00
|
|
|
write_problem(Provider::Cutlass, kernel_name);
|
2018-05-17 02:44:56 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (cudaDeviceSynchronize() != cudaSuccess) {
|
|
|
|
result.disposition = Disposition::Failed;
|
|
|
|
}
|
|
|
|
|
|
|
|
// warmup launch
|
2018-10-27 05:38:46 +08:00
|
|
|
dispatch_ptr->operator()();
|
2018-05-17 02:44:56 +08:00
|
|
|
|
|
|
|
if (cudaDeviceSynchronize() != cudaSuccess) {
|
|
|
|
result.disposition = Disposition::Failed;
|
2018-10-27 05:38:46 +08:00
|
|
|
delete dispatch_ptr;
|
2018-05-17 02:44:56 +08:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (cudaEventRecord(events[0]) != cudaSuccess) {
|
|
|
|
result.disposition = Disposition::Failed;
|
2018-10-27 05:38:46 +08:00
|
|
|
delete dispatch_ptr;
|
2018-05-17 02:44:56 +08:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (int iter = 0; iter < options.iterations; ++iter) {
|
2018-10-27 05:38:46 +08:00
|
|
|
dispatch_ptr->operator()();
|
2018-05-17 02:44:56 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (cudaEventRecord(events[1]) != cudaSuccess) {
|
|
|
|
result.disposition = Disposition::Failed;
|
2018-10-27 05:38:46 +08:00
|
|
|
delete dispatch_ptr;
|
2018-05-17 02:44:56 +08:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (cudaEventSynchronize(events[1]) != cudaSuccess) {
|
|
|
|
result.disposition = Disposition::Failed;
|
2018-10-27 05:38:46 +08:00
|
|
|
delete dispatch_ptr;
|
2018-05-17 02:44:56 +08:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
float average_ms = 0;
|
|
|
|
if (cudaEventElapsedTime(&average_ms, events[0], events[1]) != cudaSuccess) {
|
|
|
|
result.disposition = Disposition::Failed;
|
2018-10-27 05:38:46 +08:00
|
|
|
delete dispatch_ptr;
|
2018-05-17 02:44:56 +08:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
result.runtime = double(average_ms) / double(options.iterations);
|
|
|
|
result.gflops = testbed.GFLOPs_per_sec(result.runtime);
|
|
|
|
|
|
|
|
if (result.disposition != Disposition::Passed) {
|
2018-09-19 07:58:03 +08:00
|
|
|
std::cout << "[\033[1;31mFAILED\033[0m]: " << kernel_name
|
|
|
|
<< " failed with disposition: " << result.disposition << "\n";
|
2018-05-17 02:44:56 +08:00
|
|
|
}
|
|
|
|
|
2018-10-27 05:38:46 +08:00
|
|
|
delete dispatch_ptr;
|
2018-05-17 02:44:56 +08:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
template <typename T, typename F>
|
|
|
|
bool contains(T const &container, F const &val) {
|
|
|
|
return std::find(container.begin(), container.end(), val) != container.end();
|
|
|
|
}
|
|
|
|
|
2018-05-17 02:44:56 +08:00
|
|
|
/// Executes all kernels for this problem size
|
|
|
|
template <typename CutlassDispatch>
|
2018-09-19 07:58:03 +08:00
|
|
|
std::vector<PerformanceResult<GemmProblem> > execute(GemmProblem const &problem) {
|
2018-05-17 02:44:56 +08:00
|
|
|
|
|
|
|
// New problem size
|
|
|
|
output.begin_problem();
|
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
bool const tensor_op = !(CutlassDispatch::kThreadMultiplyAdd);
|
|
|
|
cublasGemmAlgo_t algorithm = tensor_op ?
|
|
|
|
CUBLAS_GEMM_DEFAULT_TENSOR_OP : CUBLAS_GEMM_DEFAULT;
|
2018-05-17 02:44:56 +08:00
|
|
|
|
|
|
|
testbed.resize(problem);
|
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
std::vector<PerformanceResult<GemmProblem> > results;
|
2018-05-17 02:44:56 +08:00
|
|
|
|
2018-10-27 05:38:46 +08:00
|
|
|
results.push_back(execute_cutlass<CutlassDispatch>(problem, algorithm));
|
2018-05-17 02:44:56 +08:00
|
|
|
// cool-down period
|
2018-09-19 07:58:03 +08:00
|
|
|
if (!options.dry_run) {
|
|
|
|
pause(options.sleep_time);
|
|
|
|
}
|
2018-05-17 02:44:56 +08:00
|
|
|
|
|
|
|
return results;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Runs the test and collects performance for all results
|
|
|
|
template <typename CutlassDispatch>
|
2018-10-27 05:38:46 +08:00
|
|
|
void schmoo(Range const &M, Range const &N, Range const &K, Range const &batch_count) {
|
|
|
|
for (int b = batch_count.start; b <= batch_count.end; b = batch_count.next(b)) {
|
|
|
|
for (int m = M.start; m <= M.end; m = M.next(m)) {
|
|
|
|
for (int n = N.start; n <= N.end; n = N.next(n)) {
|
|
|
|
for (int k = K.start; k <= K.end; k = K.next(k)) {
|
|
|
|
std::vector<PerformanceResult<GemmProblem> > results =
|
2018-05-17 02:44:56 +08:00
|
|
|
execute<CutlassDispatch>(GemmProblem(m,
|
2018-10-27 05:38:46 +08:00
|
|
|
n,
|
|
|
|
k,
|
|
|
|
CutlassDispatch::kLayoutA,
|
|
|
|
CutlassDispatch::kLayoutB,
|
|
|
|
config.alpha,
|
|
|
|
config.beta,
|
|
|
|
b));
|
|
|
|
|
|
|
|
for (std::vector<PerformanceResult<GemmProblem> >::const_iterator it = results.begin();
|
|
|
|
it != results.end();
|
|
|
|
++it) {
|
|
|
|
output.append(*it);
|
|
|
|
}
|
|
|
|
}//k
|
|
|
|
}//n
|
|
|
|
}//m
|
|
|
|
}//batch_count
|
2018-05-17 02:44:56 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Runs the test over the problem space and reports only the best performance
|
|
|
|
template <typename CutlassDispatch>
|
|
|
|
void peak(Range const &M, Range const &N, Range const &K) {
|
2018-09-19 07:58:03 +08:00
|
|
|
typedef std::map<Provider::Kind, PerformanceResult<GemmProblem> > ProviderPerformanceMap;
|
2018-05-17 02:44:56 +08:00
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
ProviderPerformanceMap max_perf;
|
2018-05-17 02:44:56 +08:00
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
for (int m = M.start; m <= M.end; m += M.next(m)) {
|
|
|
|
for (int n = N.start; n <= N.end; n += N.next(n)) {
|
|
|
|
for (int k = K.start; k <= K.end; k += K.next(k)) {
|
|
|
|
std::vector<PerformanceResult<GemmProblem> > results =
|
2018-05-17 02:44:56 +08:00
|
|
|
execute<CutlassDispatch>(GemmProblem(m,
|
|
|
|
n,
|
|
|
|
k,
|
|
|
|
CutlassDispatch::kLayoutA,
|
|
|
|
CutlassDispatch::kLayoutB,
|
2018-09-19 07:58:03 +08:00
|
|
|
config.alpha,
|
|
|
|
config.beta));
|
2018-05-17 02:44:56 +08:00
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
for (std::vector<PerformanceResult<GemmProblem> >::const_iterator it = results.begin();
|
2018-05-17 02:44:56 +08:00
|
|
|
it != results.end();
|
|
|
|
++it) {
|
|
|
|
/// Writes the output without appending it
|
|
|
|
output.pretty_print(*it);
|
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
if (it->disposition == Disposition::Passed) {
|
|
|
|
/// Updates maximum performing kernel
|
|
|
|
ProviderPerformanceMap::iterator max_perf_it = max_perf.find(it->provider);
|
|
|
|
|
|
|
|
if (max_perf_it == max_perf.end()) {
|
|
|
|
max_perf.insert(std::make_pair(it->provider, *it));
|
|
|
|
} else if (max_perf_it->second.gflops < it->gflops) {
|
|
|
|
max_perf_it->second = *it;
|
|
|
|
}
|
2018-05-17 02:44:56 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-09-19 07:58:03 +08:00
|
|
|
Provider::Kind providers[] = {
|
|
|
|
Provider::Cutlass,
|
|
|
|
Provider::Invalid
|
|
|
|
};
|
|
|
|
for (int i = 0; providers[i] != Provider::Invalid; ++i) {
|
|
|
|
ProviderPerformanceMap::const_iterator it = max_perf.find(providers[i]);
|
|
|
|
if (it != max_perf.end()) {
|
|
|
|
output.append(it->second);
|
|
|
|
}
|
|
|
|
}
|
2018-05-17 02:44:56 +08:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
/// Dispatches to GEMM performance profiler
|
|
|
|
template <typename Dispatch, typename GemmProfiler>
|
2018-09-19 07:58:03 +08:00
|
|
|
int profile_gemm(TestbenchOutput<GemmProblem> &output,
|
2018-05-17 02:44:56 +08:00
|
|
|
std::string const &kernel,
|
2018-09-19 07:58:03 +08:00
|
|
|
TestbenchOptions const &options,
|
|
|
|
Config const &config,
|
|
|
|
std::string const &cutlass_algo = "") {
|
|
|
|
if (config.kernel_enabled(kernel)) {
|
|
|
|
GemmProfiler perf(output, kernel, cutlass_algo, options, config);
|
2018-05-17 02:44:56 +08:00
|
|
|
if (options.peak_performance) {
|
|
|
|
perf.template peak<Dispatch>(
|
2018-09-19 07:58:03 +08:00
|
|
|
config.problem_range.M, config.problem_range.N, config.problem_range.K);
|
2018-05-17 02:44:56 +08:00
|
|
|
} else {
|
|
|
|
perf.template schmoo<Dispatch>(
|
2018-10-27 05:38:46 +08:00
|
|
|
config.problem_range.M, config.problem_range.N, config.problem_range.K, config.problem_range.batch_count);
|
2018-05-17 02:44:56 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
} // namespace perf
|