/*************************************************************************************************** * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: * * Redistributions of source code must retain the above copyright notice, this list of * conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright notice, this list of * conditions and the following disclaimer in the documentation and/or other materials * provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used * to endorse or promote products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* **************************************************************************************************/ #pragma once #include #include #include #include #include "cutlass/util/platform.h" #if defined(CUTLASS_OS_WINDOWS) #include #else // needed for sleep #include #endif #include "tools/test/perf/gemm/gemm_perf_testbed.h" #include "tools/test/perf/testbench_configs.h" #include "tools/test/perf/testbench_options.h" #include "tools/test/perf/testbench_output.h" //////////////////////////////////////////////////////////////////////////////////////////////////// namespace perf { //////////////////////////////////////////////////////////////////////////////////////////////////// /// Performance measuring testbed template class GemmProfiler { public: /// Test environment typedef GemmTestbed PerfTestbed; private: // // Data members // /// Reference to TestbenchOutput instance TestbenchOutput &output; /// Reference to options object TestbenchOptions const &options; // Reference to config object Config const &config; /// Performance test environment PerfTestbed testbed; /// Kernel name std::string kernel_name; /// Cutlass algorithm std::string cutlass_algo; /// Timing events cudaEvent_t events[2]; public: /// Delays static void pause(int seconds) { #if defined(WIN32) Sleep(1000 * seconds); #else sleep(seconds); #endif } public: // // Methods // /// Constructs performance testebed GemmProfiler(TestbenchOutput &_output, std::string const &_kernel_name, std::string const &_cutlass_algo, TestbenchOptions const &_options, Config const &_config) : output(_output), options(_options), config(_config), kernel_name(_kernel_name), cutlass_algo(_cutlass_algo), testbed(_options.initial_distribution) { for (int i = 0; i < 2; ++i) { cudaError_t result = cudaEventCreate(&events[i]); if (result != cudaSuccess) { throw std::runtime_error("GemmPerfTestbed() failed to create CUDA events"); } } } ~GemmProfiler() {} /// Writes the workspace to text files void write_problem(Provider::Kind provider, 
std::string const &kernel_name) { std::stringstream base_filename; base_filename << provider << "_" << kernel_name << "_" << testbed.M() << "x" << testbed.N() << "x" << testbed.K(); std::string results_name = base_filename.str() + "_results.txt"; std::string errors_name = base_filename.str() + "_errors.txt"; std::ofstream results(results_name.c_str()); std::ofstream errors(errors_name.c_str()); testbed.write_problem(results, errors); } /// Profiles Cutlass template PerformanceResult execute_cutlass(GemmProblem const &problem, cublasGemmAlgo_t algorithm) { PerformanceResult result( Provider::Cutlass , kernel_name , problem ); if (options.dry_run) { result.disposition = Disposition::NotRun; return result; } if (CutlassDispatch::kRunCuBLAS) { testbed.compute_reference(algorithm); if (cudaDeviceSynchronize() != cudaSuccess) { result.disposition = Disposition::NotVerified; return result; } } else { result.disposition = Disposition::Passed; } CutlassDispatch *dispatch_ptr; // check to see if we need to launch batched strided gemm if (testbed.batch_count() == 1) { dispatch_ptr = new CutlassDispatch(testbed.M(), testbed.N(), testbed.K(), testbed.alpha(), testbed.ptr_A(), testbed.lda(), testbed.ptr_B(), testbed.ldb(), testbed.beta(), testbed.ptr_C_initial(), testbed.ldc(), testbed.ptr_experimental(), testbed.ldc()); dispatch_ptr->operator()(); } else { dispatch_ptr = new CutlassDispatch(testbed.M(), testbed.N(), testbed.K(), testbed.alpha(), testbed.ptr_A(), testbed.lda(), testbed.batch_stride_a(), testbed.ptr_B(), testbed.ldb(), testbed.batch_stride_b(), testbed.beta(), testbed.ptr_C_initial(), testbed.ldc(), testbed.batch_stride_c(), testbed.ptr_experimental(), testbed.ldc(), testbed.batch_stride_c(), testbed.batch_count()); dispatch_ptr->operator()(); } if (cudaDeviceSynchronize() != cudaSuccess) { result.disposition = Disposition::Failed; delete dispatch_ptr; return result; } if (CutlassDispatch::kRunCuBLAS) { if (testbed.verify_with_reference()) { result.disposition = 
Disposition::Passed; } else { result.disposition = Disposition::Incorrect; } } if (options.save_workspace(result.disposition == Disposition::Passed)) { write_problem(Provider::Cutlass, kernel_name); } if (cudaDeviceSynchronize() != cudaSuccess) { result.disposition = Disposition::Failed; } // warmup launch dispatch_ptr->operator()(); if (cudaDeviceSynchronize() != cudaSuccess) { result.disposition = Disposition::Failed; delete dispatch_ptr; return result; } if (cudaEventRecord(events[0]) != cudaSuccess) { result.disposition = Disposition::Failed; delete dispatch_ptr; return result; } for (int iter = 0; iter < options.iterations; ++iter) { dispatch_ptr->operator()(); } if (cudaEventRecord(events[1]) != cudaSuccess) { result.disposition = Disposition::Failed; delete dispatch_ptr; return result; } if (cudaEventSynchronize(events[1]) != cudaSuccess) { result.disposition = Disposition::Failed; delete dispatch_ptr; return result; } float average_ms = 0; if (cudaEventElapsedTime(&average_ms, events[0], events[1]) != cudaSuccess) { result.disposition = Disposition::Failed; delete dispatch_ptr; return result; } result.runtime = double(average_ms) / double(options.iterations); result.gflops = testbed.GFLOPs_per_sec(result.runtime); if (result.disposition != Disposition::Passed) { std::cout << "[\033[1;31mFAILED\033[0m]: " << kernel_name << " failed with disposition: " << result.disposition << "\n"; } delete dispatch_ptr; return result; } template bool contains(T const &container, F const &val) { return std::find(container.begin(), container.end(), val) != container.end(); } /// Executes all kernels for this problem size template std::vector > execute(GemmProblem const &problem) { // New problem size output.begin_problem(); bool const tensor_op = !(CutlassDispatch::kThreadMultiplyAdd); cublasGemmAlgo_t algorithm = tensor_op ? 
CUBLAS_GEMM_DEFAULT_TENSOR_OP : CUBLAS_GEMM_DEFAULT; testbed.resize(problem); std::vector > results; results.push_back(execute_cutlass(problem, algorithm)); // cool-down period if (!options.dry_run) { pause(options.sleep_time); } return results; } /// Runs the test and collects performance for all results template void schmoo(Range const &M, Range const &N, Range const &K, Range const &batch_count) { for (int b = batch_count.start; b <= batch_count.end; b = batch_count.next(b)) { for (int m = M.start; m <= M.end; m = M.next(m)) { for (int n = N.start; n <= N.end; n = N.next(n)) { for (int k = K.start; k <= K.end; k = K.next(k)) { std::vector > results = execute(GemmProblem(m, n, k, CutlassDispatch::kLayoutA, CutlassDispatch::kLayoutB, config.alpha, config.beta, b)); for (std::vector >::const_iterator it = results.begin(); it != results.end(); ++it) { output.append(*it); } }//k }//n }//m }//batch_count } /// Runs the test over the problem space and reports only the best performance template void peak(Range const &M, Range const &N, Range const &K) { typedef std::map > ProviderPerformanceMap; ProviderPerformanceMap max_perf; for (int m = M.start; m <= M.end; m += M.next(m)) { for (int n = N.start; n <= N.end; n += N.next(n)) { for (int k = K.start; k <= K.end; k += K.next(k)) { std::vector > results = execute(GemmProblem(m, n, k, CutlassDispatch::kLayoutA, CutlassDispatch::kLayoutB, config.alpha, config.beta)); for (std::vector >::const_iterator it = results.begin(); it != results.end(); ++it) { /// Writes the output without appending it output.pretty_print(*it); if (it->disposition == Disposition::Passed) { /// Updates maximum performing kernel ProviderPerformanceMap::iterator max_perf_it = max_perf.find(it->provider); if (max_perf_it == max_perf.end()) { max_perf.insert(std::make_pair(it->provider, *it)); } else if (max_perf_it->second.gflops < it->gflops) { max_perf_it->second = *it; } } } } } } Provider::Kind providers[] = { Provider::Cutlass, Provider::Invalid 
}; for (int i = 0; providers[i] != Provider::Invalid; ++i) { ProviderPerformanceMap::const_iterator it = max_perf.find(providers[i]); if (it != max_perf.end()) { output.append(it->second); } } } }; //////////////////////////////////////////////////////////////////////////////////////////////////// /// Dispatches to GEMM performance profiler template int profile_gemm(TestbenchOutput &output, std::string const &kernel, TestbenchOptions const &options, Config const &config, std::string const &cutlass_algo = "") { if (config.kernel_enabled(kernel)) { GemmProfiler perf(output, kernel, cutlass_algo, options, config); if (options.peak_performance) { perf.template peak( config.problem_range.M, config.problem_range.N, config.problem_range.K); } else { perf.template schmoo( config.problem_range.M, config.problem_range.N, config.problem_range.K, config.problem_range.batch_count); } } return 0; } //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace perf