/*************************************************************************************************** * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: * * Redistributions of source code must retain the above copyright notice, this list of * conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright notice, this list of * conditions and the following disclaimer in the documentation and/or other materials * provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used * to endorse or promote products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ #pragma once #include #include #include #include #include "cutlass/cutlass.h" #include "tools/util/command_line.h" #include "tools/util/distribution.h" #include "tools/test/perf/provider.h" namespace perf { /////////////////////////////////////////////////////////////////////////////////////////////////// /// Range of problem sizes struct Range { enum Operator { Add, Multiply }; // // Data members // int start; int end; int increment; Operator increment_op; // // Methods // Range(int _start = 0) : start(_start), end(_start), increment(1), increment_op(Add) {} Range(int _start, int _end, int _increment = 1, Operator _op = Add) : start(_start), end(_end), increment(_increment), increment_op(_op) {} /// Returns the next item in series int next(int val) const { switch (increment_op) { case Add: val += increment; break; case Multiply: val *= increment; break; default: val = end; break; } return val; } void import_from_strings(const std::vector& values) { if (values.size() > 0) { std::stringstream ss; ss << values.at(0); ss >> start; } if (values.size() > 1) { std::stringstream ss; ss << values.at(1); ss >> end; } else { end = start; } if (values.size() > 2 && !values.at(2).empty()) { std::stringstream ss; char first = values.at(2).at(0); if (first == '*' || first == '+') { ss << values.at(2).substr(1); switch (first) { case '*': increment_op = Multiply; break; case '+': increment_op = Add; break; default: break; } } else { ss << values.at(2); } ss >> increment; } } }; /////////////////////////////////////////////////////////////////////////////////////////////////// /// Defines a space of problem sizes struct GemmProblemRange { public: /// Range of sizes in GEMM M dimension Range M; /// Range of sizes in GEMM N dimension Range N; /// Range of sizes in GEMM K dimension Range K; // // Methods // /// Constructor to define a space of probelm sizes GemmProblemRange(Range _M = Range(256), Range _N = Range(256), Range _K = Range(256)) : M(_M), N(_N), K(_K) {} /// Parses a command line argument as a Range object static void get_range(Range &range, cutlass::CommandLine const &args, std::string const &arg, Range const &_default = Range(256)) { range = Range(0, 0, 1); if (args.check_cmd_line_flag(arg.c_str())) { std::vector values; args.get_cmd_line_arguments(arg.c_str(), values, ':'); range.import_from_strings(values); } else { range = _default; } } /// Initializes the GEMM problem size from command line arguments GemmProblemRange(cutlass::CommandLine const &args) { get_range(M, args, "m", Range(10240)); get_range(N, args, "n", Range(4096)); get_range(K, args, "k", Range(4096)); } }; //////////////////////////////////////////////////////////////////////////////////////////////////// /// Defines a vector of string pairs typedef std::vector > KeyValueVector; /// Defines a const iterator to a KeyValueVector typedef KeyValueVector::const_iterator KeyValueIterator; /// Structure captures the initial configuration of matrices struct InitialDistribution { /// Distribution of A matrix operand cutlass::Distribution dist_A; /// Distribution of B matrix operand cutlass::Distribution dist_B; /// cutlass::Distribution of C matrix operand cutlass::Distribution dist_C; /// Seed for random number generation int64_t seed; // // Static function members // /// Gets the initial distribution static void get_distribution(cutlass::CommandLine const &args, std::string const &arg, cutlass::Distribution &dist) { struct { const char *label; cutlass::Distribution::Kind kind; } distribution_kinds[] = {{"uniform", cutlass::Distribution::Uniform}, {"gaussian", cutlass::Distribution::Gaussian}, {"linear", cutlass::Distribution::Linear}, {"identity", cutlass::Distribution::Identity}, {0, cutlass::Distribution::Invalid}}; struct { char const *label; double *member; } members[] = {{"min", &dist.uniform.min}, {"max", &dist.uniform.max}, {"mean", &dist.gaussian.mean}, {"stddev", &dist.gaussian.stddev}, {"offset", &dist.linear.offset}, {"delta_row", &dist.linear.delta_row}, {"delta_column", &dist.linear.delta_column}, {0, 0}}; KeyValueVector values; args.get_cmd_line_argument_pairs(arg.c_str(), values); // The parser expects the first token to be a string identifying the distribution type. KeyValueIterator it = values.begin(); if (it != values.end()) { for (int i = 0; distribution_kinds[i].label; ++i) { if (it->first == distribution_kinds[i].label) { dist.kind = distribution_kinds[i].kind; break; } } ++it; } // Subsequent key-value pairs update the named field of the distribution struct. for (; it != values.end(); ++it) { // Integer scaling factor - if < 0, no integer rounding is performed. if (it->first == "scale" && !it->second.empty()) { std::stringstream ss; ss << it->second; ss >> dist.int_scale; continue; // next token } // Casts as integer without scaling if (it->first == "integer") { dist.int_scale = 0; continue; // next token } // Casts as integer without scaling if (it->first == "integer") { dist.int_scale = 0; continue; // next token } // initialize other members for (int m = 0; members[m].label; ++m) { if (it->first == members[m].label && !it->second.empty()) { std::stringstream ss; ss << it->second; ss >> *(members[m].member); } } } } // // Methods // /// Basic uniform random distribution InitialDistribution(int64_t _seed = 700) : seed(_seed) { dist_A.set_uniform(-8, 8); dist_B.set_uniform(-8, 8); dist_C.set_uniform(-8, 8); } /// Extracts initial distribution from command line arguments InitialDistribution(cutlass::CommandLine const &args) { // Set initial values seed = 700; dist_A.set_uniform(-8, 8); dist_B.set_uniform(-8, 8); dist_C.set_uniform(-8, 8); // Update with command line arguments args.get_cmd_line_argument("seed", seed, seed); // Update all distributions at once cutlass::Distribution dist_all; if (args.check_cmd_line_flag("dist")) { get_distribution(args, "dist", dist_all); dist_A = dist_all; dist_B = dist_all; dist_C = dist_all; } get_distribution(args, "dist_A", dist_A); get_distribution(args, "dist_B", dist_B); get_distribution(args, "dist_C", dist_C); } }; /////////////////////////////////////////////////////////////////////////////////////////////////// /// Defines how to execute the benchmarks struct ExecutionMode { enum Kind { Profile, Verify, Single, Invalid }; static std::string to_string(Kind kind) { switch (kind) { case Profile: return "profile"; case Verify: return "verify"; case Single: return "single"; default: return "invalid"; } } static Kind from_string(std::string const &str) { if (str == "profile") return Profile; if (str == "verify") return Verify; if (str == "single") return Single; return Profile; } }; /// Indicates when the workspace is saved struct WorkspaceSaveMode { enum Kind { Never, Incorrect, Always }; static std::string to_string(Kind kind) { switch (kind) { case Never: return "never"; case Incorrect: return "incorrect"; case Always: return "always"; default: return "incorrect"; } } static Kind from_string(std::string const &str) { if (str == "never") return Never; if (str == "incorrect") return Incorrect; if (str == "always") return Always; return Incorrect; } }; /////////////////////////////////////////////////////////////////////////////////////////////////// /// Class holding testbench command line options struct TestbenchOptions { // // Data members // /// Describes the random initial state of the input matrices InitialDistribution initial_distribution; // Path to output file name std::string output_filename; // Path to input file name std::string threshold_filename; /// If true, output is appended bool append; /// Number of iterations int iterations; /// Defines how to run the benchmark ExecutionMode::Kind execution_mode; /// Indicates when the workspace is saved WorkspaceSaveMode::Kind save_workspace_mode; /// Properties of CUDA device cudaDeviceProp device_properties; /// Enabled kernel names std::vector kernels; /// Scalar value for GEMM double alpha; /// Scalar value for GEMM double beta; /// Range of problem sizes GemmProblemRange problem_range; /// If true, kernels are not executed, and no sleep waits are inserted bool dry_run; /// Tags to describe the profiler output KeyValueVector pivot_tags; /// If enabled, only the peak performance for a given kernel is reported bool peak_performance; /// Performance Degradatiom Margin before flagging as test failure double perf_margin; /// Cool-down period int sleep_time; // // Methods // /// Constructs the testbench from tags TestbenchOptions(cutlass::CommandLine const &args) : initial_distribution(args), execution_mode(ExecutionMode::Profile), save_workspace_mode(WorkspaceSaveMode::Never), problem_range(args), dry_run(false), sleep_time(1) { // Set the CUDA device and/or specify clock rate configure_cuda_device(args); // fetch command line arguments args.get_cmd_line_argument("iterations", iterations, 25); args.get_cmd_line_argument("append", append, false); args.get_cmd_line_argument("output", output_filename); args.get_cmd_line_argument("threshold", threshold_filename); args.get_cmd_line_argument("alpha", alpha, 1.0); args.get_cmd_line_argument("beta", beta, 0.0); args.get_cmd_line_argument("peak", peak_performance, false); args.get_cmd_line_argument_pairs("tags", pivot_tags); args.get_cmd_line_argument("perf-margin", perf_margin, 0.97); args.get_cmd_line_argument("dry-run", dry_run, false); args.get_cmd_line_argument("sleep-time", sleep_time, 1); if (args.check_cmd_line_flag("execution-mode")) { std::string str; args.get_cmd_line_argument("execution-mode", str); execution_mode = ExecutionMode::from_string(str); } if (args.check_cmd_line_flag("save-workspace")) { std::string str; args.get_cmd_line_argument("save-workspace", str); save_workspace_mode = WorkspaceSaveMode::from_string(str); } if (args.check_cmd_line_flag("execution-mode")) { std::string str; args.get_cmd_line_argument("execution-mode", str); execution_mode = ExecutionMode::from_string(str); } if (args.check_cmd_line_flag("save-workspace")) { std::string str; args.get_cmd_line_argument("save-workspace", str); save_workspace_mode = WorkspaceSaveMode::from_string(str); } // query for enabled kernels or enable all of them if (args.check_cmd_line_flag("kernels")) { args.get_cmd_line_arguments("kernels", kernels, ','); } else { char const *gemms[] = { "sgemm", "dgemm", "hgemm", "igemm", "wmma_gemm", "wmma_gemm_f16", "wmma_binary_gemm", "wmma_integer_gemm", 0 }; char const *layouts[] = {"nn", "nt", "tn", "tt", 0}; for (int i = 0; gemms[i]; ++i) { for (int j = 0; layouts[j]; ++j) { if ((std::string(gemms[i]).compare("wmma_binary_gemm") == 0 || std::string(gemms[i]).compare("wmma_integer_gemm") == 0) && std::string(layouts[j]).compare("tn") != 0) { continue; } kernels.push_back(std::string(gemms[i]) + "_" + layouts[j]); } } } } void configure_cuda_device(cutlass::CommandLine const &args) { int device_id = 0; args.get_cmd_line_argument("device", device_id, 0); cudaError_t result; result = cudaGetDeviceProperties(&device_properties, device_id); if (result != cudaSuccess) { throw std::runtime_error("cudaGetDeviceProperties() failed for given device."); } result = cudaSetDevice(device_id); if (result != cudaSuccess) { throw std::runtime_error("cudaSetDevice() failed for given device."); } // Get the clock rate (specified in cmd line in MHz) if (args.check_cmd_line_flag("clock")) { args.get_cmd_line_argument("clock", device_properties.clockRate); device_properties.clockRate *= 1000; } } /// Returns true if the kernel name appears among the enabled kernels bool kernel_enabled(std::string const &kernel) const { typedef std::vector::const_iterator kernel_iterator; for (kernel_iterator it = kernels.begin(); it != kernels.end(); ++it) { if (kernel.find(*it) != std::string::npos) { return true; } } return false; } /// Given the disposition of a GEMM problem, returns true if the results should /// be saved to the file system. bool save_workspace(bool correct) const { if (save_workspace_mode == WorkspaceSaveMode::Always || (save_workspace_mode == WorkspaceSaveMode::Incorrect && !correct)) { return true; } return false; } /// Returns true if the selected device can satisfy the given compute capability bool compute_capability(int major, int minor) const { return (device_properties.major > major || (device_properties.major == major && device_properties.minor >= minor)); } /// Requires an exact match of compute capability bool compute_capability_exact(int major, int minor) const { return major == device_properties.major && minor == device_properties.minor; } /// Prints version static void version(std::ostream &out) { out << "CUTLASS " << CUTLASS_MAJOR << "." << CUTLASS_MINOR << "." << CUTLASS_PATCH << " built on " << __DATE__ << " at " << __TIME__; } /// Prints the usage statement static void usage(std::ostream &out) { out << "cutlass_perf_test [options]\n\n" << " --help\n" << " --append= " << " If true, appends output to existing CSV file. If false, overwrites.\n" << " --alpha= " << " Value for alpha to be used in GEMM experiments\n" << " --beta= " << " Value for beta to be used in GEMM experiments\n" << " --device= " << " Specifies the CUDA device to use. Default is device 0.\n" << " --clock= " << " Specifies the SM clock rate in MHz.\n" << " --dist-{A,B,C}= " << " Describes the random distribution of each of the input matrix operands.\n" << " --dry-run= " << " If true, kernels are not executed and sleep is not inserted.\n" << " --execution-mode= " << " Specifies execution mode: profile, verify, single\n" << " --output= " << " Writes summary of profiling to specified .csv file\n" << " --threshold= " << " Reads previous output summary and re-executes the same configurations.\n" << " --iterations= " << " maximum number of iterations to execute when profiling\n" << " --m=[:max height[:step]] " << " Height of GEMM problem (number of rows of C). May specify a range with optional " "step size.\n" << " --n=[:max width[:step]] " << " Width of GEMM problem (number of columns of C). May specify a range with optional " "step size.\n" << " --k=[:max depth[:step]] " << " Size of inner dimension of A and B. May specify a range with optional step size.\n" << " --kernels=<{s|d|h|i|wmma_|wmma_binary_|wmma_integer_}gemm_{nn,nt,tn,tt}>\n" << " " << " Select GEMM datatype and layout to use for tests\n" << " --peak= " << " If true, only reports peak performance per kernel after profiling specified " "problem space.\n" << " --perf-margin= " << " Allowable performance degradation before flagging test as failure (e.g. 3% slowdown" " = 0.97).\n" << " --save-workspace={*never,incorrect,always} " << " Specifies when to save the GEMM inputs and results to the filesystem.\n" << " --seed= " << " Random seed used by the random number generator in initializing input matrices.\n" << " --tags= " << " Inserts leading columns in output table and uniform values for each column. Useful " "for generating pivot tables.\n" << " --sleep-time= " << " Sleep period between profiling kernels to cool down the device.\n" << " --version " << " "; version(out); out << "\n\n"; out << "\n\n" << "Example usage:\n\n" << "# Runs one problem size for all kernels\n" << "./tools/test/perf/cutlass_perf_test --m=10240 --n=1024 --k=1024\n\n" << "# Varies GEMM K dimension for SGEMM and IGEMM with column-major multiplicands\n" << "./tools/test/perf/cutlass_perf_test --m=10240 --n=4096 --k=1024:8192:128 " "--kernels=sgemm_nn,igemm_nn\n\n" << std::flush; } }; /////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace perf