/*************************************************************************************************** * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: * * Redistributions of source code must retain the above copyright notice, this list of * conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright notice, this list of * conditions and the following disclaimer in the documentation and/or other materials * provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used * to endorse or promote products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ /* \file \brief Command line options for performance test program */ #include #include "cutlass/cutlass.h" #include "cutlass/version.h" #include "cutlass/library/util.h" #include "options.h" ///////////////////////////////////////////////////////////////////////////////////////////////// namespace cutlass { namespace profiler { ///////////////////////////////////////////////////////////////////////////////////////////////// /// Newline and indent for help strings static char const *end_of_line = "\n "; ///////////////////////////////////////////////////////////////////////////////////////////////// Options::Device::Device(cutlass::CommandLine const &cmdline) { cmdline.get_cmd_line_argument("device", device, 0); cudaError_t result; result = cudaGetDeviceProperties(&properties, device); if (result != cudaSuccess) { throw std::runtime_error("cudaGetDeviceProperties() failed for given device"); } result = cudaSetDevice(device); if (result != cudaSuccess) { throw std::runtime_error("cudaSetDevice() failed for given device."); } // Permit overriding the compute capability if (cmdline.check_cmd_line_flag("compute-capability")) { int cc = compute_capability(); cmdline.get_cmd_line_argument("compute-capability", cc, cc); properties.major = cc / 10; properties.minor = cc % 10; } // Permit overriding the L2 cache capacity if (cmdline.check_cmd_line_flag("llc-capacity")) { int llc_capacity = 0; cmdline.get_cmd_line_argument("llc-capacity", llc_capacity, 0); if (llc_capacity >= 0) { properties.l2CacheSize = (llc_capacity << 10); } } } void Options::Device::print_usage(std::ostream &out) const { out << "Device:\n" << " --device= " << " CUDA Device ID\n\n"; int device_count = 0; cudaError_t result = cudaGetDeviceCount(&device_count); if (result != cudaSuccess) { out << " \n"; } else { for (int idx = 0; idx < device_count; ++idx) { cudaDeviceProp prop; result = cudaGetDeviceProperties(&prop, idx); if (result != cudaSuccess) { out << " " << std::endl; break; } else { out << " [" << idx << "] - " << prop.name << " - SM " << prop.major << "." << prop.minor << ", " << prop.multiProcessorCount << " SMs @ " << (prop.clockRate / 1000.0) << " MHz, " << "L2 cache: " << (prop.l2CacheSize >> 20) << " MB, Global Memory: " << (prop.totalGlobalMem >> 30) << " GB" << std::endl; } } out << "\n"; } out << " --compute-capability= " << " Override the compute capability.\n\n" << " --llc-capacity= " << " Capacity of last-level cache in kilobytes. If this is non-zero," << end_of_line << " profiling phases cycle through different input tensors to induce" << end_of_line << " capacity misses in the L2.\n\n"; } void Options::Device::print_device_info(std::ostream &out) const { int num_devices; cudaDeviceProp props; cudaError_t result; result = cudaGetDeviceCount(&num_devices); if (result != cudaSuccess) { throw std::runtime_error("cudaGetNumDevices() failed"); } out << "Device Name,SM,CUDA Device ID,Phy Device ID" << std::endl; for(int device = 0; device < num_devices; device++) { result = cudaSetDevice(device); if (result != cudaSuccess) { throw std::runtime_error("cudaSetDevice() failed for device"); } result = cudaGetDeviceProperties(&props, device); if (result != cudaSuccess) { throw std::runtime_error("cudaGetDeviceProperties failed for device"); } out << props.name << "," << props.major << props.minor << "," << device << "," << props.multiGpuBoardGroupID << std::endl; } } void Options::Device::print_options(std::ostream &out, int indent) const { out << indent_str(indent) << "device: " << device << "\n" << indent_str(indent) << "clock: " << int(double(properties.clockRate) / 1000.0) << "\n" << indent_str(indent) << "compute-capability: " << compute_capability() << "\n"; } /// Returns the compute capability of the listed device (e.g. 61, 60, 70, 75) int Options::Device::compute_capability() const { return properties.major * 10 + properties.minor; } ///////////////////////////////////////////////////////////////////////////////////////////////// Options::Initialization::Initialization(cutlass::CommandLine const &cmdline) { cmdline.get_cmd_line_argument("initialization-enabled", enabled, true); if (cmdline.check_cmd_line_flag("initialization-provider")) { std::string str; cmdline.get_cmd_line_argument("initialization-provider", str); provider = library::from_string(str); if (provider == library::Provider::kInvalid) { enabled = false; } else if (provider != library::Provider::kReferenceHost && provider != library::Provider::kReferenceDevice) { throw std::runtime_error("Unsupported intialization provider specified."); } } else { provider = library::Provider::kReferenceDevice; } cmdline.get_cmd_line_argument("seed", seed, 2019); if (cmdline.check_cmd_line_flag("dist")) { // user has set the data distribution (fix data distribution once set) fix_data_distribution = true; // set user provided data distribution get_distribution(cmdline, "dist", data_distribution); } else { // profiler choosen data distribution (allowed to change based on numeric types) fix_data_distribution = false; // set uniform data distribution with range [-4, 4] data_distribution.set_uniform(-4, 4, 0); } } /// Gets the initial distribution void Options::Initialization::get_distribution( cutlass::CommandLine const &args, std::string const &arg, cutlass::Distribution &dist) { struct { const char *label; cutlass::Distribution::Kind kind; } distribution_kinds[] = { {"uniform", cutlass::Distribution::Uniform}, {"gaussian", cutlass::Distribution::Gaussian}, {"identity", cutlass::Distribution::Identity}, {"sequential", cutlass::Distribution::Sequential}, {0, cutlass::Distribution::Invalid} }; struct { char const *label; double *member; } members[] = { {"min", &dist.uniform.min}, {"max", &dist.uniform.max}, {"mean", &dist.gaussian.mean}, {"stddev", &dist.gaussian.stddev}, {"start", &dist.sequential.start}, {"delta", &dist.sequential.delta}, {0, 0} }; using KeyValueVector = std::vector >; KeyValueVector values; args.get_cmd_line_argument_pairs(arg.c_str(), values); // The parser expects the first token to be a string identifying the distribution type. auto it = values.begin(); if (it != values.end()) { for (int i = 0; distribution_kinds[i].label; ++i) { if (it->first == distribution_kinds[i].label) { dist.kind = distribution_kinds[i].kind; break; } } ++it; } // Subsequent key-value pairs update the named field of the distribution struct. for (; it != values.end(); ++it) { // Integer scaling factor - if < 0, no integer rounding is performed. if ((it->first.compare("scale") == 0) && !it->second.empty()) { std::stringstream ss; ss << it->second; ss >> dist.int_scale; continue; // next token } // Casts as integer without scaling if (it->first.compare("integer") == 0) { dist.int_scale = 0; continue; // next token } // initialize other members for (int m = 0; members[m].label; ++m) { if (it->first == members[m].label && !it->second.empty()) { std::stringstream ss; ss << it->second; ss >> *(members[m].member); } } } } void Options::Initialization::print_usage(std::ostream &out) const { out << "Initialization:\n" << " --initialization= " << " Enables initialization (default: true). If false, device memory is" << end_of_line << " not initialized after allocation.\n\n" << " --initialization-provider= " << " Selects initialization provider {host, device*}. (default: '*')\n\n" << " --dist= " << " Data distribution of input tensors {uniform*, gaussian, identity, sequential}" << end_of_line << " --dist=uniform,min:,max:,scale:" << end_of_line << " --dist=gaussian,mean:,stddev:,scale:" << end_of_line << " --dist=sequential,start:,delta:,scale:" << end_of_line << " --dist=identity\n\n" << " --seed= " << " Random number generator seed. Used to enforce deterministic" << end_of_line << " initialization.\n\n"; } void Options::Initialization::print_options(std::ostream &out, int indent) const { } ///////////////////////////////////////////////////////////////////////////////////////////////// Options::Library::Library(cutlass::CommandLine const &cmdline) { algorithm_mode = AlgorithmMode::kDefault; if (cmdline.check_cmd_line_flag("library-algo-mode")) { std::string mode = "default"; cmdline.get_cmd_line_argument("library-algo-mode", mode); algorithm_mode = from_string(mode); } if (cmdline.check_cmd_line_flag("library-algos")) { // If algorithms are specified, override as kBest. algorithm_mode = AlgorithmMode::kBest; std::vector tokens; cmdline.get_cmd_line_arguments("library-algos", tokens); algorithms.reserve(tokens.size()); for (auto const & token : tokens) { if (token.find(":")) { // todo - tokenized range } else { int algo; std::stringstream ss; ss << token; ss >> algo; algorithms.push_back(algo); } } } } void Options::Library::print_usage(std::ostream &out) const { out << "Library:\n" << " --library-algo-mode= " << " Indicates algorithm mode used to call libraries such as cuBLAS and cuDNN.\n" << " " << " mode={default*,matching,best}\n\n" << " --library-algos= " << " If --algorithm-mode=best, permits specifying a selection of algorithms.\n\n"; } void Options::Library::print_options(std::ostream &out, int indent) const { out << indent_str(indent) << "library-algo-mode: " << to_string(algorithm_mode) << "\n" << indent_str(indent) << "library-algos: "; int j = 0; for (int x : algorithms) { out << (j++ ? "," : "") << x; } out << "\n\n"; } ///////////////////////////////////////////////////////////////////////////////////////////////// Options::Profiling::Profiling(cutlass::CommandLine const &cmdline) { cmdline.get_cmd_line_argument("workspace-count", workspace_count, 0); cmdline.get_cmd_line_argument("warmup-iterations", warmup_iterations, 10); cmdline.get_cmd_line_argument("profiling-iterations", iterations, 100); cmdline.get_cmd_line_argument("sleep-duration", sleep_duration, 50); cmdline.get_cmd_line_argument("profiling-enabled", enabled, true); if (cmdline.check_cmd_line_flag("providers")) { std::vector tokens; cmdline.get_cmd_line_arguments("providers", tokens); providers.clear(); for (auto const &token : tokens) { providers.push_back(library::from_string(token)); } } else { providers.push_back(library::Provider::kCUTLASS); providers.push_back(library::Provider::kCUBLAS); providers.push_back(library::Provider::kCUDNN); } } void Options::Profiling::print_usage(std::ostream &out) const { out << "Profiling:\n" << " --workspace-count= " << " Number of discrete workspaces maintained to avoid cache-resident " << end_of_line << " If zero (default), the amount is chosen for each workload based on " << end_of_line << " capacity of the last-level cache.\n\n" << " --profiling-iterations= " << " Number of iterations to profile each kernel. If zero, kernels" << end_of_line << " are launched up to the profiling duration.\n\n" << " --warmup-iterations= " << " Number of iterations to execute each kernel prior to profiling.\n\n" << " --sleep-duration= " << " Number of ms to sleep between profiling periods (ms).\n\n" << " --profiling-enabled= " << " If true, profiling is actually conducted.\n\n" << " --providers= " << " List of providers to be profiled for performance. (default: '*')" << end_of_line << " Gemm providers {cutlass*, cublas*}" << end_of_line << " Conv2d providers {cutlass*, cudnn*}" << "\n\n"; } void Options::Profiling::print_options(std::ostream &out, int indent) const { out << indent_str(indent) << "profiling_iterations: " << iterations << "\n" << indent_str(indent) << "sleep_duration: " << sleep_duration << "\n" << indent_str(indent) << "profiling_enabled: " << enabled << "\n" << indent_str(indent) << "providers: ["; int j = 0; for (auto const & provider : providers) { out << (j++ ? ", " : "") << library::to_string(provider); } out << "]\n"; } /// Returns true if a provider is enabled bool Options::Profiling::provider_enabled(library::Provider provider) const { return std::find(providers.begin(), providers.end(), provider) != providers.end(); } /// Returns the index of a provider if its enabled size_t Options::Profiling::index(library::Provider provider) const { size_t idx = 0; for (auto const & x : providers) { if (x == provider) { return idx; } ++idx; } return idx; } ///////////////////////////////////////////////////////////////////////////////////////////////// Options::Verification::Verification(cutlass::CommandLine const &cmdline) { cmdline.get_cmd_line_argument("verification-enabled", enabled, true); cmdline.get_cmd_line_argument("epsilon", epsilon, 0.05); cmdline.get_cmd_line_argument("nonzero-floor", nonzero_floor, 1.0 / 256.0); if (cmdline.check_cmd_line_flag("save-workspace")) { std::string value; cmdline.get_cmd_line_argument("save-workspace", value); save_workspace = from_string(value); } else { save_workspace = SaveWorkspace::kNever; } if (cmdline.check_cmd_line_flag("verification-providers")) { std::vector tokens; cmdline.get_cmd_line_arguments("verification-providers", tokens); providers.clear(); for (auto const &token : tokens) { library::Provider provider = library::from_string(token); if (provider != library::Provider::kInvalid) { providers.push_back(provider); } } } else { providers.push_back(library::Provider::kCUBLAS); providers.push_back(library::Provider::kReferenceDevice); providers.push_back(library::Provider::kCUDNN); } } void Options::Verification::print_usage(std::ostream &out) const { out << "Verification:\n" << " --verification-enabled= " << " Whether to perform verification checks.\n\n" << " --epsilon= " << " Error threshold. Setting to zero (default) requires" << end_of_line << " bit-level equivalence.\n\n" << " --nonzero-floor= " << " Results whose absolute value is less than this quantity" << end_of_line << " are treated as zero for comparisons.\n\n" << " --save-workspace= " << " Specifies when to save the GEMM inputs and results to the filesystem." << end_of_line << " --save-workspace=never never save workspace (default)" << end_of_line << " --save-workspace=incorrect save workspace for incorrect results" << end_of_line << " --save-workspace=always always save workspace\n\n" << " --verification-providers= " << " List of providers used to verify result. (default: '*')" << end_of_line << " Gemm verification-providers {cublas*}" << end_of_line << " Conv2d verification-providers {cudnn*, device*, host}" << "\n\n"; } void Options::Verification::print_options(std::ostream &out, int indent) const { out << indent_str(indent) << "verification_enabled: " << enabled << "\n" << indent_str(indent) << "epsilon: " << epsilon << "\n" << indent_str(indent) << "save_workspace: " << to_string(save_workspace) << "\n" << indent_str(indent) << "verification_providers: ["; int j = 0; for (auto const & provider : providers) { out << (j++ ? ", " : "") << library::to_string(provider); } out << "]\n"; } /// Returns true if a provider is enabled bool Options::Verification::provider_enabled(library::Provider provider) const { return std::find(providers.begin(), providers.end(), provider) != providers.end(); } /// Returns the index of a provider if its enabled size_t Options::Verification::index(library::Provider provider) const { size_t idx = 0; for (auto const & x : providers) { if (x == provider) { return idx; } ++idx; } return idx; } ///////////////////////////////////////////////////////////////////////////////////////////////// Options::Report::Report(cutlass::CommandLine const &cmdline) { cmdline.get_cmd_line_argument("append", append, false); cmdline.get_cmd_line_argument("output", output_path); cmdline.get_cmd_line_argument("junit-output", junit_output_path); if (cmdline.check_cmd_line_flag("tags")) { cmdline.get_cmd_line_argument_pairs("tags", pivot_tags); } cmdline.get_cmd_line_argument("report-not-run", report_not_run, false); cmdline.get_cmd_line_argument("verbose", verbose, true); } void Options::Report::print_usage(std::ostream &out) const { out << "Report:\n" << " --append= " << " If true, result is appended to possibly existing file. Otherwise, " << end_of_line << " any existing file is overwritten.\n\n" << " --output= " << " Path to output file for machine readable results. Operation kind and '.csv' is appended.\n\n" << " --junit-output= " << " Path to junit output file for result reporting. Operation kind and '.junit.xml' is appended.\n\n" << " --report-not-run= " << " If true, reports the status of all kernels including those that" << end_of_line << " do not satisfy the given arguments.\n\n" << " --tags= " << " Inserts leading columns in output table and uniform values for each" << end_of_line << " column. Useful for generating pivot tables.\n\n" << " --verbose= " << " Prints human-readable text to stdout. If false, nothing is written to stdout.\n\n"; } void Options::Report::print_options(std::ostream &out, int indent) const { out << indent_str(indent) << "append: " << append << "\n" << indent_str(indent) << "output: " << output_path << "\n" << indent_str(indent) << "junit-output: " << junit_output_path << "\n" << indent_str(indent) << "report_not_run: " << report_not_run << "\n" << indent_str(indent) << "tags:\n"; for (auto const & tag : pivot_tags) { out << indent_str(indent + 1) << tag.first << ": " << tag.second << "\n"; } out << indent_str(indent) << "verbose: " << verbose << "\n"; } ///////////////////////////////////////////////////////////////////////////////////////////////// Options::About::About(cutlass::CommandLine const &cmdline) { help = cmdline.check_cmd_line_flag("help"); version = cmdline.check_cmd_line_flag("version"); device_info = cmdline.check_cmd_line_flag("device-info"); } void Options::About::print_usage(std::ostream &out) const { out << "About:\n" << " --version "; print_version(out); out << "\n"; } void Options::About::print_version(std::ostream &out) { out << "CUTLASS " << cutlass::getVersionString() << " built on " << __DATE__ << " at " << __TIME__; if (!cutlass::getGitRevision().empty()) out << " with commit " << cutlass::getGitRevision() << ""; } void Options::About::print_options(std::ostream &out, int indent) const { } ///////////////////////////////////////////////////////////////////////////////////////////////// Options::Options(cutlass::CommandLine const &cmdline): cmdline(cmdline), device(cmdline), initialization(cmdline), library(cmdline), profiling(cmdline), verification(cmdline), report(cmdline), about(cmdline) { if (cmdline.check_cmd_line_flag("mode")) { std::string token; cmdline.get_cmd_line_argument("mode", token); execution_mode = from_string(token); } else { execution_mode = ExecutionMode::kProfile; } // Enumerating kernels is equivalent to a dry run. if (execution_mode == ExecutionMode::kEnumerate) { execution_mode = ExecutionMode::kDryRun; } if (cmdline.check_cmd_line_flag("operation")) { std::string str; cmdline.get_cmd_line_argument("operation", str); operation_kind = library::from_string(str); } else if (cmdline.check_cmd_line_flag("function")) { std::string str; cmdline.get_cmd_line_argument("function", str); operation_kind = library::from_string(str); } else { operation_kind = library::OperationKind::kInvalid; } if (cmdline.check_cmd_line_flag("operation_names")) { cmdline.get_cmd_line_arguments("operation_names", operation_names); } else if (cmdline.check_cmd_line_flag("kernels")) { cmdline.get_cmd_line_arguments("kernels", operation_names); } if (cmdline.check_cmd_line_flag("ignore-kernels")) { cmdline.get_cmd_line_arguments("ignore-kernels", excluded_operation_names); } // Prevent launches on the device for anything other than CUTLASS operation if (execution_mode == ExecutionMode::kTrace) { initialization.provider = library::Provider::kReferenceHost; verification.enabled = false; profiling.enabled = false; } } void Options::print_usage(std::ostream &out) const { out << "CUTLASS Profiler\n" << "usage:\n\n" << " cutlass_profiler [options]\n\n" << " --help\n\n" << " --mode= " << " Cutlass profiler execution mode." << end_of_line << " --mode=profile regular verification and profiling (default)" << end_of_line << " --mode=dry_run no kernels are launched or workspaces allocated" << end_of_line << " --mode=enumerate lists all operation kind and operations" << end_of_line << " --mode=trace executes a single device-side computation with" << end_of_line << " no other kernel launches\n\n" << " --device-info " << " Prints information on all GPUs present in the system\n\n" << " --operation= " << " CUTLASS operation to profile.\n\n" << " --kernels= " << " Filter operations by kernel names. For example, call all kernels with" << end_of_line << " (\"s1688\" and \"nt\") or (\"s844\" and \"tn\" and \"align8\") in their" << end_of_line << " operation name using --kernels=\"s1688*nt, s884*tn*align8\"\n\n" << " --ignore-kernels= " << " Excludes kernels whose names match anything in this list.\n\n" ; // // Detailed options // device.print_usage(out); out << "\n"; initialization.print_usage(out); out << "\n"; library.print_usage(out); out << "\n"; profiling.print_usage(out); out << "\n"; verification.print_usage(out); out << "\n"; report.print_usage(out); out << "\n"; about.print_usage(out); out << "\n"; } void Options::print_options(std::ostream &out) const { out << "options:\n" << " help: " << about.help << "\n" << " mode: " << to_string(execution_mode) << "\n"; out << " device:\n"; device.print_options(out, 2); out << " initialization:\n"; initialization.print_options(out, 2); out << " profiling:\n"; profiling.print_options(out, 2); out << " verification:\n"; verification.print_options(out, 2); out << " report:\n"; report.print_options(out, 2); } std::string Options::indent_str(int indent) { return std::string(indent * 2, ' '); } ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace profiler } // namespace cutlass