
CUTLASS 2.4 (Implicit GEMM Convolution) Co-authored-by: Manish Gupta <manigupta@nvidia.com>, Haicheng Wu <haichengw@nvidia.com>, Dustyn Blasig <dblasig@nvidia.com>, Andrew Kerr <akerr@nvidia.com>
810 lines
27 KiB
Plaintext
810 lines
27 KiB
Plaintext
/***************************************************************************************************
|
|
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without modification, are permitted
|
|
* provided that the following conditions are met:
|
|
* * Redistributions of source code must retain the above copyright notice, this list of
|
|
* conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright notice, this list of
|
|
* conditions and the following disclaimer in the documentation and/or other materials
|
|
* provided with the distribution.
|
|
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
|
|
* to endorse or promote products derived from this software without specific prior written
|
|
* permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
|
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
|
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
|
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
|
|
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
|
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
**************************************************************************************************/
|
|
/* \file
|
|
\brief Command line options for performance test program
|
|
*/
|
|
|
|
#include <algorithm>
|
|
|
|
#include "cutlass/cutlass.h"
|
|
#include "cutlass/version.h"
|
|
|
|
#include "cutlass/library/util.h"
|
|
|
|
#include "options.h"
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
namespace cutlass {
|
|
namespace profiler {
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
/// Line break plus the leading indent that continuation lines of help text start with
static const char *end_of_line = "\n ";
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
/// Reads the --device ordinal, queries that device's properties, makes it the current
/// CUDA device, and then applies the optional --compute-capability and --llc-capacity
/// overrides to the cached properties.
///
/// Throws std::runtime_error if the properties cannot be queried or the device cannot
/// be selected.
Options::Device::Device(cutlass::CommandLine const &cmdline) {

  cmdline.get_cmd_line_argument("device", device, 0);

  if (cudaGetDeviceProperties(&properties, device) != cudaSuccess) {
    throw std::runtime_error("cudaGetDeviceProperties() failed for given device");
  }

  if (cudaSetDevice(device) != cudaSuccess) {
    throw std::runtime_error("cudaSetDevice() failed for given device.");
  }

  // Optional override of the compute capability reported by the device
  if (cmdline.check_cmd_line_flag("compute-capability")) {
    int const detected_cc = compute_capability();
    int requested_cc = detected_cc;

    // Falls back to the detected value when the flag carries no usable value
    cmdline.get_cmd_line_argument("compute-capability", requested_cc, detected_cc);

    // Split the two-digit form (e.g. 75) back into major and minor parts
    properties.major = requested_cc / 10;
    properties.minor = requested_cc % 10;
  }

  // Optional override of the L2 cache capacity
  if (cmdline.check_cmd_line_flag("llc-capacity")) {
    int capacity_kib = 0;
    cmdline.get_cmd_line_argument("llc-capacity", capacity_kib, 0);

    // Negative values are ignored; accepted values are scaled by 1024 before storing
    if (!(capacity_kib < 0)) {
      properties.l2CacheSize = (capacity_kib << 10);
    }
  }
}
|
|
|
|
void Options::Device::print_usage(std::ostream &out) const {
|
|
|
|
out << "Device:\n"
|
|
<< " --device=<int> "
|
|
<< " CUDA Device ID\n\n";
|
|
|
|
int device_count = 0;
|
|
cudaError_t result = cudaGetDeviceCount(&device_count);
|
|
|
|
if (result != cudaSuccess) {
|
|
out << " <could not query for CUDA devices>\n";
|
|
}
|
|
else {
|
|
|
|
for (int idx = 0; idx < device_count; ++idx) {
|
|
cudaDeviceProp prop;
|
|
result = cudaGetDeviceProperties(&prop, idx);
|
|
if (result != cudaSuccess) {
|
|
out << " <could not obtain device properties for device " << idx << ">" << std::endl;
|
|
break;
|
|
}
|
|
else {
|
|
out << " [" << idx << "] - "
|
|
<< prop.name << " - SM " << prop.major << "." << prop.minor << ", "
|
|
<< prop.multiProcessorCount << " SMs @ " << (prop.clockRate / 1000.0) << " MHz, "
|
|
<< "L2 cache: " << (prop.l2CacheSize >> 20) << " MB, Global Memory: " << (prop.totalGlobalMem >> 30) << " GB"
|
|
<< std::endl;
|
|
}
|
|
}
|
|
out << "\n";
|
|
}
|
|
|
|
out
|
|
<< " --compute-capability=<int> "
|
|
<< " Override the compute capability.\n\n"
|
|
|
|
<< " --llc-capacity=<capacity in KiB> "
|
|
<< " Capacity of last-level cache in kilobytes. If this is non-zero," << end_of_line
|
|
<< " profiling phases cycle through different input tensors to induce" << end_of_line
|
|
<< " capacity misses in the L2.\n\n";
|
|
|
|
}
|
|
|
|
void Options::Device::print_device_info(std::ostream &out) const {
|
|
int num_devices;
|
|
cudaDeviceProp props;
|
|
|
|
cudaError_t result;
|
|
result = cudaGetDeviceCount(&num_devices);
|
|
|
|
if (result != cudaSuccess) {
|
|
throw std::runtime_error("cudaGetNumDevices() failed");
|
|
}
|
|
|
|
out << "Device Name,SM,CUDA Device ID,Phy Device ID" << std::endl;
|
|
|
|
for(int device = 0; device < num_devices; device++) {
|
|
result = cudaSetDevice(device);
|
|
if (result != cudaSuccess) {
|
|
throw std::runtime_error("cudaSetDevice() failed for device");
|
|
}
|
|
|
|
result = cudaGetDeviceProperties(&props, device);
|
|
if (result != cudaSuccess) {
|
|
throw std::runtime_error("cudaGetDeviceProperties failed for device");
|
|
}
|
|
|
|
out << props.name << "," << props.major << props.minor << ","
|
|
<< device << "," << props.multiGpuBoardGroupID << std::endl;
|
|
|
|
}
|
|
}
|
|
|
|
/// Prints the selected device ordinal, its clock rate divided by 1000 (truncated to an
/// integer), and its compute capability, one "key: value" line each at the given indent.
void Options::Device::print_options(std::ostream &out, int indent) const {

  std::string const pad = indent_str(indent);

  int const clock = static_cast<int>(static_cast<double>(properties.clockRate) / 1000.0);

  out << pad << "device: " << device << "\n";
  out << pad << "clock: " << clock << "\n";
  out << pad << "compute-capability: " << compute_capability() << "\n";
}
|
|
|
|
/// Returns the compute capability of the listed device (e.g. 61, 60, 70, 75)
|
|
int Options::Device::compute_capability() const {
|
|
return properties.major * 10 + properties.minor;
|
|
}
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
/// Parses the initialization options: whether initialization is enabled, which provider
/// performs it, the random seed, and the data distribution.
///
/// Throws std::runtime_error when --initialization-provider names a valid provider that
/// is neither the host nor the device reference provider.
Options::Initialization::Initialization(cutlass::CommandLine const &cmdline) {

  cmdline.get_cmd_line_argument("initialization-enabled", enabled, true);

  if (!cmdline.check_cmd_line_flag("initialization-provider")) {
    // No provider requested: default to the device reference provider
    provider = library::Provider::kReferenceDevice;
  }
  else {
    std::string provider_name;
    cmdline.get_cmd_line_argument("initialization-provider", provider_name);
    provider = library::from_string<library::Provider>(provider_name);

    bool const is_reference_provider =
      provider == library::Provider::kReferenceHost ||
      provider == library::Provider::kReferenceDevice;

    if (provider == library::Provider::kInvalid) {
      // An unrecognized provider name switches initialization off rather than failing
      enabled = false;
    }
    else if (!is_reference_provider) {
      throw std::runtime_error("Unsupported intialization provider specified.");
    }
  }

  cmdline.get_cmd_line_argument("seed", seed, 2019);

  // A user-supplied distribution is marked as fixed; otherwise the profiler-chosen
  // distribution is allowed to change based on numeric types.
  fix_data_distribution = cmdline.check_cmd_line_flag("dist");

  if (fix_data_distribution) {
    // Use the distribution described on the command line
    get_distribution(cmdline, "dist", data_distribution);
  }
  else {
    // Default: uniform data distribution with range [-4, 4]
    data_distribution.set_uniform(-4, 4, 0);
  }
}
|
|
|
|
/// Gets the initial distribution
|
|
void Options::Initialization::get_distribution(
|
|
cutlass::CommandLine const &args,
|
|
std::string const &arg,
|
|
cutlass::Distribution &dist) {
|
|
|
|
struct {
|
|
const char *label;
|
|
cutlass::Distribution::Kind kind;
|
|
} distribution_kinds[] = {
|
|
{"uniform", cutlass::Distribution::Uniform},
|
|
{"gaussian", cutlass::Distribution::Gaussian},
|
|
{"identity", cutlass::Distribution::Identity},
|
|
{"sequential", cutlass::Distribution::Sequential},
|
|
{0, cutlass::Distribution::Invalid}
|
|
};
|
|
|
|
struct {
|
|
char const *label;
|
|
double *member;
|
|
} members[] = {
|
|
{"min", &dist.uniform.min},
|
|
{"max", &dist.uniform.max},
|
|
{"mean", &dist.gaussian.mean},
|
|
{"stddev", &dist.gaussian.stddev},
|
|
{"start", &dist.sequential.start},
|
|
{"delta", &dist.sequential.delta},
|
|
{0, 0}
|
|
};
|
|
|
|
using KeyValueVector = std::vector<std::pair<std::string, std::string> >;
|
|
|
|
KeyValueVector values;
|
|
args.get_cmd_line_argument_pairs(arg.c_str(), values);
|
|
|
|
// The parser expects the first token to be a string identifying the distribution type.
|
|
auto it = values.begin();
|
|
if (it != values.end()) {
|
|
for (int i = 0; distribution_kinds[i].label; ++i) {
|
|
if (it->first == distribution_kinds[i].label) {
|
|
dist.kind = distribution_kinds[i].kind;
|
|
break;
|
|
}
|
|
}
|
|
++it;
|
|
}
|
|
|
|
// Subsequent key-value pairs update the named field of the distribution struct.
|
|
for (; it != values.end(); ++it) {
|
|
// Integer scaling factor - if < 0, no integer rounding is performed.
|
|
if ((it->first.compare("scale") == 0) && !it->second.empty()) {
|
|
std::stringstream ss;
|
|
ss << it->second;
|
|
ss >> dist.int_scale;
|
|
continue; // next token
|
|
}
|
|
|
|
// Casts as integer without scaling
|
|
if (it->first.compare("integer") == 0) {
|
|
dist.int_scale = 0;
|
|
continue; // next token
|
|
}
|
|
|
|
// initialize other members
|
|
for (int m = 0; members[m].label; ++m) {
|
|
if (it->first == members[m].label && !it->second.empty()) {
|
|
std::stringstream ss;
|
|
ss << it->second;
|
|
ss >> *(members[m].member);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Prints the help text for the initialization options.
///
/// The enable flag is documented under the name the constructor actually parses
/// ("initialization-enabled"); the help previously advertised "--initialization".
void Options::Initialization::print_usage(std::ostream &out) const {

  out << "Initialization:\n"

    << " --initialization-enabled=<bool> "
    << " Enables initialization (default: true). If false, device memory is" << end_of_line
    << " not initialized after allocation.\n\n"

    << " --initialization-provider=<provider> "
    << " Selects initialization provider {host, device*}. (default: '*')\n\n"

    << " --dist=<distribution> "
    << " Data distribution of input tensors {uniform*, gaussian, identity, sequential}" << end_of_line
    << " --dist=uniform,min:<double>,max:<double>,scale:<integer>" << end_of_line
    << " --dist=gaussian,mean:<double>,stddev:<double>,scale:<integer>" << end_of_line
    << " --dist=sequential,start:<double>,delta:<double>,scale:<integer>" << end_of_line
    << " --dist=identity\n\n"

    << " --seed=<int> "
    << " Random number generator seed. Used to enforce deterministic" << end_of_line
    << " initialization.\n\n";
}
|
|
|
|
/// Emits nothing: no initialization settings are currently written to the options dump.
void Options::Initialization::print_options(std::ostream &out, int indent) const {
  // Both parameters are unused.
  static_cast<void>(out);
  static_cast<void>(indent);
}
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
/// Parses the library options: the algorithm selection mode and an optional explicit
/// list of algorithm indices.
///
/// Supplying --library-algos forces the mode to kBest. Tokens containing ':' are reserved
/// for range syntax, which is not implemented yet, and are skipped. Tokens that do not
/// parse as an integer are skipped as well.
Options::Library::Library(cutlass::CommandLine const &cmdline) {

  algorithm_mode = AlgorithmMode::kDefault;

  if (cmdline.check_cmd_line_flag("library-algo-mode")) {
    std::string mode = "default";
    cmdline.get_cmd_line_argument("library-algo-mode", mode);
    algorithm_mode = from_string<AlgorithmMode>(mode);
  }

  if (cmdline.check_cmd_line_flag("library-algos")) {

    // If algorithms are specified, override as kBest.
    algorithm_mode = AlgorithmMode::kBest;

    std::vector<std::string> tokens;
    cmdline.get_cmd_line_arguments("library-algos", tokens);

    algorithms.reserve(tokens.size());

    for (auto const & token : tokens) {

      // std::string::find() returns npos (a non-zero value) when nothing is found, so the
      // result must be compared against npos rather than tested for truthiness. The old
      // test discarded every plain integer token.
      if (token.find(':') != std::string::npos) {
        // todo - tokenized range
        continue;
      }

      int algo = 0;
      std::stringstream ss(token);

      // Only record tokens that actually parse as an integer
      if (ss >> algo) {
        algorithms.push_back(algo);
      }
    }
  }
}
|
|
|
|
/// Prints the help text for the library options.
///
/// The --library-algos description refers to the mode flag by the name the constructor
/// parses ("library-algo-mode"); it previously said "--algorithm-mode".
void Options::Library::print_usage(std::ostream &out) const {

  out << "Library:\n"

    << " --library-algo-mode=<mode> "
    << " Indicates algorithm mode used to call libraries such as cuBLAS and cuDNN.\n"
    << " "
    << " mode={default*,matching,best}\n\n"

    << " --library-algos=<range-list> "
    << " If --library-algo-mode=best, permits specifying a selection of algorithms.\n\n";
}
|
|
|
|
void Options::Library::print_options(std::ostream &out, int indent) const {
|
|
|
|
out
|
|
<< indent_str(indent) << "library-algo-mode: " << to_string(algorithm_mode) << "\n"
|
|
<< indent_str(indent) << "library-algos: ";
|
|
|
|
int j = 0;
|
|
for (int x : algorithms) {
|
|
out << (j++ ? "," : "") << x;
|
|
}
|
|
|
|
out << "\n\n";
|
|
}
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
/// Parses the profiling options: workspace count, warmup and profiling iteration counts,
/// sleep duration, the enable switch, and the list of providers to profile.
Options::Profiling::Profiling(cutlass::CommandLine const &cmdline) {

  cmdline.get_cmd_line_argument("workspace-count", workspace_count, 0);
  cmdline.get_cmd_line_argument("warmup-iterations", warmup_iterations, 10);
  cmdline.get_cmd_line_argument("profiling-iterations", iterations, 100);
  cmdline.get_cmd_line_argument("sleep-duration", sleep_duration, 50);
  cmdline.get_cmd_line_argument("profiling-enabled", enabled, true);

  if (!cmdline.check_cmd_line_flag("providers")) {
    // Default provider set when none is requested
    providers.push_back(library::Provider::kCUTLASS);
    providers.push_back(library::Provider::kCUBLAS);
    providers.push_back(library::Provider::kCUDNN);
    return;
  }

  std::vector<std::string> names;
  cmdline.get_cmd_line_arguments("providers", names);

  providers.clear();

  // Every token is converted and stored, whatever from_string() returns for it
  for (auto name_it = names.begin(); name_it != names.end(); ++name_it) {
    providers.push_back(library::from_string<library::Provider>(*name_it));
  }
}
|
|
|
|
/// Prints the help text for the profiling options, one statement per option.
void Options::Profiling::print_usage(std::ostream &out) const {

  out << "Profiling:\n";

  out << " --workspace-count=<workspace count> "
      << " Number of discrete workspaces maintained to avoid cache-resident " << end_of_line
      << " If zero (default), the amount is chosen for each workload based on " << end_of_line
      << " capacity of the last-level cache.\n\n";

  out << " --profiling-iterations=<iterations> "
      << " Number of iterations to profile each kernel. If zero, kernels" << end_of_line
      << " are launched up to the profiling duration.\n\n";

  out << " --warmup-iterations=<iterations> "
      << " Number of iterations to execute each kernel prior to profiling.\n\n";

  out << " --sleep-duration=<duration> "
      << " Number of ms to sleep between profiling periods (ms).\n\n";

  out << " --profiling-enabled=<bool> "
      << " If true, profiling is actually conducted.\n\n";

  out << " --providers=<providers> "
      << " List of providers to be profiled for performance. (default: '*')" << end_of_line
      << " Gemm providers {cutlass*, cublas*}" << end_of_line
      << " Conv2d providers {cutlass*, cudnn*}"
      << "\n\n";
}
|
|
|
|
void Options::Profiling::print_options(std::ostream &out, int indent) const {
|
|
|
|
out
|
|
<< indent_str(indent) << "profiling_iterations: " << iterations << "\n"
|
|
<< indent_str(indent) << "sleep_duration: " << sleep_duration << "\n"
|
|
<< indent_str(indent) << "profiling_enabled: " << enabled << "\n"
|
|
<< indent_str(indent) << "providers: [";
|
|
|
|
int j = 0;
|
|
for (auto const & provider : providers) {
|
|
out << (j++ ? ", " : "") << library::to_string(provider);
|
|
}
|
|
out << "]\n";
|
|
}
|
|
|
|
/// Returns true if a provider is enabled
|
|
bool Options::Profiling::provider_enabled(library::Provider provider) const {
|
|
return std::find(providers.begin(), providers.end(), provider) != providers.end();
|
|
}
|
|
|
|
/// Returns the index of a provider if its enabled
|
|
size_t Options::Profiling::index(library::Provider provider) const {
|
|
size_t idx = 0;
|
|
for (auto const & x : providers) {
|
|
if (x == provider) {
|
|
return idx;
|
|
}
|
|
++idx;
|
|
}
|
|
return idx;
|
|
}
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
/// Parses the verification options: the enable switch, error threshold, nonzero floor,
/// workspace saving policy, and the list of providers used to verify results.
Options::Verification::Verification(cutlass::CommandLine const &cmdline) {

  cmdline.get_cmd_line_argument("verification-enabled", enabled, true);
  cmdline.get_cmd_line_argument("epsilon", epsilon, 0.05);
  cmdline.get_cmd_line_argument("nonzero-floor", nonzero_floor, 1.0 / 256.0);

  // Workspaces are never saved unless a policy is given explicitly
  save_workspace = SaveWorkspace::kNever;
  if (cmdline.check_cmd_line_flag("save-workspace")) {
    std::string policy;
    cmdline.get_cmd_line_argument("save-workspace", policy);
    save_workspace = from_string<SaveWorkspace>(policy);
  }

  if (!cmdline.check_cmd_line_flag("verification-providers")) {
    // Default provider set when none is requested
    providers.push_back(library::Provider::kCUBLAS);
    providers.push_back(library::Provider::kReferenceDevice);
    providers.push_back(library::Provider::kCUDNN);
    return;
  }

  std::vector<std::string> names;
  cmdline.get_cmd_line_arguments("verification-providers", names);

  providers.clear();

  for (auto const &name : names) {
    library::Provider const parsed = library::from_string<library::Provider>(name);

    // Names that do not map to a provider are dropped
    if (parsed == library::Provider::kInvalid) {
      continue;
    }
    providers.push_back(parsed);
  }
}
|
|
|
|
void Options::Verification::print_usage(std::ostream &out) const {
|
|
|
|
out << "Verification:\n"
|
|
|
|
<< " --verification-enabled=<bool> "
|
|
<< " Whether to perform verification checks.\n\n"
|
|
|
|
<< " --epsilon=<error> "
|
|
<< " Error threshold. Setting to zero (default) requires" << end_of_line
|
|
<< " bit-level equivalence.\n\n"
|
|
|
|
<< " --nonzero-floor=<floor> "
|
|
<< " Results whose absolute value is less than this quantity" << end_of_line
|
|
<< " are treated as zero for comparisons.\n\n"
|
|
|
|
<< " --save-workspace=<string> "
|
|
<< " Specifies when to save the GEMM inputs and results to the filesystem." << end_of_line
|
|
<< " --save-workspace=never never save workspace (default)" << end_of_line
|
|
<< " --save-workspace=incorrect save workspace for incorrect results" << end_of_line
|
|
<< " --save-workspace=always always save workspace\n\n"
|
|
|
|
<< " --verification-providers=<providers> "
|
|
<< " List of providers used to verify result. (default: '*')" << end_of_line
|
|
<< " Gemm verification-providers {cublas*}" << end_of_line
|
|
<< " Conv2d verification-providers {cudnn*, device*, host}"
|
|
<< "\n\n";
|
|
}
|
|
|
|
void Options::Verification::print_options(std::ostream &out, int indent) const {
|
|
|
|
out
|
|
<< indent_str(indent) << "verification_enabled: " << enabled << "\n"
|
|
<< indent_str(indent) << "epsilon: " << epsilon << "\n"
|
|
<< indent_str(indent) << "save_workspace: " << to_string(save_workspace) << "\n"
|
|
<< indent_str(indent) << "verification_providers: [";
|
|
|
|
int j = 0;
|
|
for (auto const & provider : providers) {
|
|
out << (j++ ? ", " : "") << library::to_string(provider);
|
|
}
|
|
out << "]\n";
|
|
}
|
|
|
|
/// Returns true if a provider is enabled
|
|
bool Options::Verification::provider_enabled(library::Provider provider) const {
|
|
return std::find(providers.begin(), providers.end(), provider) != providers.end();
|
|
}
|
|
|
|
/// Returns the index of a provider if its enabled
|
|
size_t Options::Verification::index(library::Provider provider) const {
|
|
size_t idx = 0;
|
|
for (auto const & x : providers) {
|
|
if (x == provider) {
|
|
return idx;
|
|
}
|
|
++idx;
|
|
}
|
|
return idx;
|
|
}
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
/// Parses the reporting options: output paths, append mode, pivot tags, whether kernels
/// that were not run are reported, and verbosity.
Options::Report::Report(cutlass::CommandLine const &cmdline) {

  // Boolean switches and their defaults
  cmdline.get_cmd_line_argument("append", append, false);
  cmdline.get_cmd_line_argument("report-not-run", report_not_run, false);
  cmdline.get_cmd_line_argument("verbose", verbose, true);

  // Output destinations
  cmdline.get_cmd_line_argument("output", output_path);
  cmdline.get_cmd_line_argument("junit-output", junit_output_path);

  // Tags are parsed only when the flag is present
  if (cmdline.check_cmd_line_flag("tags")) {
    cmdline.get_cmd_line_argument_pairs("tags", pivot_tags);
  }
}
|
|
|
|
/// Prints the help text for the reporting options, one statement per option.
void Options::Report::print_usage(std::ostream &out) const {

  out << "Report:\n";

  out << " --append=<bool> "
      << " If true, result is appended to possibly existing file. Otherwise, " << end_of_line
      << " any existing file is overwritten.\n\n";

  out << " --output=<path> "
      << " Path to output file for machine readable results. Operation kind and '.csv' is appended.\n\n";

  out << " --junit-output=<path> "
      << " Path to junit output file for result reporting. Operation kind and '.junit.xml' is appended.\n\n";

  out << " --report-not-run=<bool> "
      << " If true, reports the status of all kernels including those that" << end_of_line
      << " do not satisfy the given arguments.\n\n";

  out << " --tags=<column:tag,...> "
      << " Inserts leading columns in output table and uniform values for each" << end_of_line
      << " column. Useful for generating pivot tables.\n\n";

  out << " --verbose=<bool> "
      << " Prints human-readable text to stdout. If false, nothing is written to stdout.\n\n";
}
|
|
|
|
void Options::Report::print_options(std::ostream &out, int indent) const {
|
|
|
|
out
|
|
<< indent_str(indent) << "append: " << append << "\n"
|
|
<< indent_str(indent) << "output: " << output_path << "\n"
|
|
<< indent_str(indent) << "junit-output: " << junit_output_path << "\n"
|
|
<< indent_str(indent) << "report_not_run: " << report_not_run << "\n"
|
|
<< indent_str(indent) << "tags:\n";
|
|
|
|
for (auto const & tag : pivot_tags) {
|
|
out << indent_str(indent + 1) << tag.first << ": " << tag.second << "\n";
|
|
}
|
|
|
|
out
|
|
<< indent_str(indent) << "verbose: " << verbose << "\n";
|
|
}
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
/// Records which of the informational flags (--help, --version, --device-info) are
/// present on the command line.
Options::About::About(cutlass::CommandLine const &cmdline) {
  bool const wants_help = cmdline.check_cmd_line_flag("help");
  bool const wants_version = cmdline.check_cmd_line_flag("version");
  bool const wants_device_info = cmdline.check_cmd_line_flag("device-info");

  help = wants_help;
  version = wants_version;
  device_info = wants_device_info;
}
|
|
|
|
/// Prints the "About" help section: the --version flag followed by the version banner.
void Options::About::print_usage(std::ostream &out) const {

  out << "About:\n";
  out << " --version ";

  // The banner doubles as the description of the flag
  print_version(out);

  out << "\n";
}
|
|
|
|
/// Writes the CUTLASS version string and the build date and time, followed by the git
/// revision when one is available. No trailing newline is written.
void Options::About::print_version(std::ostream &out) {

  out << "CUTLASS " << cutlass::getVersionString();
  out << " built on " << __DATE__ << " at " << __TIME__;

  auto const revision = cutlass::getGitRevision();
  if (!revision.empty()) {
    out << " with commit " << revision;
  }
}
|
|
|
|
void Options::About::print_options(std::ostream &out, int indent) const {
|
|
|
|
}
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
/// Builds the full option set from the command line: every option group is constructed
/// from `cmdline`, then the top-level options (execution mode, operation kind, kernel
/// name filters) are parsed.
Options::Options(cutlass::CommandLine const &cmdline):
  cmdline(cmdline),
  device(cmdline),
  initialization(cmdline),
  library(cmdline),
  profiling(cmdline),
  verification(cmdline),
  report(cmdline),
  about(cmdline) {

  // Fetches the string value attached to a flag
  auto const read_string = [&cmdline](char const *flag) {
    std::string value;
    cmdline.get_cmd_line_argument(flag, value);
    return value;
  };

  // Execution mode defaults to profiling
  execution_mode = cmdline.check_cmd_line_flag("mode")
    ? from_string<ExecutionMode>(read_string("mode"))
    : ExecutionMode::kProfile;

  // Enumerating kernels is equivalent to a dry run.
  if (execution_mode == ExecutionMode::kEnumerate) {
    execution_mode = ExecutionMode::kDryRun;
  }

  // "--operation" takes precedence over "--function"
  char const *operation_flag = nullptr;
  if (cmdline.check_cmd_line_flag("operation")) {
    operation_flag = "operation";
  }
  else if (cmdline.check_cmd_line_flag("function")) {
    operation_flag = "function";
  }

  operation_kind = operation_flag
    ? library::from_string<library::OperationKind>(read_string(operation_flag))
    : library::OperationKind::kInvalid;

  // "--operation_names" takes precedence over "--kernels"
  char const *names_flag = nullptr;
  if (cmdline.check_cmd_line_flag("operation_names")) {
    names_flag = "operation_names";
  }
  else if (cmdline.check_cmd_line_flag("kernels")) {
    names_flag = "kernels";
  }

  if (names_flag) {
    cmdline.get_cmd_line_arguments(names_flag, operation_names);
  }

  if (cmdline.check_cmd_line_flag("ignore-kernels")) {
    cmdline.get_cmd_line_arguments("ignore-kernels", excluded_operation_names);
  }

  // Prevent launches on the device for anything other than CUTLASS operation
  if (execution_mode == ExecutionMode::kTrace) {
    initialization.provider = library::Provider::kReferenceHost;
    verification.enabled = false;
    profiling.enabled = false;
  }
}
|
|
|
|
void Options::print_usage(std::ostream &out) const {
|
|
|
|
out
|
|
<< "CUTLASS Profiler\n"
|
|
<< "usage:\n\n"
|
|
<< " cutlass_profiler [options]\n\n"
|
|
<< " --help\n\n"
|
|
|
|
<< " --mode=<string> "
|
|
<< " Cutlass profiler execution mode." << end_of_line
|
|
<< " --mode=profile regular verification and profiling (default)" << end_of_line
|
|
<< " --mode=dry_run no kernels are launched or workspaces allocated" << end_of_line
|
|
<< " --mode=enumerate lists all operation kind and operations" << end_of_line
|
|
<< " --mode=trace executes a single device-side computation with" << end_of_line
|
|
<< " no other kernel launches\n\n"
|
|
|
|
<< " --device-info "
|
|
<< " Prints information on all GPUs present in the system\n\n"
|
|
|
|
<< " --operation=<operation_kind> "
|
|
<< " CUTLASS operation to profile.\n\n"
|
|
|
|
<< " --kernels=<string_list> "
|
|
<< " Filter operations by kernel names. For example, call all kernels with" << end_of_line
|
|
<< " (\"s1688\" and \"nt\") or (\"s844\" and \"tn\" and \"align8\") in their" << end_of_line
|
|
<< " operation name using --kernels=\"s1688*nt, s884*tn*align8\"\n\n"
|
|
|
|
<< " --ignore-kernels=<string_list> "
|
|
<< " Excludes kernels whose names match anything in this list.\n\n"
|
|
;
|
|
|
|
//
|
|
// Detailed options
|
|
//
|
|
|
|
device.print_usage(out);
|
|
out << "\n";
|
|
|
|
initialization.print_usage(out);
|
|
out << "\n";
|
|
|
|
library.print_usage(out);
|
|
out << "\n";
|
|
|
|
profiling.print_usage(out);
|
|
out << "\n";
|
|
|
|
verification.print_usage(out);
|
|
out << "\n";
|
|
|
|
report.print_usage(out);
|
|
out << "\n";
|
|
|
|
about.print_usage(out);
|
|
out << "\n";
|
|
}
|
|
|
|
/// Prints the effective options: the help flag and execution mode, then the device,
/// initialization, profiling, verification, and report groups, each under its own
/// heading.
void Options::print_options(std::ostream &out) const {

  out << "options:\n";
  out << " help: " << about.help << "\n";
  out << " mode: " << to_string(execution_mode) << "\n";

  // Each group prints its own fields at nesting level 2
  int const group_indent = 2;

  out << " device:\n";
  device.print_options(out, group_indent);

  out << " initialization:\n";
  initialization.print_options(out, group_indent);

  out << " profiling:\n";
  profiling.print_options(out, group_indent);

  out << " verification:\n";
  verification.print_options(out, group_indent);

  out << " report:\n";
  report.print_options(out, group_indent);
}
|
|
|
|
/// Returns a string of two spaces per indentation level.
///
/// Negative levels are treated as zero. Without the clamp, a negative `indent` converts
/// to a huge unsigned count and the std::string constructor throws.
std::string Options::indent_str(int indent) {
  int const level = std::max(indent, 0);
  return std::string(static_cast<std::string::size_type>(level) * 2, ' ');
}
|
|
|
|
/////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
} // namespace profiler
|
|
} // namespace cutlass
|