// cutlass/tools/profiler/src/options.cu

/***************************************************************************************************
 * Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/* \file
   \brief Command line options for performance test program
*/

#include <algorithm>

#include "cutlass/cutlass.h"
#include "cutlass/version.h"

#include "cutlass/library/util.h"

#include "options.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace profiler {

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Newline followed by a run of spaces. Streamed between the lines of a multi-line help
/// description so that continuation lines are indented past the option-name column.
static char const *end_of_line = "\n                                             ";

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Reads the device section of the command line.
///
/// Selects the CUDA device given by --device (default 0), caches its properties, makes it the
/// current device, and then applies the optional --compute-capability and --llc-capacity
/// overrides to the cached properties.
///
/// Throws std::runtime_error if the properties cannot be queried or the device cannot be set.
Options::Device::Device(cutlass::CommandLine const &cmdline) {

  cmdline.get_cmd_line_argument("device", device, 0);

  if (cudaGetDeviceProperties(&properties, device) != cudaSuccess) {
    throw std::runtime_error("cudaGetDeviceProperties() failed for given device");
  }

  if (cudaSetDevice(device) != cudaSuccess) {
    throw std::runtime_error("cudaSetDevice() failed for given device.");
  }

  // Optional override of the compute capability, given as a two-digit style integer
  // (major * 10 + minor). The queried value is used when no value accompanies the flag.
  if (cmdline.check_cmd_line_flag("compute-capability")) {
    int const queried_cc = compute_capability();
    int requested_cc = queried_cc;
    cmdline.get_cmd_line_argument("compute-capability", requested_cc, queried_cc);
    properties.major = requested_cc / 10;
    properties.minor = requested_cc % 10;
  }

  // Optional override of the L2 cache capacity. The argument is shifted left by 10 bits
  // before being stored in l2CacheSize; negative arguments are ignored.
  if (cmdline.check_cmd_line_flag("llc-capacity")) {
    int capacity_kib = 0;
    cmdline.get_cmd_line_argument("llc-capacity", capacity_kib, 0);

    if (!(capacity_kib < 0)) {
      properties.l2CacheSize = (capacity_kib << 10);
    }
  }
}

void Options::Device::print_usage(std::ostream &out) const {

  out << "Device:\n"
    << "  --device=<int>                               "
    << "    CUDA Device ID\n\n";

  int device_count = 0;
  cudaError_t result = cudaGetDeviceCount(&device_count);

  if (result != cudaSuccess) {
    out << "      <could not query for CUDA devices>\n";
  }
  else {

    for (int idx = 0; idx < device_count; ++idx) {
      cudaDeviceProp prop;
      result = cudaGetDeviceProperties(&prop, idx);
      if (result != cudaSuccess) {
        out << "      <could not obtain device properties for device " << idx << ">" << std::endl;
        break;
      }
      else {
        out << "    [" << idx << "] - "
          << prop.name << " - SM " << prop.major << "." << prop.minor << ", "
          << prop.multiProcessorCount << " SMs @ " << (prop.clockRate / 1000.0) << " MHz, "
          << "L2 cache: " << (prop.l2CacheSize >> 20) << " MB, Global Memory: " << (prop.totalGlobalMem >> 30) << " GB"
          << std::endl;
      }
    }
    out << "\n";
  }

  out
    << "  --compute-capability=<int>                   "
    << "    Override the compute capability.\n\n"

    << "  --llc-capacity=<capacity in KiB>             "
    << "    Capacity of last-level cache in kilobytes. If this is non-zero," << end_of_line
    << "      profiling phases cycle through different input tensors to induce" << end_of_line
    << "      capacity misses in the L2.\n\n";

}

/// Writes a CSV table to `out` with one row per CUDA device: device name, SM version
/// (major and minor digits concatenated), CUDA device ID, and multi-GPU board group ID.
///
/// Each device is made current via cudaSetDevice() before its properties are queried, so the
/// current device on return is the last one enumerated.
///
/// Throws std::runtime_error if the device count cannot be queried, or if any device cannot
/// be selected or queried.
void Options::Device::print_device_info(std::ostream &out) const {
  int num_devices = 0;
  cudaDeviceProp props;

  cudaError_t result;
  result = cudaGetDeviceCount(&num_devices);

  if (result != cudaSuccess) {
    // Name the API that actually failed; there is no cudaGetNumDevices() in the CUDA runtime.
    throw std::runtime_error("cudaGetDeviceCount() failed");
  }

  out << "Device Name,SM,CUDA Device ID,Phy Device ID" << std::endl;

  for(int device = 0; device < num_devices; device++) {
    result = cudaSetDevice(device);
    if (result != cudaSuccess) {
      throw std::runtime_error("cudaSetDevice() failed for device");
    }

    result = cudaGetDeviceProperties(&props, device);
    if (result != cudaSuccess) {
      throw std::runtime_error("cudaGetDeviceProperties failed for device");
    }

    out << props.name << "," << props.major << props.minor << ","
      << device << "," << props.multiGpuBoardGroupID << std::endl;

  }
}

/// Writes the selected device ID, its clock rate divided by 1000 (truncated to an integer),
/// and its compute capability to `out`, one indented "key: value" line each.
void Options::Device::print_options(std::ostream &out, int indent) const {

  std::string const padding = indent_str(indent);
  int const clock = int(double(properties.clockRate) / 1000.0);

  out << padding << "device: " << device << "\n";
  out << padding << "clock: " << clock << "\n";
  out << padding << "compute-capability: " << compute_capability() << "\n";
}

/// Returns the compute capability of the selected device as a single integer formed from the
/// cached properties: major * 10 + minor (e.g. 61, 60, 70, 75).
int Options::Device::compute_capability() const {
  int const tens = properties.major * 10;
  int const ones = properties.minor;
  return tens + ones;
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Reads the initialization section of the command line: whether initialization is enabled,
/// which provider performs it, the random seed, and the data distribution.
///
/// Throws std::runtime_error if --initialization-provider names a valid provider that is
/// neither the host nor the device reference provider.
Options::Initialization::Initialization(cutlass::CommandLine const &cmdline) {

  cmdline.get_cmd_line_argument("initialization-enabled", enabled, true);

  if (!cmdline.check_cmd_line_flag("initialization-provider")) {
    // No provider requested: initialize with the device reference provider.
    provider = library::Provider::kReferenceDevice;
  }
  else {
    std::string provider_name;
    cmdline.get_cmd_line_argument("initialization-provider", provider_name);
    provider = library::from_string<library::Provider>(provider_name);

    bool const is_reference_provider =
      (provider == library::Provider::kReferenceHost) ||
      (provider == library::Provider::kReferenceDevice);

    if (provider == library::Provider::kInvalid) {
      // An unrecognized provider name turns initialization off rather than raising an error.
      enabled = false;
    }
    else if (!is_reference_provider) {
      throw std::runtime_error("Unsupported intialization provider specified.");
    }
  }

  cmdline.get_cmd_line_argument("seed", seed, 2019);

  // A distribution given with --dist is treated as fixed; otherwise the profiler starts from
  // a default that it is allowed to change based on numeric types.
  fix_data_distribution = cmdline.check_cmd_line_flag("dist");

  if (fix_data_distribution) {
    get_distribution(cmdline, "dist", data_distribution);
  }
  else {
    // Default: uniform distribution over [-4, 4].
    data_distribution.set_uniform(-4, 4, 0);
  }
}

/// Parses a distribution description from command line argument `arg` into `dist`.
///
/// The argument is read as a list of key:value pairs. The key of the first pair selects the
/// distribution kind (uniform, gaussian, identity, sequential); if it matches none of these,
/// dist.kind is left unchanged. Each later pair updates one field of `dist`:
///   scale:<int>    sets dist.int_scale
///   integer        sets dist.int_scale to zero
///   min, max, mean, stddev, start, delta   set the like-named distribution parameter
/// Pairs with an unrecognized key, or a recognized key with an empty value, are ignored.
void Options::Initialization::get_distribution(
  cutlass::CommandLine const &args,
  std::string const &arg,
  cutlass::Distribution &dist) {

  struct KindEntry {
    char const *label;
    cutlass::Distribution::Kind kind;
  };

  KindEntry const kind_table[] = {
    {"uniform", cutlass::Distribution::Uniform},
    {"gaussian", cutlass::Distribution::Gaussian},
    {"identity", cutlass::Distribution::Identity},
    {"sequential", cutlass::Distribution::Sequential}
  };

  struct FieldEntry {
    char const *label;
    double *member;
  };

  FieldEntry const field_table[] = {
    {"min", &dist.uniform.min},
    {"max", &dist.uniform.max},
    {"mean", &dist.gaussian.mean},
    {"stddev", &dist.gaussian.stddev},
    {"start", &dist.sequential.start},
    {"delta", &dist.sequential.delta}
  };

  std::vector<std::pair<std::string, std::string> > tokens;
  args.get_cmd_line_argument_pairs(arg.c_str(), tokens);

  if (tokens.empty()) {
    return;
  }

  // The first token names the distribution kind.
  auto token = tokens.begin();
  for (KindEntry const &entry : kind_table) {
    if (token->first == entry.label) {
      dist.kind = entry.kind;
      break;
    }
  }

  // Every remaining token updates one field of the distribution.
  for (++token; token != tokens.end(); ++token) {
    std::string const &key = token->first;
    std::string const &value = token->second;

    if (key == "scale" && !value.empty()) {
      // Integer scaling factor - if < 0, no integer rounding is performed.
      std::stringstream parser;
      parser << value;
      parser >> dist.int_scale;
    }
    else if (key == "integer") {
      // Casts as integer without scaling.
      dist.int_scale = 0;
    }
    else {
      for (FieldEntry const &field : field_table) {
        if (key == field.label && !value.empty()) {
          std::stringstream parser;
          parser << value;
          parser >> *(field.member);
        }
      }
    }
  }
}

/// Writes the help text for the initialization options to `out`.
void Options::Initialization::print_usage(std::ostream &out) const {

  out << "Initialization:\n"

    // The flag is documented under the name the constructor actually parses
    // ("initialization-enabled"), so that following the help text has an effect.
    << "  --initialization-enabled=<bool>              "
    << "    Enables initialization (default: true). If false, device memory is" << end_of_line
    << "      not initialized after allocation.\n\n"

    << "  --initialization-provider=<provider>         "
    << "    Selects initialization provider {host, device*}. (default: '*')\n\n"

    << "  --dist=<distribution>                        "
    << "    Data distribution of input tensors {uniform*, gaussian, identity, sequential}"  << end_of_line
    << "       --dist=uniform,min:<double>,max:<double>,scale:<integer>"  << end_of_line
    << "       --dist=gaussian,mean:<double>,stddev:<double>,scale:<integer>"  << end_of_line
    << "       --dist=sequential,start:<double>,delta:<double>,scale:<integer>"  << end_of_line
    << "       --dist=identity\n\n"

    << "  --seed=<int>                                 "
    << "    Random number generator seed. Used to enforce deterministic" << end_of_line
    << "      initialization.\n\n";

}

/// Currently writes nothing: no initialization options are reported.
void Options::Initialization::print_options(std::ostream &out, int indent) const {
  // Both parameters are intentionally unused.
  (void)out;
  (void)indent;
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Reads the library section of the command line.
///
/// --library-algo-mode=<mode> selects the algorithm mode (kDefault when absent).
/// --library-algos=<list> supplies explicit algorithm indices and forces the mode to kBest.
/// Tokens containing ':' (ranges) are not yet supported and are skipped, as are tokens that
/// do not parse as an integer.
Options::Library::Library(cutlass::CommandLine const &cmdline) {

  algorithm_mode = AlgorithmMode::kDefault;

  if (cmdline.check_cmd_line_flag("library-algo-mode")) {
    std::string mode = "default";
    cmdline.get_cmd_line_argument("library-algo-mode", mode);
    algorithm_mode = from_string<AlgorithmMode>(mode);
  }

  if (cmdline.check_cmd_line_flag("library-algos")) {

    // If algorithms are specified, override as kBest.
    algorithm_mode = AlgorithmMode::kBest;

    std::vector<std::string> tokens;
    cmdline.get_cmd_line_arguments("library-algos", tokens);

    algorithms.reserve(tokens.size());

    for (auto const & token : tokens) {
      // std::string::find() returns npos (a non-zero value) when nothing is found, so the
      // result must be compared against npos rather than used directly as a condition.
      if (token.find(':') != std::string::npos) {
        // todo - tokenized range
        continue;
      }

      int algo = 0;
      std::stringstream ss;

      ss << token;

      // Only record tokens that actually parse as an integer.
      if (ss >> algo) {
        algorithms.push_back(algo);
      }
    }
  }
}

/// Writes the help text for the library options to `out`.
void Options::Library::print_usage(std::ostream &out) const {

  out << "Library:\n"

    << "  --library-algo-mode=<mode>                   "
    << "    Indicates algorithm mode used to call libraries such as cuBLAS and cuDNN.\n"
    << "                                               "
    << "    mode={default*,matching,best}\n\n"

    // Refer to the mode flag by the name that is actually parsed ("library-algo-mode").
    << "  --library-algos=<range-list>                 "
    << "    If --library-algo-mode=best, permits specifying a selection of algorithms.\n\n";

}

/// Writes the algorithm mode and the comma-separated list of selected algorithm indices to
/// `out`, followed by a blank line.
void Options::Library::print_options(std::ostream &out, int indent) const {

  out << indent_str(indent) << "library-algo-mode: " << to_string(algorithm_mode) << "\n";
  out << indent_str(indent) << "library-algos: ";

  // The separator is empty before the first element and a comma before every later one.
  char const *separator = "";
  for (int algo : algorithms) {
    out << separator << algo;
    separator = ",";
  }

  out << "\n\n";
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Reads the profiling section of the command line: workspace count, warmup and profiling
/// iteration counts, sleep duration, the enable switch, and the list of providers to profile.
Options::Profiling::Profiling(cutlass::CommandLine const &cmdline) {

  cmdline.get_cmd_line_argument("workspace-count", workspace_count, 0);
  cmdline.get_cmd_line_argument("warmup-iterations", warmup_iterations, 10);
  cmdline.get_cmd_line_argument("profiling-iterations", iterations, 100);
  cmdline.get_cmd_line_argument("sleep-duration", sleep_duration, 50);
  cmdline.get_cmd_line_argument("profiling-enabled", enabled, true);

  if (!cmdline.check_cmd_line_flag("providers")) {
    // Default provider set when --providers is not given.
    providers.push_back(library::Provider::kCUTLASS);
    providers.push_back(library::Provider::kCUBLAS);
    providers.push_back(library::Provider::kCUDNN);
    return;
  }

  std::vector<std::string> provider_names;
  cmdline.get_cmd_line_arguments("providers", provider_names);

  // Every token is converted and stored; no filtering of the conversion result is done here.
  providers.clear();
  for (auto const &name : provider_names) {
    providers.push_back(library::from_string<library::Provider>(name));
  }
}

/// Writes the help text for the profiling options to `out`.
void Options::Profiling::print_usage(std::ostream &out) const {

  out << "Profiling:\n";

  out << "  --workspace-count=<workspace count>          "
      << "    Number of discrete workspaces maintained to avoid cache-resident " << end_of_line
      << "    If zero (default), the amount is chosen for each workload based on " << end_of_line
      << "    capacity of the last-level cache.\n\n";

  out << "  --profiling-iterations=<iterations>          "
      << "    Number of iterations to profile each kernel. If zero, kernels" << end_of_line
      << "      are launched up to the profiling duration.\n\n";

  out << "  --warmup-iterations=<iterations>             "
      << "    Number of iterations to execute each kernel prior to profiling.\n\n";

  out << "  --sleep-duration=<duration>                  "
      << "    Number of ms to sleep between profiling periods (ms).\n\n";

  out << "  --profiling-enabled=<bool>                   "
      << "    If true, profiling is actually conducted.\n\n";

  out << "  --providers=<providers>                      "
      << "    List of providers to be profiled for performance. (default: '*')" << end_of_line
      << "      Gemm providers {cutlass*, cublas*}" << end_of_line
      << "      Conv2d providers {cutlass*, cudnn*}"
      << "\n\n";
}

/// Writes the profiling iteration count, sleep duration, enable switch, and the bracketed,
/// comma-separated list of providers to `out`.
void Options::Profiling::print_options(std::ostream &out, int indent) const {

  std::string const padding = indent_str(indent);

  out << padding << "profiling_iterations: " << iterations << "\n";
  out << padding << "sleep_duration: " << sleep_duration << "\n";
  out << padding << "profiling_enabled: " << enabled << "\n";
  out << padding << "providers: [";

  // The separator is empty before the first element and ", " before every later one.
  char const *separator = "";
  for (auto const & provider : providers) {
    out << separator << library::to_string(provider);
    separator = ", ";
  }

  out << "]\n";
}

/// Returns true if `provider` appears in the list of providers to be profiled.
bool Options::Profiling::provider_enabled(library::Provider provider) const {
  for (auto const & candidate : providers) {
    if (candidate == provider) {
      return true;
    }
  }
  return false;
}

/// Returns the position of the first occurrence of `provider` in the list of providers to be
/// profiled. If the provider is not in the list, the number of listed providers is returned.
size_t Options::Profiling::index(library::Provider provider) const {
  size_t position = 0;
  auto it = providers.begin();

  while (it != providers.end() && !(*it == provider)) {
    ++it;
    ++position;
  }

  return position;
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Reads the verification section of the command line: the enable switch, error threshold,
/// non-zero floor, workspace saving policy, and the list of verification providers.
Options::Verification::Verification(cutlass::CommandLine const &cmdline) {

  cmdline.get_cmd_line_argument("verification-enabled", enabled, true);
  cmdline.get_cmd_line_argument("epsilon", epsilon, 0.05);
  cmdline.get_cmd_line_argument("nonzero-floor", nonzero_floor, 1.0 / 256.0);

  // Workspaces are never saved unless --save-workspace says otherwise.
  save_workspace = SaveWorkspace::kNever;
  if (cmdline.check_cmd_line_flag("save-workspace")) {
    std::string policy;
    cmdline.get_cmd_line_argument("save-workspace", policy);
    save_workspace = from_string<SaveWorkspace>(policy);
  }

  if (!cmdline.check_cmd_line_flag("verification-providers")) {
    // Default provider set when --verification-providers is not given.
    providers.push_back(library::Provider::kCUBLAS);
    providers.push_back(library::Provider::kReferenceDevice);
    providers.push_back(library::Provider::kCUDNN);
    return;
  }

  std::vector<std::string> provider_names;
  cmdline.get_cmd_line_arguments("verification-providers", provider_names);

  providers.clear();

  // Tokens that do not convert to a valid provider are dropped.
  for (auto const &name : provider_names) {
    library::Provider const candidate = library::from_string<library::Provider>(name);
    if (candidate == library::Provider::kInvalid) {
      continue;
    }
    providers.push_back(candidate);
  }
}

void Options::Verification::print_usage(std::ostream &out) const {

  out << "Verification:\n"

    << "  --verification-enabled=<bool>                "
    << "    Whether to perform verification checks.\n\n"

    << "  --epsilon=<error>                            "
    << "    Error threshold. Setting to zero (default) requires" << end_of_line
    << "      bit-level equivalence.\n\n"

    << "  --nonzero-floor=<floor>                      "
    << "    Results whose absolute value is less than this quantity" << end_of_line
    << "      are treated as zero for comparisons.\n\n"

    << "  --save-workspace=<string>                    "
    << "    Specifies when to save the GEMM inputs and results to the filesystem." << end_of_line
    << "       --save-workspace=never      never save workspace (default)" << end_of_line
    << "       --save-workspace=incorrect  save workspace for incorrect results" << end_of_line
    << "       --save-workspace=always     always save workspace\n\n"

    << "  --verification-providers=<providers>         "
    << "    List of providers used to verify result. (default: '*')" << end_of_line
    << "      Gemm verification-providers {cublas*}" << end_of_line
    << "      Conv2d verification-providers {cudnn*, device*, host}"
    << "\n\n";
}

/// Writes the verification enable switch, error threshold, workspace saving policy, and the
/// bracketed, comma-separated list of verification providers to `out`.
void Options::Verification::print_options(std::ostream &out, int indent) const {

  std::string const padding = indent_str(indent);

  out << padding << "verification_enabled: " << enabled << "\n";
  out << padding << "epsilon: " << epsilon << "\n";
  out << padding << "save_workspace: " << to_string(save_workspace) << "\n";
  out << padding << "verification_providers: [";

  // The separator is empty before the first element and ", " before every later one.
  char const *separator = "";
  for (auto const & provider : providers) {
    out << separator << library::to_string(provider);
    separator = ", ";
  }

  out << "]\n";
}

/// Returns true if `provider` appears in the list of verification providers.
bool Options::Verification::provider_enabled(library::Provider provider) const {
  for (auto const & candidate : providers) {
    if (candidate == provider) {
      return true;
    }
  }
  return false;
}

/// Returns the position of the first occurrence of `provider` in the list of verification
/// providers. If the provider is not in the list, the number of listed providers is returned.
size_t Options::Verification::index(library::Provider provider) const {
  size_t position = 0;
  auto it = providers.begin();

  while (it != providers.end() && !(*it == provider)) {
    ++it;
    ++position;
  }

  return position;
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Reads the report section of the command line: output paths, append mode, pivot tags,
/// whether kernels that were not run are reported, and verbosity.
Options::Report::Report(cutlass::CommandLine const &cmdline) {

  // Boolean switches and their defaults.
  cmdline.get_cmd_line_argument("append", append, false);
  cmdline.get_cmd_line_argument("report-not-run", report_not_run, false);
  cmdline.get_cmd_line_argument("verbose", verbose, true);

  // Output destinations.
  cmdline.get_cmd_line_argument("output", output_path);
  cmdline.get_cmd_line_argument("junit-output", junit_output_path);

  // Pivot tags are only read when --tags is present.
  bool const has_tags = cmdline.check_cmd_line_flag("tags");
  if (has_tags) {
    cmdline.get_cmd_line_argument_pairs("tags", pivot_tags);
  }
}

/// Writes the help text for the report options to `out`.
void Options::Report::print_usage(std::ostream &out) const {

  out << "Report:\n";

  out << "  --append=<bool>                              "
      << "    If true, result is appended to possibly existing file. Otherwise, " << end_of_line
      << "      any existing file is overwritten.\n\n";

  out << "  --output=<path>                              "
      << "    Path to output file for machine readable results. Operation kind and '.csv' is appended.\n\n";

  out << "  --junit-output=<path>                        "
      << "    Path to junit output file for result reporting. Operation kind and '.junit.xml' is appended.\n\n";

  out << "  --report-not-run=<bool>                      "
      << "    If true, reports the status of all kernels including those that" << end_of_line
      << "      do not satisfy the given arguments.\n\n";

  out << "  --tags=<column:tag,...>                      "
      << "    Inserts leading columns in output table and uniform values for each" << end_of_line
      << "      column. Useful for generating pivot tables.\n\n";

  out << "  --verbose=<bool>                             "
      << "    Prints human-readable text to stdout. If false, nothing is written to stdout.\n\n";
}

/// Writes the report options to `out`: append mode, output paths, the report-not-run switch,
/// each pivot tag on its own line one indent level deeper, and finally the verbosity switch.
void Options::Report::print_options(std::ostream &out, int indent) const {

  std::string const padding = indent_str(indent);
  std::string const tag_padding = indent_str(indent + 1);

  out << padding << "append: " << append << "\n";
  out << padding << "output: " << output_path << "\n";
  out << padding << "junit-output: " << junit_output_path << "\n";
  out << padding << "report_not_run: " << report_not_run << "\n";
  out << padding << "tags:\n";

  for (auto const & tag : pivot_tags) {
    out << tag_padding << tag.first << ": " << tag.second << "\n";
  }

  out << padding << "verbose: " << verbose << "\n";
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Records which of the informational flags --help, --version, and --device-info are present
/// on the command line.
Options::About::About(cutlass::CommandLine const &cmdline) {
  bool const wants_help = cmdline.check_cmd_line_flag("help");
  bool const wants_version = cmdline.check_cmd_line_flag("version");
  bool const wants_device_info = cmdline.check_cmd_line_flag("device-info");

  help = wants_help;
  version = wants_version;
  device_info = wants_device_info;
}

/// Writes the "About" help section to `out`: the --version flag followed by the version
/// string produced by print_version().
void Options::About::print_usage(std::ostream &out) const {

  out << "About:\n";
  out << "  --version                                        ";

  print_version(out);

  out << "\n";
}

/// Writes the CUTLASS version string, the build date and time, and - when a git revision is
/// available - the commit it was built from. No trailing newline is written.
void Options::About::print_version(std::ostream &out) {
  out << "CUTLASS " << cutlass::getVersionString();
  out << " built on " << __DATE__ << " at " << __TIME__;

  // The commit suffix is omitted when no git revision is available.
  auto const revision = cutlass::getGitRevision();
  if (!revision.empty()) {
    out << " with commit " << revision;
  }
}

/// Currently writes nothing: the About section has no options to report.
void Options::About::print_options(std::ostream &out, int indent) const {
  // Both parameters are intentionally unused.
  (void)out;
  (void)indent;
}

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Builds the complete option set from the command line: constructs every option section,
/// then reads the execution mode, the operation kind, and the kernel name filters.
///
/// In trace mode, initialization is switched to the host reference provider and both
/// verification and profiling are disabled.
Options::Options(cutlass::CommandLine const &cmdline):
  cmdline(cmdline),
  device(cmdline),
  initialization(cmdline),
  library(cmdline),
  profiling(cmdline),
  verification(cmdline),
  report(cmdline),
  about(cmdline) {

  // Execution mode: profile unless --mode says otherwise.
  execution_mode = ExecutionMode::kProfile;
  if (cmdline.check_cmd_line_flag("mode")) {
    std::string mode_name;
    cmdline.get_cmd_line_argument("mode", mode_name);
    execution_mode = from_string<ExecutionMode>(mode_name);
  }

  // Enumerating kernels is equivalent to a dry run.
  if (execution_mode == ExecutionMode::kEnumerate) {
    execution_mode = ExecutionMode::kDryRun;
  }

  // Operation kind: --operation takes precedence over --function; invalid if neither is given.
  operation_kind = library::OperationKind::kInvalid;
  char const *kind_flags[] = {"operation", "function"};
  for (char const *flag : kind_flags) {
    if (cmdline.check_cmd_line_flag(flag)) {
      std::string kind_name;
      cmdline.get_cmd_line_argument(flag, kind_name);
      operation_kind = library::from_string<library::OperationKind>(kind_name);
      break;
    }
  }

  // Kernel name filters: --operation_names takes precedence over --kernels.
  char const *name_flags[] = {"operation_names", "kernels"};
  for (char const *flag : name_flags) {
    if (cmdline.check_cmd_line_flag(flag)) {
      cmdline.get_cmd_line_arguments(flag, operation_names);
      break;
    }
  }

  if (cmdline.check_cmd_line_flag("ignore-kernels")) {
    cmdline.get_cmd_line_arguments("ignore-kernels", excluded_operation_names);
  }

  // Prevent launches on the device for anything other than CUTLASS operation
  bool const tracing = (execution_mode == ExecutionMode::kTrace);
  if (tracing) {
    initialization.provider = library::Provider::kReferenceHost;
    verification.enabled = false;
    profiling.enabled = false;
  }
}

/// Writes the complete help text to `out`: the top-level options first, then the usage text
/// of each option section (device, initialization, library, profiling, verification, report,
/// about), each followed by a blank line.
void Options::print_usage(std::ostream &out) const {

  out
    << "CUTLASS Profiler\n"
    << "usage:\n\n"
    << "    cutlass_profiler [options]\n\n"
    << "  --help\n\n"

    << "  --mode=<string>                              "
    << "    Cutlass profiler execution mode." << end_of_line
    << "       --mode=profile    regular verification and profiling (default)" << end_of_line
    << "       --mode=dry_run    no kernels are launched or workspaces allocated" << end_of_line
    << "       --mode=enumerate  lists all operation kind and operations" << end_of_line
    << "       --mode=trace      executes a single device-side computation with" << end_of_line
    << "                          no other kernel launches\n\n"

    << "  --device-info                                "
    << "    Prints information on all GPUs present in the system\n\n"

    << "  --operation=<operation_kind>                 "
    << "    CUTLASS operation to profile.\n\n"

    // The description names the same substrings as the example pattern ("s884", not "s844").
    << "  --kernels=<string_list>                      "
    << "    Filter operations by kernel names. For example, call all kernels with" << end_of_line
    << "      (\"s1688\" and \"nt\") or (\"s884\" and \"tn\" and \"align8\") in their" << end_of_line
    << "      operation name using --kernels=\"s1688*nt, s884*tn*align8\"\n\n"

    << "  --ignore-kernels=<string_list>               "
    << "    Excludes kernels whose names match anything in this list.\n\n"
    ;

  //
  // Detailed options
  //

  device.print_usage(out);
  out << "\n";

  initialization.print_usage(out);
  out << "\n";

  library.print_usage(out);
  out << "\n";

  profiling.print_usage(out);
  out << "\n";

  verification.print_usage(out);
  out << "\n";

  report.print_usage(out);
  out << "\n";

  about.print_usage(out);
  out << "\n";
}

/// Writes the parsed options to `out` as an indented listing: the help flag and execution
/// mode, followed by the device, initialization, profiling, verification, and report
/// sections, each printed at indent level 2 beneath its heading.
void Options::print_options(std::ostream &out) const {

  int const section_indent = 2;

  out << "options:\n";
  out << "  help: " << about.help << "\n";
  out << "  mode: " << to_string(execution_mode) << "\n";

  out << "  device:\n";
  device.print_options(out, section_indent);

  out << "  initialization:\n";
  initialization.print_options(out, section_indent);

  out << "  profiling:\n";
  profiling.print_options(out, section_indent);

  out << "  verification:\n";
  verification.print_options(out, section_indent);

  out << "  report:\n";
  report.print_options(out, section_indent);
}

/// Returns a string of spaces for the given indent level: two spaces per level.
std::string Options::indent_str(int indent) {
  std::string padding(indent * 2, ' ');
  return padding;
}

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace profiler
} // namespace cutlass