cutlass/tools/test/perf/testbench_options.h

/***************************************************************************************************
 * Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

#pragma once

#include <cuda_runtime.h>
#include <cublas_v2.h>

#include <stdint.h>
#include <stdexcept>

#include "cutlass/cutlass.h"
#include "tools/util/command_line.h"
#include "tools/util/distribution.h"
#include "tools/test/perf/provider.h"

namespace perf {

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Range of problem sizes
/// Range of problem sizes
///
/// Stores a first value, an end value and the step (an amount plus an operator) used to advance
/// from one value to the next.
struct Range {

  /// How `increment` is combined with the current value to obtain the next one
  enum Operator {
    Add,       ///< next = current + increment
    Multiply   ///< next = current * increment
  };

  //
  // Data members
  //

  /// First value of the range
  int start;

  /// End value of the range
  int end;

  /// Step applied by next()
  int increment;

  /// Operator used to apply `increment`
  Operator increment_op;

  //
  // Methods
  //

  /// Constructs a range holding the single value `_start`, with an additive step of 1
  Range(int _start = 0) : start(_start), end(_start), increment(1), increment_op(Add) {}

  /// Constructs a range from explicit bounds, step and step operator
  Range(int _start, int _end, int _increment = 1, Operator _op = Add)
      : start(_start), end(_end), increment(_increment), increment_op(_op) {}

  /// Returns the next item in series
  int next(int val) const {
    switch (increment_op) {
      case Add: val += increment; break;
      case Multiply: val *= increment; break;
      // An unrecognized operator jumps straight to the end value.
      default: val = end; break;
    }
    return val;
  }

  /// Updates the range from up to three string tokens: start, end and step.
  ///
  ///   values[0]  start (left unchanged when absent)
  ///   values[1]  end; when absent, end is set equal to start
  ///   values[2]  step, optionally prefixed with '+' (additive) or '*' (multiplicative); a step
  ///              without a prefix keeps the current operator
  ///
  /// The step and its operator are committed together, and only when the numeric part of the
  /// token parses as an integer. A token such as "*" therefore leaves both untouched rather than
  /// selecting an operator with an unparsed step, which would make next() stop advancing.
  void import_from_strings(const std::vector<std::string>& values) {
    if (values.size() > 0) {
      std::stringstream ss;
      ss << values.at(0);
      ss >> start;
    }

    if (values.size() > 1) {
      std::stringstream ss;
      ss << values.at(1);
      ss >> end;
    } else {
      end = start;
    }

    if (values.size() > 2 && !values.at(2).empty()) {
      std::string const &token = values.at(2);

      char const first = token.at(0);
      bool const has_prefix = (first == '*' || first == '+');

      std::stringstream ss;
      ss << (has_prefix ? token.substr(1) : token);

      // Parse into a temporary so that a malformed token cannot clobber the current step.
      int parsed = 0;
      if (ss >> parsed) {
        increment = parsed;
        if (has_prefix) {
          increment_op = (first == '*') ? Multiply : Add;
        }
      }
    }
  }
};

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Defines a space of problem sizes
struct GemmProblemRange {
 public:
  /// Sizes to cover along the GEMM M dimension
  Range M;

  /// Sizes to cover along the GEMM N dimension
  Range N;

  /// Sizes to cover along the GEMM K dimension
  Range K;

  //
  // Methods
  //

  /// Builds a problem space from explicit per-dimension ranges; each defaults to Range(256)
  GemmProblemRange(Range _M = Range(256), Range _N = Range(256), Range _K = Range(256))
      : M(_M), N(_N), K(_K) {}

  /// Reads the command line argument named `arg` into `range`.
  ///
  /// When the flag is present its value is split on ':' and handed to
  /// Range::import_from_strings(); otherwise `range` receives `_default`.
  static void get_range(Range &range,
                        cutlass::CommandLine const &args,
                        std::string const &arg,
                        Range const &_default = Range(256)) {
    char const *flag = arg.c_str();

    // Start from a known state so tokens missing from the argument do not inherit stale values.
    range = Range(0, 0, 1);

    if (!args.check_cmd_line_flag(flag)) {
      range = _default;
      return;
    }

    std::vector<std::string> tokens;
    args.get_cmd_line_arguments(flag, tokens, ':');
    range.import_from_strings(tokens);
  }

  /// Builds the problem space from the "m", "n" and "k" command line arguments, falling back to
  /// 10240, 4096 and 4096 respectively when an argument is not given
  GemmProblemRange(cutlass::CommandLine const &args) {
    get_range(M, args, "m", Range(10240));
    get_range(N, args, "n", Range(4096));
    get_range(K, args, "k", Range(4096));
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Ordered list of (key, value) string pairs. In this file it receives the output of
/// cutlass::CommandLine::get_cmd_line_argument_pairs().
typedef std::vector<std::pair<std::string, std::string> > KeyValueVector;

/// Read-only iterator over the entries of a KeyValueVector
typedef KeyValueVector::const_iterator KeyValueIterator;

/// Structure captures the initial configuration of matrices
struct InitialDistribution {
  /// Distribution of A matrix operand
  cutlass::Distribution dist_A;

  /// Distribution of B matrix operand
  cutlass::Distribution dist_B;

  /// Distribution of C matrix operand
  cutlass::Distribution dist_C;

  /// Seed for random number generation
  int64_t seed;

  //
  // Static function members
  //

  /// Updates `dist` from the key-value pairs given for command line argument `arg`.
  ///
  /// The key of the first pair names the distribution kind ("uniform", "gaussian", "linear" or
  /// "identity"); an unrecognized name leaves `dist.kind` unchanged. Each following pair either
  /// sets the integer scale ("scale" with a value, or "integer" for a scale of 0) or assigns the
  /// named numeric field ("min", "max", "mean", "stddev", "offset", "delta_row", "delta_column").
  /// Unknown keys, and keys given without a value, are ignored. When no pairs are returned for
  /// `arg`, `dist` is left untouched.
  static void get_distribution(cutlass::CommandLine const &args,
                               std::string const &arg,
                               cutlass::Distribution &dist) {
    // Recognized distribution names; the table ends at the null label.
    struct {
      const char *label;
      cutlass::Distribution::Kind kind;
    } distribution_kinds[] = {{"uniform", cutlass::Distribution::Uniform},
                              {"gaussian", cutlass::Distribution::Gaussian},
                              {"linear", cutlass::Distribution::Linear},
                              {"identity", cutlass::Distribution::Identity},
                              {0, cutlass::Distribution::Invalid}};

    // Named numeric fields of `dist` that may be overridden; the table ends at the null label.
    struct {
      char const *label;
      double *member;
    } members[] = {{"min", &dist.uniform.min},
                   {"max", &dist.uniform.max},
                   {"mean", &dist.gaussian.mean},
                   {"stddev", &dist.gaussian.stddev},
                   {"offset", &dist.linear.offset},
                   {"delta_row", &dist.linear.delta_row},
                   {"delta_column", &dist.linear.delta_column},
                   {0, 0}};

    KeyValueVector values;
    args.get_cmd_line_argument_pairs(arg.c_str(), values);

    // The parser expects the first token to be a string identifying the distribution type.
    KeyValueIterator it = values.begin();
    if (it != values.end()) {
      for (int i = 0; distribution_kinds[i].label; ++i) {
        if (it->first == distribution_kinds[i].label) {
          dist.kind = distribution_kinds[i].kind;
          break;
        }
      }
      ++it;
    }

    // Subsequent key-value pairs update the named field of the distribution struct.
    for (; it != values.end(); ++it) {
      // Integer scaling factor - if < 0, no integer rounding is performed.
      if (it->first == "scale" && !it->second.empty()) {
        std::stringstream ss;
        ss << it->second;
        ss >> dist.int_scale;
        continue;  // next token
      }

      // Casts as integer without scaling
      if (it->first == "integer") {
        dist.int_scale = 0;
        continue;  // next token
      }

      // Initialize other members. Labels in the table are unique, so the scan stops at the first
      // match whether or not a value accompanied the key.
      for (int m = 0; members[m].label; ++m) {
        if (it->first == members[m].label) {
          if (!it->second.empty()) {
            std::stringstream ss;
            ss << it->second;
            ss >> *(members[m].member);
          }
          break;
        }
      }
    }
  }

  //
  // Methods
  //

  /// Basic uniform random distribution: all three operands are uniform over [-4, 4]
  InitialDistribution(int64_t _seed = 700) : seed(_seed) {
    dist_A.set_uniform(-4, 4);
    dist_B.set_uniform(-4, 4);
    dist_C.set_uniform(-4, 4);
  }

  /// Extracts initial distribution from command line arguments.
  ///
  /// Starts from the same defaults as the seed-only constructor (seed 700, uniform [-4, 4]), then
  /// applies "seed", then "dist" (to all three operands at once), and finally the per-operand
  /// "dist_A", "dist_B" and "dist_C" arguments, which override what "dist" set.
  InitialDistribution(cutlass::CommandLine const &args) {
    // Set initial values
    seed = 700;
    dist_A.set_uniform(-4, 4);
    dist_B.set_uniform(-4, 4);
    dist_C.set_uniform(-4, 4);

    // Update with command line arguments
    args.get_cmd_line_argument("seed", seed, seed);

    // Update all distributions at once
    cutlass::Distribution dist_all;
    if (args.check_cmd_line_flag("dist")) {
      get_distribution(args, "dist", dist_all);
      dist_A = dist_all;
      dist_B = dist_all;
      dist_C = dist_all;
    }

    // Per-operand overrides
    get_distribution(args, "dist_A", dist_A);
    get_distribution(args, "dist_B", dist_B);
    get_distribution(args, "dist_C", dist_C);
  }
};

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Defines how to execute the benchmarks
struct ExecutionMode {
  enum Kind { Profile, Verify, Single, Invalid };

  /// Returns the lowercase name of `kind`; anything other than Profile, Verify or Single
  /// yields "invalid"
  static std::string to_string(Kind kind) {
    if (kind == Profile) {
      return "profile";
    }
    if (kind == Verify) {
      return "verify";
    }
    if (kind == Single) {
      return "single";
    }
    return "invalid";
  }

  /// Parses a mode name; strings other than "verify" and "single" (including "profile" itself
  /// and any unrecognized text) select Profile
  static Kind from_string(std::string const &str) {
    Kind parsed = Profile;
    if (str == "verify") {
      parsed = Verify;
    } else if (str == "single") {
      parsed = Single;
    }
    return parsed;
  }
};

/// Indicates when the workspace is saved
struct WorkspaceSaveMode {
  enum Kind { Never, Incorrect, Always };

  /// Returns the lowercase name of `kind`; any value other than Never or Always
  /// yields "incorrect"
  static std::string to_string(Kind kind) {
    if (kind == Never) {
      return "never";
    }
    if (kind == Always) {
      return "always";
    }
    return "incorrect";
  }

  /// Parses a save-mode name; strings other than "never" and "always" (including "incorrect"
  /// itself and any unrecognized text) select Incorrect
  static Kind from_string(std::string const &str) {
    Kind parsed = Incorrect;
    if (str == "never") {
      parsed = Never;
    } else if (str == "always") {
      parsed = Always;
    }
    return parsed;
  }
};

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Class holding testbench command line options
struct TestbenchOptions {
  //
  // Data members
  //

  /// Describes the random initial state of the input matrices
  InitialDistribution initial_distribution;

  /// Path to output file name
  std::string output_filename;

  /// Path to input file name
  std::string threshold_filename;

  /// If true, output is appended
  bool append;

  /// Number of iterations
  int iterations;

  /// Defines how to run the benchmark
  ExecutionMode::Kind execution_mode;

  /// Indicates when the workspace is saved
  WorkspaceSaveMode::Kind save_workspace_mode;

  /// Properties of CUDA device
  cudaDeviceProp device_properties;

  /// Enabled kernel names
  std::vector<std::string> kernels;

  /// Scalar value for GEMM
  double alpha;

  /// Scalar value for GEMM
  double beta;

  /// Range of problem sizes
  GemmProblemRange problem_range;

  /// If true, kernels are not executed, and no sleep waits are inserted
  bool dry_run;

  /// Tags to describe the profiler output
  KeyValueVector pivot_tags;

  /// If enabled, only the peak performance for a given kernel is reported
  bool peak_performance;

  /// Performance Degradation Margin before flagging as test failure
  double perf_margin;

  /// Cool-down period
  int sleep_time;

  //
  // Methods
  //

  /// Constructs the testbench options from command line arguments.
  ///
  /// Selects and queries the CUDA device first (see configure_cuda_device(), which throws
  /// std::runtime_error on failure), then reads the remaining options. When "--kernels" is not
  /// given, every built-in GEMM kernel/layout combination is enabled.
  TestbenchOptions(cutlass::CommandLine const &args)
      : initial_distribution(args),
        // Scalars are given their documented defaults up front so they hold defined values
        // regardless of how the command line parser treats an absent argument.
        append(false),
        iterations(25),
        execution_mode(ExecutionMode::Profile),
        save_workspace_mode(WorkspaceSaveMode::Never),
        alpha(1.0),
        beta(0.0),
        problem_range(args),
        dry_run(false),
        peak_performance(false),
        perf_margin(0.97),
        sleep_time(1) {

    // Set the CUDA device and/or specify clock rate
    configure_cuda_device(args);

    // fetch command line arguments
    args.get_cmd_line_argument("iterations", iterations, 25);
    args.get_cmd_line_argument("append", append, false);
    args.get_cmd_line_argument("output", output_filename);
    args.get_cmd_line_argument("threshold", threshold_filename);
    args.get_cmd_line_argument("alpha", alpha, 1.0);
    args.get_cmd_line_argument("beta", beta, 0.0);
    args.get_cmd_line_argument("peak", peak_performance, false);
    args.get_cmd_line_argument_pairs("tags", pivot_tags);
    args.get_cmd_line_argument("perf-margin", perf_margin, 0.97);
    args.get_cmd_line_argument("dry-run", dry_run, false);
    args.get_cmd_line_argument("sleep-time", sleep_time, 1);

    // The enum-valued options keep their defaults unless the flag is present.
    if (args.check_cmd_line_flag("execution-mode")) {
      std::string str;
      args.get_cmd_line_argument("execution-mode", str);
      execution_mode = ExecutionMode::from_string(str);
    }

    if (args.check_cmd_line_flag("save-workspace")) {
      std::string str;
      args.get_cmd_line_argument("save-workspace", str);
      save_workspace_mode = WorkspaceSaveMode::from_string(str);
    }

    // query for enabled kernels or enable all of them
    if (args.check_cmd_line_flag("kernels")) {
      args.get_cmd_line_arguments("kernels", kernels, ',');
    } else {
      char const *gemms[] = {
        "sgemm",
        "dgemm",
        "hgemm",
        "igemm",
        "wmma_gemm",
        "wmma_gemm_f16",
        "wmma_binary_gemm",
        "wmma_integer_gemm",
        0
      };
      char const *layouts[] = {"nn", "nt", "tn", "tt", 0};

      for (int i = 0; gemms[i]; ++i) {
        std::string const gemm(gemms[i]);

        // The binary and integer WMMA GEMMs are only enabled for the "tn" layout.
        bool const tn_only = (gemm == "wmma_binary_gemm" || gemm == "wmma_integer_gemm");

        for (int j = 0; layouts[j]; ++j) {
          std::string const layout(layouts[j]);
          if (tn_only && layout != "tn") {
            continue;
          }
          kernels.push_back(gemm + "_" + layout);
        }
      }
    }
  }

  /// Queries the properties of the device selected with "--device" (default 0) into
  /// `device_properties` and makes that device current.
  ///
  /// If "--clock" is given, the value (in MHz) replaces the reported clock rate; it is multiplied
  /// by 1000 before being stored in `device_properties.clockRate`.
  ///
  /// Throws std::runtime_error if either CUDA call fails.
  void configure_cuda_device(cutlass::CommandLine const &args) {
    int device_id = 0;
    args.get_cmd_line_argument("device", device_id, 0);

    cudaError_t result;
    result = cudaGetDeviceProperties(&device_properties, device_id);
    if (result != cudaSuccess) {
      throw std::runtime_error("cudaGetDeviceProperties() failed for given device.");
    }
    result = cudaSetDevice(device_id);
    if (result != cudaSuccess) {
      throw std::runtime_error("cudaSetDevice() failed for given device.");
    }

    // Get the clock rate (specified in cmd line in MHz)
    if (args.check_cmd_line_flag("clock")) {
      args.get_cmd_line_argument("clock", device_properties.clockRate);
      device_properties.clockRate *= 1000;
    }
  }

  /// Returns true if any enabled kernel name occurs as a substring of `kernel`
  bool kernel_enabled(std::string const &kernel) const {
    typedef std::vector<std::string>::const_iterator kernel_iterator;

    for (kernel_iterator it = kernels.begin(); it != kernels.end(); ++it) {
      if (kernel.find(*it) != std::string::npos) {
        return true;
      }
    }

    return false;
  }

  /// Given the disposition of a GEMM problem, returns true if the results should
  /// be saved to the file system: always in Always mode, and only for incorrect
  /// results in Incorrect mode.
  bool save_workspace(bool correct) const {
    return save_workspace_mode == WorkspaceSaveMode::Always ||
           (save_workspace_mode == WorkspaceSaveMode::Incorrect && !correct);
  }

  /// Returns true if the selected device can satisfy the given compute capability
  /// (device capability is greater than or equal to major.minor)
  bool compute_capability(int major, int minor) const {
    return (device_properties.major > major ||
      (device_properties.major == major && device_properties.minor >= minor));
  }

  /// Requires an exact match of compute capability
  bool compute_capability_exact(int major, int minor) const {
    return major == device_properties.major && minor == device_properties.minor;
  }

  /// Prints the CUTLASS version and the build date/time; no trailing newline is written
  static void version(std::ostream &out) {
    out << "CUTLASS " << CUTLASS_MAJOR << "." << CUTLASS_MINOR << "." << CUTLASS_PATCH
        << " built on " << __DATE__ << " at " << __TIME__;
  }

  /// Prints the usage statement
  static void usage(std::ostream &out) {
    out << "cutlass_perf_test [options]\n\n"

        << "  --help\n"

        << "  --append=<true|false*>                        "
        << "    If true, appends output to existing CSV file. If false, overwrites.\n"

        << "  --alpha=<alpha>                               "
        << "    Value for alpha to be used in GEMM experiments\n"

        << "  --beta=<beta>                                 "
        << "    Value for beta to be used in GEMM experiments\n"

        << "  --device=<int>                                "
        << "    Specifies the CUDA device to use. Default is device 0.\n"

        << "  --clock=<MHz>                                 "
        << "    Specifies the SM clock rate in MHz.\n"

        // Spelled with an underscore to match the argument names InitialDistribution parses.
        << "  --dist_{A,B,C}=<distribution>                 "
        << "    Describes the random distribution of each of the input matrix operands.\n"

        << "  --dry-run=<bool>                              "
        << "    If true, kernels are not executed and sleep is not inserted.\n"

        << "  --execution-mode=<mode>                       "
        << "    Specifies execution mode: profile, verify, single\n"

        << "  --output=<filename.csv>                       "
        << "    Writes summary of profiling to specified .csv file\n"

        << "  --threshold=<filename.csv>                    "
        << "    Reads previous output summary and re-executes the same configurations.\n"

        << "  --iterations=<timing iterations>              "
        << "    maximum number of iterations to execute when profiling\n"

        << "  --m=<height>[:max height[:step]]              "
        << "    Height of GEMM problem (number of rows of C). May specify a range with optional "
           "step size.\n"

        << "  --n=<width>[:max width[:step]]                "
        << "    Width of GEMM problem (number of columns of C). May specify a range with optional "
           "step size.\n"

        << "  --k=<depth>[:max depth[:step]]                "
        << "    Size of inner dimension of A and B. May specify a range with optional step size.\n"

        << "  --kernels=<{s|d|h|i|wmma_|wmma_binary_|wmma_integer_}gemm_{nn,nt,tn,tt}>\n"
        << "                                                "
        << "    Select GEMM datatype and layout to use for tests\n"

        << "  --peak=<bool>                                 "
        << "    If true, only reports peak performance per kernel after profiling specified "
           "problem space.\n"

        << "  --perf-margin=<perf-margin>                   "
        << "    Allowable performance degradation before flagging test as failure (e.g. 3% slowdown"
           " = 0.97).\n"

        << "  --save-workspace={*never,incorrect,always}    "
        << "    Specifies when to save the GEMM inputs and results to the filesystem.\n"

        << "  --seed=<seed>                                 "
        << "    Random seed used by the random number generator in initializing input matrices.\n"

        << "  --tags=<column:tag,...>                       "
        << "    Inserts leading columns in output table and uniform values for each column. Useful "
           "for generating pivot tables.\n"

        << "  --sleep-time=<second>                         "
        << "    Sleep period between profiling kernels to cool down the device.\n"

        << "  --version                                     "
        << "    ";

    version(out);

    out << "\n\n";

    out << "\n\n"
        << "Example usage:\n\n"

        << "# Runs one problem size for all kernels\n"
        << "./tools/test/perf/cutlass_perf_test --m=10240 --n=1024 --k=1024\n\n"

        << "# Varies GEMM K dimension for SGEMM and IGEMM with column-major multiplicands\n"
        << "./tools/test/perf/cutlass_perf_test --m=10240 --n=4096 --k=1024:8192:128 "
           "--kernels=sgemm_nn,igemm_nn\n\n"

        << std::flush;
  }
};

///////////////////////////////////////////////////////////////////////////////////////////////////

}  // namespace perf