/***************************************************************************************************
 * Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

#pragma once

#include <stdint.h>
#include <tools/util/command_line.h>

namespace perf {

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Range of problem sizes
struct Range {
  int start;
  int end;
  int increment;

  Range(int _start = 0) : start(_start), end(_start), increment(1) {}

  Range(int _start, int _end, int _increment = 1)
      : start(_start), end(_end), increment(_increment) {}
};

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Defines a space of problem sizes
struct GemmProblemRange {
 public:
  /// Range of sizes in GEMM M dimension
  Range M;

  /// Range of sizes in GEMM N dimension
  Range N;

  /// Range of sizes in GEMM K dimension
  Range K;

  //
  // Methods
  //

  /// Constructor to define a space of probelm sizes
  GemmProblemRange(Range _M = Range(256), Range _N = Range(256), Range _K = Range(256))
      : M(_M), N(_N), K(_K) {}

  /// Parses a command line argument as a Range object
  static void get_range(Range &range,
                        cutlass::CommandLine const &args,
                        std::string const &arg,
                        Range const &_default = Range(256)) {
    range = Range(0, 0, 1);

    if (args.check_cmd_line_flag(arg.c_str())) {
      std::vector<std::string> values;
      args.get_cmd_line_arguments(arg.c_str(), values, ':');

      if (values.size() > 0) {
        std::stringstream ss;
        ss << values.at(0);
        ss >> range.start;
      }

      if (values.size() > 1) {
        std::stringstream ss;
        ss << values.at(1);
        ss >> range.end;
      } else {
        range.end = range.start;
      }

      if (values.size() > 2) {
        std::stringstream ss;
        ss << values.at(2);
        ss >> range.increment;
      }
    } else {
      range = _default;
    }
  }

  /// Initializes the GEMM problem size from command line arguments
  GemmProblemRange(cutlass::CommandLine const &args) {
    get_range(M, args, "m", Range(10240));
    get_range(N, args, "n", Range(4096));
    get_range(K, args, "k", Range(4096));
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Distribution type
struct Distribution {
  /// Variant types
  enum Kind { Invalid, Uniform, Gaussian, Linear, Identity };

  /// Distribution state
  union {
    /// Uniform distribution
    struct {
      double min;
      double max;
    } uniform;

    /// Gaussian distribution
    struct {
      double mean;
      double stddev;
    } gaussian;

    /// Elements are linear combination of row and column index
    struct {
      double offset;
      double delta_row;
      double delta_column;
    } linear;
  };

  /// Active variant kind
  Kind kind;

  /// Random values are cast to integer after scaling by this power of two
  int int_scale;

  //
  // Methods
  //

  Distribution() : kind(Invalid), int_scale(0) {}

  /// Configures distribution as uniform random
  Distribution &set_uniform(double _min, double _max, int _int_scale = 0) {
    kind = Uniform;
    uniform.min = _min;
    uniform.max = _max;
    int_scale = _int_scale;
    return *this;
  }

  /// Configures distribution as Gaussian distribution
  Distribution &set_gaussian(double _mean, double _stddev, int _int_scale = 0) {
    kind = Gaussian;
    gaussian.mean = _mean;
    gaussian.stddev = _stddev;
    int_scale = _int_scale;
    return *this;
  }


  /// Sets identity
  Distribution &set_identity() {
    kind = Identity;
    return *this;
  }
};

}  // namespace perf

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Prints a Distribution to ostream
inline std::ostream &operator<<(std::ostream &out, perf::Distribution const &dist) {
  switch (dist.kind) {
    case perf::Distribution::Uniform:
      out << "uniorm, min: " << dist.uniform.min << ", max: " << dist.uniform.max;
      break;
    case perf::Distribution::Gaussian:
      out << "gaussian, mean: " << dist.gaussian.mean << ", stddev: " << dist.gaussian.stddev;
      break;
    case perf::Distribution::Linear:
      out << "linear, mean: " << dist.linear.offset << ", delta_row: " << dist.linear.delta_row
          << ", delta_column: " << dist.linear.delta_column;
      break;
    case perf::Distribution::Identity:
      break;
    default:
      out << "unknown";
  }

  out << ", int_scale: " << dist.int_scale;

  return out;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

namespace perf {

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Defines a vector of string pairs
typedef std::vector<std::pair<std::string, std::string> > KeyValueVector;

/// Defines a const iterator to a KeyValueVector
typedef KeyValueVector::const_iterator KeyValueIterator;

/// Structure captures the initial configuration of matrices
struct InitialDistribution {
  /// Distribution of A matrix operand
  Distribution dist_A;

  /// Distribution of B matrix operand
  Distribution dist_B;

  /// Distribution of C matrix operand
  Distribution dist_C;

  /// Seed for random number generation
  int64_t seed;

  //
  // Static function members
  //

  /// Gets the initial distribution
  static void get_distribution(cutlass::CommandLine const &args,
                               std::string const &arg,
                               Distribution &dist) {
    struct {
      const char *label;
      Distribution::Kind kind;
    } distribution_kinds[] = {{"uniform", Distribution::Uniform},
                              {"gaussian", Distribution::Gaussian},
                              {"linear", Distribution::Linear},
                              {"identity", Distribution::Identity},
                              {0, Distribution::Invalid}};

    struct {
      char const *label;
      double *member;
    } members[] = {{"min", &dist.uniform.min},
                   {"max", &dist.uniform.max},
                   {"mean", &dist.gaussian.mean},
                   {"stddev", &dist.gaussian.stddev},
                   {"offset", &dist.linear.offset},
                   {"delta_row", &dist.linear.delta_row},
                   {"delta_column", &dist.linear.delta_column},
                   {0, 0}};

    KeyValueVector values;
    args.get_cmd_line_argument_pairs(arg.c_str(), values);

    // The parser expects the first token to be a string identifying the distribution type.
    KeyValueIterator it = values.begin();
    if (it != values.end()) {
      for (int i = 0; distribution_kinds[i].label; ++i) {
        if (it->first == distribution_kinds[i].label) {
          dist.kind = distribution_kinds[i].kind;
          break;
        }
      }
      ++it;
    }

    // Subsequent key-value pairs update the named field of the distribution struct.
    for (; it != values.end(); ++it) {

      // Integer scaling factor - if < 0, no integer rounding is performed.
      if (it->first == "scale" && !it->second.empty()) {
        std::stringstream ss;
        ss << it->second;
        ss >> dist.int_scale;

        continue;  // next token
      }

      // Casts as integer without scaling
      if (it->first == "integer") {
        dist.int_scale = 0;
        continue;  // next token
      }

      // initialize other members
      for (int m = 0; members[m].label; ++m) {
        if (it->first == members[m].label && !it->second.empty()) {
          std::stringstream ss;
          ss << it->second;
          ss >> *(members[m].member);
        }
      }
    }
  }

  //
  // Methods
  //

  /// Basic uniform random distribution
  InitialDistribution(int64_t _seed = 700) : seed(_seed) {
    dist_A.set_uniform(-8, 8);
    dist_B.set_uniform(-8, 8);
    dist_C.set_uniform(-8, 8);
  }

  /// Extracts initial distribution from command line arguments
  InitialDistribution(cutlass::CommandLine const &args) {
    // Set initial values
    seed = 700;
    dist_A.set_uniform(-8, 8);
    dist_B.set_uniform(-8, 8);
    dist_C.set_uniform(-8, 8);

    // Update with command line arguments
    args.get_cmd_line_argument("seed", seed, seed);

    // Update all distributions at once
    Distribution dist_all;
    if (args.check_cmd_line_flag("dist")) {
       get_distribution(args, "dist", dist_all);
       dist_A = dist_all;
       dist_B = dist_all;
       dist_C = dist_all;
    }

    get_distribution(args, "dist_A", dist_A);
    get_distribution(args, "dist_B", dist_B);
    get_distribution(args, "dist_C", dist_C);
  }
};

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Defines how to execute the benchmarks
struct ExecutionMode {
  enum Kind {
    Profile,
    Verify,
    Single,
    Invalid
  };

  static std::string to_string(Kind kind) {
    switch (kind) {
      case Profile: return "profile";
      case Verify: return "verify";
      case Single: return "single";
      default: return "invalid";
    }
  }

  static Kind from_string(std::string const &str) {
    if (str == "profile") return Profile;
    if (str == "verify") return Verify;
    if (str == "single") return Single;
    return Profile;
  }
};

/// Indicates when the workspace is saved
struct WorkspaceSaveMode {
  enum Kind {
    Never,
    Incorrect,
    Always
  };

  static std::string to_string(Kind kind) {
    switch (kind) {
      case Never: return "never";
      case Incorrect: return "incorrect";
      case Always: return "always";
      default: return "incorrect";
    }
  }

  static Kind from_string(std::string const &str) {
    if (str == "never") return Never;
    if (str == "incorrect") return Incorrect;
    if (str == "always") return Always;
    return Incorrect;
  }
};

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Class holding testbench command line options
struct TestbenchOptions {

  //
  // Data members
  //

  /// Describes the random initial state of the input matrices
  InitialDistribution initial_distribution;

  // Path to output file name
  std::string output_filename;

  /// If true, output is appended
  bool append;

  /// Number of iterations
  int iterations;

  /// Defines how to run the benchmark
  ExecutionMode::Kind execution_mode;

  /// Indicates when the workspace is saved
  WorkspaceSaveMode::Kind save_workspace_mode;

  /// Enabled kernel names
  std::vector<std::string> kernels;

  /// Scalar value for GEMM
  double alpha;

  /// Scalar value for GEMM
  double beta;

  /// Range of problem sizes
  GemmProblemRange problem_range;

  /// Tags to describe the profiler output
  KeyValueVector pivot_tags;

  /// If enabled, only the peak performance for a given kernel is reported
  bool peak_performance;

  //
  // Methods
  //

  /// Constructs the testbench from tags
  TestbenchOptions(cutlass::CommandLine const &args)
      : initial_distribution(args),
        execution_mode(ExecutionMode::Profile),
        save_workspace_mode(WorkspaceSaveMode::Never),
        problem_range(args) {

    // fetch command line arguments
    args.get_cmd_line_argument("iterations", iterations, 25);
    args.get_cmd_line_argument("append", append, false);
    args.get_cmd_line_argument("output", output_filename);
    args.get_cmd_line_argument("alpha", alpha, 1.0);
    args.get_cmd_line_argument("beta", beta, 0.0);
    args.get_cmd_line_argument("peak", peak_performance, false);
    args.get_cmd_line_argument_pairs("tags", pivot_tags);

    if (args.check_cmd_line_flag("execution_mode")) {
      std::string str;
      args.get_cmd_line_argument("execution_mode", str);
      execution_mode = ExecutionMode::from_string(str);
    }

    if (args.check_cmd_line_flag("save_workspace")) {
      std::string str;
      args.get_cmd_line_argument("save_workspace", str);
      save_workspace_mode = WorkspaceSaveMode::from_string(str);
    }

    // query for enabled kernels or enable all of them
    if (args.check_cmd_line_flag("kernels")) {
      args.get_cmd_line_arguments("kernels", kernels, ',');
    } else {
      char const *gemms[] = {"sgemm", "dgemm", "hgemm", "igemm", "wmma_gemm", 0};
      char const *layouts[] = {"nn", "nt", "tn", "tt", 0};
      for (int i = 0; gemms[i]; ++i) {
        for (int j = 0; layouts[j]; ++j) {
          kernels.push_back(std::string(gemms[i]) + "_" + layouts[j]);
        }
      }
    }
  }

  /// Returns true if the kernel name appears among the enabled kernels
  bool kernel_enabled(std::string const &kernel) const {
    typedef std::vector<std::string>::const_iterator kernel_iterator;

    for (kernel_iterator it = kernels.begin(); it != kernels.end(); ++it) {
      if (kernel.find(*it) != std::string::npos) {
        return true;
      }
    }

    return false;
  }

  /// Given the disposition of a GEMM problem, returns true if the results should
  /// be saved to the file system.
  bool save_workspace(bool correct) const {
    if (save_workspace_mode == WorkspaceSaveMode::Always ||
      (save_workspace_mode == WorkspaceSaveMode::Incorrect && !correct)) {
      return true;
    }
    return false;
  }

  /// Prints the usage statement
  static void usage(std::ostream &out) {

    out << "cutlass_perf_test [options]\n\n"

        << "  --help\n"

        << "  --append=<true|false*>                        "
        << "    If true, appends output to existing CSV file. If false, overwrites.\n"

        << "  --alpha=<alpha>                               "
        << "    Value for alpha to be used in GEMM experiments\n"

        << "  --beta=<beta>                                 "
        << "    Value for beta to be used in GEMM experiments\n"

        << "  --dist_{A,B,C}=<distribution>                 "
        << "    Describes the random distribution of each of the input matrix operands.\n"

        << "  --execution_mode=<mode>                       "
        << "    Specifies execution mode: profile, verify, single\n"

        << "  --output=<filename.csv>                       "
        << "    Writes summary of profiling to specified .csv file\n"

        << "  --iterations=<timing iterations>              "
        << "    maximum number of iterations to execute when profiling\n"

        << "  --m=<height>[:max height[:step]]              "
        << "    Height of GEMM problem (number of rows of C). May specify a range with optional "
           "step size.\n"

        << "  --n=<width>[:max width[:step]]                "
        << "    Width of GEMM problem (number of columns of C). May specify a range with optional "
           "step size.\n"

        << "  --k=<depth>[:max depth[:step]]                "
        << "    Size of inner dimension of A and B. May specify a range with optional step size.\n"

        << "  --kernels={s|d|h|i|wmma_}gemm_{nn,nt,tn,tt}   "
        << "    Select GEMM datatype and layout to use for tests\n"

        << "  --peak=<bool>                                 "
        << "    If true, only reports peak performance per kernel after profiling specified "
           "problem space.\n"

        << "  --save_workspace={*never,incorrect,always}    "
        << "    Specifies when to save the GEMM inputs and results to the filesystem.\n"

        << "  --seed=<seed>                                 "
        << "    Random seed used by the random number generator in initializing input matrices.\n"

        << "  --tags=<column:tag,...>                       "
        << "    Inserts leading columns in output table and uniform values for each column. Useful "
           "for generating pivot tables.\n"

        << "\n\n"

        << "Example usage:\n\n"

        << "# Runs one problem size for all kernels\n"
        << "./tools/test/perf/cutlass_perf_test --m=10240 --n=1024 --k=1024\n\n"

        << "# Varies GEMM K dimension for SGEMM and IGEMM with column-major multiplicands\n"
        << "./tools/test/perf/cutlass_perf_test --m=10240 --n=4096 --k=1024:8192:128 "
           "--kernels=sgemm_nn,igemm_nn\n\n"

        << std::flush;
  }
};

///////////////////////////////////////////////////////////////////////////////////////////////////

}  // namespace perf