// cutlass/tools/test/perf/gemm/gemm_perf_testbed.h
/***************************************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#pragma once
// Standard Library includes
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <ostream>
#include <stdexcept>
#include <string>
#include <utility>
// CUDA includes
#include <cublas_v2.h>
#include <curand_kernel.h>
// Cutlass includes
#include "tools/test/perf/gemm/cublas_dispatch.h"
#include "tools/test/perf/performance_result.h"
#include "tools/test/perf/testbench_options.h"
#include "tools/util/device_memory.h"
#include "tools/util/host_matrix.h"
#include "tools/util/reference/device/tensor_elementwise.h"
#include "tools/util/tensor_view_io.h"
#include "tools/util/type_traits.h"
namespace perf {
////////////////////////////////////////////////////////////////////////////////////////////////////
namespace detail {
template <typename T>
struct ElementCount {
static int const kValue = 1;
};
template <typename T, int Elements>
struct ElementCount<cutlass::Vector<T, Elements> > {
static int const kValue = Elements * ElementCount<T>::kValue;
};
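// Example: ElementCount<float>::kValue == 1, while
// ElementCount<cutlass::Vector<T, 8> >::kValue == 8 * ElementCount<T>::kValue.
// This lets flops() count scalar multiply-accumulates even when operands are
// packed vector types.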
} // namespace detail
////////////////////////////////////////////////////////////////////////////////////////////////////
/// Host-side implementation of performance testbed
template <typename AType, typename BType, typename CType, typename Accumulator, typename Scalar>
class GemmTestbed {
public:
/// Type used for device-side allocations
typedef typename cutlass::TypeTraits<AType>::device_type ADeviceType;
typedef typename cutlass::TypeTraits<BType>::device_type BDeviceType;
typedef typename cutlass::TypeTraits<CType>::device_type CDeviceType;
typedef typename cutlass::TypeTraits<Accumulator>::device_type AccumulatorDeviceType;
typedef typename cutlass::TypeTraits<Scalar>::device_type ScalarDeviceType;
/// Dispatch object to cuBLAS GEMM
typedef CublasGemmDispatch<AType, BType, CType, Accumulator, Scalar> CublasDispatch;
/// Dispatch object to cuBLAS batched strided GEMM. The typedef name must differ from
/// the template's name, or the name would change meaning within this class scope.
typedef CublasBatchedStridedGemmDispatch<AType, BType, CType, Accumulator, Scalar> CublasBatchedDispatch;
//
// Type definitions
//
/// Device allocation for operand A
typedef cutlass::device_memory::allocation<ADeviceType> TensorA;
/// Device allocation for operand B
typedef cutlass::device_memory::allocation<BDeviceType> TensorB;
/// Device allocation for operand C
typedef cutlass::device_memory::allocation<CDeviceType> TensorC;
private:
//
// Data members
//
/// Initial distributions of the input tensors
InitialDistribution initial_distribution;
/// Status
cublasStatus_t status;
/// cuBLAS handle
cublasHandle_t handle;
/// GEMM problem
GemmProblem problem;
/// A matrix operand
TensorA A;
/// B matrix operand
TensorB B;
/// C matrix operand
TensorC C_initial;
/// Reference result
TensorC reference;
/// Experimental result
TensorC experimental;
private:
//
// Methods
//
/// Helper to reallocate and randomly initialize a matrix of the given size and
/// layout if the existing allocation is too small
template <typename T>
static void resize_device_allocation(cutlass::device_memory::allocation<T> &tensor,
cutlass::Distribution const &dist,
int64_t seed,
int rows,
int columns,
cutlass::MatrixLayout::Kind layout,
int ldm = 0) {
if (!ldm) {
ldm = (layout == cutlass::MatrixLayout::kColumnMajor ? rows : columns);
}
size_t capacity = size_t(ldm) * (layout == cutlass::MatrixLayout::kColumnMajor ? columns : rows);
if (capacity > tensor.capacity) {
tensor.reset(cutlass::device_memory::allocate<T>(capacity), capacity);
int c_dim = (layout == cutlass::MatrixLayout::kColumnMajor ? rows : columns);
int s_dim = (layout == cutlass::MatrixLayout::kColumnMajor ? columns : rows);
cutlass::TensorView<T, 2> view(
tensor.get(),
cutlass::make_Coord(ldm, 1),
cutlass::make_Coord(s_dim, c_dim));
cutlass::reference::device::TensorInitialize(view, seed, dist);
}
}
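// Example: for a 128x64 column-major matrix with ldm defaulted, ldm becomes the
// row count (128) and capacity = 128 * 64 elements. The TensorView then maps
// coordinate (column, row) to offset column * ldm + row, and the allocation is
// filled according to 'dist' only when it had to grow.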
/// Resizes each tensor
void resize_helper(GemmProblem const &problem) {
resize_device_allocation(A,
initial_distribution.dist_A,
initial_distribution.seed,
problem.m,
problem.k * problem.batch_count,
problem.layout_A);
resize_device_allocation(
B,
initial_distribution.dist_B,
initial_distribution.seed + 17, // compute distinct value from initial seed
problem.k * problem.batch_count,
problem.n,
problem.layout_B);
resize_device_allocation(
C_initial,
initial_distribution.dist_C,
initial_distribution.seed + 101, // compute distinct value from initial seed
problem.m,
problem.n * problem.batch_count,
cutlass::MatrixLayout::kColumnMajor);
resize_device_allocation(reference,
cutlass::Distribution(),
0,
problem.m,
problem.n * problem.batch_count,
cutlass::MatrixLayout::kColumnMajor);
resize_device_allocation(experimental,
cutlass::Distribution(),
0,
problem.m,
problem.n * problem.batch_count,
cutlass::MatrixLayout::kColumnMajor);
}
/// Functor to print errors
struct PrintErrors {
/// Equivalently sized integer type
typedef typename cutlass::TypeTraits<CType>::integer_type integer_t;
/// Performance testbench defined for a TensorView of rank-2 contiguous matrices
typedef cutlass::TensorView<CType, 2, cutlass::MatrixLayout::ContiguousLayout> MatrixView;
/// Output stream to write to
std::ostream &out;
/// Reference tensor view
MatrixView const &reference;
/// Computed tensor view
MatrixView const &experimental;
/// Errors greater than or equal to this threshold (in ULPs) result in printing
integer_t ulps_threshold;
/// Constructs the functor
PrintErrors(std::ostream &_out,
MatrixView const &_reference,
MatrixView const &_experimental,
integer_t _ulps_threshold = 1)
: out(_out),
reference(_reference),
experimental(_experimental),
ulps_threshold(_ulps_threshold) {}
/// Compares one element
void operator()(CType const &element, typename MatrixView::TensorCoord coord) {
CType exp = experimental.at(coord);
CType ref = reference.at(coord);
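// Reinterpret the bit patterns of both results as integers so their difference
// measures units in the last place (ULPs). Writing through a CType pointer into
// a zero-initialized int64_t assumes a little-endian host and
// sizeof(CType) <= sizeof(int64_t).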
int64_t int_exp = 0;
int64_t int_ref = 0;
*reinterpret_cast<CType *>(&int_exp) = exp;
*reinterpret_cast<CType *>(&int_ref) = ref;
integer_t ulps = integer_t(int_exp - int_ref);
if (std::abs(ulps) >= ulps_threshold) {
// width in hexadecimal digits of value
int const width = sizeof(integer_t) * 2;
double relative = double(exp) - double(ref);
if (ref != CType(0)) {
relative /= double(ref);
}
out << "[" << coord << "] expected: " << ref << " (0x" << std::hex << std::setw(width)
<< std::setfill('0') << integer_t(int_ref) << std::dec << ")"
<< ", got: " << exp << " (0x" << std::hex << std::setw(width) << std::setfill('0')
<< integer_t(int_exp) << std::dec << ")"
<< " relative error: " << relative << ", ulps: " << ulps << "\n";
}
}
};
public:
/// Resizes tensors to accommodate the given problem
void resize(GemmProblem const &_problem) {
problem = _problem;
try {
resize_helper(problem);
} catch (...) {
// If out of memory, clear each allocation then allocate again
A.reset();
B.reset();
C_initial.reset();
reference.reset();
experimental.reset();
resize_helper(problem);
}
}
/// Constructs a basic workspace
GemmTestbed(InitialDistribution const &_dist = InitialDistribution())
: initial_distribution(_dist) {
status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS) {
throw cutlass::cuda_exception("Failed to create CUBLAS handle");
}
}
/// Constructs a workspace for verifying GEMM; assumes dense packing. The algorithm
/// argument is accepted for interface compatibility but is currently unused.
GemmTestbed(GemmProblem const &_problem,
cublasGemmAlgo_t algorithm_ = CUBLAS_GEMM_DEFAULT,
InitialDistribution const &_dist = InitialDistribution())
: initial_distribution(_dist), problem(_problem) {
status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS) {
throw cutlass::cuda_exception("Failed to create CUBLAS handle");
}
resize(problem);
}
~GemmTestbed() { status = cublasDestroy(handle); }
/// Returns true if the last CUBLAS call returned successfully
bool good() const { return status == CUBLAS_STATUS_SUCCESS; }
/// Rows of GEMM problem
int M() const { return problem.m; }
/// Columns of GEMM problem
int N() const { return problem.n; }
/// Inner dimension of GEMM problem
int K() const { return problem.k; }
/// Batch count of the GEMM problem
int batch_count() const { return problem.batch_count; }
/// Returns a pointer to the A operand
ADeviceType *ptr_A() const { return A.get(); }
/// Leading dimension of A
int lda() const { return problem.lda(); }
/// Batch stride of the A operand
long long int batch_stride_a() const { return problem.batch_stride_a(); }
/// Returns a pointer to the B operand
BDeviceType *ptr_B() const { return B.get(); }
/// Leading dimension of B
int ldb() const { return problem.ldb(); }
/// Batch stride of the B operand
long long int batch_stride_b() const { return problem.batch_stride_b(); }
/// Returns a pointer to the initial state of the result tensor in device memory
CDeviceType *ptr_C_initial() const { return C_initial.get(); }
/// Leading dimension of C
int ldc() const { return problem.ldc(); }
/// Batch stride of the C operand
long long int batch_stride_c() const { return problem.batch_stride_c(); }
/// Returns a pointer to the experimental result tensor in device memory
CDeviceType *ptr_experimental() const { return experimental.get(); }
/// Returns a pointer to the reference result tensor in device memory
CDeviceType *ptr_reference() const { return reference.get(); }
/// Returns the number of flops implied by the computation (1 multiply-accumulate = 2 flops)
uint64_t flops() const {
return uint64_t(problem.batch_count) * uint64_t(problem.m) * uint64_t(problem.n) * uint64_t(problem.k) * detail::ElementCount<AType>::kValue * 2ULL;
}
/// Computes the speed of the computation in GFLOPs/s
double GFLOPs_per_sec(double runtime_ms) const { return double(flops()) / runtime_ms / 1.0e6; }
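// Worked example: for m = n = k = 1024 with batch_count = 1 and scalar operands,
// flops() = 2 * 1024^3 ~= 2.147e9. A 1.0 ms runtime then gives
// GFLOPs_per_sec(1.0) = 2.147e9 / 1.0 / 1.0e6 ~= 2147 GFLOP/s; the 1.0e6 divisor
// combines the ms-to-s (1e3) and flop-to-GFLOP (1e-9) conversions.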
/// Matrix layout of A
cutlass::MatrixLayout::Kind layout_a() const { return problem.layout_A; }
/// Matrix layout of B
cutlass::MatrixLayout::Kind layout_b() const { return problem.layout_B; }
/// Returns alpha scalar
Scalar alpha() const { return Scalar(problem.alpha); }
/// Returns beta scalar
Scalar beta() const { return Scalar(problem.beta); }
/// Initializes an output matrix by copying from C_initial across all batches
void prepare_gemm(CDeviceType *target) {
  size_t count = size_t(ldc()) * problem.n * problem.batch_count;
  cutlass::device_memory::copy_device_to_device(target, ptr_C_initial(), count);
}
/// Initializes the output matrix used by cuBLAS as the reference
void prepare_cublas() { prepare_gemm(ptr_reference()); }
/// Initializes the output matrix used by the experimental kernel
void prepare_experimental() { prepare_gemm(ptr_experimental()); }
/// Launches the cuBLAS GEMM - does not initialize output matrix
cublasStatus_t launch_cublas(cublasGemmAlgo_t algo) {
if (problem.batch_count == 1) {
CublasDispatch dispatch;
Scalar alpha(problem.alpha);
Scalar beta(problem.beta);
status = dispatch(handle,
problem.layout_A,
problem.layout_B,
problem.m,
problem.n,
problem.k,
alpha,
ptr_A(),
lda(),
ptr_B(),
ldb(),
beta,
ptr_reference(),
ldc(),
algo);
return status;
}
else {
  // call batched strided cuBLAS GEMM
  CublasBatchedDispatch dispatch;
  Scalar alpha(problem.alpha);
  Scalar beta(problem.beta);
status = dispatch(handle,
problem.layout_A,
problem.layout_B,
problem.m,
problem.n,
problem.k,
alpha,
ptr_A(),
lda(),
batch_stride_a(),
ptr_B(),
ldb(),
batch_stride_b(),
beta,
ptr_reference(),
ldc(),
batch_stride_c(),
batch_count(),
algo);
return status;
}
}
/// Verifies the 'test' tensor against 'ref'
bool verify(TensorC const &test, TensorC const &ref) {
  // Views cover all batches: the output is m rows by (n * batch_count) columns,
  // column-major with leading dimension ldc().
  return cutlass::reference::device::TensorEquals(
    cutlass::TensorView<CDeviceType, 2>(
      test.get(),
      cutlass::make_Coord(ldc(), 1),
      cutlass::make_Coord(problem.n * problem.batch_count, problem.m)),
    cutlass::TensorView<CDeviceType, 2>(
      ref.get(),
      cutlass::make_Coord(ldc(), 1),
      cutlass::make_Coord(problem.n * problem.batch_count, problem.m))
  );
}
/// Computes the reference output
void compute_reference(cublasGemmAlgo_t algorithm) {
prepare_cublas();
launch_cublas(algorithm);
}
/// Helper to verify with reference
bool verify_with_reference() { return verify(experimental, reference); }
/// Writes the problem to an ostream in human-readable form
void write_problem(std::ostream &results_output, std::ostream &errors_output) {
cutlass::HostMatrix<AType> host_A;
cutlass::HostMatrix<BType> host_B;
cutlass::HostMatrix<CType> host_C;
cutlass::HostMatrix<CType> host_D;
cutlass::HostMatrix<CType> host_Ref;
host_A.resize_matrix(M(), K(), layout_a());
host_B.resize_matrix(K(), N(), layout_b());
host_C.resize_matrix(M(), N(), cutlass::MatrixLayout::kColumnMajor);
host_D.resize_matrix(M(), N(), cutlass::MatrixLayout::kColumnMajor);
host_Ref.resize_matrix(M(), N(), cutlass::MatrixLayout::kColumnMajor);
// copy from device allocations
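// (note: for batched problems these copies capture only the first batch, since
// the host matrices above are sized M x K, K x N, and M x N)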
host_A.copy_to_host(ptr_A());
host_B.copy_to_host(ptr_B());
host_C.copy_to_host(ptr_C_initial());
host_D.copy_to_host(ptr_experimental());
host_Ref.copy_to_host(ptr_reference());
// write out human readable
results_output << "A =\n"
<< host_A << "\n"
<< "B =\n"
<< host_B << "\n"
<< "C = \n"
<< host_C << "\n"
<< "Ref =\n"
<< host_Ref << "\n"
<< "Experimental =\n"
<< host_D << "\n";
// write out list of errors
PrintErrors printer(errors_output, host_Ref, host_D);
host_D.visit(printer);
}
};
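////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Illustrative usage sketch (not part of the testbed; assumes a GemmProblem instance
// 'problem' built elsewhere, e.g. from tools/test/perf/testbench_options.h):
//
//   GemmTestbed<float, float, float, float, float> testbed(problem);
//
//   testbed.compute_reference(CUBLAS_GEMM_DEFAULT);  // cuBLAS reference result
//
//   testbed.prepare_experimental();                  // copy C_initial into the output
//   // ... launch the kernel under test, writing to testbed.ptr_experimental() ...
//
//   if (!testbed.verify_with_reference()) {
//     testbed.write_problem(std::cout, std::cerr);   // dump operands and per-element errors
//   }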
} // namespace perf