/***************************************************************************************************
 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
    \brief Test environment for GEMM
*/

#pragma once

#include <algorithm>  // std::max
#include <cstdint>    // int64_t, uint64_t
#include <cstdlib>    // rand(), srand(), std::abs
#include <fstream>
#include <iomanip>
#include <sstream>
#include <stdexcept>  // std::runtime_error
#include <string>
#include <typeinfo>   // typeid

#include <cublas_v2.h>

#include <cutlass/matrix_traits.h>
#include <cutlass/util/platform.h>

#include <tools/util/host_tensor.h>
#include <tools/util/tensor_view_io.h>
#include <tools/util/type_traits.h>

namespace cutlass {

// Forward declaration of cutlass::WmmaMatrix
template <cutlass::GemmOperand::Kind kOperand_,
          cutlass::MatrixLayout::Kind kLayout_,
          typename Scalar_,
          typename WmmaShape_>
struct WmmaMatrix;

}  // namespace cutlass

namespace test {

/// Traits describing types used by the GEMM testbed; defaults to cutlass::TypeTraits
template <typename T>
struct GemmTestbedTraits : public cutlass::TypeTraits<T> {};

/// Partial specialization for cutlass::WmmaMatrix fragments: comparison and printing
/// are expressed in terms of the underlying Scalar_ type
template <cutlass::GemmOperand::Kind kOperand_,
          cutlass::MatrixLayout::Kind kLayout_,
          typename Scalar_,
          typename WmmaShape_>
struct GemmTestbedTraits<cutlass::WmmaMatrix<kOperand_, kLayout_, Scalar_, WmmaShape_> > {
  static cudaDataType_t const cublas_type = cutlass::TypeTraits<Scalar_>::cublas_type;
  typedef Scalar_ host_type;
  typedef Scalar_ device_type;
  static inline double remove_negative_zero(double x) { return x == -0.0 ? 0.0 : x; }
  static inline double to_print(double x) { return x; }
};

/// Testbed for verifying a GEMM implementation against a host-side reference and cuBLAS
template <typename AType, typename BType, typename CType, typename Accumulator, typename Scalar>
struct GemmTestbed {
  //
  // Type definitions
  //

  /// Host tensor for operand A
  typedef cutlass::HostTensor<AType> HostTensorA;

  /// Host tensor for operand B
  typedef cutlass::HostTensor<BType> HostTensorB;

  /// Host tensor for operand C
  typedef cutlass::HostTensor<CType> HostTensorC;

  /// Functor to print errors
  struct PrintErrors {
    /// Equivalently sized integer type
    typedef typename GemmTestbedTraits<CType>::integer_type integer_t;

    /// Output stream to write to
    std::ostream& out;

    /// Reference tensor view
    cutlass::HostTensorView<CType> const& reference;

    /// Computed tensor view
    cutlass::HostTensorView<CType> const& experimental;

    /// Errors greater than or equal to this amount result in printing
    integer_t ulps_threshold;

    ///
    PrintErrors(std::ostream& _out,
                cutlass::HostTensorView<CType> const& _reference,
                cutlass::HostTensorView<CType> const& _experimental,
                integer_t _ulps_threshold = 1)
        : out(_out),
          reference(_reference),
          experimental(_experimental),
          ulps_threshold(_ulps_threshold) {}

    /// Compares one element
    void operator()(CType const& element, typename HostTensorC::Coord_t coord) {
      CType exp = experimental.at(coord);
      CType ref = reference.at(coord);

      // Copy each element's raw bit pattern into the low-order bytes of a
      // zero-initialized 64-bit integer (assumes a little-endian host) so the
      // difference can be measured in units of least precision (ulps)
      int64_t int_exp = 0;
      int64_t int_ref = 0;

      *reinterpret_cast<CType*>(&int_exp) = exp;
      *reinterpret_cast<CType*>(&int_ref) = ref;

      integer_t ulps = integer_t(int_exp - int_ref);

      if (std::abs(ulps) >= ulps_threshold) {
        // width in hexadecimal digits of value
        int const width = sizeof(integer_t) * 2;

        double relative = double(exp) - double(ref);
        if (ref != CType(0)) {
          relative /= double(ref);
        }

        out << "[" << coord << "] expected: " << GemmTestbedTraits<CType>::to_print(ref) << " (0x"
            << std::hex << std::setw(width) << std::setfill('0') << integer_t(int_ref) << std::dec
            << ")"
            << ", got: " << GemmTestbedTraits<CType>::to_print(exp) << " (0x" << std::hex
            << std::setw(width) << std::setfill('0') << integer_t(int_exp) << std::dec << ")"
            << " relative error: " << relative << ", ulps: " << ulps << "\n";
      }
    }
  };
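
  // Worked example of the ulps metric above: 1.0f has bit pattern 0x3f800000,
  // and the next representable float, 0x3f800001, differs from it by exactly
  // 1 ulp; with the default threshold of 1, any bitwise mismatch is reported.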

  /// Generates random elements
  template <typename T>
  struct RandomGenerator {
    RandomGenerator(int seed = -1, bool only_ones_ = false) : only_ones(only_ones_) { srand(seed); }

    T operator()() {
      if (only_ones) {
        return T(1);
      } else {
        int val = (rand() % 16) - 8;
        return T(val);
      }
    }

    bool only_ones;
  };
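
  // Note: drawing small integers in [-8, 7] keeps every element, and modest
  // sums of their products, exactly representable in half, single, and double
  // precision, which makes the bitwise bit_equals() comparisons used by the
  // verify_*() methods below achievable across the host, cuBLAS, and CUTLASS
  // paths for the problem sizes tested.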

  //
  // Data members
  //

  /// Status of the most recent cuBLAS call
  cublasStatus_t status;

  /// cuBLAS handle
  cublasHandle_t handle;

  /// cuBLAS GEMM algorithm selector
  cublasGemmAlgo_t algorithm;

  /// A matrix operand
  HostTensorA A;

  /// Layout of A matrix
  cublasOperation_t layout_A;

  /// B matrix operand
  HostTensorB B;

  /// Layout of B matrix
  cublasOperation_t layout_B;

  /// C matrix operand
  HostTensorC C_initial;

  /// Reference result computed on the host
  cutlass::HostTensor<CType, false> ref_host;

  /// Reference result computed with cuBLAS
  HostTensorC ref_cublas;

  /// Computed result
  HostTensorC computed;

  /// Linear scaling factor applied to the product A * B
  Scalar alpha;

  /// Linear scaling factor applied to the source matrix C
  Scalar beta;

  //
  // Static helpers
  //

  /// Helper to resize a matrix with a given size and layout
  template <typename T, bool DeviceBacked>
  static void resize(cutlass::HostTensor<T, DeviceBacked>& tensor,
                     int rows,
                     int columns,
                     cublasOperation_t layout,
                     int ldm = 0) {
    // Default to a densely packed leading dimension
    if (!ldm) {
      ldm = (layout == CUBLAS_OP_N ? rows : columns);
    }

    typedef cutlass::Coord<cutlass::HostTensor<T>::Rank> Coord_t;

    // Rank-4 coordinates map rows to Dim_H and columns to Dim_W. Column-major
    // (CUBLAS_OP_N) uses a unit stride between consecutive rows; row-major
    // (CUBLAS_OP_T) uses a unit stride between consecutive columns.
    Coord_t stride = cutlass::make_Coord(
        rows * columns, layout == CUBLAS_OP_N ? 1 : ldm, layout == CUBLAS_OP_N ? ldm : 1, 1);

    Coord_t size = cutlass::make_Coord(1, rows, columns, 1);

    tensor.reset(stride, size);
  }
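
  // Example of the resulting strides: a 4x3 column-major matrix with the
  // default ldm = 4 yields size = (1, 4, 3, 1) and stride = (12, 1, 4, 1),
  // so consecutive rows are adjacent in memory while advancing one column
  // skips ldm = 4 elements.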

  //
  // Methods
  //

  /// Constructs a workspace for verifying GEMM, assuming dense packing of all operands
  GemmTestbed(int M_,
              int N_,
              int K_,
              cublasOperation_t layout_a,
              cublasOperation_t layout_b,
              Scalar alpha_ = Scalar(1),
              Scalar beta_ = Scalar(0),
              cublasGemmAlgo_t algorithm_ = CUBLAS_GEMM_DEFAULT,
              cublasOperation_t layout_c = CUBLAS_OP_N)
      : layout_A(layout_a), layout_B(layout_b), alpha(alpha_), beta(beta_), algorithm(algorithm_) {
    status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS) {
      throw cutlass::cuda_exception("Failed to create CUBLAS handle");
    }

    resize(A, M_, K_, layout_a);
    resize(B, K_, N_, layout_b);
    resize(C_initial, M_, N_, layout_c);
    resize(ref_host, M_, N_, layout_c);
    resize(ref_cublas, M_, N_, layout_c);
    resize(computed, M_, N_, layout_c);
  }

  /// Constructs a workspace for verifying GEMM with arbitrary strides
  GemmTestbed(int M_,
              int N_,
              int K_,
              int ldc,
              cublasOperation_t layout_a,
              int lda,
              cublasOperation_t layout_b,
              int ldb,
              Scalar alpha_ = Scalar(1),
              Scalar beta_ = Scalar(0),
              cublasGemmAlgo_t algorithm_ = CUBLAS_GEMM_DEFAULT,
              cublasOperation_t layout_c = CUBLAS_OP_N)
      : layout_A(layout_a), layout_B(layout_b), alpha(alpha_), beta(beta_), algorithm(algorithm_) {
    status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS) {
      throw cutlass::cuda_exception("Failed to create CUBLAS handle");
    }

    resize(A, M_, K_, layout_a, lda);
    resize(B, K_, N_, layout_b, ldb);
    resize(C_initial, M_, N_, layout_c, ldc);
    resize(ref_host, M_, N_, layout_c, ldc);
    resize(ref_cublas, M_, N_, layout_c, ldc);
    resize(computed, M_, N_, layout_c, ldc);
  }

  ~GemmTestbed() { status = cublasDestroy(handle); }

  /// Returns true if the last cuBLAS call returned successfully
  bool good() const { return status == CUBLAS_STATUS_SUCCESS; }

  /// Returns a pointer to the A operand in device memory
  typename HostTensorA::DeviceType* ptr_A() const { return A.device_data(); }

  /// Leading dimension of A matrix
  int lda() const { return std::max(A.stride(HostTensorA::Dim_H), A.stride(HostTensorA::Dim_W)); }

  /// Returns a pointer to the B operand in device memory
  typename HostTensorB::DeviceType* ptr_B() const { return B.device_data(); }

  /// Leading dimension of B matrix
  int ldb() const { return std::max(B.stride(HostTensorB::Dim_H), B.stride(HostTensorB::Dim_W)); }

  /// Returns a pointer to the initial state of the result tensor in device memory
  typename HostTensorC::DeviceType* ptr_C_initial() const { return C_initial.device_data(); }

  /// Returns a pointer to the computed result tensor in device memory
  typename HostTensorC::DeviceType* ptr_computed() const { return computed.device_data(); }

  /// Returns a pointer to the cuBLAS reference tensor in device memory
  typename HostTensorC::DeviceType* ptr_cublas() const { return ref_cublas.device_data(); }

  /// Leading dimension of C matrix
  int ldc() const {
    return std::max(C_initial.stride(HostTensorC::Dim_H), C_initial.stride(HostTensorC::Dim_W));
  }

  /// Returns the number of flops implied by the computation (1 multiply-accumulate = 2 flops)
  uint64_t flops() const { return uint64_t(M()) * uint64_t(N()) * uint64_t(K()) * 2ULL; }

  /// Computes the speed of the computation in GFLOPs/s
  double GFLOPs_per_sec(double runtime_ms) const { return double(flops()) / runtime_ms / 1.0e6; }
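
  // Unit check for the divisor above: flops() / (runtime_ms * 1e-3) gives
  // flop/s, and dividing by 1e9 converts to GFLOP/s, hence the combined
  // factor of 1e6. For example, M = N = K = 1024 gives
  // flops() = 2 * 1024^3 = 2147483648, so a 2 ms run scores
  // 2147483648 / 2 / 1e6, or roughly 1074 GFLOP/s.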

  /// Matrix layout of A
  cublasOperation_t layout_a() const { return layout_A; }

  /// Matrix layout of B
  cublasOperation_t layout_b() const { return layout_B; }

  /// Number of rows of the problem
  int M() const { return C_initial.size(HostTensorC::Dim_H); }

  /// Number of columns of the problem
  int N() const { return C_initial.size(HostTensorC::Dim_W); }

  /// Inner dimension of the problem
  int K() const { return A.size(HostTensorA::Dim_W); }

  /// Initializes data, randomly
  void initialize(int seed = -1) {
    // Offset the seeds so the three operands receive distinct sequences
    A.fill_random(RandomGenerator<AType>(seed));
    B.fill_random(RandomGenerator<BType>(seed + 11));
    C_initial.fill_random(RandomGenerator<CType>(seed + 13));
  }

  /// Computes the matrix product on the host
  void compute_host() {
    ref_host.fill(C_initial);
    ref_host.template gemm<AType, BType, Accumulator, Scalar>(A, B, alpha, beta);
  }

  /// Executes an equivalent GEMM using cuBLAS
  bool execute_cublas() {
    status = cublasGemmEx(handle,
                          layout_a(),
                          layout_b(),
                          M(),
                          N(),
                          K(),
                          &alpha,
                          ptr_A(),
                          cutlass::TypeTraits<AType>::cublas_type,
                          lda(),
                          ptr_B(),
                          cutlass::TypeTraits<BType>::cublas_type,
                          ldb(),
                          &beta,
                          ref_cublas.device_data(),
                          cutlass::TypeTraits<CType>::cublas_type,
                          ldc(),
                          cutlass::TypeTraits<Accumulator>::cublas_type,
                          algorithm);

    return status == CUBLAS_STATUS_SUCCESS;
  }

  /// Computes the matrix product using cuBLAS
  void compute_cublas() {
    ref_cublas.fill(C_initial);

    if (!execute_cublas()) {
      throw std::runtime_error("compute_cublas() failed");
    }
  }
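
  // Note: the computeType passed to cublasGemmEx above is derived from the
  // Accumulator template parameter, and cublasGemmEx reads alpha and beta
  // through pointers of that same type. This is why has_cublas_support()
  // below requires Accumulator and Scalar to be the same type.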

  //
  // Utility methods
  //

  /// Names a problem based on data type and problem size
  std::string workspace_name() const {
    std::stringstream ss;
    ss << "gemm_" << (layout_a() == CUBLAS_OP_N ? "n" : "t")
       << (layout_b() == CUBLAS_OP_N ? "n" : "t") << "_" << typeid(AType).name() << "_"
       << typeid(BType).name() << "_" << typeid(CType).name() << "_" << typeid(Accumulator).name()
       << "_" << typeid(Scalar).name() << "_" << M() << "x" << N() << "x" << K();

    return ss.str();
  }

  /// Writes the workspace to an ostream
  std::ostream& write(std::ostream& out) const {
    out << "A = " << A << "\nB = " << B << "\nC_initial = " << C_initial
        << "\nref_host = " << ref_host << "\nref_cublas = " << ref_cublas
        << "\ncomputed = " << computed << std::endl;

    return out;
  }

  /// Outputs each mismatching element
  std::ostream& write_errors(std::ostream& out,
                             cutlass::HostTensorView<CType> const& experimental,
                             cutlass::HostTensorView<CType> const& ref) const {
    PrintErrors printer(out, ref, experimental);

    // Visit the experimental view so the comparison covers exactly the tensor
    // passed by the caller
    experimental.visit(printer);

    return out;
  }

  /// Syncs all input tensors to device memory
  void sync_device() {
    A.sync_device();
    B.sync_device();
    C_initial.sync_device();

    // Every output starts from the same initial C so that beta scaling is
    // applied to identical source data on each path
    ref_host.fill(C_initial);
    ref_cublas.fill(C_initial);
    computed.fill(C_initial);

    ref_cublas.sync_device();
    computed.sync_device();
  }

  /// Syncs all output tensors to host memory
  void sync_host() {
    computed.sync_host();
    ref_cublas.sync_host();
  }

  /// Saves the workspace to files
  void save_workspace(cutlass::HostTensorView<CType> const& experimental,
                      cutlass::HostTensorView<CType> const& ref) {
    std::string name = workspace_name();

    std::string results_name = name + "_results.txt";
    std::string errors_name = name + "_errors.txt";

    std::ofstream results(results_name.c_str());
    std::ofstream errors(errors_name.c_str());

    write(results);
    write_errors(errors, experimental, ref);
  }

  /// Verifies the computed result equals the host-side reference
  bool verify_with_host(bool save_on_error = true, bool always_print = false) {
    compute_host();
    computed.sync_host();

    bool passed = computed.bit_equals(ref_host);

    if ((!passed && save_on_error) || always_print) {
      save_workspace(computed, ref_host);
    }
    return passed;
  }

  /// Verifies the computed result equals the cuBLAS reference
  bool verify_with_cublas(bool save_on_error = true, bool always_print = false) {
    compute_cublas();

    ref_cublas.sync_host();
    computed.sync_host();

    bool passed = computed.bit_equals(ref_cublas);

    if ((!passed && save_on_error) || always_print) {
      save_workspace(computed, ref_cublas);
    }
    return passed;
  }

  /// Verifies the host computation against cuBLAS
  bool verify_host_with_cublas(bool save_on_error = true, bool always_print = false) {
    compute_host();
    compute_cublas();
    ref_cublas.sync_host();

    bool passed = ref_host.bit_equals(ref_cublas);

    if ((!passed && save_on_error) || always_print) {
      save_workspace(ref_host, ref_cublas);
    }

    return passed;
  }

  /// Verifies with both host-side and device-side reference computations
  bool verify_with_all() {
    bool passed = true;

    computed.sync_host();

    // verify on host
    passed = (passed && verify_with_host());

    // verify with cublas
    passed = (passed && verify_with_cublas());

    return passed;
  }

  /// Returns true if the cuBLAS path supports this problem (alpha and beta must
  /// match the accumulator type used as cublasGemmEx's compute type)
  bool has_cublas_support() const { return cutlass::platform::is_same<Accumulator, Scalar>::value; }
};

}  // namespace test

namespace cutlass {

/// Maps a CUTLASS matrix layout to the equivalent cuBLAS transpose operation:
/// cuBLAS assumes column-major storage, so a row-major matrix is presented to
/// it as the transpose of a column-major one
inline cublasOperation_t convert(cutlass::MatrixLayout::Kind layout) {
  switch (layout) {
    case cutlass::MatrixLayout::kRowMajor:
      return CUBLAS_OP_T;
    case cutlass::MatrixLayout::kColumnMajor:
      return CUBLAS_OP_N;
    default:
      break;
  }
  return CUBLAS_OP_N;
}

}  // namespace cutlass
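
// Example usage (a sketch; `my_gemm_launch` stands in for whatever CUTLASS GEMM
// dispatch a test exercises and is not defined in this file):
//
//   test::GemmTestbed<float, float, float, float, float> testbed(
//       128, 128, 64, CUBLAS_OP_N, CUBLAS_OP_T);
//
//   testbed.initialize();
//   testbed.sync_device();
//
//   my_gemm_launch(testbed.M(), testbed.N(), testbed.K(),
//                  testbed.alpha, testbed.ptr_A(), testbed.lda(),
//                  testbed.ptr_B(), testbed.ldb(),
//                  testbed.beta, testbed.ptr_computed(), testbed.ldc());
//
//   bool passed = testbed.verify_with_all();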