/***************************************************************************************************
* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#pragma once
// Standard Library includes
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <ostream>
#include <stdexcept>
#include <string>
#include <utility>
// CUDA includes
#include <cublas_v2.h>
#include <curand_kernel.h>
// CUTLASS includes
#include <tools/test/perf/gemm/cublas_dispatch.h>
#include <tools/test/perf/performance_result.h>
#include <tools/test/perf/testbench_options.h>
#include <tools/util/device_memory.h>
#include <tools/util/type_traits.h>
#include <tools/util/host_tensor.h>
#include <tools/util/tensor_view_io.h>
namespace perf {
////////////////////////////////////////////////////////////////////////////////////////////////////
/// Kernel to determine if two tensors are equal
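/// Comparison is bitwise: elements are reinterpreted through TypeTraits<Type>::unsigned_type,
/// so the experimental result must match the reference exactly. This is why operand
/// initialization below optionally quantizes random values (see Distribution's int_scale).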
template <typename Type>
__global__ void tensor_equals(int *result,
int dim_contiguous,
int dim_strided,
Type const *experimental,
int lde,
Type const *reference,
int ldr) {
typedef typename cutlass::TypeTraits<Type>::unsigned_type UnsignedType;
int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
int s_idx = blockIdx.y * blockDim.x;
experimental += s_idx * lde + c_idx;
reference += s_idx * ldr + c_idx;
for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
if (s_idx < dim_strided && c_idx < dim_contiguous) {
UnsignedType exp = *reinterpret_cast<UnsignedType const *>(experimental);
UnsignedType ref = *reinterpret_cast<UnsignedType const *>(reference);
if (exp != ref) {
*result = -1;
return;
}
experimental += lde;
reference += ldr;
}
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////
/// Kernel to initialize tensor to a uniform distribution
template <typename T>
__global__ void initialize_uniform(
Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) {
__shared__ curandState_t rng_state[1024];
uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x;
curand_init(seed, gtid, 0, &rng_state[threadIdx.x]);
int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
int s_idx = blockIdx.y * blockDim.x;
tensor += s_idx * ldm + c_idx;
for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
if (s_idx < dim_strided && c_idx < dim_contiguous) {
double range = dist.uniform.max - dist.uniform.min;
double rnd = curand_uniform(&rng_state[threadIdx.x]);
rnd = dist.uniform.min + range * rnd;
// Random values are cast to integer after scaling by a power of two to facilitate error
// testing
if (dist.int_scale >= 0) {
rnd = double(int(rnd * double(1 << dist.int_scale)));
*tensor = T(rnd / double(1 << dist.int_scale));
} else {
*tensor = T(rnd);
}
tensor += ldm;
}
}
}
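// Worked example of the int_scale quantization above: with dist.int_scale == 3, a draw of
// 1.37 becomes double(int(1.37 * 8)) == 10.0 and is stored as 10.0 / 8 == 1.25, which is
// exactly representable, so bitwise comparison against a reference remains meaningful.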
/// Kernel to initialize tensor to a Gaussian distribution
template <typename T>
__global__ void initialize_gaussian(
Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) {
__shared__ curandState_t rng_state[1024];
uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x;
curand_init(seed, gtid, 0, &rng_state[threadIdx.x]);
int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
int s_idx = blockIdx.y * blockDim.x;
tensor += s_idx * ldm + c_idx;
for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
if (s_idx < dim_strided && c_idx < dim_contiguous) {
// Random values are cast to integer after scaling by a power of two to facilitate error
// testing
double rnd = curand_normal(&rng_state[threadIdx.x]);
rnd = dist.gaussian.mean + dist.gaussian.stddev * rnd;
if (dist.int_scale >= 0) {
rnd = double(int(rnd * double(1 << dist.int_scale)));
*tensor = T(rnd / double(1 << dist.int_scale));
} else {
*tensor = T(rnd);
}
// advance to the next element along the strided dimension
tensor += ldm;
}
}
}
/// Kernel to initialize tensor elements as a linear function of their row and column indices
template <typename T>
__global__ void initialize_linear(
Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) {
int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
int s_idx = blockIdx.y * blockDim.x;
tensor += s_idx * ldm + c_idx;
for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
if (s_idx < dim_strided && c_idx < dim_contiguous) {
*tensor =
dist.linear.offset + dist.linear.delta_row * c_idx + dist.linear.delta_column * s_idx;
// advance to the next element along the strided dimension
tensor += ldm;
}
}
}
/// Kernel to initialize tensor to an identity matrix
template <typename T>
__global__ void initialize_identity(
Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) {
int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
int s_idx = blockIdx.y * blockDim.x;
tensor += s_idx * ldm + c_idx;
for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
if (s_idx < dim_strided && c_idx < dim_contiguous) {
*tensor = (c_idx == s_idx ? T(1) : T(0));
// advance to the next element along the strided dimension
tensor += ldm;
}
}
}
/// Dispatcher to appropriate initialization kernel
template <typename T>
inline void initialize(Distribution const &dist,
int64_t seed,
int dim_contiguous,
int dim_strided,
T *tensor,
int ldm) {
dim3 block(256, 1, 1);
dim3 grid((dim_contiguous + block.x - 1) / block.x, (dim_strided + block.x - 1) / block.x);
switch (dist.kind) {
case Distribution::Uniform:
initialize_uniform<<<grid, block>>>(dist, seed, dim_contiguous, dim_strided, tensor, ldm);
break;
case Distribution::Gaussian:
initialize_gaussian<<<grid, block>>>(dist, seed, dim_contiguous, dim_strided, tensor, ldm);
break;
case Distribution::Linear:
initialize_linear<<<grid, block>>>(dist, seed, dim_contiguous, dim_strided, tensor, ldm);
break;
case Distribution::Identity:
initialize_identity<<<grid, block>>>(dist, seed, dim_contiguous, dim_strided, tensor, ldm);
break;
default:
break;
}
}
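//
// Illustrative use of the dispatcher above (a sketch: Distribution is defined in
// testbench_options.h, so the direct member assignments shown here are an assumption):
//
//   Distribution dist;
//   dist.kind = Distribution::Uniform;
//   dist.uniform.min = -4;
//   dist.uniform.max = 4;
//   dist.int_scale = 3;  // quantize to multiples of 1/8 for exact comparison
//   // for a column-major matrix, the contiguous dimension is rows and ldm == rows
//   initialize(dist, /*seed=*/2080, rows, columns, device_ptr, rows);
//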
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Host-side implementation of performance testbed
template <typename AType, typename BType, typename CType, typename Accumulator, typename Scalar>
class GemmTestbed {
public:
/// Type used for device-side allocations
typedef typename cutlass::TypeTraits<AType>::device_type ADeviceType;
typedef typename cutlass::TypeTraits<BType>::device_type BDeviceType;
typedef typename cutlass::TypeTraits<CType>::device_type CDeviceType;
typedef typename cutlass::TypeTraits<Accumulator>::device_type AccumulatorDeviceType;
typedef typename cutlass::TypeTraits<Scalar>::device_type ScalarDeviceType;
/// Dispatch object to cuBLAS GEMM
typedef CublasGemmDispatch<AType, BType, CType, Accumulator, Scalar> CublasDispatch;
//
// Type definitions
//
/// Device allocation for operand A
typedef cutlass::device_memory::allocation<ADeviceType> TensorA;
/// Device allocation for operand B
typedef cutlass::device_memory::allocation<BDeviceType> TensorB;
/// Device allocation for operand C
typedef cutlass::device_memory::allocation<CDeviceType> TensorC;
private:
//
// Data members
//
/// Initial distributions for the operand tensors
InitialDistribution initial_distribution;
/// Status
cublasStatus_t status;
/// cuBLAS handle
cublasHandle_t handle;
/// GEMM problem
GemmProblem problem;
/// A matrix operand
TensorA A;
/// B matrix operand
TensorB B;
/// C matrix operand
TensorC C_initial;
/// Reference result
TensorC reference;
/// Experimental result
TensorC experimental;
private:
//
// Methods
//
/// Helper to resize a matrix with a given size and layout if needed
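/// Allocation is grow-only: storage is reallocated and reinitialized only when the required
/// capacity exceeds the current capacity, so smaller subsequent problems reuse the existing
/// initialized memory.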
template <typename T>
static void resize_device_allocation(
cutlass::device_memory::allocation<T> &tensor,
Distribution const &dist,
int64_t seed,
int rows,
int columns,
cutlass::MatrixLayout::Kind layout,
int ldm = 0) {
if (!ldm) {
ldm = (layout == cutlass::MatrixLayout::kColumnMajor ? rows : columns);
}
size_t capacity = ldm * (layout == cutlass::MatrixLayout::kColumnMajor ? columns : rows);
if (capacity > tensor.capacity) {
tensor.reset(cutlass::device_memory::allocate<T>(capacity), capacity);
int c_dim = (layout == cutlass::MatrixLayout::kColumnMajor ? rows : columns);
int s_dim = (layout == cutlass::MatrixLayout::kColumnMajor ? columns : rows);
initialize(dist, seed, c_dim, s_dim, tensor.get(), ldm);
}
}
/// Resizes each tensor
void resize_helper(GemmProblem const &problem) {
resize_device_allocation(
A,
initial_distribution.dist_A,
initial_distribution.seed,
problem.m,
problem.k,
problem.layout_A);
resize_device_allocation(
B,
initial_distribution.dist_B,
initial_distribution.seed + 17, // compute distinct value from initial seed
problem.k,
problem.n,
problem.layout_B);
resize_device_allocation(
C_initial,
initial_distribution.dist_C,
initial_distribution.seed + 101, // compute distinct value from initial seed
problem.m,
problem.n,
cutlass::MatrixLayout::kColumnMajor);
resize_device_allocation(
reference, Distribution(), 0, problem.m, problem.n, cutlass::MatrixLayout::kColumnMajor);
resize_device_allocation(
experimental, Distribution(), 0, problem.m, problem.n, cutlass::MatrixLayout::kColumnMajor);
}
/// Functor to print errors
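/// Differences are measured in units in the last place (ULPs) by comparing the elements'
/// bit patterns as integers; for float, 1.0f (0x3f800000) and the next representable
/// value (0x3f800001) differ by exactly 1 ulp.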
struct PrintErrors {
/// Equivalently sized integer type
typedef typename cutlass::TypeTraits<CType>::integer_type integer_t;
/// Output stream to write to
std::ostream& out;
/// Reference tensor view
cutlass::HostTensorView<CType> const& reference;
/// Computed tensor view
cutlass::HostTensorView<CType> const& experimental;
/// Errors greater than or equal to this amount result in printing
integer_t ulps_threshold;
///
PrintErrors(std::ostream& _out,
cutlass::HostTensorView<CType> const& _reference,
cutlass::HostTensorView<CType> const& _experimental,
integer_t _ulps_threshold = 1)
: out(_out),
reference(_reference),
experimental(_experimental),
ulps_threshold(_ulps_threshold) {}
/// Compares one element
void operator()(
CType const& element,
typename cutlass::HostTensorView<CType>::Coord_t coord) {
CType exp = experimental.at(coord);
CType ref = reference.at(coord);
// Reinterpret each element's bit pattern as an integer (punned through the low bytes
// of a 64-bit value) so the difference below measures ULPs
int64_t int_exp = 0;
int64_t int_ref = 0;
*reinterpret_cast<CType*>(&int_exp) = exp;
*reinterpret_cast<CType*>(&int_ref) = ref;
integer_t ulps = integer_t(int_exp - int_ref);
if (std::abs(ulps) >= ulps_threshold) {
// width in hexadecimal digits of value
int const width = sizeof(integer_t) * 2;
double relative = double(exp) - double(ref);
if (ref != CType(0)) {
relative /= double(ref);
}
out << "[" << coord << "] expected: " << ref << " (0x"
<< std::hex << std::setw(width) << std::setfill('0') << integer_t(int_ref) << std::dec
<< ")"
<< ", got: " << exp << " (0x" << std::hex
<< std::setw(width) << std::setfill('0') << integer_t(int_exp) << std::dec << ")"
<< " relative error: " << relative << ", ulps: " << ulps << "\n";
}
}
};
public:
/// Resizes tensors to accommodate the given problem
void resize(GemmProblem const &_problem) {
problem = _problem;
try {
resize_helper(problem);
} catch (...) {
// If out of memory, clear each allocation then allocate again
A.reset();
B.reset();
C_initial.reset();
reference.reset();
experimental.reset();
resize_helper(problem);
}
}
/// Constructs a basic workspace
GemmTestbed(InitialDistribution const &_dist = InitialDistribution())
: initial_distribution(_dist) {
status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS) {
throw cutlass::cuda_exception("Failed to create CUBLAS handle");
}
}
/// Constructs a workspace for verifying GEMM, assumes
/// dense packing.
GemmTestbed(GemmProblem const &_problem,
cublasGemmAlgo_t algorithm_ = CUBLAS_GEMM_DEFAULT,
InitialDistribution const &_dist = InitialDistribution())
: initial_distribution(_dist), problem(_problem) {
status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS) {
throw cutlass::cuda_exception("Failed to create CUBLAS handle");
}
resize(problem);
}
~GemmTestbed() { status = cublasDestroy(handle); }
/// Returns true if the last CUBLAS call returned successfully
bool good() const { return status == CUBLAS_STATUS_SUCCESS; }
/// Rows of GEMM problem
int M() const { return problem.m; }
/// Columns of GEMM problem
int N() const { return problem.n; }
/// Inner dimension of GEMM problem
int K() const { return problem.k; }
/// Returns a pointer to the A operand
ADeviceType *ptr_A() const { return A.get(); }
/// Leading dimension of A
int lda() const { return problem.lda(); }
/// Returns a pointer to the B operand
BDeviceType *ptr_B() const { return B.get(); }
/// Leading dimension of B
int ldb() const { return problem.ldb(); }
/// Returns a pointer to the initial state of the result tensor in device memory
CDeviceType *ptr_C_initial() const { return C_initial.get(); }
/// Leading dimension of C
int ldc() const { return problem.ldc(); }
/// Returns a pointer to the result tensor in device memory
CDeviceType *ptr_experimental() const { return experimental.get(); }
/// Returns a pointer to the result tensor in device memory
CDeviceType *ptr_reference() const { return reference.get(); }
/// Returns the number of flops implied by the computation (1 multiply-accumulate = 2 flops)
uint64_t flops() const {
return uint64_t(problem.m) * uint64_t(problem.n) * uint64_t(problem.k) * 2ULL;
}
/// Computes the speed of the computation in GFLOPs/s
double GFLOPs_per_sec(double runtime_ms) const { return double(flops()) / runtime_ms / 1.0e6; }
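// Example: for m == n == k == 1024, flops() == 2 * 1024^3 == 2147483648, so a runtime of
// 1.0 ms reports 2147483648 / 1.0 / 1.0e6 == ~2147.5 GFLOP/s.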
/// Matrix layout of A
cutlass::MatrixLayout::Kind layout_a() const { return problem.layout_A; }
/// Matrix layout of B
cutlass::MatrixLayout::Kind layout_b() const { return problem.layout_B; }
/// Returns alpha scalar
Scalar alpha() const { return Scalar(problem.alpha); }
/// Returns beta scalar
Scalar beta() const { return Scalar(problem.beta); }
/// Initializes C matrix by copying from C_initial
void prepare_gemm(CDeviceType *target) {
size_t count = ldc() * problem.n;
cutlass::device_memory::copy_device_to_device(target, ptr_C_initial(), count);
}
/// Initializes the output matrix used for the cuBLAS reference
void prepare_cublas() { prepare_gemm(ptr_reference()); }
/// Initializes the output matrix used for the experimental kernel
void prepare_experimental() { prepare_gemm(ptr_experimental()); }
/// Launches the cuBLAS GEMM into the reference tensor; does not initialize the output matrix
cublasStatus_t launch_cublas(cublasGemmAlgo_t algo) {
CublasDispatch dispatch;
Scalar alpha(Scalar(problem.alpha));
Scalar beta(Scalar(problem.beta));
status = dispatch(handle,
problem.layout_A,
problem.layout_B,
problem.m,
problem.n,
problem.k,
alpha,
ptr_A(),
lda(),
ptr_B(),
ldb(),
beta,
ptr_reference(),
ldc(),
algo);
return status;
}
/// Verifies the 'test' tensor against 'ref'
bool verify(TensorC const &test, TensorC const &ref) {
cutlass::device_memory::allocation<int> flag_device(1);
int flag = 0;
cutlass::device_memory::copy_to_device(flag_device.get(), &flag, 1);
dim3 block(256, 1, 1);
dim3 grid((problem.m + block.x - 1) / block.x, (problem.n + block.x - 1) / block.x);
tensor_equals<CDeviceType><<<grid, block>>>(flag_device.get(),
problem.m,
problem.n,
test.get(),
problem.m,
ref.get(),
problem.m);
cutlass::device_memory::copy_to_host(&flag, flag_device.get(), 1);
return flag == 0;
}
/// Computes the reference output
void compute_reference(cublasGemmAlgo_t algorithm) {
prepare_cublas();
launch_cublas(algorithm);
}
/// Helper to verify with reference
bool verify_with_reference() { return verify(experimental, reference); }
/// Writes the problem to an ostream in human-readable form
void write_problem(std::ostream &results_output, std::ostream &errors_output) {
cutlass::HostTensor<AType, false> host_A;
cutlass::HostTensor<BType, false> host_B;
cutlass::HostTensor<CType, false> host_C;
cutlass::HostTensor<CType, false> host_D;
cutlass::HostTensor<CType, false> host_Ref;
host_A.resize_matrix(M(), K(), layout_a());
host_B.resize_matrix(K(), N(), layout_b());
host_C.resize_matrix(M(), N(), cutlass::MatrixLayout::kColumnMajor);
host_D.resize_matrix(M(), N(), cutlass::MatrixLayout::kColumnMajor);
host_Ref.resize_matrix(M(), N(), cutlass::MatrixLayout::kColumnMajor);
// copy from device allocations
host_A.copy_to_host(ptr_A());
host_B.copy_to_host(ptr_B());
host_C.copy_to_host(ptr_C_initial());
host_D.copy_to_host(ptr_experimental());
host_Ref.copy_to_host(ptr_reference());
// write out human readable
results_output << "A =\n" << host_A << "\n"
<< "B =\n" << host_B << "\n"
<< "C = \n" << host_C << "\n"
<< "Ref =\n" << host_Ref << "\n"
<< "Experimental =\n" << host_D << "\n";
// write out list of errors
PrintErrors printer(errors_output, host_Ref, host_D);
host_D.visit(printer);
}
};
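////////////////////////////////////////////////////////////////////////////////////////////////////
// Illustrative end-to-end use of GemmTestbed (a sketch: GemmProblem construction is defined in
// testbench_options.h and is assumed here; the member functions are those declared above):
//
//   GemmTestbed<float, float, float, float, float> testbed;
//   testbed.resize(problem);                         // allocate and initialize operands
//   testbed.compute_reference(CUBLAS_GEMM_DEFAULT);  // cuBLAS reference into ptr_reference()
//   testbed.prepare_experimental();                  // copy C_initial into ptr_experimental()
//   // ... launch the kernel under test, writing to testbed.ptr_experimental() ...
//   bool passed = testbed.verify_with_reference();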
} // namespace perf