/***************************************************************************************************
 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
#pragma once

// Standard Library includes
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <iomanip>
#include <iostream>

// CUDA includes
#include <cublas_v2.h>
#include <curand_kernel.h>

// Cutlass includes (paths reconstructed from the symbols used in this file; adjust to the local
// CUTLASS source layout if they differ)
#include <cutlass/matrix_traits.h>
#include <tools/util/device_memory.h>
#include <tools/util/exceptions.h>
#include <tools/util/host_tensor.h>
#include <tools/util/tensor_view_io.h>
#include <tools/util/type_traits.h>
#include <tools/test/perf/gemm/gemm_problem.h>

namespace perf {

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Kernel to determine if two tensors are equal
template <typename Type>
__global__ void tensor_equals(int *result,
                              int dim_contiguous,
                              int dim_strided,
                              Type const *experimental,
                              int lde,
                              Type const *reference,
                              int ldr) {
  typedef typename cutlass::TypeTraits<Type>::unsigned_type UnsignedType;

  int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
  int s_idx = blockIdx.y * blockDim.x;

  experimental += s_idx * lde + c_idx;
  reference += s_idx * ldr + c_idx;

  for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
    if (s_idx < dim_strided && c_idx < dim_contiguous) {
      // Compare raw bit patterns so the check is exact even for floating-point element types
      UnsignedType exp = *reinterpret_cast<UnsignedType const *>(experimental);
      UnsignedType ref = *reinterpret_cast<UnsignedType const *>(reference);

      if (exp != ref) {
        *result = -1;
        return;
      }

      experimental += lde;
      reference += ldr;
    }
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Kernel to initialize tensor to uniform distribution
template <typename T>
__global__ void initialize_uniform(Distribution dist,
                                   int64_t seed,
                                   int dim_contiguous,
                                   int dim_strided,
                                   T *tensor,
                                   int ldm) {
  __shared__ curandState_t rng_state[1024];

  uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x;

  curand_init(seed, gtid, 0, &rng_state[threadIdx.x]);

  int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
  int s_idx = blockIdx.y * blockDim.x;

  tensor += s_idx * ldm + c_idx;

  for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
    if (s_idx < dim_strided && c_idx < dim_contiguous) {
      double range = dist.uniform.max - dist.uniform.min;
      double rnd = curand_uniform(&rng_state[threadIdx.x]);

      rnd = dist.uniform.min + range * rnd;

      // Random values are cast to integer after scaling by a power of two to facilitate error
      // testing
      if (dist.int_scale >= 0) {
        rnd = double(int(rnd * double(1 << dist.int_scale)));
        *tensor = T(rnd / double(1 << dist.int_scale));
      } else {
        *tensor = T(rnd);
      }

      tensor += ldm;
    }
  }
}
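////////////////////////////////////////////////////////////////////////////////////////////////////

/// Illustrative host-side sketch (hypothetical helper, not used by the kernels in this file): it
/// repeats the quantization applied above whenever Distribution::int_scale is non-negative. A
/// random value is scaled by 2^int_scale, truncated to an integer, and scaled back down, so
/// initialized tensors hold values that are exactly representable and simple to compare.
inline double quantize_random_value(double rnd, int int_scale) {
  if (int_scale < 0) {
    return rnd;  // no quantization requested
  }

  double scale = double(1 << int_scale);

  // Example: int_scale = 4 maps 0.3791 to int(6.0656) / 16 = 0.375
  return double(int(rnd * scale)) / scale;
}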
////////////////////////////////////////////////////////////////////////////////////////////////////

/// Kernel to initialize tensor to a Gaussian distribution
template <typename T>
__global__ void initialize_gaussian(Distribution dist,
                                    int64_t seed,
                                    int dim_contiguous,
                                    int dim_strided,
                                    T *tensor,
                                    int ldm) {
  __shared__ curandState_t rng_state[1024];

  uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x;

  curand_init(seed, gtid, 0, &rng_state[threadIdx.x]);

  int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
  int s_idx = blockIdx.y * blockDim.x;

  tensor += s_idx * ldm + c_idx;

  for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
    if (s_idx < dim_strided && c_idx < dim_contiguous) {
      // Random values are cast to integer after scaling by a power of two to facilitate error
      // testing
      double rnd = curand_normal(&rng_state[threadIdx.x]);

      rnd = dist.gaussian.mean + dist.gaussian.stddev * rnd;

      if (dist.int_scale >= 0) {
        rnd = double(int(rnd * double(1 << dist.int_scale)));
        *tensor = T(rnd / double(1 << dist.int_scale));
      } else {
        *tensor = T(rnd);
      }

      tensor += ldm;  // advance to the next strided element
    }
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Kernel to initialize tensor to a linear combination of its row and column indices
template <typename T>
__global__ void initialize_linear(Distribution dist,
                                  int64_t seed,
                                  int dim_contiguous,
                                  int dim_strided,
                                  T *tensor,
                                  int ldm) {
  __shared__ curandState_t rng_state[1024];

  uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x;

  curand_init(seed, gtid, 0, &rng_state[threadIdx.x]);

  int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
  int s_idx = blockIdx.y * blockDim.x;

  tensor += s_idx * ldm + c_idx;

  for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
    if (s_idx < dim_strided && c_idx < dim_contiguous) {
      *tensor =
          dist.linear.offset + dist.linear.delta_row * c_idx + dist.linear.delta_column * s_idx;

      tensor += ldm;  // advance to the next strided element
    }
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Kernel to initialize tensor to an identity matrix
template <typename T>
__global__ void initialize_identity(Distribution dist,
                                    int64_t seed,
                                    int dim_contiguous,
                                    int dim_strided,
                                    T *tensor,
                                    int ldm) {
  __shared__ curandState_t rng_state[1024];

  uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x;

  curand_init(seed, gtid, 0, &rng_state[threadIdx.x]);

  int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
  int s_idx = blockIdx.y * blockDim.x;

  tensor += s_idx * ldm + c_idx;

  for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
    if (s_idx < dim_strided && c_idx < dim_contiguous) {
      *tensor = (c_idx == s_idx ? T(1) : T(0));

      tensor += ldm;  // advance to the next strided element
    }
  }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Dispatcher to appropriate initialization kernel
template <typename T>
inline void initialize(Distribution const &dist,
                       int64_t seed,
                       int dim_contiguous,
                       int dim_strided,
                       T *tensor,
                       int ldm) {
  dim3 block(256, 1, 1);
  dim3 grid((dim_contiguous + block.x - 1) / block.x, (dim_strided + block.x - 1) / block.x);

  switch (dist.kind) {
    case Distribution::Uniform:
      initialize_uniform<<< grid, block >>>(dist, seed, dim_contiguous, dim_strided, tensor, ldm);
      break;
    case Distribution::Gaussian:
      initialize_gaussian<<< grid, block >>>(dist, seed, dim_contiguous, dim_strided, tensor, ldm);
      break;
    case Distribution::Linear:
      initialize_linear<<< grid, block >>>(dist, seed, dim_contiguous, dim_strided, tensor, ldm);
      break;
    case Distribution::Identity:
      initialize_identity<<< grid, block >>>(dist, seed, dim_contiguous, dim_strided, tensor, ldm);
      break;
    default:
      break;
  }
}
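////////////////////////////////////////////////////////////////////////////////////////////////////

/// Illustrative sketch (hypothetical helper, assuming Distribution's fields may be assigned
/// directly, matching the way the kernels above read them): fills a column-major rows x columns
/// matrix in device memory with uniform random values in [-4, 4], quantized with int_scale = 3.
/// For a column-major matrix the contiguous dimension is rows and the strided dimension is
/// columns -- the same convention used by GemmTestbed::resize_device_allocation() below.
inline void example_initialize_uniform(float *device_ptr,
                                       int rows,
                                       int columns,
                                       int ldm,
                                       int64_t seed) {
  Distribution dist;
  dist.kind = Distribution::Uniform;
  dist.uniform.min = -4;
  dist.uniform.max = 4;
  dist.int_scale = 3;

  initialize(dist, seed, rows, columns, device_ptr, ldm);
}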
///////////////////////////////////////////////////////////////////////////////////////////////////

/// Host-side implementation of performance testbed
template <typename AType, typename BType, typename CType, typename Accumulator, typename Scalar>
class GemmTestbed {
 public:
  //
  // Type definitions
  //

  /// Type used for device-side allocations
  typedef typename cutlass::TypeTraits<AType>::device_type ADeviceType;
  typedef typename cutlass::TypeTraits<BType>::device_type BDeviceType;
  typedef typename cutlass::TypeTraits<CType>::device_type CDeviceType;
  typedef typename cutlass::TypeTraits<Accumulator>::device_type AccumulatorDeviceType;
  typedef typename cutlass::TypeTraits<Scalar>::device_type ScalarDeviceType;

  /// Dispatch object to cuBLAS GEMM (template arguments assumed to mirror GemmTestbed's)
  typedef CublasGemmDispatch<AType, BType, CType, Accumulator, Scalar> CublasDispatch;

  /// Device allocation for operand A
  typedef cutlass::device_memory::allocation<ADeviceType> TensorA;

  /// Device allocation for operand B
  typedef cutlass::device_memory::allocation<BDeviceType> TensorB;

  /// Device allocation for operand C
  typedef cutlass::device_memory::allocation<CDeviceType> TensorC;

 private:
  //
  // Data members
  //

  /// Initial distribution of each operand
  InitialDistribution initial_distribution;

  /// Status
  cublasStatus_t status;

  /// cuBLAS handle
  cublasHandle_t handle;

  /// GEMM problem
  GemmProblem problem;

  /// A matrix operand
  TensorA A;

  /// B matrix operand
  TensorB B;

  /// C matrix operand
  TensorC C_initial;

  /// Reference result
  TensorC reference;

  /// Experimental result
  TensorC experimental;

 private:
  //
  // Methods
  //

  /// Helper to resize a matrix with a given size and layout if needed
  template <typename T>
  static void resize_device_allocation(cutlass::device_memory::allocation<T> &tensor,
                                       Distribution const &dist,
                                       int64_t seed,
                                       int rows,
                                       int columns,
                                       cutlass::MatrixLayout::Kind layout,
                                       int ldm = 0) {
    if (!ldm) {
      ldm = (layout == cutlass::MatrixLayout::kColumnMajor ? rows : columns);
    }

    size_t capacity = ldm * (layout == cutlass::MatrixLayout::kColumnMajor ? columns : rows);

    if (capacity > tensor.capacity) {
      tensor.reset(cutlass::device_memory::allocate<T>(capacity), capacity);

      int c_dim = (layout == cutlass::MatrixLayout::kColumnMajor ? rows : columns);
      int s_dim = (layout == cutlass::MatrixLayout::kColumnMajor ? columns : rows);

      initialize(dist, seed, c_dim, s_dim, tensor.get(), ldm);
    }
  }
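  // Worked example of the sizing rule above: a 128 x 64 column-major matrix with ldm defaulted
  // gets ldm = 128 (the contiguous dimension is rows), so capacity = 128 * 64 = 8192 elements and
  // the initialization kernels see c_dim = 128, s_dim = 64.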
  /// Resizes each tensor
  void resize_helper(GemmProblem const &problem) {
    resize_device_allocation(A,
                             initial_distribution.dist_A,
                             initial_distribution.seed,
                             problem.m,
                             problem.k,
                             problem.layout_A);

    resize_device_allocation(B,
                             initial_distribution.dist_B,
                             initial_distribution.seed + 17,  // compute distinct value from initial seed
                             problem.k,
                             problem.n,
                             problem.layout_B);

    resize_device_allocation(C_initial,
                             initial_distribution.dist_C,
                             initial_distribution.seed + 101,  // compute distinct value from initial seed
                             problem.m,
                             problem.n,
                             cutlass::MatrixLayout::kColumnMajor);

    resize_device_allocation(reference,
                             Distribution(),
                             0,
                             problem.m,
                             problem.n,
                             cutlass::MatrixLayout::kColumnMajor);

    resize_device_allocation(experimental,
                             Distribution(),
                             0,
                             problem.m,
                             problem.n,
                             cutlass::MatrixLayout::kColumnMajor);
  }

  /// Functor to print errors
  struct PrintErrors {
    /// Equivalently sized integer type
    typedef typename cutlass::TypeTraits<CType>::integer_type integer_t;

    /// Output stream to write to
    std::ostream &out;

    /// Reference tensor view
    cutlass::HostTensorView<CType> const &reference;

    /// Computed tensor view
    cutlass::HostTensorView<CType> const &experimental;

    /// Errors greater than or equal to this amount result in printing
    integer_t ulps_threshold;

    /// Constructor
    PrintErrors(std::ostream &_out,
                cutlass::HostTensorView<CType> const &_reference,
                cutlass::HostTensorView<CType> const &_experimental,
                integer_t _ulps_threshold = 1)
        : out(_out),
          reference(_reference),
          experimental(_experimental),
          ulps_threshold(_ulps_threshold) {}

    /// Compares one element
    void operator()(CType const &element, typename cutlass::HostTensorView<CType>::Coord_t coord) {
      CType exp = experimental.at(coord);
      CType ref = reference.at(coord);

      int64_t int_exp = 0;
      int64_t int_ref = 0;

      // Copy the raw bit patterns into 64-bit integers so their difference measures ULPs
      *reinterpret_cast<CType *>(&int_exp) = exp;
      *reinterpret_cast<CType *>(&int_ref) = ref;

      integer_t ulps = integer_t(int_exp - int_ref);

      if (std::abs(ulps) >= ulps_threshold) {
        // width in hexadecimal digits of value
        int const width = sizeof(integer_t) * 2;

        double relative = double(exp) - double(ref);
        if (ref != CType(0)) {
          relative /= double(ref);
        }

        out << "[" << coord << "] expected: " << ref << " (0x" << std::hex << std::setw(width)
            << std::setfill('0') << integer_t(int_ref) << std::dec << ")"
            << ", got: " << exp << " (0x" << std::hex << std::setw(width) << std::setfill('0')
            << integer_t(int_exp) << std::dec << ")"
            << " relative error: " << relative << ", ulps: " << ulps << "\n";
      }
    }
  };
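  // Worked example of the ULP distance computed above for CType = float: 1.0f has bit pattern
  // 0x3f800000 and the next representable float, about 1.00000012f, has bit pattern 0x3f800001,
  // so the two differ by exactly 1 ulp and would be reported under the default threshold.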
 public:
  /// Resizes tensors to accommodate the given problem
  void resize(GemmProblem const &_problem) {
    problem = _problem;

    try {
      resize_helper(problem);
    } catch (...) {
      // If out of memory, clear each allocation then allocate again
      A.reset();
      B.reset();
      C_initial.reset();
      reference.reset();
      experimental.reset();

      resize_helper(problem);
    }
  }

  /// Constructs a basic workspace
  GemmTestbed(InitialDistribution const &_dist = InitialDistribution())
      : initial_distribution(_dist) {
    status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS) {
      throw cutlass::cuda_exception("Failed to create CUBLAS handle");
    }
  }

  /// Constructs a workspace for verifying GEMM, assumes dense packing
  GemmTestbed(GemmProblem const &_problem,
              cublasGemmAlgo_t algorithm_ = CUBLAS_GEMM_DEFAULT,
              InitialDistribution const &_dist = InitialDistribution())
      : initial_distribution(_dist), problem(_problem) {
    status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS) {
      throw cutlass::cuda_exception("Failed to create CUBLAS handle");
    }

    resize(problem);
  }

  /// Destroys the cuBLAS handle
  ~GemmTestbed() { status = cublasDestroy(handle); }

  /// Returns true if the last CUBLAS call returned successfully
  bool good() const { return status == CUBLAS_STATUS_SUCCESS; }

  /// Rows of GEMM problem
  int M() const { return problem.m; }

  /// Columns of GEMM problem
  int N() const { return problem.n; }

  /// Inner dimension of GEMM problem
  int K() const { return problem.k; }

  /// Returns a pointer to the A operand
  ADeviceType *ptr_A() const { return A.get(); }

  /// Leading dimension of A
  int lda() const { return problem.lda(); }

  /// Returns a pointer to the B operand
  BDeviceType *ptr_B() const { return B.get(); }

  /// Leading dimension of B
  int ldb() const { return problem.ldb(); }

  /// Returns a pointer to the initial state of the result tensor in device memory
  CDeviceType *ptr_C_initial() const { return C_initial.get(); }

  /// Leading dimension of C
  int ldc() const { return problem.ldc(); }

  /// Returns a pointer to the experimental result tensor in device memory
  CDeviceType *ptr_experimental() const { return experimental.get(); }

  /// Returns a pointer to the reference result tensor in device memory
  CDeviceType *ptr_reference() const { return reference.get(); }

  /// Returns the number of flops implied by the computation (1 multiply-accumulate = 2 flops)
  uint64_t flops() const {
    return uint64_t(problem.m) * uint64_t(problem.n) * uint64_t(problem.k) * 2ULL;
  }

  /// Computes the speed of the computation in GFLOPs/s
  double GFLOPs_per_sec(double runtime_ms) const { return double(flops()) / runtime_ms / 1.0e6; }
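  // Worked example of the arithmetic above: for m = n = k = 1024, flops() = 2 * 1024^3
  // = 2147483648, so a 2.0 ms run reports 2147483648 / 2.0 / 1.0e6 ~= 1073.7 GFLOPs/s.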
  /// Matrix layout of A
  cutlass::MatrixLayout::Kind layout_a() const { return problem.layout_A; }

  /// Matrix layout of B
  cutlass::MatrixLayout::Kind layout_b() const { return problem.layout_B; }

  /// Returns alpha scalar
  Scalar alpha() const { return Scalar(problem.alpha); }

  /// Returns beta scalar
  Scalar beta() const { return Scalar(problem.beta); }

  /// Initializes an output matrix by copying from C_initial
  void prepare_gemm(CDeviceType *target) {
    size_t count = ldc() * problem.n;
    cutlass::device_memory::copy_device_to_device(target, ptr_C_initial(), count);
  }

  /// Initializes the output matrix of cuBLAS
  void prepare_cublas() { prepare_gemm(ptr_reference()); }

  /// Initializes the output matrix of the experimental kernel
  void prepare_experimental() { prepare_gemm(ptr_experimental()); }

  /// Launches the cuBLAS GEMM - does not initialize output matrix
  cublasStatus_t launch_cublas(cublasGemmAlgo_t algo) {
    CublasDispatch dispatch;

    Scalar alpha(Scalar(problem.alpha));
    Scalar beta(Scalar(problem.beta));

    status = dispatch(handle,
                      problem.layout_A,
                      problem.layout_B,
                      problem.m,
                      problem.n,
                      problem.k,
                      alpha,
                      ptr_A(),
                      lda(),
                      ptr_B(),
                      ldb(),
                      beta,
                      ptr_reference(),
                      ldc(),
                      algo);

    return status;
  }

  /// Verifies the 'test' tensor with 'ref'
  bool verify(TensorC const &test, TensorC const &ref) {
    cutlass::device_memory::allocation<int> flag_device(1);

    int flag = 0;
    cutlass::device_memory::copy_to_device(flag_device.get(), &flag, 1);

    dim3 block(256, 1, 1);
    dim3 grid((problem.m + block.x - 1) / block.x, (problem.n + block.x - 1) / block.x);

    tensor_equals<<< grid, block >>>(flag_device.get(),
                                     problem.m,
                                     problem.n,
                                     experimental.get(),
                                     problem.m,
                                     reference.get(),
                                     problem.m);

    cutlass::device_memory::copy_to_host(&flag, flag_device.get(), 1);

    return flag == 0;
  }

  /// Computes the reference output
  void compute_reference(cublasGemmAlgo_t algorithm) {
    prepare_cublas();
    launch_cublas(algorithm);
  }

  /// Helper to verify with reference
  bool verify_with_reference() { return verify(experimental, reference); }

  /// Writes the problem to an ostream in human-readable form
  void write_problem(std::ostream &results_output, std::ostream &errors_output) {
    cutlass::HostTensor<AType> host_A;
    cutlass::HostTensor<BType> host_B;
    cutlass::HostTensor<CType> host_C;
    cutlass::HostTensor<CType> host_D;
    cutlass::HostTensor<CType> host_Ref;

    host_A.resize_matrix(M(), K(), layout_a());
    host_B.resize_matrix(K(), N(), layout_b());
    host_C.resize_matrix(M(), N(), cutlass::MatrixLayout::kColumnMajor);
    host_D.resize_matrix(M(), N(), cutlass::MatrixLayout::kColumnMajor);
    host_Ref.resize_matrix(M(), N(), cutlass::MatrixLayout::kColumnMajor);

    // copy from device allocations
    host_A.copy_to_host(ptr_A());
    host_B.copy_to_host(ptr_B());
    host_C.copy_to_host(ptr_C_initial());
    host_D.copy_to_host(ptr_experimental());
    host_Ref.copy_to_host(ptr_reference());

    // write out human readable results
    results_output << "A =\n" << host_A << "\n"
                   << "B =\n" << host_B << "\n"
                   << "C =\n" << host_C << "\n"
                   << "Ref =\n" << host_Ref << "\n"
                   << "Experimental =\n" << host_D << "\n";

    // write out list of errors
    PrintErrors printer(errors_output, host_Ref, host_D);

    host_D.visit(printer);
  }
};

}  // namespace perf
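////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Usage sketch (illustrative only -- the GemmProblem construction below is hypothetical; consult
// gemm_problem.h for the actual interface). The intended flow of the testbed is:
//
//   perf::GemmProblem problem = /* describe m, n, k, layouts, alpha, beta */;
//   perf::GemmTestbed<float, float, float, float, float> testbed(problem);
//
//   testbed.prepare_experimental();                  // copy C_initial into the experimental output
//   /* launch the GEMM under test, writing to testbed.ptr_experimental() with testbed.ldc() */
//
//   testbed.compute_reference(CUBLAS_GEMM_DEFAULT);  // cuBLAS result into the reference tensor
//
//   bool passed = testbed.verify_with_reference();   // bitwise comparison on the device
//   double gflops = testbed.GFLOPs_per_sec(measured_runtime_ms);
//
////////////////////////////////////////////////////////////////////////////////////////////////////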