/***************************************************************************************************
 * Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*!
  \file
  \brief Test environment for GEMM
*/

#pragma once

#include <cublas_v2.h>

#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <typeinfo>

// CUTLASS and test-utility headers (paths assumed from the CUTLASS 1.x source tree, based on the
// types this file uses: HostTensor, TypeTraits, MatrixLayout, platform::is_same, cuda_exception).
#include <cutlass/matrix_traits.h>
#include <cutlass/util/platform.h>

#include <tools/util/exceptions.h>
#include <tools/util/host_tensor.h>
#include <tools/util/type_traits.h>

namespace cutlass {

////////////////////////////////////////////////////////////////////////////////////////////////////

// Forward declaration of WmmaMatrix. The template parameter list here is an assumption chosen so
// that the partial specialization of GemmTestbedTraits below is well-formed; the authoritative
// declaration lives in the CUTLASS WMMA headers.
template <GemmOperand::Kind kOperand_,
          MatrixLayout::Kind kLayout_,
          typename Scalar_,
          typename WmmaShape_>
struct WmmaMatrix;

}  // namespace cutlass

namespace test {

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Type traits used by the testbed; defaults to cutlass::TypeTraits
template <typename T>
struct GemmTestbedTraits : public cutlass::TypeTraits<T> {};

/// Partial specialization for WMMA matrix fragments: treat them as their scalar element type
template <cutlass::GemmOperand::Kind kOperand_,
          cutlass::MatrixLayout::Kind kLayout_,
          typename Scalar_,
          typename WmmaShape_>
struct GemmTestbedTraits<cutlass::WmmaMatrix<kOperand_, kLayout_, Scalar_, WmmaShape_> > {
  static cudaDataType_t const cublas_type = cutlass::TypeTraits<Scalar_>::cublas_type;

  typedef Scalar_ host_type;
  typedef Scalar_ device_type;

  static inline double remove_negative_zero(double x) { return x == -0.0 ? 0.0 : x; }
  static inline double to_print(double x) { return x; }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Testbed for verifying a GEMM against host-side and cuBLAS references
template <typename AType,
          typename BType,
          typename CType,
          typename Accumulator,
          typename Scalar>
struct GemmTestbed {
  //
  // Type definitions
  //

  /// Host tensor for operand A
  typedef cutlass::HostTensor<AType> HostTensorA;

  /// Host tensor for operand B
  typedef cutlass::HostTensor<BType> HostTensorB;

  /// Host tensor for operand C
  typedef cutlass::HostTensor<CType> HostTensorC;

  /// Functor to print errors
  struct PrintErrors {
    /// Equivalently sized integer type
    typedef typename GemmTestbedTraits<CType>::integer_type integer_t;

    /// Output stream to write to
    std::ostream& out;

    /// Reference tensor view
    cutlass::HostTensorView<CType> const& reference;

    /// Computed tensor view
    cutlass::HostTensorView<CType> const& experimental;

    /// Errors greater than or equal to this amount result in printing
    integer_t ulps_threshold;

    /// Constructs the error printer
    PrintErrors(std::ostream& _out,
                cutlass::HostTensorView<CType> const& _reference,
                cutlass::HostTensorView<CType> const& _experimental,
                integer_t _ulps_threshold = 1)
        : out(_out),
          reference(_reference),
          experimental(_experimental),
          ulps_threshold(_ulps_threshold) {}

    /// Compares one element
    void operator()(CType const& element, typename HostTensorC::Coord_t coord) {
      CType exp = experimental.at(coord);
      CType ref = reference.at(coord);

      // Copy the raw bit patterns into integers so the difference can be expressed in ULPs
      // (units in the last place).
      int64_t int_exp = 0;
      int64_t int_ref = 0;

      *reinterpret_cast<CType*>(&int_exp) = exp;
      *reinterpret_cast<CType*>(&int_ref) = ref;

      integer_t ulps = integer_t(int_exp - int_ref);

      if (std::abs(ulps) >= ulps_threshold) {
        // width in hexadecimal digits of value
        int const width = sizeof(integer_t) * 2;

        double relative = double(exp) - double(ref);
        if (ref != CType(0)) {
          relative /= double(ref);
        }

        out << "[" << coord << "] expected: " << GemmTestbedTraits<CType>::to_print(ref) << " (0x"
            << std::hex << std::setw(width) << std::setfill('0') << integer_t(int_ref) << std::dec
            << ")"
            << ", got: " << GemmTestbedTraits<CType>::to_print(exp) << " (0x" << std::hex
            << std::setw(width) << std::setfill('0') << integer_t(int_exp) << std::dec << ")"
            << " relative error: " << relative << ", ulps: " << ulps << "\n";
      }
    }
  };

  /// Generates random elements
  template <typename T>
  struct RandomGenerator {
    RandomGenerator(int seed = -1, bool only_ones_ = false) : only_ones(only_ones_) { srand(seed); }

    T operator()() {
      if (only_ones) {
        return T(1);
      } else {
        int val = (rand() % 16) - 8;
        return T(val);
      }
    }

    bool only_ones;
  };

  //
  // Data members
  //

  /// Status
  cublasStatus_t status;

  /// cuBLAS handle
  cublasHandle_t handle;

  /// cuBLAS GEMM algorithm selector
  cublasGemmAlgo_t algorithm;

  /// A matrix operand
  HostTensorA A;

  /// Layout of A matrix
  cublasOperation_t layout_A;

  /// B matrix operand
  HostTensorB B;

  /// Layout of B matrix
  cublasOperation_t layout_B;

  /// C matrix operand
  HostTensorC C_initial;

  /// Reference result computed on the host
  cutlass::HostTensor<CType> ref_host;

  /// Reference result computed with cuBLAS
  HostTensorC ref_cublas;

  /// Computed result
  HostTensorC computed;

  /// Linear scaling factor
  Scalar alpha;

  /// Linear scaling factor
  Scalar beta;

  //
  // Static helpers
  //

  /// Helper to resize a matrix with a given size and layout
  template <typename T>
  static void resize(cutlass::HostTensor<T>& tensor,
                     int rows,
                     int columns,
                     cublasOperation_t layout,
                     int ldm = 0) {
    if (!ldm) {
      ldm = (layout == CUBLAS_OP_N ? rows : columns);
    }

    typedef cutlass::Coord<cutlass::HostTensor<T>::Rank> Coord_t;

    size_t matrix_stride = layout == CUBLAS_OP_N ? columns * ldm : rows * ldm;

    // TODO: Remove that (int) cast.
    Coord_t stride = cutlass::make_Coord((int)matrix_stride,
                                         layout == CUBLAS_OP_N ? 1 : ldm,
                                         layout == CUBLAS_OP_N ? ldm : 1,
                                         1);
    Coord_t size = cutlass::make_Coord(1, rows, columns, 1);

    tensor.reset(stride, size);
  }

  //
  // Methods
  //

  /// Constructs a workspace for verifying GEMM.
  GemmTestbed(int M_,
              int N_,
              int K_,
              int lda,
              int ldb,
              int ldc,
              cublasOperation_t layout_a,
              cublasOperation_t layout_b,
              Scalar alpha_ = Scalar(1),
              Scalar beta_ = Scalar(0),
              cublasGemmAlgo_t algorithm_ = CUBLAS_GEMM_DEFAULT,
              cublasOperation_t layout_c = CUBLAS_OP_N)
      : layout_A(layout_a), layout_B(layout_b), alpha(alpha_), beta(beta_), algorithm(algorithm_) {
    status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS) {
      throw cutlass::cuda_exception("Failed to create CUBLAS handle");
    }

    resize(A, M_, K_, layout_a, lda);
    resize(B, K_, N_, layout_b, ldb);
    resize(C_initial, M_, N_, layout_c, ldc);
    resize(ref_host, M_, N_, layout_c, ldc);
    resize(ref_cublas, M_, N_, layout_c, ldc);
    resize(computed, M_, N_, layout_c, ldc);
  }

  ~GemmTestbed() { status = cublasDestroy(handle); }

  /// Returns true if the last cuBLAS call returned successfully
  bool good() const { return status == CUBLAS_STATUS_SUCCESS; }

  /// Returns a pointer to the A operand
  typename HostTensorA::DeviceType* ptr_A() const { return A.device_data(); }

  /// Stride of A matrix
  int lda() const { return std::max(A.stride(HostTensorA::Dim_H), A.stride(HostTensorA::Dim_W)); }

  /// Returns a pointer to the B operand
  typename HostTensorB::DeviceType* ptr_B() const { return B.device_data(); }

  /// Stride of B matrix
  int ldb() const { return std::max(B.stride(HostTensorB::Dim_H), B.stride(HostTensorB::Dim_W)); }

  /// Returns a pointer to the initial state of the result tensor in device memory
  typename HostTensorC::DeviceType* ptr_C_initial() const { return C_initial.device_data(); }

  /// Returns a pointer to the computed result tensor in device memory
  typename HostTensorC::DeviceType* ptr_computed() const { return computed.device_data(); }

  /// Returns a pointer to the cuBLAS reference tensor in device memory
  typename HostTensorC::DeviceType* ptr_cublas() const { return ref_cublas.device_data(); }

  /// Stride of C matrix
  int ldc() const {
    return std::max(C_initial.stride(HostTensorC::Dim_H), C_initial.stride(HostTensorC::Dim_W));
  }

  /// Returns the number of flops implied by the computation (1 multiply-accumulate = 2 flops)
  uint64_t flops() const { return uint64_t(M()) * uint64_t(N()) * uint64_t(K()) * 2ULL; }

  /// Computes the speed of the computation in GFLOP/s
  double GFLOPs_per_sec(double runtime_ms) const { return double(flops()) / runtime_ms / 1.0e6; }

  /// Matrix layout of A
  cublasOperation_t layout_a() const { return layout_A; }

  /// Matrix layout of B
  cublasOperation_t layout_b() const { return layout_B; }

  /// Number of rows of the problem (GEMM M dimension)
  int M() const { return C_initial.size(HostTensorC::Dim_H); }

  /// Number of columns of the problem (GEMM N dimension)
  int N() const { return C_initial.size(HostTensorC::Dim_W); }

  /// Inner dimension of the problem (GEMM K dimension)
  int K() const { return A.size(HostTensorA::Dim_W); }

  /// Initializes data, randomly
  void initialize(int seed = -1) {
    A.fill_random(RandomGenerator<AType>(seed));
    B.fill_random(RandomGenerator<BType>(seed + 11));
    C_initial.fill_random(RandomGenerator<CType>(seed + 13));
  }

  /// Computes the matrix product on the host
  void compute_host() {
    ref_host.fill(C_initial);
    // The explicit template arguments select the accumulation and scaling types; Accumulator and
    // Scalar are assumed here to match what HostTensor::gemm expects.
    ref_host.template gemm<Accumulator, Scalar>(A, B, alpha, beta);
  }

  /// Executes an equivalent GEMM using cuBLAS
  bool execute_cublas() {
    status = cublasGemmEx(handle,
                          layout_a(),
                          layout_b(),
                          M(),
                          N(),
                          K(),
                          &alpha,
                          ptr_A(),
                          cutlass::TypeTraits<AType>::cublas_type,
                          lda(),
                          ptr_B(),
                          cutlass::TypeTraits<BType>::cublas_type,
                          ldb(),
                          &beta,
                          ref_cublas.device_data(),
                          cutlass::TypeTraits<CType>::cublas_type,
                          ldc(),
                          cutlass::TypeTraits<Accumulator>::cublas_type,
                          algorithm);

    return status == CUBLAS_STATUS_SUCCESS;
  }
  /// Computes the matrix product using cuBLAS
  void compute_cublas() {
    ref_cublas.fill(C_initial);

    if (!execute_cublas()) {
      throw std::runtime_error("compute_cublas() failed");
    }
  }

  //
  // Compute the GEMM yourself
  //

  /// Names a problem based on data type and problem size
  std::string workspace_name() const {
    std::stringstream ss;
    ss << "gemm_" << (layout_a() == CUBLAS_OP_N ? "n" : "t")
       << (layout_b() == CUBLAS_OP_N ? "n" : "t") << "_" << typeid(AType).name() << "_"
       << typeid(BType).name() << "_" << typeid(CType).name() << "_" << typeid(Accumulator).name()
       << "_" << typeid(Scalar).name() << "_" << M() << "x" << N() << "x" << K();

    return ss.str();
  }

  /// Writes the workspace to an ostream
  std::ostream& write(std::ostream& out) const {
    out << "A = " << A << "\nB = " << B << "\nC_initial = " << C_initial
        << "\nref_host = " << ref_host << "\nref_cublas = " << ref_cublas
        << "\ncomputed = " << computed << std::endl;

    return out;
  }

  /// Outputs each mismatching element
  std::ostream& write_errors(std::ostream& out,
                             cutlass::HostTensorView<CType> const& experimental,
                             cutlass::HostTensorView<CType> const& ref) const {
    PrintErrors printer(out, ref, experimental);

    computed.visit(printer);

    return out;
  }

  /// Syncs all input tensors to device
  void sync_device() {
    A.sync_device();
    B.sync_device();
    C_initial.sync_device();

    ref_host.fill(C_initial);
    ref_cublas.fill(C_initial);
    computed.fill(C_initial);

    ref_cublas.sync_device();
    computed.sync_device();
  }

  /// Syncs all output tensors to host
  void sync_host() {
    computed.sync_host();
    ref_cublas.sync_host();
  }

  /// Saves the workspace to files
  void save_workspace(cutlass::HostTensorView<CType> const& experimental,
                      cutlass::HostTensorView<CType> const& ref) {
    std::string name = workspace_name();

    std::string results_name = name + "_results.txt";
    std::string errors_name = name + "_errors.txt";

    std::ofstream results(results_name.c_str());
    std::ofstream errors(errors_name.c_str());

    write(results);
    write_errors(errors, experimental, ref);
  }

  /// Verifies the contents of computed equal the host-side reference
  bool verify_with_host(bool save_on_error = true, bool always_print = false) {
    compute_host();

    computed.sync_host();

    bool passed = computed.bit_equals(ref_host);

    if ((!passed && save_on_error) || always_print) {
      save_workspace(computed, ref_host);
    }

    return passed;
  }

  /// Verifies the contents of computed equal cuBLAS
  bool verify_with_cublas(bool save_on_error = true, bool always_print = false) {
    compute_cublas();

    ref_cublas.sync_host();
    computed.sync_host();

    bool passed = computed.bit_equals(ref_cublas);

    if ((!passed && save_on_error) || always_print) {
      save_workspace(computed, ref_cublas);
    }

    return passed;
  }

  /// Verifies the host computation with cuBLAS
  bool verify_host_with_cublas(bool save_on_error = true, bool always_print = false) {
    compute_host();
    compute_cublas();

    ref_cublas.sync_host();

    bool passed = ref_host.bit_equals(ref_cublas);

    if ((!passed && save_on_error) || always_print) {
      save_workspace(ref_host, ref_cublas);
    }

    return passed;
  }

  /// Verifies with host-side and device-side computations
  bool verify_with_all() {
    bool passed = true;

    computed.sync_host();

    // verify on host
    passed = (passed && verify_with_host());

    // verify with cublas
    passed = (passed && verify_with_cublas());

    return passed;
  }

  /// Returns true if cuBLAS can be used to cross-check this combination of types. The specific
  /// type comparison below is an assumption.
  bool has_cublas_support() const { return cutlass::platform::is_same<Accumulator, CType>::value; }
};

}  // namespace test
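////////////////////////////////////////////////////////////////////////////////////////////////////

// Illustrative usage sketch. This is not part of the original testbed; the template arguments and
// the problem/leading dimensions below are assumptions chosen for a column-major, float-valued
// GEMM, and the kernel launch is left to the test that owns the testbed.
//
//   test::GemmTestbed<float, float, float, float, float> testbed(
//       128, 128, 64,                  // M, N, K
//       128, 64, 128,                  // lda, ldb, ldc
//       CUBLAS_OP_N, CUBLAS_OP_N);     // layouts of A and B
//
//   testbed.initialize();              // fill A, B, and C_initial with small random integers
//   testbed.sync_device();             // copy operands to device memory
//
//   // ... launch the GEMM under test so that it writes to testbed.ptr_computed() ...
//
//   bool passed = testbed.verify_with_all();   // compare against host and cuBLAS references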
////////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {

inline cublasOperation_t convert(cutlass::MatrixLayout::Kind layout) {
  switch (layout) {
    case cutlass::MatrixLayout::kRowMajor:
      return CUBLAS_OP_T;
    case cutlass::MatrixLayout::kColumnMajor:
      return CUBLAS_OP_N;
    default:
      break;
  }
  return CUBLAS_OP_N;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

}  // namespace cutlass
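////////////////////////////////////////////////////////////////////////////////////////////////////

// Illustrative sketch (an assumption, not part of the original header): a test templated on
// cutlass::MatrixLayout can use cutlass::convert() to derive the cuBLAS operation passed to the
// GemmTestbed constructor for each operand:
//
//   cublasOperation_t op_a = cutlass::convert(cutlass::MatrixLayout::kColumnMajor);  // CUBLAS_OP_N
//   cublasOperation_t op_b = cutlass::convert(cutlass::MatrixLayout::kRowMajor);     // CUBLAS_OP_T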