/***************************************************************************************************
 * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

#pragma once

#include <iostream>
#include <fstream>
#include <sstream>

#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/distribution.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_norm.h"
#include "cutlass/util/host_reorder.h"
#include "cutlass/util/reference/device/gemm.h"
#include "cutlass/util/reference/device/tensor_relu.h"  // TensorReLu, used in verification below

#include "helper.h"

#define CHECK_GT(val1, val2) \
    if((val1) <= (val2)) \
        std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_GT failed\n";

#define CHECK_TRUE(val) \
    if(!(val)) \
        std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_TRUE failed\n";

// InterleavedK_ is the column interleave factor consumed by
// cutlass::reorder_column<>() when staging the B operands.
template <typename Gemm0_, typename Gemm1_, int InterleavedK_>
struct B2bInterleavedNonFusedGemmRun
{

  using Gemm0 = Gemm0_;
  using Gemm1 = Gemm1_;
  using ElementAccumulator = typename Gemm0::ElementAccumulator;
  using ElementCompute = typename Gemm0::GemmKernel::Epilogue::OutputOp::ElementCompute;

  /// Initialization
  cutlass::Distribution::Kind init_A;
  cutlass::Distribution::Kind init_B;
  cutlass::Distribution::Kind init_C;
  uint64_t seed;

  //
  // Methods
  //

  B2bInterleavedNonFusedGemmRun(
    cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
    cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
    cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
    uint64_t seed_ = 2080
  ):
    init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { }

  /// Helper to initialize a tensor view
  template <typename Element, typename Layout>
  bool initialize_tensor(
    cutlass::TensorView<Element, Layout> view,
    cutlass::Distribution::Kind dist_kind,
    uint64_t seed) {

    if (dist_kind == cutlass::Distribution::Uniform) {

      cutlass::reference::host::TensorFillRandomUniform(
        view, seed, 2, -2, 0);
    }
    else if (dist_kind == cutlass::Distribution::Identity) {

      cutlass::reference::host::TensorFillIdentity(view);
    }
    else if (dist_kind == cutlass::Distribution::Sequential) {

      cutlass::reference::host::BlockFillSequential(
        view.data(), view.capacity());
    }
    else {
      // TODO: Implement the rest
      std::cerr << "Not implemented\n";
      return false;
    }

    return true;
  }
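  // Note on the Uniform fill above: the arguments (max 2, min -2, bits 0)
  // request small integer values in [-2, 2], so the device results can be
  // compared bit-exactly against the reference GEMM with TensorEquals.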
  /// Executes one test
  bool run(
    cutlass::gemm::GemmCoord problem_size_0,
    cutlass::gemm::GemmCoord problem_size_1,
    ElementCompute alpha0 = ElementCompute(1),
    ElementCompute beta0 = ElementCompute(0),
    ElementCompute alpha1 = ElementCompute(1),
    ElementCompute beta1 = ElementCompute(0),
    bool relu = true) {

    //
    // Allocate the GEMM workspace
    //

    cutlass::HostTensor<
      typename Gemm0::ElementA,
      typename Gemm0::LayoutA> tensor_A0(problem_size_0.mk());

    cutlass::HostTensor<
      typename Gemm0::ElementB,
      typename Gemm0::LayoutB> tensor_B0(problem_size_0.kn());

    cutlass::HostTensor<
      typename Gemm0::ElementB,
      typename Gemm0::LayoutB> tensor_B0_reordered(problem_size_0.kn());

    cutlass::HostTensor<
      typename Gemm0::ElementC,
      typename Gemm0::LayoutC> tensor_C0(problem_size_0.mn());

    cutlass::HostTensor<
      typename Gemm0::ElementC,
      typename Gemm0::LayoutC> tensor_D0(problem_size_0.mn());

    cutlass::HostTensor<
      typename Gemm0::ElementC,
      typename Gemm0::LayoutC> reference_D0(problem_size_0.mn());

    cutlass::HostTensor<
      typename Gemm1::ElementB,
      typename Gemm1::LayoutB> tensor_B1(problem_size_1.kn());

    cutlass::HostTensor<
      typename Gemm1::ElementB,
      typename Gemm1::LayoutB> tensor_B1_reordered(problem_size_1.kn());

    cutlass::HostTensor<
      typename Gemm1::ElementC,
      typename Gemm1::LayoutC> tensor_C1(problem_size_1.mn());

    cutlass::HostTensor<
      typename Gemm1::ElementC,
      typename Gemm1::LayoutC> tensor_D1(problem_size_1.mn());

    cutlass::HostTensor<
      typename Gemm1::ElementC,
      typename Gemm1::LayoutC> reference_D1(problem_size_1.mn());

    CHECK_TRUE(initialize_tensor(tensor_A0.host_view(), init_A, seed + 2019));
    CHECK_TRUE(initialize_tensor(tensor_B0.host_view(), init_B, seed + 2018));
    CHECK_TRUE(initialize_tensor(tensor_C0.host_view(), init_C, seed + 2017));
    CHECK_TRUE(initialize_tensor(tensor_B1.host_view(), init_B, seed + 2016));
    CHECK_TRUE(initialize_tensor(tensor_C1.host_view(), init_C, seed + 2015));

    // Reorder B0 and B1 into the interleaved layout the kernels expect
    cutlass::reorder_column<InterleavedK_>(
      tensor_B0_reordered.host_ref(), tensor_B0.host_ref(), problem_size_0);
    cutlass::reorder_column<InterleavedK_>(
      tensor_B1_reordered.host_ref(), tensor_B1.host_ref(), problem_size_1);

    cutlass::reference::host::TensorFill(
      tensor_D0.host_view());
    cutlass::reference::host::TensorFill(
      tensor_D1.host_view());
    cutlass::reference::host::TensorFill(
      reference_D0.host_view());
    cutlass::reference::host::TensorFill(
      reference_D1.host_view());

    tensor_A0.sync_device();
    tensor_B0.sync_device();
    tensor_B0_reordered.sync_device();
    tensor_C0.sync_device();
    tensor_D0.sync_device();
    tensor_B1.sync_device();
    tensor_B1_reordered.sync_device();
    tensor_C1.sync_device();
    tensor_D1.sync_device();
    reference_D0.sync_device();
    reference_D1.sync_device();

    //
    // Initialize the GEMM operator
    //

    typename Gemm0::Arguments arguments_0{
      problem_size_0,
      tensor_A0.device_ref(),
      tensor_B0_reordered.device_ref(),
      tensor_C0.device_ref(),
      tensor_D0.device_ref(),
      {alpha0, beta0}
    };

    typename Gemm1::Arguments arguments_1{
      problem_size_1,
      tensor_D0.device_ref(),
      tensor_B1_reordered.device_ref(),
      tensor_C1.device_ref(),
      tensor_D1.device_ref(),
      {alpha1, beta1}
    };

    Gemm0 gemm_op_0;
    Gemm1 gemm_op_1;

    cutlass::Status status = gemm_op_0.initialize(arguments_0);
    CUTLASS_CHECK(status);

    status = gemm_op_1.initialize(arguments_1);
    CUTLASS_CHECK(status);
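    // Timing: each kernel is launched 100 times and bracketed with CUDA
    // events recorded on the default stream; cudaEventElapsedTime reports
    // milliseconds between events, so dividing by 100 below yields the
    // average latency of a single launch.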
    //
    // Run the GEMM
    //

    cudaEvent_t start, stop1, stop2;
    cudaEventCreate(&start);
    cudaEventCreate(&stop1);
    cudaEventCreate(&stop2);

    cudaEventRecord(start);

    for(int i = 0; i < 100; i++) {
      status = gemm_op_0();
      CUTLASS_CHECK(status);
    }

    cudaEventRecord(stop1);

    for(int i = 0; i < 100; i++) {
      status = gemm_op_1();
      CUTLASS_CHECK(status);
    }

    cudaEventRecord(stop2);
    cudaDeviceSynchronize();

    float gemm0Time, gemm1Time, totalTime;
    cudaEventElapsedTime(&gemm0Time, start, stop1);
    cudaEventElapsedTime(&gemm1Time, stop1, stop2);
    cudaEventElapsedTime(&totalTime, start, stop2);
    std::cout << "gemm 0 time " << gemm0Time / 100.0 << " ms\n";
    std::cout << "gemm 1 time " << gemm1Time / 100.0 << " ms\n";
    std::cout << "total time " << totalTime / 100.0 << " ms\n";

    tensor_D0.sync_host();
    tensor_D1.sync_host();

    //
    // Verify
    //

    cutlass::reference::device::Gemm<
      typename Gemm0::ElementA, typename Gemm0::LayoutA,
      typename Gemm0::ElementB, typename Gemm0::LayoutB,
      typename Gemm0::ElementC, typename Gemm0::LayoutC,
      ElementCompute, ElementAccumulator, typename Gemm0::Operator>
      reference_gemm_0;

    cutlass::reference::device::Gemm<
      typename Gemm1::ElementA, typename Gemm1::LayoutA,
      typename Gemm1::ElementB, typename Gemm1::LayoutB,
      typename Gemm1::ElementC, typename Gemm1::LayoutC,
      ElementCompute, ElementAccumulator, typename Gemm1::Operator>
      reference_gemm_1;

    // The reference uses the unreordered B operands: interleaving only
    // changes the storage layout, not the mathematical result.
    reference_gemm_0(
      problem_size_0,
      alpha0,
      tensor_A0.device_ref(),
      tensor_B0.device_ref(),
      beta0,
      tensor_C0.device_ref(),
      reference_D0.device_ref()
    );

    if(relu) {
      cutlass::reference::device::TensorReLu(reference_D0.device_view());
    }

    reference_gemm_1(
      problem_size_1,
      alpha1,
      tensor_D0.device_ref(),
      tensor_B1.device_ref(),
      beta1,
      tensor_C1.device_ref(),
      reference_D1.device_ref()
    );

    if(relu) {
      cutlass::reference::device::TensorReLu(reference_D1.device_view());
    }

    cudaDeviceSynchronize();
    reference_D0.sync_host();
    reference_D1.sync_host();

    CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0.host_view()), 0);
    CHECK_GT(cutlass::reference::host::TensorNorm(reference_D0.host_view()), 0);
    CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1.host_view()), 0);
    CHECK_GT(cutlass::reference::host::TensorNorm(reference_D1.host_view()), 0);

    bool passed = cutlass::reference::host::TensorEquals(
      reference_D1.host_view(),
      tensor_D1.host_view());

    CHECK_TRUE(passed);

    if (!passed) {

      std::stringstream fname;

      fname << "error_B2bGemm_device_interleaved_nonfused.txt";
      std::cerr << "Dumping results in " << fname.str() << "\n";

      std::ofstream file(fname.str());

      file
        << "A0 =\n" << tensor_A0.host_view()
        << "\nB0 =\n" << tensor_B0.host_view()
        << "\nB0_reordered =\n" << tensor_B0_reordered.host_view()
        << "\nC0 =\n" << tensor_C0.host_view()
        << "\nD0 =\n" << tensor_D0.host_view()
        << "\nB1 =\n" << tensor_B1.host_view()
        << "\nB1_reordered =\n" << tensor_B1_reordered.host_view()
        << "\nC1 =\n" << tensor_C1.host_view()
        << "\n\nReference =\n" << reference_D1.host_view()
        << "\nComputed =\n" << tensor_D1.host_view();
    }

    return passed;
  }
};
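////////////////////////////////////////////////////////////////////////////////

// The fused runner below drives a single B2bGemm kernel that computes both
// GEMMs in one launch: GEMM0's output (with ReLU applied in its epilogue) is
// consumed directly by GEMM1 instead of being staged through global memory,
// which is why no tensor_D0 allocation appears in run().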
template <typename B2bGemm_, int InterleavedK_>
struct B2bInterleavedFusedGemmRun
{

  using B2bGemm = B2bGemm_;
  using ElementAccumulator = typename B2bGemm::ElementAccumulator;
  using ElementCompute = typename B2bGemm::B2bGemmKernel::Epilogue::OutputOp::ElementCompute;

  /// Initialization
  cutlass::Distribution::Kind init_A;
  cutlass::Distribution::Kind init_B;
  cutlass::Distribution::Kind init_C;
  uint64_t seed;

  //
  // Methods
  //

  B2bInterleavedFusedGemmRun(
    cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
    cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
    cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
    uint64_t seed_ = 2080
  ):
    init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { }

  /// Helper to initialize a tensor view
  template <typename Element, typename Layout>
  bool initialize_tensor(
    cutlass::TensorView<Element, Layout> view,
    cutlass::Distribution::Kind dist_kind,
    uint64_t seed) {

    if (dist_kind == cutlass::Distribution::Uniform) {

      cutlass::reference::host::TensorFillRandomUniform(
        view, seed, 2, -2, 0);
    }
    else if (dist_kind == cutlass::Distribution::Identity) {

      cutlass::reference::host::TensorFillIdentity(view);
    }
    else if (dist_kind == cutlass::Distribution::Sequential) {

      cutlass::reference::host::BlockFillSequential(
        view.data(), view.capacity());
    }
    else {
      // TODO: Implement the rest
      std::cerr << "Not implemented\n";
      return false;
    }

    return true;
  }

  /// Executes one test
  bool run(
    cutlass::gemm::GemmCoord problem_size_0,
    cutlass::gemm::GemmCoord problem_size_1,
    ElementCompute alpha0 = ElementCompute(1),
    ElementCompute beta0 = ElementCompute(0),
    ElementCompute alpha1 = ElementCompute(1),
    ElementCompute beta1 = ElementCompute(0),
    bool relu = true) {

    //
    // Allocate the GEMM workspace
    //

    cutlass::HostTensor<
      typename B2bGemm::ElementA,
      typename B2bGemm::LayoutA> tensor_A0(problem_size_0.mk());

    cutlass::HostTensor<
      typename B2bGemm::ElementB,
      typename B2bGemm::LayoutB> tensor_B0(problem_size_0.kn());

    cutlass::HostTensor<
      typename B2bGemm::ElementB,
      typename B2bGemm::LayoutB> tensor_B0_reordered(problem_size_0.kn());

    cutlass::HostTensor<
      typename B2bGemm::ElementC,
      typename B2bGemm::LayoutC> tensor_C0(problem_size_0.mn());

    // The fused kernel never writes the intermediate result to memory,
    // so no D0 tensor is allocated:
    // cutlass::HostTensor<
    //   typename B2bGemm::ElementC,
    //   typename B2bGemm::LayoutC> tensor_D0(problem_size_0.mn());

    cutlass::HostTensor<
      typename B2bGemm::ElementC,
      typename B2bGemm::LayoutC> reference_D0(problem_size_0.mn());

    cutlass::HostTensor<
      typename B2bGemm::ElementB,
      typename B2bGemm::LayoutB> tensor_B1(problem_size_1.kn());

    cutlass::HostTensor<
      typename B2bGemm::ElementB,
      typename B2bGemm::LayoutB> tensor_B1_reordered(problem_size_1.kn());

    cutlass::HostTensor<
      typename B2bGemm::ElementC,
      typename B2bGemm::LayoutC> tensor_C1(problem_size_1.mn());

    cutlass::HostTensor<
      typename B2bGemm::ElementC,
      typename B2bGemm::LayoutC> tensor_D1(problem_size_1.mn());

    cutlass::HostTensor<
      typename B2bGemm::ElementC,
      typename B2bGemm::LayoutC> reference_D1(problem_size_1.mn());

    CHECK_TRUE(initialize_tensor(tensor_A0.host_view(), init_A, seed + 2019));
    CHECK_TRUE(initialize_tensor(tensor_B0.host_view(), init_B, seed + 2018));
    CHECK_TRUE(initialize_tensor(tensor_C0.host_view(), init_C, seed + 2017));
    CHECK_TRUE(initialize_tensor(tensor_B1.host_view(), init_B, seed + 2016));
    CHECK_TRUE(initialize_tensor(tensor_C1.host_view(), init_C, seed + 2015));

    // Reorder B0 and B1
    cutlass::reorder_column<InterleavedK_>(
      tensor_B0_reordered.host_ref(), tensor_B0.host_ref(), problem_size_0);
    cutlass::reorder_column<InterleavedK_>(
      tensor_B1_reordered.host_ref(), tensor_B1.host_ref(), problem_size_1);

    cutlass::reference::host::TensorFill(
      tensor_D1.host_view());
    cutlass::reference::host::TensorFill(
      reference_D0.host_view());
    cutlass::reference::host::TensorFill(
      reference_D1.host_view());

    tensor_A0.sync_device();
    tensor_B0.sync_device();
    tensor_B0_reordered.sync_device();
    tensor_C0.sync_device();
    //tensor_D0.sync_device();
    tensor_B1.sync_device();
    tensor_B1_reordered.sync_device();
    tensor_C1.sync_device();
    tensor_D1.sync_device();
    reference_D0.sync_device();
    reference_D1.sync_device();

    //
    // Initialize the GEMM operator
    //

    typename B2bGemm::Arguments arguments{
      problem_size_0,
      problem_size_1,
      tensor_A0.device_ref(),
      tensor_B0_reordered.device_ref(),
      tensor_C0.device_ref(),
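      // note: no D0 operand here; the intermediate GEMM0 output stays
      // on-chip in the fused kernel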
      tensor_B1_reordered.device_ref(),
      tensor_C1.device_ref(),
      tensor_D1.device_ref(),
      {alpha0, beta0},
      {alpha1, beta1},
      1 /*threadblock_swizzle_k_tile*/
    };

    B2bGemm b2b_gemm_op;

    cutlass::Status status = b2b_gemm_op.initialize(arguments);
    CUTLASS_CHECK(status);

    //
    // Run the GEMM
    //

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);

    for(int i = 0; i < 100; i++) {
      status = b2b_gemm_op();
      CUTLASS_CHECK(status);
    }

    cudaEventRecord(stop);
    cudaDeviceSynchronize();

    float gemmTime;
    cudaEventElapsedTime(&gemmTime, start, stop);
    std::cout << "time " << gemmTime / 100.0 << " ms\n";

    //tensor_D0.sync_host();
    tensor_D1.sync_host();

    //
    // Verify
    //

    cutlass::reference::device::Gemm<
      typename B2bGemm::ElementA, typename B2bGemm::LayoutA,
      typename B2bGemm::ElementB, typename B2bGemm::LayoutB,
      typename B2bGemm::ElementC, typename B2bGemm::LayoutC,
      ElementCompute, ElementAccumulator, typename B2bGemm::Operator>
      reference_gemm_0, reference_gemm_1;

    reference_gemm_0(
      problem_size_0,
      alpha0,
      tensor_A0.device_ref(),
      tensor_B0.device_ref(),
      beta0,
      tensor_C0.device_ref(),
      reference_D0.device_ref()
    );

    if(relu) {
      cutlass::reference::device::TensorReLu(reference_D0.device_view());
    }

    reference_gemm_1(
      problem_size_1,
      alpha1,
      reference_D0.device_ref(),
      tensor_B1.device_ref(),
      beta1,
      tensor_C1.device_ref(),
      reference_D1.device_ref()
    );

    if(relu) {
      cutlass::reference::device::TensorReLu(reference_D1.device_view());
    }

    cudaDeviceSynchronize();
    reference_D0.sync_host();
    reference_D1.sync_host();

    CHECK_GT(cutlass::reference::host::TensorNorm(reference_D0.host_view()), 0);
    CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1.host_view()), 0);
    CHECK_GT(cutlass::reference::host::TensorNorm(reference_D1.host_view()), 0);

    bool passed = cutlass::reference::host::TensorEquals(
      reference_D1.host_view(),
      tensor_D1.host_view());

    CHECK_TRUE(passed);

    if (!passed) {

      std::stringstream fname;

      fname << "error_B2bGemm_device_interleaved_fused.txt";
      std::cerr << "Dumping results in " << fname.str() << "\n";

      std::ofstream file(fname.str());

      file
        << "A0 =\n" << tensor_A0.host_view()
        << "\nB0 =\n" << tensor_B0.host_view()
        << "\nB0_reordered =\n" << tensor_B0_reordered.host_view()
        << "\nC0 =\n" << tensor_C0.host_view()
        // << "\nD0 =\n" << tensor_D0.host_view()
        << "\nB1 =\n" << tensor_B1.host_view()
        << "\nB1_reordered =\n" << tensor_B1_reordered.host_view()
        << "\nC1 =\n" << tensor_C1.host_view()
        << "\n\nReference =\n" << reference_D1.host_view()
        << "\nComputed =\n" << tensor_D1.host_view();
    }

    return passed;
  }
};

////////////////////////////////////////////////////////////////////////////////
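// Usage sketch: a minimal illustration of how these runners are driven.
// "MyGemm0", "MyGemm1", and "MyB2bGemm" are hypothetical device-level GEMM
// instantiations with interleaved layouts (defined by the including test),
// and 32 is an assumed interleave factor; substitute whatever the
// surrounding example actually defines.
//
//   B2bInterleavedNonFusedGemmRun<MyGemm0, MyGemm1, 32> nonfused;
//   B2bInterleavedFusedGemmRun<MyB2bGemm, 32> fused;
//
//   cutlass::gemm::GemmCoord problem_size_0(128, 64, 576);  // (M, N, K) of GEMM0
//   cutlass::gemm::GemmCoord problem_size_1(128, 128, 64);  // GEMM1's K must equal GEMM0's N
//
//   bool passed = nonfused.run(problem_size_0, problem_size_1)
//              && fused.run(problem_size_0, problem_size_1);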