/***************************************************************************************************
 * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

#pragma once

#include "../../common/cutlass_unit_test.h"

#include "cutlass/core_io.h"
#include "cutlass/numeric_types.h"

#include "cutlass/gemm/gemm.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/tensor_ref.h"

#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/gemm.h"

#include "cutlass/gemm/kernel/default_gemv.h"
#include "cutlass/gemm/kernel/gemv_batched_strided.h"

namespace test {
namespace gemm {
namespace kernel {

template <typename ThreadBlockShape_,
          typename ThreadShape_,
          typename ElementAB_,
          typename ElementCD_,
          typename LayoutA_,
          typename LayoutB_,
          typename LayoutCD_,
          int LDG_B = 1,       // number of batches processed per threadblock
          bool DEBUG = false>
void batched_gemv_kernel_test(cutlass::gemm::BatchedGemmCoord problem_size,
                              ElementCD_ alpha = ElementCD_(1),
                              ElementCD_ beta = ElementCD_(0),
                              bool perf_test = false,
                              int perf_test_iter = 1) {

  using ThreadBlockShape = ThreadBlockShape_;
  using ThreadShape = ThreadShape_;
  using ElementA = ElementAB_;
  using LayoutA = LayoutA_;
  using ElementB = ElementAB_;
  using LayoutB = LayoutB_;
  using ElementAccumulator = ElementCD_;
  using ElementCD = ElementCD_;
  using LayoutCD = LayoutCD_;

  using GemvKernel = cutlass::gemm::kernel::DefaultGemv<ThreadBlockShape,
                                                        ThreadShape,
                                                        ElementA,
                                                        LayoutA,
                                                        ElementB,
                                                        LayoutB,
                                                        ElementCD,
                                                        LayoutCD,
                                                        ElementAccumulator>;

  using ThreadBlockGemv = typename GemvKernel::ThreadBlockGemv;
  using ThreadBlockSwizzle = typename GemvKernel::ThreadBlockSwizzle;

  if (DEBUG) {
    problem_size = cutlass::gemm::BatchedGemmCoord(
        problem_size.m(), problem_size.n(), problem_size.k(), 1);
  }

  // Create host tensors that will be the backing store for the batches
  // Note that no device memory is initially allocated
  cutlass::HostTensor<ElementA, LayoutA> matrix_A({problem_size.m(), problem_size.k()}, false);
  cutlass::HostTensor<ElementB, LayoutB> matrix_B({problem_size.k(), problem_size.n()}, false);
  cutlass::HostTensor<ElementCD, LayoutCD> matrix_C_computed({problem_size.m(), problem_size.n()}, false);
  cutlass::HostTensor<ElementCD, LayoutCD> matrix_C_reference({problem_size.m(), problem_size.n()}, false);
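
  // The four tensors above describe a single batch element; the reserve()
  // calls below grow each backing store to hold every batch back-to-back,
  // so batch b starts at element offset b * capacity(). Worked example
  // (illustrative numbers only, not taken from any specific test): for
  // problem_size (m, n, k, batch) = (1, 64, 64, 16), matrix_B.capacity()
  // is 64 * 64 == 4096 elements, the reserve() below requests
  // 4096 * 16 == 65536 elements, and batch 3 of B begins at offset
  // 3 * 4096 == 12288.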

  // Reserve memory for the batch of tensors
  matrix_A.reserve(problem_size.m()*problem_size.k()*problem_size.batch());
  matrix_B.reserve(problem_size.n()*problem_size.k()*problem_size.batch());
  matrix_C_computed.reserve(problem_size.m()*problem_size.n()*problem_size.batch());
  matrix_C_reference.reserve(problem_size.m()*problem_size.n()*problem_size.batch(), false);

  // Fill each tensor batch
  const int seed = 9876;
  for (int b = 0; b < problem_size.batch(); b++) {
    if (DEBUG) {
      cutlass::reference::host::BlockFillSequential(
        matrix_A.host_data_ptr_offset(b*matrix_A.capacity()), matrix_A.capacity());
      cutlass::reference::host::BlockFillSequential(
        matrix_B.host_data_ptr_offset(b*matrix_B.capacity()), matrix_B.capacity());
    }
    else {
      cutlass::reference::host::TensorFillRandomUniform(
        matrix_A.host_view(b*matrix_A.capacity()),
        seed + 1660,
        8,
        -8,
        0
      );

      cutlass::reference::host::TensorFillRandomUniform(
        matrix_B.host_view(b*matrix_B.capacity()),
        seed + 1880,
        8,
        -8,
        0
      );
    }

    cutlass::reference::host::TensorFill(matrix_C_computed.host_view(b*matrix_C_computed.capacity()));
    cutlass::reference::host::TensorFill(matrix_C_reference.host_view(b*matrix_C_reference.capacity()));
  }

  matrix_A.sync_device();
  matrix_B.sync_device();
  matrix_C_computed.sync_device();

  ThreadBlockSwizzle swizzle;

  cutlass::gemm::BatchedGemmCoord tiled_size{ThreadBlockShape::kM,
                                             ThreadBlockShape::kN,
                                             problem_size.k(), // no split-k
                                             DEBUG ? 1 : LDG_B};

  cutlass::gemm::BatchedGemmCoord tiled_shape = swizzle.get_tiled_shape(problem_size, tiled_size);

#if 0
  printf("tiled_size = %d %d %d %d\n", tiled_size.m(), tiled_size.n(), tiled_size.k(), tiled_size.batch());
  printf("tiled_shape = %d %d %d %d\n", tiled_shape.m(), tiled_shape.n(), tiled_shape.k(), tiled_shape.batch());
#endif

  // No split-k
  EXPECT_EQ(tiled_size.k(), problem_size.k());

  dim3 grid = swizzle.get_grid_shape(tiled_shape);
  dim3 block(tiled_size.n() / ThreadShape::kN,
             tiled_size.batch(),
             tiled_size.k() / problem_size.k());

  // Some sanity checks
  EXPECT_TRUE( block.x*block.y*block.z <= 1024 );
  EXPECT_TRUE( block.x <= 1024 );
  EXPECT_TRUE( block.y <= 1024 );
  EXPECT_TRUE( block.z <= 64 );

#if 0
  printf("grid dim = %d, %d, %d\n", grid.x, grid.y, grid.z);
  printf("block dim = %d, %d, %d\n", block.x, block.y, block.z);
#endif
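
  // Worked example of the mapping above (illustrative numbers only, assuming
  // the swizzle tiles each coordinate with ceiling division): with
  // ThreadBlockShape = GemmShape<1, 64, 64>, ThreadShape = GemmShape<1, 4, 4>,
  // LDG_B = 2 and problem_size = (1, 256, 64, 16):
  //   tiled_size  = (1, 64, 64, 2)
  //   tiled_shape = (1, 256/64, 1, 16/2) = (1, 4, 1, 8)
  //   block       = (64/4, 2, 64/64)     = (16, 2, 1)
  // i.e. threads tile the N dimension, block.y packs several batch GEMVs into
  // one threadblock, and block.z stays 1 because split-K is not used.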

  cudaError_t result;
  cudaEvent_t start_event, end_event;

  for (int iter = 0; iter < (perf_test ? (perf_test_iter+1) : 1); ++iter) {

    if (perf_test && iter == 1) {
      // Create and record the start event at the beginning of iteration 1 so
      // that the warm-up launch (iteration 0) is excluded from the timing
      result = cudaEventCreate(&start_event);
      EXPECT_EQ(result, cudaSuccess);

      result = cudaEventCreate(&end_event);
      EXPECT_EQ(result, cudaSuccess);

      result = cudaEventRecord(start_event);
      EXPECT_EQ(result, cudaSuccess);
    }

    if (beta == ElementCD(0)) {
      if (alpha == ElementCD(1)) {
        cutlass::gemm::kernel::GemvBatchedStrided<GemvKernel><<< grid, block >>>(
          problem_size,
          matrix_A.device_ref(),
          matrix_A.capacity(),
          matrix_B.device_ref(),
          matrix_B.capacity(),
          matrix_C_computed.device_ref(),
          matrix_C_computed.capacity()
        );
      }
      else {
        cutlass::gemm::kernel::GemvBatchedStrided<GemvKernel, ElementCD><<< grid, block >>>(
          problem_size,
          alpha,
          matrix_A.device_ref(),
          matrix_A.capacity(),
          matrix_B.device_ref(),
          matrix_B.capacity(),
          matrix_C_computed.device_ref(),
          matrix_C_computed.capacity()
        );
      }
    }
    else {
      cutlass::gemm::kernel::GemvBatchedStrided<GemvKernel, ElementCD><<< grid, block >>>(
        problem_size,
        alpha,
        beta,
        matrix_A.device_ref(),
        matrix_A.capacity(),
        matrix_B.device_ref(),
        matrix_B.capacity(),
        matrix_C_computed.device_ref(),
        matrix_C_computed.capacity(),
        matrix_C_computed.device_ref(),
        matrix_C_computed.capacity()
      );
    }

    if (iter == 0) {
      result = cudaGetLastError();
      EXPECT_EQ(result, cudaSuccess) << " kernel error: " << cudaGetErrorString(result);
    }
  }

  if (perf_test) {
    result = cudaEventRecord(end_event);
    EXPECT_EQ(result, cudaSuccess);
  }

  result = cudaDeviceSynchronize();
  EXPECT_EQ(result, cudaSuccess) << " kernel error: " << cudaGetErrorString(result);

  if (perf_test) {
    float ms;
    result = cudaEventElapsedTime(&ms, start_event, end_event);
    EXPECT_EQ(result, cudaSuccess);

    double flops = (double(problem_size.m()) *
                    double(problem_size.n()) *
                    double(problem_size.k()) *
                    double(problem_size.batch()) * 2); // 2 for MAC

    double read_bytes = double(problem_size.batch()) *
                        (sizeof(ElementA)*double(problem_size.m())*double(problem_size.k()) +
                         sizeof(ElementB)*double(problem_size.k())*double(problem_size.n()));

    double write_bytes = double(problem_size.batch()) *
                         (sizeof(ElementCD)*double(problem_size.m())*double(problem_size.n()));

    double avg_runtime = double(ms) / perf_test_iter;            // average ms per timed launch
    double gflops_per_sec = flops / 1.0e6 / avg_runtime;         // flops / (1e6 * ms) == GFLOP/s
    double read_bandwidth = read_bytes / 1.0e6 / avg_runtime;    // bytes / (1e6 * ms) == GB/s
    double write_bandwidth = write_bytes / 1.0e6 / avg_runtime;  // bytes / (1e6 * ms) == GB/s

    std::cout << "\n\nProblem size: "
              << problem_size.m() << " x " << problem_size.n() << " x "
              << problem_size.k() << " x " << problem_size.batch() << std::endl;
    std::cout << "  GFLOPs: " << gflops_per_sec << std::endl;
    std::cout << "BW (R/W): " << read_bandwidth << " / " << write_bandwidth << " GB/sec" << std::endl;
    std::cout << " Runtime: " << avg_runtime << " ms" << std::endl;
  }
  else {
    matrix_C_computed.sync_host();

    // Compute the batched gemms
    for (int b = 0; b < problem_size.batch(); b++) {
      cutlass::reference::host::Gemm<ElementA, LayoutA,
                                     ElementB, LayoutB,
                                     ElementCD, LayoutCD,
                                     ElementCD, ElementAccumulator> reference_gemm;

      reference_gemm(
        problem_size.mnk(),
        alpha,
        matrix_A.host_ref(b * matrix_A.capacity()),
        matrix_B.host_ref(b * matrix_B.capacity()),
        beta,
        matrix_C_reference.host_ref(b * matrix_C_computed.capacity()));

      bool passed = cutlass::reference::host::TensorEquals(
                      matrix_C_computed.host_view(b * matrix_C_computed.capacity()),
                      matrix_C_reference.host_view(b * matrix_C_reference.capacity()));

      EXPECT_TRUE(passed)
        //<< "A:\n" << matrix_A.host_view() << "\n"
        //<< "B:\n" << matrix_B.host_view() << "\n"
        << "Batch: " << b << "\n"
        << "Reference:\n" << matrix_C_reference.host_view(b * matrix_C_reference.capacity()) << "\n"
        << "Computed:\n" << matrix_C_computed.host_view(b * matrix_C_computed.capacity()) << "\n";
    }
  }
}

template <typename ThreadBlockShape_,
          typename ThreadShape_,
          typename ElementAB_,
          typename ElementCD_,
          typename LayoutA_,
          typename LayoutB_,
          typename LayoutCD_,
          int LDG_B = 1,
          bool DEBUG = false>
void batched_gemv_kernel_perf_test(cutlass::gemm::BatchedGemmCoord problem_size,
                                   ElementCD_ alpha = ElementCD_(1),
                                   ElementCD_ beta = ElementCD_(0),
                                   int iter = 50) {
  batched_gemv_kernel_test<ThreadBlockShape_,
                           ThreadShape_,
                           ElementAB_,
                           ElementCD_,
                           LayoutA_,
                           LayoutB_,
                           LayoutCD_,
                           LDG_B,
                           DEBUG>(problem_size, alpha, beta, true, iter);
}

} // namespace kernel
} // namespace gemm
} // namespace test
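
// A minimal usage sketch of this testbed, kept disabled like the other
// debugging blocks above. The test name, tile shapes, element types, layouts
// and problem size are illustrative assumptions, not taken from an existing
// CUTLASS test file.
#if 0
TEST(batched_gemv, usage_sketch) {
  using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 64>;
  using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>;

  // m = 1 makes each batch a genuine (row-)vector-times-matrix product
  cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 16);

  test::gemm::kernel::batched_gemv_kernel_test<
      ThreadBlockShape,
      ThreadShape,
      float,                         // ElementAB_
      float,                         // ElementCD_
      cutlass::layout::RowMajor,     // LayoutA_
      cutlass::layout::ColumnMajor,  // LayoutB_
      cutlass::layout::RowMajor,     // LayoutCD_
      2>                             // LDG_B: batches per threadblock
      (problem_size, 1.0f, 0.0f);
}
#endif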