/***************************************************************************************************
 * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

/*! \file
    \brief Basic tests for BULK_COPY usage with various layouts.
*/ #include "cutlass_unit_test.h" #include #include #include #include using namespace cute; template struct SharedStorage { cute::array_aligned> smem; cute::uint64_t bulk_copy_mbar[1]; }; #if CUDA_12_0_SM90_FEATURES_SUPPORTED template __global__ void bulk_copy_test_device_cute(T const* g_in, T * g_out, GmemLayout gmem_layout, SmemLayout smem_layout) { // Use Shared Storage structure to allocate and distribute aligned SMEM addresses extern __shared__ char shared_memory[]; using SharedStorage = SharedStorage; SharedStorage& shared_storage = *reinterpret_cast(shared_memory); // Construct SMEM tensor Tensor sA = make_tensor(make_smem_ptr(shared_storage.smem.data()), smem_layout); // Construct the GMEM tensor Tensor gA = make_tensor(make_gmem_ptr(g_in), gmem_layout); // Shared memory barriers use 64bits in SMEM for synchronization uint64_t* bulk_copy_mbar = shared_storage.bulk_copy_mbar; // // Perform the BULK_COPY load // auto blkcp = Copy_Traits{}; #if 0 if (thread0()) { print("sA: "); print(sA.data()); print(" o "); print(sA.layout()); print("\n"); print("gA: "); print(gA.data()); print(" o "); print(gA.layout()); print("\n"); } #endif // Set the bytes transferred in this transaction (may involve multiple issues) constexpr int transaction_bytes = size(sA) * sizeof(T); if (threadIdx.x == 0) { /// Initialize shared memory barrier bulk_copy_mbar[0] = 0; initialize_barrier(bulk_copy_mbar[0], 1 /*numThreads*/); set_barrier_transaction_bytes(bulk_copy_mbar[0], transaction_bytes); copy(blkcp.with(bulk_copy_mbar[0]), gA, sA); } __syncthreads(); /// Wait on the shared memory barrier until the phase bit flips from kPhaseBit value constexpr int kPhaseBit = 0; wait_barrier(bulk_copy_mbar[0], kPhaseBit); #if 0 if (thread0()) { print(sA); } #endif // // Write out trivially // Tensor gA_out = make_tensor(make_gmem_ptr(g_out), gmem_layout); // Output smem -> gmem for (int i = threadIdx.x; i < size(sA); i += blockDim.x) { gA_out(i) = sA(i); } } template void run_and_validate(GLayout gmem_layout, SLayout smem_layout) { thrust::host_vector h_in(cosize(gmem_layout)); for (size_t i = 0; i < h_in.size(); ++i) { h_in[i] = static_cast(int(i)); } thrust::device_vector d_in = h_in; thrust::device_vector d_out(d_in.size(), T(-1)); int32_t smem_size = static_cast(sizeof(SharedStorage)); bulk_copy_test_device_cute<<<1, 128, smem_size>>>(thrust::raw_pointer_cast(d_in.data()), thrust::raw_pointer_cast(d_out.data()), gmem_layout, smem_layout); // Transfering results back to host thrust::host_vector h_out = d_out; // Validate the results for (int i = 0; i < cute::size(gmem_layout); ++i) { int k = gmem_layout(i); EXPECT_EQ(int(h_in[k]), int(h_out[k])); } } // } // namespace TEST(SM90_CuTe_BLKCP, ColMajor) { auto smem_layout = make_layout(Shape<_32,_32>{}, GenColMajor{}); auto gmem_layout = smem_layout; run_and_validate< int8_t>(gmem_layout, smem_layout); run_and_validate< half_t>(gmem_layout, smem_layout); run_and_validate(gmem_layout, smem_layout); } TEST(SM90_CuTe_BLKCP, RowMajor) { auto smem_layout = make_layout(Shape<_32,_32>{}, GenRowMajor{}); auto gmem_layout = smem_layout; run_and_validate< int8_t>(gmem_layout, smem_layout); run_and_validate< half_t>(gmem_layout, smem_layout); run_and_validate(gmem_layout, smem_layout); } TEST(SM90_CuTe_BLKCP, NonCompact) { { auto smem_layout = make_layout(Shape<_32,_32>{}, Stride<_1,Int<48>>{}); auto gmem_layout = smem_layout; run_and_validate< int8_t>(gmem_layout, smem_layout); run_and_validate< half_t>(gmem_layout, smem_layout); run_and_validate(gmem_layout, smem_layout); } { auto 
  auto smem_layout = make_layout(Shape<_32,_32>{}, Stride<_1,Int<48>>{});
  // GMEM view is a non-compact, permuted tiling of the 32x32 tensor
  auto gmem_layout = make_layout(Shape <Shape <_16,   _2>, Shape <_4,  _8>>{},
                                 Stride<Stride< _1,_1024>, Stride<_16,_128>>{});

  run_and_validate< int8_t>(gmem_layout, smem_layout);
  run_and_validate< half_t>(gmem_layout, smem_layout);
  run_and_validate<  float>(gmem_layout, smem_layout);
  }

  {
  auto smem_layout = make_layout(Shape<_32,_32>{}, Stride<_64,_1>{});
  auto gmem_layout = smem_layout;

  run_and_validate< int8_t>(gmem_layout, smem_layout);
  run_and_validate< half_t>(gmem_layout, smem_layout);
  run_and_validate<  float>(gmem_layout, smem_layout);
  }
}

#endif // #if CUDA_12_0_SM90_FEATURES_SUPPORTED