/***************************************************************************************************
 * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

#include "cutlass_unit_test.h"

#include <cstdint>
#include <tuple>
#include <type_traits>

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

#include <cute/tensor.hpp>
#include <cute/swizzle.hpp>          // cute::Swizzle
#include <cute/swizzle_layout.hpp>   // cute::compose(cute::Swizzle)

using namespace cute;

namespace cooperative_copy_mode {
  struct global_shared {};
  struct global_global {};
  struct shared_shared {};
}

// gs --> global to/from shared
template <uint32_t ThreadBlockSize, uint32_t MaxVecBits, class T, class GMemLayout, class SMemLayout>
__device__ void cooperative_copy_default_gs(T const* g_in, T* g_out,
                                            GMemLayout const& gmem_layout, SMemLayout const& smem_layout)
{
  using namespace cute;
  extern __shared__ uint128_t smem_buf[];
  // Cast smem_buf to a uint8_t pointer and offset it by MaxVecBits/8 bytes.
  // This makes sure the tests also pass on a pointer aligned only to MaxVecBits bits.
  uint8_t* smem_uint8_ptr = reinterpret_cast<uint8_t*>(smem_buf) + (MaxVecBits / 8);
  T* smem = reinterpret_cast<T*>(smem_uint8_ptr);

  Tensor g_in_tensor  = make_tensor(make_gmem_ptr(g_in),  gmem_layout);
  Tensor g_out_tensor = make_tensor(make_gmem_ptr(g_out), gmem_layout);
  Tensor s_tensor     = make_tensor(make_smem_ptr(smem),  smem_layout);

  cooperative_copy<ThreadBlockSize, MaxVecBits>(threadIdx.x, g_in_tensor, s_tensor);

  cp_async_fence();
  cp_async_wait<0>();
  __syncthreads();

  if (thread0()) {
    for (int i = 0; i < size(s_tensor); ++i) {
      s_tensor(i) += T(i);
    }
  }
  __syncthreads();

  cooperative_copy<ThreadBlockSize, MaxVecBits>(threadIdx.x, s_tensor, g_out_tensor);
}
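// For reference, cooperative_copy<NumThreads, MaxVecBits>(tid, src, dst) as used above
// partitions src/dst across the block's threads and vectorizes accesses up to MaxVecBits.
// A minimal standalone sketch of a call site (the layout, pointers, and sizes here are
// illustrative assumptions, not taken from this test):
//
//   auto lay   = make_layout(make_shape(Int<256>{}));        // 256 contiguous elements
//   Tensor src = make_tensor(make_gmem_ptr(gptr), lay);      // gptr: some global pointer
//   Tensor dst = make_tensor(make_smem_ptr(sptr), lay);      // sptr: some shared pointer
//   cooperative_copy<128, 128>(threadIdx.x, src, dst);       // 128 threads, <=128-bit vectors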
// ss --> shared to shared
template <uint32_t ThreadBlockSize, uint32_t MaxVecBits, class T, class Layout1, class Layout2>
__device__ void cooperative_copy_default_ss(T const* g_in, T* g_out,
                                            Layout1 const& layout1, Layout2 const& layout2)
{
  using namespace cute;
  extern __shared__ uint128_t smem_buf[];
  // Cast smem_buf to a uint8_t pointer and offset the second buffer by MaxVecBits/8 bytes.
  // This makes sure the tests also pass on a pointer aligned only to MaxVecBits bits.
  T* smem1 = reinterpret_cast<T*>(smem_buf);
  uint8_t* smem2_uint8_ptr = reinterpret_cast<uint8_t*>(smem_buf) + (MaxVecBits / 8);
  T* smem2 = reinterpret_cast<T*>(smem2_uint8_ptr) + cute::cosize(layout2);

  Tensor g_in_tensor  = make_tensor(make_gmem_ptr(g_in),  layout1);
  Tensor g_out_tensor = make_tensor(make_gmem_ptr(g_out), layout2);
  Tensor s1_tensor    = make_tensor(make_smem_ptr(smem1), layout2);
  Tensor s2_tensor    = make_tensor(make_smem_ptr(smem2), layout1);

  cooperative_copy<ThreadBlockSize, sizeof_bits_v<uint128_t>>(threadIdx.x, g_in_tensor, s1_tensor);
  cp_async_fence();
  cp_async_wait<0>();
  __syncthreads();

  if (thread0()) {
    for (int i = 0; i < size(s1_tensor); ++i) {
      s1_tensor(i) += T(i);
    }
  }
  __syncthreads();

  // The copy under test: shared to shared with the requested MaxVecBits
  cooperative_copy<ThreadBlockSize, MaxVecBits>(threadIdx.x, s1_tensor, s2_tensor);
  __syncthreads();

  cooperative_copy<ThreadBlockSize, sizeof_bits_v<uint128_t>>(threadIdx.x, s2_tensor, g_out_tensor);
}

// gg --> global to global
template <uint32_t ThreadBlockSize, uint32_t MaxVecBits, class T, class Layout1, class Layout2>
__device__ void cooperative_copy_default_gg(T const* g_in, T* g_out,
                                            Layout1 const& layout1, Layout2 const& layout2)
{
  using namespace cute;
  Tensor g_in_tensor  = make_tensor(make_gmem_ptr(g_in),  layout1);
  Tensor g_out_tensor = make_tensor(make_gmem_ptr(g_out), layout2);

  cooperative_copy<ThreadBlockSize, MaxVecBits>(threadIdx.x, g_in_tensor, g_out_tensor);
}

template <class Mode, uint32_t ThreadBlockSize, uint32_t MaxVecBits, class T, class Layout1, class Layout2>
__global__ void cooperative_copy_default_kernel(T const* g_in, T* g_out,
                                                Layout1 const layout1, Layout2 const layout2)
{
  if constexpr (std::is_same_v<Mode, cooperative_copy_mode::global_shared>) {
    cooperative_copy_default_gs<ThreadBlockSize, MaxVecBits>(g_in, g_out, layout1, layout2);
  } else if constexpr (std::is_same_v<Mode, cooperative_copy_mode::global_global>) {
    cooperative_copy_default_gg<ThreadBlockSize, MaxVecBits>(g_in, g_out, layout1, layout2);
  } else if constexpr (std::is_same_v<Mode, cooperative_copy_mode::shared_shared>) {
    cooperative_copy_default_ss<ThreadBlockSize, MaxVecBits>(g_in, g_out, layout1, layout2);
  }
}
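// In summary, the three modes dispatched by the kernel above are:
//   global_shared: gmem -> smem -> gmem round trip through a single shared buffer
//   global_global: direct gmem -> gmem copy, no shared memory involved
//   shared_shared: gmem -> smem1 -> smem2 -> gmem, where smem1 -> smem2 is the copy under test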
// Mode       - defines the memory types of src and dst in the cooperative_copy operation
// MaxVecBits - defines the max vectorization in the cooperative_copy operation, and enforces that
//              alignment on the pointers used, to ensure correct testing
template <class Mode, uint32_t ThreadBlockSize, uint32_t MaxVecBits, class T, class Layout1, class Layout2>
void test_cooperative_copy_default(Layout1 const& layout1, Layout2 const& layout2)
{
  using value_type = T;
  CUTE_STATIC_ASSERT_V(cute::size(layout1) == cute::size(layout2));

  auto gmem_layout_in  = layout1;
  auto gmem_layout_out =
    cute::conditional_return<std::is_same_v<Mode, cooperative_copy_mode::global_shared>>(layout1, layout2);

#if 0
  print("  "); print("layout1: ");    print(layout1);         print("\n");
  print("  "); print("layout2: ");    print(layout2);         print("\n");
  print("  "); print("threads: ");    print(ThreadBlockSize); print("\n");
  print("  "); print("maxvecbits: "); print(MaxVecBits);      print("\n");
#endif

  if constexpr (MaxVecBits < cute::sizeof_bits_v<T>) {
    GTEST_SKIP() << "Skipping test since MaxVecBits (=" << MaxVecBits
                 << ") < cute::sizeof_bits_v<T> (=" << cute::sizeof_bits_v<T> << ")";
  } else {
    constexpr auto max_vec_bytes = MaxVecBits / 8;
    static_assert((max_vec_bytes % sizeof(T)) == 0);

    uint32_t count = cute::cosize(gmem_layout_in);
    // Extra elements to force MaxVecBits alignment in global memory
    uint32_t extra_elements = max_vec_bytes / sizeof(value_type);

    // Allocate
    thrust::host_vector<value_type> h_in (count + extra_elements);
    thrust::host_vector<value_type> h_out(count + extra_elements);

    // Initialize
    Tensor h_in_tensor  = make_tensor(h_in.data()  + extra_elements, gmem_layout_in);
    Tensor h_out_tensor = make_tensor(h_out.data() + extra_elements, gmem_layout_out);
    for (int i = 0; i < cute::size(h_in_tensor); ++i) {
      h_in_tensor(i) = value_type(float(i));
      // A global-to-global copy must be compared against the unmodified value
      h_out_tensor(i) = std::is_same_v<Mode, cooperative_copy_mode::global_global>
                          ? value_type(float(i))
                          : value_type(float(2 * i));
    }

    // To GPU
    thrust::device_vector<value_type> d_in = h_in;
    thrust::device_vector<value_type> d_out(d_in.size(), value_type(float(-2)));

    // Add (MaxVecBits/8) bytes to shared memory, since the kernel shifts its smem pointer by that
    // many bytes to enforce alignment to (MaxVecBits/8) bytes; shared_shared needs a second buffer
    size_t shared_memory_bytes = (sizeof(value_type) * count) + max_vec_bytes;
    shared_memory_bytes += std::is_same_v<Mode, cooperative_copy_mode::shared_shared> * (sizeof(value_type) * count);

    // Launch
    auto coop_copy = cooperative_copy_default_kernel<Mode, ThreadBlockSize, MaxVecBits, value_type, Layout1, Layout2>;
    ASSERT_EQ(cudaFuncSetAttribute(coop_copy,
                                   cudaFuncAttributeMaxDynamicSharedMemorySize,
                                   static_cast<int>(shared_memory_bytes)),
              cudaSuccess);

    auto d_in_ptr  = thrust::raw_pointer_cast(d_in.data()  + extra_elements);
    auto d_out_ptr = thrust::raw_pointer_cast(d_out.data() + extra_elements);
    coop_copy<<<1, ThreadBlockSize, shared_memory_bytes>>>(d_in_ptr, d_out_ptr, layout1, layout2);

    cudaError_t result = cudaDeviceSynchronize();
    if (result != cudaSuccess) {
      cudaError_t error = cudaGetLastError();
      FAIL() << "Error at kernel sync: " << cudaGetErrorString(error) << "\n";
    }

    // Validate
    thrust::host_vector<value_type> h_result = d_out;
    Tensor h_result_tensor = make_tensor(h_result.data() + extra_elements, gmem_layout_out);
    for (int i = 0; i < cute::size(h_in_tensor); ++i) {
      ASSERT_EQ(h_result_tensor(i), h_out_tensor(i))
        << i << " - result:" << h_result_tensor(i) << " expected:" << h_out_tensor(i);
    }
  }
}

template <class T>
class SM80_CuTe_Ampere;

template <class Mode, class MaxVecBits>
class SM80_CuTe_Ampere<std::tuple<Mode, MaxVecBits>> : public testing::Test
{
public:
  using mode = Mode;
  static constexpr int max_vec_bits = MaxVecBits::value;
};

typedef testing::Types<
  std::tuple<cooperative_copy_mode::global_shared, cute::Int<128>>,
  std::tuple<cooperative_copy_mode::global_shared, cute::Int<64>>,
  std::tuple<cooperative_copy_mode::global_shared, cute::Int<32>>,
  std::tuple<cooperative_copy_mode::global_shared, cute::Int<16>>,
  std::tuple<cooperative_copy_mode::global_global, cute::Int<128>>,
  std::tuple<cooperative_copy_mode::global_global, cute::Int<64>>,
  std::tuple<cooperative_copy_mode::global_global, cute::Int<32>>,
  std::tuple<cooperative_copy_mode::global_global, cute::Int<16>>,
  std::tuple<cooperative_copy_mode::shared_shared, cute::Int<128>>,
  std::tuple<cooperative_copy_mode::shared_shared, cute::Int<64>>,
  std::tuple<cooperative_copy_mode::shared_shared, cute::Int<32>>,
  std::tuple<cooperative_copy_mode::shared_shared, cute::Int<16>>
> CooperativeCopyModeMaxVecBitsList;

TYPED_TEST_SUITE(SM80_CuTe_Ampere, CooperativeCopyModeMaxVecBitsList);
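// Each TYPED_TEST below therefore runs twelve times: every mode in {global_shared,
// global_global, shared_shared} crossed with every max vectorization in {128, 64, 32, 16} bits.
// Combinations where MaxVecBits is narrower than the element type are skipped inside
// test_cooperative_copy_default via GTEST_SKIP.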
// Fast path
TYPED_TEST(SM80_CuTe_Ampere, CooperativeCopyDefault1D)
{
  using value_type = float;
  constexpr uint32_t count = 512;
  auto gmem_layout = make_layout(make_shape(Int<count>{}));
  auto smem_layout = make_layout(make_shape(Int<count>{}));
  constexpr uint32_t thread_block_size = 64;
  test_cooperative_copy_default<typename TestFixture::mode,
                                thread_block_size,
                                TestFixture::max_vec_bits,
                                value_type>(gmem_layout, smem_layout);
}

TYPED_TEST(SM80_CuTe_Ampere, CooperativeCopyDefault1DFallback)
{
  using value_type = float;
  constexpr uint32_t count = 99;
  auto gmem_layout = make_layout(make_shape(Int<count>{}));
  auto smem_layout = make_layout(make_shape(Int<count>{}));
  constexpr uint32_t thread_block_size = 128;
  test_cooperative_copy_default<typename TestFixture::mode,
                                thread_block_size,
                                TestFixture::max_vec_bits,
                                value_type>(gmem_layout, smem_layout);
}

// Fast path
TYPED_TEST(SM80_CuTe_Ampere, CooperativeCopyDefault2D)
{
  using value_type = float;
  constexpr uint32_t x = 32;
  constexpr uint32_t y = 32;
  auto gmem_layout = make_layout(make_shape(Int<x>{}, Int<y>{}));
  auto smem_layout = make_layout(make_shape(Int<x>{}, Int<y>{}));
  constexpr uint32_t thread_block_size = 64;
  test_cooperative_copy_default<typename TestFixture::mode,
                                thread_block_size,
                                TestFixture::max_vec_bits,
                                value_type>(gmem_layout, smem_layout);
}

#if 0
// Fast path
TYPED_TEST(SM80_CuTe_Ampere, CooperativeCopyDefault2DDynamicStrides)
{
  using value_type = float;
  constexpr uint32_t x = 32;
  constexpr uint32_t y = 32;
  auto gmem_layout = make_layout(make_shape(Int<x>{}, Int<y>{}), make_stride(1, x));
  auto smem_layout = make_layout(make_shape(Int<x>{}, Int<y>{}), make_stride(1, x));
  constexpr uint32_t thread_block_size = 64;
  test_cooperative_copy_default<typename TestFixture::mode,
                                thread_block_size,
                                TestFixture::max_vec_bits,
                                value_type>(gmem_layout, smem_layout);
}

// Fast path
TYPED_TEST(SM80_CuTe_Ampere, CooperativeCopyDefault2DMixedStrides)
{
  using value_type = float;
  constexpr uint32_t x = 32;
  constexpr uint32_t y = 32;
  auto gmem_layout = make_layout(make_shape(Int<x>{}, Int<y>{}));
  auto smem_layout = make_layout(make_shape(Int<x>{}, Int<y>{}), make_stride(1, x));
  constexpr uint32_t thread_block_size = 64;
  test_cooperative_copy_default<typename TestFixture::mode,
                                thread_block_size,
                                TestFixture::max_vec_bits,
                                value_type>(gmem_layout, smem_layout);
}
#endif

TYPED_TEST(SM80_CuTe_Ampere, CooperativeCopyDefault2DFallback)
{
  using value_type = float;
  constexpr uint32_t x = 37;
  constexpr uint32_t y = 37;
  auto gmem_layout = make_layout(make_shape(Int<x>{}, Int<y>{}));
  auto smem_layout = make_layout(make_shape(Int<x>{}, Int<y>{}));
  constexpr uint32_t thread_block_size = 64;
  test_cooperative_copy_default<typename TestFixture::mode,
                                thread_block_size,
                                TestFixture::max_vec_bits,
                                value_type>(gmem_layout, smem_layout);
}

// Fast Path
TYPED_TEST(SM80_CuTe_Ampere, CooperativeCopyDefault2DCustomStride)
{
  using value_type = float;
  constexpr uint32_t x = 16;
  constexpr uint32_t y = 16;
  auto gmem_layout = make_layout(make_shape(Int<x>{}, Int<y>{}), make_stride(Int<y>{}, Int<1>{}));
  auto smem_layout = make_layout(make_shape(Int<x>{}, Int<y>{}), make_stride(Int<1>{}, Int<x>{}));
  constexpr uint32_t thread_block_size = 64;
  test_cooperative_copy_default<typename TestFixture::mode,
                                thread_block_size,
                                TestFixture::max_vec_bits,
                                value_type>(gmem_layout, smem_layout);
}

// Fast path
TYPED_TEST(SM80_CuTe_Ampere, CooperativeCopyDefault3D)
{
  using value_type = cute::half_t;
  constexpr uint32_t x = 8;
  constexpr uint32_t y = 8;
  constexpr uint32_t z = 16;
  auto gmem_layout = make_layout(make_shape(Int<x>{}, Int<y>{}, Int<z>{}));
  auto smem_layout = make_layout(make_shape(Int<x>{}, Int<y>{}, Int<z>{}));
  constexpr uint32_t thread_block_size = 64;
  test_cooperative_copy_default<typename TestFixture::mode,
                                thread_block_size,
                                TestFixture::max_vec_bits,
                                value_type>(gmem_layout, smem_layout);
}

// Fast path
TYPED_TEST(SM80_CuTe_Ampere, CooperativeCopyDefault2Dto3D)
{
  using value_type = double;
  constexpr uint32_t x = 16;
  constexpr uint32_t y = 16;
  constexpr uint32_t z = 4;
  auto gmem_layout = make_layout(make_shape(Int<x * z>{}, Int<y>{}));
  auto smem_layout = make_layout(make_shape(Int<x>{}, Int<y>{}, Int<z>{}));
  constexpr uint32_t thread_block_size = 64;
  test_cooperative_copy_default<typename TestFixture::mode,
                                thread_block_size,
                                TestFixture::max_vec_bits,
                                value_type>(gmem_layout, smem_layout);
}

// Fast path
TYPED_TEST(SM80_CuTe_Ampere, CooperativeCopyDefaultCustom1)
{
  using value_type = double;
  auto gmem_layout = make_layout(
    make_shape (Int<8>{}, make_shape (Int<2>{}, Int<2>{})),
    make_stride(Int<2>{}, make_stride(Int<1>{}, Int<16>{}))
  );
  auto smem_layout = make_layout(
    make_shape (Int<8>{}, Int<4>{}),
    make_stride(Int<4>{}, Int<1>{})
  );
  constexpr uint32_t thread_block_size = 8;
  test_cooperative_copy_default<typename TestFixture::mode,
                                thread_block_size,
                                TestFixture::max_vec_bits,
                                value_type>(gmem_layout, smem_layout);
}

// Fast Path
TYPED_TEST(SM80_CuTe_Ampere, CooperativeCopyDefaultCustom2)
{
  using value_type = float;
  auto gmem_layout = make_layout(
    make_shape (make_shape (Int<4>{}, Int<2>{}), make_shape (Int<2>{},  Int<2>{})),
    make_stride(make_stride(Int<4>{}, Int<1>{}), make_stride(Int<16>{}, Int<2>{}))
  );
  auto smem_layout = make_layout(
    make_shape (make_shape (Int<2>{},  Int<2>{}, Int<2>{}), make_shape (Int<2>{}, Int<2>{})),
    make_stride(make_stride(Int<16>{}, Int<4>{}, Int<1>{}), make_stride(Int<8>{}, Int<2>{}))
  );
  constexpr uint32_t thread_block_size = 16;
  test_cooperative_copy_default<typename TestFixture::mode,
                                thread_block_size,
                                TestFixture::max_vec_bits,
                                value_type>(gmem_layout, smem_layout);
}
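// The tests below compose layouts with cute::Swizzle<B, M, S>, which XORs the B offset bits
// starting at bit M+S into the B bits starting at bit M, permuting elements within a tile to
// avoid shared-memory bank conflicts. A host-side way to inspect such a layout (illustrative,
// not part of the test flow):
//
//   auto sw = composition(Swizzle<3, 3, 3>{}, Layout<Shape<_8, _64>, Stride<_64, _1>>{});
//   print(sw); print("\n");   // prints the swizzle-composed layout expression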
// Fast Path
TYPED_TEST(SM80_CuTe_Ampere, CooperativeCopyDefaultSwizzle1)
{
  using value_type = float;
  auto gmem_layout = Layout<Shape<_8, _64>, Stride<_64, _1>>{};
  auto smem_layout = composition(Swizzle<3, 3, 3>{}, Layout<Shape<_8, _64>, Stride<_64, _1>>{});
  constexpr uint32_t thread_block_size = 128;
  test_cooperative_copy_default<typename TestFixture::mode,
                                thread_block_size,
                                TestFixture::max_vec_bits,
                                value_type>(gmem_layout, smem_layout);
}

// Fast Path
TYPED_TEST(SM80_CuTe_Ampere, CooperativeCopyDefaultSwizzle2)
{
  using value_type = cute::half_t;
  auto gmem_layout = make_layout(make_shape(Int<64>{}, Int<64>{}));
  auto smem_atom_layout = composition(Swizzle<3, 2, 3>{}, Layout<Shape<_8, _32>, Stride<_32, _1>>{});
  auto smem_layout = tile_to_shape(
    smem_atom_layout,
    make_shape(shape<0>(gmem_layout), shape<1>(gmem_layout))
  );
  constexpr uint32_t thread_block_size = 128;
  test_cooperative_copy_default<typename TestFixture::mode,
                                thread_block_size,
                                TestFixture::max_vec_bits,
                                value_type>(gmem_layout, smem_layout);
}

// Fast Path
TYPED_TEST(SM80_CuTe_Ampere, CooperativeCopyDefaultSwizzle3)
{
  using value_type = cute::half_t;
  auto gmem_layout = make_layout(make_shape(Int<64>{}, Int<64>{}));
  auto smem_atom_layout = composition(Swizzle<2, 4, 3>{}, Layout<Shape<_8, _64>, Stride<_64, _1>>{});
  auto smem_layout = tile_to_shape(
    smem_atom_layout,
    make_shape(shape<0>(gmem_layout), shape<1>(gmem_layout))
  );
  constexpr uint32_t thread_block_size = 128;
  test_cooperative_copy_default<typename TestFixture::mode,
                                thread_block_size,
                                TestFixture::max_vec_bits,
                                value_type>(gmem_layout, smem_layout);
}

// Fast path
TYPED_TEST(SM80_CuTe_Ampere, CooperativeCopyDefaultSwizzle4)
{
  using value_type = cute::half_t;
  auto gmem_atom_layout = composition(Swizzle<3, 2, 3>{}, Layout<Shape<_8, _32>, Stride<_32, _1>>{});
  auto smem_layout = make_layout(make_shape(Int<64>{}, Int<64>{}));
  auto gmem_layout = tile_to_shape(
    gmem_atom_layout,
    make_shape(shape<0>(smem_layout), shape<1>(smem_layout))
  );
  constexpr uint32_t thread_block_size = 128;
  test_cooperative_copy_default<typename TestFixture::mode,
                                thread_block_size,
                                TestFixture::max_vec_bits,
                                value_type>(gmem_layout, smem_layout);
}

// Needs coalescing to work on the fast path
// OK if we enforce the slow path
// Problem: wrong condition when we select between the slow and fast path
TYPED_TEST(SM80_CuTe_Ampere, CooperativeCopyDefaultCoalesceToCompose)
{
  constexpr int m = 96;
  using value_type = cute::half_t;
  auto gmem_layout = make_layout(make_shape(Int<m>{}, Int<m>{}), GenColMajor{});
  auto smem_layout = make_layout(make_shape(Int<m>{}, Int<m>{}), GenColMajor{});
  constexpr uint32_t thread_block_size = 128;
  test_cooperative_copy_default<typename TestFixture::mode,
                                thread_block_size,
                                TestFixture::max_vec_bits,
                                value_type>(gmem_layout, smem_layout);
}

// Fast path (default): OK
// Slow path (enforced): OK
TYPED_TEST(SM80_CuTe_Ampere, CooperativeCopyDefaultSwizzle5)
{
  constexpr int m = 64;
  constexpr int n = 128;
  using value_type = cute::half_t;
  auto gmem_layout = make_layout(make_shape(Int<m>{}, Int<n>{}), GenColMajor{});
  // auto smem_layout = make_layout(make_shape(Int<m>{}, Int<n>{}), GenColMajor{});
  auto smem_atom_layout = composition(Swizzle<3,3,3>{}, Layout<Shape<_8, _64>, Stride<_64, _1>>{});
  auto smem_layout = tile_to_shape(
    smem_atom_layout,
    make_shape(shape<0>(gmem_layout), shape<1>(gmem_layout))
  );
  constexpr uint32_t thread_block_size = 128;
  test_cooperative_copy_default<typename TestFixture::mode,
                                thread_block_size,
                                TestFixture::max_vec_bits,
                                value_type>(gmem_layout, smem_layout);
}

// If the condition is not strict enough, this goes down the fast path
// This test needs a check of whether CuTe can compose the layouts
// Fast path (default): fail
// Slow path (enforced): should go to the vectorized naive path
TYPED_TEST(SM80_CuTe_Ampere, CooperativeCopyDefaultSwizzleNaiveVectorizable)
{
  constexpr int m = 192;
  constexpr int n = 64;
  using value_type = cute::half_t;
  auto gmem_layout = make_layout(make_shape(Int<m>{}, Int<n>{}), GenColMajor{});
  // auto smem_layout = make_layout(make_shape(Int<m>{}, Int<n>{}), GenColMajor{});
  auto smem_atom_layout = composition(Swizzle<3,3,3>{}, Layout<Shape<_64, _8>, Stride<_1, _64>>{});
  auto smem_layout = tile_to_shape(
    smem_atom_layout,
    shape(gmem_layout)
  );
  constexpr uint32_t thread_block_size = 128;
  test_cooperative_copy_default<typename TestFixture::mode,
                                thread_block_size,
                                TestFixture::max_vec_bits,
                                value_type>(gmem_layout, smem_layout);
}

// fast path: ok (chosen)
// slow path: ok
TYPED_TEST(SM80_CuTe_Ampere, CooperativeCopyDefaultRowMajorSmall)
{
  constexpr int m = 24;
  constexpr int n = 8;
  using value_type = cute::half_t;
  auto gmem_layout = make_layout(make_shape(Int<m>{}, Int<n>{}), GenRowMajor{});
  auto smem_layout = make_layout(make_shape(Int<m>{}, Int<n>{}), GenRowMajor{});
  constexpr uint32_t thread_block_size = 64;
  test_cooperative_copy_default<typename TestFixture::mode,
                                thread_block_size,
                                TestFixture::max_vec_bits,
                                value_type>(gmem_layout, smem_layout);
}
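// Note on the "fast path" / "slow path" annotations in this file: cooperative_copy takes the
// fast path when it can find a common vectorized tiling of the source and destination layouts,
// so every thread issues wide accesses; otherwise it falls back to a slow path that copies
// more naively, vectorizing only where the layouts still allow it.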
// fast path: doesn't apply
// slow path: ok
TYPED_TEST(SM80_CuTe_Ampere, CooperativeCopyDefaultSlowPath)
{
  constexpr int m = 67;
  constexpr int n = 67;
  using value_type = cute::half_t;
  auto gmem_layout = make_layout(make_shape(Int<m>{}, Int<n>{}), GenRowMajor{});
  auto smem_layout = make_layout(make_shape(Int<m>{}, Int<n>{}), GenRowMajor{});
  constexpr uint32_t thread_block_size = 64;
  test_cooperative_copy_default<typename TestFixture::mode,
                                thread_block_size,
                                TestFixture::max_vec_bits,
                                value_type>(gmem_layout, smem_layout);
}

// fast path: doesn't apply
// slow path: should vectorize
TYPED_TEST(SM80_CuTe_Ampere, CooperativeCopyDefaultSwizzleSlowPathVectorize)
{
  constexpr int m = 68;
  constexpr int n = 68;
  using value_type = cute::half_t;
  auto gmem_layout = make_layout(make_shape(Int<m>{}, Int<n>{}), GenRowMajor{});
  auto smem_layout = make_layout(make_shape(Int<m>{}, Int<n>{}), GenRowMajor{});
  constexpr uint32_t thread_block_size = 32;
  test_cooperative_copy_default<typename TestFixture::mode,
                                thread_block_size,
                                TestFixture::max_vec_bits,
                                value_type>(gmem_layout, smem_layout);
}

TYPED_TEST(SM80_CuTe_Ampere, CooperativeCopy48x48Swizzle)
{
  constexpr int m = 48;
  constexpr int n = 48;
  using value_type = cute::half_t;
  auto gmem_layout = make_layout(make_shape(Int<m>{}, Int<n>{}), GenRowMajor{});
  auto smem_layout = composition(Swizzle<2,2,3>{},
                                 Layout<Shape <Shape <_16,   _3>, _48>,
                                        Stride<Stride< _1, _768>, _16>>{});
  constexpr uint32_t thread_block_size = 8 * 32;
  test_cooperative_copy_default<typename TestFixture::mode,
                                thread_block_size,
                                TestFixture::max_vec_bits,
                                value_type>(gmem_layout, smem_layout);
}
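// A new case can follow the same pattern; a sketch (the layouts and block size below are
// arbitrary examples, not an existing test):
//
//   TYPED_TEST(SM80_CuTe_Ampere, CooperativeCopyMyNewCase)
//   {
//     using value_type = float;
//     auto gmem_layout = make_layout(make_shape(Int<32>{}, Int<8>{}), GenRowMajor{});
//     auto smem_layout = make_layout(make_shape(Int<32>{}, Int<8>{}), GenColMajor{});
//     constexpr uint32_t thread_block_size = 64;
//     test_cooperative_copy_default<typename TestFixture::mode,
//                                   thread_block_size,
//                                   TestFixture::max_vec_bits,
//                                   value_type>(gmem_layout, smem_layout);
//   }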