| 
									
										
										
										
											2024-03-20 05:51:04 +08:00
										 |  |  | /***************************************************************************************************
 | 
					
						
							|  |  |  |  * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | 
					
						
							|  |  |  |  * SPDX-License-Identifier: BSD-3-Clause | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * Redistribution and use in source and binary forms, with or without | 
					
						
							|  |  |  |  * modification, are permitted provided that the following conditions are met: | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * 1. Redistributions of source code must retain the above copyright notice, this | 
					
						
							|  |  |  |  * list of conditions and the following disclaimer. | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * 2. Redistributions in binary form must reproduce the above copyright notice, | 
					
						
							|  |  |  |  * this list of conditions and the following disclaimer in the documentation | 
					
						
							|  |  |  |  * and/or other materials provided with the distribution. | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * 3. Neither the name of the copyright holder nor the names of its | 
					
						
							|  |  |  |  * contributors may be used to endorse or promote products derived from | 
					
						
							|  |  |  |  * this software without specific prior written permission. | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | 
					
						
							|  |  |  |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
					
						
							|  |  |  |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | 
					
						
							|  |  |  |  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | 
					
						
							|  |  |  |  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | 
					
						
							|  |  |  |  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | 
					
						
							|  |  |  |  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | 
					
						
							|  |  |  |  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | 
					
						
							|  |  |  |  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 
					
						
							|  |  |  |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  **************************************************************************************************/ | 
					
						
							|  |  |  | #pragma once
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #include <cute/config.hpp>
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-07-29 20:46:24 +08:00
										 |  |  | #include <cute/tensor_impl.hpp>
 | 
					
						
							| 
									
										
										
										
											2024-03-20 05:51:04 +08:00
										 |  |  | 
 | 
					
						
							|  |  |  | #include <cute/atom/copy_atom.hpp>
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | namespace cute | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | //
 | 
					
						
							|  |  |  | // Prefetch global tensors into L2
 | 
					
						
							|  |  |  | //
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | template <uint32_t NumThreads, uint32_t FetchBytes = 64, | 
					
						
							|  |  |  |           class GEngine, class GLayout> | 
					
						
							|  |  |  | CUTE_HOST_DEVICE | 
					
						
							|  |  |  | void | 
					
						
							|  |  |  | cooperative_prefetch(uint32_t                 const& tid, | 
					
						
							|  |  |  |                      Tensor<GEngine, GLayout> const& src) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |   static_assert(is_gmem<GEngine>::value, "Expected global tensor for prefetch"); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   constexpr int V = decltype(max_common_vector(src, src))::value; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   if constexpr (V > 1) { | 
					
						
							|  |  |  |     // L2 sector is 32B, default fetch granularity is 64B
 | 
					
						
							|  |  |  |     using VecType = conditional_t<(V * sizeof_bits_v<typename GEngine::value_type>) < (FetchBytes * 8), | 
					
						
							|  |  |  |                                   ArrayEngine<typename GEngine::value_type, V>, | 
					
						
							|  |  |  |                                   uint8_t[FetchBytes]                         >; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Tensor src_v = recast<VecType const>(src); | 
					
						
							|  |  |  |     CUTE_UNROLL | 
					
						
							|  |  |  |     for (int i = tid; i < size(src_v); i += NumThreads) { | 
					
						
							|  |  |  |       prefetch(raw_pointer_cast(&src_v(i))); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |   } else { | 
					
						
							|  |  |  |     CUTE_UNROLL | 
					
						
							|  |  |  |     for (int i = tid; i < size(src); i += NumThreads) { | 
					
						
							|  |  |  |       prefetch(raw_pointer_cast(&src(i))); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | template <class GEngine, class GLayout> | 
					
						
							|  |  |  | CUTE_HOST_DEVICE | 
					
						
							|  |  |  | void | 
					
						
							|  |  |  | prefetch(Tensor<GEngine, GLayout> const& src) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |   return cooperative_prefetch<1>(0, src); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // Prefetch with copy atom
 | 
					
						
							namespace detail {

							// Compile-time detector: has_prefetch<Op> is true exactly when Op declares a
							// nested type named PREFETCH.

							// Primary template, chosen when `typename Op::PREFETCH` is not a valid type.
							template <class Op, class Enable = void>
							constexpr bool has_prefetch = false;

							// Partial specialization, viable only when `typename Op::PREFETCH` names a type.
							template <class Op>
							constexpr bool has_prefetch<Op, void_t<typename Op::PREFETCH>> = true;

							} // end namespace detail
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | template <class CopyOp, class... CT_Args, class... CA_Args, | 
					
						
							|  |  |  |           class GEngine, class GLayout> | 
					
						
							|  |  |  | CUTE_HOST_DEVICE | 
					
						
							|  |  |  | void | 
					
						
							|  |  |  | prefetch(Copy_Atom<Copy_Traits<CopyOp, CT_Args...>, CA_Args...> const& atom, | 
					
						
							|  |  |  |          Tensor<GEngine, GLayout>                               const& src) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |   if constexpr (detail::has_prefetch<CopyOp>) { | 
					
						
							|  |  |  |     using Prefetch_Traits = Copy_Traits<typename CopyOp::PREFETCH, CT_Args...>; | 
					
						
							|  |  |  |     using Prefetch_Atom = Copy_Atom<Prefetch_Traits, CA_Args...>; | 
					
						
							|  |  |  |     Prefetch_Atom prefetch_atom{atom}; | 
					
						
							|  |  |  |     auto& dst = const_cast<Tensor<GEngine, GLayout>&>(src); // dst is ignored for prefetch atoms
 | 
					
						
							|  |  |  |     return copy(prefetch_atom, src, dst); | 
					
						
							|  |  |  |   } else { | 
					
						
							|  |  |  |     return prefetch(src); | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #if defined(CUTE_COPY_ATOM_TMA_SM90_ENABLED)
 | 
					
						
							|  |  |  | template <class... CT_Args, | 
					
						
							|  |  |  |           class SrcEngine, class SrcLayout> | 
					
						
							|  |  |  | CUTE_HOST_DEVICE | 
					
						
							|  |  |  | void | 
					
						
							|  |  |  | prefetch(Copy_Traits<SM90_BULK_COPY_AUTO, CT_Args...> const& atom, | 
					
						
							|  |  |  |          Tensor<SrcEngine, SrcLayout>                 const& src) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |   using SrcType = typename SrcEngine::value_type; | 
					
						
							|  |  |  |   static_assert(is_gmem<SrcEngine>::value, "Expected global tensor for L2 prefetch"); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   auto tiler = max_common_layout(src, src); | 
					
						
							|  |  |  |   constexpr int vec_elem = decltype(size(tiler))::value; | 
					
						
							|  |  |  |   constexpr int vec_bits = vec_elem * sizeof_bits_v<SrcType>; | 
					
						
							|  |  |  |   static_assert(vec_bits >= 128, "Expected at least 128-bits for BLKCP"); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   // Construct a new concrete Atom of the vector size
 | 
					
						
							|  |  |  |   auto bulk_atom = Copy_Atom<Copy_Traits<SM90_BULK_COPY_G2S, Int<vec_bits>>, SrcType>{}; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   return prefetch(bulk_atom, logical_divide(src, tiler)); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | // Backwards-compat. Throw out any extra Copy_Atom args.
 | 
					
						
							|  |  |  | template <class... CT_Args, class... CA_Args, | 
					
						
							|  |  |  |           class SrcEngine, class SrcLayout> | 
					
						
							|  |  |  | CUTE_HOST_DEVICE | 
					
						
							|  |  |  | void | 
					
						
							|  |  |  | prefetch(Copy_Atom<Copy_Traits<SM90_BULK_COPY_AUTO, CT_Args...>, CA_Args...> const& atom, | 
					
						
							|  |  |  |          Tensor<SrcEngine, SrcLayout>                                        const& src) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |   return prefetch(static_cast<Copy_Traits<SM90_BULK_COPY_AUTO, CT_Args...> const&>(atom), src); | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | #endif // #if defined(CUTE_COPY_ATOM_TMA_SM90_ENABLED)
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | } // end namespace cute
 |