2023-01-24 09:55:28 +08:00
|
|
|
/***************************************************************************************************
|
|
|
|
* Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
|
|
* SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions are met:
|
|
|
|
*
|
|
|
|
* 1. Redistributions of source code must retain the above copyright notice, this
|
|
|
|
* list of conditions and the following disclaimer.
|
|
|
|
*
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
|
|
* this list of conditions and the following disclaimer in the documentation
|
|
|
|
* and/or other materials provided with the distribution.
|
|
|
|
*
|
|
|
|
* 3. Neither the name of the copyright holder nor the names of its
|
|
|
|
* contributors may be used to endorse or promote products derived from
|
|
|
|
* this software without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
|
|
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
|
|
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
|
|
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
|
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
*
|
|
|
|
**************************************************************************************************/
|
|
|
|
#pragma once
|
|
|
|
|
|
|
|
#include <cute/config.hpp>
|
|
|
|
|
|
|
|
#include <cute/tensor.hpp>
|
|
|
|
#include <cute/tensor_predicate.hpp>
|
|
|
|
|
|
|
|
#include <cute/atom/copy_atom.hpp>
|
|
|
|
|
|
|
|
namespace cute
|
|
|
|
{
|
|
|
|
|
|
|
|
//
|
|
|
|
// Accept mutable temporaries
|
|
|
|
//
|
|
|
|
|
|
|
|
template <class PrdTensor,
|
|
|
|
class SrcEngine, class SrcLayout,
|
|
|
|
class DstEngine, class DstLayout>
|
|
|
|
CUTE_HOST_DEVICE
|
|
|
|
void
|
|
|
|
copy_if(PrdTensor const& pred,
|
|
|
|
Tensor<SrcEngine, SrcLayout> const& src,
|
|
|
|
Tensor<DstEngine, DstLayout> && dst)
|
|
|
|
{
|
|
|
|
return copy_if(pred, src, dst);
|
|
|
|
}
|
|
|
|
|
|
|
|
template <class... CopyArgs,
|
|
|
|
class PrdTensor,
|
|
|
|
class SrcEngine, class SrcLayout,
|
|
|
|
class DstEngine, class DstLayout>
|
|
|
|
CUTE_HOST_DEVICE
|
|
|
|
void
|
|
|
|
copy_if(Copy_Atom<CopyArgs...> const& copy_atom,
|
|
|
|
PrdTensor const& pred,
|
|
|
|
Tensor<SrcEngine, SrcLayout> const& src,
|
|
|
|
Tensor<DstEngine, DstLayout> && dst)
|
|
|
|
{
|
|
|
|
return copy_if(copy_atom, pred, src, dst);
|
|
|
|
}
|
|
|
|
|
|
|
|
template <class VecType,
|
|
|
|
class SrcEngine, class SrcLayout,
|
|
|
|
class DstEngine, class DstLayout>
|
|
|
|
CUTE_HOST_DEVICE
|
|
|
|
void
|
|
|
|
copy_vec(Tensor<SrcEngine, SrcLayout> const& src,
|
|
|
|
Tensor<DstEngine, DstLayout> && dst)
|
|
|
|
{
|
|
|
|
return copy_vec<VecType>(src, dst);
|
|
|
|
}
|
|
|
|
|
|
|
|
template <class SrcEngine, class SrcLayout,
|
|
|
|
class DstEngine, class DstLayout>
|
|
|
|
CUTE_HOST_DEVICE
|
|
|
|
void
|
|
|
|
copy(Tensor<SrcEngine, SrcLayout> const& src,
|
|
|
|
Tensor<DstEngine, DstLayout> && dst)
|
|
|
|
{
|
|
|
|
return copy(src, dst);
|
|
|
|
}
|
|
|
|
|
|
|
|
template <class... CopyArgs,
|
|
|
|
class SrcEngine, class SrcLayout,
|
|
|
|
class DstEngine, class DstLayout>
|
|
|
|
CUTE_HOST_DEVICE
|
|
|
|
void
|
|
|
|
copy(Copy_Atom<CopyArgs...> const& copy_atom,
|
|
|
|
Tensor<SrcEngine, SrcLayout> const& src,
|
|
|
|
Tensor<DstEngine, DstLayout> && dst)
|
|
|
|
{
|
|
|
|
return copy(copy_atom, src, dst);
|
|
|
|
}
|
|
|
|
|
|
|
|
//
|
|
|
|
// copy_if -- Predicated Copy
|
|
|
|
//
|
|
|
|
|
|
|
|
template <class PrdTensor,
|
|
|
|
class SrcEngine, class SrcLayout,
|
|
|
|
class DstEngine, class DstLayout>
|
|
|
|
CUTE_HOST_DEVICE
|
|
|
|
void
|
|
|
|
copy_if(PrdTensor const& pred,
|
|
|
|
Tensor<SrcEngine, SrcLayout> const& src,
|
|
|
|
Tensor<DstEngine, DstLayout> & dst)
|
|
|
|
{
|
|
|
|
auto copy_op = select_elementwise_copy(src, dst);
|
|
|
|
|
|
|
|
CUTE_UNROLL
|
|
|
|
for (int i = 0; i < size(src); ++i) {
|
|
|
|
if (pred(i)) {
|
|
|
|
copy_op.copy(src(i), dst(i));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
//
|
|
|
|
// copy_if -- Predicated CopyAtom
|
|
|
|
//
|
|
|
|
|
2023-11-02 23:09:05 +08:00
|
|
|
namespace detail {
|
|
|
|
|
|
|
|
// Trait that detects if atom's traits has a member function with(bool)
|
|
|
|
template<typename, typename Enable = void>
|
|
|
|
constexpr bool has_with_bool = false;
|
|
|
|
|
|
|
|
template<typename T>
|
|
|
|
constexpr bool has_with_bool<T, cute::void_t<decltype(declval<typename T::Traits>().with(declval<bool>()))>> = true;
|
|
|
|
|
|
|
|
} // end namespace detail
|
|
|
|
|
2023-01-24 09:55:28 +08:00
|
|
|
template <class... CopyArgs,
|
|
|
|
class PredTensor,
|
|
|
|
class SrcEngine, class SrcLayout,
|
|
|
|
class DstEngine, class DstLayout>
|
|
|
|
CUTE_HOST_DEVICE
|
|
|
|
void
|
|
|
|
copy_if(Copy_Atom<CopyArgs...> const& copy_atom,
|
|
|
|
PredTensor const& pred, // (Rest...)
|
|
|
|
Tensor<SrcEngine, SrcLayout> const& src, // (V,Rest...)
|
|
|
|
Tensor<DstEngine, DstLayout> & dst) // (V,Rest...)
|
|
|
|
{
|
|
|
|
static_assert(SrcLayout::rank == DstLayout::rank, "CopyAtom rank-mismatch.");
|
|
|
|
if constexpr (SrcLayout::rank == 1) { // Dispatch the copy
|
|
|
|
copy_atom.call(src, dst);
|
|
|
|
} else { // Loop over all but the first mode
|
|
|
|
constexpr int R = SrcLayout::rank;
|
|
|
|
auto src_v = group_modes<1,R>(src);
|
|
|
|
auto dst_v = group_modes<1,R>(dst);
|
|
|
|
CUTE_UNROLL
|
|
|
|
for (int i = 0; i < size<1>(src_v); ++i) {
|
2023-11-02 23:09:05 +08:00
|
|
|
// If copy traits can be transformed with a predicate value, do it, otherwise branch here
|
|
|
|
if constexpr (detail::has_with_bool<Copy_Atom<CopyArgs...>>) {
|
|
|
|
copy_atom.with(pred(i)).call(src_v(_,i), dst_v(_,i));
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
if (pred(i)) {
|
|
|
|
copy_atom.call(src_v(_,i), dst_v(_,i));
|
|
|
|
}
|
2023-01-24 09:55:28 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
//
|
|
|
|
// copy_vec -- attempt vectorized copy with VecType
|
|
|
|
//
|
|
|
|
|
|
|
|
template <class VecType,
|
|
|
|
class SrcEngine, class SrcLayout,
|
|
|
|
class DstEngine, class DstLayout>
|
|
|
|
CUTE_HOST_DEVICE
|
|
|
|
void
|
|
|
|
copy_vec(Tensor<SrcEngine, SrcLayout> const& src,
|
|
|
|
Tensor<DstEngine, DstLayout> & dst)
|
|
|
|
{
|
2023-11-02 23:09:05 +08:00
|
|
|
using SrcType = typename SrcEngine::element_type;
|
|
|
|
using DstType = typename DstEngine::element_type;
|
2023-04-15 11:19:34 +08:00
|
|
|
if constexpr (sizeof(SrcType) == sizeof(DstType) && sizeof(VecType) > sizeof(DstType))
|
2023-01-24 09:55:28 +08:00
|
|
|
{
|
|
|
|
/* @pre is_aligned<N>(src.data()) &&
|
|
|
|
* is_aligned<N>(dst.data())
|
|
|
|
*/
|
2023-11-02 23:09:05 +08:00
|
|
|
using SrcVecType = conditional_t<is_volatile_v<SrcType>, VecType const volatile, VecType const>;
|
|
|
|
using DstVecType = conditional_t<is_volatile_v<DstType>, VecType volatile, VecType >;
|
|
|
|
auto src_v = recast<SrcVecType>(src);
|
|
|
|
auto dst_v = recast<DstVecType>(dst);
|
2023-01-24 09:55:28 +08:00
|
|
|
|
|
|
|
#if 0
|
|
|
|
if (thread0()) {
|
|
|
|
print("copy_vec -- vectorizing copy from %3db to %3db\n", int(8*sizeof(SrcType)), int(8*sizeof(VecType)));
|
|
|
|
print(" "); print(layout(src)); print(" => "); print(layout(src_v)); print("\n");
|
|
|
|
print(" "); print(layout(dst)); print(" => "); print(layout(dst_v)); print("\n");
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
return copy_if(TrivialPredTensor{}, src_v, dst_v);
|
|
|
|
} else {
|
|
|
|
#if 0
|
|
|
|
if (thread0()) {
|
|
|
|
print("copy_vec -- not vectorizing, copy with %3db and %3db\n", int(8*sizeof(SrcType)), int(8*sizeof(DstType)));
|
|
|
|
print(" "); print(layout(src)); print("\n");
|
|
|
|
print(" "); print(layout(dst)); print("\n");
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
return copy_if(TrivialPredTensor{}, src, dst);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
//
|
|
|
|
// copy -- auto-vectorizing copy
|
|
|
|
//
|
|
|
|
|
|
|
|
template <class SrcEngine, class SrcLayout,
|
|
|
|
class DstEngine, class DstLayout>
|
|
|
|
CUTE_HOST_DEVICE
|
|
|
|
void
|
|
|
|
copy(Tensor<SrcEngine, SrcLayout> const& src,
|
|
|
|
Tensor<DstEngine, DstLayout> & dst)
|
|
|
|
{
|
|
|
|
constexpr int N = decltype(max_common_vector(src, dst))::value;
|
|
|
|
|
|
|
|
#if 0
|
|
|
|
if (thread0()) {
|
|
|
|
print("copy -- found a max_common_vector of %d\n", N);
|
|
|
|
print(" "); print(src.data()); print(" o "); print(layout(src)); print("\n");
|
|
|
|
print(" "); print(dst.data()); print(" o "); print(layout(dst)); print("\n");
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
if constexpr (N <= 1) {
|
|
|
|
return copy_if(TrivialPredTensor{}, src, dst);
|
|
|
|
} else {
|
|
|
|
constexpr int vec_bits = N * sizeof_bits<typename SrcEngine::value_type>::value;
|
|
|
|
using VecType = uint_bit_t<cute::min(128, vec_bits)>;
|
|
|
|
return copy_vec<VecType>(src, dst);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
//
|
|
|
|
// copy -- CopyAtom
|
|
|
|
//
|
|
|
|
|
|
|
|
template <class... CopyArgs,
|
|
|
|
class SrcEngine, class SrcLayout,
|
|
|
|
class DstEngine, class DstLayout>
|
|
|
|
CUTE_HOST_DEVICE
|
|
|
|
void
|
|
|
|
copy(Copy_Atom<CopyArgs...> const& copy_atom,
|
|
|
|
Tensor<SrcEngine, SrcLayout> const& src,
|
|
|
|
Tensor<DstEngine, DstLayout> & dst)
|
|
|
|
{
|
|
|
|
return copy_if(copy_atom, TrivialPredTensor{}, src, dst);
|
|
|
|
}
|
|
|
|
|
|
|
|
template <class... CopyArgs,
|
|
|
|
class SrcEngine, class SrcLayout,
|
|
|
|
class DstEngine, class DstLayout>
|
|
|
|
CUTE_HOST_DEVICE
|
|
|
|
void
|
|
|
|
copy(Copy_Atom<DefaultCopy, CopyArgs...> const&,
|
|
|
|
Tensor<SrcEngine, SrcLayout> const& src,
|
|
|
|
Tensor<DstEngine, DstLayout> & dst)
|
|
|
|
{
|
|
|
|
return copy(src, dst);
|
|
|
|
}
|
|
|
|
|
2023-04-15 11:19:34 +08:00
|
|
|
//////////////////////////////////////////
|
|
|
|
// Special Auto-Vectorizing Overloads
|
|
|
|
//////////////////////////////////////////
|
|
|
|
|
|
|
|
#if defined(CUTE_COPY_ATOM_TMA_SM90_ENABLED)
|
|
|
|
template <class... CT_Args, class... CA_Args,
|
|
|
|
class SrcEngine, class SrcLayout,
|
|
|
|
class DstEngine, class DstLayout>
|
|
|
|
CUTE_HOST_DEVICE
|
|
|
|
void
|
|
|
|
copy(Copy_Atom<Copy_Traits<SM90_BULK_COPY_AUTO, CT_Args...>, CA_Args...> const& atom,
|
|
|
|
Tensor<SrcEngine, SrcLayout> const& src,
|
|
|
|
Tensor<DstEngine, DstLayout> & dst)
|
|
|
|
{
|
|
|
|
using SrcType = typename SrcEngine::value_type;
|
|
|
|
using DstType = typename DstEngine::value_type;
|
|
|
|
static_assert(sizeof_bits<SrcType>::value == sizeof_bits<DstType>::value);
|
|
|
|
static_assert((is_gmem<SrcEngine>::value && is_smem<DstEngine>::value) ||
|
|
|
|
(is_smem<SrcEngine>::value && is_gmem<DstEngine>::value),
|
|
|
|
"Bulk Copy only supports gmem -> smem or smem -> gmem movement.");
|
|
|
|
// Do BulkCopy dispatch
|
|
|
|
using BULK_COPY_OP = conditional_t<is_gmem<SrcEngine>::value,
|
|
|
|
SM90_BULK_COPY_G2S,
|
|
|
|
SM90_BULK_COPY_S2G>;
|
|
|
|
|
|
|
|
constexpr int N = decltype(max_common_vector(src, dst))::value;
|
|
|
|
|
|
|
|
// Construct a new concrete Atom of the vector size
|
|
|
|
using N_BITS = Int<N*sizeof_bits<SrcType>::value>;
|
|
|
|
using COPY_ATOM = Copy_Atom<Copy_Traits<BULK_COPY_OP, N_BITS, CT_Args...>, SrcType>;
|
|
|
|
auto bulk_atom = apply(atom.opargs_, [&](auto const&... args) { return COPY_ATOM{args...}; });
|
|
|
|
|
|
|
|
// Tile the src and dst to the Atom
|
|
|
|
auto tiler = right_inverse(dst.layout()).compose(Int<N>{});
|
|
|
|
|
|
|
|
#if 0
|
|
|
|
if (thread0()) {
|
|
|
|
print("copy -- found a max_common_vector of %d\n", N);
|
|
|
|
print(" "); print(src.data()); print(" o "); print(layout(src)); print("\n");
|
|
|
|
print(" "); print(dst.data()); print(" o "); print(layout(dst)); print("\n");
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
return copy(bulk_atom, logical_divide(src, tiler), logical_divide(dst, tiler));
|
|
|
|
}
|
|
|
|
#endif // #if defined(CUTE_COPY_ATOM_TMA_SM90_ENABLED)
|
|
|
|
|
2023-01-24 09:55:28 +08:00
|
|
|
} // end namespace cute
|