/***************************************************************************************************
 * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

/*! \file
    \brief This file contains the definition of Tensor as well as the classes and functions
           most closely associated with it.

    For backwards compatibility, "tensor.hpp" is the "entrypoint" header for a collection of
    classes and utilities that are adjacent to Tensor, e.g. fill(), whereas this file contains
    the actual definition of Tensor and a small set of functions central to its usage.

    Within the CUTLASS codebase, favor not including "tensor.hpp" wherever possible; instead,
    include "tensor_impl.hpp" along with the other specific headers that you need. This helps
    to avoid circular includes and reduces build time.
*/
#pragma once

#include <cute/config.hpp>                     // CUTE_HOST_DEVICE
#include <cute/layout.hpp>                     // cute::Shape
#include <cute/layout_composed.hpp>            // cute::is_composed_layout
#include <cute/pointer.hpp>                    // cute::recast_ptr
#include <cute/pointer_base.hpp>               // cute::iterator_traits
#include <cute/container/array_aligned.hpp>    // cute::array_aligned
#include <cute/container/array_subbyte.hpp>    // cute::array_subbyte
#include <cute/container/tuple.hpp>            // cute::tuple
#include <cute/numeric/integral_constant.hpp>  // cute::is_integral
#include <cute/util/type_traits.hpp>           // __CUTE_REQUIRES

namespace cute
{

//
// Engine -- owning or non-owning data store
//

// concept Engine {
//   using iterator     = ;
//   using value_type   = ;
//   using element_type = ;
//   using reference    = ;
//   iterator begin();
// };

template <class T, int N>
struct ArrayEngine
{
  using Storage = typename conditional<(sizeof_bits<T>::value % 8 == 0),
                                        array_aligned<T,N>,
                                        array_subbyte<T,N>>::type;
  using iterator     = typename Storage::iterator;
  using reference    = typename iterator_traits<iterator>::reference;
  using element_type = typename iterator_traits<iterator>::element_type;
  using value_type   = typename iterator_traits<iterator>::value_type;

  Storage storage_;

  CUTE_HOST_DEVICE constexpr auto begin() const { return storage_.begin(); }
  CUTE_HOST_DEVICE constexpr auto begin()       { return storage_.begin(); }
};

// Specialization for sparse_elem<S,T> tensor allocation/iteration
template <int S, class T, int N>
struct ArrayEngine<sparse_elem<S,T>, N>
{
  static_assert(N % S == 0, "Expected a multiple of the sparsity.");

  using value_type = sparse_elem<S,T>;
  using Storage = typename conditional<(sizeof_bits<T>::value % 8 == 0),
                                        array_aligned<T,N/S>,
                                        array_subbyte<T,N/S>>::type;
  using iterator     = sparse_ptr<S, sparse_elem<S,T>*>;
  using reference    = typename iterator_traits<iterator>::reference;
  using element_type = typename iterator_traits<iterator>::element_type;

  Storage storage_;

  CUTE_HOST_DEVICE constexpr auto begin() const { return recast_ptr<value_type>(storage_.begin()); }
  CUTE_HOST_DEVICE constexpr auto begin()       { return recast_ptr<value_type>(storage_.begin()); }
};

template <class Iterator>
struct ViewEngine
{
  using iterator     = Iterator;
  using reference    = typename iterator_traits<iterator>::reference;
  using element_type = typename iterator_traits<iterator>::element_type;
  using value_type   = typename iterator_traits<iterator>::value_type;

  iterator storage_;

  CUTE_HOST_DEVICE constexpr iterator const& begin() const { return storage_; }
  CUTE_HOST_DEVICE constexpr iterator      & begin()       { return storage_; }
};

template <class Iterator>
struct ConstViewEngine
{
  using iterator     = Iterator;
  using reference    = typename iterator_traits<iterator>::reference;
  using element_type = typename iterator_traits<iterator>::element_type;
  using value_type   = typename iterator_traits<iterator>::value_type;

  iterator storage_;

  CUTE_HOST_DEVICE constexpr iterator const& begin() const { return storage_; }
};
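// A minimal sketch of the two engine kinds (illustrative; `smem_buf` is a hypothetical pointer).
// Engines are rarely constructed directly; make_tensor() below selects the appropriate one:
//
//   Tensor rmem = make_tensor<float>(Int<8>{});          // owning:     ArrayEngine<float,8>
//   Tensor view = make_tensor(smem_buf, make_shape(8));  // non-owning: ViewEngine over smem_buf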
//
// Tensor
//

template <class Engine, class Layout>
struct Tensor
{
  using iterator     = typename Engine::iterator;
  using value_type   = typename Engine::value_type;
  using element_type = typename Engine::element_type;
  using reference    = typename Engine::reference;

  using engine_type  = Engine;
  using layout_type  = Layout;

  CUTE_HOST_DEVICE constexpr
  Tensor() {}

  CUTE_HOST_DEVICE constexpr
  Tensor(Engine const& engine, Layout const& layout)
      : rep_(layout, engine) {}

  //
  // Accessors
  //

  static constexpr int rank = Layout::rank;

  CUTE_HOST_DEVICE constexpr decltype(auto) tensor() const { return *this; }

  CUTE_HOST_DEVICE constexpr decltype(auto) engine() const { return get<1>(rep_); }
  CUTE_HOST_DEVICE constexpr decltype(auto) engine()       { return get<1>(rep_); }

  CUTE_HOST_DEVICE constexpr decltype(auto) data() const { return engine().begin(); }
  CUTE_HOST_DEVICE constexpr decltype(auto) data()       { return engine().begin(); }

  CUTE_HOST_DEVICE constexpr decltype(auto) layout() const { return get<0>(rep_); }
  CUTE_HOST_DEVICE constexpr decltype(auto) shape()  const { return layout().shape(); }
  CUTE_HOST_DEVICE constexpr           auto size()   const { return cute::size(shape()); }
  CUTE_HOST_DEVICE constexpr decltype(auto) stride() const { return layout().stride(); }

  //
  // Indexing op() and op[]
  //

  // Index into this tensor like an array by computing the offset via layout()
  template <class Coord>
  CUTE_HOST_DEVICE constexpr decltype(auto)
  operator[](Coord const& coord) {
    return data()[layout()(coord)];
  }

  template <class Coord>
  CUTE_HOST_DEVICE constexpr decltype(auto)
  operator[](Coord const& coord) const {
    return data()[layout()(coord)];
  }

  template <class Coord>
  CUTE_HOST_DEVICE constexpr decltype(auto)
  operator()(Coord const& coord) {
    if constexpr (has_underscore<Coord>::value) {
      auto const& [sliced_layout,offset] = slice_and_offset(coord, layout());
      return make_tensor(data() + offset, sliced_layout);
    } else {
      return data()[layout()(coord)];
    }

    CUTE_GCC_UNREACHABLE;
  }

  template <class Coord>
  CUTE_HOST_DEVICE constexpr decltype(auto)
  operator()(Coord const& coord) const {
    if constexpr (has_underscore<Coord>::value) {
      auto const& [sliced_layout,offset] = slice_and_offset(coord, layout());
      return make_tensor(data() + offset, sliced_layout);
    } else {
      return data()[layout()(coord)];
    }

    CUTE_GCC_UNREACHABLE;
  }

  // op() convenience function for multi-dimensional coordinates
  template <class Coord0, class Coord1, class... Coords>
  CUTE_HOST_DEVICE constexpr decltype(auto)
  operator()(Coord0 const& c0, Coord1 const& c1, Coords const&... cs) {
    return operator()(make_coord(c0,c1,cs...));
  }

  template <class Coord0, class Coord1, class... Coords>
  CUTE_HOST_DEVICE constexpr decltype(auto)
  operator()(Coord0 const& c0, Coord1 const& c1, Coords const&... cs) const {
    return operator()(make_coord(c0,c1,cs...));
  }

  //
  // Compose
  //

  template <class... Layouts>
  CUTE_HOST_DEVICE constexpr auto
  compose(Layouts const&... layouts) {
    return make_tensor(data(), layout().compose(layouts...));
  }

  template <class... Layouts>
  CUTE_HOST_DEVICE constexpr auto
  compose(Layouts const&... layouts) const {
    return make_tensor(data(), layout().compose(layouts...));
  }

  //
  // Tile
  //

  template <class... Layouts>
  CUTE_HOST_DEVICE constexpr auto
  tile(Layouts const&... layouts) {
    return make_tensor(data(), layout().tile(layouts...));
  }

  template <class... Layouts>
  CUTE_HOST_DEVICE constexpr auto
  tile(Layouts const&... layouts) const {
    return make_tensor(data(), layout().tile(layouts...));
  }

  //
  // Utility
  //

  template <class Int,
            __CUTE_REQUIRES(is_integral<Int>::value)>
  CUTE_HOST_DEVICE constexpr auto
  get_1d_coord(Int const& linear_idx) const {
    return layout().get_1d_coord(linear_idx);
  }

  template <class Int,
            __CUTE_REQUIRES(is_integral<Int>::value)>
  CUTE_HOST_DEVICE constexpr auto
  get_hier_coord(Int const& linear_idx) const {
    return layout().get_hier_coord(linear_idx);
  }

  template <class Int,
            __CUTE_REQUIRES(is_integral<Int>::value)>
  CUTE_HOST_DEVICE constexpr auto
  get_flat_coord(Int const& linear_idx) const {
    return layout().get_flat_coord(linear_idx);
  }

  cute::tuple<layout_type, engine_type> rep_;
};

template <class T>
struct is_tensor : false_type {};
template <class Engine, class Layout>
struct is_tensor<Tensor<Engine,Layout>> : true_type {};
template <class T>
constexpr bool is_tensor_v = is_tensor<T>::value;
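// A minimal indexing/slicing sketch (illustrative):
//
//   Tensor A = make_tensor<float>(Shape<_4,_8>{});
//   A(2,3) = 1.0f;                 // scalar access: layout() maps the coord to an offset
//   float x = A[make_coord(2,3)];  // equivalent array-style access
//   Tensor col = A(_,3);           // an underscore slices: rank-1 non-owning view of column 3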
// Customization point for creation of owning and non-owning Tensors
template <class T>
struct MakeTensor
{
  template <class Arg0, class... Args>
  CUTE_HOST_DEVICE constexpr auto
  operator()(Arg0 const& arg0, Args const&... args) const
  {
    if constexpr (has_dereference<Arg0>::value) {
      // Construct a non-owning Tensor
      using Engine = ViewEngine<Arg0>;
      if constexpr (sizeof...(Args) == 1 && (is_layout<Args>::value && ...)) {
        // Forward a Layout
        return Tensor{Engine{arg0}, args...};
      } else {
        // Construct a Layout from Args
        return Tensor{Engine{arg0}, make_layout(args...)};
      }
    } else {
      // Construct an owning Tensor
      static_assert((is_static<Arg0>::value && ... && is_static<Args>::value),
                    "Dynamic owning tensors not supported");
      if constexpr (sizeof...(Args) == 0 && is_layout<Arg0>::value) {
        // Forward a Layout
        using Layout = Arg0;
        using Engine = ArrayEngine<T, cosize_v<Layout>>;
        return Tensor<Engine,Layout>();
      } else {
        // Construct a Layout from Args
        using Layout = decltype(make_layout(arg0, args...));
        using Engine = ArrayEngine<T, cosize_v<Layout>>;
        return Tensor<Engine,Layout>();
      }
    }
  }
};

//
// make_tensor
//

// Make an owning Tensor that will allocate a static array
// e.g. make_tensor<float>(Int<12>{})
template <class T, class... Args>
CUTE_HOST_DEVICE constexpr auto
make_tensor(Args const&... args)
{
  static_assert((not has_dereference<Args>::value && ...),
                "Expected layout args... in make_tensor<T>(args...)");
  return MakeTensor<T>{}(args...);
}

// Make a non-owning Tensor that will use a pointer (view)
// e.g. make_tensor(vec.data(), 12)
template <class Iterator, class... Args>
CUTE_HOST_DEVICE constexpr auto
make_tensor(Iterator const& iter, Args const&... args)
{
  static_assert(has_dereference<Iterator>::value,
                "Expected iterator iter in make_tensor(iter, args...)");
  static_assert((not has_dereference<Args>::value && ...),
                "Expected layout args... in make_tensor(iter, args...)");
  return MakeTensor<Iterator>{}(iter, args...);
}

//
// make_tensor_like
//   Make a register tensor with the same type, shape, and (if possible) order as another tensor
//

template <class NewT, class Layout>
CUTE_HOST_DEVICE constexpr auto
make_tensor_like(Layout const& layout)
{
  return make_tensor<NewT>(make_layout_like(layout));
}

template <class NewT, class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
make_tensor_like(Tensor<Engine,Layout> const& tensor)
{
  return make_tensor_like<NewT>(tensor.layout());
}

template <class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
make_tensor_like(Tensor<Engine,Layout> const& tensor)
{
  return make_tensor_like<typename Engine::value_type>(tensor.layout());
}

//
// make_fragment_like
//   Make a tensor with the same shape and (if possible) order as another tensor, with special
//   consideration of the 0th mode. The 0th mode is commonly used for MMA_Atoms or Copy_Atoms,
//   so this allocates the 0th mode with LayoutLeft regardless of the reference layout.
//

template <class NewT, class Layout>
CUTE_HOST_DEVICE constexpr auto
make_fragment_like(Layout const& layout)
{
  return make_tensor<NewT>(make_fragment_like(layout));
}

template <class NewT, class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
make_fragment_like(Tensor<Engine,Layout> const& tensor)
{
  return make_fragment_like<NewT>(tensor.layout());
}

template <class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
make_fragment_like(Tensor<Engine,Layout> const& tensor)
{
  return make_fragment_like<typename Engine::value_type>(tensor.layout());
}
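// A minimal creation sketch (illustrative; `gmem_ptr` and the extents m,n are hypothetical):
//
//   Tensor owned = make_tensor<half_t>(Shape<_4,_8>{});    // owning; static shape required
//   Tensor view  = make_tensor(gmem_ptr, make_shape(m,n),
//                              make_stride(n, Int<1>{}));  // non-owning row-major view
//   Tensor frag  = make_fragment_like(owned);              // owning; 0th mode LayoutLeft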
//
// make_counting_tensor
//   Make a tensor from a layout by binding it to a counting iter with 0-offset of the same profile as the codomain.
//

template <class Layout, __CUTE_REQUIRES(is_layout<Layout>::value)>
CUTE_HOST_DEVICE constexpr auto
make_counting_tensor(Layout const& layout)
{
  return make_tensor(make_inttuple_iter(repeat_like(coshape(layout), Int<0>{})), layout);
}

//
// make_identity_tensor
//   Make a tensor that maps coordinates within a shape to themselves.
//

template <class Shape>
CUTE_HOST_DEVICE constexpr auto
make_identity_tensor(Shape const& shape)
{
  return make_counting_tensor(make_identity_layout(shape));
}

//
// Utilities
//

// Return the subtensor of a mode
template <int... Is, class Tensor>
CUTE_HOST_DEVICE constexpr auto
tensor(Tensor&& tensor)
{
  if constexpr (sizeof...(Is) == 0) {
    return tensor;
  } else {
    return make_tensor(tensor.data(), get<Is...>(tensor.layout()));
  }

  CUTE_GCC_UNREACHABLE;
}

// Return the layout of a mode
template <int... Is, class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
layout(Tensor<Engine,Layout> const& tensor)
{
  return layout<Is...>(tensor.layout());
}

// Return the shape of a mode
template <int... Is, class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
shape(Tensor<Engine,Layout> const& tensor)
{
  return shape<Is...>(tensor.layout());
}

// Return the stride of a mode
template <int... Is, class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
stride(Tensor<Engine,Layout> const& tensor)
{
  return stride<Is...>(tensor.layout());
}

// Return the number of elements in a mode
template <int... Is, class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
size(Tensor<Engine,Layout> const& tensor)
{
  return size<Is...>(tensor.layout());
}

// Return the rank of a mode
template <int... Is, class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
rank(Tensor<Engine,Layout> const& tensor)
{
  return rank<Is...>(tensor.layout());
}

// Return the depth of a mode
template <int... Is, class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
depth(Tensor<Engine,Layout> const& tensor)
{
  return depth<Is...>(tensor.layout());
}

//
// Operations to manipulate Tensors like a Layout or IntTuple
//   These are implemented with explicit modifier overloads because these
//   methods likely also have a general IntTuple overload that can shadow.
//

template <class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
flatten(Tensor<Engine,Layout> const& tensor) {
  return make_tensor(tensor.data(), flatten(tensor.layout()));
}

template <class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
flatten(Tensor<Engine,Layout>& tensor) {
  return make_tensor(tensor.data(), flatten(tensor.layout()));
}

template <class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
flatten(Tensor<Engine,Layout>&& tensor) {
  return make_tensor(tensor.data(), flatten(tensor.layout()));
}

template <class Engine, class Layout, class Profile = tuple<>>
CUTE_HOST_DEVICE constexpr auto
coalesce(Tensor<Engine,Layout> const& tensor, Profile const& profile = {}) {
  return make_tensor(tensor.data(), coalesce(tensor.layout(), profile));
}

template <class Engine, class Layout, class Profile = tuple<>>
CUTE_HOST_DEVICE constexpr auto
coalesce(Tensor<Engine,Layout>& tensor, Profile const& profile = {}) {
  return make_tensor(tensor.data(), coalesce(tensor.layout(), profile));
}

template <class Engine, class Layout, class Profile = tuple<>>
CUTE_HOST_DEVICE constexpr auto
coalesce(Tensor<Engine,Layout>&& tensor, Profile const& profile = {}) {
  return make_tensor(tensor.data(), coalesce(tensor.layout(), profile));
}

// Replace the modes in layout that have a 0-stride with a 1-size
template <class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
filter_zeros(Tensor<Engine,Layout> const& tensor) {
  return make_tensor(tensor.data(), filter_zeros(tensor.layout()));
}

template <class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
filter_zeros(Tensor<Engine,Layout>& tensor) {
  return make_tensor(tensor.data(), filter_zeros(tensor.layout()));
}

template <class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
filter_zeros(Tensor<Engine,Layout>&& tensor) {
  return make_tensor(tensor.data(), filter_zeros(tensor.layout()));
}

template <class Engine, class Layout, class Profile>
CUTE_HOST_DEVICE constexpr auto
filter_zeros(Tensor<Engine,Layout> const& tensor, Profile const& profile) {
  return make_tensor(tensor.data(), filter_zeros(tensor.layout(), profile));
}

template <class Engine, class Layout, class Profile>
CUTE_HOST_DEVICE constexpr auto
filter_zeros(Tensor<Engine,Layout>& tensor, Profile const& profile) {
  return make_tensor(tensor.data(), filter_zeros(tensor.layout(), profile));
}

template <class Engine, class Layout, class Profile>
CUTE_HOST_DEVICE constexpr auto
filter_zeros(Tensor<Engine,Layout>&& tensor, Profile const& profile) {
  return make_tensor(tensor.data(), filter_zeros(tensor.layout(), profile));
}
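// A minimal sketch of these manipulations (illustrative):
//
//   Tensor t = make_tensor<float>(Layout<Shape<_2,_4>, Stride<_1,_2>>{});
//   auto c = coalesce(t);       // (_2,_4):(_1,_2) fuses into (_8):(_1); same data
//   Tensor b = make_tensor<float>(Layout<Shape<_2,_4>, Stride<_1,_0>>{});
//   auto z = filter_zeros(b);   // the 0-stride mode becomes 1-size: (_2,_1):(_1,_0)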
// Remove all of the 0-strides and 1-sizes
template <class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
filter(Tensor<Engine,Layout> const& tensor) {
  return make_tensor(tensor.data(), filter(tensor.layout()));
}

template <class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
filter(Tensor<Engine,Layout>& tensor) {
  return make_tensor(tensor.data(), filter(tensor.layout()));
}

template <class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
filter(Tensor<Engine,Layout>&& tensor) {
  return make_tensor(tensor.data(), filter(tensor.layout()));
}

// Group the modes [B,E) into a single mode
// e.g. group_modes<2,4>(make_tensor<int>(Layout<Shape<_1,_2,_3,_4,_5,_6>>{}))
//   => make_tensor<int>(Layout<Shape<_1,_2,Shape<_3,_4>,_5,_6>>{})
template <int B, int E, class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
group_modes(Tensor<Engine,Layout> const& tensor) {
  return make_tensor(tensor.data(), group<B,E>(tensor.layout()));
}

template <int B, int E, class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
group_modes(Tensor<Engine,Layout>& tensor) {
  return make_tensor(tensor.data(), group<B,E>(tensor.layout()));
}

template <int B, int E, class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
group_modes(Tensor<Engine,Layout>&& tensor) {
  return make_tensor(tensor.data(), group<B,E>(tensor.layout()));
}

// Return the subtensor of a range of modes
template <int B, int E, class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
take(Tensor<Engine,Layout> const& tensor) {
  return make_tensor(tensor.data(), take<B,E>(tensor.layout()));
}

template <int B, int E, class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
take(Tensor<Engine,Layout>& tensor) {
  return make_tensor(tensor.data(), take<B,E>(tensor.layout()));
}

template <int B, int E, class Engine, class Layout>
CUTE_HOST_DEVICE constexpr auto
take(Tensor<Engine,Layout>&& tensor) {
  return make_tensor(tensor.data(), take<B,E>(tensor.layout()));
}

// Return a tensor with the same shape as input but offset by a given coordinate
template <class Coord, class Tensor,
          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
CUTE_HOST_DEVICE constexpr auto
domain_offset(Coord const& coord, Tensor&& tensor)
{
  auto [layout, ptr_offset] = domain_offset(coord, tensor.layout());
  return make_tensor(static_cast<Tensor&&>(tensor).data() + ptr_offset, layout);
}

//
// Recast
//

// NOTE: This is very dangerous to do
//   -- doesn't check dynamic integer divisibility
//   -- doesn't check alignment

template <class NewType, class Tensor>
CUTE_HOST_DEVICE constexpr auto
recast(Tensor&& tensor)
{
  using OldType = typename remove_cvref_t<Tensor>::value_type;
  auto old_layout = tensor.layout();
  auto new_layout = recast_layout<OldType,NewType>(old_layout);

  // If this is an upcast of a normal Layout with static negative strides, then offset as well
  if constexpr (sizeof(OldType) < sizeof(NewType) && not is_composed_layout<decltype(new_layout)>::value) {
    auto shape_diff  = transform(flatten(old_layout.shape()), flatten(new_layout.shape()), minus{});
    auto extent_diff = transform(shape_diff, flatten(old_layout.stride()), multiplies{});
    auto offset      = fold(extent_diff, Int<0>{}, [](auto const& i, auto const& a) { return i + cute::min(a,Int<0>{}); });
    return make_tensor(recast_ptr<NewType>(static_cast<Tensor&&>(tensor).data() + offset), new_layout);
  } else {
    return make_tensor(recast_ptr<NewType>(static_cast<Tensor&&>(tensor).data()         ), new_layout);
  }

  CUTE_GCC_UNREACHABLE;
}
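// A minimal recast sketch (illustrative). recast itself performs no divisibility or
// alignment checks, so this assumes a contiguous, suitably aligned layout:
//
//   Tensor t16 = make_tensor<uint16_t>(Layout<Shape<_8,_4>>{});  // 32 x uint16_t
//   Tensor t32 = recast<uint32_t>(t16);                          // same bits as (_4,_4) x uint32_t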
//
// max_common_vector
//

/* Return Int<N> such that N is the maximum number of contiguous elements
 * that logically correspond in the tensors of @a a and @a b. That is,
 * the number of elements that could reasonably be vectorized into a single load/store.
 *
 * @returns Int<N> with N >= 0
 *
 * A return value of Int<0> indicates that no such conclusion can be made and no
 * vectorization should be attempted.
 *
 * Note that the return value does NOT include alignment concerns such as the pointer value and
 * the divisibility of dynamic strides.
 */
template <class SrcEngine, class SrcLayout,
          class DstEngine, class DstLayout>
CUTE_HOST_DEVICE constexpr auto
max_common_vector(Tensor<SrcEngine,SrcLayout> const& a,
                  Tensor<DstEngine,DstLayout> const& b)
{
  using SrcType = typename SrcEngine::value_type;
  using SrcRef  = typename SrcEngine::reference;
  using DstType = typename DstEngine::value_type;
  using DstRef  = typename DstEngine::reference;

  // Determine if vectorization candidates at all
  if constexpr (// Should be the same value_types, else the copy is also performing a cast
                cute::is_same<SrcType, DstType>::value &&
                // The types should be trivially copyable so that vectorization is valid
                is_trivially_copyable<SrcType>::value &&
                is_trivially_copyable<DstType>::value &&
                // Should be load/storing real data, rather than implicit iterators or such
                is_reference<SrcRef>::value &&
                is_reference<DstRef>::value)
  {
    return max_common_vector(a.layout(), b.layout());
  } else {
    return Int<0>{};
  }

  CUTE_GCC_UNREACHABLE;
}

/* Return a layout that points to the maximum number of contiguous elements
 * that logically correspond in the tensors of @a a and @a b. That is,
 * the elements that could reasonably be "vectorized" into a single load/store.
 *
 * @returns Layout R such that composition(a.layout(), R) and composition(b.layout(), R)
 *          are both identity Layouts.
 *
 * Note that the returned layout does NOT include alignment concerns such as the pointer value and
 * the divisibility of dynamic strides.
 */
template <class SrcEngine, class SrcLayout,
          class DstEngine, class DstLayout>
CUTE_HOST_DEVICE constexpr auto
max_common_layout(Tensor<SrcEngine,SrcLayout> const& a,
                  Tensor<DstEngine,DstLayout> const& b)
{
  using SrcType = typename SrcEngine::value_type;
  using SrcRef  = typename SrcEngine::reference;
  using DstType = typename DstEngine::value_type;
  using DstRef  = typename DstEngine::reference;

  // Determine if vectorization candidates at all
  if constexpr (// Should be the same value_types, else the copy is also performing a cast
                cute::is_same<SrcType, DstType>::value &&
                // The types should be trivially copyable so that vectorization is valid
                is_trivially_copyable<SrcType>::value &&
                is_trivially_copyable<DstType>::value &&
                // Should be load/storing real data, rather than implicit iterators or such
                is_reference<SrcRef>::value &&
                is_reference<DstRef>::value)
  {
    return max_common_layout(a.layout(), b.layout());
  } else {
    return Layout<_1,_0>{};
  }

  CUTE_GCC_UNREACHABLE;
}

//
// Key algebraic operations -- Composition, Divide, and Product
//

// Apply a Tiler to the Tensor via composition.
template <class Tensor, class Tiler,
          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
CUTE_HOST_DEVICE constexpr auto
composition(Tensor&& tensor,
            Tiler const& tiler)  // Layout or Tile<Layout...> or Shape
{
  return make_tensor(static_cast<Tensor&&>(tensor).data(),
                     composition(tensor.layout(), tiler));
}
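// A minimal vectorization-query sketch (illustrative):
//
//   Tensor a = make_tensor<float>(Layout<Shape<_4,_2>, Stride<_1,_4>>{});
//   Tensor b = make_tensor<float>(Layout<Shape<_4,_2>, Stride<_1,_4>>{});
//   auto v = max_common_vector(a, b);  // Int<8>: both layouts are fully contiguous
//   auto r = max_common_layout(a, b);  // a rank-1 layout addressing those 8 elements in order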
// Apply a Tiler to the Tensor.
//
// Consider a Tensor with shape (A,B,x,y)
// And a Tiler that is:
//
// * A Layout with shape (BLK_A,BLK_B)
//   ** Result Tensor shape ((BLK_A,BLK_B),Rest).
//   ** That is, the Tensor and Tile are treated as 1D for the tiling.
//   ** See logical_divide(Layout,Layout)
//
// * A Tile<Layout...> with shape <BLK_A,BLK_B>
//   ** Result Tensor shape ((BLK_A,a),(BLK_B,b),x,y).
//   ** Each mode of the Tile<Layout...> is applied to the corresponding mode of the Tensor.
//   ** See logical_divide(Layout,Tuple)
//
// * A Shape (BLK_A,BLK_B)
//   ** Result Tensor shape ((BLK_A,a),(BLK_B,b),x,y).
//   ** Equivalent to applying Tile<Layout<BLK_A,_1>,Layout<BLK_B,_1>>.
//   ** See logical_divide(Layout,Tuple) and logical_divide(Layout,Int)
//
// Note that the Tile<Layout...>/Shape Tilers must be weakly_congruent to the Tensor
template <class Tensor, class Tiler,
          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
CUTE_HOST_DEVICE constexpr auto
logical_divide(Tensor&& tensor,
               Tiler const& tiler)  // Layout or Tile<Layout...> or Shape
{
  return make_tensor(static_cast<Tensor&&>(tensor).data(),
                     logical_divide(tensor.layout(), tiler));
}

// zipped_divide is logical_divide with Tiler modes and Rest modes gathered together: (Tiler,Rest)
// When Tiler is Layout, this has no effect as logical_divide results in the same.
// When Tiler is Tile<Layout...> or Shape, this zips modes into standard form ((BLK_A,BLK_B),(a,b,x,y))
template <class Tensor, class Tiler,
          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
CUTE_HOST_DEVICE constexpr auto
zipped_divide(Tensor&& tensor,
              Tiler const& tiler)  // Layout or Tile<Layout...> or Shape
{
  return make_tensor(static_cast<Tensor&&>(tensor).data(),
                     zipped_divide(tensor.layout(), tiler));
}

// tiled_divide is zipped_divide with the second output mode flattened ((BLK_A,BLK_B),a,b,x,y)
template <class Tensor, class Tiler,
          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
CUTE_HOST_DEVICE constexpr auto
tiled_divide(Tensor&& tensor,
             Tiler const& tiler)  // Layout or Tile<Layout...> or Shape
{
  return make_tensor(static_cast<Tensor&&>(tensor).data(),
                     tiled_divide(tensor.layout(), tiler));
}

// flat_divide is zipped_divide with both output modes flattened (BLK_A,BLK_B,a,b,x,y)
template <class Tensor, class Tiler,
          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
CUTE_HOST_DEVICE constexpr auto
flat_divide(Tensor&& tensor,
            Tiler const& tiler)  // Layout or Tile<Layout...> or Shape
{
  return make_tensor(static_cast<Tensor&&>(tensor).data(),
                     flat_divide(tensor.layout(), tiler));
}

// logical_product on a Tensor doesn't make sense since it often increases cosize,
//   though this might make sense for creating Tensors with broadcasted (stride-0) modes

//
// Tensor partitioning utilities
//

// Apply a Tiler to the Tensor, then slice out one of those tiles by slicing into the "Rest" modes.
// With an inner_partition, you get everything that's inside the Tiler. Everything that the Tiler is pointing to.
// Split the modes of tensor according to the Tiler
//   zipped_divide returns something like ((BLK_A,BLK_B,...),(a,b,...,x,y))
// Then slice into the second mode (the "Rest" mode) with Coord
template <class Tensor, class Tiler, class Coord,
          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
CUTE_HOST_DEVICE constexpr auto
inner_partition(Tensor&& tensor,
                Tiler const& tiler,
                Coord const& coord)
{
  auto tensor_tiled = zipped_divide(static_cast<Tensor&&>(tensor), tiler);
  constexpr int R0 = decltype(rank<0>(tensor_tiled))::value;

  // The coord slices into the second mode (the "rest" mode), flatten the first
  if constexpr (is_tuple<Coord>::value) {
    // Append trailing modes if coord is tuple
    constexpr int R1 = decltype(rank<1>(tensor_tiled))::value;
    return tensor_tiled(repeat<R0>(_), append<R1>(coord,_));
  } else {
    // Flat indexing if coord is not tuple
    return tensor_tiled(repeat<R0>(_), coord);
  }
}
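// A minimal tiling sketch (illustrative):
//
//   Tensor t   = make_tensor<float>(Shape<_8,_16>{});                  // ( _8,_16)
//   Tensor zt  = zipped_divide(t, Shape<_2,_4>{});                     // ((_2,_4),(_4,_4))
//   Tensor blk = inner_partition(t, Shape<_2,_4>{}, make_coord(1,2));  // ( _2, _4)  one tile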
// Apply a Tiler to the Tensor, then slice out the remainder by slicing into the "Tile" modes.
// With an outer_partition, you get everything that's outside the Tiler. The layout of the Tile in the Tensor.
// Split the modes of tensor according to the Tiler
//   zipped_divide returns something like ((BLK_A,BLK_B,...),(a,b,...,x,y))
// Then slice into the first mode (the "Tile" mode) with Coord
template <class Tensor, class Tiler, class Coord,
          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
CUTE_HOST_DEVICE constexpr auto
outer_partition(Tensor&& tensor,
                Tiler const& tiler,
                Coord const& coord)
{
  auto tensor_tiled = zipped_divide(static_cast<Tensor&&>(tensor), tiler);
  constexpr int R1 = decltype(rank<1>(tensor_tiled))::value;

  // The coord slices into the first mode (the "tile" mode), flatten the second
  if constexpr (is_tuple<Coord>::value) {
    // Append trailing modes if coord is tuple
    constexpr int R0 = decltype(rank<0>(tensor_tiled))::value;
    return tensor_tiled(append<R0>(coord,_), repeat<R1>(_));
  } else {
    // Flat indexing if coord is not tuple
    return tensor_tiled(coord, repeat<R1>(_));
  }
}

// Tile a tensor according to @a tiler and use @a coord to index into the remainder, keeping the tile.
// This is typical at the CTA level where tiles of data are extracted:
//   Tensor data = ...                                                          // ( M, N)
//   Tensor cta_data = local_tile(data, Shape<_32,_64>{}, make_coord(blockIdx.x,blockIdx.y));  // (_32,_64)
template <class Tensor, class Tiler, class Coord,
          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
CUTE_HOST_DEVICE constexpr auto
local_tile(Tensor&& tensor,
           Tiler const& tiler,  // tiler to apply
           Coord const& coord)  // coord to slice into "remainder"
{
  return inner_partition(static_cast<Tensor&&>(tensor), tiler, coord);
}

// Same as above, but with a projection parameter to strip out unwanted tiling modes for convenience
//   when using projections of the same tiler.
// This is typical at the CTA level where tiles of data are extracted as projections:
//   Tensor dataA = ...                                                         // (M,K)
//   Tensor dataB = ...                                                         // (N,K)
//   Tensor dataC = ...                                                         // (M,N)
//   auto cta_tiler = Shape<_32, _64, _4>{};
//   auto cta_coord = make_coord(blockIdx.x, blockIdx.y, _);
//   Tensor ctaA = local_tile(dataA, cta_tiler, cta_coord, Step<_1, X,_1>{});   // (_32,_4,k)
//   Tensor ctaB = local_tile(dataB, cta_tiler, cta_coord, Step< X,_1,_1>{});   // (_64,_4,k)
//   Tensor ctaC = local_tile(dataC, cta_tiler, cta_coord, Step<_1,_1, X>{});   // (_32,_64)
template <class Tensor, class Tiler, class Coord, class Proj,
          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
CUTE_HOST_DEVICE auto
local_tile(Tensor&& tensor,
           Tiler const& tiler,  // tiler to apply
           Coord const& coord,  // coord to slice into "remainder"
           Proj const& proj)    // projection to apply to tiler and coord
{
  return local_tile(static_cast<Tensor&&>(tensor), dice(proj, tiler), dice(proj, coord));
}

// Tile a tensor according to the flat shape of a layout that provides the coordinate of the target index.
// This is typical at the Thread level where data is partitioned across repeated patterns of threads:
//   Tensor data = ...                                                          // (_16,_64)
//   Tensor thr_data = local_partition(data, Layout<Shape<_2,_16>>{}, thr_idx); // ( _8, _4)
template <class Tensor, class LShape, class LStride, class Index,
          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
CUTE_HOST_DEVICE auto
local_partition(Tensor&& tensor,
                Layout<LShape,LStride> const& tile,  // coord -> index
                Index const& index)                  // index to slice for
{
  static_assert(is_integral<Index>::value);
  return outer_partition(static_cast<Tensor&&>(tensor),
                         product_each(shape(tile)),
                         tile.get_flat_coord(index));
}
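// A minimal end-to-end sketch combining both (illustrative; `gmem_ptr`, M, K are hypothetical):
//
//   Tensor gA  = make_tensor(gmem_ptr, make_shape(M,K), make_stride(K, Int<1>{}));  // ( M, K)
//   Tensor cta = local_tile(gA, Shape<_128,_8>{}, make_coord(blockIdx.x, _));       // (_128,_8,k)
//   Tensor thr = local_partition(cta, Layout<Shape<_32,_8>>{}, threadIdx.x);        // (  _4,_1,k)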
// Same as local_partition above, but with a projection parameter to strip out unwanted tiling
//   modes for convenience when using projections of the same tiler.
// This is typical at the Thread level where data is partitioned across projected layouts of threads:
//   Tensor dataA = ...                                                            // (M,K)
//   Tensor dataB = ...                                                            // (N,K)
//   Tensor dataC = ...                                                            // (M,N)
//   auto thr_layout = Layout<Shape<_2,_16,_1>, Stride<_16,_1,_0>>{};
//   Tensor thrA = local_partition(dataA, thr_layout, thr_idx, Step<_1, X,_1>{});   // (M/2,K/1)
//   Tensor thrB = local_partition(dataB, thr_layout, thr_idx, Step< X,_1,_1>{});   // (N/16,K/1)
//   Tensor thrC = local_partition(dataC, thr_layout, thr_idx, Step<_1,_1, X>{});   // (M/2,N/16)
template <class Tensor, class LShape, class LStride, class Index, class Projection,
          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
CUTE_HOST_DEVICE auto
local_partition(Tensor&& tensor,
                Layout<LShape,LStride> const& tile,  // coord -> index
                Index const& index,                  // index to slice for
                Projection const& proj)
{
  return local_partition(static_cast<Tensor&&>(tensor), dice(proj, tile), index);
}

//
// Display utilities
//

template <class Engine, class Layout>
CUTE_HOST_DEVICE
void
print(Tensor<Engine,Layout> const& tensor)
{
  print(tensor.data()); print(" o "); print(tensor.layout());
}

template <class Engine, class Layout>
CUTE_HOST_DEVICE
void
print_tensor(Tensor<Engine,Layout> const& tensor, bool print_type = true)
{
  if (print_type) {
    print(tensor); print(":\n");
  }

  if constexpr (Layout::rank == 1) {
    for (int m = 0; m < size(tensor); ++m) {
      pretty_print(tensor(m));
      printf("\n");
    }
  } else
  if constexpr (Layout::rank == 2) {
    for (int m = 0; m < size<0>(tensor); ++m) {
      for (int n = 0; n < size<1>(tensor); ++n) {
        pretty_print(tensor(m,n));
      }
      printf("\n");
    }
  } else
  if constexpr (Layout::rank == 3) {
    print_tensor(tensor(_,_,0), false);
    for (int k = 1; k < size<2>(tensor); ++k) {
      for (int i = 0; i < 5*size<1>(tensor); ++i) { print("-"); } print("\n");
      print_tensor(tensor(_,_,k), false);
    }
  } else
  if constexpr (Layout::rank == 4) {
    print_tensor(tensor(_,_,_,0), false);
    for (int p = 1; p < size<3>(tensor); ++p) {
      for (int i = 0; i < 5*size<1>(tensor); ++i) { print("="); } print("\n");
      print_tensor(tensor(_,_,_,p), false);
    }
  }
}

#if !defined(__CUDACC_RTC__)
template <class Engine, class Layout>
CUTE_HOST
std::ostream&
print_tensor_os(std::ostream& os, Tensor<Engine,Layout> const& tensor)
{
  int digits = 9;

  if constexpr (Layout::rank == 1) {
    for (int m = 0; m < size(tensor); ++m) {
      os << std::setw(digits) << tensor(m) << std::endl;
    }
  } else
  if constexpr (Layout::rank == 2) {
    for (int m = 0; m < size<0>(tensor); ++m) {
      for (int n = 0; n < size<1>(tensor); ++n) {
        os << std::setw(digits) << tensor(m,n);
      }
      os << std::endl;
    }
  } else
  if constexpr (Layout::rank == 3) {
    print_tensor_os(os, tensor(_,_,0));
    for (int k = 1; k < size<2>(tensor); ++k) {
      for (int i = 0; i < digits*size<1>(tensor); ++i) { os << "-"; } os << std::endl;
      print_tensor_os(os, tensor(_,_,k));
    }
  } else
  if constexpr (Layout::rank == 4) {
    print_tensor_os(os, tensor(_,_,_,0));
    for (int p = 1; p < size<3>(tensor); ++p) {
      for (int i = 0; i < digits*size<1>(tensor); ++i) { os << "="; } os << std::endl;
      print_tensor_os(os, tensor(_,_,_,p));
    }
  }

  return os;
}

template <class Engine, class Layout>
CUTE_HOST
std::ostream&
operator<<(std::ostream& os, Tensor<Engine,Layout> const& tensor)
{
  os << tensor.layout() << std::endl;
  return print_tensor_os(os, tensor);
}
#endif // !defined(__CUDACC_RTC__)

} // end namespace cute