/***************************************************************************************************
 * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

#include <cute/tensor.hpp>

#include "cutlass/util/print_error.hpp"
#include "cutlass/util/GPU_Clock.hpp"
#include "cutlass/util/helper_cuda.hpp"

// This is a simple tutorial showing several ways to partition a tensor into tiles and then
// perform efficient, coalesced copies. This example also shows how to vectorize accesses,
// which may be a useful optimization or required for certain workloads.
//
// `copy_kernel()` and `copy_kernel_vectorized()` each assume a pair of tensors with
// dimensions (m, n) have been partitioned via `tiled_divide()`.
//
// The result is a pair of compatible tensors with dimensions ((M, N), m', n'), where
// (M, N) denotes a statically sized tile and m' and n' denote the number of such tiles
// within the tensor.
//
// Each statically sized tile is mapped to a CUDA threadblock which performs efficient
// loads and stores to Global Memory.
//
// `copy_kernel()` uses `cute::local_partition()` to partition the tensor and map
// the result to threads using a striped indexing scheme. The threads themselves are
// arranged in a (ThreadShape_M, ThreadShape_N) arrangement which is replicated over the tile.
//
// `copy_kernel_vectorized()` uses `cute::make_tiled_copy()` to perform a similar
// partitioning, using a `cute::Copy_Atom` to perform vectorization. The actual vector
// size is defined by `VecLayout`.
//
// This example assumes the overall tensor shape is divisible by the tile size and
// does not perform predication.
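
// A concrete sketch (not part of this example) of the shapes produced with the sizes chosen
// in main() below: a (256, 512) tensor, a (_128, _64) tile, and a (_32, _8) thread layout.
//
//   Tensor tiled = tiled_divide(tensor, make_shape(Int<128>{}, Int<64>{}));
//   // tiled has shape ((_128, _64), 2, 8): a 2 x 8 grid of 128 x 64 tiles
//
//   Tensor thr_tile = local_partition(tiled(make_coord(_,_), 0, 0), thr_layout, tid);
//   // thr_tile has shape (_4, _8): each of the 32 x 8 = 256 threads handles 4 x 8 elements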

/// Simple copy kernel.
///
/// Uses local_partition() to partition a tile among threads arranged as (THR_M, THR_N).
template <class TensorS, class TensorD, class ThreadLayout>
__global__ void copy_kernel(TensorS S, TensorD D, ThreadLayout)
{
  using namespace cute;

  // Slice the tiled tensors
  Tensor tile_S = S(make_coord(_,_), blockIdx.x, blockIdx.y);            // (BlockShape_M, BlockShape_N)
  Tensor tile_D = D(make_coord(_,_), blockIdx.x, blockIdx.y);            // (BlockShape_M, BlockShape_N)

  // Construct a partitioning of the tile among threads with the given thread arrangement.

  // Concept:                          Tensor  ThrLayout       ThrIndex
  Tensor thr_tile_S = local_partition(tile_S, ThreadLayout{}, threadIdx.x);  // (ThrValM, ThrValN)
  Tensor thr_tile_D = local_partition(tile_D, ThreadLayout{}, threadIdx.x);  // (ThrValM, ThrValN)

  // Construct a register-backed Tensor with the same shape as each thread's partition.
  // Use make_tensor_like to try to match the layout of thr_tile_S.
  Tensor fragment = make_tensor_like(thr_tile_S);                        // (ThrValM, ThrValN)

  // Copy from GMEM to RMEM and from RMEM to GMEM
  copy(thr_tile_S, fragment);
  copy(fragment, thr_tile_D);
}

/// Vectorized copy kernel.
///
/// Uses `make_tiled_copy()` to perform a copy using vector instructions. This operation
/// has the precondition that pointers are aligned to the vector size.
///
template <class TensorS, class TensorD, class ThreadLayout, class VecLayout>
__global__ void copy_kernel_vectorized(TensorS S, TensorD D, ThreadLayout, VecLayout)
{
  using namespace cute;
  using Element = typename TensorS::value_type;

  // Slice the tensors to obtain a view into each tile.
  Tensor tile_S = S(make_coord(_, _), blockIdx.x, blockIdx.y);           // (BlockShape_M, BlockShape_N)
  Tensor tile_D = D(make_coord(_, _), blockIdx.x, blockIdx.y);           // (BlockShape_M, BlockShape_N)

  // Define `AccessType` which controls the size of the actual memory access.
  using AccessType = cutlass::AlignedArray<Element, size(VecLayout{})>;

  // A copy atom corresponds to one hardware memory access.
  using Atom = Copy_Atom<UniversalCopy<AccessType>, Element>;
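
  // For instance, with the Element = float and (_4, _1) vector layout chosen in main() below,
  // AccessType is a 16-byte cutlass::AlignedArray<float, 4>, so each copy atom is intended to
  // issue a single 128-bit load or store per thread (assuming the source and destination
  // pointers are 16-byte aligned).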

  // Construct tiled copy, a tiling of copy atoms.
  //
  // Note, this assumes the vector and thread layouts are aligned with contiguous data
  // in GMEM. Alternative thread layouts are possible but may result in uncoalesced
  // reads. Alternative vector layouts are also possible, though incompatible layouts
  // will result in compile time errors.
  auto tiled_copy =
    make_tiled_copy(
      Atom{},                       // access size
      ThreadLayout{},               // thread layout
      VecLayout{});                 // vector layout (e.g. 4x1)

  // Construct a Tensor corresponding to each thread's slice.
  auto thr_copy = tiled_copy.get_thread_slice(threadIdx.x);

  Tensor thr_tile_S = thr_copy.partition_S(tile_S);                      // (CopyOp, CopyM, CopyN)
  Tensor thr_tile_D = thr_copy.partition_D(tile_D);                      // (CopyOp, CopyM, CopyN)

  // Construct a register-backed Tensor with the same shape as each thread's partition.
  // Use make_fragment_like because the first mode is the instruction-local mode.
  Tensor fragment = make_fragment_like(thr_tile_D);                      // (CopyOp, CopyM, CopyN)

  // Copy from GMEM to RMEM and from RMEM to GMEM
  copy(tiled_copy, thr_tile_S, fragment);
  copy(tiled_copy, fragment, thr_tile_D);
}
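
// Note: when experimenting with other thread or vector layouts, the resulting TiledCopy can be
// inspected on the host. A sketch, assuming CuTe's usual printing utilities are available:
//
//   auto tiled_copy = make_tiled_copy(Atom{}, ThreadLayout{}, VecLayout{});
//   print(tiled_copy);        // textual summary of the thread/value tiling
//   print_latex(tiled_copy);  // LaTeX picture of how threads and values map onto the tile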

/// Main function
int main(int argc, char** argv)
{
  //
  // Given a 2D shape, perform an efficient copy
  //

  using namespace cute;
  using Element = float;

  // Define a tensor shape with dynamic extents (m, n)
  auto tensor_shape = make_shape(256, 512);

  //
  // Allocate and initialize
  //

  thrust::host_vector<Element> h_S(size(tensor_shape));
  thrust::host_vector<Element> h_D(size(tensor_shape));

  for (size_t i = 0; i < h_S.size(); ++i) {
    h_S[i] = static_cast<Element>(i);
    h_D[i] = Element{};
  }

  thrust::device_vector<Element> d_S = h_S;
  thrust::device_vector<Element> d_D = h_D;

  //
  // Make tensors
  //

  Tensor tensor_S = make_tensor(make_gmem_ptr(thrust::raw_pointer_cast(d_S.data())), make_layout(tensor_shape));
  Tensor tensor_D = make_tensor(make_gmem_ptr(thrust::raw_pointer_cast(d_D.data())), make_layout(tensor_shape));
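
  // Note: make_layout(tensor_shape) defaults to a compact column-major layout, i.e. shape
  // (256, 512) with stride (_1, 256), so the m mode is contiguous in GMEM. The thread and
  // vector layouts chosen below assume this (see the coalescing note in copy_kernel_vectorized).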

  //
  // Tile tensors
  //

  // Define a statically sized block (M, N).
  // Note, by convention, capital letters are used to represent static modes.
  auto block_shape = make_shape(Int<128>{}, Int<64>{});

  if ((size<0>(tensor_shape) % size<0>(block_shape)) || (size<1>(tensor_shape) % size<1>(block_shape))) {
    std::cerr << "The tensor shape must be divisible by the block shape." << std::endl;
    return -1;
  }
  // Equivalent check to the above
  if (not evenly_divides(tensor_shape, block_shape)) {
    std::cerr << "Expected the block_shape to evenly divide the tensor shape." << std::endl;
    return -1;
  }

  // Tile the tensor (m, n) ==> ((M, N), m', n') where (M, N) is the static tile
  // shape, and modes (m', n') correspond to the number of tiles.
  //
  // These will be used to determine the CUDA kernel grid dimensions.
  Tensor tiled_tensor_S = tiled_divide(tensor_S, block_shape);           // ((M, N), m', n')
  Tensor tiled_tensor_D = tiled_divide(tensor_D, block_shape);           // ((M, N), m', n')

  // Thread arrangement
  Layout thr_layout = make_layout(make_shape(Int<32>{}, Int<8>{}));

  // Vector dimensions
  Layout vec_layout = make_layout(make_shape(Int<4>{}, Int<1>{}));
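
  // With these choices, each block has 32 x 8 = 256 threads and each thread moves 4 contiguous
  // floats (16 bytes) per access; covering a 128 x 64 tile therefore takes
  // (128 * 64) / (256 * 4) = 8 vectorized accesses per thread.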

  //
  // Determine grid and block dimensions
  //

  dim3 gridDim (size<1>(tiled_tensor_D), size<2>(tiled_tensor_D));       // Grid shape corresponds to modes m' and n'
  dim3 blockDim(size(thr_layout));

  //
  // Launch the kernel
  //
  copy_kernel_vectorized<<< gridDim, blockDim >>>(
    tiled_tensor_S,
    tiled_tensor_D,
    thr_layout,
    vec_layout);
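
  // For comparison, the simpler non-vectorized kernel defined above could be launched with the
  // same grid and block configuration (a sketch; it is not exercised in this example):
  //
  //   copy_kernel<<< gridDim, blockDim >>>(tiled_tensor_S, tiled_tensor_D, thr_layout);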

  cudaError result = cudaDeviceSynchronize();
  if (result != cudaSuccess) {
    std::cerr << "CUDA Runtime error: " << cudaGetErrorString(result) << std::endl;
    return -1;
  }

  //
  // Verify
  //

  h_D = d_D;

  int32_t errors = 0;
  int32_t const kErrorLimit = 10;

  for (size_t i = 0; i < h_D.size(); ++i) {
    if (h_S[i] != h_D[i]) {
      std::cerr << "Error. S[" << i << "]: " << h_S[i] << ", D[" << i << "]: " << h_D[i] << std::endl;

      if (++errors >= kErrorLimit) {
        std::cerr << "Aborting after " << kErrorLimit << " errors." << std::endl;
        return -1;
      }
    }
  }

  std::cout << "Success." << std::endl;

  return 0;
}