cutlass/examples/04_tile_iterator/tile_iterator.cu

/***************************************************************************************************
 * Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

/*
  This example demonstrates how to use the TileIterator in CUTLASS to load data from addressable
  memory, and store it back into addressable memory.

  TileIterator is a core concept in CUTLASS that enables efficient loading and storing of data from
  and to addressable memory. The TileIterator accepts a TileTraits type, which defines the shape of a 
  tile and the distribution of accesses by individual entities, either threads or others.

  In this example, a TileLoadIterator is used to load elements from a tile in global memory, stored in
  column-major layout, into a fragment, and a corresponding TileStoreIterator is used to store the
  elements back into global memory (in the same column-major layout).

  https://devblogs.nvidia.com/cutlass-linear-algebra-cuda/

  This example uses CUTLASS utilities to ease the matrix operations.
*/

// Standard Library includes
#include <iostream>
#include <sstream>
#include <vector>

// CUTLASS includes
#include "cutlass/tile_iterator.h"
#include "cutlass/tile_traits_standard.h"

//
// CUTLASS utility includes
//

// Defines operator<<() to write TensorView objects to std::ostream
#include "tools/util/tensor_view_io.h"

// Defines cutlass::HostMatrix<>
#include "tools/util/host_matrix.h"

// Defines cutlass::reference::device::TensorInitialize()
#include "tools/util/reference/device/tensor_elementwise.h"

// Defines cutlass::reference::host::TensorEquals()
#include "tools/util/reference/host/tensor_elementwise.h"

///////////////////////////////////////////////////////////////////////////////////////////////////
//
// Kernel that round-trips one tile through a fragment.
//
// A tile load iterator reads the tile described by Traits from `input` into a fragment, and a
// matching tile store iterator writes that fragment to `output`. Both iterators are set up with
// the same strides, so the output keeps the column-major arrangement of the input.
//
//   input   - source elements in global memory
//   output  - destination elements in global memory
//   M       - number of rows; also the distance (in elements) between consecutive columns
//   K       - number of columns
//
// Loads are guarded by predicates built from the bounds <1, K, M>. The store is issued without
// predicates.
// NOTE(review): the launch presumably has to supply the thread count the Traits type was
// instantiated with -- confirm against the Traits definition.
//
///////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Traits>
__global__ void cutlass_tile_iterator_load_store_global(
  float const *input,
  float *output,
  int M,
  int K) {

    // Iterator that loads float elements from global memory. Traits fixes the tile shape and
    // how accesses are distributed; post-increment advances along the strided (kH) dimension.
    typedef cutlass::TileLoadIterator<
        Traits,
        float,
        cutlass::IteratorAdvance::kH,
        cutlass::MemorySpace::kGlobal
        > LoadIterator;

    // Iterator that stores float elements into global memory, configured like the load iterator.
    typedef cutlass::TileStoreIterator<
        Traits,
        float,
        cutlass::IteratorAdvance::kH,
        cutlass::MemorySpace::kGlobal
        > StoreIterator;

    // Statically sized vector of boolean predicates used to guard the loads.
    typedef typename LoadIterator::PredicateVector LoadPredicates;

    // Per-thread storage for the elements moved by one load/store.
    typedef typename LoadIterator::Fragment TileFragment;

    // Strides shared by both iterators (column-major mapping): the whole tile spans M*K
    // elements, consecutive columns are M elements apart, and elements within a column are
    // adjacent.
    int const tile_stride = M * K;
    int const column_stride = M;
    int const element_stride = 1;

    // Parameters of the load iterator: reads from the input matrix.
    typename LoadIterator::Params source_params;
    source_params.initialize(input, tile_stride, column_stride, element_stride);

    // Parameters of the store iterator: writes to the output matrix.
    typename StoreIterator::Params destination_params;
    destination_params.initialize(output, tile_stride, column_stride, element_stride);

    // Build both iterators from their parameters.
    LoadIterator source_iterator(source_params);
    StoreIterator destination_iterator(destination_params);

    // Set up the load predicates with bounds <1, K, M> so out-of-bounds loads are suppressed.
    LoadPredicates guards;
    source_iterator.initialize_predicates(guards.begin(), cutlass::make_Coord(1, K, M));

    // Load one tile into the fragment, then store the same fragment back out. Each call also
    // advances its iterator to the next tile's position.
    TileFragment tile;
    source_iterator.load_post_increment(tile, guards.begin());
    destination_iterator.store_post_increment(tile);
}


///////////////////////////////////////////////////////////////////////////////////////////////////

// Launches the cutlass_tile_iterator_load_store_global kernel on a 128-by-8 tile and checks, on
// the host, that the output matrix matches the values written into the input matrix.
//
// Returns:
//   cudaSuccess      - the kernel ran and every output element has the expected value
//   cudaErrorUnknown - the kernel ran but an output element did not match (the first mismatch
//                      is printed to std::cout)
//   other            - the error reported by the kernel launch or by cudaDeviceSynchronize()
cudaError_t test_cutlass_tile_iterator() {
  cudaError_t result = cudaSuccess;

  // Creating a M-by-K (128-by-8) tile for this example.
  static int const M = 128;
  static int const K = 8;
  // The kernel is launched with 128 threads per thread block.
  static int const kThreadsPerThreadBlock = 128;
  // Define the tile type. Its extents are taken from K and M so that the tile shape cannot
  // drift away from the matrix dimensions used below.
  typedef cutlass::Shape<1, K, M> Tile;

  // CUTLASS provides a standard TileTraits type, which chooses the 'best' shape to enable warp
  // raking along the contiguous dimension if possible.
  typedef cutlass::TileTraitsStandard<Tile, kThreadsPerThreadBlock> Traits;

  // M-by-K input matrix of float
  cutlass::HostMatrix<float> input(cutlass::MatrixCoord(M, K));

  // M-by-K output matrix of float
  cutlass::HostMatrix<float> output(cutlass::MatrixCoord(M, K));

  //
  // Initialize input matrix with linear combination.
  //

  cutlass::Distribution dist;

  // Linear distribution in column-major format. The verification below expects element (i, j)
  // to hold M*j + i + 1.
  dist.set_linear(1, 1, M);

  // Arbitrary RNG seed value. Hard-coded for deterministic results.
  int seed = 2080;

  cutlass::reference::device::TensorInitialize(
    input.device_view(),                                // concept: TensorView
    seed,
    dist);

  // Initialize output matrix to all zeroes.
  output.fill(0);

  // Launch kernel to load and store tiles from/to global memory.
  cutlass_tile_iterator_load_store_global<Traits><<<
      dim3(1, 1, 1),
      dim3(kThreadsPerThreadBlock, 1)
    >>>(input.device_data(), output.device_data(), M, K);

  // A kernel launch does not return a status; launch failures (e.g. an invalid configuration)
  // have to be queried explicitly.
  result = cudaGetLastError();

  if (result != cudaSuccess) {
    return result;
  }

  // Wait for the kernel to finish; errors raised during execution surface here.
  result = cudaDeviceSynchronize();

  if (result != cudaSuccess) {
    return result;
  }

  // Copy results to host
  output.sync_host();

  // Verify results. Stop at the first mismatch so that a broken run reports a single failure
  // instead of one per remaining row.
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < K; ++j) {
      int const expected = M * j + i + 1;
      float const actual = output.at(cutlass::make_Coord(i, j));

      if (actual != float(expected)) {
        std::cout << "FAILED: (" << i << ", " << j
                  << ") -- expected: " << expected
                  << ", actual: " << actual
                  << std::endl;
        return cudaErrorUnknown;
      }
    }
  }

  return result;
}

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Entry point to tile_iterator example.
//
// usage:
//
//   04_tile_iterator
//
// The command-line arguments are not used. Returns 0 if the tile iterator test passes and -1 if
// the device properties cannot be queried or the test fails.
//
int main(int argc, const char *arg[]) {

  // Properties of CUDA device
  cudaDeviceProp device_properties;

  // Assume the device id is 0.
  int device_id = 0;

  // Querying the properties doubles as a check that device 0 is usable.
  cudaError_t result = cudaGetDeviceProperties(&device_properties, device_id);
  if (result != cudaSuccess) {
    std::cerr << "Failed to get device properties: "
      << cudaGetErrorString(result) << std::endl;
    return -1;
  }


  //
  // Run the CUTLASS tile iterator test.
  //

  result = test_cutlass_tile_iterator();

  if (result == cudaSuccess) {
    std::cout << "Passed." << std::endl;
  }
  else {
    // Report why the example is about to exit with a failure code, rather than exiting silently.
    std::cerr << "Failed: " << cudaGetErrorString(result) << std::endl;
  }

  // Exit.
  return result == cudaSuccess ? 0 : -1;
}

///////////////////////////////////////////////////////////////////////////////////////////////////
Checkpointing CUTLASS 1.1 release. 2018-09-19 07:58:03 +08:00			`/***************************************************************************************************`
			`* Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.`
			`*`
			`* Redistribution and use in source and binary forms, with or without modification, are permitted`
			`* provided that the following conditions are met:`
			`* * Redistributions of source code must retain the above copyright notice, this list of`
			`* conditions and the following disclaimer.`
			`* * Redistributions in binary form must reproduce the above copyright notice, this list of`
			`* conditions and the following disclaimer in the documentation and/or other materials`
			`* provided with the distribution.`
			`* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used`
			`* to endorse or promote products derived from this software without specific prior written`
			`* permission.`
			`*`
			`* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR`
			`* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND`
			`* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE`
			`* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,`
			`* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;`
			`* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,`
			`* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*`
			`**************************************************************************************************/`

			`/*`
			`This example demonstrates how to use the TileIterator in CUTLASS to load data from addressable`
			`memory, and store it back into addressable memory.`

			`TileIterator is a core concept in CUTLASS that enables efficient loading and storing of data from`
			`and to addressable memory. The TileIterator accepts a TileTraits type, which defines the shape of a`
			`tile and the distribution of accesses by individual entities, either threads or others.`

			`In this example, a LoadTileIterator is used to load elements from a tile in global memory, stored in`
			`column-major layout, into a fragment, and a corresponding StoreTileIterator is used to store the`
			`elements back into global memory (in the same column-major layout).`

			`https://devblogs.nvidia.com/cutlass-linear-algebra-cuda/`

			`This example uses CUTLASS utilities to ease the matrix operations.`
			`*/`

			`// Standard Library includes`
			`#include <iostream>`
			`#include <sstream>`
			`#include <vector>`

			`// CUTLASS includes`
			`#include "cutlass/tile_iterator.h"`
			`#include "cutlass/tile_traits_standard.h"`

			`//`
			`// CUTLASS utility includes`
			`//`

			`// Defines operator<<() to write TensorView objects to std::ostream`
			`#include "tools/util/tensor_view_io.h"`

			`// Defines cutlass::HostMatrix<>`
			`#include "tools/util/host_matrix.h"`

			`// Defines cutlass::reference::device::TensorInitialize()`
			`#include "tools/util/reference/device/tensor_elementwise.h"`

			`// Defines cutlass::reference::host::TensorEquals()`
			`#include "tools/util/reference/host/tensor_elementwise.h"`

			`///////////////////////////////////////////////////////////////////////////////////////////////////`
			`//`
			`// This function defines load and store tile iterators to load and store a M-by-K tile, in`
			`// column-major layout, from and back into global memory.`
			`//`
			`///////////////////////////////////////////////////////////////////////////////////////////////////`

			`template <typename Traits>`
			`__global__ void cutlass_tile_iterator_load_store_global(`
			`float const *input,`
			`float *output,`
			`int M,`
			`int K) {`

			`// Define a tile load iterator`
			`typedef cutlass::TileLoadIterator<`
			`Traits, // the Traits type, defines shape/distribution of accesses`
			`float, // elements are of type float`
			`cutlass::IteratorAdvance::kH, // post-increment accesses advance in strided (as opposed to`
			`// contiguous dimension`
			`cutlass::MemorySpace::kGlobal // iterator loads from global memory`
			`> TileLoadIterator;`

			`// Defines a tile store iterator`
			`typedef cutlass::TileStoreIterator<`
			`Traits, // the Traits type, defines shape/distribution of accesses`
			`float, // elements are of type float`
			`cutlass::IteratorAdvance::kH, // post-increment accesses advance in strided (as opposed to`
			`// contiguous) dimension`
			`cutlass::MemorySpace::kGlobal // iterator stores into global memory`
			`> TileStoreIterator;`

			`// Defines a predicate vector for managing statically sized vector of boolean predicates`
			`typedef typename TileLoadIterator::PredicateVector PredicateVector;`

			`// The parameters specified to the iterators. These include the pointer to the source of`
			`// addressable memory, and the strides and increments for each of the tile's dimensions`
			`typename TileLoadIterator::Params load_params;`
			`typename TileStoreIterator::Params store_params;`

			`// Initializing the parameters for both of the iterators. The TileLoadIterator accesses the`
			`// input matrix and TileStoreIterator accesses the output matrix. The strides are set`
			`// identically since the data is being stored in the same way as it is loaded (column-major`
			`// mapping).`
			`load_params.initialize(input, M*K, M, 1);`
			`store_params.initialize(output, M*K, M, 1);`

			`// Constructing the tile load and store iterators, and the predicates vector`
			`TileLoadIterator load_iterator(load_params);`
			`TileStoreIterator store_iterator(store_params);`
			`PredicateVector predicates;`

			`// Initializing the predicates with bounds set to <1, K, M>. This protects out-of-bounds loads.`
			`load_iterator.initialize_predicates(predicates.begin(), cutlass::make_Coord(1, K, M));`

			`// The fragment in which the elements are loaded into and stored from.`
			`typename TileLoadIterator::Fragment fragment;`

			`// Loading a tile into a fragment and advancing to the next tile's position`
			`load_iterator.load_post_increment(fragment, predicates.begin());`
			`// Storing a tile from fragment and advancing to the next tile's position`
			`store_iterator.store_post_increment(fragment);`
			`}`


			`///////////////////////////////////////////////////////////////////////////////////////////////////`

			`// Launches cutlass_tile_iterator_load_store_global kernel`
			`cudaError_t test_cutlass_tile_iterator() {`
			`cudaError_t result = cudaSuccess;`

			`// Creating a M-by-K (128-by-8) tile for this example.`
			`static int const M = 128;`
			`static int const K = 8;`
			`// The kernel is launched with 128 threads per thread block.`
			`static int const kThreadsPerThreadBlock = 128;`
			`// Define the tile type`
			`typedef cutlass::Shape<1, 8, 128> Tile;`

			`// CUTLASS provides a standard TileTraits type, which chooses the 'best' shape to enable warp`
			`// raking along the contiguous dimension if possible.`
			`typedef cutlass::TileTraitsStandard<Tile, kThreadsPerThreadBlock> Traits;`

			`// M-by-K input matrix of float`
			`cutlass::HostMatrix<float> input(cutlass::MatrixCoord(M, K));`

			`// M-by-K output matrix of float`
			`cutlass::HostMatrix<float> output(cutlass::MatrixCoord(M, K));`

			`//`
			`// Initialize input matrix with linear combination.`
			`//`

			`cutlass::Distribution dist;`

			`// Linear distribution in column-major format.`
			`dist.set_linear(1, 1, M);`

			`// Arbitrary RNG seed value. Hard-coded for deterministic results.`
			`int seed = 2080;`

			`cutlass::reference::device::TensorInitialize(`
			`input.device_view(), // concept: TensorView`
			`seed,`
			`dist);`

			`// Initialize output matrix to all zeroes.`
			`output.fill(0);`

			`// Launch kernel to load and store tiles from/to global memory.`
			`cutlass_tile_iterator_load_store_global<Traits><<<`
			`dim3(1, 1, 1),`
			`dim3(kThreadsPerThreadBlock, 1)`
			`>>>(input.device_data(), output.device_data(), M, K);`

			`result = cudaDeviceSynchronize();`

			`if (result != cudaSuccess) {`
			`return result;`
			`}`

			`// Copy results to host`
			`output.sync_host();`

			`// Verify results`
			`for(int i = 0; i < M; ++i) {`
			`for(int j = 0; j < K; ++j) {`
			`if(output.at(cutlass::make_Coord(i, j)) != float(M*j+i+1)){`
			`std::cout << "FAILED: (" << i << ", " << j`
			`<< ") -- expected: " << (M*j+i+1)`
			`<< ", actual: " << output.at(cutlass::make_Coord(i, j))`
			`<< std::endl;`
			`result = cudaErrorUnknown;`
			`break;`
			`}`
			`}`
			`}`

			`return result;`
			`}`

			`///////////////////////////////////////////////////////////////////////////////////////////////////`

			`/// Entry point to tile_iterator example.`
			`//`
			`// usage:`
			`//`
			`// 04_tile_iterator`
			`//`
			`int main(int argc, const char *arg[]) {`

			`// Properties of CUDA device`
			`cudaDeviceProp device_properties;`

			`// Assumne the device id is 0.`
			`int device_id = 0;`

			`cudaError_t result = cudaGetDeviceProperties(&device_properties, device_id);`
			`if (result != cudaSuccess) {`
			`std::cerr << "Failed to get device properties: "`
			`<< cudaGetErrorString(result) << std::endl;`
			`return -1;`
			`}`


			`//`
			`// Run the CUTLASS tile iterator test.`
			`//`

			`result = test_cutlass_tile_iterator();`

			`if (result == cudaSuccess) {`
			`std::cout << "Passed." << std::endl;`
			`}`

			`// Exit.`
			`return result == cudaSuccess ? 0 : -1;`
			`}`

			`///////////////////////////////////////////////////////////////////////////////////////////////////`