550 lines
		
	
	
		
			19 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			550 lines
		
	
	
		
			19 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| /***************************************************************************************************
 | |
|  * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 | |
|  * SPDX-License-Identifier: BSD-3-Clause
 | |
|  *
 | |
|  * Redistribution and use in source and binary forms, with or without
 | |
|  * modification, are permitted provided that the following conditions are met:
 | |
|  *
 | |
|  * 1. Redistributions of source code must retain the above copyright notice, this
 | |
|  * list of conditions and the following disclaimer.
 | |
|  *
 | |
|  * 2. Redistributions in binary form must reproduce the above copyright notice,
 | |
|  * this list of conditions and the following disclaimer in the documentation
 | |
|  * and/or other materials provided with the distribution.
 | |
|  *
 | |
|  * 3. Neither the name of the copyright holder nor the names of its
 | |
|  * contributors may be used to endorse or promote products derived from
 | |
|  * this software without specific prior written permission.
 | |
|  *
 | |
|  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | |
|  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | |
|  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 | |
|  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 | |
|  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | |
|  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | |
|  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | |
|  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | |
|  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 | |
|  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | |
|  *
 | |
|  **************************************************************************************************/
 | |
| /*! \file
 | |
|     \brief Unit tests for thread-level GEMM
 | |
| */
 | |
| 
 | |
| #include "../../common/cutlass_unit_test.h"
 | |
| 
 | |
| #include "cutlass/aligned_buffer.h"
 | |
| #include "cutlass/half.h"
 | |
| #include "cutlass/platform/platform.h"
 | |
| 
 | |
| #include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
 | |
| #include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h"
 | |
| 
 | |
| #include "cutlass/util/tensor_view_io.h"
 | |
| #include "cutlass/util/host_tensor.h"
 | |
| #include "cutlass/util/reference/host/tensor_fill.h"
 | |
| 
 | |
| ///////////////////////////////////////////////////////////////////////////////////////////////////
 | |
| 
 | |
| /// Prototype algorithm for partitioning a 4D space across warps to achieve several performance
 | |
| /// objectives:
 | |
| ///
 | |
| ///   - coalesced memory accesses in units of 128 Byte lines
 | |
| ///   - minimal address arithmetic
 | |
| ///   - minimal predicate calculations
 | |
| ///
 | |
| struct OutputTileThreadMapExpr {
 | |
| 
 | |
|   struct Shape {
 | |
|     int column;
 | |
|     int row;
 | |
|     int group;
 | |
|     int cluster;
 | |
| 
 | |
|     Shape(int col = 1, int r = 1, int g = 1, int c = 1): 
 | |
|       column(col), row(r), group(g), cluster(c) { }
 | |
|   };
 | |
| 
 | |
|   int const kWarpSize = 32;
 | |
|   int const kMemoryAccessSize = 256;  // size in bytes of the preferred memory access size
 | |
| 
 | |
|   //
 | |
|   // Data members
 | |
|   //
 | |
| 
 | |
|   Shape shape;
 | |
|   Shape count;
 | |
|   int threads;
 | |
|   int warp_count;
 | |
|   int elements_per_access;
 | |
|   int element_size;
 | |
| 
 | |
|   Shape iterations;
 | |
|   Shape delta;
 | |
|   Shape warp_partitions;
 | |
| 
 | |
|   int access_width_in_vectors;
 | |
|   int access_rows;
 | |
| 
 | |
|   //
 | |
|   // Methods
 | |
|   //
 | |
| 
 | |
|   OutputTileThreadMapExpr(
 | |
|     Shape shape_,
 | |
|     Shape count_,
 | |
|     int threads_,
 | |
|     int elements_per_access_,
 | |
|     int element_size_
 | |
|   ):
 | |
|     shape(shape_), 
 | |
|     count(count_), 
 | |
|     threads(threads_), 
 | |
|     warp_count(threads_ / kWarpSize),
 | |
|     elements_per_access(elements_per_access_), 
 | |
|     element_size(element_size_) {
 | |
| 
 | |
|     int warps_remaining = warp_count;
 | |
| 
 | |
|     // clusters
 | |
|     if (shape.cluster > warp_count) {
 | |
|       iterations.cluster = shape.cluster / warp_count;
 | |
|       delta.cluster = shape.row * count.row * shape.group * count.group * shape.cluster / iterations.cluster;
 | |
|       warps_remaining = 1;
 | |
|       warp_partitions.cluster = warp_count;
 | |
|     }
 | |
|     else {
 | |
|       iterations.cluster = 1;
 | |
|       delta.cluster = 1;
 | |
|       warps_remaining = warp_count / shape.cluster;
 | |
|       warp_partitions.cluster = warps_remaining;
 | |
|     }
 | |
| 
 | |
|     // group size
 | |
|     if (shape.group > warps_remaining) {
 | |
|       iterations.group = shape.group / warps_remaining;
 | |
|       delta.group = shape.row * count.row * shape.group / iterations.group;
 | |
|       warps_remaining = 1;
 | |
|       warp_partitions.group = warps_remaining;
 | |
|     }
 | |
|     else {
 | |
|       iterations.group = 1;
 | |
|       delta.group = 1;
 | |
|       warps_remaining = warps_remaining / shape.group;
 | |
|       warp_partitions.group = warps_remaining;
 | |
|     }
 | |
| 
 | |
|     // Number of rows in a group
 | |
|     if (shape.row > warps_remaining) {
 | |
|       
 | |
|       // We must cover this shape within a warp
 | |
|       int shape_row = shape.row / warps_remaining;
 | |
|       int shape_width_vectors = shape.column / elements_per_access;
 | |
| 
 | |
|       // We would still like to minimize the number of strided increments. We can accomplish this
 | |
|       // by arranging the memory instructions as 2D, 128B wide accesses.
 | |
| 
 | |
|       int target_memory_access_width = kMemoryAccessSize / (elements_per_access * element_size / 8);
 | |
|       int target_rows_per_access = kWarpSize / target_memory_access_width;
 | |
| 
 | |
|       if (target_rows_per_access > shape_row) {
 | |
|         access_rows = shape_row;
 | |
|         access_width_in_vectors = kWarpSize / access_rows;
 | |
|       }
 | |
|       else {
 | |
| 
 | |
|         access_width_in_vectors = cutlass::platform::min(
 | |
|           shape_width_vectors, 
 | |
|           cutlass::platform::min(kWarpSize, kMemoryAccessSize / (elements_per_access * element_size / 8)));
 | |
| 
 | |
|         access_rows = cutlass::platform::min(shape_row, kWarpSize / access_width_in_vectors);
 | |
|       }
 | |
| 
 | |
|       iterations.row = shape_row / access_rows;
 | |
|       delta.row = access_rows;
 | |
| 
 | |
|       iterations.column = shape_width_vectors / access_width_in_vectors;
 | |
|       delta.column = access_width_in_vectors * elements_per_access;
 | |
|       
 | |
|       warp_partitions.column = 1;
 | |
|       warp_partitions.row = 1;
 | |
|     }
 | |
|     else {
 | |
|       iterations.row = 1;
 | |
|       delta.row = 1;
 | |
|       iterations.column = (shape.column / elements_per_access) / kWarpSize;
 | |
|       delta.column = kWarpSize * elements_per_access;
 | |
| 
 | |
|       access_width_in_vectors = kWarpSize;
 | |
|       access_rows = 1;
 | |
| 
 | |
|       warp_partitions.row = 1;
 | |
|       warp_partitions.column = warps_remaining;
 | |
|     }
 | |
|   }
 | |
| };
 | |
| 
 | |
| /////////////////////////////////////////////////////////////////////////////////////////////////
 | |
| 
 | |
| std::ostream & operator<<(std::ostream &out, OutputTileThreadMapExpr::Shape const &shape) {
 | |
|   out << "col: " << shape.column << ", r: " << shape.row << ", g: " << shape.group << ", c: " << shape.cluster;
 | |
|   return out;
 | |
| }
 | |
| 
 | |
| std::ostream & operator<<(std::ostream &out, OutputTileThreadMapExpr const &map) {
 | |
|   out 
 | |
|     << "  shape(" << map.shape  << ")\n"
 | |
|     << "  count(" << map.count << ")\n"
 | |
|     << "  iterations(" << map.iterations << ")\n"
 | |
|     << "  delta(" << map.delta << ")\n"
 | |
|     << "  warps(" << map.warp_partitions << ")\n"
 | |
|     << "  access(width: " << map.access_width_in_vectors 
 | |
|       << ", rows: " << map.access_rows
 | |
|       << ") x v" << map.elements_per_access
 | |
|       << ".b" << map.element_size << "\n";
 | |
| 
 | |
|   return out;
 | |
| }
 | |
| 
 | |
| 
 | |
| /////////////////////////////////////////////////////////////////////////////////////////////////
 | |
| 
 | |
| template <
 | |
|   typename Shape,
 | |
|   typename Count,
 | |
|   int Threads,
 | |
|   int ElementsPerAccess,
 | |
|   int ElementSize
 | |
| >
 | |
| struct ThreadMapTestbed {
 | |
|   ThreadMapTestbed() {
 | |
|     OutputTileThreadMapExpr map(
 | |
|       { Shape::kColumn, Shape::kRow, Shape::kGroup, Shape::kCluster },
 | |
|       { Count::kColumn, Count::kRow, Count::kGroup, Count::kCluster },
 | |
|       Threads,
 | |
|       ElementsPerAccess,
 | |
|       ElementSize
 | |
|     );
 | |
| 
 | |
|     using ThreadMap = cutlass::epilogue::threadblock::OutputTileOptimalThreadMap<
 | |
|       Shape,
 | |
|       Count,
 | |
|       Threads,
 | |
|       ElementsPerAccess,
 | |
|       ElementSize
 | |
|     >;
 | |
| 
 | |
|     using CompactThreadmap = typename ThreadMap::CompactedThreadMap;
 | |
|     
 | |
|     bool const kVerbose = false;
 | |
| 
 | |
|     if (kVerbose) {
 | |
| 
 | |
|       std::cout << map << std::endl;
 | |
| 
 | |
|       std::cout << "ThreadMap::warps remaining:\n"
 | |
|         << "  for groups: " << ThreadMap::Detail::kWarpsRemainingForGroups << "\n"
 | |
|         << "    for rows: " << ThreadMap::Detail::kWarpsRemainingForRows << "\n";
 | |
| 
 | |
|       std::cout << "ThreadMap::Access:\n"
 | |
|         << " width: " << ThreadMap::Detail::kAccessWidth << "\n"
 | |
|         << "  rows: " << ThreadMap::Detail::kAccessRows << "\n";
 | |
| 
 | |
|       std::cout << "ThreadMap::RowArrangement::Iterations:\n"
 | |
|         << "  row: " << int(ThreadMap::Detail::RowArrangement::kIterationsRow) << "\n";
 | |
|     }
 | |
| 
 | |
|     EXPECT_EQ(int(ThreadMap::Delta::kCluster), map.delta.cluster);
 | |
|     EXPECT_EQ(int(ThreadMap::Delta::kGroup), map.delta.group);
 | |
|     EXPECT_EQ(int(ThreadMap::Delta::kRow), map.delta.row);
 | |
|     EXPECT_EQ(int(ThreadMap::Delta::kColumn), map.delta.column);
 | |
| 
 | |
|     EXPECT_EQ(int(ThreadMap::Iterations::kCluster), map.iterations.cluster);
 | |
|     EXPECT_EQ(int(ThreadMap::Iterations::kGroup), map.iterations.group);
 | |
|     EXPECT_EQ(int(ThreadMap::Iterations::kRow), map.iterations.row);
 | |
|     EXPECT_EQ(int(ThreadMap::Iterations::kColumn), map.iterations.column);
 | |
| 
 | |
|     if (kVerbose) {
 | |
|       std::cout << "Iterations(col: " << ThreadMap::Iterations::kColumn
 | |
|         << ", r: " << ThreadMap::Iterations::kRow
 | |
|         << ", g: " << ThreadMap::Iterations::kGroup
 | |
|         << ", c: " << ThreadMap::Iterations::kCluster << ")\n";
 | |
| 
 | |
|       std::cout << "Delta(col: " << ThreadMap::Delta::kColumn
 | |
|         << ", r: " << ThreadMap::Delta::kRow
 | |
|         << ", g: " << ThreadMap::Delta::kGroup
 | |
|         << ", c: " << ThreadMap::Delta::kCluster << ")\n";
 | |
| 
 | |
|       for (int tid = 0; tid < Threads; ++tid) {
 | |
|         auto output_coord = ThreadMap::initial_offset(tid);
 | |
|         auto source_coord = CompactThreadmap::initial_offset(tid);
 | |
| 
 | |
|         std::cout << "T" << tid << " - output: " << output_coord << ", source: " << source_coord << "\n";
 | |
|       }
 | |
|     }
 | |
|   }
 | |
| };
 | |
| 
 | |
| /////////////////////////////////////////////////////////////////////////////////////////////////
 | |
| 
 | |
| TEST(ThreadMap, f16_tensor_op_64x64_64x64x8) {
 | |
| 
 | |
|   using Shape = cutlass::epilogue::threadblock::OutputTileShape<64, 8, 1, 1, 1>;
 | |
|   using Count = cutlass::epilogue::threadblock::OutputTileShape<1, 8, 1, 1, 1>;
 | |
|   int const kThreads = 32;
 | |
|   int const kElementsPerAccess = 8;
 | |
|   int const kElementSize = 16;
 | |
| 
 | |
|   ThreadMapTestbed<Shape, Count, kThreads, kElementsPerAccess, kElementSize>();
 | |
| }
 | |
| 
 | |
| 
 | |
| TEST(ThreadMap, f16_tensor_op_128x128_64x64x8) {
 | |
| 
 | |
|   using Shape = cutlass::epilogue::threadblock::OutputTileShape<128, 8, 2, 1, 1>;
 | |
|   using Count = cutlass::epilogue::threadblock::OutputTileShape<1, 8, 1, 1, 1>;
 | |
|   int const kThreads = 128;
 | |
|   int const kElementsPerAccess = 8;
 | |
|   int const kElementSize = 16;
 | |
| 
 | |
|   ThreadMapTestbed<Shape, Count, kThreads, kElementsPerAccess, kElementSize>();
 | |
| }
 | |
| 
 | |
| TEST(ThreadMap, f16_tensor_op_256x128_64x64x8) {
 | |
| 
 | |
|   using Shape = cutlass::epilogue::threadblock::OutputTileShape<128, 8, 4, 1, 1>;
 | |
|   using Count = cutlass::epilogue::threadblock::OutputTileShape<1, 8, 2, 1, 1>;
 | |
|   int const kThreads = 256;
 | |
|   int const kElementsPerAccess = 8;
 | |
|   int const kElementSize = 16;
 | |
| 
 | |
|   ThreadMapTestbed<Shape, Count, kThreads, kElementsPerAccess, kElementSize>();
 | |
| }
 | |
| 
 | |
| TEST(ThreadMap, f16_tensor_op_128x256_64x64x8) {
 | |
| 
 | |
|   using Shape = cutlass::epilogue::threadblock::OutputTileShape<256, 8, 2, 1, 1>;
 | |
|   using Count = cutlass::epilogue::threadblock::OutputTileShape<1, 8, 2, 1, 1>;
 | |
|   int const kThreads = 256;
 | |
|   int const kElementsPerAccess = 8;
 | |
|   int const kElementSize = 16;
 | |
| 
 | |
|   ThreadMapTestbed<Shape, Count, kThreads, kElementsPerAccess, kElementSize>();
 | |
| }
 | |
| 
 | |
| TEST(ThreadMap, f16_tensor_op_128x64_64x32x8) {
 | |
| 
 | |
|   using Shape = cutlass::epilogue::threadblock::OutputTileShape<64, 8, 2, 1, 1>;
 | |
|   using Count = cutlass::epilogue::threadblock::OutputTileShape<1, 8, 2, 1, 1>;
 | |
|   int const kThreads = 128;
 | |
|   int const kElementsPerAccess = 8;
 | |
|   int const kElementSize = 16;
 | |
| 
 | |
|   ThreadMapTestbed<Shape, Count, kThreads, kElementsPerAccess, kElementSize>();
 | |
| }
 | |
| 
 | |
| TEST(ThreadMap, f16_tensor_op_64x128_128x64x8) {
 | |
| 
 | |
|   using Shape = cutlass::epilogue::threadblock::OutputTileShape<128, 8, 1, 1, 1>;
 | |
|   using Count = cutlass::epilogue::threadblock::OutputTileShape<1, 8, 2, 1, 1>;
 | |
|   int const kThreads = 128;
 | |
|   int const kElementsPerAccess = 8;
 | |
|   int const kElementSize = 16;
 | |
| 
 | |
|   ThreadMapTestbed<Shape, Count, kThreads, kElementsPerAccess, kElementSize>();
 | |
| }
 | |
| 
 | |
| TEST(ThreadMap, f32_tensor_op_64x64_64x64x8) {
 | |
| 
 | |
|   using Shape = cutlass::epilogue::threadblock::OutputTileShape<64, 8, 1, 1, 1>;
 | |
|   using Count = cutlass::epilogue::threadblock::OutputTileShape<1, 8, 1, 1, 1>;
 | |
|   int const kThreads = 32;
 | |
|   int const kElementsPerAccess = 4;
 | |
|   int const kElementSize = 32;
 | |
| 
 | |
|   ThreadMapTestbed<Shape, Count, kThreads, kElementsPerAccess, kElementSize>();
 | |
| }
 | |
| 
 | |
| TEST(ThreadMap, f32_tensor_op_128x128_64x64x8) {
 | |
| 
 | |
|   using Shape = cutlass::epilogue::threadblock::OutputTileShape<128, 8, 2, 1, 1>;
 | |
|   using Count = cutlass::epilogue::threadblock::OutputTileShape<1, 8, 2, 1, 1>;
 | |
|   int const kThreads = 128;
 | |
|   int const kElementsPerAccess = 4;
 | |
|   int const kElementSize = 32;
 | |
| 
 | |
|   ThreadMapTestbed<Shape, Count, kThreads, kElementsPerAccess, kElementSize>();
 | |
| }
 | |
| 
 | |
| TEST(ThreadMap, f32_tensor_op_256x128_64x64x8) {
 | |
| 
 | |
|   using Shape = cutlass::epilogue::threadblock::OutputTileShape<128, 8, 4, 1, 1>;
 | |
|   using Count = cutlass::epilogue::threadblock::OutputTileShape<1, 8, 2, 1, 1>;
 | |
|   int const kThreads = 256;
 | |
|   int const kElementsPerAccess = 4;
 | |
|   int const kElementSize = 32;
 | |
| 
 | |
|   ThreadMapTestbed<Shape, Count, kThreads, kElementsPerAccess, kElementSize>();
 | |
| }
 | |
| 
 | |
| TEST(ThreadMap, f32_tensor_op_128x256_64x64x8) {
 | |
| 
 | |
|   using Shape = cutlass::epilogue::threadblock::OutputTileShape<256, 8, 2, 1, 1>;
 | |
|   using Count = cutlass::epilogue::threadblock::OutputTileShape<1, 8, 2, 1, 1>;
 | |
|   int const kThreads = 256;
 | |
|   int const kElementsPerAccess = 4;
 | |
|   int const kElementSize = 32;
 | |
| 
 | |
|   ThreadMapTestbed<Shape, Count, kThreads, kElementsPerAccess, kElementSize>();
 | |
| }
 | |
| 
 | |
| TEST(ThreadMap, f32_tensor_op_128x64_64x32x8) {
 | |
| 
 | |
|   using Shape = cutlass::epilogue::threadblock::OutputTileShape<64, 8, 2, 1, 1>;
 | |
|   using Count = cutlass::epilogue::threadblock::OutputTileShape<1, 8, 2, 1, 1>;
 | |
|   int const kThreads = 128;
 | |
|   int const kElementsPerAccess = 4;
 | |
|   int const kElementSize = 32;
 | |
| 
 | |
|   ThreadMapTestbed<Shape, Count, kThreads, kElementsPerAccess, kElementSize>();
 | |
| }
 | |
| 
 | |
| TEST(ThreadMap, f32_tensor_op_64x128_128x64x8) {
 | |
| 
 | |
|   using Shape = cutlass::epilogue::threadblock::OutputTileShape<128, 8, 1, 1, 1>;
 | |
|   using Count = cutlass::epilogue::threadblock::OutputTileShape<1, 8, 2, 1, 1>;
 | |
|   int const kThreads = 128;
 | |
|   int const kElementsPerAccess = 4;
 | |
|   int const kElementSize = 32;
 | |
| 
 | |
|   ThreadMapTestbed<Shape, Count, kThreads, kElementsPerAccess, kElementSize>();
 | |
| }
 | |
| 
 | |
| /////////////////////////////////////////////////////////////////////////////////////////////////
 | |
| 
 | |
| TEST(ThreadMap, f32_volta_tensor_op_64x64_64x64x8) {
 | |
| 
 | |
|   using Shape = cutlass::epilogue::threadblock::OutputTileShape<64, 2, 4, 1, 1>;
 | |
|   using Count = cutlass::epilogue::threadblock::OutputTileShape<1, 4, 2, 1, 1>;
 | |
|   int const kThreads = 32;
 | |
|   int const kElementsPerAccess = 4;
 | |
|   int const kElementSize = 32;
 | |
| 
 | |
|   ThreadMapTestbed<Shape, Count, kThreads, kElementsPerAccess, kElementSize>();
 | |
| }
 | |
| 
 | |
| TEST(ThreadMap, f32_volta_tensor_op_64x128_64x64x8) {
 | |
| 
 | |
|   using Shape = cutlass::epilogue::threadblock::OutputTileShape<128, 2, 4, 1, 1>;
 | |
|   using Count = cutlass::epilogue::threadblock::OutputTileShape<1, 4, 2, 1, 1>;
 | |
|   int const kThreads = 64;
 | |
|   int const kElementsPerAccess = 4;
 | |
|   int const kElementSize = 32;
 | |
| 
 | |
|   ThreadMapTestbed<Shape, Count, kThreads, kElementsPerAccess, kElementSize>();
 | |
| }
 | |
| 
 | |
| TEST(ThreadMap, f32_volta_tensor_op_128x64_64x64x8) {
 | |
| 
 | |
|   using Shape = cutlass::epilogue::threadblock::OutputTileShape<64, 2, 4, 2, 1>;
 | |
|   using Count = cutlass::epilogue::threadblock::OutputTileShape<1, 4, 2, 1, 1>;
 | |
|   int const kThreads = 64;
 | |
|   int const kElementsPerAccess = 4;
 | |
|   int const kElementSize = 32;
 | |
| 
 | |
|   ThreadMapTestbed<Shape, Count, kThreads, kElementsPerAccess, kElementSize>();
 | |
| }
 | |
| 
 | |
| TEST(ThreadMap, f32_volta_tensor_op_128x64_64x32x8) {
 | |
| 
 | |
|   using Shape = cutlass::epilogue::threadblock::OutputTileShape<64, 2, 4, 2, 1>;
 | |
|   using Count = cutlass::epilogue::threadblock::OutputTileShape<1, 4, 2, 1, 1>;
 | |
|   int const kThreads = 128;
 | |
|   int const kElementsPerAccess = 4;
 | |
|   int const kElementSize = 32;
 | |
| 
 | |
|   ThreadMapTestbed<Shape, Count, kThreads, kElementsPerAccess, kElementSize>();
 | |
| }
 | |
| 
 | |
| TEST(ThreadMap, f32_volta_tensor_op_128x128_64x64x8) {
 | |
| 
 | |
|   using Shape = cutlass::epilogue::threadblock::OutputTileShape<128, 2, 4, 2, 1>;
 | |
|   using Count = cutlass::epilogue::threadblock::OutputTileShape<1, 4, 2, 1, 1>;
 | |
|   int const kThreads = 128;
 | |
|   int const kElementsPerAccess = 4;
 | |
|   int const kElementSize = 32;
 | |
| 
 | |
|   ThreadMapTestbed<Shape, Count, kThreads, kElementsPerAccess, kElementSize>();
 | |
| }
 | |
| 
 | |
| TEST(ThreadMap, f32_volta_tensor_op_128x256_64x64x8) {
 | |
| 
 | |
|   using Shape = cutlass::epilogue::threadblock::OutputTileShape<256, 2, 4, 2, 1>;
 | |
|   using Count = cutlass::epilogue::threadblock::OutputTileShape<1, 4, 2, 1, 1>;
 | |
|   int const kThreads = 256;
 | |
|   int const kElementsPerAccess = 4;
 | |
|   int const kElementSize = 32;
 | |
| 
 | |
|   ThreadMapTestbed<Shape, Count, kThreads, kElementsPerAccess, kElementSize>();
 | |
| }
 | |
| 
 | |
| TEST(ThreadMap, f32_volta_tensor_op_256x128_64x64x8) {
 | |
| 
 | |
|   using Shape = cutlass::epilogue::threadblock::OutputTileShape<128, 2, 4, 4, 1>;
 | |
|   using Count = cutlass::epilogue::threadblock::OutputTileShape<1, 4, 2, 1, 1>;
 | |
|   int const kThreads = 256;
 | |
|   int const kElementsPerAccess = 4;
 | |
|   int const kElementSize = 32;
 | |
| 
 | |
|   ThreadMapTestbed<Shape, Count, kThreads, kElementsPerAccess, kElementSize>();
 | |
| }
 | |
| 
 | |
| /////////////////////////////////////////////////////////////////////////////////////////////////
 | |
| 
 | |
| TEST(ThreadMap, simt_32x64_32x64x1) {
 | |
| 
 | |
|   using Shape = cutlass::epilogue::threadblock::OutputTileShape<64, 1, 4, 1, 1>;
 | |
|   using Count = cutlass::epilogue::threadblock::OutputTileShape<1, 4, 2, 1, 1>;
 | |
|   int const kThreads = 32;
 | |
|   int const kElementsPerAccess = 1;
 | |
|   int const kElementSize = 32;
 | |
| 
 | |
|   ThreadMapTestbed<Shape, Count, kThreads, kElementsPerAccess, kElementSize>();
 | |
| }
 | |
| 
 | |
| TEST(ThreadMap, simt_32x128_32x64x1) {
 | |
| 
 | |
|   using Shape = cutlass::epilogue::threadblock::OutputTileShape<128, 1, 4, 1, 1>;
 | |
|   using Count = cutlass::epilogue::threadblock::OutputTileShape<1, 4, 2, 1, 1>;
 | |
|   int const kThreads = 64;
 | |
|   int const kElementsPerAccess = 1;
 | |
|   int const kElementSize = 32;
 | |
| 
 | |
|   ThreadMapTestbed<Shape, Count, kThreads, kElementsPerAccess, kElementSize>();
 | |
| }
 | |
| 
 | |
| TEST(ThreadMap, simt_64x128_32x64x1) {
 | |
| 
 | |
|   using Shape = cutlass::epilogue::threadblock::OutputTileShape<128, 1, 4, 2, 1>;
 | |
|   using Count = cutlass::epilogue::threadblock::OutputTileShape<1, 4, 2, 1, 1>;
 | |
|   int const kThreads = 128;
 | |
|   int const kElementsPerAccess = 1;
 | |
|   int const kElementSize = 32;
 | |
| 
 | |
|   ThreadMapTestbed<Shape, Count, kThreads, kElementsPerAccess, kElementSize>();
 | |
| }
 | |
| 
 | |
| TEST(ThreadMap, simt_128x128_32x64x1) {
 | |
| 
 | |
|   using Shape = cutlass::epilogue::threadblock::OutputTileShape<128, 1, 4, 4, 1>;
 | |
|   using Count = cutlass::epilogue::threadblock::OutputTileShape<1, 4, 2, 1, 1>;
 | |
|   int const kThreads = 256;
 | |
|   int const kElementsPerAccess = 1;
 | |
|   int const kElementSize = 32;
 | |
| 
 | |
|   ThreadMapTestbed<Shape, Count, kThreads, kElementsPerAccess, kElementSize>();
 | |
| }
 | |
| 
 | |
| /////////////////////////////////////////////////////////////////////////////////////////////////
 | 
