Cutlass
CUDA Templates for Linear Algebra Subroutines and Solvers
gemm_shared_tile.h
Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
28 #pragma once
29 
31 
32 namespace cutlass {
33 namespace gemm {
34 
36 
37 template <typename Scalar_, typename Tile_, typename Threads_, int kScalarsPerSts_>
42  typedef Scalar_* Pointer;
46  typedef Threads_ Threads;
48  typedef Shape<0, ShapeCount<Tile>::kWc, Tile::kC, kScalarsPerSts_> ThreadsStrides;
50  static int const kSkew = 0;
52  static int const kAccessSize = kScalarsPerSts_;
55 
57  typedef Shape<1,
58  Tile::kH / Threads::kH,
59  Tile::kW / Threads::kW,
60  Tile::kC / Threads::kC / kAccessSize>
67 
68  struct ThreadOffset {
70  Coord<4> operator()() const {
72  return make_Coord(0, 0, offset, 0);
73  }
74  };
75 };
76 
78 
79 template <typename Scalar_, typename Tile_, typename Threads_, int kScalarsPerSts_, int kSkew_>
84  typedef Scalar_* Pointer;
89  kScalarsPerSts_>::Tile Tile;
91  typedef Threads_ Threads;
93  static int const kSkew = kSkew_;
95  static int const kAccessSize = kScalarsPerSts_;
98 
100  typedef Shape<1, TileWithoutSkew::kH / Threads::kW, TileWithoutSkew::kW / Threads::kH> Iterations;
105 
106  struct ThreadOffset {
110  return make_Coord(0, 0, offset, 0);
111  }
112  };
113 
114  protected:
117 };
118 
120 
121 template <typename Scalar_,
122  typename OutputTile_,
123  typename Warps_,
124  typename ThreadsPerWarp_,
125  typename InstructionShape_,
126  int kStages_,
127  int kScalarsPerLds_,
128  int kSkew_ = 0>
134  typedef Scalar_* Pointer;
136  typedef Shape<kStages_,
137  OutputTile_::kD / InstructionShape_::kD,
138  GetExtent<kOperand, OutputTile_>::kExtent * InstructionShape_::kD>
147  typedef Warps_ Warps;
149  typedef ThreadsPerWarp_ ThreadsPerWarp;
151  // static int const kScalarsPerLds = kScalarsPerLds_;
152  static int const kAccessSize = kScalarsPerLds_;
154  static int const kSkew = kSkew_;
157 
162 
164  typedef Shape<1, 1, TileWithoutSkew::kW / kWarps / kThreadsPerWarp /* / kScalarsPerLds*/>
171 
173  struct ThreadOffset {
176  // Extract the warp.
177  int const warp = threadIdx.x / kWarpSize % Warps::kW;
178  // Compute the row offset for each thread
179  int const lane = (threadIdx.x & 0x0e) / 2;
180  // The offset.
181  int const offset = (warp * ThreadsPerWarp::kW + lane) * kAccessSize;
182 
183  return make_Coord(0, 0, offset, 0);
184  }
185  };
186 };
187 
189 
190 template <typename Scalar_,
191  typename OutputTile_,
192  typename Warps_,
193  typename ThreadsPerWarp_,
194  typename InstructionShape_,
195  int kStages_,
196  int kScalarsPerLds_,
197  int kSkew_ = 0>
203  typedef Scalar_* Pointer;
205  typedef Shape<kStages_,
206  OutputTile_::kD / InstructionShape_::kD,
207  GetExtent<kOperand, OutputTile_>::kExtent * InstructionShape_::kD>
216  typedef Warps_ Warps;
218  typedef ThreadsPerWarp_ ThreadsPerWarp;
220  static int const kAccessSize = kScalarsPerLds_;
222  static int const kSkew = kSkew_;
225 
230 
232  typedef Shape<1, 1, TileWithoutSkew::kW / kWarps / kThreadsPerWarp /* / kAccessSize*/> Iterations;
238 
240  struct ThreadOffset {
243  // The position of the warp.
244  int const warp = threadIdx.x / (Warps::kW * kWarpSize);
245 
246  // Compute the column offset for each thread
247  int const lane = (threadIdx.x & 0x10) / 8 + (threadIdx.x & 0x01);
248  // The offset.
249  int const offset = (warp * ThreadsPerWarp::kH + lane) * kAccessSize;
250 
251  return make_Coord(0, 0, offset, 0);
252  }
253  };
254 };
255 
257 
258 template <typename Scalar_,
259  typename OutputTile_,
260  typename Warps_,
261  typename ThreadsPerWarp_,
262  int kScalarsPerSts_,
263  int kSkew_ = 0>
268  typedef Scalar_* Pointer;
270  typedef OutputTile_ OutputTile;
272  typedef Warps_ Warps;
274  typedef ThreadsPerWarp_ ThreadsPerWarp;
276  static int const kAccessSize = kScalarsPerSts_;
278  static int const kSkew = kSkew_;
281 
283  static int const kScalarsPerThread = OutputTile_::kW / Warps::kW / ThreadsPerWarp::kW;
285  static int const kThreads = ShapeCount<Warps>::kCount * kWarpSize;
287  static int const kScalarsPerRow = kThreads / 2 * kScalarsPerThread + kSkew;
288 
297 
299  struct ThreadOffset {
302  // We issue STS.128 in the epilogue to store the accumulators to shared memory. When we use
303  // STS.128, we have to guarantee that threads in groups of 8 do not have bank conflicts (i.e.,
304  // they write to different banks).
305 
306  // Odd threads go to the second half of shared memory.
307  int const row = threadIdx.x & 0x01;
308 
309  int const warp_id = (threadIdx.x >> 5);
310 
311  int const warp_row = (warp_id % Warps::kW);
312  int const warp_col = (warp_id / Warps::kW);
313 
314  int hi_halfwarp_offset = OutputTile::kW * ((threadIdx.x >> 4) & 1);
315  int lo_halfwarp_offset = (((threadIdx.x >> 1) & 0x7) + warp_row * ThreadsPerWarp::kW);
316 
317  int col = kAccessSize * lo_halfwarp_offset +
318  warp_col * (ThreadsPerWarp::kH / 2) * OutputTile::kW + hi_halfwarp_offset;
319 
320  int offset = row * kScalarsPerRow + col;
321  return make_Coord(0, 0, offset, 0);
322  }
323  };
324 };
325 
327 
328 template <typename Scalar_,
329  typename OutputTile_,
330  typename Warps_,
331  typename ThreadsPerWarp_,
332  int kTileH_,
333  int kScalarsPerLds_,
334  int kSkew_ = 0>
339  typedef Scalar_* Pointer;
341  typedef OutputTile_ OutputTile;
343  typedef Warps_ Warps;
345  typedef ThreadsPerWarp_ ThreadsPerWarp;
347  static int const kAccessSize = kScalarsPerLds_;
349  static int const kSkew = kSkew_;
352 
354  static int const kScalarsPerThread = OutputTile_::kW / Warps::kW / ThreadsPerWarp::kW;
356  static int const kThreads = ShapeCount<Warps>::kCount * kWarpSize;
358  static int const kScalarsPerRow = kThreads / 2 * kScalarsPerThread + kSkew;
359 
362 
363  // Compute the number of iterations per warp in the Tile::kH dimension.
364  static int const kIterationsInHPerWarp = kTileH_ / ShapeCount<Warps>::kCount;
365 
366  // As shown above, the shared memory tile is composed of 2 rows and each row is made of
367  // kScalarsPerRow. A warp is expected to read from the 1st row, then move to the 2nd row and go
368  // back to the 1st row. To model that scheme we define the Iterations shape as Shape<X, 2, ...>.
369  // However, in some cases, we have only 1 iteration per warp. In that case, we must define the
370  // shape as Shape<1, 1, ...>. The following code does that.
371  static int const kIterationsH = kIterationsInHPerWarp == 1 ? 1 : 2;
372  // As soon as we know kIterationsH, it is trivial to compute kIterationsD:
374 
376  typedef Shape<kIterationsD, kIterationsH, OutputTile::kW / kWarpSize / kAccessSize> Iterations;
381 
383  struct ThreadOffset {
386  // Each warp works on a different column.
387  int const h = threadIdx.x / kWarpSize;
388  // Compute the row.
389  int const w = (threadIdx.x & (kWarpSize - 1)) * kAccessSize;
390  int offset = 0;
391  if (Iterations::kH == 1) {
392  int const row = h & 0x1;
393  int const col = h / 2;
394  offset = row * ShapeCount<Tile>::kWc + col * OutputTile::kW * Iterations::kD + w;
395  } else {
396  offset = h * OutputTile::kW * Iterations::kD + w;
397  }
398  return make_Coord(0, 0, offset, 0);
399  }
400  };
401 };
402 
404 
405 } // namespace gemm
406 } // namespace cutlass
static int const kAccessSize
The number of scalars per STS.
Definition: gemm_shared_tile.h:95
static CUTLASS_DEVICE int get()
Definition: shape.h:253
ReshapeTile< TileWithSkew, kScalarsPerLds_ >::Tile Tile
The tile.
Definition: gemm_shared_tile.h:214
ReshapeTile< TileWithSkew, kScalarsPerLds_ >::Tile Tile
The tile.
Definition: gemm_shared_tile.h:145
ReshapeTile< TileWithoutSkew_, kScalarsPerLds_ >::Tile TileWithoutSkew
The tile without skew after reshaping.
Definition: gemm_shared_tile.h:212
static MemorySpace::Kind const kMemorySpace
The memory space.
Definition: gemm_shared_tile.h:351
static int const kScalarsPerThread
The number of scalars per thread.
Definition: gemm_shared_tile.h:354
Definition: load_store.h:42
CUTLASS_HOST_DEVICE Coord< 4 > operator()() const
Definition: gemm_shared_tile.h:242
Shape< 1, 1, TileWithoutSkew::kW/kWarps/kThreadsPerWarp > Iterations
The number of iterations needed to load/store the tile.
Definition: gemm_shared_tile.h:232
Definition: convert.h:33
static int const kWarps
The number of warps.
Definition: gemm_shared_tile.h:227
Definition: gemm_shared_tile.h:129
Scalar_ * Pointer
The pointer.
Definition: gemm_shared_tile.h:42
static int const kScalarsPerRow
The number of scalars per row. We build a tile with 2 rows (to avoid bank conflicts).
Definition: gemm_shared_tile.h:287
T type
Definition: platform.h:369
platform::remove_const< Scalar_ >::type Scalar
The scalar.
Definition: gemm_shared_tile.h:132
Shape< 1, 1, kScalarsPerThread/kAccessSize > Iterations
The number of iterations needed to store the tile.
Definition: gemm_shared_tile.h:292
static int const kAccessSize
The number of scalars per LDS.
Definition: gemm_shared_tile.h:347
ThreadsPerWarp_ ThreadsPerWarp
The threads in a warp.
Definition: gemm_shared_tile.h:149
Definition: reshape_tile.h:42
CUTLASS_HOST_DEVICE Coord< 1 > make_Coord(int _0)
Helper to make a 1-element coordinate.
Definition: coord.h:241
Shape< 0, ShapeCount< Tile >::kWc, Tile::kC, kScalarsPerSts_ > ThreadsStrides
The strides to compute the base position of the thread.
Definition: gemm_shared_tile.h:48
static int const kAccessSize
The number of scalars per STS.
Definition: gemm_shared_tile.h:276
Shape< kIterationsD, kIterationsH, OutputTile::kW/kWarpSize/kAccessSize > Iterations
The number of iterations needed to store the tile.
Definition: gemm_shared_tile.h:376
static int const kSkew
The skew.
Definition: gemm_shared_tile.h:349
Warps_ Warps
The number of warps.
Definition: gemm_shared_tile.h:216
static int const kSkew
The skew.
Definition: gemm_shared_tile.h:222
Definition: gemm_shared_tile.h:38
platform::remove_const< Scalar_ >::type Scalar
The scalar.
Definition: gemm_shared_tile.h:201
Computes the thread offset in (H, W) based on thread ID.
Definition: gemm_shared_tile.h:383
Definition: gemm_shared_tile.h:198
static MemorySpace::Kind const kMemorySpace
The memory space.
Definition: gemm_shared_tile.h:156
platform::remove_const< Scalar_ >::type Scalar
The scalar.
Definition: gemm_shared_tile.h:40
static GemmOperand::Kind const kOperand
Definition: gemm_shared_tile.h:130
static MemorySpace::Kind const kMemorySpace
The memory space.
Definition: gemm_shared_tile.h:280
Kind
Definition: load_store.h:40
Shape< kStages_, TileWithoutSkew_::kH, TileWithoutSkew_::kW+kSkew_ > TileWithSkew
The tile with skew.
Definition: gemm_shared_tile.h:210
static int const kAccessSize
The number of scalars per LDS.
Definition: gemm_shared_tile.h:152
static int const kH
The height of the cube.
Definition: shape.h:68
Shape< 1, Tile::kH/Threads::kH, Tile::kW/Threads::kW, Tile::kC/Threads::kC/kAccessSize > Iterations
The number of iterations needed to load/store the tile.
Definition: gemm_shared_tile.h:61
static int const kSkew
The skew.
Definition: gemm_shared_tile.h:93
Shape< 1, 1, TileWithoutSkew::kW/kWarps/kThreadsPerWarp > Iterations
The number of iterations needed to load/store the tile.
Definition: gemm_shared_tile.h:165
OutputTile_ OutputTile
The dimension of the output tile.
Definition: gemm_shared_tile.h:270
static int const kScalarsPerRow
The number of scalars per row. We build a tile with 2 rows (to avoid bank conflicts).
Definition: gemm_shared_tile.h:358
Scalar_ * Pointer
The pointer.
Definition: gemm_shared_tile.h:203
Scalar_ * Pointer
The pointer.
Definition: gemm_shared_tile.h:134
Scalar_ * Pointer
The pointer.
Definition: gemm_shared_tile.h:268
static int const kScalarsPerThread
The number of scalars per thread.
Definition: gemm_shared_tile.h:283
Shape< OutputTile::kW, kScalarsPerRow, kWarpSize *kAccessSize > ImmediateOffsetStrides
The strides in each dimension between different loads/stores.
Definition: gemm_shared_tile.h:380
CUTLASS_HOST_DEVICE Coord< 4 > operator()() const
Definition: gemm_shared_tile.h:301
static MemorySpace::Kind const kMemorySpace
The memory space.
Definition: gemm_shared_tile.h:54
static int const kSkew
The skew.
Definition: gemm_shared_tile.h:50
static int const kThreadsPerWarp
The number of threads in one dimension of the warp.
Definition: gemm_shared_tile.h:229
Computes the thread offset in (H, W) based on thread ID.
Definition: gemm_shared_tile.h:240
Shape< 0, ShapeCount< Tile >::kWc, Threads::kH *kAccessSize > ImmediateOffsetStrides
The strides in each dimension between different loads/stores.
Definition: gemm_shared_tile.h:104
Shape< 1, 2, kScalarsPerRow/kAccessSize, kAccessSize > Tile
The tile.
Definition: gemm_shared_tile.h:290
static int const kAccessSize
The number of scalars per STS.
Definition: gemm_shared_tile.h:52
ReshapeTile< Tile_, kScalarsPerSts_ >::Tile Tile
The tile.
Definition: gemm_shared_tile.h:44
static int const kIterationsInHPerWarp
Definition: gemm_shared_tile.h:364
static int const kSkew
The skew.
Definition: gemm_shared_tile.h:278
ReshapeTile< TileWithoutSkew_, kScalarsPerLds_ >::Tile TileWithoutSkew
The tile without skew after reshaping.
Definition: gemm_shared_tile.h:143
Defines constant expressions for mapping GEMM problem size and strides onto pitch-linear memory...
Shape< 0, Threads::kH *ShapeCount< Tile >::kWc, Threads::kW *kAccessSize > ImmediateOffsetStrides
The strides in each dimension between different loads/stores.
Definition: gemm_shared_tile.h:66
Shape< TileWithSkew::kW, 0, kWarps *kThreadsPerWarp *kAccessSize, 0 > ImmediateOffsetStrides
The strides in each dimension between different loads/stores.
Definition: gemm_shared_tile.h:170
ReshapeTile< Shape< Tile_::kD, Tile_::kH, Tile_::kW+kSkew_ >, kScalarsPerSts_ >::Tile Tile
The tile.
Definition: gemm_shared_tile.h:89
Shape< 0, kScalarsPerSts_, ShapeCount< Tile >::kHwc/Threads::kW > ThreadsStrides
The strides to compute the base position of the thread.
Definition: gemm_shared_tile.h:116
ReshapeTile< Tile_, kScalarsPerSts_ >::Tile TileWithoutSkew
The tile without skews.
Definition: gemm_shared_tile.h:86
static int const kIterationsD
Definition: gemm_shared_tile.h:373
static int const kWarps
The number of warps.
Definition: gemm_shared_tile.h:159
Definition: matrix_traits.h:43
ThreadsPerWarp_ ThreadsPerWarp
The threads in the warps.
Definition: gemm_shared_tile.h:274
Computes the thread offset in (H, W) based on thread ID.
Definition: gemm_shared_tile.h:173
Shape< kStages_, OutputTile_::kD/InstructionShape_::kD, GetExtent< kOperand, OutputTile_ >::kExtent *InstructionShape_::kD > TileWithoutSkew_
The tile without skew.
Definition: gemm_shared_tile.h:139
Definition: gemm_shared_tile.h:335
Threads_ Threads
The threads.
Definition: gemm_shared_tile.h:91
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:46
OutputTile_ OutputTile
The dimension of the output tile.
Definition: gemm_shared_tile.h:341
platform::remove_const< Scalar_ >::type Scalar
The scalar.
Definition: gemm_shared_tile.h:82
Shape< TileWithSkew::kW, 0, kWarps *kThreadsPerWarp *kAccessSize, 0 > Delta
The strides in each dimension between different loads/stores.
Definition: gemm_shared_tile.h:167
Shape< 0, 0, Warps::kW *ThreadsPerWarp::kW *kAccessSize > ImmediateOffsetStrides
The strides in each dimension between different loads/stores.
Definition: gemm_shared_tile.h:296
static GemmOperand::Kind const kOperand
Definition: gemm_shared_tile.h:199
Shape< 1, 2, kScalarsPerRow/kAccessSize, kAccessSize > Tile
The tile.
Definition: gemm_shared_tile.h:361
static int const kThreadsPerWarp
The number of threads in one dimension of the warp.
Definition: gemm_shared_tile.h:161
A Shape implementing Layout Concept describing the dimensions of a cube.
Definition: shape.h:64
Scalar_ * Pointer
The pointer.
Definition: gemm_shared_tile.h:84
Shape< OutputTile::kW, kScalarsPerRow, kWarpSize *kAccessSize > Delta
The strides in each dimension between different loads/stores.
Definition: gemm_shared_tile.h:378
Shape< 1, TileWithoutSkew::kH/Threads::kW, TileWithoutSkew::kW/Threads::kH > Iterations
The number of iterations needed to load/store the tile.
Definition: gemm_shared_tile.h:100
Shape< kStages_, OutputTile_::kD/InstructionShape_::kD, GetExtent< kOperand, OutputTile_ >::kExtent *InstructionShape_::kD > TileWithoutSkew_
The tile without skew.
Definition: gemm_shared_tile.h:208
Threads_ Threads
The threads.
Definition: gemm_shared_tile.h:46
Definition: gemm_operand.h:50
Shape< 0, Threads::kH *ShapeCount< Tile >::kWc, Threads::kW *kAccessSize > Delta
The strides in each dimension between different loads/stores.
Definition: gemm_shared_tile.h:63
static int const kThreads
The number of threads.
Definition: gemm_shared_tile.h:356
Warps_ Warps
The number of warps.
Definition: gemm_shared_tile.h:147
static MemorySpace::Kind const kMemorySpace
The memory space.
Definition: gemm_shared_tile.h:97
static MemorySpace::Kind const kMemorySpace
The memory space.
Definition: gemm_shared_tile.h:224
CUTLASS_HOST_DEVICE Coord< 4 > operator()() const
Definition: gemm_shared_tile.h:70
CUTLASS_HOST_DEVICE Coord< 4 > operator()() const
Definition: gemm_shared_tile.h:175
static int const kD
The depth of the cube.
Definition: shape.h:66
Computes the thread offset in (H, W) based on thread ID.
Definition: gemm_shared_tile.h:299
Warps_ Warps
The warps in the tile.
Definition: gemm_shared_tile.h:343
Tile_ Tile
Definition: reshape_tile.h:43
Shape< 0, ShapeCount< Tile >::kWc, Threads::kH *kAccessSize > Delta
The strides in each dimension between different loads/stores.
Definition: gemm_shared_tile.h:102
static int const kIterationsH
Definition: gemm_shared_tile.h:371
Shape< 0, 0, Warps::kW *ThreadsPerWarp::kW *kAccessSize > Delta
The strides in each dimension between different loads/stores.
Definition: gemm_shared_tile.h:294
Kind
Definition: matrix_traits.h:43
static int const kSkew
The skew.
Definition: gemm_shared_tile.h:154
ThreadsPerWarp_ ThreadsPerWarp
The threads in the warps.
Definition: gemm_shared_tile.h:345
Definition: matrix_traits.h:43
Scalar_ * Pointer
The pointer.
Definition: gemm_shared_tile.h:339
static int const kThreads
The number of threads.
Definition: gemm_shared_tile.h:285
Shape< TileWithSkew::kW, 0, kWarps *kThreadsPerWarp *kAccessSize, 0 > Delta
The strides in each dimension between different loads/stores.
Definition: gemm_shared_tile.h:234
ThreadsPerWarp_ ThreadsPerWarp
The threads in a warp.
Definition: gemm_shared_tile.h:218
platform::remove_const< Scalar_ >::type Scalar
The scalar.
Definition: gemm_shared_tile.h:266
Computes derived counts of a Layout Concept based class.
Definition: shape.h:79
Shape< kStages_, TileWithoutSkew_::kH, TileWithoutSkew_::kW+kSkew_ > TileWithSkew
The tile with skew.
Definition: gemm_shared_tile.h:141
Warps_ Warps
The warps in the tile.
Definition: gemm_shared_tile.h:272
CUTLASS_HOST_DEVICE Coord< 4 > operator()() const
Definition: gemm_shared_tile.h:108
platform::remove_const< Scalar_ >::type Scalar
The scalar.
Definition: gemm_shared_tile.h:337
CUTLASS_HOST_DEVICE Coord< 4 > operator()() const
Definition: gemm_shared_tile.h:385
Shape< TileWithSkew::kW, 0, kWarps *kThreadsPerWarp *kAccessSize, 0 > ImmediateOffsetStrides
The strides in each dimension between different loads/stores.
Definition: gemm_shared_tile.h:237
Definition: gemm_shared_tile.h:264
static int const kAccessSize
The number of scalars per LDS.
Definition: gemm_shared_tile.h:220