From bb35a3ba6f674a24686f25579f914b7b6692c5d7 Mon Sep 17 00:00:00 2001 From: "mengchi.hmc" Date: Thu, 22 Apr 2021 15:20:57 +0800 Subject: [PATCH] support setting load granularity for conv2d fprop --- .../conv/kernel/default_conv2d_fprop.h | 333 ++++-------------- ...ctivation_tile_access_iterator_optimized.h | 9 +- ...op_filter_tile_access_iterator_optimized.h | 9 +- 3 files changed, 86 insertions(+), 265 deletions(-) diff --git a/include/cutlass/conv/kernel/default_conv2d_fprop.h b/include/cutlass/conv/kernel/default_conv2d_fprop.h index 030e5ca5..88096b8e 100644 --- a/include/cutlass/conv/kernel/default_conv2d_fprop.h +++ b/include/cutlass/conv/kernel/default_conv2d_fprop.h @@ -66,10 +66,10 @@ template < int Stages, typename MathOperatorTag, conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic, - /// whether Matrix A is 128b aligned - bool AlignedA = true, - /// whether Matrix B is 128b aligned - bool AlignedB = true, + /// Access granularity of A matrix in units of elements + int AlignmentA = 128 / cutlass::sizeof_bits::value, + /// Access granularity of B matrix in units of elements + int AlignmentB = 128 / cutlass::sizeof_bits::value, conv::StrideSupport StrideSupport = StrideSupport::kStrided > struct DefaultConv2dFprop; @@ -94,7 +94,9 @@ template < typename EpilogueOutputOp, typename ThreadblockSwizzle, int Stages, - typename MathOperatorTag + typename MathOperatorTag, + int AlignmentA, + int AlignmentB > struct DefaultConv2dFprop < ElementA, @@ -113,7 +115,9 @@ struct DefaultConv2dFprop < ThreadblockSwizzle, Stages, MathOperatorTag, - IteratorAlgorithm::kAnalytic + IteratorAlgorithm::kAnalytic, + AlignmentA, + AlignmentB > { // Define the core components from GEMM @@ -197,7 +201,9 @@ template < typename ThreadblockSwizzle, int Stages, typename MathOperatorTag, - int InterleavedK + int InterleavedK, + int AlignmentA, + int AlignmentB > struct DefaultConv2dFprop < ElementA, @@ -216,7 +222,9 @@ struct DefaultConv2dFprop < ThreadblockSwizzle, Stages, MathOperatorTag, - IteratorAlgorithm::kAnalytic + IteratorAlgorithm::kAnalytic, + AlignmentA, + AlignmentB > { // Define the core components from GEMM @@ -312,7 +320,9 @@ template < typename InstructionShape, typename EpilogueOutputOp, typename ThreadblockSwizzle, - typename MathOperatorTag + typename MathOperatorTag, + int AlignmentA, + int AlignmentB > struct DefaultConv2dFprop < ElementA, @@ -331,7 +341,9 @@ struct DefaultConv2dFprop < ThreadblockSwizzle, 2, MathOperatorTag, - IteratorAlgorithm::kAnalytic + IteratorAlgorithm::kAnalytic, + AlignmentA, + AlignmentB > { // Define the core components from GEMM @@ -417,7 +429,9 @@ template < typename EpilogueOutputOp, typename ThreadblockSwizzle, typename MathOperatorTag, - int InterleavedK + int InterleavedK, + int AlignmentA, + int AlignmentB > struct DefaultConv2dFprop < ElementA, @@ -436,7 +450,9 @@ struct DefaultConv2dFprop < ThreadblockSwizzle, 2, MathOperatorTag, - IteratorAlgorithm::kAnalytic + IteratorAlgorithm::kAnalytic, + AlignmentA, + AlignmentB > { // Define the core components from GEMM @@ -520,7 +536,7 @@ struct DefaultConv2dFprop < ///////////////////////////////////////////////////////////////////////////////////////////////// /// Defines a kernel for Conv2dFprop specialzation for Optimzed IteratorAlgorithm and -/// multistage pipeline with unaligned data +/// multistage pipeline. template < typename ElementA, typename LayoutA, @@ -537,8 +553,8 @@ template < typename ThreadblockSwizzle, int Stages, typename MathOperatorTag, - bool AlignedA, - bool AlignedB + int AlignmentA, + int AlignmentB > struct DefaultConv2dFprop < ElementA, @@ -558,8 +574,8 @@ struct DefaultConv2dFprop < Stages, MathOperatorTag, IteratorAlgorithm::kOptimized, - AlignedA, - AlignedB + AlignmentA, + AlignmentB > { // Define the core components from GEMM @@ -577,7 +593,7 @@ struct DefaultConv2dFprop < ElementA, LayoutA, ThreadMapA, - AlignedA + AlignmentA >; using SmemIteratorA = typename MmaCore::SmemIteratorA; @@ -590,7 +606,7 @@ struct DefaultConv2dFprop < ElementB, LayoutB, ThreadMapB, - AlignedB + AlignmentB >; using SmemIteratorB = typename MmaCore::SmemIteratorB; @@ -607,114 +623,7 @@ struct DefaultConv2dFprop < arch::CacheOperation::Always, IteratorB, SmemIteratorB, - arch::CacheOperation::Global, - MmaPolicy, - Stages - >; - - // Define the epilogue - using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< - ThreadblockShape, - WarpMmaTensorOp, - 1, - EpilogueOutputOp, - EpilogueOutputOp::kCount - >::Epilogue; - - // Define the kernel - using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< - Mma, - Epilogue, - ThreadblockSwizzle, - conv::Operator::kFprop - >; -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// Defines a kernel for Conv2dFprop specialzation for Optimzed IteratorAlgorithm and -/// multistage pipeline. -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ElementAccumulator, - typename ArchTag, - typename ThreadblockShape, - typename WarpShape, - typename InstructionShape, - typename EpilogueOutputOp, - typename ThreadblockSwizzle, - int Stages, - typename MathOperatorTag -> -struct DefaultConv2dFprop < - ElementA, - LayoutA, - ElementB, - LayoutB, - ElementC, - LayoutC, - ElementAccumulator, - arch::OpClassTensorOp, - ArchTag, - ThreadblockShape, - WarpShape, - InstructionShape, - EpilogueOutputOp, - ThreadblockSwizzle, - Stages, - MathOperatorTag, - IteratorAlgorithm::kOptimized -> { - - // Define the core components from GEMM - using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< - ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, - ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, - Stages, MathOperatorTag - >; - - // Define iterators over tiles from the A operand - using ThreadMapA = typename MmaCore::IteratorThreadMapA; - using IteratorA = - cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< - cutlass::MatrixShape, - ElementA, - LayoutA, - ThreadMapA - >; - - using SmemIteratorA = typename MmaCore::SmemIteratorA; - - // Define iterators over tiles from the B operand - using ThreadMapB = typename MmaCore::IteratorThreadMapB; - using IteratorB = - cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< - cutlass::MatrixShape, - ElementB, - LayoutB, - ThreadMapB - >; - - using SmemIteratorB = typename MmaCore::SmemIteratorB; - - // Warp-level GEMM components - using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; - using MmaPolicy = typename MmaCore::MmaPolicy; - - // Define the Mma - using Mma = threadblock::ImplicitGemmMultistage< - ThreadblockShape, - IteratorA, - SmemIteratorA, arch::CacheOperation::Always, - IteratorB, - SmemIteratorB, - arch::CacheOperation::Global, MmaPolicy, Stages >; @@ -755,7 +664,9 @@ template < typename ThreadblockSwizzle, int Stages, typename MathOperatorTag, - int InterleavedK + int InterleavedK, + int AlignmentA, + int AlignmentB > struct DefaultConv2dFprop < ElementA, @@ -774,7 +685,9 @@ struct DefaultConv2dFprop < ThreadblockSwizzle, Stages, MathOperatorTag, - IteratorAlgorithm::kOptimized + IteratorAlgorithm::kOptimized, + AlignmentA, + AlignmentB > { // Define the core components from GEMM @@ -844,10 +757,8 @@ struct DefaultConv2dFprop < >; }; -///////////////////////////////////////////////////////////////////////////////////////////////// - /// Defines a kernel for Conv2dFprop specialzation for Optimized IteratorAlgorithm -/// and 2 stage pipeline with disalignment data +/// and 2 stage pipeline. template < typename ElementA, typename LayoutA, @@ -863,8 +774,8 @@ template < typename EpilogueOutputOp, typename ThreadblockSwizzle, typename MathOperatorTag, - bool AlignedA, - bool AlignedB + int AlignmentA, + int AlignmentB > struct DefaultConv2dFprop < ElementA, @@ -884,8 +795,8 @@ struct DefaultConv2dFprop < 2, MathOperatorTag, IteratorAlgorithm::kOptimized, - AlignedA, - AlignedB + AlignmentA, + AlignmentB > { // Define the core components from GEMM @@ -903,7 +814,7 @@ struct DefaultConv2dFprop < ElementA, LayoutA, ThreadMapA, - AlignedA + AlignmentA > >; @@ -918,115 +829,7 @@ struct DefaultConv2dFprop < ElementB, LayoutB, ThreadMapB, - AlignedB - > - >; - - using SmemIteratorB = typename MmaCore::SmemIteratorB; - - // Warp-level GEMM components - using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; - using MmaPolicy = typename MmaCore::MmaPolicy; - - // Define the Mma - using Mma = threadblock::ImplicitGemmPipelined< - ThreadblockShape, - IteratorA, - SmemIteratorA, - IteratorB, - SmemIteratorB, - ElementC, - LayoutC, - MmaPolicy - >; - - // Define the epilogue - using Epilogue = typename detail::DefaultConvEpilogue< - ArchTag, - ThreadblockShape, - WarpMmaTensorOp, - 1, - EpilogueOutputOp - >::Epilogue; - - // Define the kernel - using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< - Mma, - Epilogue, - ThreadblockSwizzle, - conv::Operator::kFprop - >; -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// Defines a kernel for Conv2dFprop specialzation for Optimized IteratorAlgorithm -/// and 2 stage pipeline. -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ElementAccumulator, - typename ArchTag, - typename ThreadblockShape, - typename WarpShape, - typename InstructionShape, - typename EpilogueOutputOp, - typename ThreadblockSwizzle, - typename MathOperatorTag -> -struct DefaultConv2dFprop < - ElementA, - LayoutA, - ElementB, - LayoutB, - ElementC, - LayoutC, - ElementAccumulator, - arch::OpClassTensorOp, - ArchTag, - ThreadblockShape, - WarpShape, - InstructionShape, - EpilogueOutputOp, - ThreadblockSwizzle, - 2, - MathOperatorTag, - IteratorAlgorithm::kOptimized -> { - - // Define the core components from GEMM - using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< - ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, - ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, - 2, MathOperatorTag>; - - // Define iterators over tiles from the A operand - using ThreadMapA = typename MmaCore::IteratorThreadMapA; - using IteratorA = - cutlass::conv::threadblock::TileIterator< - cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< - cutlass::MatrixShape, - ElementA, - LayoutA, - ThreadMapA - > - >; - - using SmemIteratorA = typename MmaCore::SmemIteratorA; - - // Define iterators over tiles from the B operand - using ThreadMapB = typename MmaCore::IteratorThreadMapB; - using IteratorB = - cutlass::conv::threadblock::TileIterator< - cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< - cutlass::MatrixShape, - ElementB, - LayoutB, - ThreadMapB + AlignmentB > >; @@ -1083,7 +886,9 @@ template < typename EpilogueOutputOp, typename ThreadblockSwizzle, typename MathOperatorTag, - int InterleavedK + int InterleavedK, + int AlignmentA, + int AlignmentB > struct DefaultConv2dFprop < ElementA, @@ -1102,7 +907,9 @@ struct DefaultConv2dFprop < ThreadblockSwizzle, 2, MathOperatorTag, - IteratorAlgorithm::kOptimized + IteratorAlgorithm::kOptimized, + AlignmentA, + AlignmentB > { // Define the core components from GEMM @@ -1194,7 +1001,9 @@ template < typename EpilogueOutputOp, typename ThreadblockSwizzle, int Stages, - typename MathOperatorTag + typename MathOperatorTag, + int AlignmentA, + int AlignmentB > struct DefaultConv2dFprop < ElementA, @@ -1213,7 +1022,9 @@ struct DefaultConv2dFprop < ThreadblockSwizzle, Stages, MathOperatorTag, - IteratorAlgorithm::kAnalytic + IteratorAlgorithm::kAnalytic, + AlignmentA, + AlignmentB > { // Define the core components from GEMM @@ -1299,7 +1110,9 @@ template < typename EpilogueOutputOp, typename ThreadblockSwizzle, int Stages, - typename MathOperatorTag + typename MathOperatorTag, + int AlignmentA, + int AlignmentB > struct DefaultConv2dFprop < ElementA, @@ -1318,7 +1131,9 @@ struct DefaultConv2dFprop < ThreadblockSwizzle, Stages, MathOperatorTag, - IteratorAlgorithm::kOptimized + IteratorAlgorithm::kOptimized, + AlignmentA, + AlignmentB > { // Define the core components from GEMM @@ -1404,7 +1219,9 @@ template < typename InstructionShape, typename EpilogueOutputOp, typename ThreadblockSwizzle, - typename MathOperatorTag + typename MathOperatorTag, + int AlignmentA, + int AlignmentB > struct DefaultConv2dFprop < ElementA, @@ -1423,7 +1240,9 @@ struct DefaultConv2dFprop < ThreadblockSwizzle, 2, MathOperatorTag, - IteratorAlgorithm::kAnalytic + IteratorAlgorithm::kAnalytic, + AlignmentA, + AlignmentB > { // Define the core components from GEMM @@ -1510,7 +1329,9 @@ template < typename InstructionShape, typename EpilogueOutputOp, typename ThreadblockSwizzle, - typename MathOperatorTag + typename MathOperatorTag, + int AlignmentA, + int AlignmentB > struct DefaultConv2dFprop < ElementA, @@ -1529,7 +1350,9 @@ struct DefaultConv2dFprop < ThreadblockSwizzle, 2, MathOperatorTag, - IteratorAlgorithm::kOptimized + IteratorAlgorithm::kOptimized, + AlignmentA, + AlignmentB > { // Define the core components from GEMM diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h index bb720cf7..9a272b05 100644 --- a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h @@ -61,7 +61,7 @@ template < typename Element_, typename Layout_, typename ThreadMap_, - bool Aligned = true + int AccessSize = ThreadMap_::kElementsPerAccess > class Conv2dFpropActivationTileAccessIteratorOptimized { public: @@ -75,7 +75,6 @@ public: using Layout = Layout_; using TensorCoord = typename Layout::TensorCoord; using ThreadMap = ThreadMap_; - static int const AccessSize = Aligned ? ThreadMap::kElementsPerAccess : 1; using AccessType = AlignedArray; using TensorRef = cutlass::TensorRef; using Index = typename Layout::Index; @@ -216,7 +215,7 @@ public: CUTLASS_PRAGMA_UNROLL for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) { - clear_mask_(filter_c_ + v_idx >= problem_size_.C, v_idx); + clear_mask_(filter_c_ + v_idx * AccessSize >= problem_size_.C, v_idx); } set_iteration_index(0); @@ -357,7 +356,7 @@ public: CUTLASS_PRAGMA_UNROLL for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) { - clear_mask_(filter_c_ + v_idx >= problem_size_.C, v_idx); + clear_mask_(filter_c_ + v_idx * AccessSize >= problem_size_.C, v_idx); } } @@ -420,7 +419,7 @@ public: static Status can_implement(Conv2dProblemSize const &problem_size) { // check alignment constraint on iterator's contiguous dimension - if (Aligned && problem_size.C % (128/sizeof_bits::value)) { + if (problem_size.C % AccessSize) { return Status::kErrorInvalidProblem; } diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h index 9781e42f..3229d6c5 100644 --- a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h @@ -61,7 +61,7 @@ template < typename Element_, typename Layout_, typename ThreadMap_, - bool Aligned = true + int AccessSize = ThreadMap_::kElementsPerAccess > class Conv2dFpropFilterTileAccessIteratorOptimized{ public: @@ -74,7 +74,6 @@ public: using Element = Element_; using Layout = Layout_; using ThreadMap = ThreadMap_; - static int const AccessSize = Aligned ? ThreadMap::kElementsPerAccess : 1; using AccessType = AlignedArray; using TensorRef = cutlass::TensorRef; using TensorCoord = typename Layout::TensorCoord; @@ -178,7 +177,7 @@ public: } for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) { - clear_mask_(filter_c_ + v_idx >= problem_size_.C, v_idx); + clear_mask_(filter_c_ + v_idx * AccessSize >= problem_size_.C, v_idx); } pointer_ += ( @@ -249,7 +248,7 @@ public: } for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) { - clear_mask_(filter_c_ + v_idx >= problem_size_.C, v_idx); + clear_mask_(filter_c_ + v_idx * AccessSize >= problem_size_.C, v_idx); } pointer_ += next; @@ -301,7 +300,7 @@ public: static Status can_implement(Conv2dProblemSize const &problem_size) { // check alignment constraint on iterator's contiguous dimension - if (Aligned && problem_size.C % (128/sizeof_bits::value)) { + if (problem_size.C % AccessSize) { return Status::kErrorInvalidProblem; }