From 1f2b590da6dc7753ea24c5c35ab9bd2f4aa9255c Mon Sep 17 00:00:00 2001 From: Ali Hassani <68103095+alihassanijr@users.noreply.github.com> Date: Wed, 31 Jul 2024 18:11:58 -0400 Subject: [PATCH] Skip void-C kernels in the profiler when beta is non zero (#1661) * Skip void-C kernels in the profiler when beta is non zero CUTLASS profiler will only skip disposition for void-C kernels when beta is non zero, when it makes more sense to skip running it in the first place. Not all users are aware of void-C kernels (as far as I know it wasn't a thing in 2.X), and not everyone remembers to filter out voidC kernels when running the profiler with a non zero beta. The easiest solution (and as far as I can tell correct way of handling this) is that `can_implement` return `false` when beta is non zero (or whatever argument indicates an epilogue source) but we have a void-C kernel. Profiler already includes functionality to skip running kernels that fail `can_implement`. * Move checks to collectives instead --------- Co-authored-by: Ali Hassani --- .../cutlass/epilogue/collective/detail.hpp | 21 +++++++++++++++++++ ...m90_epilogue_array_tma_warpspecialized.hpp | 18 +++++++++++++++- .../sm90_epilogue_tma_warpspecialized.hpp | 18 +++++++++++++++- 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/include/cutlass/epilogue/collective/detail.hpp b/include/cutlass/epilogue/collective/detail.hpp index a0178144..a6e5e2f4 100644 --- a/include/cutlass/epilogue/collective/detail.hpp +++ b/include/cutlass/epilogue/collective/detail.hpp @@ -392,6 +392,27 @@ public: tensormaps_fence_acquire([[maybe_unused]] cute::TmaDescriptor const* tensormap) { } }; +// SFINAE helpers for detecting beta/beta_ptr in EVT arguments. +template +struct has_beta { + static constexpr bool value = false; +}; + +template +struct has_beta> { + static constexpr bool value = true; +}; + +template +struct has_beta_ptr { + static constexpr bool value = false; +}; + +template +struct has_beta_ptr> { + static constexpr bool value = true; +}; + } // namespace detail } // namespace collective } // namespace epilogue diff --git a/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp b/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp index 981ea3e2..87e62887 100644 --- a/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp +++ b/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp @@ -369,7 +369,23 @@ public: if (!fusion_implementable) { CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum requirements for FusionCallbacks.\n"); } - return implementable && fusion_implementable; + + bool beta_implementable = true; + + if constexpr (cute::is_void_v) { + if constexpr (detail::has_beta::value) { + beta_implementable = args.thread.beta == 0.0; + } + if constexpr (detail::has_beta_ptr::value) { + beta_implementable = beta_implementable && args.thread.beta_ptr == nullptr; + } + } + + if (!beta_implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Beta/beta pointer was set, but epilogue is sourceless (void-C).\n"); + } + + return implementable && fusion_implementable && beta_implementable; } template diff --git a/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp b/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp index c03aed33..56b55292 100644 --- a/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp +++ b/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp @@ -339,7 +339,23 @@ public: if (!fusion_implementable) { CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum requirements for FusionCallbacks.\n"); } - return implementable && fusion_implementable; + + bool beta_implementable = true; + + if constexpr (cute::is_void_v) { + if constexpr (detail::has_beta::value) { + beta_implementable = args.thread.beta == 0.0; + } + if constexpr (detail::has_beta_ptr::value) { + beta_implementable = beta_implementable && args.thread.beta_ptr == nullptr; + } + } + + if (!beta_implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Beta/beta pointer was set, but epilogue is sourceless (void-C).\n"); + } + + return implementable && fusion_implementable && beta_implementable; } template