From add4ba622f1cdebc145d1df0e9620c3c84c00a52 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen
Date: Fri, 27 Jan 2023 07:18:59 -0700
Subject: [PATCH] Fix GCC 8.4 + CUDA 11.4 build (#789)

Work around a likely GCC 8.x issue with fold expressions and
generic lambdas.  Only use the work-around when the host compiler
is GCC 8.x.  This avoids any concerns about the work-around
possibly hindering inlining for a critical CuTe function (product).

Users can experiment with the work-around for other compilers or
compiler versions by defining the following macro.

CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND

Fixes https://github.com/NVIDIA/cutlass/issues/788

Co-authored-by: Mark Hoemmen
---
 include/cute/int_tuple.hpp | 57 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/include/cute/int_tuple.hpp b/include/cute/int_tuple.hpp
index 045e7210..492d08cc 100644
--- a/include/cute/int_tuple.hpp
+++ b/include/cute/int_tuple.hpp
@@ -219,13 +219,70 @@ product(IntTuple const& a)
   CUTE_GCC_UNREACHABLE;
 }
 
+// Work-around for some compiler versions (e.g., GCC 8.x)
+// incorrectly not being able to compile certain
+// legal C++ fold expressions inside generic lambdas.
+// Issue is known to exist in GCC 8.4 and GCC 8.5.
+// Work-around should be valid portable CUDA C++.
+#if ! defined(CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND)
+#  if defined(__GNUC__) && __GNUC__ == 8
+#    define CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND 1
+#  endif
+#endif
+
+#if defined(CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND)
+namespace impl {
+
+template <class Tuple>
+struct SubrangeProductImpl {
+  // GCC 8.4 accepts the fold expression here.  If that doesn't work,
+  // the other branch (recursive operator()) is known to build
+  // with GCC 8.4 as well.  The code does not enable recursion by default,
+  // as fold expressions might be easier for compilers to optimize.
+#if 1
+  template <class... Args>
+  CUTE_HOST_DEVICE constexpr auto
+  operator()(Args const&... args) const
+  {
+    return (Int<1>{} * ... * product(args));
+  }
+#else
+  CUTE_HOST_DEVICE constexpr Int<1>
+  operator()() const
+  {
+    return Int<1>{};
+  }
+
+  template <class Head, class... Tail>
+  CUTE_HOST_DEVICE constexpr auto
+  operator()(Head const& head, Tail const&... tail) const
+  {
+    return (*this)(tail...) * product(head);
+  }
+#endif // 1
+};
+
+} // namespace impl
+
+#endif // defined(CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND)
+
 // Product of a subrange
 template <int B, int E, class Tuple>
 CUTE_HOST_DEVICE constexpr
 auto
 product(Tuple const& a)
 {
+  // Work around some compiler versions that do not accept
+  // the generic lambda in the else branch, by replacing
+  // the lambda with a function object.  The work-around
+  // is legal C++17, but the original code might be easier
+  // for non-broken compilers to optimize, so it remains.
+#if defined(CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND)
+  impl::SubrangeProductImpl<Tuple> function_object;
+  return detail::apply(a, function_object, make_range<B,E>{});
+#else
   return detail::apply(a, [](auto const&... v){ return (Int<1>{} * ... * product(v)); }, make_range<B,E>{});
+#endif // defined(CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND)
 }
 
 template
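
For illustration only (not part of the patch): the core idea of the work-around is to replace a generic lambda containing a fold expression, which GCC 8.4 and 8.5 reportedly reject, with a named function object whose call operator holds the same fold expression.  The sketch below shows that pattern in plain C++17; the names apply_demo and ProductFn are invented for this sketch and do not exist in CuTe.

    #include <cstdio>

    // Stand-in for detail::apply: call f with a pack of arguments.
    template <class F, class... Ts>
    constexpr auto apply_demo(F f, Ts const&... ts) { return f(ts...); }

    // Function-object form: the fold expression lives in a member
    // template instead of inside a generic lambda.
    struct ProductFn {
      template <class... Args>
      constexpr auto operator()(Args const&... args) const {
        return (1 * ... * args);  // the literal 1 plays the role of Int<1>{}
      }
    };

    int main() {
      // Generic-lambda form: the pattern some GCC 8.x releases fail to compile.
      auto from_lambda = apply_demo([](auto const&... v) { return (1 * ... * v); }, 2, 3, 4);

      // Function-object form: the work-around's replacement, same result.
      auto from_object = apply_demo(ProductFn{}, 2, 3, 4);

      std::printf("%d %d\n", from_lambda, from_object);  // prints "24 24"
      return 0;
    }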