Fix 8.4 + CUDA 11.4 build (#789)

Work around a likely GCC 8.x issue with fold expressions and generic lambdas. Only use the work-around when the host compiler is GCC 8.x. This avoids any concerns about the work-around possibly hindering inlining for a critical CuTe function (product). Users can experiment with the work-around for other compilers or compiler versions by defining the following macro. CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND Fixes https://github.com/NVIDIA/cutlass/issues/788 Co-authored-by: Mark Hoemmen <mhoemmen@nvidia.com>
2023-01-27 07:18:59 -07:00 · 2023-01-27 07:18:59 -07:00 · add4ba622f
commit add4ba622f
parent 277bd6e537
1 changed files with 57 additions and 0 deletions
--- a/include/cute/int_tuple.hpp
+++ b/include/cute/int_tuple.hpp
@ -219,13 +219,70 @@ product(IntTuple const& a)
  CUTE_GCC_UNREACHABLE;
 }

+// Work-around for some compiler versions (e.g., GCC 8.x)
+// incorrectly not being able to compile certain
+// legal C++ fold expressions inside generic lambdas.
+// Issue is known to exist in GCC 8.4 and GCC 8.5.
+// Work-around should be valid portable CUDA C++.
+#if ! defined(CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND)
+#  if defined(__GNUC__) && __GNUC__ == 8
+#    define CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND 1
+#  endif
+#endif
+
+#if defined(CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND)
+namespace impl {
+
+template<int B, int E>
+struct SubrangeProductImpl {
+  // GCC 8.4 accepts the fold expression here.  If that doesn't work,
+  // the other branch (recursive operator()) is known to build
+  // with GCC 8.4 as well.  The code does not enable recursion by default,
+  // as fold expressions might be easier for compilers to optimize.
+#if 1
+  template<class ... Args>
+  CUTE_HOST_DEVICE constexpr auto
+  operator()(Args const&... args) const
+  {
+    return (Int<1>{} * ... * product(args));
+  }
+#else
+  CUTE_HOST_DEVICE constexpr Int<1>
+  operator()() const
+  {
+    return Int<1>{};
+  }
+
+  template<class Head, class ... Tail>
+  CUTE_HOST_DEVICE constexpr auto
+  operator()(Head const& head, Tail const&... tail) const
+  {
+    return (*this)(tail...) * product<Head>(head);
+  }
+#endif // 1
+};
+
+} // namespace impl
+
+#endif // defined(CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND)
+
 // Product of a subrange
 template <int B, int E, class Tuple>
 CUTE_HOST_DEVICE constexpr
 auto
 product(Tuple const& a)
 {
+  // Work around some compiler versions that do not accept
+  // the generic lambda in the else branch, by replacing
+  // the lambda with a function object.  The work-around
+  // is legal C++17, but the original code might be easier
+  // for non-broken compilers to optimize, so it remains.
+#if defined(CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND)
+  impl::SubrangeProductImpl<B, E> function_object;
+  return detail::apply(a, function_object, make_range<B, E>{});
+#else
  return detail::apply(a, [](auto const&... v){ return (Int<1>{} * ... * product(v)); }, make_range<B,E>{});
+#endif // defined(CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND)
 }

 template <class Tuple>