From add4ba622f1cdebc145d1df0e9620c3c84c00a52 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen
Date: Fri, 27 Jan 2023 07:18:59 -0700
Subject: [PATCH] Fix GCC 8.4 + CUDA 11.4 build (#789)

Work around a likely GCC 8.x issue with fold expressions and
generic lambdas.  Only use the work-around when the host compiler
is GCC 8.x.  This avoids any concerns about the work-around
possibly hindering inlining for a critical CuTe function (product).

Users can experiment with the work-around for other compilers or
compiler versions by defining the following macro.

CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND

Fixes https://github.com/NVIDIA/cutlass/issues/788

Co-authored-by: Mark Hoemmen
---
 include/cute/int_tuple.hpp | 57 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/include/cute/int_tuple.hpp b/include/cute/int_tuple.hpp
index 045e7210..492d08cc 100644
--- a/include/cute/int_tuple.hpp
+++ b/include/cute/int_tuple.hpp
@@ -219,13 +219,70 @@ product(IntTuple const& a)
   CUTE_GCC_UNREACHABLE;
 }
 
+// Work-around for some compiler versions (e.g., GCC 8.x)
+// incorrectly not being able to compile certain
+// legal C++ fold expressions inside generic lambdas.
+// Issue is known to exist in GCC 8.4 and GCC 8.5.
+// Work-around should be valid portable CUDA C++.
+#if ! defined(CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND)
+#  if defined(__GNUC__) && __GNUC__ == 8
+#    define CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND 1
+#  endif
+#endif
+
+#if defined(CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND)
+namespace impl {
+
+template <class Tuple>
+struct SubrangeProductImpl {
+  // GCC 8.4 accepts the fold expression here.  If that doesn't work,
+  // the other branch (recursive operator()) is known to build
+  // with GCC 8.4 as well.  The code does not enable recursion by default,
+  // as fold expressions might be easier for compilers to optimize.
+#if 1
+  template <class... Args>
+  CUTE_HOST_DEVICE constexpr auto
+  operator()(Args const&... args) const
+  {
+    return (Int<1>{} * ... * product(args));
+  }
+#else
+  CUTE_HOST_DEVICE constexpr Int<1>
+  operator()() const
+  {
+    return Int<1>{};
+  }
+
+  template <class Head, class... Tail>
+  CUTE_HOST_DEVICE constexpr auto
+  operator()(Head const& head, Tail const&... tail) const
+  {
+    return (*this)(tail...) * product(head);
+  }
+#endif // 1
+};
+
+} // namespace impl
+
+#endif // defined(CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND)
+
 // Product of a subrange
 template <int B, int E, class Tuple>
 CUTE_HOST_DEVICE constexpr
 auto
 product(Tuple const& a)
 {
+  // Work around some compiler versions that do not accept
+  // the generic lambda in the else branch, by replacing
+  // the lambda with a function object.  The work-around
+  // is legal C++17, but the original code might be easier
+  // for non-broken compilers to optimize, so it remains.
+#if defined(CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND)
+  impl::SubrangeProductImpl<Tuple> function_object;
+  return detail::apply(a, function_object, make_range<B,E>{});
+#else
   return detail::apply(a, [](auto const&... v){ return (Int<1>{} * ... * product(v)); }, make_range<B,E>{});
+#endif // defined(CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND)
 }
 
 template
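
For illustration only (not part of the patch): the core idea of the work-around is to replace a generic lambda containing a fold expression, which GCC 8.4 and 8.5 reportedly reject, with a named function object whose call operator holds the same fold expression.  The sketch below shows that pattern in plain C++17; the names apply_demo and ProductFn are invented for this sketch and do not exist in CuTe.

    #include <cstdio>

    // Stand-in for detail::apply: call f with a pack of arguments.
    template <class F, class... Ts>
    constexpr auto apply_demo(F f, Ts const&... ts) { return f(ts...); }

    // Function-object form: the fold expression lives in a member
    // template instead of inside a generic lambda.
    struct ProductFn {
      template <class... Args>
      constexpr auto operator()(Args const&... args) const {
        return (1 * ... * args);  // the literal 1 plays the role of Int<1>{}
      }
    };

    int main() {
      // Generic-lambda form: the pattern some GCC 8.x releases fail to compile.
      auto from_lambda = apply_demo([](auto const&... v) { return (1 * ... * v); }, 2, 3, 4);

      // Function-object form: the work-around's replacement, same result.
      auto from_object = apply_demo(ProductFn{}, 2, 3, 4);

      std::printf("%d %d\n", from_lambda, from_object);  // prints "24 24"
      return 0;
    }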