Fix GCC 8.4 + CUDA 11.4 build (#789)
Work around a likely GCC 8.x issue with fold expressions and generic lambdas. Only use the work-around when the host compiler is GCC 8.x. This avoids any concerns about the work-around possibly hindering inlining for a critical CuTe function (product). Users can experiment with the work-around for other compilers or compiler versions by defining the macro CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND. Fixes https://github.com/NVIDIA/cutlass/issues/788. Co-authored-by: Mark Hoemmen <mhoemmen@nvidia.com>
This commit is contained in:
parent
277bd6e537
commit
add4ba622f
@ -219,13 +219,70 @@ product(IntTuple const& a)
|
||||
CUTE_GCC_UNREACHABLE;
|
||||
}
|
||||
|
||||
// Some host compilers wrongly reject certain legal C++ fold expressions
// when they appear inside a generic lambda. The problem is known to
// exist in GCC 8.4 and GCC 8.5.
//
// CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND selects an alternative
// formulation that should be valid, portable CUDA C++. It is defined
// automatically only when __GNUC__ reports major version 8. A
// definition supplied by the user is left untouched, so the
// work-around can also be tried with other compilers or versions.
#if defined(__GNUC__) && ! defined(CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND)
#  if (__GNUC__ == 8)
#    define CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND 1
#  endif
#endif
|
||||
|
||||
#if defined(CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND)
|
||||
namespace impl {
|
||||
|
||||
template<int B, int E>
|
||||
struct SubrangeProductImpl {
|
||||
// GCC 8.4 accepts the fold expression here. If that doesn't work,
|
||||
// the other branch (recursive operator()) is known to build
|
||||
// with GCC 8.4 as well. The code does not enable recursion by default,
|
||||
// as fold expressions might be easier for compilers to optimize.
|
||||
#if 1
|
||||
template<class ... Args>
|
||||
CUTE_HOST_DEVICE constexpr auto
|
||||
operator()(Args const&... args) const
|
||||
{
|
||||
return (Int<1>{} * ... * product(args));
|
||||
}
|
||||
#else
|
||||
CUTE_HOST_DEVICE constexpr Int<1>
|
||||
operator()() const
|
||||
{
|
||||
return Int<1>{};
|
||||
}
|
||||
|
||||
template<class Head, class ... Tail>
|
||||
CUTE_HOST_DEVICE constexpr auto
|
||||
operator()(Head const& head, Tail const&... tail) const
|
||||
{
|
||||
return (*this)(tail...) * product<Head>(head);
|
||||
}
|
||||
#endif // 1
|
||||
};
|
||||
|
||||
} // namespace impl
|
||||
|
||||
#endif // defined(CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND)
|
||||
|
||||
// Product of a subrange
|
||||
template <int B, int E, class Tuple>
|
||||
CUTE_HOST_DEVICE constexpr
|
||||
auto
|
||||
product(Tuple const& a)
|
||||
{
|
||||
// Work around some compiler versions that do not accept
|
||||
// the generic lambda in the else branch, by replacing
|
||||
// the lambda with a function object. The work-around
|
||||
// is legal C++17, but the original code might be easier
|
||||
// for non-broken compilers to optimize, so it remains.
|
||||
#if defined(CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND)
|
||||
impl::SubrangeProductImpl<B, E> function_object;
|
||||
return detail::apply(a, function_object, make_range<B, E>{});
|
||||
#else
|
||||
return detail::apply(a, [](auto const&... v){ return (Int<1>{} * ... * product(v)); }, make_range<B,E>{});
|
||||
#endif // defined(CUTE_FOLD_GENERIC_LAMBDA_WORKAROUND)
|
||||
}
|
||||
|
||||
template <class Tuple>
|
||||
|
Loading…
Reference in New Issue
Block a user