Force inlining of a few functions that rely on it for performance.
Clang is less aggressive than nvcc, so a number of functions did not get inlined into the kernel by default. That prevented SROA from eliminating loads/stores to temporary buffers and resulted in abysmal performance. Replaced inline with __forceinline__ to ensure that we do inline the functions necessary for optimal performance.
This commit is contained in:
parent
ce2b3f695d
commit
81957b3a3d
@ -484,7 +484,8 @@ struct block_task
|
|||||||
* - Applies the scalar multipliers and addends to the accumulators
|
* - Applies the scalar multipliers and addends to the accumulators
|
||||||
* - Write the result to the output matrix
|
* - Write the result to the output matrix
|
||||||
*/
|
*/
|
||||||
inline __device__ void epilogue()
|
__forceinline__ __device__
|
||||||
|
void epilogue()
|
||||||
{
|
{
|
||||||
// Wait for predecessor thread block(s) to produce block-wide tile of
|
// Wait for predecessor thread block(s) to produce block-wide tile of
|
||||||
// exclsuive partial-sums
|
// exclsuive partial-sums
|
||||||
@ -551,7 +552,7 @@ struct block_task
|
|||||||
* Consume a tile of A and B each
|
* Consume a tile of A and B each
|
||||||
*/
|
*/
|
||||||
template <bool DoGlobalPrefetch>
|
template <bool DoGlobalPrefetch>
|
||||||
inline __device__
|
__forceinline__ __device__
|
||||||
void consume_tile()
|
void consume_tile()
|
||||||
{
|
{
|
||||||
// Unroll BlockDpVectorsK iterations of outer-product accumulations
|
// Unroll BlockDpVectorsK iterations of outer-product accumulations
|
||||||
@ -612,7 +613,7 @@ struct block_task
|
|||||||
/**
|
/**
|
||||||
* Compute GEMM
|
* Compute GEMM
|
||||||
*/
|
*/
|
||||||
inline __device__
|
__forceinline__ __device__
|
||||||
void run()
|
void run()
|
||||||
{
|
{
|
||||||
// Quit if the thread block is fully out-of-bounds
|
// Quit if the thread block is fully out-of-bounds
|
||||||
|
Loading…
Reference in New Issue
Block a user