From 81957b3a3ddd5b43683f3e16ef43b9d0a53f5d79 Mon Sep 17 00:00:00 2001
From: Artem Belevich
Date: Fri, 8 Dec 2017 16:59:32 -0800
Subject: [PATCH] Force inlining of a few functions that rely on it for
 performance.

Clang is less aggressive than nvcc, so a number of functions did not get
inlined into the kernel by default. That prevented SROA from eliminating
loads/stores to temporary buffers and resulted in abysmal performance.

Replaced inline with __forceinline__ to ensure that we do inline the
functions necessary for optimal performance.
---
 cutlass/gemm/block_task.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/cutlass/gemm/block_task.h b/cutlass/gemm/block_task.h
index d5700c50..3940fb6e 100644
--- a/cutlass/gemm/block_task.h
+++ b/cutlass/gemm/block_task.h
@@ -484,7 +484,8 @@ struct block_task
      * - Applies the scalar multipliers and addends to the accumulators
      * - Write the result to the output matrix
      */
-    inline __device__ void epilogue()
+    __forceinline__ __device__
+    void epilogue()
     {
         // Wait for predecessor thread block(s) to produce block-wide tile of
         // exclsuive partial-sums
@@ -551,7 +552,7 @@ struct block_task
      * Consume a tile of A and B each
      */
     template
-    inline __device__
+    __forceinline__ __device__
     void consume_tile()
     {
         // Unroll BlockDpVectorsK iterations of outer-product accumulations
@@ -612,7 +613,7 @@ struct block_task
     /**
      * Compute GEMM
      */
-    inline __device__
+    __forceinline__ __device__
     void run()
     {
         // Quit if the thread block is fully out-of-bounds
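
For context, a minimal standalone sketch of the pattern this change relies on; the helper accumulate4 and kernel sum_kernel below are hypothetical and not part of CUTLASS. Marking a __device__ helper __forceinline__ guarantees it is inlined into the calling kernel, which lets SROA promote the helper's temporary buffer to registers instead of local memory.

// Hypothetical helper: sums a small fixed-size buffer. With plain `inline`,
// clang may leave the call in place; `__forceinline__` guarantees inlining,
// so SROA can keep `vals` and `acc` in registers.
__forceinline__ __device__
float accumulate4(const float (&vals)[4])
{
    float acc = 0.0f;
    #pragma unroll
    for (int i = 0; i < 4; ++i)
        acc += vals[i];
    return acc;
}

__global__ void sum_kernel(const float *in, float *out)
{
    // Stage four elements in a per-thread temporary buffer, then reduce.
    float tile[4];
    #pragma unroll
    for (int i = 0; i < 4; ++i)
        tile[i] = in[threadIdx.x * 4 + i];
    out[threadIdx.x] = accumulate4(tile);
}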