CUTLASS 2.3.0 final.

2020-09-25 10:34:46 -07:00 · 2020-09-25 10:34:46 -07:00 · 37a8f9e598
commit 37a8f9e598
parent c53f3339bb
8 changed files with 15 additions and 15 deletions
--- a/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h
+++ b/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h
@@ -372,12 +372,11 @@ public:

            bool guard = row_guard && mask_.predicates[column];

-            cutlass::arch::global_store<AccessType, sizeof(AccessType)>(
-                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn +
-                         column],
-                (void *)&memory_pointer[column * ThreadMap::Delta::kColumn /
-                                        kElementsPerAccess],
-                guard);
+            if (guard) {
+              
+              memory_pointer[column * ThreadMap::Delta::kColumn / kElementsPerAccess] =
+                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column];
+            }
          }

          if (row + 1 < ThreadMap::Iterations::kRow) {
@@ -691,8 +690,9 @@ public:

    bool guard = col_guard && mask_.predicates[iteration_contiguous_];

-    cutlass::arch::global_store<AccessType, sizeof(AccessType)>(
-        *frag_ptr, (void *)memory_pointer, guard);
+    if (guard) {
+      *memory_pointer = *frag_ptr;
+    }
  }

  /// Overrides the internal iteration index
--- a/include/cutlass/gemm/kernel/gemm.h
+++ b/include/cutlass/gemm/kernel/gemm.h
@@ -224,7 +224,7 @@ struct Gemm {

    // Broadcast the warp_id computed by lane 0 to ensure dependent code
    // is compiled as warp-uniform.
-    int warp_idx = __shfl_sync(0x1f, threadIdx.x / 32, 0);
+    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
    int lane_idx = threadIdx.x % 32;

    //
--- a/include/cutlass/gemm/kernel/gemm_array.h
+++ b/include/cutlass/gemm/kernel/gemm_array.h
@@ -184,7 +184,7 @@ struct GemmArray {
      
      // Broadcast the warp_id computed by lane 0 to ensure dependent code
      // is compiled as warp-uniform.
-      int warp_idx = __shfl_sync(0x1f, threadIdx.x / 32, 0);
+      int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);

      int lane_idx = threadIdx.x % 32;
      
--- a/include/cutlass/gemm/kernel/gemm_batched.h
+++ b/include/cutlass/gemm/kernel/gemm_batched.h
@@ -196,7 +196,7 @@ struct GemmBatched {

      // Broadcast the warp_id computed by lane 0 to ensure dependent code
      // is compiled as warp-uniform.
-      int warp_idx = __shfl_sync(0x1f, threadIdx.x / 32, 0);
+      int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);

      int lane_idx = threadIdx.x % 32;
      
--- a/include/cutlass/gemm/kernel/gemm_planar_complex.h
+++ b/include/cutlass/gemm/kernel/gemm_planar_complex.h
@@ -512,7 +512,7 @@ public:

    // Broadcast the warp_id computed by lane 0 to ensure dependent code
    // is compiled as warp-uniform.
-    int warp_idx = __shfl_sync(0x1f, threadIdx.x / 32, 0);
+    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);

    int lane_idx = threadIdx.x % 32;

--- a/include/cutlass/gemm/kernel/gemm_planar_complex_array.h
+++ b/include/cutlass/gemm/kernel/gemm_planar_complex_array.h
@@ -441,7 +441,7 @@ public:

        // Broadcast the warp_id computed by lane 0 to ensure dependent code
        // is compiled as warp-uniform.
-        int warp_idx = __shfl_sync(0x1f, threadIdx.x / 32, 0);
+        int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
        int lane_idx = threadIdx.x % 32;
    
        //
--- a/include/cutlass/gemm/kernel/gemm_universal.h
+++ b/include/cutlass/gemm/kernel/gemm_universal.h
@@ -402,7 +402,7 @@ public:

    // Broadcast the warp_id computed by lane 0 to ensure dependent code
    // is compiled as warp-uniform.
-    int warp_idx = __shfl_sync(0x1f, threadIdx.x / 32, 0);
+    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);

    int lane_idx = threadIdx.x % 32;

--- a/include/cutlass/gemm/kernel/sparse_gemm.h
+++ b/include/cutlass/gemm/kernel/sparse_gemm.h
@@ -269,7 +269,7 @@ struct SparseGemm {

    // Broadcast the warp_id computed by lane 0 to ensure dependent code
    // is compiled as warp-uniform.
-    int warp_idx = __shfl_sync(0x1f, threadIdx.x / 32, 0);
+    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
    int lane_idx = threadIdx.x % 32;

    //