Fix the racing condition of mixed-input gemm when writing the registers (#1931)
* move two warpgroup_wait * merge main --------- Co-authored-by: Siyuan Fu <siyuanf@nvidia.com>
This commit is contained in:
parent
d656afbd2a
commit
8aa95dbb88
@ -724,4 +724,4 @@ int main(int argc, char const **args) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
@ -1024,8 +1024,8 @@ public:
|
||||
tiled_mma.accumulate_ = GMMA::ScaleOut::One;
|
||||
warpgroup_commit_batch();
|
||||
|
||||
warpgroup_wait<K_BLOCK_MAX - 1>(); // We have K_BLOCK_MAX - 1 GMMA instructions pending for this stage, so we can release prior barrier
|
||||
if (k_block == K_BLOCK_MAX - 1) {
|
||||
warpgroup_wait<K_BLOCK_MAX - 1>(); // We have K_BLOCK_MAX - 1 GMMA instructions pending for this stage, so we can release prior barrier
|
||||
pipeline.consumer_release(smem_pipe_release); // UNLOCK smem_pipe_release, done _computing_ on it
|
||||
++smem_pipe_release;
|
||||
}
|
||||
@ -1076,8 +1076,9 @@ public:
|
||||
cute::gemm(tiled_mma, tCrA_mma(_,_,k_block), tCrB(_,_,k_block,read_stage), accum);
|
||||
tiled_mma.accumulate_ = GMMA::ScaleOut::One;
|
||||
warpgroup_commit_batch();
|
||||
|
||||
warpgroup_wait<K_BLOCK_MAX - 1>();
|
||||
if (k_block == K_BLOCK_MAX - 1) { // release prior barrier
|
||||
warpgroup_wait<K_BLOCK_MAX - 1>();
|
||||
pipeline.consumer_release(smem_pipe_release); // UNLOCK smem_pipe_release, done _computing_ on it
|
||||
++smem_pipe_release;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user