[Kernel] Fix CUTLASS 3.x custom broadcast load epilogue (#5516)

This commit is contained in:
Tyler Michael Smith 2024-06-14 12:30:15 -04:00 committed by GitHub
parent d47af2bc02
commit 703475f6c2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 2 additions and 4 deletions

View File

@@ -153,7 +153,7 @@ struct Sm90RowOrScalarBroadcast {
   CUTLASS_DEVICE void
   begin(uint64_t* full_mbarrier_ptr, int load_iteration, bool issue_tma_load) {
-    if (params.ptr_row == nullptr) {
+    if (!params.row_broadcast) {
       return;
     }

View File

@@ -257,9 +257,7 @@ class Fp8LinearMethod(LinearMethodBase):
 # If dynamic, layer.input_scale is None and x_scale computed from x.
 # If static, layer.input_scale is scalar and x_scale is input_scale.
-# Temporarily disable CUTLASS kernels due to an illegal memory access
-#if bias is None and self.cutlass_fp8_supported:
-if False:
+if bias is None and self.cutlass_fp8_supported:
     qinput, x_scale = ops.scaled_fp8_quant(x, layer.input_scale)
     # Fused GEMM_DQ