* Split apart gemm reference templates into multiple TUs for parallel compilation * remove old files * better balancing of ref kernels across TUs * remove 3 newly added refcheck kernels and some unnecessary fp8 library instances to reduce lib size * remove auto fp8 kernels * remove some redundant kernels |
||
|---|---|---|
| .. | ||
| conv2d.cu | ||
| conv3d.cu | ||
| conv_reference_operation.h | ||
| gemm_e4m3a_e4m3out.cu | ||
| gemm_e4m3a_e5m2out.cu | ||
| gemm_e5m2a_e4m3out.cu | ||
| gemm_e5m2a_e5m2out.cu | ||
| gemm_fp8in_bf16out.cu | ||
| gemm_fp8in_fp16out.cu | ||
| gemm_fp8in_fp32out.cu | ||
| gemm_fp32out.cu | ||
| gemm_fp_other.cu | ||
| gemm_int4.cu | ||
| gemm_int8_canonical.cu | ||
| gemm_int8_interleaved_32.cu | ||
| gemm_int8_interleaved_64.cu | ||
| gemm_reference_operation.h | ||
| initialize_reference_operations.cu | ||