diff --git a/csrc/max.cu b/csrc/max.cu index 344b62a..8f2f15a 100644 --- a/csrc/max.cu +++ b/csrc/max.cu @@ -81,7 +81,7 @@ __global__ void test_cute_tensor_kernel() Stride<_32, _2>{}); Layout smem_layout = make_layout(make_shape(Int<4>{}, Int<8>{})); __shared__ float smem[decltype(cosize(smem_layout))::value]; // (static-only allocation) - // printf("smem size is :%d\n", decltype(cosize(smem_layout))::value); + printf("smem size is :%d\n", decltype(cosize(smem_layout))::value); Tensor stensor = make_tensor(make_smem_ptr(smem), smem_layout); printf("tensor size is: %d, ind size is: %d, rmem size is: %d , rmem4x8 is: %d, smem size is: %d\n", bool_tensor.size(), @@ -92,7 +92,10 @@ __global__ void test_cute_tensor_kernel() TiledCopy copyA = make_tiled_copy(Copy_Atom, float>{}, // Atom: Copy TAs as if they were uint128_t Layout>{}, // Thr layout 32x8 m-major Layout>{}); // Val layout 4x1 m-major + printf("stensor size 1 is %d\n", cute::size<1>(stensor)); +#if 0 print_latex(copyA); +#endif } // template diff --git a/csrc/md.cu b/csrc/md.cu index 7f62793..809df94 100644 --- a/csrc/md.cu +++ b/csrc/md.cu @@ -5,6 +5,9 @@ #include #include +__device__ void mm_device(const float *src) +{ +} __global__ void md_mm_kernel(const float *src, int stride_a, int stride_b, int stride_c, int thread_num) { int batch_idx = blockIdx.x; diff --git a/test b/test deleted file mode 100755 index c1aeb71..0000000 Binary files a/test and /dev/null differ