Andrew Kerr 
							
						 
					 
					
						
						
							
							
						
						
						
							
						
						
							877bdcace6 
							
						 
					 
					
						
						
							
							Cutlass 1.3 Release ( #42 )  
						
						... 
						
						
						
						CUTLASS 1.3 Release
- Efficient GEMM kernel targeting Volta Tensor Cores via mma.sync instruction added in CUDA 10.1. 
						
					 
					
						2019-03-20 10:49:17 -07:00 
						 
				 
			
				
					
						
							
							
								Andrew Kerr 
							
						 
					 
					
						
						
							
							
						
						
						
							
						
						
							19a9d64e3c 
							
						 
					 
					
						
						
							
							Removed patch version from README.  
						
						... 
						
						
						
						Removed patch version from README. 
						
					 
					
						2018-12-19 15:20:43 -08:00 
						 
				 
			
				
					
						
							
							
								Andrew Kerr 
							
						 
					 
					
						
						
							
							
						
						
						
							
						
						
							80e6f7c860 
							
						 
					 
					
						
						
							
							Merge pull request  #38  from NVIDIA/resolve_maxwell  
						
						... 
						
						
						
						Resolved issue for incorrect SGEMM on Maxwell architecture. 
						
					 
					
						2018-12-19 15:17:41 -08:00 
						 
				 
			
				
					
						
							
							
								akerr 
							
						 
					 
					
						
						
						
						
							
						
						
							822b0952cd 
							
						 
					 
					
						
						
							
							Resolved issue for incorrect SGEMM on Maxwell architecture.  
						
						
						
					 
					
						2018-12-19 15:07:16 -08:00 
						 
				 
			
				
					
						
							
							
								Andrew Kerr 
							
						 
					 
					
						
						
							
							
						
						
						
							
						
						
							ed2ed4d667 
							
						 
					 
					
						
						
							
							Merge pull request  #33  from NVIDIA/cutlass_1.2  
						
						... 
						
						
						
						CUTLASS 1.2 
						
					 
					
						2018-10-26 14:59:50 -07:00 
						 
				 
			
				
					
						
							
							
								Andrew Kerr 
							
						 
					 
					
						
						
							
							
						
						
						
							
						
						
							4db423c40f 
							
						 
					 
					
						
						
							
							Minor edit to CHANGELOG.  
						
						
						
					 
					
						2018-10-26 14:58:31 -07:00 
						 
				 
			
				
					
						
							
							
								Andrew Kerr 
							
						 
					 
					
						
						
						
						
							
						
						
							b2bc0d3b79 
							
						 
					 
					
						
						
							
							Updating Doxygen docs  
						
						
						
					 
					
						2018-10-26 14:54:58 -07:00 
						 
				 
			
				
					
						
							
							
								akerr 
							
						 
					 
					
						
						
						
						
							
						
						
							74df0331f2 
							
						 
					 
					
						
						
							
							CUTLASS 1.2  
						
						
						
					 
					
						2018-10-26 14:38:46 -07:00 
						 
				 
			
				
					
						
							
							
								Andrew Kerr 
							
						 
					 
					
						
						
							
							
						
						
						
							
						
						
							2332df492e 
							
						 
					 
					
						
						
							
							Merge pull request  #30  from NVIDIA/fix_utilities_example  
						
						... 
						
						
						
						Fixed cutlass_utilities example. 
						
					 
					
						2018-09-29 15:09:18 -07:00 
						 
				 
			
				
					
						
							
							
								akerr 
							
						 
					 
					
						
						
						
						
							
						
						
							cfe4b933ef 
							
						 
					 
					
						
						
							
							CUDA 9 lacks host-side conversions from float=>half. Instead, we must reinterpret_cast<> from cutlass::half_t => half.  
						
						
						
					 
					
						2018-09-29 15:04:20 -07:00 
						 
				 
			
				
					
						
							
							
								Andrew Kerr 
							
						 
					 
					
						
						
							
							
						
						
						
							
						
						
							6877595a5e 
							
						 
					 
					
						
						
							
							Merge pull request  #28  from NVIDIA/cutlass_1.1  
						
						... 
						
						
						
						Fixed typo 
						
					 
					
						2018-09-28 12:59:49 -07:00 
						 
				 
			
				
					
						
							
							
								Andrew Kerr 
							
						 
					 
					
						
						
							
							
						
						
						
							
						
						
							69e3709da4 
							
						 
					 
					
						
						
							
							Fixed typo  
						
						... 
						
						
						
						Fixed typo 
						
					 
					
						2018-09-28 12:59:20 -07:00 
						 
				 
			
				
					
						
							
							
								Andrew Kerr 
							
						 
					 
					
						
						
							
							
						
						
						
							
						
						
							d419094c28 
							
						 
					 
					
						
						
							
							Merge pull request  #26  from NVIDIA/cutlass_1.1  
						
						... 
						
						
						
						Clarification to README 
						
					 
					
						2018-09-21 11:44:47 -07:00 
						 
				 
			
				
					
						
							
							
								akerr 
							
						 
					 
					
						
						
						
						
							
						
						
							1a7ac522f8 
							
						 
					 
					
						
						
							
							Clarification to README  
						
						
						
					 
					
						2018-09-20 11:04:03 -07:00 
						 
				 
			
				
					
						
							
							
								Andrew Kerr 
							
						 
					 
					
						
						
							
							
						
						
						
							
						
						
							bf6eec53eb 
							
						 
					 
					
						
						
							
							Merge pull request  #25  from NVIDIA/cutlass_1.1  
						
						... 
						
						
						
						Updated CUTLASS.md 
						
					 
					
						2018-09-19 21:33:04 -07:00 
						 
				 
			
				
					
						
							
							
								akerr 
							
						 
					 
					
						
						
						
						
							
						
						
							206e38dac5 
							
						 
					 
					
						
						
							
							Updated copyright of CUTLASS.md  
						
						
						
					 
					
						2018-09-19 21:31:12 -07:00 
						 
				 
			
				
					
						
							
							
								Andrew Kerr 
							
						 
					 
					
						
						
							
							
						
						
						
							
						
						
							d85f6a1cec 
							
						 
					 
					
						
						
							
							Merge pull request  #24  from NVIDIA/cutlass_1.1  
						
						... 
						
						
						
						Cutlass 1.1 
						
					 
					
						2018-09-19 21:16:53 -07:00 
						 
				 
			
				
					
						
							
							
								akerr 
							
						 
					 
					
						
						
						
						
							
						
						
							0826572c4c 
							
						 
					 
					
						
						
							
							Reduced range of random values to avoid bit-level inconsistencies for large matrices.  
						
						
						
					 
					
						2018-09-19 21:11:48 -07:00 
						 
				 
			
				
					
						
							
							
								akerr 
							
						 
					 
					
						
						
						
						
							
						
						
							77d1e0ca81 
							
						 
					 
					
						
						
							
							Updated README and CHANGELOG.  
						
						
						
					 
					
						2018-09-19 20:42:51 -07:00 
						 
				 
			
				
					
						
							
							
								akerr 
							
						 
					 
					
						
						
						
						
							
						
						
							d7137f9c0a 
							
						 
					 
					
						
						
							
							Updated doxygen  
						
						
						
					 
					
						2018-09-19 14:02:08 -07:00 
						 
				 
			
				
					
						
							
							
								akerr 
							
						 
					 
					
						
						
						
						
							
						
						
							461f417b9d 
							
						 
					 
					
						
						
							
							Checkpointing CUTLASS 1.1 release.  
						
						
						
					 
					
						2018-09-18 16:58:03 -07:00 
						 
				 
			
				
					
						
							
							
								Andrew Kerr 
							
						 
					 
					
						
						
							
							
						
						
						
							
						
						
							cf0301e00f 
							
						 
					 
					
						
						
							
							Merge pull request  #15  from NVIDIA/release_1.0.1_edits  
						
						... 
						
						
						
						Minor edits to README and changelog pursuant CUTLASS 1.0.1 patch. 
						
					 
					
						2018-06-26 13:59:01 -07:00 
						 
				 
			
				
					
						
							
							
								akerr 
							
						 
					 
					
						
						
						
						
							
						
						
							b9bb0d1a49 
							
						 
					 
					
						
						
							
							Edits to README and changelog pursuant CUTLASS 1.0.1 patch.  
						
						
						
					 
					
						2018-06-26 13:57:39 -07:00 
						 
				 
			
				
					
						
							
							
								Andrew Kerr 
							
						 
					 
					
						
						
							
							
						
						
						
							
						
						
							e1c4ba501b 
							
						 
					 
					
						
						
							
							Merge pull request  #13  from NVIDIA/cutlass_v1.0.1  
						
						... 
						
						
						
						Cutlass v1.0.1 
						
					 
					
						2018-06-12 08:25:56 -07:00 
						 
				 
			
				
					
						
							
							
								akerr 
							
						 
					 
					
						
						
						
						
							
						
						
							c566e83e6d 
							
						 
					 
					
						
						
							
							Updated changelog.  
						
						
						
					 
					
						2018-06-11 14:54:07 -07:00 
						 
				 
			
				
					
						
							
							
								akerr 
							
						 
					 
					
						
						
						
						
							
						
						
							374882be53 
							
						 
					 
					
						
						
							
							Replaced GoogleTest copy with submodule. Added updates to support intra-threadblock reductions. Added tests for same.  
						
						
						
					 
					
						2018-06-11 11:47:15 -07:00 
						 
				 
			
				
					
						
							
							
								akerr 
							
						 
					 
					
						
						
						
						
							
						
						
							2c496c3e9e 
							
						 
					 
					
						
						
							
							Replaced GoogleTest copy with Git submodule.  
						
						
						
					 
					
						2018-06-11 11:32:41 -07:00 
						 
				 
			
				
					
						
							
							
								Andrew Kerr 
							
						 
					 
					
						
						
							
							
						
						
						
							
						
						
							9fd55460c6 
							
						 
					 
					
						
						
							
							Merge pull request  #10  from NVIDIA/cutlass_v1.0_rel  
						
						... 
						
						
						
						Minor updates to usage and README. 
						
					 
					
						2018-05-18 12:27:31 -07:00 
						 
				 
			
				
					
						
							
							
								akerr 
							
						 
					 
					
						
						
						
						
							
						
						
							480732c2e8 
							
						 
					 
					
						
						
							
							Minor updates to usage and readme.  
						
						
						
					 
					
						2018-05-17 15:10:55 -07:00 
						 
				 
			
				
					
						
							
							
								Andrew Kerr 
							
						 
					 
					
						
						
							
							
						
						
						
							
						
						
							68aaee8773 
							
						 
					 
					
						
						
							
							Merge pull request  #9  from NVIDIA/cutlass_v1.0_rel  
						
						... 
						
						
						
						Updated URL to Doxygen and modified usage statement 
						
					 
					
						2018-05-17 11:12:37 -07:00 
						 
				 
			
				
					
						
							
							
								akerr 
							
						 
					 
					
						
						
						
						
							
						
						
							acb90e962a 
							
						 
					 
					
						
						
							
							Updated url to Doxygen and modified usage statement in performance test program.  
						
						
						
					 
					
						2018-05-17 11:11:05 -07:00 
						 
				 
			
				
					
						
							
							
								Andrew Kerr 
							
						 
					 
					
						
						
							
							
						
						
						
							
						
						
							96bc3f227f 
							
						 
					 
					
						
						
							
							Merge pull request  #8  from NVIDIA/cutlass_v1.0_rel  
						
						... 
						
						
						
						Configured Github Pages 
						
					 
					
						2018-05-16 15:26:55 -07:00 
						 
				 
			
				
					
						
							
							
								akerr 
							
						 
					 
					
						
						
						
						
							
						
						
							25ff282403 
							
						 
					 
					
						
						
							
							Moved Doxygen documents.  
						
						
						
					 
					
						2018-05-16 15:25:24 -07:00 
						 
				 
			
				
					
						
							
							
								Andrew Kerr 
							
						 
					 
					
						
						
						
						
							
						
						
							9d5726a568 
							
						 
					 
					
						
						
							
							Set theme jekyll-theme-minimal  
						
						
						
					 
					
						2018-05-16 13:49:06 -07:00 
						 
				 
			
				
					
						
							
							
								Andrew Kerr 
							
						 
					 
					
						
						
							
							
						
						
						
							
						
						
							6f0d271d8d 
							
						 
					 
					
						
						
							
							CUTLASS v1.0  
						
						... 
						
						
						
						CUTLASS v1.0 released. 
						
					 
					
						2018-05-16 13:47:13 -07:00 
						 
				 
			
				
					
						
							
							
								akerr 
							
						 
					 
					
						
						
						
						
							
						
						
							923dfb42ce 
							
						 
					 
					
						
						
							
							Updated README.md  
						
						
						
					 
					
						2018-05-16 12:50:10 -07:00 
						 
				 
			
				
					
						
							
							
								akerr 
							
						 
					 
					
						
						
						
						
							
						
						
							6f6f269a0a 
							
						 
					 
					
						
						
							
							Updated README.md  
						
						
						
					 
					
						2018-05-16 12:47:07 -07:00 
						 
				 
			
				
					
						
							
							
								akerr 
							
						 
					 
					
						
						
						
						
							
						
						
							2028ebe120 
							
						 
					 
					
						
						
							
							CUTLASS v1.0 release  
						
						
						
					 
					
						2018-05-16 11:44:56 -07:00 
						 
				 
			
				
					
						
							
							
								Andrew Kerr 
							
						 
					 
					
						
						
							
							
						
						
						
							
						
						
							84377249a1 
							
						 
					 
					
						
						
							
							Merge pull request  #2  from Artem-B/clang-fixes  
						
						... 
						
						
						
						Merging "Clang fixes" into master. 
						
					 
					
						2018-01-04 15:52:53 -08:00 
						 
				 
			
				
					
						
							
							
								akerr 
							
						 
					 
					
						
						
						
						
							
						
						
							901287175f 
							
						 
					 
					
						
						
							
							Merge branch 'Artem-B-clang-fixes'  
						
						
						
					 
					
						2018-01-04 15:46:08 -08:00 
						 
				 
			
				
					
						
							
							
								Artem Belevich 
							
						 
					 
					
						
						
						
						
							
						
						
							1c9b54df16 
							
						 
					 
					
						
						
							
							Whitespace fix.  
						
						
						
					 
					
						2018-01-03 16:42:51 -08:00 
						 
				 
			
				
					
						
							
							
								Artem Belevich 
							
						 
					 
					
						
						
						
						
							
						
						
							39616514d0 
							
						 
					 
					
						
						
							
							Reworked CUDA_LOG macro to print location&the message with one printf.  
						
						... 
						
						
						
						This relies on the fact that clang allows using device-side features
from __host__/__device__ functions from __host__ ones as long as we
don't have to generate code for that. Wrapping thread/blockIdx in
__host__ __device__ function allows using CUDA_LOG everywhere during
host and device compilation. 
						
					 
					
						2018-01-03 16:36:50 -08:00 
						 
				 
			
				
					
						
							
							
								Artem Belevich 
							
						 
					 
					
						
						
						
						
							
						
						
							df4b4e4bb6 
							
						 
					 
					
						
						
							
							Added _cuda_ to the name of the executable to indicate that it's not clang's version.  
						
						
						
					 
					
						2017-12-11 16:34:10 -08:00 
						 
				 
			
				
					
						
							
							
								Artem Belevich 
							
						 
					 
					
						
						
						
						
							
						
						
							81957b3a3d 
							
						 
					 
					
						
						
							
							Force inlining of few functions that rely on that for performance.  
						
						... 
						
						
						
						Clang is less aggressive than nvcc, so a number of functions did not get
inlined into the kernel by default. That prevented SROA from eliminating
loads/stores to temporary buffers and resulted in abysmal performance.
Replaced inline with __forceinline__ to ensure that we do inline the
functions necessary for optimal performance. 
						
					 
					
						2017-12-11 14:52:30 -08:00 
						 
				 
			
				
					
						
							
							
								Artem Belevich 
							
						 
					 
					
						
						
						
						
							
						
						
							ce2b3f695d 
							
						 
					 
					
						
						
							
							Fixed debug macros for clang.  
						
						... 
						
						
						
						Unlike nvcc, clang always sees both host and device-side code during
compilation. CUDA_LOG macro is used in both host and device code, so when it
expanded to contain device-only code, that resulted in errors when it was used
from the host-side functions.
In order to make CUDA_LOG work with clang it was split into two parts -- a pair
of target-attribute-based overloaded functions that perform host or device
specific parts of logging, and a printf which works on both sides. 
						
					 
					
						2017-12-11 14:52:30 -08:00 
						 
				 
			
				
					
						
							
							
								Artem Belevich 
							
						 
					 
					
						
						
						
						
							
						
						
							e9e7cd4d44 
							
						 
					 
					
						
						
							
							Make cutlass compilable with clang.  
						
						... 
						
						
						
						E.g:
PATH=/nvcc/path/bin:/clang/path/bin:$PATH make sm=35,60 compiler=clang all 
						
					 
					
						2017-12-11 14:52:30 -08:00 
						 
				 
			
				
					
						
							
							
								Duane Merrill 
							
						 
					 
					
						
						
						
						
							
						
						
							95b0578d34 
							
						 
					 
					
						
						
							
							Update license info  
						
						
						
					 
					
						2017-12-06 10:00:59 -05:00 
						 
				 
			
				
					
						
							
							
								Duane Merrill 
							
						 
					 
					
						
						
							
							
						
						
						
							
						
						
							f4b48c7669 
							
						 
					 
					
						
						
							
							Update README.md  
						
						
						
					 
					
						2017-12-05 22:58:46 -05:00 
						 
				 
			
				
					
						
							
							
								Duane Merrill 
							
						 
					 
					
						
						
							
							
						
						
						
							
						
						
							6cb88d53eb 
							
						 
					 
					
						
						
							
							Update README.md  
						
						
						
					 
					
						2017-12-05 22:58:12 -05:00 
						 
				 
			
				
					
						
							
							
								Duane Merrill 
							
						 
					 
					
						
						
							
							
						
						
						
							
						
						
							537a4bcedf 
							
						 
					 
					
						
						
							
							Update README.md  
						
						
						
					 
					
						2017-12-05 22:54:49 -05:00