Resolved issue for incorrect SGEMM on Maxwell architecture.
This commit is contained in:
parent
ed2ed4d667
commit
822b0952cd
@ -1,5 +1,8 @@
|
|||||||
# NVIDIA CUTLASS Changelog
|
# NVIDIA CUTLASS Changelog
|
||||||
|
|
||||||
|
## [1.2.1](https://github.com/NVIDIA/cutlass/releases/tag/v1.2.1) (2018-12-19)
|
||||||
|
* Resolved issue with sm50 and sm52 architectures
|
||||||
|
|
||||||
## [1.2.0](https://github.com/NVIDIA/cutlass/releases/tag/v1.2.0) (2018-10-26)
|
## [1.2.0](https://github.com/NVIDIA/cutlass/releases/tag/v1.2.0) (2018-10-26)
|
||||||
* Parallelized reductions across threadblocks ("Split-K")
|
* Parallelized reductions across threadblocks ("Split-K")
|
||||||
* Improved IGEMM performance
|
* Improved IGEMM performance
|
||||||
|
@ -33,7 +33,7 @@
|
|||||||
|
|
||||||
#define CUTLASS_MAJOR 1
|
#define CUTLASS_MAJOR 1
|
||||||
#define CUTLASS_MINOR 2
|
#define CUTLASS_MINOR 2
|
||||||
#define CUTLASS_PATCH 0
|
#define CUTLASS_PATCH 1
|
||||||
#define CUTLASS_VERSION ((CUTLASS_MAJOR)*100 + (CUTLASS_MINOR)*10 + CUTLASS_PATCH)
|
#define CUTLASS_VERSION ((CUTLASS_MAJOR)*100 + (CUTLASS_MINOR)*10 + CUTLASS_PATCH)
|
||||||
|
|
||||||
#ifdef __NVCC__
|
#ifdef __NVCC__
|
||||||
|
@ -52,7 +52,6 @@ struct FragmentMultiplyAdd {
|
|||||||
/// Multiply : d = a*b.
|
/// Multiply : d = a*b.
|
||||||
template <typename FragmentB_, typename FragmentCd_>
|
template <typename FragmentB_, typename FragmentCd_>
|
||||||
CUTLASS_DEVICE void multiply(ScalarAlphaBeta a, FragmentB_ const& b, FragmentCd_& d) {
|
CUTLASS_DEVICE void multiply(ScalarAlphaBeta a, FragmentB_ const& b, FragmentCd_& d) {
|
||||||
#if defined(__CUDACC__) && __CUDA_ARCH__ >= 530
|
|
||||||
int const kReduction = FragmentB_::kElements / FragmentCd_::kElements;
|
int const kReduction = FragmentB_::kElements / FragmentCd_::kElements;
|
||||||
for (int j = 0; j < FragmentCd_::kElements; ++j) {
|
for (int j = 0; j < FragmentCd_::kElements; ++j) {
|
||||||
d[j] = b[j * kReduction + 0];
|
d[j] = b[j * kReduction + 0];
|
||||||
@ -61,7 +60,6 @@ struct FragmentMultiplyAdd {
|
|||||||
}
|
}
|
||||||
d[j] = a * ScalarAlphaBeta(d[j]);
|
d[j] = a * ScalarAlphaBeta(d[j]);
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Multiply : d = a*b + c.
|
/// Multiply : d = a*b + c.
|
||||||
@ -70,7 +68,7 @@ struct FragmentMultiplyAdd {
|
|||||||
FragmentB_ const& b,
|
FragmentB_ const& b,
|
||||||
FragmentCd_ const& c,
|
FragmentCd_ const& c,
|
||||||
FragmentCd_& d) {
|
FragmentCd_& d) {
|
||||||
#if defined(__CUDACC__) && __CUDA_ARCH__ >= 530
|
|
||||||
int const kReduction = FragmentB_::kElements / FragmentCd_::kElements;
|
int const kReduction = FragmentB_::kElements / FragmentCd_::kElements;
|
||||||
for (int j = 0; j < FragmentCd_::kElements; ++j) {
|
for (int j = 0; j < FragmentCd_::kElements; ++j) {
|
||||||
d[j] = b[j * kReduction + 0];
|
d[j] = b[j * kReduction + 0];
|
||||||
@ -79,7 +77,6 @@ struct FragmentMultiplyAdd {
|
|||||||
}
|
}
|
||||||
d[j] = a * ScalarAlphaBeta(d[j]) + ScalarAlphaBeta(c[j]);
|
d[j] = a * ScalarAlphaBeta(d[j]) + ScalarAlphaBeta(c[j]);
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -77,6 +77,8 @@
|
|||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__) >= 530
|
||||||
|
|
||||||
// CUTLASS includes needed for mixed-precision GEMM kernel
|
// CUTLASS includes needed for mixed-precision GEMM kernel
|
||||||
#include "cutlass/gemm/gemm.h"
|
#include "cutlass/gemm/gemm.h"
|
||||||
#include "cutlass/gemm/fp16_sgemm_traits.h"
|
#include "cutlass/gemm/fp16_sgemm_traits.h"
|
||||||
@ -312,6 +314,24 @@ cudaError_t TestCutlassGemm(int M, int N, int K, cutlass::half_t alpha, cutlass:
|
|||||||
//
|
//
|
||||||
int main(int argc, const char *arg[]) {
|
int main(int argc, const char *arg[]) {
|
||||||
|
|
||||||
|
//
|
||||||
|
// This example uses half-precision and is only suitable for devices with compute capability 5.3 or greater.
|
||||||
|
//
|
||||||
|
|
||||||
|
cudaDeviceProp prop;
|
||||||
|
cudaError_t result = cudaGetDeviceProperties(&prop, 0);
|
||||||
|
|
||||||
|
if (result != cudaSuccess) {
|
||||||
|
std::cerr << "Failed to query device properties with error " << cudaGetErrorString(result) << std::endl;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!(prop.major > 5 || (prop.major == 5 && prop.minor >= 3))) {
|
||||||
|
std::cerr << "This example uses mixed precision and is only suitable for devices with compute capability 5.3 or greater.\n";
|
||||||
|
std::cerr << "You are using a CUDA device with compute capability " << prop.major << "." << prop.minor << std::endl;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// Parse the command line to obtain GEMM dimensions and scalar values.
|
// Parse the command line to obtain GEMM dimensions and scalar values.
|
||||||
//
|
//
|
||||||
@ -341,7 +361,7 @@ int main(int argc, const char *arg[]) {
|
|||||||
// Run the CUTLASS GEMM test.
|
// Run the CUTLASS GEMM test.
|
||||||
//
|
//
|
||||||
|
|
||||||
cudaError_t result = TestCutlassGemm(
|
result = TestCutlassGemm(
|
||||||
problem[0], // GEMM M dimension
|
problem[0], // GEMM M dimension
|
||||||
problem[1], // GEMM N dimension
|
problem[1], // GEMM N dimension
|
||||||
problem[2], // GEMM K dimension
|
problem[2], // GEMM K dimension
|
||||||
@ -358,3 +378,6 @@ int main(int argc, const char *arg[]) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user