461f417b9dfb7b7a14fbe65cf7c9191115b3f7b0/docs/hgemm__multiply__add_8h_source.html

 /***************************************************************************************************
  * Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
  *     * Redistributions of source code must retain the above copyright notice, this list of
  *       conditions and the following disclaimer.
  *     * Redistributions in binary form must reproduce the above copyright notice, this list of
  *       conditions and the following disclaimer in the documentation and/or other materials
  *       provided with the distribution.
  *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
  *       to endorse or promote products derived from this software without specific prior written
  *       permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once

 #include <cutlass/fragment.h>

 #include <cutlass/gemm/thread_multiply_add.h>

 namespace cutlass {
 namespace gemm {


 /// Specialization of ThreadMultiplyAdd (the per-thread matrix multiply-add template) for the case
 /// where A, B and the accumulators are all of type half. The work is done two halves at a time
 /// using the packed __half2 fused multiply-add intrinsic.
 template <typename AccumulatorsPerThread_, typename ThreadsPerWarp_>
 struct ThreadMultiplyAdd<AccumulatorsPerThread_, ThreadsPerWarp_, half, half, half> {
   /// The shape of the instruction.
   typedef Shape<1, 1, 2, 1> InstructionShape;

   /// The number of accumulators per thread.
   typedef AccumulatorsPerThread_ AccumulatorsPerThread;
   /// The number of threads per warp.
   typedef ThreadsPerWarp_ ThreadsPerWarp;
   /// The number of accumulators per warp.
   typedef typename ShapeMul<AccumulatorsPerThread, ThreadsPerWarp>::Shape AccumulatorsPerWarp;

   /// The type for A.
   typedef half ScalarA;
   /// The type for B.
   typedef half ScalarB;
   /// The type for C and D.
   typedef half ScalarC;

   /// The fragment for A: one element per accumulator column (kW).
   typedef Fragment<ScalarA, AccumulatorsPerThread::kW> FragmentA;
   /// The fragment for B: one element per accumulator row (kH).
   typedef Fragment<ScalarB, AccumulatorsPerThread::kH> FragmentB;
   /// The accumulators: kH * kW elements.
   typedef Fragment<ScalarC, AccumulatorsPerThread::kH * AccumulatorsPerThread::kW> Accumulators;

   // Elements are consumed in pairs (as __half2), so both dimensions must be even.
   static_assert(AccumulatorsPerThread::kH % 2 == 0, "Invalid size");
   static_assert(AccumulatorsPerThread::kW % 2 == 0, "Invalid size");

   /// Ctor.
   CUTLASS_DEVICE ThreadMultiplyAdd() {}

   /// Multiply : d = a*b + c.
   ///
   /// The accumulators are indexed as kH rows of kW halves, i.e. kW / 2 __half2 pairs per row.
   /// For row r and column pair i the result is d[r][i] = a[i] * b[r] + c[r][i], where b[r] is
   /// broadcast to both lanes of the pair.
   ///
   /// The body is only compiled when __CUDACC__ is defined and __CUDA_ARCH__ >= 530; otherwise the
   /// function is empty and d is left untouched.
   ///
   /// NOTE(review): the fragments are accessed through __half2 pointers, which presumes their
   /// storage is suitably aligned for __half2 - confirm against the Fragment definition.
   CUTLASS_DEVICE void multiply_add(FragmentA const& a,
                                    FragmentB const& b,
                                    Accumulators const& c,
                                    Accumulators& d) {
 #if defined(__CUDACC__) && __CUDA_ARCH__ >= 530
     // Loop extents, expressed in units of __half2 pairs.
     int const kRowPairs = AccumulatorsPerThread::kH / 2;
     int const kPairsPerRow = AccumulatorsPerThread::kW / 2;

     // Packed views of the output and of the three inputs.
     __half2* d_pairs = reinterpret_cast<__half2*>(&d[0]);
     __half2 const* a_pairs = reinterpret_cast<__half2 const*>(&a[0]);
     __half2 const* b_pairs = reinterpret_cast<__half2 const*>(&b[0]);
     __half2 const* c_pairs = reinterpret_cast<__half2 const*>(&c[0]);

     for (int row_pair = 0; row_pair < kRowPairs; ++row_pair) {
       // Each __half2 of b carries the values for two consecutive accumulator rows; broadcast
       // each of them to both lanes (H0_H0 for the even row, H1_H1 for the odd row).
       __half2 const b_even_row = __low2half2(b_pairs[row_pair]);
       __half2 const b_odd_row = __high2half2(b_pairs[row_pair]);

       // Offsets (in pairs) of the first element of the even and odd rows in the accumulators.
       int const even_row_base = (2 * row_pair) * kPairsPerRow;
       int const odd_row_base = even_row_base + kPairsPerRow;

       for (int col_pair = 0; col_pair < kPairsPerRow; ++col_pair) {
         int const even_idx = even_row_base + col_pair;
         int const odd_idx = odd_row_base + col_pair;

         // Even row first, then odd row.
         d_pairs[even_idx] = __hfma2(a_pairs[col_pair], b_even_row, c_pairs[even_idx]);
         d_pairs[odd_idx] = __hfma2(a_pairs[col_pair], b_odd_row, c_pairs[odd_idx]);
       }
     }
 #endif
   }
 };


 }  // namespace gemm
 }  // namespace cutlass
cutlass::gemm::ThreadMultiplyAdd< AccumulatorsPerThread_, ThreadsPerWarp_, half, half, half >::ScalarA
half ScalarA
The type for A.
Definition: hgemm_multiply_add.h:52

cutlass
Definition: convert.h:33

cutlass::gemm::ThreadMultiplyAdd< AccumulatorsPerThread_, ThreadsPerWarp_, half, half, half >::Accumulators
Fragment< half, AccumulatorsPerThread::kH *AccumulatorsPerThread::kW > Accumulators
The accumulators.
Definition: hgemm_multiply_add.h:62

cutlass::gemm::ThreadMultiplyAdd< AccumulatorsPerThread_, ThreadsPerWarp_, half, half, half >::AccumulatorsPerWarp
ShapeMul< AccumulatorsPerThread, ThreadsPerWarp >::Shape AccumulatorsPerWarp
The number of accumulators per warp.
Definition: hgemm_multiply_add.h:50

cutlass::gemm::ThreadMultiplyAdd< AccumulatorsPerThread_, ThreadsPerWarp_, half, half, half >::ScalarC
half ScalarC
The type for C and D.
Definition: hgemm_multiply_add.h:60

cutlass::gemm::ThreadMultiplyAdd< AccumulatorsPerThread_, ThreadsPerWarp_, half, half, half >::ThreadMultiplyAdd
CUTLASS_DEVICE ThreadMultiplyAdd()
Make sure there's an even number of elements in both dimensions.
Definition: hgemm_multiply_add.h:69

cutlass::ShapeMul::Shape
Shape< A_::kD *B_::kD, A_::kH *B_::kH, A_::kW *B_::kW, A_::kC *B_::kC > Shape
Definition: shape.h:119

cutlass::Fragment
A template defining Fragment Concept.
Definition: fragment.h:99

thread_multiply_add.h
Template implementing matrix multiply-add operations on fragments.

cutlass::gemm::ThreadMultiplyAdd< AccumulatorsPerThread_, ThreadsPerWarp_, half, half, half >::InstructionShape
Shape< 1, 1, 2, 1 > InstructionShape
The shape of the instruction.
Definition: hgemm_multiply_add.h:44

cutlass::gemm::ThreadMultiplyAdd< AccumulatorsPerThread_, ThreadsPerWarp_, half, half, half >::ScalarB
half ScalarB
The type for B.
Definition: hgemm_multiply_add.h:56

cutlass::gemm::ThreadMultiplyAdd< AccumulatorsPerThread_, ThreadsPerWarp_, half, half, half >::ThreadsPerWarp
ThreadsPerWarp_ ThreadsPerWarp
The number of threads per warp.
Definition: hgemm_multiply_add.h:48

cutlass::gemm::ThreadMultiplyAdd< AccumulatorsPerThread_, ThreadsPerWarp_, half, half, half >::AccumulatorsPerThread
AccumulatorsPerThread_ AccumulatorsPerThread
The number of accumulators per thread.
Definition: hgemm_multiply_add.h:46

static_assert
#define static_assert(__e, __m)
Definition: platform.h:145

cutlass::gemm::ThreadMultiplyAdd< AccumulatorsPerThread_, ThreadsPerWarp_, half, half, half >::multiply_add
CUTLASS_DEVICE void multiply_add(FragmentA const &a, FragmentB const &b, Accumulators const &c, Accumulators &d)
Multiply : d = a*b + c.
Definition: hgemm_multiply_add.h:72

cutlass::Shape
A Shape implementing Layout Concept describing the dimensions of a cube.
Definition: shape.h:64

cutlass::gemm::ThreadMultiplyAdd
Template performing matrix multiply-add operation within a thread.
Definition: thread_multiply_add.h:43

fragment.h
Defines Fragment, a statically-sized array for storing parts of matrices within a thread's registers...

cutlass::gemm::ThreadMultiplyAdd< AccumulatorsPerThread_, ThreadsPerWarp_, half, half, half >::FragmentA
Fragment< ScalarA, AccumulatorsPerThread::kW > FragmentA
The fragment for A.
Definition: hgemm_multiply_add.h:54

cutlass::gemm::ThreadMultiplyAdd< AccumulatorsPerThread_, ThreadsPerWarp_, half, half, half >::FragmentB
Fragment< ScalarB, AccumulatorsPerThread::kH > FragmentB
The fragment for B.
Definition: hgemm_multiply_add.h:58