* base version * restructure pipelines, add special fp8 epilogue * add variants * add fp8 causal and modify dynamic tile scheduler * better causal schedule * maintain two schedules for non causal and causal * removing macros * fix regression * clean up unneeded methods and variants * fix mistake with NumProducerThreads * base version * restructure pipelines, add special fp8 epilogue * add variants * add fp8 causal and modify dynamic tile scheduler * better causal schedule * maintain two schedules for non causal and causal * removing macros * fix regression * clean up unneeded methods and variants * fix mistake with NumProducerThreads * use seqlen traits * add fp8 .cu files and benchmark script * fix merge issue * fix merge issue * fix merge issue * remove duplicate code * fix regression with varseqlen * move varseqlen init in constexpr * fix test script * more constexpr on varseqlen and add max offset * add back test cases
24 lines
741 B
C++
24 lines
741 B
C++
/******************************************************************************
 * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
 ******************************************************************************/
|
|
|
|
#pragma once
|
|
|
|
#include "cutlass/arch/barrier.h"
|
|
|
|
namespace flash {

////////////////////////////////////////////////////////////////////////////////////////////////////

// Enumerates the reserved named barriers to avoid potential conflicts.
//
// Every enumerator carries an explicit, unique value; together they form the
// contiguous range 0..7. When adding an entry, give it the next unused value
// rather than reusing or renumbering existing ones, so that two barriers never
// share an id.
//
// NOTE(review): presumably these values are cast to an integer and passed as
// barrier ids to the named-barrier API from "cutlass/arch/barrier.h" -- confirm
// against the call sites, including any upper bound on usable ids.
enum class FwdNamedBarriers {
    QueryEmpty = 0,
    ValueEmpty = 1,
    TileCountSmemEmpty = 2,
    TileCountSmemFull = 3,
    // One entry per warp group; NOTE(review): presumably WG1..WG3 are the
    // consumer warp groups of the forward kernel -- confirm against callers.
    WarpSchedulerWG1 = 4,
    WarpSchedulerWG2 = 5,
    WarpSchedulerWG3 = 6,
    ProducerWG = 7
};

}  // namespace flash
|