dpp8 Namespace Reference

dpp8 Namespace Reference

Composable Kernel: ck::dpp8 Namespace Reference
ck::dpp8 Namespace Reference

Classes

struct  dpp_datatypes
 
struct  dpp_datatypes< half_t >
 
struct  DppLanegroupGemm
 

Functions

template<int SrcLaneIdx>
__device__ void inline_v_dot2c_dpp8_instr (const half2_t &a, const half2_t &b, float &c)
 
template<>
__device__ void inline_v_dot2c_dpp8_instr< 0 > (const half2_t &a, const half2_t &b, float &c)
 
template<>
__device__ void inline_v_dot2c_dpp8_instr< 1 > (const half2_t &a, const half2_t &b, float &c)
 
template<>
__device__ void inline_v_dot2c_dpp8_instr< 2 > (const half2_t &a, const half2_t &b, float &c)
 
template<>
__device__ void inline_v_dot2c_dpp8_instr< 3 > (const half2_t &a, const half2_t &b, float &c)
 
template<>
__device__ void inline_v_dot2c_dpp8_instr< 4 > (const half2_t &a, const half2_t &b, float &c)
 
template<>
__device__ void inline_v_dot2c_dpp8_instr< 5 > (const half2_t &a, const half2_t &b, float &c)
 
template<>
__device__ void inline_v_dot2c_dpp8_instr< 6 > (const half2_t &a, const half2_t &b, float &c)
 
template<>
__device__ void inline_v_dot2c_dpp8_instr< 7 > (const half2_t &a, const half2_t &b, float &c)
 
template<int SrcLaneIdx, bool ShareA>
__device__ void inline_v_dot2c_dpp8 (const half2_t &a, const half2_t &b, float &c)
 
template<int SrcLaneIdx>
constexpr int get_dpp_sel_mask_broadcast ()
 
template<int SrcLaneIdx>
__device__ void intrinsic_fdot2_impl (const half2_t &a, const half2_t &b, float &c)
 
template<int SrcLaneIdx, bool ShareA>
__device__ void intrinsic_fdot2 (const half2_t &a, const half2_t &b, float &c)
 
template<typename TA , typename TB , typename TC , int SrcLaneIdx, bool ShareA>
__device__ void inner_product_dpp (const TA &a, const TB &b, TC &c)
 

Variables

constexpr index_t lane_group_size = 8
 Number of lanes that can share data using DPP8 modifiers. More...
 
constexpr std::array< int, dpp8::lane_group_size > IntrinsicMaskDpp8
 

Function Documentation

◆ get_dpp_sel_mask_broadcast()

template<int SrcLaneIdx>
constexpr int ck::dpp8::get_dpp_sel_mask_broadcast ( )
constexpr

Returns DPP8 sel modifier as an integer required for the intrinsic instruction.

◆ inline_v_dot2c_dpp8()

template<int SrcLaneIdx, bool ShareA>
__device__ void ck::dpp8::inline_v_dot2c_dpp8 ( const half2_t &  a,
const half2_t &  b,
float &  c 
)

Dot product of two vectors using v_dot instruction with DPP8 submitted as inline assembly.

◆ inline_v_dot2c_dpp8_instr()

template<int SrcLaneIdx>
__device__ void ck::dpp8::inline_v_dot2c_dpp8_instr ( const half2_t &  a,
const half2_t &  b,
float &  c 
)

◆ inline_v_dot2c_dpp8_instr< 0 >()

template<>
__device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 0 > ( const half2_t &  a,
const half2_t &  b,
float &  c 
)

◆ inline_v_dot2c_dpp8_instr< 1 >()

template<>
__device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 1 > ( const half2_t &  a,
const half2_t &  b,
float &  c 
)

◆ inline_v_dot2c_dpp8_instr< 2 >()

template<>
__device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 2 > ( const half2_t &  a,
const half2_t &  b,
float &  c 
)

◆ inline_v_dot2c_dpp8_instr< 3 >()

template<>
__device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 3 > ( const half2_t &  a,
const half2_t &  b,
float &  c 
)

◆ inline_v_dot2c_dpp8_instr< 4 >()

template<>
__device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 4 > ( const half2_t &  a,
const half2_t &  b,
float &  c 
)

◆ inline_v_dot2c_dpp8_instr< 5 >()

template<>
__device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 5 > ( const half2_t &  a,
const half2_t &  b,
float &  c 
)

◆ inline_v_dot2c_dpp8_instr< 6 >()

template<>
__device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 6 > ( const half2_t &  a,
const half2_t &  b,
float &  c 
)

◆ inline_v_dot2c_dpp8_instr< 7 >()

template<>
__device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 7 > ( const half2_t &  a,
const half2_t &  b,
float &  c 
)

◆ inner_product_dpp()

template<typename TA , typename TB , typename TC , int SrcLaneIdx, bool ShareA>
__device__ void ck::dpp8::inner_product_dpp ( const TA &  a,
const TB &  b,
TC &  c 
)

Dot product of two input vectors a, b using v_dot instructions with DPP modifier.

The DPP modifier allows us to share one of the vectors between lanes in a lane group. When ShareA is set, the instruction uses vector a from lane SrcLaneIdx of the same lane group (8 lanes per lane group in DPP8). When ShareA is not set, vector b is shared instead. Note that all the threads in a lane group use the same shared vector, i.e. this is a broadcast pattern.

SrcLaneIdx must be in range from 0 to 7.

◆ intrinsic_fdot2()

template<int SrcLaneIdx, bool ShareA>
__device__ void ck::dpp8::intrinsic_fdot2 ( const half2_t &  a,
const half2_t &  b,
float &  c 
)

Dot product of two vectors using v_dot instruction with DPP8 submitted using intrinsics.

◆ intrinsic_fdot2_impl()

template<int SrcLaneIdx>
__device__ void ck::dpp8::intrinsic_fdot2_impl ( const half2_t &  a,
const half2_t &  b,
float &  c 
)

Variable Documentation

◆ IntrinsicMaskDpp8

constexpr std::array<int, dpp8::lane_group_size> ck::dpp8::IntrinsicMaskDpp8
constexpr
Initial value:
= {
0,
2396745,
4793490,
7190235,
9586980,
11983725,
14380470,
16777215,
}

DPP8 intrinsics expect to get an integer mask, so the integers for the specific broadcast patterns are hardcoded here.

◆ lane_group_size

constexpr index_t ck::dpp8::lane_group_size = 8
constexpr

Number of lanes that can share data using DPP8 modifiers.