dpp8 Namespace Reference
Classes | |
| struct | dpp_datatypes |
| struct | dpp_datatypes< half_t > |
| struct | DppLanegroupGemm |
Functions | |
| template<int SrcLaneIdx> | |
| __device__ void | inline_v_dot2c_dpp8_instr (const half2_t &a, const half2_t &b, float &c) |
| template<> | |
| __device__ void | inline_v_dot2c_dpp8_instr< 0 > (const half2_t &a, const half2_t &b, float &c) |
| template<> | |
| __device__ void | inline_v_dot2c_dpp8_instr< 1 > (const half2_t &a, const half2_t &b, float &c) |
| template<> | |
| __device__ void | inline_v_dot2c_dpp8_instr< 2 > (const half2_t &a, const half2_t &b, float &c) |
| template<> | |
| __device__ void | inline_v_dot2c_dpp8_instr< 3 > (const half2_t &a, const half2_t &b, float &c) |
| template<> | |
| __device__ void | inline_v_dot2c_dpp8_instr< 4 > (const half2_t &a, const half2_t &b, float &c) |
| template<> | |
| __device__ void | inline_v_dot2c_dpp8_instr< 5 > (const half2_t &a, const half2_t &b, float &c) |
| template<> | |
| __device__ void | inline_v_dot2c_dpp8_instr< 6 > (const half2_t &a, const half2_t &b, float &c) |
| template<> | |
| __device__ void | inline_v_dot2c_dpp8_instr< 7 > (const half2_t &a, const half2_t &b, float &c) |
| template<int SrcLaneIdx, bool ShareA> | |
| __device__ void | inline_v_dot2c_dpp8 (const half2_t &a, const half2_t &b, float &c) |
| template<int SrcLaneIdx> | |
| constexpr int | get_dpp_sel_mask_broadcast () |
| template<int SrcLaneIdx> | |
| __device__ void | intrinsic_fdot2_impl (const half2_t &a, const half2_t &b, float &c) |
| template<int SrcLaneIdx, bool ShareA> | |
| __device__ void | intrinsic_fdot2 (const half2_t &a, const half2_t &b, float &c) |
| template<typename TA , typename TB , typename TC , int SrcLaneIdx, bool ShareA> | |
| __device__ void | inner_product_dpp (const TA &a, const TB &b, TC &c) |
Variables | |
| constexpr index_t | lane_group_size = 8 |
| Number of lanes that can share data using DPP8 modifiers. More... | |
| constexpr std::array< int, dpp8::lane_group_size > | IntrinsicMaskDpp8 |
Function Documentation
◆ get_dpp_sel_mask_broadcast()
|
constexpr |
Returns the DPP8 sel modifier as the integer required by the intrinsic instruction.
◆ inline_v_dot2c_dpp8()
| __device__ void ck::dpp8::inline_v_dot2c_dpp8 | ( | const half2_t & | a, |
| const half2_t & | b, | ||
| float & | c | ||
| ) |
Dot product of two vectors using v_dot instruction with DPP8 submitted as inline assembly.
◆ inline_v_dot2c_dpp8_instr()
| __device__ void ck::dpp8::inline_v_dot2c_dpp8_instr | ( | const half2_t & | a, |
| const half2_t & | b, | ||
| float & | c | ||
| ) |
◆ inline_v_dot2c_dpp8_instr< 0 >()
| __device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 0 > | ( | const half2_t & | a, |
| const half2_t & | b, | ||
| float & | c | ||
| ) |
◆ inline_v_dot2c_dpp8_instr< 1 >()
| __device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 1 > | ( | const half2_t & | a, |
| const half2_t & | b, | ||
| float & | c | ||
| ) |
◆ inline_v_dot2c_dpp8_instr< 2 >()
| __device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 2 > | ( | const half2_t & | a, |
| const half2_t & | b, | ||
| float & | c | ||
| ) |
◆ inline_v_dot2c_dpp8_instr< 3 >()
| __device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 3 > | ( | const half2_t & | a, |
| const half2_t & | b, | ||
| float & | c | ||
| ) |
◆ inline_v_dot2c_dpp8_instr< 4 >()
| __device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 4 > | ( | const half2_t & | a, |
| const half2_t & | b, | ||
| float & | c | ||
| ) |
◆ inline_v_dot2c_dpp8_instr< 5 >()
| __device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 5 > | ( | const half2_t & | a, |
| const half2_t & | b, | ||
| float & | c | ||
| ) |
◆ inline_v_dot2c_dpp8_instr< 6 >()
| __device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 6 > | ( | const half2_t & | a, |
| const half2_t & | b, | ||
| float & | c | ||
| ) |
◆ inline_v_dot2c_dpp8_instr< 7 >()
| __device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 7 > | ( | const half2_t & | a, |
| const half2_t & | b, | ||
| float & | c | ||
| ) |
◆ inner_product_dpp()
| __device__ void ck::dpp8::inner_product_dpp | ( | const TA & | a, |
| const TB & | b, | ||
| TC & | c | ||
| ) |
Dot product of two input vectors a, b using v_dot instructions with DPP modifier.
The DPP modifier allows us to share one of the vectors between lanes in a lane group. When ShareA is set, the instruction uses vector a from lane SrcLaneIdx of the same lane group (8 lanes per lane group in DPP8). When ShareA is not set, vector b is shared instead. Note that all the threads in a lane group use the same vector (broadcast pattern).
SrcLaneIdx must be in range from 0 to 7.
◆ intrinsic_fdot2()
| __device__ void ck::dpp8::intrinsic_fdot2 | ( | const half2_t & | a, |
| const half2_t & | b, | ||
| float & | c | ||
| ) |
Dot product of two vectors using v_dot instruction with DPP8 submitted using intrinsics.
◆ intrinsic_fdot2_impl()
| __device__ void ck::dpp8::intrinsic_fdot2_impl | ( | const half2_t & | a, |
| const half2_t & | b, | ||
| float & | c | ||
| ) |
Variable Documentation
◆ IntrinsicMaskDpp8
|
constexpr |
DPP8 intrinsics expect to get an integer mask, so the integers for the specific broadcast patterns are hardcoded here.
◆ lane_group_size
|
constexpr |
Number of lanes that can share data using DPP8 modifiers.