dpp8 Namespace Reference#
Classes | |
struct | dpp_datatypes |
struct | dpp_datatypes< half_t > |
struct | DppLanegroupGemm |
Functions | |
template<int SrcLaneIdx> | |
__device__ void | inline_v_dot2c_dpp8_instr (const half2_t &a, const half2_t &b, float &c) |
template<> | |
__device__ void | inline_v_dot2c_dpp8_instr< 0 > (const half2_t &a, const half2_t &b, float &c) |
template<> | |
__device__ void | inline_v_dot2c_dpp8_instr< 1 > (const half2_t &a, const half2_t &b, float &c) |
template<> | |
__device__ void | inline_v_dot2c_dpp8_instr< 2 > (const half2_t &a, const half2_t &b, float &c) |
template<> | |
__device__ void | inline_v_dot2c_dpp8_instr< 3 > (const half2_t &a, const half2_t &b, float &c) |
template<> | |
__device__ void | inline_v_dot2c_dpp8_instr< 4 > (const half2_t &a, const half2_t &b, float &c) |
template<> | |
__device__ void | inline_v_dot2c_dpp8_instr< 5 > (const half2_t &a, const half2_t &b, float &c) |
template<> | |
__device__ void | inline_v_dot2c_dpp8_instr< 6 > (const half2_t &a, const half2_t &b, float &c) |
template<> | |
__device__ void | inline_v_dot2c_dpp8_instr< 7 > (const half2_t &a, const half2_t &b, float &c) |
template<int SrcLaneIdx, bool ShareA> | |
__device__ void | inline_v_dot2c_dpp8 (const half2_t &a, const half2_t &b, float &c) |
template<int SrcLaneIdx> | |
constexpr int | get_dpp_sel_mask_broadcast () |
template<int SrcLaneIdx> | |
__device__ void | intrinsic_fdot2_impl (const half2_t &a, const half2_t &b, float &c) |
template<int SrcLaneIdx, bool ShareA> | |
__device__ void | intrinsic_fdot2 (const half2_t &a, const half2_t &b, float &c) |
template<typename TA , typename TB , typename TC , int SrcLaneIdx, bool ShareA> | |
__device__ void | inner_product_dpp (const TA &a, const TB &b, TC &c) |
Variables | |
constexpr index_t | lane_group_size = 8 |
Number of lanes that can share data using DPP8 modifiers. More... | |
constexpr std::array< int, dpp8::lane_group_size > | IntrinsicMaskDpp8 |
Function Documentation
◆ get_dpp_sel_mask_broadcast()
|
constexpr |
Returns the DPP8 sel modifier as the integer required by the intrinsic instruction.
◆ inline_v_dot2c_dpp8()
__device__ void ck::dpp8::inline_v_dot2c_dpp8 | ( | const half2_t & | a, |
const half2_t & | b, | ||
float & | c | ||
) |
Dot product of two vectors using v_dot
instruction with DPP8 submitted as inline assembly.
◆ inline_v_dot2c_dpp8_instr()
__device__ void ck::dpp8::inline_v_dot2c_dpp8_instr | ( | const half2_t & | a, |
const half2_t & | b, | ||
float & | c | ||
) |
◆ inline_v_dot2c_dpp8_instr< 0 >()
__device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 0 > | ( | const half2_t & | a, |
const half2_t & | b, | ||
float & | c | ||
) |
◆ inline_v_dot2c_dpp8_instr< 1 >()
__device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 1 > | ( | const half2_t & | a, |
const half2_t & | b, | ||
float & | c | ||
) |
◆ inline_v_dot2c_dpp8_instr< 2 >()
__device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 2 > | ( | const half2_t & | a, |
const half2_t & | b, | ||
float & | c | ||
) |
◆ inline_v_dot2c_dpp8_instr< 3 >()
__device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 3 > | ( | const half2_t & | a, |
const half2_t & | b, | ||
float & | c | ||
) |
◆ inline_v_dot2c_dpp8_instr< 4 >()
__device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 4 > | ( | const half2_t & | a, |
const half2_t & | b, | ||
float & | c | ||
) |
◆ inline_v_dot2c_dpp8_instr< 5 >()
__device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 5 > | ( | const half2_t & | a, |
const half2_t & | b, | ||
float & | c | ||
) |
◆ inline_v_dot2c_dpp8_instr< 6 >()
__device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 6 > | ( | const half2_t & | a, |
const half2_t & | b, | ||
float & | c | ||
) |
◆ inline_v_dot2c_dpp8_instr< 7 >()
__device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 7 > | ( | const half2_t & | a, |
const half2_t & | b, | ||
float & | c | ||
) |
◆ inner_product_dpp()
__device__ void ck::dpp8::inner_product_dpp | ( | const TA & | a, |
const TB & | b, | ||
TC & | c | ||
) |
Dot product of two input vectors a
, b
using v_dot
instructions with DPP modifier.
DPP modifier allows us to share one of the vectors between lanes in a lane group. When ShareA
is set, instruction uses vector a
from lane SrcLaneIdx
from the same lane group (8 lanes per lane group in DPP8). When ShareA
is not set, vector b
is shared. Note that all the threads in a lane group use the same vector (a broadcast pattern).
SrcLaneIdx
must be in range from 0 to 7.
◆ intrinsic_fdot2()
__device__ void ck::dpp8::intrinsic_fdot2 | ( | const half2_t & | a, |
const half2_t & | b, | ||
float & | c | ||
) |
Dot product of two vectors using v_dot
instruction with DPP8 submitted using intrinsics.
◆ intrinsic_fdot2_impl()
__device__ void ck::dpp8::intrinsic_fdot2_impl | ( | const half2_t & | a, |
const half2_t & | b, | ||
float & | c | ||
) |
Variable Documentation
◆ IntrinsicMaskDpp8
|
constexpr |
DPP8 intrinsics expect an integer mask, so the integers for the specific broadcast patterns are hardcoded here.
◆ lane_group_size
|
constexpr |
Number of lanes that can share data using DPP8 modifiers.