dpp8 Namespace Reference

dpp8 Namespace Reference#

Composable Kernel: ck::dpp8 Namespace Reference

Classes
struct	dpp_datatypes

struct	dpp_datatypes< half_t >

struct	DppLanegroupGemm

Functions
template<int SrcLaneIdx>
__device__ void	inline_v_dot2c_dpp8_instr (const half2_t &a, const half2_t &b, float &c)

template<>
__device__ void	inline_v_dot2c_dpp8_instr< 0 > (const half2_t &a, const half2_t &b, float &c)

template<>
__device__ void	inline_v_dot2c_dpp8_instr< 1 > (const half2_t &a, const half2_t &b, float &c)

template<>
__device__ void	inline_v_dot2c_dpp8_instr< 2 > (const half2_t &a, const half2_t &b, float &c)

template<>
__device__ void	inline_v_dot2c_dpp8_instr< 3 > (const half2_t &a, const half2_t &b, float &c)

template<>
__device__ void	inline_v_dot2c_dpp8_instr< 4 > (const half2_t &a, const half2_t &b, float &c)

template<>
__device__ void	inline_v_dot2c_dpp8_instr< 5 > (const half2_t &a, const half2_t &b, float &c)

template<>
__device__ void	inline_v_dot2c_dpp8_instr< 6 > (const half2_t &a, const half2_t &b, float &c)

template<>
__device__ void	inline_v_dot2c_dpp8_instr< 7 > (const half2_t &a, const half2_t &b, float &c)

template<int SrcLaneIdx, bool ShareA>
__device__ void	inline_v_dot2c_dpp8 (const half2_t &a, const half2_t &b, float &c)

template<int SrcLaneIdx>
constexpr int	get_dpp_sel_mask_broadcast ()

template<int SrcLaneIdx>
__device__ void	intrinsic_fdot2_impl (const half2_t &a, const half2_t &b, float &c)

template<int SrcLaneIdx, bool ShareA>
__device__ void	intrinsic_fdot2 (const half2_t &a, const half2_t &b, float &c)

template<typename TA , typename TB , typename TC , int SrcLaneIdx, bool ShareA>
__device__ void	inner_product_dpp (const TA &a, const TB &b, TC &c)

Variables
constexpr index_t	lane_group_size = 8
	Number of lanes that can share data using DPP8 modifiers. More...

constexpr std::array< int, dpp8::lane_group_size >	IntrinsicMaskDpp8

Function Documentation

◆ get_dpp_sel_mask_broadcast()

template<int SrcLaneIdx>

constexpr int ck::dpp8::get_dpp_sel_mask_broadcast ( )

constexpr

Returns DPP8 sel modifier as an integer required for the intrinsic instruction.

◆ inline_v_dot2c_dpp8()

template<int SrcLaneIdx, bool ShareA>

__device__ void ck::dpp8::inline_v_dot2c_dpp8	(	const half2_t &	a,
		const half2_t &	b,
		float &	c
	)

Dot product of two vectors using v_dot instruction with DPP8 submitted as inline assembly.

◆ inline_v_dot2c_dpp8_instr()

template<int SrcLaneIdx>

__device__ void ck::dpp8::inline_v_dot2c_dpp8_instr	(	const half2_t &	a,
		const half2_t &	b,
		float &	c
	)

◆ inline_v_dot2c_dpp8_instr< 0 >()

template<>

__device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 0 >	(	const half2_t &	a,
		const half2_t &	b,
		float &	c
	)

◆ inline_v_dot2c_dpp8_instr< 1 >()

template<>

__device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 1 >	(	const half2_t &	a,
		const half2_t &	b,
		float &	c
	)

◆ inline_v_dot2c_dpp8_instr< 2 >()

template<>

__device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 2 >	(	const half2_t &	a,
		const half2_t &	b,
		float &	c
	)

◆ inline_v_dot2c_dpp8_instr< 3 >()

template<>

__device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 3 >	(	const half2_t &	a,
		const half2_t &	b,
		float &	c
	)

◆ inline_v_dot2c_dpp8_instr< 4 >()

template<>

__device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 4 >	(	const half2_t &	a,
		const half2_t &	b,
		float &	c
	)

◆ inline_v_dot2c_dpp8_instr< 5 >()

template<>

__device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 5 >	(	const half2_t &	a,
		const half2_t &	b,
		float &	c
	)

◆ inline_v_dot2c_dpp8_instr< 6 >()

template<>

__device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 6 >	(	const half2_t &	a,
		const half2_t &	b,
		float &	c
	)

◆ inline_v_dot2c_dpp8_instr< 7 >()

template<>

__device__ void ck::dpp8::inline_v_dot2c_dpp8_instr< 7 >	(	const half2_t &	a,
		const half2_t &	b,
		float &	c
	)

◆ inner_product_dpp()

template<typename TA , typename TB , typename TC , int SrcLaneIdx, bool ShareA>

__device__ void ck::dpp8::inner_product_dpp	(	const TA &	a,
		const TB &	b,
		TC &	c
	)

Dot product of two input vectors a, b using v_dot instructions with DPP modifier.

The DPP modifier allows us to share one of the vectors between lanes in a lane group. When ShareA is set, the instruction uses vector a from lane SrcLaneIdx of the same lane group (8 lanes per lane group in DPP8). When ShareA is not set, vector b is shared instead. Note that all the threads in a lane group use the same vector, i.e. a broadcast pattern.

SrcLaneIdx must be in range from 0 to 7.

◆ intrinsic_fdot2()

template<int SrcLaneIdx, bool ShareA>

__device__ void ck::dpp8::intrinsic_fdot2	(	const half2_t &	a,
		const half2_t &	b,
		float &	c
	)

Dot product of two vectors using v_dot instruction with DPP8 submitted using intrinsics.

◆ intrinsic_fdot2_impl()

template<int SrcLaneIdx>

__device__ void ck::dpp8::intrinsic_fdot2_impl	(	const half2_t &	a,
		const half2_t &	b,
		float &	c
	)

Variable Documentation

◆ IntrinsicMaskDpp8

constexpr std::array<int, dpp8::lane_group_size> ck::dpp8::IntrinsicMaskDpp8

constexpr

Initial value:

DPP8 intrinsics expect an integer mask, so the integers for the specific broadcast patterns are hardcoded.

◆ lane_group_size

constexpr index_t ck::dpp8::lane_group_size = 8

constexpr

Number of lanes that can share data using DPP8 modifiers.

dpp8 Namespace Reference

dpp8 Namespace Reference#

Classes

Functions

Variables

Function Documentation

◆ get_dpp_sel_mask_broadcast()

◆ inline_v_dot2c_dpp8()

◆ inline_v_dot2c_dpp8_instr()

◆ inline_v_dot2c_dpp8_instr< 0 >()

◆ inline_v_dot2c_dpp8_instr< 1 >()

◆ inline_v_dot2c_dpp8_instr< 2 >()

◆ inline_v_dot2c_dpp8_instr< 3 >()

◆ inline_v_dot2c_dpp8_instr< 4 >()

◆ inline_v_dot2c_dpp8_instr< 5 >()

◆ inline_v_dot2c_dpp8_instr< 6 >()

◆ inline_v_dot2c_dpp8_instr< 7 >()

◆ inner_product_dpp()

◆ intrinsic_fdot2()

◆ intrinsic_fdot2_impl()

Variable Documentation

◆ IntrinsicMaskDpp8

◆ lane_group_size