11 __device__
auto amd_global_load_transpose_to_vgpr(
const T* in_ptr)
13 using vector_t =
typename vector_type<T, 8>::type;
14 if constexpr(
sizeof(T) == 2)
16 typedef __attribute__((__vector_size__(8 *
sizeof(__fp16)))) __fp16 llvm_fp16x8_t;
17 __attribute__((address_space(1))) llvm_fp16x8_t* glb_ptr =
18 reinterpret_cast<__attribute__((address_space(1))) llvm_fp16x8_t*>(
20 return
bit_cast<vector_t>(__builtin_amdgcn_global_load_tr_b128_v8f16(glb_ptr));
22 else if constexpr(sizeof(T) == 1)
24 typedef __attribute__((__vector_size__(2 *
sizeof(
int))))
int llvm_intx2_t;
25 __attribute__((address_space(1))) llvm_intx2_t* glb_ptr =
26 reinterpret_cast<__attribute__((address_space(1))) llvm_intx2_t*>(
28 return
bit_cast<vector_t>(__builtin_amdgcn_global_load_tr_b64_v2i32(glb_ptr));
32 static_assert(
false,
"not implemented");
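// Referenced symbols (cross-reference notes carried over from the generated source listing):
//   bit_cast  : __host__ constexpr __device__ Y bit_cast(const X &x)  -- Definition: type.hpp:306
//   uintptr_t : _W64 unsigned int uintptr_t                           -- Definition: stdint.h:164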