30 #ifndef HIPCUB_ROCPRIM_THREAD_THREAD_LOAD_HPP_
31 #define HIPCUB_ROCPRIM_THREAD_THREAD_LOAD_HPP_
32 BEGIN_HIPCUB_NAMESPACE
// Enumeration used as the cache-load-modifier selector of the thread-load
// templates; its underlying type is fixed to int32_t.
// NOTE(review): the enumerator list is not visible in this excerpt. Code in this
// file refers to LOAD_DEFAULT, LOAD_CA, LOAD_CG, LOAD_CV, LOAD_VOLATILE, LOAD_LDG
// and LOAD_CS, which are presumably its enumerators - confirm against the full
// definition.
34 enum CacheLoadModifier : int32_t
// Primary AsmThreadLoad template.
//
// Reads a value of type T from `ptr` by copying sizeof(T) bytes into `retval`
// with __builtin_memcpy. MODIFIER is not referenced by the lines visible here;
// the modifier-specific behaviour comes from the explicit specializations that
// the HIPCUB_ASM_THREAD_LOAD macro generates for AsmThreadLoad<modifier, type>.
//
// Template parameters:
//   MODIFIER - cache load modifier, defaults to LOAD_DEFAULT.
//   T        - type of the value to load; declared return type of the function.
// Parameters:
//   ptr      - untyped, non-const address the bytes are copied from.
// NOTE(review): `ptr` is `void *` (not `const void *`), so callers holding a
// pointer-to-const presumably need a cast - confirm at the call sites.
45 template<CacheLoadModifier MODIFIER = LOAD_DEFAULT,
typename T>
46 HIPCUB_DEVICE __forceinline__ T AsmThreadLoad(
void * ptr)
49 __builtin_memcpy(&retval, ptr,
sizeof(T));
53 #if HIPCUB_THREAD_LOAD_USE_CACHE_MODIFIERS == 1
// HIPCUB_ASM_THREAD_LOAD: generates one AsmThreadLoad<cache_modifier, type>
// definition taking `void * ptr`, implemented with an inline-asm load.
//
// Macro arguments used by the visible lines:
//   cache_modifier      - first template argument of the generated AsmThreadLoad.
//   llvm_cache_modifier - string literal appended after the " %0, %1 " operand
//                         list of the load instruction.
//   type                - second template argument and declared return type.
//   interim_type        - type of the local `retval` bound to the asm output.
//   asm_operator        - instruction mnemonic; stringized with `#`.
//   output_modifier     - register-constraint letter; stringized and appended to
//                         "=" to form the output constraint on `retval`.
//   wait_cmd            - string literal placed between "s_waitcnt " and "(0)".
//
// The input operand is `ptr`, passed with the "v" constraint.
// NOTE(review): when wait_cmd is the empty string the emitted second instruction
// text is "s_waitcnt (0)" - confirm that this is what the assembler is meant to
// receive for the modifiers that pass "".
// No comments are placed inside the macro body because every line is joined by
// a trailing backslash.
57 #define HIPCUB_ASM_THREAD_LOAD(cache_modifier, \
58 llvm_cache_modifier, \
65 HIPCUB_DEVICE __forceinline__ type AsmThreadLoad<cache_modifier, type>(void * ptr) \
67 interim_type retval; \
69 #asm_operator " %0, %1 " llvm_cache_modifier "\n" \
70 "\ts_waitcnt " wait_cmd "(0)" : "=" #output_modifier(retval) : "v"(ptr) \
// HIPCUB_ASM_THREAD_LOAD_GROUP: expands HIPCUB_ASM_THREAD_LOAD once for each of
// eight element types, all sharing the same cache_modifier, llvm_cache_modifier
// and wait_cmd, and all using `v` as the output modifier.
//
//   type      interim_type  instruction
//   int8_t    int16_t       flat_load_sbyte
//   int16_t   int16_t       flat_load_sshort
//   uint8_t   uint16_t      flat_load_ubyte
//   uint16_t  uint16_t      flat_load_ushort
//   uint32_t  uint32_t      flat_load_dword
//   float     uint32_t      flat_load_dword
//   uint64_t  uint64_t      flat_load_dwordx2
//   double    uint64_t      flat_load_dwordx2
//
// The 8-bit types go through a 16-bit interim variable, and float/double go
// through an unsigned integer of the same width.
// NOTE(review): every expansion here is already followed by `;`, and the call
// sites in this file also end in `;`, so each use leaves an extra empty
// declaration behind - harmless, but may trigger extra-semicolon warnings.
76 #define HIPCUB_ASM_THREAD_LOAD_GROUP(cache_modifier, llvm_cache_modifier, wait_cmd) \
77 HIPCUB_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, int8_t, int16_t, flat_load_sbyte, v, wait_cmd); \
78 HIPCUB_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, int16_t, int16_t, flat_load_sshort, v, wait_cmd); \
79 HIPCUB_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, uint8_t, uint16_t, flat_load_ubyte, v, wait_cmd); \
80 HIPCUB_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, uint16_t, uint16_t, flat_load_ushort, v, wait_cmd); \
81 HIPCUB_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, uint32_t, uint32_t, flat_load_dword, v, wait_cmd); \
82 HIPCUB_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, float, uint32_t, flat_load_dword, v, wait_cmd); \
83 HIPCUB_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, uint64_t, uint64_t, flat_load_dwordx2, v, wait_cmd); \
84 HIPCUB_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, double, uint64_t, flat_load_dwordx2, v, wait_cmd);
// Specializations selected when compiling for gfx940, gfx941 or gfx942.
// The modifier strings used on these targets are built from "sc0" and "sc1".
86 #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
// LOAD_CA: "sc0" modifier, empty wait_cmd.
87 HIPCUB_ASM_THREAD_LOAD_GROUP(LOAD_CA,
"sc0",
"");
// LOAD_CG: "sc1" modifier, empty wait_cmd.
88 HIPCUB_ASM_THREAD_LOAD_GROUP(LOAD_CG,
"sc1",
"");
// LOAD_CV: "sc0 sc1" modifier, followed by s_waitcnt vmcnt(0).
89 HIPCUB_ASM_THREAD_LOAD_GROUP(LOAD_CV,
"sc0 sc1",
"vmcnt");
// LOAD_VOLATILE: same instruction text as LOAD_CV ("sc0 sc1" + vmcnt wait).
90 HIPCUB_ASM_THREAD_LOAD_GROUP(LOAD_VOLATILE,
"sc0 sc1",
"vmcnt");
// Second set of specializations for the same four modifiers, using "glc"/"slc"
// modifier strings instead of "sc0"/"sc1".
// NOTE(review): presumably this is the alternative branch of the gfx94x
// conditional (for all other targets); the directive separating the two sets
// is not visible in this excerpt - confirm.
// LOAD_CA: "glc" modifier, empty wait_cmd.
92 HIPCUB_ASM_THREAD_LOAD_GROUP(LOAD_CA,
"glc",
"");
// LOAD_CG: "glc slc" modifier, empty wait_cmd.
93 HIPCUB_ASM_THREAD_LOAD_GROUP(LOAD_CG,
"glc slc",
"");
// LOAD_CV: "glc" modifier, followed by s_waitcnt vmcnt(0).
94 HIPCUB_ASM_THREAD_LOAD_GROUP(LOAD_CV,
"glc",
"vmcnt");
// LOAD_VOLATILE: same instruction text as LOAD_CV ("glc" + vmcnt wait).
95 HIPCUB_ASM_THREAD_LOAD_GROUP(LOAD_VOLATILE,
"glc",
"vmcnt");
// LOAD_LDG and LOAD_CS are specialized with an empty modifier string and an
// empty wait_cmd, so the generated load instruction carries no cache-modifier
// suffix for these two.
// NOTE(review): whether these two lines sit inside or after the target-specific
// conditional cannot be determined from this excerpt.
99 HIPCUB_ASM_THREAD_LOAD_GROUP(LOAD_LDG,
"",
"");
100 HIPCUB_ASM_THREAD_LOAD_GROUP(LOAD_CS,
"",
"");
// ThreadLoad, iterator overload.
//
// Dereferences `itr`, takes the address of the result and forwards that pointer
// to ThreadLoad<MODIFIER>; the loaded value is stored in `retval`.
//
// Template parameters:
//   MODIFIER       - cache load modifier, defaults to LOAD_DEFAULT.
//   InputIteratorT - iterator type; the declared return type is its
//                    std::iterator_traits value_type.
// Parameters:
//   itr            - iterator designating the element to load. Because the code
//                    evaluates `&(*itr)`, dereferencing it must yield something
//                    whose address can be taken.
104 template<CacheLoadModifier MODIFIER = LOAD_DEFAULT,
typename InputIteratorT>
105 HIPCUB_DEVICE __forceinline__
106 typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(InputIteratorT itr)
// Local alias for the iterator's value type.
108 using T =
typename std::iterator_traits<InputIteratorT>::value_type;
109 T retval = ThreadLoad<MODIFIER>(&(*itr));
113 template<CacheLoadModifier MODIFIER = LOAD_DEFAULT,
typename T>
114 HIPCUB_DEVICE __forceinline__ T
117 return AsmThreadLoad<MODIFIER, T>(ptr);