30 #ifndef HIPCUB_ROCPRIM_THREAD_THREAD_STORE_HPP_
31 #define HIPCUB_ROCPRIM_THREAD_THREAD_STORE_HPP_
32 BEGIN_HIPCUB_NAMESPACE
34 enum CacheStoreModifier
46 template<CacheStoreModifier MODIFIER = STORE_DEFAULT,
typename T>
47 HIPCUB_DEVICE __forceinline__
void AsmThreadStore(
void * ptr, T val)
49 __builtin_memcpy(ptr, &val,
sizeof(T));
52 #if HIPCUB_THREAD_STORE_USE_CACHE_MODIFIERS == 1
59 #define HIPCUB_ASM_THREAD_STORE(cache_modifier, \
60 llvm_cache_modifier, \
67 HIPCUB_DEVICE __forceinline__ void AsmThreadStore<cache_modifier, type>(void * ptr, type val) \
69 interim_type temp_val = val; \
70 asm volatile(#asm_operator " %0, %1 " llvm_cache_modifier : : "v"(ptr), #output_modifier(temp_val)); \
71 asm volatile("s_waitcnt " wait_cmd "(%0)" : : "I"(0x00)); \
76 #define HIPCUB_ASM_THREAD_STORE_GROUP(cache_modifier, llvm_cache_modifier, wait_cmd) \
77 HIPCUB_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, int8_t, int16_t, flat_store_byte, v, wait_cmd); \
78 HIPCUB_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, int16_t, int16_t, flat_store_short, v, wait_cmd); \
79 HIPCUB_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, uint8_t, uint16_t, flat_store_byte, v, wait_cmd); \
80 HIPCUB_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, uint16_t, uint16_t, flat_store_short, v, wait_cmd); \
81 HIPCUB_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, uint32_t, uint32_t, flat_store_dword, v, wait_cmd); \
82 HIPCUB_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, float, uint32_t, flat_store_dword, v, wait_cmd); \
83 HIPCUB_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, uint64_t, uint64_t, flat_store_dwordx2, v, wait_cmd); \
84 HIPCUB_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, double, uint64_t, flat_store_dwordx2, v, wait_cmd);
86 #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
87 HIPCUB_ASM_THREAD_STORE_GROUP(STORE_WB,
"sc0 sc1",
"");
88 HIPCUB_ASM_THREAD_STORE_GROUP(STORE_CG,
"sc0 sc1",
"");
89 HIPCUB_ASM_THREAD_STORE_GROUP(STORE_WT,
"sc0 sc1",
"vmcnt");
90 HIPCUB_ASM_THREAD_STORE_GROUP(STORE_VOLATILE,
"sc0 sc1",
"vmcnt");
92 HIPCUB_ASM_THREAD_STORE_GROUP(STORE_WB,
"glc",
"");
93 HIPCUB_ASM_THREAD_STORE_GROUP(STORE_CG,
"glc slc",
"");
94 HIPCUB_ASM_THREAD_STORE_GROUP(STORE_WT,
"glc",
"vmcnt");
95 HIPCUB_ASM_THREAD_STORE_GROUP(STORE_VOLATILE,
"glc",
"vmcnt");
99 HIPCUB_ASM_THREAD_STORE_GROUP(STORE_CS,
"",
"");
103 template<CacheStoreModifier MODIFIER = STORE_DEFAULT,
typename OutputIteratorT,
typename T>
104 __device__ __forceinline__
void ThreadStore(OutputIteratorT itr, T val)
106 ThreadStore<MODIFIER>(&(*itr), val);
109 template<CacheStoreModifier MODIFIER = STORE_DEFAULT,
typename T>
110 __device__ __forceinline__
void ThreadStore(T * ptr, T val)
112 AsmThreadStore<MODIFIER, T>(ptr, val);