/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-hipcub/checkouts/docs-5.7.0/hipcub/include/hipcub/backend/rocprim/thread/thread_store.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-hipcub/checkouts/docs-5.7.0/hipcub/include/hipcub/backend/rocprim/thread/thread_store.hpp Source File#

hipCUB: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-hipcub/checkouts/docs-5.7.0/hipcub/include/hipcub/backend/rocprim/thread/thread_store.hpp Source File
thread_store.hpp
1 /******************************************************************************
2  * Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4  * Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  * * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * * Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * * Neither the name of the NVIDIA CORPORATION nor the
14  * names of its contributors may be used to endorse or promote products
15  * derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  ******************************************************************************/
29 
30 #ifndef HIPCUB_ROCPRIM_THREAD_THREAD_STORE_HPP_
31 #define HIPCUB_ROCPRIM_THREAD_THREAD_STORE_HPP_
32 BEGIN_HIPCUB_NAMESPACE
33 
34 enum CacheStoreModifier
35 {
36  STORE_DEFAULT,
37  STORE_WB,
38  STORE_CG,
39  STORE_CS,
40  STORE_WT,
41  STORE_VOLATILE,
42 };
43 
44 // TODO add to detail namespace
45 // TODO cleanup
46 template<CacheStoreModifier MODIFIER = STORE_DEFAULT, typename T>
47 HIPCUB_DEVICE __forceinline__ void AsmThreadStore(void * ptr, T val)
48 {
49  __builtin_memcpy(ptr, &val, sizeof(T));
50 }
51 
52 #if HIPCUB_THREAD_STORE_USE_CACHE_MODIFIERS == 1
53 
54 // NOTE: the reason there is an interim_type is because of a bug for 8bit types.
55 // TODO fix flat_store_ubyte and flat_store_sbyte issues
56 
57 // Important for syncing. Check section 9.2.2 or 7.3 in the following document
58 // https://developer.amd.com/wordpress/media/2013/12/AMD_GCN3_Instruction_Set_Architecture_rev1.1.pdf
59 #define HIPCUB_ASM_THREAD_STORE(cache_modifier, \
60  llvm_cache_modifier, \
61  type, \
62  interim_type, \
63  asm_operator, \
64  output_modifier, \
65  wait_cmd) \
66  template<> \
67  HIPCUB_DEVICE __forceinline__ void AsmThreadStore<cache_modifier, type>(void * ptr, type val) \
68  { \
69  interim_type temp_val = val; \
70  asm volatile(#asm_operator " %0, %1 " llvm_cache_modifier : : "v"(ptr), #output_modifier(temp_val)); \
71  asm volatile("s_waitcnt " wait_cmd "(%0)" : : "I"(0x00)); \
72  }
73 
74 // TODO fix flat_store_ubyte and flat_store_sbyte issues
75 // TODO Add specialization for custom larger data types
76 #define HIPCUB_ASM_THREAD_STORE_GROUP(cache_modifier, llvm_cache_modifier, wait_cmd) \
77  HIPCUB_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, int8_t, int16_t, flat_store_byte, v, wait_cmd); \
78  HIPCUB_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, int16_t, int16_t, flat_store_short, v, wait_cmd); \
79  HIPCUB_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, uint8_t, uint16_t, flat_store_byte, v, wait_cmd); \
80  HIPCUB_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, uint16_t, uint16_t, flat_store_short, v, wait_cmd); \
81  HIPCUB_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, uint32_t, uint32_t, flat_store_dword, v, wait_cmd); \
82  HIPCUB_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, float, uint32_t, flat_store_dword, v, wait_cmd); \
83  HIPCUB_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, uint64_t, uint64_t, flat_store_dwordx2, v, wait_cmd); \
84  HIPCUB_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, double, uint64_t, flat_store_dwordx2, v, wait_cmd);
85 
86 #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
87 HIPCUB_ASM_THREAD_STORE_GROUP(STORE_WB, "sc0 sc1", ""); // TODO: gfx942 validation
88 HIPCUB_ASM_THREAD_STORE_GROUP(STORE_CG, "sc0 sc1", "");
89 HIPCUB_ASM_THREAD_STORE_GROUP(STORE_WT, "sc0 sc1", "vmcnt");
90 HIPCUB_ASM_THREAD_STORE_GROUP(STORE_VOLATILE, "sc0 sc1", "vmcnt");
91 #else
92 HIPCUB_ASM_THREAD_STORE_GROUP(STORE_WB, "glc", "");
93 HIPCUB_ASM_THREAD_STORE_GROUP(STORE_CG, "glc slc", "");
94 HIPCUB_ASM_THREAD_STORE_GROUP(STORE_WT, "glc", "vmcnt");
95 HIPCUB_ASM_THREAD_STORE_GROUP(STORE_VOLATILE, "glc", "vmcnt");
96 #endif
97 
98 // TODO find correct modifiers to match these
99 HIPCUB_ASM_THREAD_STORE_GROUP(STORE_CS, "", "");
100 
101 #endif
102 
103 template<CacheStoreModifier MODIFIER = STORE_DEFAULT, typename OutputIteratorT, typename T>
104 __device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val)
105 {
106  ThreadStore<MODIFIER>(&(*itr), val);
107 }
108 
109 template<CacheStoreModifier MODIFIER = STORE_DEFAULT, typename T>
110 __device__ __forceinline__ void ThreadStore(T * ptr, T val)
111 {
112  AsmThreadStore<MODIFIER, T>(ptr, val);
113 }
114 
115 END_HIPCUB_NAMESPACE
116 #endif