21 #ifndef ROCRAND_COMMON_H_
22 #define ROCRAND_COMMON_H_
24 #define ROCRAND_2POW16_INV (1.5258789e-05f)
25 #define ROCRAND_2POW16_INV_2PI (9.58738e-05f)
26 #define ROCRAND_2POW32_INV (2.3283064e-10f)
27 #define ROCRAND_2POW32_INV_DOUBLE (2.3283064365386963e-10)
28 #define ROCRAND_2POW64_INV (5.4210109e-20f)
29 #define ROCRAND_2POW64_INV_DOUBLE (5.4210108624275221700372640043497e-20)
30 #define ROCRAND_2POW32_INV_2PI (1.46291807e-09f)
31 #define ROCRAND_2POW53_INV_DOUBLE (1.1102230246251565e-16)
32 #define ROCRAND_PI (3.141592653f)
33 #define ROCRAND_PI_DOUBLE (3.1415926535897932)
34 #define ROCRAND_2PI (6.2831855f)
35 #define ROCRAND_SQRT2 (1.4142135f)
36 #define ROCRAND_SQRT2_DOUBLE (1.4142135623730951)
38 #include <hip/hip_runtime.h>
42 #define ROCRAND_KERNEL __global__ static
44 #if __HIP_DEVICE_COMPILE__ \
45 && (defined(__HIP_PLATFORM_AMD__) \
46 || (defined(__HIP_PLATFORM_NVCC__) && (__CUDA_ARCH__ >= 530)))
47 #define ROCRAND_HALF_MATH_SUPPORTED
58 #define ROCRAND_STRINGIZE(X) ROCRAND_DO_STRINGIZE(X)
59 #define ROCRAND_DO_STRINGIZE(X) #X
70 #if defined(__INTEL_COMPILER)
71 #define ROCRAND_PRAGMA_MESSAGE(x) \
72 __pragma(message(__FILE__ "(" ROCRAND_STRINGIZE(__LINE__) "): note: " x))
73 #elif defined(__GNUC__)
74 #define ROCRAND_PRAGMA_MESSAGE(x) _Pragma(ROCRAND_STRINGIZE(message(x)))
75 #elif defined(_MSC_VER)
76 #define ROCRAND_PRAGMA_MESSAGE(x) \
77 __pragma(message(__FILE__ "(" ROCRAND_STRINGIZE(__LINE__) "): note: " x))
79 #define ROCRAND_PRAGMA_MESSAGE(x)
82 #if __cplusplus >= 201402L
83 #define ROCRAND_DEPRECATED(msg) [[deprecated(msg)]]
84 #elif defined(_MSC_VER) && !defined(__clang__)
85 #define ROCRAND_DEPRECATED(msg) __declspec(deprecated(msg))
86 #elif defined(__clang__) || defined(__GNUC__)
87 #define ROCRAND_DEPRECATED(msg) __attribute__((deprecated(msg)))
89 #define ROCRAND_DEPRECATED(msg)
92 namespace rocrand_device {
95 #if ( defined(__HIP_PLATFORM_NVCC__) || \
96 defined(__gfx801__) || \
97 defined(__gfx802__) || \
98 defined(__gfx803__) || \
99 defined(__gfx810__) || \
100 defined(__gfx900__) || \
101 defined(__gfx902__) || \
102 defined(__gfx904__) || \
103 defined(__gfx906__) || \
104 defined(__gfx908__) || \
105 defined(__gfx909__) || \
106 defined(__gfx1030__) )
107 #if !defined(ROCRAND_ENABLE_INLINE_ASM)
108 #define ROCRAND_ENABLE_INLINE_ASM
111 #if defined(__HIP_DEVICE_COMPILE__) && defined(ROCRAND_ENABLE_INLINE_ASM)
112 #undef ROCRAND_ENABLE_INLINE_ASM
116 __forceinline__ __device__ __host__
unsigned long long
117 mad_u64_u32(
const unsigned int x,
const unsigned int y,
const unsigned long long z)
119 #if defined(__HIP_PLATFORM_AMD__) && defined(__HIP_DEVICE_COMPILE__) \
120 && defined(ROCRAND_ENABLE_INLINE_ASM)
122 #if __AMDGCN_WAVEFRONT_SIZE == 64u
123 using sgpr_t =
unsigned long long;
124 #elif __AMDGCN_WAVEFRONT_SIZE == 32u
125 using sgpr_t =
unsigned int;
128 unsigned long long r;
134 asm volatile(
"v_mad_u64_u32 %0, %1, %2, %3, %4"
135 :
"=v"(r),
"=s"(c) :
"r"(x),
"v"(y),
"v"(z)
138 #elif defined(__HIP_PLATFORM_NVCC__) && defined(__HIP_DEVICE_COMPILE__) \
139 && defined(ROCRAND_ENABLE_INLINE_ASM)
141 unsigned long long r;
142 asm(
"mad.wide.u32 %0, %1, %2, %3;"
143 :
"=l"(r) :
"r"(x),
"r"(y),
"l"(z)
149 return static_cast<unsigned long long>(x) *
static_cast<unsigned long long>(y) + z;
156 template<
typename Engine>
157 struct engine_boxmuller_helper
159 static __forceinline__ __device__ __host__
bool has_float(
const Engine* engine)
161 return engine->m_state.boxmuller_float_state != 0;
164 static __forceinline__ __device__ __host__
float get_float(Engine* engine)
166 engine->m_state.boxmuller_float_state = 0;
167 return engine->m_state.boxmuller_float;
170 static __forceinline__ __device__ __host__
void save_float(Engine* engine,
float f)
172 engine->m_state.boxmuller_float_state = 1;
173 engine->m_state.boxmuller_float = f;
176 static __forceinline__ __device__ __host__
bool has_double(
const Engine* engine)
178 return engine->m_state.boxmuller_double_state != 0;
181 static __forceinline__ __device__ __host__
float get_double(Engine* engine)
183 engine->m_state.boxmuller_double_state = 0;
184 return engine->m_state.boxmuller_double;
187 static __forceinline__ __device__ __host__
void save_double(Engine* engine,
double d)
189 engine->m_state.boxmuller_double_state = 1;
190 engine->m_state.boxmuller_double = d;
195 __forceinline__ __device__ __host__
void split_ull(T& lo, T& hi,
unsigned long long int val);
198 __forceinline__ __device__ __host__
void
199 split_ull(
unsigned int& lo,
unsigned int& hi,
unsigned long long int val)
201 lo = val & 0xFFFFFFFF;
202 hi = (val >> 32) & 0xFFFFFFFF;
206 __forceinline__ __device__ __host__
void
207 split_ull(
unsigned long long int& lo,
unsigned long long int& hi,
unsigned long long int val)