21 #ifndef ROCRAND_COMMON_H_ 
   22 #define ROCRAND_COMMON_H_ 
   24 #define ROCRAND_2POW16_INV (1.5258789e-05f) 
   25 #define ROCRAND_2POW16_INV_2PI (9.58738e-05f) 
   26 #define ROCRAND_2POW32_INV (2.3283064e-10f) 
   27 #define ROCRAND_2POW32_INV_DOUBLE (2.3283064365386963e-10) 
   28 #define ROCRAND_2POW64_INV (5.4210109e-20f) 
   29 #define ROCRAND_2POW64_INV_DOUBLE (5.4210108624275221700372640043497e-20) 
   30 #define ROCRAND_2POW32_INV_2PI (1.46291807e-09f) 
   31 #define ROCRAND_2POW53_INV_DOUBLE (1.1102230246251565e-16) 
   32 #define ROCRAND_PI (3.141592653f) 
   33 #define ROCRAND_PI_DOUBLE  (3.1415926535897932) 
   34 #define ROCRAND_2PI (6.2831855f) 
   35 #define ROCRAND_SQRT2 (1.4142135f) 
   36 #define ROCRAND_SQRT2_DOUBLE (1.4142135623730951) 
   38 #include <hip/hip_runtime.h> 
   42 #define ROCRAND_KERNEL __global__ static 
   44 #if __HIP_DEVICE_COMPILE__            \ 
   45     && (defined(__HIP_PLATFORM_AMD__) \ 
   46         || (defined(__HIP_PLATFORM_NVCC__) && (__CUDA_ARCH__ >= 530))) 
   47     #define ROCRAND_HALF_MATH_SUPPORTED 
   58 #define ROCRAND_STRINGIZE(X) ROCRAND_DO_STRINGIZE(X) 
   59 #define ROCRAND_DO_STRINGIZE(X) #X 
   70 #if defined(__INTEL_COMPILER) 
   71     #define ROCRAND_PRAGMA_MESSAGE(x) \ 
   72         __pragma(message(__FILE__ "(" ROCRAND_STRINGIZE(__LINE__) "): note: " x))
 
   73 #elif defined(__GNUC__) 
   74     #define ROCRAND_PRAGMA_MESSAGE(x) _Pragma(ROCRAND_STRINGIZE(message(x))) 
   75 #elif defined(_MSC_VER) 
   76     #define ROCRAND_PRAGMA_MESSAGE(x) \ 
   77         __pragma(message(__FILE__ "(" ROCRAND_STRINGIZE(__LINE__) "): note: " x))
 
   79     #define ROCRAND_PRAGMA_MESSAGE(x) 
   82 #if __cplusplus >= 201402L 
   83     #define ROCRAND_DEPRECATED(msg) [[deprecated(msg)]] 
   84 #elif defined(_MSC_VER) && !defined(__clang__) 
   85     #define ROCRAND_DEPRECATED(msg) __declspec(deprecated(msg)) 
   86 #elif defined(__clang__) || defined(__GNUC__) 
   87     #define ROCRAND_DEPRECATED(msg) __attribute__((deprecated(msg))) 
   89     #define ROCRAND_DEPRECATED(msg) 
   92 namespace rocrand_device {
 
   95 #if ( defined(__HIP_PLATFORM_NVCC__) || \ 
   96       defined(__gfx801__) || \ 
   97       defined(__gfx802__) || \ 
   98       defined(__gfx803__) || \ 
   99       defined(__gfx810__) || \ 
  100       defined(__gfx900__) || \ 
  101       defined(__gfx902__) || \ 
  102       defined(__gfx904__) || \ 
  103       defined(__gfx906__) || \ 
  104       defined(__gfx908__) || \ 
  105       defined(__gfx909__) || \ 
  106       defined(__gfx1030__) ) 
  107   #if !defined(ROCRAND_ENABLE_INLINE_ASM) 
  108     #define ROCRAND_ENABLE_INLINE_ASM 
  111   #if defined(__HIP_DEVICE_COMPILE__) && defined(ROCRAND_ENABLE_INLINE_ASM) 
  112     #undef ROCRAND_ENABLE_INLINE_ASM 
  116 __forceinline__ __device__ __host__ 
unsigned long long 
  117     mad_u64_u32(
const unsigned int x, 
const unsigned int y, 
const unsigned long long z)
 
  119 #if defined(__HIP_PLATFORM_AMD__) && defined(__HIP_DEVICE_COMPILE__) \ 
  120     && defined(ROCRAND_ENABLE_INLINE_ASM) 
  122     #if __AMDGCN_WAVEFRONT_SIZE == 64u 
  123     using sgpr_t = 
unsigned long long;
 
  124   #elif __AMDGCN_WAVEFRONT_SIZE == 32u 
  125     using sgpr_t = 
unsigned int;
 
  128     unsigned long long r;
 
  134     asm volatile(
"v_mad_u64_u32 %0, %1, %2, %3, %4" 
  135       : 
"=v"(r), 
"=s"(c) : 
"r"(x), 
"v"(y), 
"v"(z)
 
  138   #elif defined(__HIP_PLATFORM_NVCC__) && defined(__HIP_DEVICE_COMPILE__) \ 
  139         && defined(ROCRAND_ENABLE_INLINE_ASM) 
  141     unsigned long long r;
 
  142     asm(
"mad.wide.u32 %0, %1, %2, %3;" 
  143         : 
"=l"(r) : 
"r"(x), 
"r"(y), 
"l"(z)
 
  149     return static_cast<unsigned long long>(x) * 
static_cast<unsigned long long>(y) + z;
 
  156 template<
typename Engine>
 
  157 struct engine_boxmuller_helper
 
  159     static __forceinline__ __device__ __host__ 
bool has_float(
const Engine* engine)
 
  161         return engine->m_state.boxmuller_float_state != 0;
 
  164     static __forceinline__ __device__ __host__ 
float get_float(Engine* engine)
 
  166         engine->m_state.boxmuller_float_state = 0;
 
  167         return engine->m_state.boxmuller_float;
 
  170     static __forceinline__ __device__ __host__ 
void save_float(Engine* engine, 
float f)
 
  172         engine->m_state.boxmuller_float_state = 1;
 
  173         engine->m_state.boxmuller_float = f;
 
  176     static __forceinline__ __device__ __host__ 
bool has_double(
const Engine* engine)
 
  178         return engine->m_state.boxmuller_double_state != 0;
 
  181     static __forceinline__ __device__ __host__ 
float get_double(Engine* engine)
 
  183         engine->m_state.boxmuller_double_state = 0;
 
  184         return engine->m_state.boxmuller_double;
 
  187     static __forceinline__ __device__ __host__ 
void save_double(Engine* engine, 
double d)
 
  189         engine->m_state.boxmuller_double_state = 1;
 
  190         engine->m_state.boxmuller_double = d;
 
  195 __forceinline__ __device__ __host__ 
void split_ull(T& lo, T& hi, 
unsigned long long int val);
 
  198 __forceinline__ __device__ __host__ 
void 
  199     split_ull(
unsigned int& lo, 
unsigned int& hi, 
unsigned long long int val)
 
  201     lo = val & 0xFFFFFFFF;
 
  202     hi = (val >> 32) & 0xFFFFFFFF;
 
  206 __forceinline__ __device__ __host__ 
void 
  207     split_ull(
unsigned long long int& lo, 
unsigned long long int& hi, 
unsigned long long int val)