53 #ifndef ROCRAND_PHILOX4X32_10_H_ 
   54 #define ROCRAND_PHILOX4X32_10_H_ 
   56 #include "rocrand/rocrand_common.h" 
   58 #include <hip/hip_runtime.h> 
   62 #define ROCRAND_PHILOX_M4x32_0 0xD2511F53U 
   63 #define ROCRAND_PHILOX_M4x32_1 0xCD9E8D57U 
   64 #define ROCRAND_PHILOX_W32_0 0x9E3779B9U 
   65 #define ROCRAND_PHILOX_W32_1 0xBB67AE85U 
   75 #define ROCRAND_PHILOX4x32_DEFAULT_SEED 0xdeadbeefdeadbeefULL  
   78 namespace rocrand_device
 
   81 class philox4x32_10_engine
 
   84     struct philox4x32_10_state
 
   89         unsigned int substate;
 
   91     #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE 
   97         unsigned int boxmuller_float_state; 
 
   98         unsigned int boxmuller_double_state; 
 
   99         float boxmuller_float; 
 
  100         double boxmuller_double; 
 
  104     __forceinline__ __device__ __host__ philox4x32_10_engine()
 
  114     __forceinline__ __device__ __host__ philox4x32_10_engine(
const unsigned long long seed,
 
  115                                                              const unsigned long long subsequence,
 
  116                                                              const unsigned long long offset)
 
  118         this->seed(seed, subsequence, offset);
 
  126     __forceinline__ __device__ __host__ 
void seed(
unsigned long long       seed_value,
 
  127                                                   const unsigned long long subsequence,
 
  128                                                   const unsigned long long offset)
 
  130         m_state.key.x = 
static_cast<unsigned int>(seed_value);
 
  131         m_state.key.y = 
static_cast<unsigned int>(seed_value >> 32);
 
  132         this->restart(subsequence, offset);
 
  136     __forceinline__ __device__ __host__ 
void discard(
unsigned long long offset)
 
  138         this->discard_impl(offset);
 
  139         this->m_state.result = this->ten_rounds(m_state.counter, m_state.key);
 
  146     __forceinline__ __device__ __host__ 
void discard_subsequence(
unsigned long long subsequence)
 
  148         this->discard_subsequence_impl(subsequence);
 
  149         m_state.result = this->ten_rounds(m_state.counter, m_state.key);
 
  152     __forceinline__ __device__ __host__ 
void restart(
const unsigned long long subsequence,
 
  153                                                      const unsigned long long offset)
 
  155         m_state.counter = {0, 0, 0, 0};
 
  156         m_state.result  = {0, 0, 0, 0};
 
  157         m_state.substate = 0;
 
  158     #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE 
  159         m_state.boxmuller_float_state = 0;
 
  160         m_state.boxmuller_double_state = 0;
 
  162         this->discard_subsequence_impl(subsequence);
 
  163         this->discard_impl(offset);
 
  164         m_state.result = this->ten_rounds(m_state.counter, m_state.key);
 
  167     __forceinline__ __device__ __host__ 
unsigned int operator()()
 
  172     __forceinline__ __device__ __host__ 
unsigned int next()
 
  174     #if defined(__HIP_PLATFORM_AMD__) 
  175         unsigned int ret = ROCRAND_HIPVEC_ACCESS(m_state.result)[m_state.substate];
 
  177         unsigned int ret = (&m_state.result.x)[m_state.substate];
 
  181         if(m_state.substate == 4)
 
  183             m_state.substate = 0;
 
  184             this->discard_state();
 
  185             m_state.result = this->ten_rounds(m_state.counter, m_state.key);
 
  190     __forceinline__ __device__ __host__ uint4 next4()
 
  192         uint4 ret = m_state.result;
 
  193         this->discard_state();
 
  194         m_state.result = this->ten_rounds(m_state.counter, m_state.key);
 
  195         return this->interleave(ret, m_state.result);
 
  201     __forceinline__ __device__ __host__ 
void discard_impl(
unsigned long long offset)
 
  204         m_state.substate += offset & 3;
 
  205         unsigned long long counter_offset = offset / 4;
 
  206         counter_offset += m_state.substate < 4 ? 0 : 1;
 
  207         m_state.substate += m_state.substate < 4 ? 0 : -4;
 
  209         this->discard_state(counter_offset);
 
  213     __forceinline__ __device__ __host__ 
void 
  214         discard_subsequence_impl(
unsigned long long subsequence)
 
  216         unsigned int lo = 
static_cast<unsigned int>(subsequence);
 
  217         unsigned int hi = 
static_cast<unsigned int>(subsequence >> 32);
 
  219         unsigned int temp = m_state.counter.z;
 
  220         m_state.counter.z += lo;
 
  221         m_state.counter.w += hi + (m_state.counter.z < temp ? 1 : 0);
 
  226     __forceinline__ __device__ __host__ 
void discard_state(
unsigned long long offset)
 
  228         unsigned int lo = 
static_cast<unsigned int>(offset);
 
  229         unsigned int hi = 
static_cast<unsigned int>(offset >> 32);
 
  231         uint4 temp = m_state.counter;
 
  232         m_state.counter.x += lo;
 
  233         m_state.counter.y += hi + (m_state.counter.x < temp.x ? 1 : 0);
 
  234         m_state.counter.z += (m_state.counter.y < temp.y ? 1 : 0);
 
  235         m_state.counter.w += (m_state.counter.z < temp.z ? 1 : 0);
 
  240     __forceinline__ __device__ __host__ 
void discard_state()
 
  242         m_state.counter = this->bump_counter(m_state.counter);
 
  245     __forceinline__ __device__ __host__ 
static uint4 bump_counter(uint4 counter)
 
  248         unsigned int add      = counter.x == 0 ? 1 : 0;
 
  249         counter.y += add; add = counter.y == 0 ? add : 0;
 
  250         counter.z += add; add = counter.z == 0 ? add : 0;
 
  255     __forceinline__ __device__ __host__ uint4 interleave(
const uint4 prev, 
const uint4 next)
 const 
  257         switch(m_state.substate)
 
  262                 return uint4{ prev.y, prev.z, prev.w, next.x };
 
  264                 return uint4{ prev.z, prev.w, next.x, next.y };
 
  266                 return uint4{ prev.w, next.x, next.y, next.z };
 
  268         __builtin_unreachable();
 
  272     __forceinline__ __device__ __host__ uint4 ten_rounds(uint4 counter, uint2 key)
 
  274         counter = this->single_round(counter, key); key = this->bumpkey(key); 
 
  275         counter = this->single_round(counter, key); key = this->bumpkey(key); 
 
  276         counter = this->single_round(counter, key); key = this->bumpkey(key); 
 
  277         counter = this->single_round(counter, key); key = this->bumpkey(key); 
 
  278         counter = this->single_round(counter, key); key = this->bumpkey(key); 
 
  279         counter = this->single_round(counter, key); key = this->bumpkey(key); 
 
  280         counter = this->single_round(counter, key); key = this->bumpkey(key); 
 
  281         counter = this->single_round(counter, key); key = this->bumpkey(key); 
 
  282         counter = this->single_round(counter, key); key = this->bumpkey(key); 
 
  283         return this->single_round(counter, key);                        
 
  288     __forceinline__ __device__ __host__ 
static uint4 single_round(uint4 counter, uint2 key)
 
  291         unsigned long long mul0 = detail::mul_u64_u32(ROCRAND_PHILOX_M4x32_0, counter.x);
 
  292         unsigned int       hi0  = 
static_cast<unsigned int>(mul0 >> 32);
 
  293         unsigned int       lo0  = 
static_cast<unsigned int>(mul0);
 
  294         unsigned long long mul1 = detail::mul_u64_u32(ROCRAND_PHILOX_M4x32_1, counter.z);
 
  295         unsigned int       hi1  = 
static_cast<unsigned int>(mul1 >> 32);
 
  296         unsigned int       lo1  = 
static_cast<unsigned int>(mul1);
 
  297         return uint4{hi1 ^ counter.y ^ key.x, lo1, hi0 ^ counter.w ^ key.y, lo0};
 
  300     __forceinline__ __device__ __host__ 
static uint2 bumpkey(uint2 key)
 
  302         key.x += ROCRAND_PHILOX_W32_0;
 
  303         key.y += ROCRAND_PHILOX_W32_1;
 
  309     philox4x32_10_state m_state;
 
  311     #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE 
  312     friend struct detail::engine_boxmuller_helper<philox4x32_10_engine>;
 
  325 typedef rocrand_device::philox4x32_10_engine rocrand_state_philox4x32_10;
 
  339 __forceinline__ __device__ __host__
 
  341                   const unsigned long long     subsequence,
 
  342                   const unsigned long long     offset,
 
  343                   rocrand_state_philox4x32_10* state)
 
  345     *state = rocrand_state_philox4x32_10(seed, subsequence, offset);
 
  360 __forceinline__ __device__ __host__
 
  361 unsigned int rocrand(rocrand_state_philox4x32_10* state)
 
  363     return state->next();
 
  378 __forceinline__ __device__ __host__
 
  381     return state->next4();
 
  392 __forceinline__ __device__ __host__
 
  393 void skipahead(
unsigned long long offset, rocrand_state_philox4x32_10* state)
 
  395     return state->discard(offset);
 
  407 __forceinline__ __device__ __host__
 
  410     return state->discard_subsequence(subsequence);
 
  422 __forceinline__ __device__ __host__
 
  425     return state->discard_subsequence(sequence);
 
__forceinline__ __device__ __host__ void skipahead(unsigned long long offset, rocrand_state_philox4x32_10 *state)
Updates Philox state to skip ahead by offset elements.
Definition: rocrand_philox4x32_10.h:393
 
__forceinline__ __device__ __host__ void rocrand_init(const unsigned long long seed, const unsigned long long subsequence, const unsigned long long offset, rocrand_state_philox4x32_10 *state)
Initializes Philox state.
Definition: rocrand_philox4x32_10.h:340
 
#define ROCRAND_PHILOX4x32_DEFAULT_SEED
Default seed for PHILOX4x32 PRNG.
Definition: rocrand_philox4x32_10.h:75
 
__forceinline__ __device__ __host__ uint4 rocrand4(rocrand_state_philox4x32_10 *state)
Returns four uniformly distributed random unsigned int values from [0; 2^32 - 1] range.
Definition: rocrand_philox4x32_10.h:379
 
__forceinline__ __device__ __host__ void skipahead_sequence(unsigned long long sequence, rocrand_state_philox4x32_10 *state)
Updates Philox state to skip ahead by sequence sequences.
Definition: rocrand_philox4x32_10.h:423
 
__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_philox4x32_10 *state)
Returns uniformly distributed random unsigned int value from [0; 2^32 - 1] range.
Definition: rocrand_philox4x32_10.h:361
 
__forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long long subsequence, rocrand_state_philox4x32_10 *state)
Updates Philox state to skip ahead by subsequence subsequences.
Definition: rocrand_philox4x32_10.h:408