53 #ifndef ROCRAND_PHILOX4X32_10_H_ 
   54 #define ROCRAND_PHILOX4X32_10_H_ 
   56 #include "rocrand/rocrand_common.h" 
   60 #define ROCRAND_PHILOX_M4x32_0 0xD2511F53U 
   61 #define ROCRAND_PHILOX_M4x32_1 0xCD9E8D57U 
   62 #define ROCRAND_PHILOX_W32_0   0x9E3779B9U 
   63 #define ROCRAND_PHILOX_W32_1   0xBB67AE85U 
   73 #define ROCRAND_PHILOX4x32_DEFAULT_SEED 0xdeadbeefdeadbeefULL  
   76 namespace rocrand_device {
 
   79 __forceinline__ __device__ __host__ 
unsigned int 
   80     mulhilo32(
unsigned int x, 
unsigned int y, 
unsigned int& z)
 
   82     unsigned long long xy = mad_u64_u32(x, y, 0);
 
   83     z = 
static_cast<unsigned int>(xy >> 32);
 
   84     return static_cast<unsigned int>(xy);
 
   89 class philox4x32_10_engine
 
   92     struct philox4x32_10_state
 
   97         unsigned int substate;
 
   99     #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE 
  105         unsigned int boxmuller_float_state; 
 
  106         unsigned int boxmuller_double_state; 
 
  107         float boxmuller_float; 
 
  108         double boxmuller_double; 
 
  112     __forceinline__ __device__ __host__ philox4x32_10_engine()
 
  122     __forceinline__ __device__ __host__ philox4x32_10_engine(
const unsigned long long seed,
 
  123                                                              const unsigned long long subsequence,
 
  124                                                              const unsigned long long offset)
 
  126         this->seed(seed, subsequence, offset);
 
  134     __forceinline__ __device__ __host__ 
void seed(
unsigned long long       seed_value,
 
  135                                                   const unsigned long long subsequence,
 
  136                                                   const unsigned long long offset)
 
  138         m_state.key.x = 
static_cast<unsigned int>(seed_value);
 
  139         m_state.key.y = 
static_cast<unsigned int>(seed_value >> 32);
 
  140         this->restart(subsequence, offset);
 
  144     __forceinline__ __device__ __host__ 
void discard(
unsigned long long offset)
 
  146         this->discard_impl(offset);
 
  147         this->m_state.result = this->ten_rounds(m_state.counter, m_state.key);
 
  154     __forceinline__ __device__ __host__ 
void discard_subsequence(
unsigned long long subsequence)
 
  156         this->discard_subsequence_impl(subsequence);
 
  157         m_state.result = this->ten_rounds(m_state.counter, m_state.key);
 
  160     __forceinline__ __device__ __host__ 
void restart(
const unsigned long long subsequence,
 
  161                                                      const unsigned long long offset)
 
  163         m_state.counter = {0, 0, 0, 0};
 
  164         m_state.result  = {0, 0, 0, 0};
 
  165         m_state.substate = 0;
 
  166     #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE 
  167         m_state.boxmuller_float_state = 0;
 
  168         m_state.boxmuller_double_state = 0;
 
  170         this->discard_subsequence_impl(subsequence);
 
  171         this->discard_impl(offset);
 
  172         m_state.result = this->ten_rounds(m_state.counter, m_state.key);
 
  175     __forceinline__ __device__ __host__ 
unsigned int operator()()
 
  180     __forceinline__ __device__ __host__ 
unsigned int next()
 
  182     #if defined(__HIP_PLATFORM_AMD__) 
  183         unsigned int ret = m_state.result.data[m_state.substate];
 
  185         unsigned int ret = (&m_state.result.x)[m_state.substate];
 
  188         if(m_state.substate == 4)
 
  190             m_state.substate = 0;
 
  191             this->discard_state();
 
  192             m_state.result = this->ten_rounds(m_state.counter, m_state.key);
 
  197     __forceinline__ __device__ __host__ uint4 next4()
 
  199         uint4 ret = m_state.result;
 
  200         this->discard_state();
 
  201         m_state.result = this->ten_rounds(m_state.counter, m_state.key);
 
  202         return this->interleave(ret, m_state.result);
 
  208     __forceinline__ __device__ __host__ 
void discard_impl(
unsigned long long offset)
 
  211         m_state.substate += offset & 3;
 
  212         unsigned long long counter_offset = offset / 4;
 
  213         counter_offset += m_state.substate < 4 ? 0 : 1;
 
  214         m_state.substate += m_state.substate < 4 ? 0 : -4;
 
  216         this->discard_state(counter_offset);
 
  220     __forceinline__ __device__ __host__ 
void 
  221         discard_subsequence_impl(
unsigned long long subsequence)
 
  223         unsigned int lo = 
static_cast<unsigned int>(subsequence);
 
  224         unsigned int hi = 
static_cast<unsigned int>(subsequence >> 32);
 
  226         unsigned int temp = m_state.counter.z;
 
  227         m_state.counter.z += lo;
 
  228         m_state.counter.w += hi + (m_state.counter.z < temp ? 1 : 0);
 
  233     __forceinline__ __device__ __host__ 
void discard_state(
unsigned long long offset)
 
  235         unsigned int lo = 
static_cast<unsigned int>(offset);
 
  236         unsigned int hi = 
static_cast<unsigned int>(offset >> 32);
 
  238         uint4 temp = m_state.counter;
 
  239         m_state.counter.x += lo;
 
  240         m_state.counter.y += hi + (m_state.counter.x < temp.x ? 1 : 0);
 
  241         m_state.counter.z += (m_state.counter.y < temp.y ? 1 : 0);
 
  242         m_state.counter.w += (m_state.counter.z < temp.z ? 1 : 0);
 
  247     __forceinline__ __device__ __host__ 
void discard_state()
 
  249         m_state.counter = this->bump_counter(m_state.counter);
 
  252     __forceinline__ __device__ __host__ 
static uint4 bump_counter(uint4 counter)
 
  255         unsigned int add      = counter.x == 0 ? 1 : 0;
 
  256         counter.y += add; add = counter.y == 0 ? add : 0;
 
  257         counter.z += add; add = counter.z == 0 ? add : 0;
 
  262     __forceinline__ __device__ __host__ uint4 interleave(
const uint4 prev, 
const uint4 next)
 const 
  264         switch(m_state.substate)
 
  269                 return uint4{ prev.y, prev.z, prev.w, next.x };
 
  271                 return uint4{ prev.z, prev.w, next.x, next.y };
 
  273                 return uint4{ prev.w, next.x, next.y, next.z };
 
  275         __builtin_unreachable();
 
  279     __forceinline__ __device__ __host__ uint4 ten_rounds(uint4 counter, uint2 key)
 
  281         counter = this->single_round(counter, key); key = this->bumpkey(key); 
 
  282         counter = this->single_round(counter, key); key = this->bumpkey(key); 
 
  283         counter = this->single_round(counter, key); key = this->bumpkey(key); 
 
  284         counter = this->single_round(counter, key); key = this->bumpkey(key); 
 
  285         counter = this->single_round(counter, key); key = this->bumpkey(key); 
 
  286         counter = this->single_round(counter, key); key = this->bumpkey(key); 
 
  287         counter = this->single_round(counter, key); key = this->bumpkey(key); 
 
  288         counter = this->single_round(counter, key); key = this->bumpkey(key); 
 
  289         counter = this->single_round(counter, key); key = this->bumpkey(key); 
 
  290         return this->single_round(counter, key);                        
 
  295     __forceinline__ __device__ __host__ 
static uint4 single_round(uint4 counter, uint2 key)
 
  300         unsigned int lo0 = detail::mulhilo32(ROCRAND_PHILOX_M4x32_0, counter.x, hi0);
 
  301         unsigned int lo1 = detail::mulhilo32(ROCRAND_PHILOX_M4x32_1, counter.z, hi1);
 
  303             hi1 ^ counter.y ^ key.x,
 
  305             hi0 ^ counter.w ^ key.y,
 
  310     __forceinline__ __device__ __host__ 
static uint2 bumpkey(uint2 key)
 
  312         key.x += ROCRAND_PHILOX_W32_0;
 
  313         key.y += ROCRAND_PHILOX_W32_1;
 
  319     philox4x32_10_state m_state;
 
  321     #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE 
  322     friend struct detail::engine_boxmuller_helper<philox4x32_10_engine>;
 
  335 typedef rocrand_device::philox4x32_10_engine rocrand_state_philox4x32_10;
 
  349 __forceinline__ __device__ __host__ 
void rocrand_init(
const unsigned long long     seed,
 
  350                                                       const unsigned long long     subsequence,
 
  351                                                       const unsigned long long     offset,
 
  352                                                       rocrand_state_philox4x32_10* state)
 
  354     *state = rocrand_state_philox4x32_10(seed, subsequence, offset);
 
  369 __forceinline__ __device__ __host__ 
unsigned int rocrand(rocrand_state_philox4x32_10* state)
 
  371     return state->next();
 
  386 __forceinline__ __device__ __host__ uint4 
rocrand4(rocrand_state_philox4x32_10* state)
 
  388     return state->next4();
 
  399 __forceinline__ __device__ __host__ 
void skipahead(
unsigned long long           offset,
 
  400                                                    rocrand_state_philox4x32_10* state)
 
  402     return state->discard(offset);
 
  415                                                                rocrand_state_philox4x32_10* state)
 
  417     return state->discard_subsequence(subsequence);
 
  430                                                             rocrand_state_philox4x32_10* state)
 
  432     return state->discard_subsequence(sequence);
 
__forceinline__ __device__ __host__ void skipahead(unsigned long long offset, rocrand_state_philox4x32_10 *state)
Updates Philox state to skip ahead by offset elements.
Definition: rocrand_philox4x32_10.h:399
 
__forceinline__ __device__ __host__ void rocrand_init(const unsigned long long seed, const unsigned long long subsequence, const unsigned long long offset, rocrand_state_philox4x32_10 *state)
Initializes Philox state.
Definition: rocrand_philox4x32_10.h:349
 
#define ROCRAND_PHILOX4x32_DEFAULT_SEED
Default seed for PHILOX4x32 PRNG.
Definition: rocrand_philox4x32_10.h:73
 
__forceinline__ __device__ __host__ uint4 rocrand4(rocrand_state_philox4x32_10 *state)
Returns four uniformly distributed random unsigned int values from [0; 2^32 - 1] range.
Definition: rocrand_philox4x32_10.h:386
 
__forceinline__ __device__ __host__ void skipahead_sequence(unsigned long long sequence, rocrand_state_philox4x32_10 *state)
Updates Philox state to skip ahead by sequence sequences.
Definition: rocrand_philox4x32_10.h:429
 
__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_philox4x32_10 *state)
Returns uniformly distributed random unsigned int value from [0; 2^32 - 1] range.
Definition: rocrand_philox4x32_10.h:369
 
__forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long long subsequence, rocrand_state_philox4x32_10 *state)
Updates Philox state to skip ahead by subsequence subsequences.
Definition: rocrand_philox4x32_10.h:414