53 #ifndef ROCRAND_PHILOX4X32_10_H_
54 #define ROCRAND_PHILOX4X32_10_H_
56 #include "rocrand/rocrand_common.h"
60 #define ROCRAND_PHILOX_M4x32_0 0xD2511F53U
61 #define ROCRAND_PHILOX_M4x32_1 0xCD9E8D57U
62 #define ROCRAND_PHILOX_W32_0 0x9E3779B9U
63 #define ROCRAND_PHILOX_W32_1 0xBB67AE85U
73 #define ROCRAND_PHILOX4x32_DEFAULT_SEED 0xdeadbeefdeadbeefULL
76 namespace rocrand_device {
79 __forceinline__ __device__ __host__
unsigned int
80 mulhilo32(
unsigned int x,
unsigned int y,
unsigned int& z)
82 unsigned long long xy = mad_u64_u32(x, y, 0);
83 z =
static_cast<unsigned int>(xy >> 32);
84 return static_cast<unsigned int>(xy);
89 class philox4x32_10_engine
92 struct philox4x32_10_state
97 unsigned int substate;
99 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
105 unsigned int boxmuller_float_state;
106 unsigned int boxmuller_double_state;
107 float boxmuller_float;
108 double boxmuller_double;
112 __forceinline__ __device__ __host__ philox4x32_10_engine()
122 __forceinline__ __device__ __host__ philox4x32_10_engine(
const unsigned long long seed,
123 const unsigned long long subsequence,
124 const unsigned long long offset)
126 this->seed(seed, subsequence, offset);
134 __forceinline__ __device__ __host__
void seed(
unsigned long long seed_value,
135 const unsigned long long subsequence,
136 const unsigned long long offset)
138 m_state.key.x =
static_cast<unsigned int>(seed_value);
139 m_state.key.y =
static_cast<unsigned int>(seed_value >> 32);
140 this->restart(subsequence, offset);
144 __forceinline__ __device__ __host__
void discard(
unsigned long long offset)
146 this->discard_impl(offset);
147 this->m_state.result = this->ten_rounds(m_state.counter, m_state.key);
154 __forceinline__ __device__ __host__
void discard_subsequence(
unsigned long long subsequence)
156 this->discard_subsequence_impl(subsequence);
157 m_state.result = this->ten_rounds(m_state.counter, m_state.key);
160 __forceinline__ __device__ __host__
void restart(
const unsigned long long subsequence,
161 const unsigned long long offset)
163 m_state.counter = {0, 0, 0, 0};
164 m_state.result = {0, 0, 0, 0};
165 m_state.substate = 0;
166 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
167 m_state.boxmuller_float_state = 0;
168 m_state.boxmuller_double_state = 0;
170 this->discard_subsequence_impl(subsequence);
171 this->discard_impl(offset);
172 m_state.result = this->ten_rounds(m_state.counter, m_state.key);
175 __forceinline__ __device__ __host__
unsigned int operator()()
180 __forceinline__ __device__ __host__
unsigned int next()
182 #if defined(__HIP_PLATFORM_AMD__)
183 unsigned int ret = m_state.result.data[m_state.substate];
185 unsigned int ret = (&m_state.result.x)[m_state.substate];
188 if(m_state.substate == 4)
190 m_state.substate = 0;
191 this->discard_state();
192 m_state.result = this->ten_rounds(m_state.counter, m_state.key);
197 __forceinline__ __device__ __host__ uint4 next4()
199 uint4 ret = m_state.result;
200 this->discard_state();
201 m_state.result = this->ten_rounds(m_state.counter, m_state.key);
202 return this->interleave(ret, m_state.result);
208 __forceinline__ __device__ __host__
void discard_impl(
unsigned long long offset)
211 m_state.substate += offset & 3;
212 unsigned long long counter_offset = offset / 4;
213 counter_offset += m_state.substate < 4 ? 0 : 1;
214 m_state.substate += m_state.substate < 4 ? 0 : -4;
216 this->discard_state(counter_offset);
220 __forceinline__ __device__ __host__
void
221 discard_subsequence_impl(
unsigned long long subsequence)
223 unsigned int lo =
static_cast<unsigned int>(subsequence);
224 unsigned int hi =
static_cast<unsigned int>(subsequence >> 32);
226 unsigned int temp = m_state.counter.z;
227 m_state.counter.z += lo;
228 m_state.counter.w += hi + (m_state.counter.z < temp ? 1 : 0);
233 __forceinline__ __device__ __host__
void discard_state(
unsigned long long offset)
235 unsigned int lo =
static_cast<unsigned int>(offset);
236 unsigned int hi =
static_cast<unsigned int>(offset >> 32);
238 uint4 temp = m_state.counter;
239 m_state.counter.x += lo;
240 m_state.counter.y += hi + (m_state.counter.x < temp.x ? 1 : 0);
241 m_state.counter.z += (m_state.counter.y < temp.y ? 1 : 0);
242 m_state.counter.w += (m_state.counter.z < temp.z ? 1 : 0);
247 __forceinline__ __device__ __host__
void discard_state()
249 m_state.counter = this->bump_counter(m_state.counter);
252 __forceinline__ __device__ __host__
static uint4 bump_counter(uint4 counter)
255 unsigned int add = counter.x == 0 ? 1 : 0;
256 counter.y += add; add = counter.y == 0 ? add : 0;
257 counter.z += add; add = counter.z == 0 ? add : 0;
262 __forceinline__ __device__ __host__ uint4 interleave(
const uint4 prev,
const uint4 next)
const
264 switch(m_state.substate)
269 return uint4{ prev.y, prev.z, prev.w, next.x };
271 return uint4{ prev.z, prev.w, next.x, next.y };
273 return uint4{ prev.w, next.x, next.y, next.z };
275 __builtin_unreachable();
279 __forceinline__ __device__ __host__ uint4 ten_rounds(uint4 counter, uint2 key)
281 counter = this->single_round(counter, key); key = this->bumpkey(key);
282 counter = this->single_round(counter, key); key = this->bumpkey(key);
283 counter = this->single_round(counter, key); key = this->bumpkey(key);
284 counter = this->single_round(counter, key); key = this->bumpkey(key);
285 counter = this->single_round(counter, key); key = this->bumpkey(key);
286 counter = this->single_round(counter, key); key = this->bumpkey(key);
287 counter = this->single_round(counter, key); key = this->bumpkey(key);
288 counter = this->single_round(counter, key); key = this->bumpkey(key);
289 counter = this->single_round(counter, key); key = this->bumpkey(key);
290 return this->single_round(counter, key);
295 __forceinline__ __device__ __host__
static uint4 single_round(uint4 counter, uint2 key)
300 unsigned int lo0 = detail::mulhilo32(ROCRAND_PHILOX_M4x32_0, counter.x, hi0);
301 unsigned int lo1 = detail::mulhilo32(ROCRAND_PHILOX_M4x32_1, counter.z, hi1);
303 hi1 ^ counter.y ^ key.x,
305 hi0 ^ counter.w ^ key.y,
310 __forceinline__ __device__ __host__
static uint2 bumpkey(uint2 key)
312 key.x += ROCRAND_PHILOX_W32_0;
313 key.y += ROCRAND_PHILOX_W32_1;
319 philox4x32_10_state m_state;
321 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
322 friend struct detail::engine_boxmuller_helper<philox4x32_10_engine>;
335 typedef rocrand_device::philox4x32_10_engine rocrand_state_philox4x32_10;
349 __forceinline__ __device__ __host__
void rocrand_init(
const unsigned long long seed,
350 const unsigned long long subsequence,
351 const unsigned long long offset,
352 rocrand_state_philox4x32_10* state)
354 *state = rocrand_state_philox4x32_10(seed, subsequence, offset);
369 __forceinline__ __device__ __host__
unsigned int rocrand(rocrand_state_philox4x32_10* state)
371 return state->next();
386 __forceinline__ __device__ __host__ uint4
rocrand4(rocrand_state_philox4x32_10* state)
388 return state->next4();
399 __forceinline__ __device__ __host__
void skipahead(
unsigned long long offset,
400 rocrand_state_philox4x32_10* state)
402 return state->discard(offset);
415 rocrand_state_philox4x32_10* state)
417 return state->discard_subsequence(subsequence);
430 rocrand_state_philox4x32_10* state)
432 return state->discard_subsequence(sequence);
__forceinline__ __device__ __host__ void skipahead(unsigned long long offset, rocrand_state_philox4x32_10 *state)
Updates Philox state to skip ahead by offset elements.
Definition: rocrand_philox4x32_10.h:399
__forceinline__ __device__ __host__ void rocrand_init(const unsigned long long seed, const unsigned long long subsequence, const unsigned long long offset, rocrand_state_philox4x32_10 *state)
Initializes Philox state.
Definition: rocrand_philox4x32_10.h:349
#define ROCRAND_PHILOX4x32_DEFAULT_SEED
Default seed for PHILOX4x32 PRNG.
Definition: rocrand_philox4x32_10.h:73
__forceinline__ __device__ __host__ uint4 rocrand4(rocrand_state_philox4x32_10 *state)
Returns four uniformly distributed random unsigned int values from [0; 2^32 - 1] range.
Definition: rocrand_philox4x32_10.h:386
__forceinline__ __device__ __host__ void skipahead_sequence(unsigned long long sequence, rocrand_state_philox4x32_10 *state)
Updates Philox state to skip ahead by sequence sequences.
Definition: rocrand_philox4x32_10.h:429
__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_philox4x32_10 *state)
Returns uniformly distributed random unsigned int value from [0; 2^32 - 1] range.
Definition: rocrand_philox4x32_10.h:369
__forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long long subsequence, rocrand_state_philox4x32_10 *state)
Updates Philox state to skip ahead by subsequence subsequences.
Definition: rocrand_philox4x32_10.h:414