53 #ifndef ROCRAND_PHILOX4X32_10_H_
54 #define ROCRAND_PHILOX4X32_10_H_
56 #include "rocrand/rocrand_common.h"
58 #include <hip/hip_runtime.h>
62 #define ROCRAND_PHILOX_M4x32_0 0xD2511F53U
63 #define ROCRAND_PHILOX_M4x32_1 0xCD9E8D57U
64 #define ROCRAND_PHILOX_W32_0 0x9E3779B9U
65 #define ROCRAND_PHILOX_W32_1 0xBB67AE85U
75 #define ROCRAND_PHILOX4x32_DEFAULT_SEED 0xdeadbeefdeadbeefULL
78 namespace rocrand_device
81 class philox4x32_10_engine
84 struct philox4x32_10_state
89 unsigned int substate;
91 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
97 float boxmuller_float;
98 double boxmuller_double;
102 __forceinline__ __device__ __host__ philox4x32_10_engine()
112 __forceinline__ __device__ __host__ philox4x32_10_engine(
const unsigned long long seed,
113 const unsigned long long subsequence,
114 const unsigned long long offset)
116 this->seed(seed, subsequence, offset);
124 __forceinline__ __device__ __host__
void seed(
unsigned long long seed_value,
125 const unsigned long long subsequence,
126 const unsigned long long offset)
128 m_state.key.x =
static_cast<unsigned int>(seed_value);
129 m_state.key.y =
static_cast<unsigned int>(seed_value >> 32);
130 this->restart(subsequence, offset);
134 __forceinline__ __device__ __host__
void discard(
unsigned long long offset)
136 this->discard_impl(offset);
137 this->m_state.result = this->ten_rounds(m_state.counter, m_state.key);
144 __forceinline__ __device__ __host__
void discard_subsequence(
unsigned long long subsequence)
146 this->discard_subsequence_impl(subsequence);
147 m_state.result = this->ten_rounds(m_state.counter, m_state.key);
150 __forceinline__ __device__ __host__
void restart(
const unsigned long long subsequence,
151 const unsigned long long offset)
153 m_state.counter = {0, 0, 0, 0};
154 m_state.result = {0, 0, 0, 0};
155 m_state.substate = 0;
156 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
157 m_state.boxmuller_float = ROCRAND_NAN_FLOAT;
158 m_state.boxmuller_double = ROCRAND_NAN_DOUBLE;
160 this->discard_subsequence_impl(subsequence);
161 this->discard_impl(offset);
162 m_state.result = this->ten_rounds(m_state.counter, m_state.key);
165 __forceinline__ __device__ __host__
unsigned int operator()()
170 __forceinline__ __device__ __host__
unsigned int next()
172 #if defined(__HIP_PLATFORM_AMD__)
173 unsigned int ret = ROCRAND_HIPVEC_ACCESS(m_state.result)[m_state.substate];
175 unsigned int ret = (&m_state.result.x)[m_state.substate];
179 if(m_state.substate == 4)
181 m_state.substate = 0;
182 this->discard_state();
183 m_state.result = this->ten_rounds(m_state.counter, m_state.key);
188 __forceinline__ __device__ __host__ uint4 next4()
190 uint4 ret = m_state.result;
191 this->discard_state();
192 m_state.result = this->ten_rounds(m_state.counter, m_state.key);
193 return this->interleave(ret, m_state.result);
199 __forceinline__ __device__ __host__
void discard_impl(
unsigned long long offset)
202 m_state.substate += offset & 3;
203 unsigned long long counter_offset = offset / 4;
204 counter_offset += m_state.substate < 4 ? 0 : 1;
205 m_state.substate += m_state.substate < 4 ? 0 : -4;
207 this->discard_state(counter_offset);
211 __forceinline__ __device__ __host__
void
212 discard_subsequence_impl(
unsigned long long subsequence)
214 unsigned int lo =
static_cast<unsigned int>(subsequence);
215 unsigned int hi =
static_cast<unsigned int>(subsequence >> 32);
217 unsigned int temp = m_state.counter.z;
218 m_state.counter.z += lo;
219 m_state.counter.w += hi + (m_state.counter.z < temp ? 1 : 0);
224 __forceinline__ __device__ __host__
void discard_state(
unsigned long long offset)
226 unsigned int lo =
static_cast<unsigned int>(offset);
227 unsigned int hi =
static_cast<unsigned int>(offset >> 32);
229 uint4 temp = m_state.counter;
230 m_state.counter.x += lo;
231 m_state.counter.y += hi + (m_state.counter.x < temp.x ? 1 : 0);
232 m_state.counter.z += (m_state.counter.y < temp.y ? 1 : 0);
233 m_state.counter.w += (m_state.counter.z < temp.z ? 1 : 0);
238 __forceinline__ __device__ __host__
void discard_state()
240 m_state.counter = this->bump_counter(m_state.counter);
243 __forceinline__ __device__ __host__
static uint4 bump_counter(uint4 counter)
246 unsigned int add = counter.x == 0 ? 1 : 0;
247 counter.y += add; add = counter.y == 0 ? add : 0;
248 counter.z += add; add = counter.z == 0 ? add : 0;
253 __forceinline__ __device__ __host__ uint4 interleave(
const uint4 prev,
const uint4 next)
const
255 switch(m_state.substate)
260 return uint4{ prev.y, prev.z, prev.w, next.x };
262 return uint4{ prev.z, prev.w, next.x, next.y };
264 return uint4{ prev.w, next.x, next.y, next.z };
266 __builtin_unreachable();
270 __forceinline__ __device__ __host__ uint4 ten_rounds(uint4 counter, uint2 key)
272 counter = this->single_round(counter, key); key = this->bumpkey(key);
273 counter = this->single_round(counter, key); key = this->bumpkey(key);
274 counter = this->single_round(counter, key); key = this->bumpkey(key);
275 counter = this->single_round(counter, key); key = this->bumpkey(key);
276 counter = this->single_round(counter, key); key = this->bumpkey(key);
277 counter = this->single_round(counter, key); key = this->bumpkey(key);
278 counter = this->single_round(counter, key); key = this->bumpkey(key);
279 counter = this->single_round(counter, key); key = this->bumpkey(key);
280 counter = this->single_round(counter, key); key = this->bumpkey(key);
281 return this->single_round(counter, key);
286 __forceinline__ __device__ __host__
static uint4 single_round(uint4 counter, uint2 key)
289 unsigned long long mul0 = detail::mul_u64_u32(ROCRAND_PHILOX_M4x32_0, counter.x);
290 unsigned int hi0 =
static_cast<unsigned int>(mul0 >> 32);
291 unsigned int lo0 =
static_cast<unsigned int>(mul0);
292 unsigned long long mul1 = detail::mul_u64_u32(ROCRAND_PHILOX_M4x32_1, counter.z);
293 unsigned int hi1 =
static_cast<unsigned int>(mul1 >> 32);
294 unsigned int lo1 =
static_cast<unsigned int>(mul1);
295 return uint4{hi1 ^ counter.y ^ key.x, lo1, hi0 ^ counter.w ^ key.y, lo0};
298 __forceinline__ __device__ __host__
static uint2 bumpkey(uint2 key)
300 key.x += ROCRAND_PHILOX_W32_0;
301 key.y += ROCRAND_PHILOX_W32_1;
307 philox4x32_10_state m_state;
309 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
310 friend struct detail::engine_boxmuller_helper<philox4x32_10_engine>;
323 typedef rocrand_device::philox4x32_10_engine rocrand_state_philox4x32_10;
337 __forceinline__ __device__ __host__
339 const unsigned long long subsequence,
340 const unsigned long long offset,
341 rocrand_state_philox4x32_10* state)
343 *state = rocrand_state_philox4x32_10(seed, subsequence, offset);
358 __forceinline__ __device__ __host__
359 unsigned int rocrand(rocrand_state_philox4x32_10* state)
361 return state->next();
376 __forceinline__ __device__ __host__
379 return state->next4();
390 __forceinline__ __device__ __host__
391 void skipahead(
unsigned long long offset, rocrand_state_philox4x32_10* state)
393 return state->discard(offset);
405 __forceinline__ __device__ __host__
408 return state->discard_subsequence(subsequence);
420 __forceinline__ __device__ __host__
423 return state->discard_subsequence(sequence);
__forceinline__ __device__ __host__ void skipahead(unsigned long long offset, rocrand_state_philox4x32_10 *state)
Updates Philox state to skip ahead by offset elements.
Definition: rocrand_philox4x32_10.h:391
__forceinline__ __device__ __host__ void rocrand_init(const unsigned long long seed, const unsigned long long subsequence, const unsigned long long offset, rocrand_state_philox4x32_10 *state)
Initializes Philox state.
Definition: rocrand_philox4x32_10.h:338
#define ROCRAND_PHILOX4x32_DEFAULT_SEED
Default seed for PHILOX4x32 PRNG.
Definition: rocrand_philox4x32_10.h:75
__forceinline__ __device__ __host__ uint4 rocrand4(rocrand_state_philox4x32_10 *state)
Returns four uniformly distributed random unsigned int values from [0; 2^32 - 1] range.
Definition: rocrand_philox4x32_10.h:377
__forceinline__ __device__ __host__ void skipahead_sequence(unsigned long long sequence, rocrand_state_philox4x32_10 *state)
Updates Philox state to skip ahead by sequence sequences.
Definition: rocrand_philox4x32_10.h:421
__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_philox4x32_10 *state)
Returns uniformly distributed random unsigned int value from [0; 2^32 - 1] range.
Definition: rocrand_philox4x32_10.h:359
__forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long long subsequence, rocrand_state_philox4x32_10 *state)
Updates Philox state to skip ahead by subsequence subsequences.
Definition: rocrand_philox4x32_10.h:406