21 #ifndef ROCRAND_MRG32K3A_H_
22 #define ROCRAND_MRG32K3A_H_
24 #include "rocrand/rocrand_common.h"
25 #include "rocrand/rocrand_mrg32k3a_precomputed.h"
27 #define ROCRAND_MRG32K3A_POW32 4294967296
28 #define ROCRAND_MRG32K3A_M1 4294967087
29 #define ROCRAND_MRG32K3A_M1C 209
30 #define ROCRAND_MRG32K3A_M2 4294944443
31 #define ROCRAND_MRG32K3A_M2C 22853
32 #define ROCRAND_MRG32K3A_A12 1403580
33 #define ROCRAND_MRG32K3A_A13 (4294967087 - 810728)
34 #define ROCRAND_MRG32K3A_A13N 810728
35 #define ROCRAND_MRG32K3A_A21 527612
36 #define ROCRAND_MRG32K3A_A23 (4294944443 - 1370589)
37 #define ROCRAND_MRG32K3A_A23N 1370589
38 #define ROCRAND_MRG32K3A_NORM_DOUBLE (2.3283065498378288e-10)
39 #define ROCRAND_MRG32K3A_UINT_NORM (1.000000048661607)
49 #define ROCRAND_MRG32K3A_DEFAULT_SEED 12345ULL
52 namespace rocrand_device {
62 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
68 unsigned int boxmuller_float_state;
69 unsigned int boxmuller_double_state;
70 float boxmuller_float;
71 double boxmuller_double;
75 __forceinline__ __device__ __host__ mrg32k3a_engine()
88 __forceinline__ __device__ __host__ mrg32k3a_engine(
const unsigned long long seed,
89 const unsigned long long subsequence,
90 const unsigned long long offset)
92 this->seed(seed, subsequence, offset);
103 __forceinline__ __device__ __host__
void seed(
unsigned long long seed_value,
104 const unsigned long long subsequence,
105 const unsigned long long offset)
111 unsigned int x = (
unsigned int) seed_value ^ 0x55555555U;
112 unsigned int y = (
unsigned int) ((seed_value >> 32) ^ 0xAAAAAAAAU);
113 m_state.g1[0] = mod_mul_m1(x, seed_value);
114 m_state.g1[1] = mod_mul_m1(y, seed_value);
115 m_state.g1[2] = mod_mul_m1(x, seed_value);
116 m_state.g2[0] = mod_mul_m2(y, seed_value);
117 m_state.g2[1] = mod_mul_m2(x, seed_value);
118 m_state.g2[2] = mod_mul_m2(y, seed_value);
119 this->restart(subsequence, offset);
123 __forceinline__ __device__ __host__
void discard(
unsigned long long offset)
125 this->discard_impl(offset);
130 __forceinline__ __device__ __host__
void discard_subsequence(
unsigned long long subsequence)
132 this->discard_subsequence_impl(subsequence);
137 __forceinline__ __device__ __host__
void discard_sequence(
unsigned long long sequence)
139 this->discard_sequence_impl(sequence);
142 __forceinline__ __device__ __host__
void restart(
const unsigned long long subsequence,
143 const unsigned long long offset)
145 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
146 m_state.boxmuller_float_state = 0;
147 m_state.boxmuller_double_state = 0;
149 this->discard_subsequence_impl(subsequence);
150 this->discard_impl(offset);
153 __forceinline__ __device__ __host__
unsigned int operator()()
160 __forceinline__ __device__ __host__
unsigned int next()
162 const unsigned int p1 = mod_m1(
164 ROCRAND_MRG32K3A_A12,
167 ROCRAND_MRG32K3A_A13N,
168 (ROCRAND_MRG32K3A_M1 - m_state.g1[0]),
174 m_state.g1[0] = m_state.g1[1]; m_state.g1[1] = m_state.g1[2];
177 const unsigned int p2 = mod_m2(
179 ROCRAND_MRG32K3A_A21,
182 ROCRAND_MRG32K3A_A23N,
183 (ROCRAND_MRG32K3A_M2 - m_state.g2[0]),
189 m_state.g2[0] = m_state.g2[1]; m_state.g2[1] = m_state.g2[2];
192 return (p1 - p2) + (p1 <= p2 ? ROCRAND_MRG32K3A_M1 : 0);
198 __forceinline__ __device__ __host__
void discard_impl(
unsigned long long offset)
200 discard_state(offset);
204 __forceinline__ __device__ __host__
void
205 discard_subsequence_impl(
unsigned long long subsequence)
209 while(subsequence > 0) {
210 if (subsequence & 1) {
211 #if defined(__HIP_DEVICE_COMPILE__)
212 mod_mat_vec_m1(d_A1P76 + i, m_state.g1);
213 mod_mat_vec_m2(d_A2P76 + i, m_state.g2);
215 mod_mat_vec_m1(h_A1P76 + i, m_state.g1);
216 mod_mat_vec_m2(h_A2P76 + i, m_state.g2);
225 __forceinline__ __device__ __host__
void discard_sequence_impl(
unsigned long long sequence)
229 while(sequence > 0) {
231 #if defined(__HIP_DEVICE_COMPILE__)
232 mod_mat_vec_m1(d_A1P127 + i, m_state.g1);
233 mod_mat_vec_m2(d_A2P127 + i, m_state.g2);
235 mod_mat_vec_m1(h_A1P127 + i, m_state.g1);
236 mod_mat_vec_m2(h_A2P127 + i, m_state.g2);
246 __forceinline__ __device__ __host__
void discard_state(
unsigned long long offset)
252 #if defined(__HIP_DEVICE_COMPILE__)
253 mod_mat_vec_m1(d_A1 + i, m_state.g1);
254 mod_mat_vec_m2(d_A2 + i, m_state.g2);
256 mod_mat_vec_m1(h_A1 + i, m_state.g1);
257 mod_mat_vec_m2(h_A2 + i, m_state.g2);
267 __forceinline__ __device__ __host__
void discard_state()
273 __forceinline__ __device__ __host__
static void mod_mat_vec_m1(
const unsigned long long* A,
276 unsigned long long x[3];
278 x[0] = mod_m1(mod_m1(A[0] * s[0])
279 + mod_m1(A[1] * s[1])
280 + mod_m1(A[2] * s[2]));
282 x[1] = mod_m1(mod_m1(A[3] * s[0])
283 + mod_m1(A[4] * s[1])
284 + mod_m1(A[5] * s[2]));
286 x[2] = mod_m1(mod_m1(A[6] * s[0])
287 + mod_m1(A[7] * s[1])
288 + mod_m1(A[8] * s[2]));
295 __forceinline__ __device__ __host__
static void mod_mat_vec_m2(
const unsigned long long* A,
298 unsigned long long x[3];
300 x[0] = mod_m2(mod_m2(A[0] * s[0])
301 + mod_m2(A[1] * s[1])
302 + mod_m2(A[2] * s[2]));
304 x[1] = mod_m2(mod_m2(A[3] * s[0])
305 + mod_m2(A[4] * s[1])
306 + mod_m2(A[5] * s[2]));
308 x[2] = mod_m2(mod_m2(A[6] * s[0])
309 + mod_m2(A[7] * s[1])
310 + mod_m2(A[8] * s[2]));
317 __forceinline__ __device__ __host__
static unsigned long long mod_mul_m1(
unsigned int i,
318 unsigned long long j)
320 long long hi, lo, temp1, temp2;
323 lo = i - (hi * 131072);
324 temp1 = mod_m1(hi * j) * 131072;
325 temp2 = mod_m1(lo * j);
326 lo = mod_m1(temp1 + temp2);
329 lo += ROCRAND_MRG32K3A_M1;
333 __forceinline__ __device__ __host__
static unsigned long long mod_m1(
unsigned long long p)
335 p = detail::mad_u64_u32(ROCRAND_MRG32K3A_M1C, (p >> 32), p & (ROCRAND_MRG32K3A_POW32 - 1));
336 if (p >= ROCRAND_MRG32K3A_M1)
337 p -= ROCRAND_MRG32K3A_M1;
342 __forceinline__ __device__ __host__
static unsigned long long mod_mul_m2(
unsigned int i,
343 unsigned long long j)
345 long long hi, lo, temp1, temp2;
348 lo = i - (hi * 131072);
349 temp1 = mod_m2(hi * j) * 131072;
350 temp2 = mod_m2(lo * j);
351 lo = mod_m2(temp1 + temp2);
354 lo += ROCRAND_MRG32K3A_M2;
358 __forceinline__ __device__ __host__
static unsigned long long mod_m2(
unsigned long long p)
360 p = detail::mad_u64_u32(ROCRAND_MRG32K3A_M2C, (p >> 32), p & (ROCRAND_MRG32K3A_POW32 - 1));
361 p = detail::mad_u64_u32(ROCRAND_MRG32K3A_M2C, (p >> 32), p & (ROCRAND_MRG32K3A_POW32 - 1));
362 if (p >= ROCRAND_MRG32K3A_M2)
363 p -= ROCRAND_MRG32K3A_M2;
370 mrg32k3a_state m_state;
372 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
373 friend struct detail::engine_boxmuller_helper<mrg32k3a_engine>;
386 typedef rocrand_device::mrg32k3a_engine rocrand_state_mrg32k3a;
400 __forceinline__ __device__ __host__
void rocrand_init(
const unsigned long long seed,
401 const unsigned long long subsequence,
402 const unsigned long long offset,
403 rocrand_state_mrg32k3a* state)
405 *state = rocrand_state_mrg32k3a(seed, subsequence, offset);
420 __forceinline__ __device__ __host__
unsigned int rocrand(rocrand_state_mrg32k3a* state)
423 return static_cast<unsigned int>((state->next() - 1) * ROCRAND_MRG32K3A_UINT_NORM);
434 __forceinline__ __device__ __host__
void skipahead(
unsigned long long offset,
435 rocrand_state_mrg32k3a* state)
437 return state->discard(offset);
450 rocrand_state_mrg32k3a* state)
452 return state->discard_subsequence(subsequence);
465 rocrand_state_mrg32k3a* state)
467 return state->discard_sequence(sequence);
#define ROCRAND_MRG32K3A_DEFAULT_SEED
Default seed for MRG32K3A PRNG.
Definition: rocrand_mrg32k3a.h:49
__forceinline__ __device__ __host__ void rocrand_init(const unsigned long long seed, const unsigned long long subsequence, const unsigned long long offset, rocrand_state_mrg32k3a *state)
Initializes MRG32K3A state.
Definition: rocrand_mrg32k3a.h:400
__forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long long subsequence, rocrand_state_mrg32k3a *state)
Updates MRG32K3A state to skip ahead by subsequence subsequences.
Definition: rocrand_mrg32k3a.h:449
__forceinline__ __device__ __host__ void skipahead(unsigned long long offset, rocrand_state_mrg32k3a *state)
Updates MRG32K3A state to skip ahead by offset elements.
Definition: rocrand_mrg32k3a.h:434
__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_mrg32k3a *state)
Returns uniformly distributed random unsigned int value from [0; 2^32 - 1] range.
Definition: rocrand_mrg32k3a.h:420
__forceinline__ __device__ __host__ void skipahead_sequence(unsigned long long sequence, rocrand_state_mrg32k3a *state)
Updates MRG32K3A state to skip ahead by sequence sequences.
Definition: rocrand_mrg32k3a.h:464