/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-rocrand/checkouts/latest/library/include/rocrand/rocrand_mrg32k3a.h Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-rocrand/checkouts/latest/library/include/rocrand/rocrand_mrg32k3a.h Source File#

API library: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-rocrand/checkouts/latest/library/include/rocrand/rocrand_mrg32k3a.h Source File
rocrand_mrg32k3a.h
1 // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 #ifndef ROCRAND_MRG32K3A_H_
22 #define ROCRAND_MRG32K3A_H_
23 
24 #include "rocrand/rocrand_common.h"
25 #include "rocrand/rocrand_mrg32k3a_precomputed.h"
26 
27 #define ROCRAND_MRG32K3A_POW32 4294967296
28 #define ROCRAND_MRG32K3A_M1 4294967087
29 #define ROCRAND_MRG32K3A_M1C 209
30 #define ROCRAND_MRG32K3A_M2 4294944443
31 #define ROCRAND_MRG32K3A_M2C 22853
32 #define ROCRAND_MRG32K3A_A12 1403580
33 #define ROCRAND_MRG32K3A_A13 (4294967087 - 810728)
34 #define ROCRAND_MRG32K3A_A13N 810728
35 #define ROCRAND_MRG32K3A_A21 527612
36 #define ROCRAND_MRG32K3A_A23 (4294944443 - 1370589)
37 #define ROCRAND_MRG32K3A_A23N 1370589
38 #define ROCRAND_MRG32K3A_NORM_DOUBLE (2.3283065498378288e-10) // 1/ROCRAND_MRG32K3A_M1
39 #define ROCRAND_MRG32K3A_UINT_NORM (1.000000048661607) // (ROCRAND_MRG32K3A_POW32 - 1)/(ROCRAND_MRG32K3A_M1 - 1)
40 
49  #define ROCRAND_MRG32K3A_DEFAULT_SEED 12345ULL // end of group rocranddevice
51 
52 namespace rocrand_device {
53 
54 class mrg32k3a_engine
55 {
56 public:
57  struct mrg32k3a_state
58  {
59  unsigned int g1[3];
60  unsigned int g2[3];
61 
62  #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
63  // The Box–Muller transform requires two inputs to convert uniformly
64  // distributed real values [0; 1] to normally distributed real values
65  // (with mean = 0, and stddev = 1). Often user wants only one
66  // normally distributed number, to save performance and random
67  // numbers the 2nd value is saved for future requests.
68  unsigned int boxmuller_float_state; // is there a float in boxmuller_float
69  unsigned int boxmuller_double_state; // is there a double in boxmuller_double
70  float boxmuller_float; // normally distributed float
71  double boxmuller_double; // normally distributed double
72  #endif
73  };
74 
75  __forceinline__ __device__ __host__ mrg32k3a_engine()
76  {
77  this->seed(ROCRAND_MRG32K3A_DEFAULT_SEED, 0, 0);
78  }
79 
88  __forceinline__ __device__ __host__ mrg32k3a_engine(const unsigned long long seed,
89  const unsigned long long subsequence,
90  const unsigned long long offset)
91  {
92  this->seed(seed, subsequence, offset);
93  }
94 
103  __forceinline__ __device__ __host__ void seed(unsigned long long seed_value,
104  const unsigned long long subsequence,
105  const unsigned long long offset)
106  {
107  if(seed_value == 0)
108  {
109  seed_value = ROCRAND_MRG32K3A_DEFAULT_SEED;
110  }
111  unsigned int x = (unsigned int) seed_value ^ 0x55555555U;
112  unsigned int y = (unsigned int) ((seed_value >> 32) ^ 0xAAAAAAAAU);
113  m_state.g1[0] = mod_mul_m1(x, seed_value);
114  m_state.g1[1] = mod_mul_m1(y, seed_value);
115  m_state.g1[2] = mod_mul_m1(x, seed_value);
116  m_state.g2[0] = mod_mul_m2(y, seed_value);
117  m_state.g2[1] = mod_mul_m2(x, seed_value);
118  m_state.g2[2] = mod_mul_m2(y, seed_value);
119  this->restart(subsequence, offset);
120  }
121 
123  __forceinline__ __device__ __host__ void discard(unsigned long long offset)
124  {
125  this->discard_impl(offset);
126  }
127 
130  __forceinline__ __device__ __host__ void discard_subsequence(unsigned long long subsequence)
131  {
132  this->discard_subsequence_impl(subsequence);
133  }
134 
137  __forceinline__ __device__ __host__ void discard_sequence(unsigned long long sequence)
138  {
139  this->discard_sequence_impl(sequence);
140  }
141 
142  __forceinline__ __device__ __host__ void restart(const unsigned long long subsequence,
143  const unsigned long long offset)
144  {
145  #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
146  m_state.boxmuller_float_state = 0;
147  m_state.boxmuller_double_state = 0;
148  #endif
149  this->discard_subsequence_impl(subsequence);
150  this->discard_impl(offset);
151  }
152 
153  __forceinline__ __device__ __host__ unsigned int operator()()
154  {
155  return this->next();
156  }
157 
158  // Returned value is in range [1, ROCRAND_MRG32K3A_M1],
159  // where ROCRAND_MRG32K3A_M1 < UINT_MAX
160  __forceinline__ __device__ __host__ unsigned int next()
161  {
162  const unsigned int p1 = mod_m1(
163  detail::mad_u64_u32(
164  ROCRAND_MRG32K3A_A12,
165  m_state.g1[1],
166  detail::mad_u64_u32(
167  ROCRAND_MRG32K3A_A13N,
168  (ROCRAND_MRG32K3A_M1 - m_state.g1[0]),
169  0
170  )
171  )
172  );
173 
174  m_state.g1[0] = m_state.g1[1]; m_state.g1[1] = m_state.g1[2];
175  m_state.g1[2] = p1;
176 
177  const unsigned int p2 = mod_m2(
178  detail::mad_u64_u32(
179  ROCRAND_MRG32K3A_A21,
180  m_state.g2[2],
181  detail::mad_u64_u32(
182  ROCRAND_MRG32K3A_A23N,
183  (ROCRAND_MRG32K3A_M2 - m_state.g2[0]),
184  0
185  )
186  )
187  );
188 
189  m_state.g2[0] = m_state.g2[1]; m_state.g2[1] = m_state.g2[2];
190  m_state.g2[2] = p2;
191 
192  return (p1 - p2) + (p1 <= p2 ? ROCRAND_MRG32K3A_M1 : 0);
193  }
194 
195 protected:
196  // Advances the internal state to skip \p offset numbers.
197  // DOES NOT CALCULATE NEW ULONGLONG
198  __forceinline__ __device__ __host__ void discard_impl(unsigned long long offset)
199  {
200  discard_state(offset);
201  }
202 
203  // DOES NOT CALCULATE NEW ULONGLONG
204  __forceinline__ __device__ __host__ void
205  discard_subsequence_impl(unsigned long long subsequence)
206  {
207  int i = 0;
208 
209  while(subsequence > 0) {
210  if (subsequence & 1) {
211  #if defined(__HIP_DEVICE_COMPILE__)
212  mod_mat_vec_m1(d_A1P76 + i, m_state.g1);
213  mod_mat_vec_m2(d_A2P76 + i, m_state.g2);
214  #else
215  mod_mat_vec_m1(h_A1P76 + i, m_state.g1);
216  mod_mat_vec_m2(h_A2P76 + i, m_state.g2);
217  #endif
218  }
219  subsequence >>= 1;
220  i += 9;
221  }
222  }
223 
224  // DOES NOT CALCULATE NEW ULONGLONG
225  __forceinline__ __device__ __host__ void discard_sequence_impl(unsigned long long sequence)
226  {
227  int i = 0;
228 
229  while(sequence > 0) {
230  if (sequence & 1) {
231  #if defined(__HIP_DEVICE_COMPILE__)
232  mod_mat_vec_m1(d_A1P127 + i, m_state.g1);
233  mod_mat_vec_m2(d_A2P127 + i, m_state.g2);
234  #else
235  mod_mat_vec_m1(h_A1P127 + i, m_state.g1);
236  mod_mat_vec_m2(h_A2P127 + i, m_state.g2);
237  #endif
238  }
239  sequence >>= 1;
240  i += 9;
241  }
242  }
243 
244  // Advances the internal state by offset times.
245  // DOES NOT CALCULATE NEW ULONGLONG
246  __forceinline__ __device__ __host__ void discard_state(unsigned long long offset)
247  {
248  int i = 0;
249 
250  while(offset > 0) {
251  if (offset & 1) {
252  #if defined(__HIP_DEVICE_COMPILE__)
253  mod_mat_vec_m1(d_A1 + i, m_state.g1);
254  mod_mat_vec_m2(d_A2 + i, m_state.g2);
255  #else
256  mod_mat_vec_m1(h_A1 + i, m_state.g1);
257  mod_mat_vec_m2(h_A2 + i, m_state.g2);
258  #endif
259  }
260  offset >>= 1;
261  i += 9;
262  }
263  }
264 
265  // Advances the internal state to the next state
266  // DOES NOT CALCULATE NEW ULONGLONG
267  __forceinline__ __device__ __host__ void discard_state()
268  {
269  discard_state(1);
270  }
271 
272 private:
273  __forceinline__ __device__ __host__ static void mod_mat_vec_m1(const unsigned long long* A,
274  unsigned int* s)
275  {
276  unsigned long long x[3];
277 
278  x[0] = mod_m1(mod_m1(A[0] * s[0])
279  + mod_m1(A[1] * s[1])
280  + mod_m1(A[2] * s[2]));
281 
282  x[1] = mod_m1(mod_m1(A[3] * s[0])
283  + mod_m1(A[4] * s[1])
284  + mod_m1(A[5] * s[2]));
285 
286  x[2] = mod_m1(mod_m1(A[6] * s[0])
287  + mod_m1(A[7] * s[1])
288  + mod_m1(A[8] * s[2]));
289 
290  s[0] = x[0];
291  s[1] = x[1];
292  s[2] = x[2];
293  }
294 
295  __forceinline__ __device__ __host__ static void mod_mat_vec_m2(const unsigned long long* A,
296  unsigned int* s)
297  {
298  unsigned long long x[3];
299 
300  x[0] = mod_m2(mod_m2(A[0] * s[0])
301  + mod_m2(A[1] * s[1])
302  + mod_m2(A[2] * s[2]));
303 
304  x[1] = mod_m2(mod_m2(A[3] * s[0])
305  + mod_m2(A[4] * s[1])
306  + mod_m2(A[5] * s[2]));
307 
308  x[2] = mod_m2(mod_m2(A[6] * s[0])
309  + mod_m2(A[7] * s[1])
310  + mod_m2(A[8] * s[2]));
311 
312  s[0] = x[0];
313  s[1] = x[1];
314  s[2] = x[2];
315  }
316 
317  __forceinline__ __device__ __host__ static unsigned long long mod_mul_m1(unsigned int i,
318  unsigned long long j)
319  {
320  long long hi, lo, temp1, temp2;
321 
322  hi = i / 131072;
323  lo = i - (hi * 131072);
324  temp1 = mod_m1(hi * j) * 131072;
325  temp2 = mod_m1(lo * j);
326  lo = mod_m1(temp1 + temp2);
327 
328  if (lo < 0)
329  lo += ROCRAND_MRG32K3A_M1;
330  return lo;
331  }
332 
333  __forceinline__ __device__ __host__ static unsigned long long mod_m1(unsigned long long p)
334  {
335  p = detail::mad_u64_u32(ROCRAND_MRG32K3A_M1C, (p >> 32), p & (ROCRAND_MRG32K3A_POW32 - 1));
336  if (p >= ROCRAND_MRG32K3A_M1)
337  p -= ROCRAND_MRG32K3A_M1;
338 
339  return p;
340  }
341 
342  __forceinline__ __device__ __host__ static unsigned long long mod_mul_m2(unsigned int i,
343  unsigned long long j)
344  {
345  long long hi, lo, temp1, temp2;
346 
347  hi = i / 131072;
348  lo = i - (hi * 131072);
349  temp1 = mod_m2(hi * j) * 131072;
350  temp2 = mod_m2(lo * j);
351  lo = mod_m2(temp1 + temp2);
352 
353  if (lo < 0)
354  lo += ROCRAND_MRG32K3A_M2;
355  return lo;
356  }
357 
358  __forceinline__ __device__ __host__ static unsigned long long mod_m2(unsigned long long p)
359  {
360  p = detail::mad_u64_u32(ROCRAND_MRG32K3A_M2C, (p >> 32), p & (ROCRAND_MRG32K3A_POW32 - 1));
361  p = detail::mad_u64_u32(ROCRAND_MRG32K3A_M2C, (p >> 32), p & (ROCRAND_MRG32K3A_POW32 - 1));
362  if (p >= ROCRAND_MRG32K3A_M2)
363  p -= ROCRAND_MRG32K3A_M2;
364 
365  return p;
366  }
367 
368 protected:
369  // State
370  mrg32k3a_state m_state;
371 
372  #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
373  friend struct detail::engine_boxmuller_helper<mrg32k3a_engine>;
374  #endif
375 
376 }; // mrg32k3a_engine class
377 
378 } // end namespace rocrand_device
379 
386 typedef rocrand_device::mrg32k3a_engine rocrand_state_mrg32k3a;
388 
400 __forceinline__ __device__ __host__ void rocrand_init(const unsigned long long seed,
401  const unsigned long long subsequence,
402  const unsigned long long offset,
403  rocrand_state_mrg32k3a* state)
404 {
405  *state = rocrand_state_mrg32k3a(seed, subsequence, offset);
406 }
407 
420 __forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_mrg32k3a* state)
421 {
422  // next() in [1, ROCRAND_MRG32K3A_M1]
423  return static_cast<unsigned int>((state->next() - 1) * ROCRAND_MRG32K3A_UINT_NORM);
424 }
425 
434 __forceinline__ __device__ __host__ void skipahead(unsigned long long offset,
435  rocrand_state_mrg32k3a* state)
436 {
437  return state->discard(offset);
438 }
439 
449 __forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long long subsequence,
450  rocrand_state_mrg32k3a* state)
451 {
452  return state->discard_subsequence(subsequence);
453 }
454 
464 __forceinline__ __device__ __host__ void skipahead_sequence(unsigned long long sequence,
465  rocrand_state_mrg32k3a* state)
466 {
467  return state->discard_sequence(sequence);
468 }
469 
470 #endif // ROCRAND_MRG32K3A_H_
471  // end of group rocranddevice
#define ROCRAND_MRG32K3A_DEFAULT_SEED
Default seed for MRG32K3A PRNG.
Definition: rocrand_mrg32k3a.h:49
__forceinline__ __device__ __host__ void rocrand_init(const unsigned long long seed, const unsigned long long subsequence, const unsigned long long offset, rocrand_state_mrg32k3a *state)
Initializes MRG32K3A state.
Definition: rocrand_mrg32k3a.h:400
__forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long long subsequence, rocrand_state_mrg32k3a *state)
Updates MRG32K3A state to skip ahead by subsequence subsequences.
Definition: rocrand_mrg32k3a.h:449
__forceinline__ __device__ __host__ void skipahead(unsigned long long offset, rocrand_state_mrg32k3a *state)
Updates MRG32K3A state to skip ahead by offset elements.
Definition: rocrand_mrg32k3a.h:434
__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_mrg32k3a *state)
Returns uniformly distributed random unsigned int value from [0; 2^32 - 1] range.
Definition: rocrand_mrg32k3a.h:420
__forceinline__ __device__ __host__ void skipahead_sequence(unsigned long long sequence, rocrand_state_mrg32k3a *state)
Updates MRG32K3A state to skip ahead by sequence sequences.
Definition: rocrand_mrg32k3a.h:464