21 #ifndef ROCRAND_MRG32K3A_H_ 
   22 #define ROCRAND_MRG32K3A_H_ 
   24 #include "rocrand/rocrand_common.h" 
   25 #include "rocrand/rocrand_mrg32k3a_precomputed.h" 
   27 #define ROCRAND_MRG32K3A_POW32 4294967296 
   28 #define ROCRAND_MRG32K3A_M1 4294967087 
   29 #define ROCRAND_MRG32K3A_M1C 209 
   30 #define ROCRAND_MRG32K3A_M2 4294944443 
   31 #define ROCRAND_MRG32K3A_M2C 22853 
   32 #define ROCRAND_MRG32K3A_A12 1403580 
   33 #define ROCRAND_MRG32K3A_A13 (4294967087 -  810728) 
   34 #define ROCRAND_MRG32K3A_A13N 810728 
   35 #define ROCRAND_MRG32K3A_A21 527612 
   36 #define ROCRAND_MRG32K3A_A23 (4294944443 - 1370589) 
   37 #define ROCRAND_MRG32K3A_A23N 1370589 
   38 #define ROCRAND_MRG32K3A_NORM_DOUBLE (2.3283065498378288e-10)  
   39 #define ROCRAND_MRG32K3A_UINT_NORM (1.000000048661607)  
   49  #define ROCRAND_MRG32K3A_DEFAULT_SEED 12345ULL  
   52 namespace rocrand_device {
 
   62     #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE 
   68         unsigned int boxmuller_float_state; 
 
   69         unsigned int boxmuller_double_state; 
 
   70         float boxmuller_float; 
 
   71         double boxmuller_double; 
 
   75     __forceinline__ __device__ __host__ mrg32k3a_engine()
 
   88     __forceinline__ __device__ __host__ mrg32k3a_engine(
const unsigned long long seed,
 
   89                                                         const unsigned long long subsequence,
 
   90                                                         const unsigned long long offset)
 
   92         this->seed(seed, subsequence, offset);
 
  103     __forceinline__ __device__ __host__ 
void seed(
unsigned long long       seed_value,
 
  104                                                   const unsigned long long subsequence,
 
  105                                                   const unsigned long long offset)
 
  111         unsigned int x = (
unsigned int) seed_value ^ 0x55555555U;
 
  112         unsigned int y = (
unsigned int) ((seed_value >> 32) ^ 0xAAAAAAAAU);
 
  113         m_state.g1[0] = mod_mul_m1(x, seed_value);
 
  114         m_state.g1[1] = mod_mul_m1(y, seed_value);
 
  115         m_state.g1[2] = mod_mul_m1(x, seed_value);
 
  116         m_state.g2[0] = mod_mul_m2(y, seed_value);
 
  117         m_state.g2[1] = mod_mul_m2(x, seed_value);
 
  118         m_state.g2[2] = mod_mul_m2(y, seed_value);
 
  119         this->restart(subsequence, offset);
 
  123     __forceinline__ __device__ __host__ 
void discard(
unsigned long long offset)
 
  125         this->discard_impl(offset);
 
  130     __forceinline__ __device__ __host__ 
void discard_subsequence(
unsigned long long subsequence)
 
  132         this->discard_subsequence_impl(subsequence);
 
  137     __forceinline__ __device__ __host__ 
void discard_sequence(
unsigned long long sequence)
 
  139         this->discard_sequence_impl(sequence);
 
  142     __forceinline__ __device__ __host__ 
void restart(
const unsigned long long subsequence,
 
  143                                                      const unsigned long long offset)
 
  145     #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE 
  146         m_state.boxmuller_float_state = 0;
 
  147         m_state.boxmuller_double_state = 0;
 
  149         this->discard_subsequence_impl(subsequence);
 
  150         this->discard_impl(offset);
 
  153     __forceinline__ __device__ __host__ 
unsigned int operator()()
 
  160     __forceinline__ __device__ __host__ 
unsigned int next()
 
  162         const unsigned int p1 = mod_m1(
 
  164                 ROCRAND_MRG32K3A_A12,
 
  167                     ROCRAND_MRG32K3A_A13N,
 
  168                     (ROCRAND_MRG32K3A_M1 - m_state.g1[0]),
 
  174         m_state.g1[0] = m_state.g1[1]; m_state.g1[1] = m_state.g1[2];
 
  177         const unsigned int p2 = mod_m2(
 
  179                 ROCRAND_MRG32K3A_A21,
 
  182                     ROCRAND_MRG32K3A_A23N,
 
  183                     (ROCRAND_MRG32K3A_M2 - m_state.g2[0]),
 
  189         m_state.g2[0] = m_state.g2[1]; m_state.g2[1] = m_state.g2[2];
 
  192         return (p1 - p2) + (p1 <= p2 ? ROCRAND_MRG32K3A_M1 : 0);
 
  198     __forceinline__ __device__ __host__ 
void discard_impl(
unsigned long long offset)
 
  200         discard_state(offset);
 
  204     __forceinline__ __device__ __host__ 
void 
  205         discard_subsequence_impl(
unsigned long long subsequence)
 
  209         while(subsequence > 0) {
 
  210             if (subsequence & 1) {
 
  211                 #if defined(__HIP_DEVICE_COMPILE__) 
  212                 mod_mat_vec_m1(d_A1P76 + i, m_state.g1);
 
  213                 mod_mat_vec_m2(d_A2P76 + i, m_state.g2);
 
  215                 mod_mat_vec_m1(h_A1P76 + i, m_state.g1);
 
  216                 mod_mat_vec_m2(h_A2P76 + i, m_state.g2);
 
  225     __forceinline__ __device__ __host__ 
void discard_sequence_impl(
unsigned long long sequence)
 
  229         while(sequence > 0) {
 
  231                 #if defined(__HIP_DEVICE_COMPILE__) 
  232                 mod_mat_vec_m1(d_A1P127 + i, m_state.g1);
 
  233                 mod_mat_vec_m2(d_A2P127 + i, m_state.g2);
 
  235                 mod_mat_vec_m1(h_A1P127 + i, m_state.g1);
 
  236                 mod_mat_vec_m2(h_A2P127 + i, m_state.g2);
 
  246     __forceinline__ __device__ __host__ 
void discard_state(
unsigned long long offset)
 
  252                 #if defined(__HIP_DEVICE_COMPILE__) 
  253                 mod_mat_vec_m1(d_A1 + i, m_state.g1);
 
  254                 mod_mat_vec_m2(d_A2 + i, m_state.g2);
 
  256                 mod_mat_vec_m1(h_A1 + i, m_state.g1);
 
  257                 mod_mat_vec_m2(h_A2 + i, m_state.g2);
 
  267     __forceinline__ __device__ __host__ 
void discard_state()
 
  273     __forceinline__ __device__ __host__ 
static void mod_mat_vec_m1(
const unsigned long long* A,
 
  276         unsigned long long x[3];
 
  278         x[0] = mod_m1(mod_m1(A[0] * s[0])
 
  279                     + mod_m1(A[1] * s[1])
 
  280                     + mod_m1(A[2] * s[2]));
 
  282         x[1] = mod_m1(mod_m1(A[3] * s[0])
 
  283                     + mod_m1(A[4] * s[1])
 
  284                     + mod_m1(A[5] * s[2]));
 
  286         x[2] = mod_m1(mod_m1(A[6] * s[0])
 
  287                     + mod_m1(A[7] * s[1])
 
  288                     + mod_m1(A[8] * s[2]));
 
  295     __forceinline__ __device__ __host__ 
static void mod_mat_vec_m2(
const unsigned long long* A,
 
  298         unsigned long long x[3];
 
  300         x[0] = mod_m2(mod_m2(A[0] * s[0])
 
  301                     + mod_m2(A[1] * s[1])
 
  302                     + mod_m2(A[2] * s[2]));
 
  304         x[1] = mod_m2(mod_m2(A[3] * s[0])
 
  305                     + mod_m2(A[4] * s[1])
 
  306                     + mod_m2(A[5] * s[2]));
 
  308         x[2] = mod_m2(mod_m2(A[6] * s[0])
 
  309                     + mod_m2(A[7] * s[1])
 
  310                     + mod_m2(A[8] * s[2]));
 
  317     __forceinline__ __device__ __host__ 
static unsigned long long mod_mul_m1(
unsigned int       i,
 
  318                                                                              unsigned long long j)
 
  320         long long hi, lo, temp1, temp2;
 
  323         lo = i - (hi * 131072);
 
  324         temp1 = mod_m1(hi * j) * 131072;
 
  325         temp2 = mod_m1(lo * j);
 
  326         lo = mod_m1(temp1 + temp2);
 
  329             lo += ROCRAND_MRG32K3A_M1;
 
  333     __forceinline__ __device__ __host__ 
static unsigned long long mod_m1(
unsigned long long p)
 
  335         p = detail::mad_u64_u32(ROCRAND_MRG32K3A_M1C, (p >> 32), p & (ROCRAND_MRG32K3A_POW32 - 1));
 
  336         if (p >= ROCRAND_MRG32K3A_M1)
 
  337             p -= ROCRAND_MRG32K3A_M1;
 
  342     __forceinline__ __device__ __host__ 
static unsigned long long mod_mul_m2(
unsigned int       i,
 
  343                                                                              unsigned long long j)
 
  345         long long hi, lo, temp1, temp2;
 
  348         lo = i - (hi * 131072);
 
  349         temp1 = mod_m2(hi * j) * 131072;
 
  350         temp2 = mod_m2(lo * j);
 
  351         lo = mod_m2(temp1 + temp2);
 
  354             lo += ROCRAND_MRG32K3A_M2;
 
  358     __forceinline__ __device__ __host__ 
static unsigned long long mod_m2(
unsigned long long p)
 
  360         p = detail::mad_u64_u32(ROCRAND_MRG32K3A_M2C, (p >> 32), p & (ROCRAND_MRG32K3A_POW32 - 1));
 
  361         p = detail::mad_u64_u32(ROCRAND_MRG32K3A_M2C, (p >> 32), p & (ROCRAND_MRG32K3A_POW32 - 1));
 
  362         if (p >= ROCRAND_MRG32K3A_M2)
 
  363             p -= ROCRAND_MRG32K3A_M2;
 
  370     mrg32k3a_state m_state;
 
  372     #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE 
  373     friend struct detail::engine_boxmuller_helper<mrg32k3a_engine>;
 
  386 typedef rocrand_device::mrg32k3a_engine rocrand_state_mrg32k3a;
 
  400 __forceinline__ __device__ __host__ 
void rocrand_init(
const unsigned long long seed,
 
  401                                                       const unsigned long long subsequence,
 
  402                                                       const unsigned long long offset,
 
  403                                                       rocrand_state_mrg32k3a*  state)
 
  405     *state = rocrand_state_mrg32k3a(seed, subsequence, offset);
 
  420 __forceinline__ __device__ __host__ 
unsigned int rocrand(rocrand_state_mrg32k3a* state)
 
  423     return static_cast<unsigned int>((state->next() - 1) * ROCRAND_MRG32K3A_UINT_NORM);
 
  434 __forceinline__ __device__ __host__ 
void skipahead(
unsigned long long      offset,
 
  435                                                    rocrand_state_mrg32k3a* state)
 
  437     return state->discard(offset);
 
  450                                                                rocrand_state_mrg32k3a* state)
 
  452     return state->discard_subsequence(subsequence);
 
  465                                                             rocrand_state_mrg32k3a* state)
 
  467     return state->discard_sequence(sequence);
 
#define ROCRAND_MRG32K3A_DEFAULT_SEED
Default seed for MRG32K3A PRNG.
Definition: rocrand_mrg32k3a.h:49
 
__forceinline__ __device__ __host__ void rocrand_init(const unsigned long long seed, const unsigned long long subsequence, const unsigned long long offset, rocrand_state_mrg32k3a *state)
Initializes MRG32K3A state.
Definition: rocrand_mrg32k3a.h:400
 
__forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long long subsequence, rocrand_state_mrg32k3a *state)
Updates MRG32K3A state to skip ahead by subsequence subsequences.
Definition: rocrand_mrg32k3a.h:449
 
__forceinline__ __device__ __host__ void skipahead(unsigned long long offset, rocrand_state_mrg32k3a *state)
Updates MRG32K3A state to skip ahead by offset elements.
Definition: rocrand_mrg32k3a.h:434
 
__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_mrg32k3a *state)
Returns uniformly distributed random unsigned int value from [0; 2^32 - 1] range.
Definition: rocrand_mrg32k3a.h:420
 
__forceinline__ __device__ __host__ void skipahead_sequence(unsigned long long sequence, rocrand_state_mrg32k3a *state)
Updates MRG32K3A state to skip ahead by sequence sequences.
Definition: rocrand_mrg32k3a.h:464