21 #ifndef ROCRAND_MRG32K3A_H_ 
   22 #define ROCRAND_MRG32K3A_H_ 
   24 #include "rocrand/rocrand_common.h" 
   25 #include "rocrand/rocrand_mrg32k3a_precomputed.h" 
   27 #include <hip/hip_runtime.h> 
   29 #define ROCRAND_MRG32K3A_POW32 4294967296U 
   30 #define ROCRAND_MRG32K3A_M1 4294967087U 
   31 #define ROCRAND_MRG32K3A_M1C 209U 
   32 #define ROCRAND_MRG32K3A_M2 4294944443U 
   33 #define ROCRAND_MRG32K3A_M2C 22853U 
   34 #define ROCRAND_MRG32K3A_A12 1403580U 
   35 #define ROCRAND_MRG32K3A_A13 (4294967087U - 810728U) 
   36 #define ROCRAND_MRG32K3A_A13N 810728U 
   37 #define ROCRAND_MRG32K3A_A21 527612U 
   38 #define ROCRAND_MRG32K3A_A23 (4294944443U - 1370589U) 
   39 #define ROCRAND_MRG32K3A_A23N 1370589U 
   40 #define ROCRAND_MRG32K3A_NORM_DOUBLE (2.3283065498378288e-10)  
   41 #define ROCRAND_MRG32K3A_UINT_NORM \ 
   52  #define ROCRAND_MRG32K3A_DEFAULT_SEED 12345ULL  
   55 namespace rocrand_device {
 
   65     #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE 
   71         unsigned int boxmuller_float_state; 
 
   72         unsigned int boxmuller_double_state; 
 
   73         float boxmuller_float; 
 
   74         double boxmuller_double; 
 
   78     __forceinline__ __device__ __host__ mrg32k3a_engine()
 
   91     __forceinline__ __device__ __host__ mrg32k3a_engine(
const unsigned long long seed,
 
   92                                                         const unsigned long long subsequence,
 
   93                                                         const unsigned long long offset)
 
   95         this->seed(seed, subsequence, offset);
 
  106     __forceinline__ __device__ __host__ 
void seed(
unsigned long long       seed_value,
 
  107                                                   const unsigned long long subsequence,
 
  108                                                   const unsigned long long offset)
 
  114         unsigned int x = (
unsigned int) seed_value ^ 0x55555555U;
 
  115         unsigned int y = (
unsigned int) ((seed_value >> 32) ^ 0xAAAAAAAAU);
 
  116         m_state.g1[0] = mod_mul_m1(x, seed_value);
 
  117         m_state.g1[1] = mod_mul_m1(y, seed_value);
 
  118         m_state.g1[2] = mod_mul_m1(x, seed_value);
 
  119         m_state.g2[0] = mod_mul_m2(y, seed_value);
 
  120         m_state.g2[1] = mod_mul_m2(x, seed_value);
 
  121         m_state.g2[2] = mod_mul_m2(y, seed_value);
 
  122         this->restart(subsequence, offset);
 
  126     __forceinline__ __device__ __host__ 
void discard(
unsigned long long offset)
 
  128         this->discard_impl(offset);
 
  133     __forceinline__ __device__ __host__ 
void discard_subsequence(
unsigned long long subsequence)
 
  135         this->discard_subsequence_impl(subsequence);
 
  140     __forceinline__ __device__ __host__ 
void discard_sequence(
unsigned long long sequence)
 
  142         this->discard_sequence_impl(sequence);
 
  145     __forceinline__ __device__ __host__ 
void restart(
const unsigned long long subsequence,
 
  146                                                      const unsigned long long offset)
 
  148     #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE 
  149         m_state.boxmuller_float_state = 0;
 
  150         m_state.boxmuller_double_state = 0;
 
  152         this->discard_subsequence_impl(subsequence);
 
  153         this->discard_impl(offset);
 
  156     __forceinline__ __device__ __host__ 
unsigned int operator()()
 
  163     __forceinline__ __device__ __host__
 
  166         const unsigned int p1 = mod_m1(detail::mad_u64_u32(
 
  167             ROCRAND_MRG32K3A_A12,
 
  169             detail::mul_u64_u32(ROCRAND_MRG32K3A_A13N, (ROCRAND_MRG32K3A_M1 - m_state.g1[0]))));
 
  171         m_state.g1[0] = m_state.g1[1];
 
  172         m_state.g1[1] = m_state.g1[2];
 
  175         const unsigned int p2 = mod_m2(detail::mad_u64_u32(
 
  176             ROCRAND_MRG32K3A_A21,
 
  178             detail::mul_u64_u32(ROCRAND_MRG32K3A_A23N, (ROCRAND_MRG32K3A_M2 - m_state.g2[0]))));
 
  180         m_state.g2[0] = m_state.g2[1];
 
  181         m_state.g2[1] = m_state.g2[2];
 
  184         return (p1 - p2) + (p1 <= p2 ? ROCRAND_MRG32K3A_M1 : 0);
 
  190     __forceinline__ __device__ __host__ 
void discard_impl(
unsigned long long offset)
 
  192         discard_state(offset);
 
  196     __forceinline__ __device__ __host__ 
void 
  197         discard_subsequence_impl(
unsigned long long subsequence)
 
  201         while(subsequence > 0) {
 
  202             if (subsequence & 1) {
 
  203                 #if defined(__HIP_DEVICE_COMPILE__) 
  204                 mod_mat_vec_m1(d_A1P76 + i, m_state.g1);
 
  205                 mod_mat_vec_m2(d_A2P76 + i, m_state.g2);
 
  207                 mod_mat_vec_m1(h_A1P76 + i, m_state.g1);
 
  208                 mod_mat_vec_m2(h_A2P76 + i, m_state.g2);
 
  217     __forceinline__ __device__ __host__ 
void discard_sequence_impl(
unsigned long long sequence)
 
  221         while(sequence > 0) {
 
  223                 #if defined(__HIP_DEVICE_COMPILE__) 
  224                 mod_mat_vec_m1(d_A1P127 + i, m_state.g1);
 
  225                 mod_mat_vec_m2(d_A2P127 + i, m_state.g2);
 
  227                 mod_mat_vec_m1(h_A1P127 + i, m_state.g1);
 
  228                 mod_mat_vec_m2(h_A2P127 + i, m_state.g2);
 
  238     __forceinline__ __device__ __host__ 
void discard_state(
unsigned long long offset)
 
  244                 #if defined(__HIP_DEVICE_COMPILE__) 
  245                 mod_mat_vec_m1(d_A1 + i, m_state.g1);
 
  246                 mod_mat_vec_m2(d_A2 + i, m_state.g2);
 
  248                 mod_mat_vec_m1(h_A1 + i, m_state.g1);
 
  249                 mod_mat_vec_m2(h_A2 + i, m_state.g2);
 
  259     __forceinline__ __device__ __host__ 
void discard_state()
 
  265     __forceinline__ __device__ __host__
 
  266     static void mod_mat_vec_m1(
const unsigned int* A, 
unsigned int* s)
 
  268         unsigned long long x[3] = {s[0], s[1], s[2]};
 
  270         s[0] = mod_m1(mod_m1(A[0] * x[0]) + mod_m1(A[1] * x[1]) + mod_m1(A[2] * x[2]));
 
  272         s[1] = mod_m1(mod_m1(A[3] * x[0]) + mod_m1(A[4] * x[1]) + mod_m1(A[5] * x[2]));
 
  274         s[2] = mod_m1(mod_m1(A[6] * x[0]) + mod_m1(A[7] * x[1]) + mod_m1(A[8] * x[2]));
 
  277     __forceinline__ __device__ __host__
 
  278     static void mod_mat_vec_m2(
const unsigned int* A, 
unsigned int* s)
 
  280         unsigned long long x[3] = {s[0], s[1], s[2]};
 
  282         s[0] = mod_m2(mod_m2(A[0] * x[0]) + mod_m2(A[1] * x[1]) + mod_m2(A[2] * x[2]));
 
  284         s[1] = mod_m2(mod_m2(A[3] * x[0]) + mod_m2(A[4] * x[1]) + mod_m2(A[5] * x[2]));
 
  286         s[2] = mod_m2(mod_m2(A[6] * x[0]) + mod_m2(A[7] * x[1]) + mod_m2(A[8] * x[2]));
 
  289     __forceinline__ __device__ __host__ 
static unsigned long long mod_mul_m1(
unsigned int       i,
 
  290                                                                              unsigned long long j)
 
  292         long long hi, lo, temp1, temp2;
 
  295         lo = i - (hi * 131072);
 
  296         temp1 = mod_m1(hi * j) * 131072;
 
  297         temp2 = mod_m1(lo * j);
 
  298         lo = mod_m1(temp1 + temp2);
 
  301             lo += ROCRAND_MRG32K3A_M1;
 
  305     __forceinline__ __device__ __host__
 
  306     static unsigned long long mod_m1(
unsigned long long p)
 
  308         p = detail::mad_u64_u32(ROCRAND_MRG32K3A_M1C,
 
  309                                 static_cast<unsigned int>(p >> 32),
 
  310                                 static_cast<unsigned int>(p));
 
  311         if(p >= ROCRAND_MRG32K3A_M1)
 
  312             p -= ROCRAND_MRG32K3A_M1;
 
  317     __forceinline__ __device__ __host__
 
  318     static unsigned long long mod_mul_m2(
unsigned int i, 
unsigned long long j)
 
  320         long long hi, lo, temp1, temp2;
 
  323         lo = i - (hi * 131072);
 
  324         temp1 = mod_m2(hi * j) * 131072;
 
  325         temp2 = mod_m2(lo * j);
 
  326         lo = mod_m2(temp1 + temp2);
 
  329             lo += ROCRAND_MRG32K3A_M2;
 
  333     __forceinline__ __device__ __host__
 
  334     static unsigned long long mod_m2(
unsigned long long p)
 
  336         p = detail::mad_u64_u32(ROCRAND_MRG32K3A_M2C,
 
  337                                 static_cast<unsigned int>(p >> 32),
 
  338                                 static_cast<unsigned int>(p));
 
  339         p = detail::mad_u64_u32(ROCRAND_MRG32K3A_M2C,
 
  340                                 static_cast<unsigned int>(p >> 32),
 
  341                                 static_cast<unsigned int>(p));
 
  342         if(p >= ROCRAND_MRG32K3A_M2)
 
  343             p -= ROCRAND_MRG32K3A_M2;
 
  350     mrg32k3a_state m_state;
 
  352     #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE 
  353     friend struct detail::engine_boxmuller_helper<mrg32k3a_engine>;
 
  366 typedef rocrand_device::mrg32k3a_engine rocrand_state_mrg32k3a;
 
  380 __forceinline__ __device__ __host__
 
  382                   const unsigned long long subsequence,
 
  383                   const unsigned long long offset,
 
  384                   rocrand_state_mrg32k3a*  state)
 
  386     *state = rocrand_state_mrg32k3a(seed, subsequence, offset);
 
  401 __forceinline__ __device__ __host__
 
  402 unsigned int rocrand(rocrand_state_mrg32k3a* state)
 
  405     return static_cast<unsigned int>((state->next() - 1) * ROCRAND_MRG32K3A_UINT_NORM);
 
  416 __forceinline__ __device__ __host__
 
  417 void skipahead(
unsigned long long offset, rocrand_state_mrg32k3a* state)
 
  419     return state->discard(offset);
 
  431 __forceinline__ __device__ __host__
 
  434     return state->discard_subsequence(subsequence);
 
  446 __forceinline__ __device__ __host__
 
  449     return state->discard_sequence(sequence);
 
#define ROCRAND_MRG32K3A_DEFAULT_SEED
Default seed for MRG32K3A PRNG.
Definition: rocrand_mrg32k3a.h:52
 
__forceinline__ __device__ __host__ void rocrand_init(const unsigned long long seed, const unsigned long long subsequence, const unsigned long long offset, rocrand_state_mrg32k3a *state)
Initializes MRG32K3A state.
Definition: rocrand_mrg32k3a.h:381
 
__forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long long subsequence, rocrand_state_mrg32k3a *state)
Updates MRG32K3A state to skip ahead by subsequence subsequences.
Definition: rocrand_mrg32k3a.h:432
 
__forceinline__ __device__ __host__ void skipahead(unsigned long long offset, rocrand_state_mrg32k3a *state)
Updates MRG32K3A state to skip ahead by offset elements.
Definition: rocrand_mrg32k3a.h:417
 
__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_mrg32k3a *state)
Returns uniformly distributed random unsigned int value from [0; 2^32 - 1] range.
Definition: rocrand_mrg32k3a.h:402
 
__forceinline__ __device__ __host__ void skipahead_sequence(unsigned long long sequence, rocrand_state_mrg32k3a *state)
Updates MRG32K3A state to skip ahead by sequence sequences.
Definition: rocrand_mrg32k3a.h:447