21 #ifndef ROCRAND_MRG31K3P_H_ 
   22 #define ROCRAND_MRG31K3P_H_ 
   24 #include "rocrand/rocrand_common.h" 
   25 #include "rocrand/rocrand_mrg31k3p_precomputed.h" 
   27 #include <hip/hip_runtime.h> 
   29 #define ROCRAND_MRG31K3P_M1 2147483647U  
   30 #define ROCRAND_MRG31K3P_M2 2147462579U  
   31 #define ROCRAND_MRG31K3P_MASK12 511U  
   32 #define ROCRAND_MRG31K3P_MASK13 16777215U  
   33 #define ROCRAND_MRG31K3P_MASK21 65535U  
   34 #define ROCRAND_MRG31K3P_NORM_DOUBLE (4.656612875245796923e-10)  
   35 #define ROCRAND_MRG31K3P_UINT32_NORM \ 
   36     (2.000000001396983862)  
   46 #define ROCRAND_MRG31K3P_DEFAULT_SEED 12345ULL  
   49 namespace rocrand_device
 
   60 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE 
   66         unsigned int boxmuller_float_state; 
 
   67         unsigned int boxmuller_double_state; 
 
   68         float        boxmuller_float; 
 
   69         double       boxmuller_double; 
 
   73     __forceinline__ __device__ __host__ mrg31k3p_engine()
 
   86     __forceinline__ __device__ __host__ mrg31k3p_engine(
const unsigned long long seed,
 
   87                                                         const unsigned long long subsequence,
 
   88                                                         const unsigned long long offset)
 
   90         this->seed(seed, subsequence, offset);
 
  101     __forceinline__ __device__ __host__ 
void seed(
unsigned long long       seed_value,
 
  102                                                   const unsigned long long subsequence,
 
  103                                                   const unsigned long long offset)
 
  109         unsigned int x = 
static_cast<unsigned int>(seed_value ^ 0x55555555U);
 
  110         unsigned int y = 
static_cast<unsigned int>((seed_value >> 32) ^ 0xAAAAAAAAU);
 
  111         m_state.x1[0]  = mod_mul_m1(x, seed_value);
 
  112         m_state.x1[1]  = mod_mul_m1(y, seed_value);
 
  113         m_state.x1[2]  = mod_mul_m1(x, seed_value);
 
  114         m_state.x2[0]  = mod_mul_m2(y, seed_value);
 
  115         m_state.x2[1]  = mod_mul_m2(x, seed_value);
 
  116         m_state.x2[2]  = mod_mul_m2(y, seed_value);
 
  117         this->restart(subsequence, offset);
 
  121     __forceinline__ __device__ __host__ 
void discard(
unsigned long long offset)
 
  123         this->discard_impl(offset);
 
  128     __forceinline__ __device__ __host__ 
void discard_subsequence(
unsigned long long subsequence)
 
  130         this->discard_subsequence_impl(subsequence);
 
  135     __forceinline__ __device__ __host__ 
void discard_sequence(
unsigned long long sequence)
 
  137         this->discard_sequence_impl(sequence);
 
  140     __forceinline__ __device__ __host__ 
void restart(
const unsigned long long subsequence,
 
  141                                                      const unsigned long long offset)
 
  143 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE 
  144         m_state.boxmuller_float_state  = 0;
 
  145         m_state.boxmuller_double_state = 0;
 
  147         this->discard_subsequence_impl(subsequence);
 
  148         this->discard_impl(offset);
 
  151     __forceinline__ __device__ __host__ 
unsigned int operator()()
 
  157     __forceinline__ __device__ __host__ 
unsigned int next()
 
  161             = (((m_state.x1[1] & ROCRAND_MRG31K3P_MASK12) << 22) + (m_state.x1[1] >> 9))
 
  162               + (((m_state.x1[2] & ROCRAND_MRG31K3P_MASK13) << 7) + (m_state.x1[2] >> 24));
 
  163         tmp -= (tmp >= ROCRAND_MRG31K3P_M1) ? ROCRAND_MRG31K3P_M1 : 0;
 
  164         tmp += m_state.x1[2];
 
  165         tmp -= (tmp >= ROCRAND_MRG31K3P_M1) ? ROCRAND_MRG31K3P_M1 : 0;
 
  166         m_state.x1[2] = m_state.x1[1];
 
  167         m_state.x1[1] = m_state.x1[0];
 
  171         tmp = (((m_state.x2[0] & ROCRAND_MRG31K3P_MASK21) << 15) + 21069 * (m_state.x2[0] >> 16));
 
  172         tmp -= (tmp >= ROCRAND_MRG31K3P_M2) ? ROCRAND_MRG31K3P_M2 : 0;
 
  173         tmp += ((m_state.x2[2] & ROCRAND_MRG31K3P_MASK21) << 15);
 
  174         tmp -= (tmp >= ROCRAND_MRG31K3P_M2) ? ROCRAND_MRG31K3P_M2 : 0;
 
  175         tmp += 21069 * (m_state.x2[2] >> 16);
 
  176         tmp -= (tmp >= ROCRAND_MRG31K3P_M2) ? ROCRAND_MRG31K3P_M2 : 0;
 
  177         tmp += m_state.x2[2];
 
  178         tmp -= (tmp >= ROCRAND_MRG31K3P_M2) ? ROCRAND_MRG31K3P_M2 : 0;
 
  179         m_state.x2[2] = m_state.x2[1];
 
  180         m_state.x2[1] = m_state.x2[0];
 
  184         return m_state.x1[0] - m_state.x2[0]
 
  185                + (m_state.x1[0] <= m_state.x2[0] ? ROCRAND_MRG31K3P_M1 : 0);
 
  190     __forceinline__ __device__ __host__ 
void discard_impl(
unsigned long long offset)
 
  192         discard_state(offset);
 
  196     __forceinline__ __device__ __host__ 
void 
  197         discard_subsequence_impl(
unsigned long long subsequence)
 
  201         while(subsequence > 0)
 
  205 #if defined(__HIP_DEVICE_COMPILE__) 
  206                 mod_mat_vec_m1(d_mrg31k3p_A1P72 + i, m_state.x1);
 
  207                 mod_mat_vec_m2(d_mrg31k3p_A2P72 + i, m_state.x2);
 
  209                 mod_mat_vec_m1(h_mrg31k3p_A1P72 + i, m_state.x1);
 
  210                 mod_mat_vec_m2(h_mrg31k3p_A2P72 + i, m_state.x2);
 
  219     __forceinline__ __device__ __host__ 
void discard_sequence_impl(
unsigned long long sequence)
 
  227 #if defined(__HIP_DEVICE_COMPILE__) 
  228                 mod_mat_vec_m1(d_mrg31k3p_A1P134 + i, m_state.x1);
 
  229                 mod_mat_vec_m2(d_mrg31k3p_A2P134 + i, m_state.x2);
 
  231                 mod_mat_vec_m1(h_mrg31k3p_A1P134 + i, m_state.x1);
 
  232                 mod_mat_vec_m2(h_mrg31k3p_A2P134 + i, m_state.x2);
 
  241     __forceinline__ __device__ __host__ 
void discard_state(
unsigned long long offset)
 
  249 #if defined(__HIP_DEVICE_COMPILE__) 
  250                 mod_mat_vec_m1(d_mrg31k3p_A1 + i, m_state.x1);
 
  251                 mod_mat_vec_m2(d_mrg31k3p_A2 + i, m_state.x2);
 
  253                 mod_mat_vec_m1(h_mrg31k3p_A1 + i, m_state.x1);
 
  254                 mod_mat_vec_m2(h_mrg31k3p_A2 + i, m_state.x2);
 
  263     __forceinline__ __device__ __host__ 
void discard_state()
 
  269     __forceinline__ __device__ __host__ 
static void mod_mat_vec_m1(
const unsigned int* A,
 
  272         unsigned long long x[3] = {s[0], s[1], s[2]};
 
  274         s[0] = mod_m1(mod_m1(A[0] * x[0]) + mod_m1(A[1] * x[1]) + mod_m1(A[2] * x[2]));
 
  276         s[1] = mod_m1(mod_m1(A[3] * x[0]) + mod_m1(A[4] * x[1]) + mod_m1(A[5] * x[2]));
 
  278         s[2] = mod_m1(mod_m1(A[6] * x[0]) + mod_m1(A[7] * x[1]) + mod_m1(A[8] * x[2]));
 
  281     __forceinline__ __device__ __host__ 
static void mod_mat_vec_m2(
const unsigned int* A,
 
  284         unsigned long long x[3] = {s[0], s[1], s[2]};
 
  286         s[0] = mod_m2(mod_m2(A[0] * x[0]) + mod_m2(A[1] * x[1]) + mod_m2(A[2] * x[2]));
 
  288         s[1] = mod_m2(mod_m2(A[3] * x[0]) + mod_m2(A[4] * x[1]) + mod_m2(A[5] * x[2]));
 
  290         s[2] = mod_m2(mod_m2(A[6] * x[0]) + mod_m2(A[7] * x[1]) + mod_m2(A[8] * x[2]));
 
  293     __forceinline__ __device__ __host__ 
static unsigned long long mod_mul_m1(
unsigned int       i,
 
  294                                                                              unsigned long long j)
 
  296         return mod_m1(i * j);
 
  299     __forceinline__ __device__ __host__ 
static unsigned long long mod_m1(
unsigned long long p)
 
  301         return p % ROCRAND_MRG31K3P_M1;
 
  304     __forceinline__ __device__ __host__ 
static unsigned long long mod_mul_m2(
unsigned int       i,
 
  305                                                                              unsigned long long j)
 
  307         return mod_m2(i * j);
 
  310     __forceinline__ __device__ __host__ 
static unsigned long long mod_m2(
unsigned long long p)
 
  312         return p % ROCRAND_MRG31K3P_M2;
 
  317     mrg31k3p_state m_state;
 
  319 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE 
  320     friend struct detail::engine_boxmuller_helper<mrg31k3p_engine>;
 
  332 typedef rocrand_device::mrg31k3p_engine rocrand_state_mrg31k3p;
 
  346 __forceinline__ __device__ __host__
 
  348                   const unsigned long long subsequence,
 
  349                   const unsigned long long offset,
 
  350                   rocrand_state_mrg31k3p*  state)
 
  352     *state = rocrand_state_mrg31k3p(seed, subsequence, offset);
 
  367 __forceinline__ __device__ __host__
 
  368 unsigned int rocrand(rocrand_state_mrg31k3p* state)
 
  371     return static_cast<unsigned int>((state->next() - 1) * ROCRAND_MRG31K3P_UINT32_NORM);
 
  382 __forceinline__ __device__ __host__
 
  383 void skipahead(
unsigned long long offset, rocrand_state_mrg31k3p* state)
 
  385     return state->discard(offset);
 
  397 __forceinline__ __device__ __host__
 
  400     return state->discard_subsequence(subsequence);
 
  412 __forceinline__ __device__ __host__
 
  415     return state->discard_sequence(sequence);
 
#define ROCRAND_MRG31K3P_DEFAULT_SEED
Default seed for MRG31K3P PRNG.
Definition: rocrand_mrg31k3p.h:46
 
__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_mrg31k3p *state)
Returns a uniformly distributed random unsigned int value from the range [0, 2^32 - 1].
Definition: rocrand_mrg31k3p.h:368
 
__forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long long subsequence, rocrand_state_mrg31k3p *state)
Updates the MRG31K3P state to skip ahead by `subsequence` subsequences.
Definition: rocrand_mrg31k3p.h:398
 
__forceinline__ __device__ __host__ void skipahead(unsigned long long offset, rocrand_state_mrg31k3p *state)
Updates the MRG31K3P state to skip ahead by `offset` elements.
Definition: rocrand_mrg31k3p.h:383
 
__forceinline__ __device__ __host__ void skipahead_sequence(unsigned long long sequence, rocrand_state_mrg31k3p *state)
Updates the MRG31K3P state to skip ahead by `sequence` sequences.
Definition: rocrand_mrg31k3p.h:413
 
__forceinline__ __device__ __host__ void rocrand_init(const unsigned long long seed, const unsigned long long subsequence, const unsigned long long offset, rocrand_state_mrg31k3p *state)
Initializes MRG31K3P state.
Definition: rocrand_mrg31k3p.h:347