55 #ifndef ROCRAND_MTGP32_H_
56 #define ROCRAND_MTGP32_H_
61 #define FQUALIFIERS __forceinline__ __device__
64 #include "rocrand/rocrand.h"
65 #include "rocrand/rocrand_common.h"
67 #define MTGP_MEXP 11213
69 #define MTGP_FLOOR_2P 256
70 #define MTGP_CEIL_2P 512
71 #define MTGP_TN MTGP_FLOOR_2P
72 #define MTGP_LS (MTGP_TN * 3)
73 #define MTGP_BN_MAX 512
75 #define MTGP_STATE 1024
76 #define MTGP_MASK 1023
111 namespace rocrand_device {
115 unsigned int pos_tbl[MTGP_BN_MAX];
116 unsigned int param_tbl[MTGP_BN_MAX][MTGP_TS];
117 unsigned int temper_tbl[MTGP_BN_MAX][MTGP_TS];
118 unsigned int single_temper_tbl[MTGP_BN_MAX][MTGP_TS];
119 unsigned int sh1_tbl[MTGP_BN_MAX];
120 unsigned int sh2_tbl[MTGP_BN_MAX];
121 unsigned int mask[1];
130 unsigned int status[MTGP_STATE];
134 void rocrand_mtgp32_init_state(
unsigned int array[],
135 const mtgp32_fast_params *para,
unsigned int seed)
138 int size = para->mexp / 32 + 1;
139 unsigned int hidden_seed;
141 hidden_seed = para->tbl[4] ^ (para->tbl[8] << 16);
145 memset(array, tmp & 0xff,
sizeof(
unsigned int) * size);
147 array[1] = hidden_seed;
148 for (i = 1; i < size; i++)
149 array[i] ^= (1812433253) * (array[i - 1] ^ (array[i - 1] >> 30)) + i;
163 mtgp32_engine(
const mtgp32_state &m_state,
164 const mtgp32_params * params,
167 this->m_state = m_state;
168 pos_tbl = params->pos_tbl[bid];
169 sh1_tbl = params->sh1_tbl[bid];
170 sh2_tbl = params->sh2_tbl[bid];
171 mask = params->mask[0];
172 for (
int j = 0; j < MTGP_TS; j++) {
173 param_tbl[j] = params->param_tbl[bid][j];
174 temper_tbl[j] = params->temper_tbl[bid][j];
175 single_temper_tbl[j] = params->single_temper_tbl[bid][j];
180 void copy(
const mtgp32_engine * m_engine)
182 #if defined(__HIP_DEVICE_COMPILE__) || defined(USE_HIP_CPU)
183 const unsigned int thread_id = threadIdx.x;
184 for(
int i = thread_id; i < MTGP_STATE; i += blockDim.x)
185 m_state.status[i] = m_engine->m_state.status[i];
189 m_state.offset = m_engine->m_state.offset;
190 m_state.id = m_engine->m_state.id;
191 pos_tbl = m_engine->pos_tbl;
192 sh1_tbl = m_engine->sh1_tbl;
193 sh2_tbl = m_engine->sh2_tbl;
194 mask = m_engine->mask;
196 if (thread_id < MTGP_TS)
198 param_tbl[thread_id] = m_engine->param_tbl[thread_id];
199 temper_tbl[thread_id] = m_engine->temper_tbl[thread_id];
200 single_temper_tbl[thread_id] = m_engine->single_temper_tbl[thread_id];
204 this->m_state = m_engine->m_state;
205 pos_tbl = m_engine->pos_tbl;
206 sh1_tbl = m_engine->sh1_tbl;
207 sh2_tbl = m_engine->sh2_tbl;
208 mask = m_engine->mask;
209 for (
int j = 0; j < MTGP_TS; j++) {
210 param_tbl[j] = m_engine->param_tbl[j];
211 temper_tbl[j] = m_engine->temper_tbl[j];
212 single_temper_tbl[j] = m_engine->single_temper_tbl[j];
218 void set_params(mtgp32_params * params)
220 pos_tbl = params->pos_tbl[m_state.id];
221 sh1_tbl = params->sh1_tbl[m_state.id];
222 sh2_tbl = params->sh2_tbl[m_state.id];
223 mask = params->mask[0];
224 for (
int j = 0; j < MTGP_TS; j++) {
225 param_tbl[j] = params->param_tbl[m_state.id][j];
226 temper_tbl[j] = params->temper_tbl[m_state.id][j];
227 single_temper_tbl[j] = params->single_temper_tbl[m_state.id][j];
232 unsigned int operator()()
240 #if defined(__HIP_DEVICE_COMPILE__) || defined(USE_HIP_CPU)
241 unsigned int t = threadIdx.x;
242 unsigned int d = blockDim.x;
247 r = para_rec(m_state.status[(t + m_state.offset) & MTGP_MASK],
248 m_state.status[(t + m_state.offset + 1) & MTGP_MASK],
249 m_state.status[(t + m_state.offset + pos) & MTGP_MASK]);
250 m_state.status[(t + m_state.offset + MTGP_N) & MTGP_MASK] = r;
252 o = temper(r, m_state.status[(t + m_state.offset + pos - 1) & MTGP_MASK]);
255 m_state.offset = (m_state.offset + d) & MTGP_MASK;
264 unsigned int next_single()
266 #if defined(__HIP_DEVICE_COMPILE__) || defined(USE_HIP_CPU)
267 unsigned int t = threadIdx.x;
268 unsigned int d = blockDim.x;
273 r = para_rec(m_state.status[(t + m_state.offset) & MTGP_MASK],
274 m_state.status[(t + m_state.offset + 1) & MTGP_MASK],
275 m_state.status[(t + m_state.offset + pos) & MTGP_MASK]);
276 m_state.status[(t + m_state.offset + MTGP_N) & MTGP_MASK] = r;
278 o = temper_single(r, m_state.status[(t + m_state.offset + pos - 1) & MTGP_MASK]);
281 m_state.offset = (m_state.offset + d) & MTGP_MASK;
291 unsigned int para_rec(
unsigned int X1,
unsigned int X2,
unsigned int Y)
const
293 unsigned int X = (X1 & mask) ^ X2;
297 Y = X ^ (Y >> sh2_tbl);
298 MAT = param_tbl[Y & 0x0f];
303 unsigned int temper(
unsigned int V,
unsigned int T)
const
309 MAT = temper_tbl[T & 0x0f];
314 unsigned int temper_single(
unsigned int V,
unsigned int T)
const
321 MAT = single_temper_tbl[T & 0x0f];
328 mtgp32_state m_state;
330 unsigned int pos_tbl;
331 unsigned int param_tbl[MTGP_TS];
332 unsigned int temper_tbl[MTGP_TS];
333 unsigned int sh1_tbl;
334 unsigned int sh2_tbl;
335 unsigned int single_temper_tbl[MTGP_TS];
348 typedef rocrand_device::mtgp32_engine rocrand_state_mtgp32;
349 typedef rocrand_device::mtgp32_state mtgp32_state;
351 typedef rocrand_device::mtgp32_params mtgp32_params;
371 mtgp32_fast_params params[],
373 unsigned long long seed)
376 rocrand_state_mtgp32 * h_state = (rocrand_state_mtgp32 *) malloc(
sizeof(rocrand_state_mtgp32) * n);
377 seed = seed ^ (seed >> 32);
382 for (i = 0; i < n; i++) {
383 rocrand_device::rocrand_mtgp32_init_state(&(h_state[i].m_state.status[0]), ¶ms[i], (
unsigned int)seed + i + 1);
384 h_state[i].m_state.offset = 0;
385 h_state[i].m_state.id = i;
386 h_state[i].pos_tbl = params[i].pos;
387 h_state[i].sh1_tbl = params[i].sh1;
388 h_state[i].sh2_tbl = params[i].sh2;
389 h_state[i].mask = params[0].mask;
390 for (
int j = 0; j < MTGP_TS; j++) {
391 h_state[i].param_tbl[j] = params[i].tbl[j];
392 h_state[i].temper_tbl[j] = params[i].tmp_tbl[j];
393 h_state[i].single_temper_tbl[j] = params[i].flt_tmp_tbl[j];
397 hipMemcpy(d_state, h_state,
sizeof(rocrand_state_mtgp32) * n, hipMemcpyHostToDevice);
400 if (hipGetLastError() != hipSuccess)
425 const int block_num = MTGP_BN_MAX;
426 const int size1 =
sizeof(uint32_t) * block_num;
427 const int size2 =
sizeof(uint32_t) * block_num * MTGP_TS;
431 uint32_t *h_param_tbl;
432 uint32_t *h_temper_tbl;
433 uint32_t *h_single_temper_tbl;
435 h_pos_tbl = (uint32_t *)malloc(size1);
436 h_sh1_tbl = (uint32_t *)malloc(size1);
437 h_sh2_tbl = (uint32_t *)malloc(size1);
438 h_param_tbl = (uint32_t *)malloc(size2);
439 h_temper_tbl = (uint32_t *)malloc(size2);
440 h_single_temper_tbl = (uint32_t *)malloc(size2);
441 h_mask = (uint32_t *)malloc(
sizeof(uint32_t));
444 if (h_pos_tbl == NULL || h_sh1_tbl == NULL || h_sh2_tbl == NULL
445 || h_param_tbl == NULL || h_temper_tbl == NULL || h_single_temper_tbl == NULL
447 printf(
"failure in allocating host memory for constant table.\n");
451 h_mask[0] = params[0].mask;
452 for (
int i = 0; i < block_num; i++) {
453 h_pos_tbl[i] = params[i].pos;
454 h_sh1_tbl[i] = params[i].sh1;
455 h_sh2_tbl[i] = params[i].sh2;
456 for (
int j = 0; j < MTGP_TS; j++) {
457 h_param_tbl[i * MTGP_TS + j] = params[i].tbl[j];
458 h_temper_tbl[i * MTGP_TS + j] = params[i].tmp_tbl[j];
459 h_single_temper_tbl[i * MTGP_TS + j] = params[i].flt_tmp_tbl[j];
463 if (hipMemcpy(p->pos_tbl, h_pos_tbl, size1, hipMemcpyHostToDevice) != hipSuccess)
465 if (hipMemcpy(p->sh1_tbl, h_sh1_tbl, size1, hipMemcpyHostToDevice) != hipSuccess)
467 if (hipMemcpy(p->sh2_tbl, h_sh2_tbl, size1, hipMemcpyHostToDevice) != hipSuccess)
469 if (hipMemcpy(p->param_tbl, h_param_tbl, size2, hipMemcpyHostToDevice) != hipSuccess)
471 if (hipMemcpy(p->temper_tbl, h_temper_tbl, size2, hipMemcpyHostToDevice) != hipSuccess)
473 if (hipMemcpy(p->single_temper_tbl, h_single_temper_tbl, size2, hipMemcpyHostToDevice) != hipSuccess)
475 if (hipMemcpy(p->mask, h_mask,
sizeof(
unsigned int), hipMemcpyHostToDevice) != hipSuccess)
484 free(h_single_temper_tbl);
503 unsigned int rocrand(rocrand_state_mtgp32 * state)
505 return state->next();
554 state->set_params(params);
FQUALIFIERS void rocrand_mtgp32_block_copy(rocrand_state_mtgp32 *src, rocrand_state_mtgp32 *dest)
Copies MTGP32 state to another state using block of threads.
Definition: rocrand_mtgp32.h:540
__host__ rocrand_status rocrand_make_state_mtgp32(rocrand_state_mtgp32 *d_state, mtgp32_fast_params params[], int n, unsigned long long seed)
Initializes MTGP32 states.
Definition: rocrand_mtgp32.h:370
__host__ rocrand_status rocrand_make_constant(const mtgp32_fast_params params[], mtgp32_params *p)
Loads parameters for MTGP32.
Definition: rocrand_mtgp32.h:423
FQUALIFIERS unsigned int rocrand(rocrand_state_mtgp32 *state)
Returns uniformly distributed random unsigned int value from [0; 2^32 - 1] range.
Definition: rocrand_mtgp32.h:503
FQUALIFIERS void rocrand_mtgp32_set_params(rocrand_state_mtgp32 *state, mtgp32_params *params)
Changes parameters of a MTGP32 state.
Definition: rocrand_mtgp32.h:552
#define FQUALIFIERS
Shorthand for commonly used function qualifiers.
Definition: rocrand_uniform.h:31
rocrand_status
rocRAND function call status type
Definition: rocrand.h:60
@ ROCRAND_STATUS_SUCCESS
No errors.
Definition: rocrand.h:61
@ ROCRAND_STATUS_ALLOCATION_FAILED
Memory allocation failed during execution.
Definition: rocrand.h:64
Definition: rocrand_mtgp32.h:99
uint32_t tmp_tbl[16]
Definition: rocrand_mtgp32.h:105
int pos
Definition: rocrand_mtgp32.h:101
int mexp
Definition: rocrand_mtgp32.h:100
int sh2
Definition: rocrand_mtgp32.h:103
int sh1
Definition: rocrand_mtgp32.h:102
uint32_t mask
Definition: rocrand_mtgp32.h:107
unsigned char poly_sha1[21]
Definition: rocrand_mtgp32.h:108
uint32_t tbl[16]
Definition: rocrand_mtgp32.h:104
uint32_t flt_tmp_tbl[16]
Definition: rocrand_mtgp32.h:106