/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-rocrand/checkouts/docs-7.0.0/projects/rocrand/library/include/rocrand/rocrand_philox4x32_10.h Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-rocrand/checkouts/docs-7.0.0/projects/rocrand/library/include/rocrand/rocrand_philox4x32_10.h Source File#

API library: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-rocrand/checkouts/docs-7.0.0/projects/rocrand/library/include/rocrand/rocrand_philox4x32_10.h Source File
rocrand_philox4x32_10.h
1 // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 /*
22 Copyright 2010-2011, D. E. Shaw Research.
23 All rights reserved.
24 
25 Redistribution and use in source and binary forms, with or without
26 modification, are permitted provided that the following conditions are
27 met:
28 
29 * Redistributions of source code must retain the above copyright
30  notice, this list of conditions, and the following disclaimer.
31 
32 * Redistributions in binary form must reproduce the above copyright
33  notice, this list of conditions, and the following disclaimer in the
34  documentation and/or other materials provided with the distribution.
35 
36 * Neither the name of D. E. Shaw Research nor the names of its
37  contributors may be used to endorse or promote products derived from
38  this software without specific prior written permission.
39 
40 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
41 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
42 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
43 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
44 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
45 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
46 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
47 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
48 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
49 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
50 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
51 */
52 
53 #ifndef ROCRAND_PHILOX4X32_10_H_
54 #define ROCRAND_PHILOX4X32_10_H_
55 
56 #include "rocrand/rocrand_common.h"
57 
58 #include <hip/hip_runtime.h>
59 
// Philox4x32 round constants, taken from Random123.
// See https://www.deshawresearch.com/resources_random123.html
//
// M4x32_0 / M4x32_1 are the multipliers applied to counter words x and z in
// each round; W32_0 / W32_1 are the per-round increments applied to key words
// x and y.
#define ROCRAND_PHILOX_M4x32_0 0xD2511F53U
#define ROCRAND_PHILOX_M4x32_1 0xCD9E8D57U
#define ROCRAND_PHILOX_W32_0   0x9E3779B9U
#define ROCRAND_PHILOX_W32_1   0xBB67AE85U

/**
 * \def ROCRAND_PHILOX4x32_DEFAULT_SEED
 * \brief Default seed for PHILOX4x32 PRNG.
 */
#define ROCRAND_PHILOX4x32_DEFAULT_SEED 0xdeadbeefdeadbeefULL
77 
78 namespace rocrand_device
79 {
80 
81 class philox4x32_10_engine
82 {
83 public:
84  struct philox4x32_10_state
85  {
86  uint4 counter;
87  uint4 result;
88  uint2 key;
89  unsigned int substate;
90 
91  #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
92  // The Box–Muller transform requires two inputs to convert uniformly
93  // distributed real values [0; 1] to normally distributed real values
94  // (with mean = 0, and stddev = 1). Often user wants only one
95  // normally distributed number, to save performance and random
96  // numbers the 2nd value is saved for future requests.
97  unsigned int boxmuller_float_state; // is there a float in boxmuller_float
98  unsigned int boxmuller_double_state; // is there a double in boxmuller_double
99  float boxmuller_float; // normally distributed float
100  double boxmuller_double; // normally distributed double
101  #endif
102  };
103 
104  __forceinline__ __device__ __host__ philox4x32_10_engine()
105  {
106  this->seed(ROCRAND_PHILOX4x32_DEFAULT_SEED, 0, 0);
107  }
108 
114  __forceinline__ __device__ __host__ philox4x32_10_engine(const unsigned long long seed,
115  const unsigned long long subsequence,
116  const unsigned long long offset)
117  {
118  this->seed(seed, subsequence, offset);
119  }
120 
126  __forceinline__ __device__ __host__ void seed(unsigned long long seed_value,
127  const unsigned long long subsequence,
128  const unsigned long long offset)
129  {
130  m_state.key.x = static_cast<unsigned int>(seed_value);
131  m_state.key.y = static_cast<unsigned int>(seed_value >> 32);
132  this->restart(subsequence, offset);
133  }
134 
136  __forceinline__ __device__ __host__ void discard(unsigned long long offset)
137  {
138  this->discard_impl(offset);
139  this->m_state.result = this->ten_rounds(m_state.counter, m_state.key);
140  }
141 
146  __forceinline__ __device__ __host__ void discard_subsequence(unsigned long long subsequence)
147  {
148  this->discard_subsequence_impl(subsequence);
149  m_state.result = this->ten_rounds(m_state.counter, m_state.key);
150  }
151 
152  __forceinline__ __device__ __host__ void restart(const unsigned long long subsequence,
153  const unsigned long long offset)
154  {
155  m_state.counter = {0, 0, 0, 0};
156  m_state.result = {0, 0, 0, 0};
157  m_state.substate = 0;
158  #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
159  m_state.boxmuller_float_state = 0;
160  m_state.boxmuller_double_state = 0;
161  #endif
162  this->discard_subsequence_impl(subsequence);
163  this->discard_impl(offset);
164  m_state.result = this->ten_rounds(m_state.counter, m_state.key);
165  }
166 
167  __forceinline__ __device__ __host__ unsigned int operator()()
168  {
169  return this->next();
170  }
171 
172  __forceinline__ __device__ __host__ unsigned int next()
173  {
174  #if defined(__HIP_PLATFORM_AMD__)
175  unsigned int ret = ROCRAND_HIPVEC_ACCESS(m_state.result)[m_state.substate];
176  #else
177  unsigned int ret = (&m_state.result.x)[m_state.substate];
178  #endif
179  m_state.substate++;
180  if(m_state.substate == 4)
181  {
182  m_state.substate = 0;
183  this->discard_state();
184  m_state.result = this->ten_rounds(m_state.counter, m_state.key);
185  }
186  return ret;
187  }
188 
189  __forceinline__ __device__ __host__ uint4 next4()
190  {
191  uint4 ret = m_state.result;
192  this->discard_state();
193  m_state.result = this->ten_rounds(m_state.counter, m_state.key);
194  return this->interleave(ret, m_state.result);
195  }
196 
197 protected:
198  // Advances the internal state to skip \p offset numbers.
199  // DOES NOT CALCULATE NEW 4 UINTs (m_state.result)
200  __forceinline__ __device__ __host__ void discard_impl(unsigned long long offset)
201  {
202  // Adjust offset for subset
203  m_state.substate += offset & 3;
204  unsigned long long counter_offset = offset / 4;
205  counter_offset += m_state.substate < 4 ? 0 : 1;
206  m_state.substate += m_state.substate < 4 ? 0 : -4;
207  // Discard states
208  this->discard_state(counter_offset);
209  }
210 
211  // DOES NOT CALCULATE NEW 4 UINTs (m_state.result)
212  __forceinline__ __device__ __host__ void
213  discard_subsequence_impl(unsigned long long subsequence)
214  {
215  unsigned int lo = static_cast<unsigned int>(subsequence);
216  unsigned int hi = static_cast<unsigned int>(subsequence >> 32);
217 
218  unsigned int temp = m_state.counter.z;
219  m_state.counter.z += lo;
220  m_state.counter.w += hi + (m_state.counter.z < temp ? 1 : 0);
221  }
222 
223  // Advances the internal state by offset times.
224  // DOES NOT CALCULATE NEW 4 UINTs (m_state.result)
225  __forceinline__ __device__ __host__ void discard_state(unsigned long long offset)
226  {
227  unsigned int lo = static_cast<unsigned int>(offset);
228  unsigned int hi = static_cast<unsigned int>(offset >> 32);
229 
230  uint4 temp = m_state.counter;
231  m_state.counter.x += lo;
232  m_state.counter.y += hi + (m_state.counter.x < temp.x ? 1 : 0);
233  m_state.counter.z += (m_state.counter.y < temp.y ? 1 : 0);
234  m_state.counter.w += (m_state.counter.z < temp.z ? 1 : 0);
235  }
236 
237  // Advances the internal state to the next state
238  // DOES NOT CALCULATE NEW 4 UINTs (m_state.result)
239  __forceinline__ __device__ __host__ void discard_state()
240  {
241  m_state.counter = this->bump_counter(m_state.counter);
242  }
243 
244  __forceinline__ __device__ __host__ static uint4 bump_counter(uint4 counter)
245  {
246  counter.x++;
247  unsigned int add = counter.x == 0 ? 1 : 0;
248  counter.y += add; add = counter.y == 0 ? add : 0;
249  counter.z += add; add = counter.z == 0 ? add : 0;
250  counter.w += add;
251  return counter;
252  }
253 
254  __forceinline__ __device__ __host__ uint4 interleave(const uint4 prev, const uint4 next) const
255  {
256  switch(m_state.substate)
257  {
258  case 0:
259  return prev;
260  case 1:
261  return uint4{ prev.y, prev.z, prev.w, next.x };
262  case 2:
263  return uint4{ prev.z, prev.w, next.x, next.y };
264  case 3:
265  return uint4{ prev.w, next.x, next.y, next.z };
266  }
267  __builtin_unreachable();
268  }
269 
270  // 10 Philox4x32 rounds
271  __forceinline__ __device__ __host__ uint4 ten_rounds(uint4 counter, uint2 key)
272  {
273  counter = this->single_round(counter, key); key = this->bumpkey(key); // 1
274  counter = this->single_round(counter, key); key = this->bumpkey(key); // 2
275  counter = this->single_round(counter, key); key = this->bumpkey(key); // 3
276  counter = this->single_round(counter, key); key = this->bumpkey(key); // 4
277  counter = this->single_round(counter, key); key = this->bumpkey(key); // 5
278  counter = this->single_round(counter, key); key = this->bumpkey(key); // 6
279  counter = this->single_round(counter, key); key = this->bumpkey(key); // 7
280  counter = this->single_round(counter, key); key = this->bumpkey(key); // 8
281  counter = this->single_round(counter, key); key = this->bumpkey(key); // 9
282  return this->single_round(counter, key); // 10
283  }
284 
285 private:
286  // Single Philox4x32 round
287  __forceinline__ __device__ __host__ static uint4 single_round(uint4 counter, uint2 key)
288  {
289  // Source: Random123
290  unsigned long long mul0 = detail::mul_u64_u32(ROCRAND_PHILOX_M4x32_0, counter.x);
291  unsigned int hi0 = static_cast<unsigned int>(mul0 >> 32);
292  unsigned int lo0 = static_cast<unsigned int>(mul0);
293  unsigned long long mul1 = detail::mul_u64_u32(ROCRAND_PHILOX_M4x32_1, counter.z);
294  unsigned int hi1 = static_cast<unsigned int>(mul1 >> 32);
295  unsigned int lo1 = static_cast<unsigned int>(mul1);
296  return uint4{hi1 ^ counter.y ^ key.x, lo1, hi0 ^ counter.w ^ key.y, lo0};
297  }
298 
299  __forceinline__ __device__ __host__ static uint2 bumpkey(uint2 key)
300  {
301  key.x += ROCRAND_PHILOX_W32_0;
302  key.y += ROCRAND_PHILOX_W32_1;
303  return key;
304  }
305 
306 protected:
307  // State
308  philox4x32_10_state m_state;
309 
310  #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
311  friend struct detail::engine_boxmuller_helper<philox4x32_10_engine>;
312  #endif
313 
314 }; // philox4x32_10_engine class
315 
316 } // end namespace rocrand_device
317 
324 typedef rocrand_device::philox4x32_10_engine rocrand_state_philox4x32_10;
326 
338 __forceinline__ __device__ __host__
339 void rocrand_init(const unsigned long long seed,
340  const unsigned long long subsequence,
341  const unsigned long long offset,
342  rocrand_state_philox4x32_10* state)
343 {
344  *state = rocrand_state_philox4x32_10(seed, subsequence, offset);
345 }
346 
359 __forceinline__ __device__ __host__
360 unsigned int rocrand(rocrand_state_philox4x32_10* state)
361 {
362  return state->next();
363 }
364 
377 __forceinline__ __device__ __host__
378 uint4 rocrand4(rocrand_state_philox4x32_10* state)
379 {
380  return state->next4();
381 }
382 
391 __forceinline__ __device__ __host__
392 void skipahead(unsigned long long offset, rocrand_state_philox4x32_10* state)
393 {
394  return state->discard(offset);
395 }
396 
406 __forceinline__ __device__ __host__
407 void skipahead_subsequence(unsigned long long subsequence, rocrand_state_philox4x32_10* state)
408 {
409  return state->discard_subsequence(subsequence);
410 }
411 
421 __forceinline__ __device__ __host__
422 void skipahead_sequence(unsigned long long sequence, rocrand_state_philox4x32_10* state)
423 {
424  return state->discard_subsequence(sequence);
425 }
426  // end of group rocranddevice
428 
429 #endif // ROCRAND_PHILOX4X32_10_H_
__forceinline__ __device__ __host__ void skipahead(unsigned long long offset, rocrand_state_philox4x32_10 *state)
Updates Philox state to skip ahead by offset elements.
Definition: rocrand_philox4x32_10.h:392
__forceinline__ __device__ __host__ void rocrand_init(const unsigned long long seed, const unsigned long long subsequence, const unsigned long long offset, rocrand_state_philox4x32_10 *state)
Initializes Philox state.
Definition: rocrand_philox4x32_10.h:339
#define ROCRAND_PHILOX4x32_DEFAULT_SEED
Default seed for PHILOX4x32 PRNG.
Definition: rocrand_philox4x32_10.h:75
__forceinline__ __device__ __host__ uint4 rocrand4(rocrand_state_philox4x32_10 *state)
Returns four uniformly distributed random unsigned int values from [0; 2^32 - 1] range.
Definition: rocrand_philox4x32_10.h:378
__forceinline__ __device__ __host__ void skipahead_sequence(unsigned long long sequence, rocrand_state_philox4x32_10 *state)
Updates Philox state to skip ahead by sequence sequences.
Definition: rocrand_philox4x32_10.h:422
__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_philox4x32_10 *state)
Returns uniformly distributed random unsigned int value from [0; 2^32 - 1] range.
Definition: rocrand_philox4x32_10.h:360
__forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long long subsequence, rocrand_state_philox4x32_10 *state)
Updates Philox state to skip ahead by subsequence subsequences.
Definition: rocrand_philox4x32_10.h:407