/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-rocrand/checkouts/develop/library/include/rocrand/rocrand_philox4x32_10.h Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-rocrand/checkouts/develop/library/include/rocrand/rocrand_philox4x32_10.h Source File

API library: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-rocrand/checkouts/develop/library/include/rocrand/rocrand_philox4x32_10.h Source File
rocrand_philox4x32_10.h
1 // Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 /*
22 Copyright 2010-2011, D. E. Shaw Research.
23 All rights reserved.
24 
25 Redistribution and use in source and binary forms, with or without
26 modification, are permitted provided that the following conditions are
27 met:
28 
29 * Redistributions of source code must retain the above copyright
30  notice, this list of conditions, and the following disclaimer.
31 
32 * Redistributions in binary form must reproduce the above copyright
33  notice, this list of conditions, and the following disclaimer in the
34  documentation and/or other materials provided with the distribution.
35 
36 * Neither the name of D. E. Shaw Research nor the names of its
37  contributors may be used to endorse or promote products derived from
38  this software without specific prior written permission.
39 
40 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
41 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
42 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
43 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
44 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
45 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
46 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
47 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
48 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
49 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
50 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
51 */
52 
53 #ifndef ROCRAND_PHILOX4X32_10_H_
54 #define ROCRAND_PHILOX4X32_10_H_
55 
56 #include "rocrand/rocrand_common.h"
57 
// Philox4x32 constants taken from Random123.
// See https://www.deshawresearch.com/resources_random123.html

// Multipliers used in every round: _0 is multiplied with counter.x and
// _1 with counter.z.
#define ROCRAND_PHILOX_M4x32_0 0xD2511F53U
#define ROCRAND_PHILOX_M4x32_1 0xCD9E8D57U

// Increments added to the key between rounds: _0 goes to key.x and
// _1 to key.y.
#define ROCRAND_PHILOX_W32_0 0x9E3779B9U
#define ROCRAND_PHILOX_W32_1 0xBB67AE85U

/// Default seed for PHILOX4x32 PRNG.
#define ROCRAND_PHILOX4x32_DEFAULT_SEED 0xdeadbeefdeadbeefULL
75 
76 namespace rocrand_device {
77 namespace detail {
78 
79 __forceinline__ __device__ __host__ unsigned int
80  mulhilo32(unsigned int x, unsigned int y, unsigned int& z)
81 {
82  unsigned long long xy = mad_u64_u32(x, y, 0);
83  z = static_cast<unsigned int>(xy >> 32);
84  return static_cast<unsigned int>(xy);
85 }
86 
87 } // end detail namespace
88 
89 class philox4x32_10_engine
90 {
91 public:
92  struct philox4x32_10_state
93  {
94  uint4 counter;
95  uint4 result;
96  uint2 key;
97  unsigned int substate;
98 
99  #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
100  // The Box–Muller transform requires two inputs to convert uniformly
101  // distributed real values [0; 1] to normally distributed real values
102  // (with mean = 0, and stddev = 1). Often user wants only one
103  // normally distributed number, to save performance and random
104  // numbers the 2nd value is saved for future requests.
105  unsigned int boxmuller_float_state; // is there a float in boxmuller_float
106  unsigned int boxmuller_double_state; // is there a double in boxmuller_double
107  float boxmuller_float; // normally distributed float
108  double boxmuller_double; // normally distributed double
109  #endif
110  };
111 
112  __forceinline__ __device__ __host__ philox4x32_10_engine()
113  {
114  this->seed(ROCRAND_PHILOX4x32_DEFAULT_SEED, 0, 0);
115  }
116 
122  __forceinline__ __device__ __host__ philox4x32_10_engine(const unsigned long long seed,
123  const unsigned long long subsequence,
124  const unsigned long long offset)
125  {
126  this->seed(seed, subsequence, offset);
127  }
128 
134  __forceinline__ __device__ __host__ void seed(unsigned long long seed_value,
135  const unsigned long long subsequence,
136  const unsigned long long offset)
137  {
138  m_state.key.x = static_cast<unsigned int>(seed_value);
139  m_state.key.y = static_cast<unsigned int>(seed_value >> 32);
140  this->restart(subsequence, offset);
141  }
142 
144  __forceinline__ __device__ __host__ void discard(unsigned long long offset)
145  {
146  this->discard_impl(offset);
147  this->m_state.result = this->ten_rounds(m_state.counter, m_state.key);
148  }
149 
154  __forceinline__ __device__ __host__ void discard_subsequence(unsigned long long subsequence)
155  {
156  this->discard_subsequence_impl(subsequence);
157  m_state.result = this->ten_rounds(m_state.counter, m_state.key);
158  }
159 
160  __forceinline__ __device__ __host__ void restart(const unsigned long long subsequence,
161  const unsigned long long offset)
162  {
163  m_state.counter = {0, 0, 0, 0};
164  m_state.result = {0, 0, 0, 0};
165  m_state.substate = 0;
166  #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
167  m_state.boxmuller_float_state = 0;
168  m_state.boxmuller_double_state = 0;
169  #endif
170  this->discard_subsequence_impl(subsequence);
171  this->discard_impl(offset);
172  m_state.result = this->ten_rounds(m_state.counter, m_state.key);
173  }
174 
175  __forceinline__ __device__ __host__ unsigned int operator()()
176  {
177  return this->next();
178  }
179 
180  __forceinline__ __device__ __host__ unsigned int next()
181  {
182  #if defined(__HIP_PLATFORM_AMD__)
183  unsigned int ret = m_state.result.data[m_state.substate];
184  #else
185  unsigned int ret = (&m_state.result.x)[m_state.substate];
186  #endif
187  m_state.substate++;
188  if(m_state.substate == 4)
189  {
190  m_state.substate = 0;
191  this->discard_state();
192  m_state.result = this->ten_rounds(m_state.counter, m_state.key);
193  }
194  return ret;
195  }
196 
197  __forceinline__ __device__ __host__ uint4 next4()
198  {
199  uint4 ret = m_state.result;
200  this->discard_state();
201  m_state.result = this->ten_rounds(m_state.counter, m_state.key);
202  return this->interleave(ret, m_state.result);
203  }
204 
205 protected:
206  // Advances the internal state to skip \p offset numbers.
207  // DOES NOT CALCULATE NEW 4 UINTs (m_state.result)
208  __forceinline__ __device__ __host__ void discard_impl(unsigned long long offset)
209  {
210  // Adjust offset for subset
211  m_state.substate += offset & 3;
212  unsigned long long counter_offset = offset / 4;
213  counter_offset += m_state.substate < 4 ? 0 : 1;
214  m_state.substate += m_state.substate < 4 ? 0 : -4;
215  // Discard states
216  this->discard_state(counter_offset);
217  }
218 
219  // DOES NOT CALCULATE NEW 4 UINTs (m_state.result)
220  __forceinline__ __device__ __host__ void
221  discard_subsequence_impl(unsigned long long subsequence)
222  {
223  unsigned int lo = static_cast<unsigned int>(subsequence);
224  unsigned int hi = static_cast<unsigned int>(subsequence >> 32);
225 
226  unsigned int temp = m_state.counter.z;
227  m_state.counter.z += lo;
228  m_state.counter.w += hi + (m_state.counter.z < temp ? 1 : 0);
229  }
230 
231  // Advances the internal state by offset times.
232  // DOES NOT CALCULATE NEW 4 UINTs (m_state.result)
233  __forceinline__ __device__ __host__ void discard_state(unsigned long long offset)
234  {
235  unsigned int lo = static_cast<unsigned int>(offset);
236  unsigned int hi = static_cast<unsigned int>(offset >> 32);
237 
238  uint4 temp = m_state.counter;
239  m_state.counter.x += lo;
240  m_state.counter.y += hi + (m_state.counter.x < temp.x ? 1 : 0);
241  m_state.counter.z += (m_state.counter.y < temp.y ? 1 : 0);
242  m_state.counter.w += (m_state.counter.z < temp.z ? 1 : 0);
243  }
244 
245  // Advances the internal state to the next state
246  // DOES NOT CALCULATE NEW 4 UINTs (m_state.result)
247  __forceinline__ __device__ __host__ void discard_state()
248  {
249  m_state.counter = this->bump_counter(m_state.counter);
250  }
251 
252  __forceinline__ __device__ __host__ static uint4 bump_counter(uint4 counter)
253  {
254  counter.x++;
255  unsigned int add = counter.x == 0 ? 1 : 0;
256  counter.y += add; add = counter.y == 0 ? add : 0;
257  counter.z += add; add = counter.z == 0 ? add : 0;
258  counter.w += add;
259  return counter;
260  }
261 
262  __forceinline__ __device__ __host__ uint4 interleave(const uint4 prev, const uint4 next) const
263  {
264  switch(m_state.substate)
265  {
266  case 0:
267  return prev;
268  case 1:
269  return uint4{ prev.y, prev.z, prev.w, next.x };
270  case 2:
271  return uint4{ prev.z, prev.w, next.x, next.y };
272  case 3:
273  return uint4{ prev.w, next.x, next.y, next.z };
274  }
275  __builtin_unreachable();
276  }
277 
278  // 10 Philox4x32 rounds
279  __forceinline__ __device__ __host__ uint4 ten_rounds(uint4 counter, uint2 key)
280  {
281  counter = this->single_round(counter, key); key = this->bumpkey(key); // 1
282  counter = this->single_round(counter, key); key = this->bumpkey(key); // 2
283  counter = this->single_round(counter, key); key = this->bumpkey(key); // 3
284  counter = this->single_round(counter, key); key = this->bumpkey(key); // 4
285  counter = this->single_round(counter, key); key = this->bumpkey(key); // 5
286  counter = this->single_round(counter, key); key = this->bumpkey(key); // 6
287  counter = this->single_round(counter, key); key = this->bumpkey(key); // 7
288  counter = this->single_round(counter, key); key = this->bumpkey(key); // 8
289  counter = this->single_round(counter, key); key = this->bumpkey(key); // 9
290  return this->single_round(counter, key); // 10
291  }
292 
293 private:
294  // Single Philox4x32 round
295  __forceinline__ __device__ __host__ static uint4 single_round(uint4 counter, uint2 key)
296  {
297  // Source: Random123
298  unsigned int hi0;
299  unsigned int hi1;
300  unsigned int lo0 = detail::mulhilo32(ROCRAND_PHILOX_M4x32_0, counter.x, hi0);
301  unsigned int lo1 = detail::mulhilo32(ROCRAND_PHILOX_M4x32_1, counter.z, hi1);
302  return uint4 {
303  hi1 ^ counter.y ^ key.x,
304  lo1,
305  hi0 ^ counter.w ^ key.y,
306  lo0
307  };
308  }
309 
310  __forceinline__ __device__ __host__ static uint2 bumpkey(uint2 key)
311  {
312  key.x += ROCRAND_PHILOX_W32_0;
313  key.y += ROCRAND_PHILOX_W32_1;
314  return key;
315  }
316 
317 protected:
318  // State
319  philox4x32_10_state m_state;
320 
321  #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
322  friend struct detail::engine_boxmuller_helper<philox4x32_10_engine>;
323  #endif
324 
325 }; // philox4x32_10_engine class
326 
327 } // end namespace rocrand_device
328 
335 typedef rocrand_device::philox4x32_10_engine rocrand_state_philox4x32_10;
337 
349 __forceinline__ __device__ __host__ void rocrand_init(const unsigned long long seed,
350  const unsigned long long subsequence,
351  const unsigned long long offset,
352  rocrand_state_philox4x32_10* state)
353 {
354  *state = rocrand_state_philox4x32_10(seed, subsequence, offset);
355 }
356 
369 __forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_philox4x32_10* state)
370 {
371  return state->next();
372 }
373 
386 __forceinline__ __device__ __host__ uint4 rocrand4(rocrand_state_philox4x32_10* state)
387 {
388  return state->next4();
389 }
390 
399 __forceinline__ __device__ __host__ void skipahead(unsigned long long offset,
400  rocrand_state_philox4x32_10* state)
401 {
402  return state->discard(offset);
403 }
404 
414 __forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long long subsequence,
415  rocrand_state_philox4x32_10* state)
416 {
417  return state->discard_subsequence(subsequence);
418 }
419 
429 __forceinline__ __device__ __host__ void skipahead_sequence(unsigned long long sequence,
430  rocrand_state_philox4x32_10* state)
431 {
432  return state->discard_subsequence(sequence);
433 }
434 
435 #endif // ROCRAND_PHILOX4X32_10_H_
436  // end of group rocranddevice
__forceinline__ __device__ __host__ void skipahead(unsigned long long offset, rocrand_state_philox4x32_10 *state)
Updates Philox state to skip ahead by offset elements.
Definition: rocrand_philox4x32_10.h:399
__forceinline__ __device__ __host__ void rocrand_init(const unsigned long long seed, const unsigned long long subsequence, const unsigned long long offset, rocrand_state_philox4x32_10 *state)
Initializes Philox state.
Definition: rocrand_philox4x32_10.h:349
#define ROCRAND_PHILOX4x32_DEFAULT_SEED
Default seed for PHILOX4x32 PRNG.
Definition: rocrand_philox4x32_10.h:73
__forceinline__ __device__ __host__ uint4 rocrand4(rocrand_state_philox4x32_10 *state)
Returns four uniformly distributed random unsigned int values from [0; 2^32 - 1] range.
Definition: rocrand_philox4x32_10.h:386
__forceinline__ __device__ __host__ void skipahead_sequence(unsigned long long sequence, rocrand_state_philox4x32_10 *state)
Updates Philox state to skip ahead by sequence sequences.
Definition: rocrand_philox4x32_10.h:429
__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_philox4x32_10 *state)
Returns uniformly distributed random unsigned int value from [0; 2^32 - 1] range.
Definition: rocrand_philox4x32_10.h:369
__forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long long subsequence, rocrand_state_philox4x32_10 *state)
Updates Philox state to skip ahead by subsequence subsequences.
Definition: rocrand_philox4x32_10.h:414