/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-rocrand/checkouts/develop/library/include/rocrand/rocrand_threefry4_impl.h Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-rocrand/checkouts/develop/library/include/rocrand/rocrand_threefry4_impl.h Source File#

API library: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-rocrand/checkouts/develop/library/include/rocrand/rocrand_threefry4_impl.h Source File
rocrand_threefry4_impl.h
1 // Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 /*
22 Copyright 2010-2011, D. E. Shaw Research.
23 All rights reserved.
24 
25 Redistribution and use in source and binary forms, with or without
26 modification, are permitted provided that the following conditions are
27 met:
28 
29 * Redistributions of source code must retain the above copyright
30  notice, this list of conditions, and the following disclaimer.
31 
32 * Redistributions in binary form must reproduce the above copyright
33  notice, this list of conditions, and the following disclaimer in the
34  documentation and/or other materials provided with the distribution.
35 
36 * Neither the name of D. E. Shaw Research nor the names of its
37  contributors may be used to endorse or promote products derived from
38  this software without specific prior written permission.
39 
40 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
41 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
42 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
43 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
44 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
45 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
46 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
47 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
48 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
49 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
50 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
51 */
52 
53 #ifndef ROCRAND_THREEFRY4_IMPL_H_
54 #define ROCRAND_THREEFRY4_IMPL_H_
55 
56 #include "rocrand/rocrand_threefry_common.h"
57 #include <rocrand/rocrand_common.h>
58 
59 #ifndef THREEFRY4x32_DEFAULT_ROUNDS
60  #define THREEFRY4x32_DEFAULT_ROUNDS 20
61 #endif
62 
63 #ifndef THREEFRY4x64_DEFAULT_ROUNDS
64  #define THREEFRY4x64_DEFAULT_ROUNDS 20
65 #endif
66 
67 namespace rocrand_device
68 {
69 
70 template<class value>
71 __forceinline__ __device__ __host__ int threefry_rotation_array(int indexX, int indexY) = delete;
72 
73 template<>
74 __forceinline__ __device__ __host__ int threefry_rotation_array<unsigned int>(int indexX,
75  int indexY)
76 {
77  // Output from skein_rot_search: (srs-B128-X5000.out)
78  // Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
79  // Start: Mon Aug 24 22:41:36 2009
80  // ...
81  // rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format
82  static constexpr int THREEFRY_ROTATION_32_4[8][2] = {
83  {10, 26},
84  {11, 21},
85  {13, 27},
86  {23, 5},
87  { 6, 20},
88  {17, 11},
89  {25, 10},
90  {18, 20}
91  };
92  return THREEFRY_ROTATION_32_4[indexX][indexY];
93 }
94 
95 template<>
96 __forceinline__ __device__ __host__ int threefry_rotation_array<unsigned long long>(int indexX,
97  int indexY)
98 {
99  // These are the R_256 constants from the Threefish reference sources
100  // with names changed to R_64x4... */
101  static constexpr int THREEFRY_ROTATION_64_4[8][2] = {
102  {14, 16},
103  {52, 57},
104  {23, 40},
105  { 5, 37},
106  {25, 33},
107  {46, 12},
108  {58, 22},
109  {32, 32}
110  };
111  return THREEFRY_ROTATION_64_4[indexX][indexY];
112 }
113 
114 template<typename state_value, typename value, unsigned int Nrounds>
115 class threefry_engine4_base
116 {
117 public:
118  struct threefry_state_4
119  {
120  state_value counter;
121  state_value key;
122  state_value result;
123  unsigned int substate;
124  };
125  using state_type = threefry_state_4;
126  using state_vector_type = state_value;
127 
129  __forceinline__ __device__ __host__ void discard(unsigned long long offset)
130  {
131  this->discard_impl(offset);
132  this->m_state.result = this->threefry_rounds(m_state.counter, m_state.key);
133  }
134 
140  __forceinline__ __device__ __host__ void discard_subsequence(unsigned long long subsequence)
141  {
142  this->discard_subsequence_impl(subsequence);
143  m_state.result = this->threefry_rounds(m_state.counter, m_state.key);
144  }
145 
146  __forceinline__ __device__ __host__ value operator()()
147  {
148  return this->next();
149  }
150 
151  __forceinline__ __device__ __host__ value next()
152  {
153 #if defined(__HIP_PLATFORM_AMD__)
154  value ret = m_state.result.data[m_state.substate];
155 #else
156  value ret = (&m_state.result.x)[m_state.substate];
157 #endif
158  m_state.substate++;
159  if(m_state.substate == 4)
160  {
161  m_state.substate = 0;
162  m_state.counter = this->bump_counter(m_state.counter);
163  m_state.result = this->threefry_rounds(m_state.counter, m_state.key);
164  }
165  return ret;
166  }
167 
168  __forceinline__ __device__ __host__ state_value next4()
169  {
170  state_value ret = m_state.result;
171  m_state.counter = this->bump_counter(m_state.counter);
172  m_state.result = this->threefry_rounds(m_state.counter, m_state.key);
173 
174  return this->interleave(ret, m_state.result);
175  }
176 
177 protected:
178  __forceinline__ __device__ __host__ static state_value threefry_rounds(state_value counter,
179  state_value key)
180  {
181  state_value X;
182  value ks[4 + 1];
183 
184  static_assert(Nrounds <= 72, "72 or less only supported in threefry rounds");
185 
186  ks[4] = skein_ks_parity<value>();
187 
188  ks[0] = key.x;
189  ks[1] = key.y;
190  ks[2] = key.z;
191  ks[3] = key.w;
192 
193  X.x = counter.x;
194  X.y = counter.y;
195  X.z = counter.z;
196  X.w = counter.w;
197 
198  ks[4] ^= key.x;
199  ks[4] ^= key.y;
200  ks[4] ^= key.z;
201  ks[4] ^= key.w;
202 
203  /* Insert initial key before round 0 */
204  X.x += ks[0];
205  X.y += ks[1];
206  X.z += ks[2];
207  X.w += ks[3];
208 
209  for(unsigned int round_idx = 0; round_idx < Nrounds; round_idx++)
210  {
211  int rot_0 = threefry_rotation_array<value>(round_idx & 7u, 0);
212  int rot_1 = threefry_rotation_array<value>(round_idx & 7u, 1);
213  if((round_idx & 2u) == 0)
214  {
215  X.x += X.y;
216  X.y = rotl<value>(X.y, rot_0);
217  X.y ^= X.x;
218  X.z += X.w;
219  X.w = rotl<value>(X.w, rot_1);
220  X.w ^= X.z;
221  }
222  else
223  {
224  X.x += X.w;
225  X.w = rotl<value>(X.w, rot_0);
226  X.w ^= X.x;
227  X.z += X.y;
228  X.y = rotl<value>(X.y, rot_1);
229  X.y ^= X.z;
230  }
231 
232  if((round_idx & 3u) == 3)
233  {
234  unsigned int inject_idx = round_idx / 4;
235  // InjectKey(r = 1 + inject_idx)
236  X.x += ks[(1 + inject_idx) % 5];
237  X.y += ks[(2 + inject_idx) % 5];
238  X.z += ks[(3 + inject_idx) % 5];
239  X.w += ks[(4 + inject_idx) % 5];
240  X.w += 1 + inject_idx;
241  }
242  }
243 
244  return X;
245  }
246 
249  __forceinline__ __device__ __host__ void discard_impl(unsigned long long offset)
250  {
251  // Adjust offset for subset
252  m_state.substate += offset & 3;
253  unsigned long long counter_offset = offset / 4;
254  counter_offset += m_state.substate < 4 ? 0 : 1;
255  m_state.substate += m_state.substate < 4 ? 0 : -4;
256  // Discard states
257  this->discard_state(counter_offset);
258  }
259 
261  __forceinline__ __device__ __host__ void
262  discard_subsequence_impl(unsigned long long subsequence)
263  {
264  value lo, hi;
265  ::rocrand_device::detail::split_ull(lo, hi, subsequence);
266 
267  value old_counter = m_state.counter.z;
268  m_state.counter.z += lo;
269  m_state.counter.w += hi + (m_state.counter.z < old_counter ? 1 : 0);
270  }
271 
274  __forceinline__ __device__ __host__ void discard_state(unsigned long long offset)
275  {
276  value lo, hi;
277  ::rocrand_device::detail::split_ull(lo, hi, offset);
278 
279  state_value old_counter = m_state.counter;
280  m_state.counter.x += lo;
281  m_state.counter.y += hi + (m_state.counter.x < old_counter.x ? 1 : 0);
282  m_state.counter.z += (m_state.counter.y < old_counter.y ? 1 : 0);
283  m_state.counter.w += (m_state.counter.z < old_counter.z ? 1 : 0);
284  }
285 
286  __forceinline__ __device__ __host__ static state_value bump_counter(state_value counter)
287  {
288  counter.x++;
289  value add = counter.x == 0 ? 1 : 0;
290  counter.y += add;
291  add = counter.y == 0 ? add : 0;
292  counter.z += add;
293  add = counter.z == 0 ? add : 0;
294  counter.w += add;
295  return counter;
296  }
297 
298  __forceinline__ __device__ __host__ state_value interleave(const state_value prev,
299  const state_value next) const
300  {
301  switch(m_state.substate)
302  {
303  case 0: return prev;
304  case 1: return state_value{prev.y, prev.z, prev.w, next.x};
305  case 2: return state_value{prev.z, prev.w, next.x, next.y};
306  case 3: return state_value{prev.w, next.x, next.y, next.z};
307  }
308  __builtin_unreachable();
309  }
310 
311 protected:
312  threefry_state_4 m_state;
313 }; // threefry_engine4_base class
314 
315 } // end namespace rocrand_device
316 
317 #endif // ROCRAND_THREEFRY4_IMPL_H_