53 #ifndef ROCRAND_THREEFRY_COMMON_H_
54 #define ROCRAND_THREEFRY_COMMON_H_
56 #include <hip/hip_runtime.h>
// Builds a 64-bit unsigned value from two 32-bit halves: hi32 is converted to
// unsigned long long and shifted into the upper 32 bits, then lo32 is added.
// NOTE(review): the halves are combined with '+', not '|', so lo32 is assumed
// to be a non-negative value that fits in 32 bits - confirm for any new use.
59 #define SKEIN_MK_64(hi32, lo32) ((lo32) + (((unsigned long long)(hi32)) << 32))
// 64-bit constant returned by skein_ks_parity<unsigned long long>().
// Presumably the Skein/Threefish key-schedule parity constant - confirm
// against the reference specification.
60 #define SKEIN_KS_PARITY64 SKEIN_MK_64(0x1BD11BDA, 0xA9FC1A22)
// 32-bit constant returned by skein_ks_parity<unsigned int>(); its value is
// the same literal that is passed as the upper half of SKEIN_KS_PARITY64.
61 #define SKEIN_KS_PARITY32 0x1BD11BDA
63 namespace rocrand_device
66 template<
typename value>
67 __forceinline__ __device__ __host__
68 value rotl(value x,
int d);
71 __forceinline__ __device__ __host__
72 unsigned long long rotl<unsigned long long>(
unsigned long long x,
int d)
74 return (x << (d & 63)) | (x >> ((64 - d) & 63));
78 __forceinline__ __device__ __host__
79 unsigned int rotl<unsigned int>(
unsigned int x,
int d)
81 return (x << (d & 31)) | (x >> ((32 - d) & 31));
84 template<
typename value>
85 __forceinline__ __device__ __host__
86 value skein_ks_parity();
89 __forceinline__ __device__ __host__
90 unsigned int skein_ks_parity<unsigned int>()
92 return SKEIN_KS_PARITY32;
96 __forceinline__ __device__ __host__
97 unsigned long long skein_ks_parity<unsigned long long>()
99 return SKEIN_KS_PARITY64;
102 template<
class value>
103 __forceinline__ __device__ __host__
104 int threefry_rotation_array(
int index)
108 __forceinline__ __device__ __host__
109 int threefry_rotation_array<unsigned int>(
int index)
115 static constexpr
int THREEFRY_ROTATION_32_2[8] = {13, 15, 26, 6, 17, 29, 16, 24};
116 return THREEFRY_ROTATION_32_2[index];
120 __forceinline__ __device__ __host__
121 int threefry_rotation_array<unsigned long long>(
int index)
127 static constexpr
int THREEFRY_ROTATION_64_2[8] = {16, 42, 12, 31, 16, 32, 24, 21};
128 return THREEFRY_ROTATION_64_2[index];
// Overload of rounds_2 that participates in overload resolution only when
// Nrounds is not a multiple of 4 (std::enable_if_t on Nrounds % 4 != 0).
//   X  - state taken by value; the statements below use its .x and .y members.
//   ks - array of three words of type value, indexed modulo 3
//        (presumably the key-schedule words - confirm against the caller).
// The declared return type is state_value.
131 template<
typename state_value,
typename value,
unsigned int Nrounds>
132 __forceinline__ __device__ __host__
133 auto rounds_2(state_value X, value ks[3]) -> std::enable_if_t<Nrounds % 4 != 0, state_value>
// One iteration per round.
135 for(
unsigned int round_idx = 0; round_idx < Nrounds; round_idx++)
// Rotate X.y left by the distance selected by the round number modulo 8.
138 X.y = rotl<value>(X.y, threefry_rotation_array<value>(round_idx & 7u));
// True for round_idx 3, 7, 11, ... i.e. on the last round of each group of four.
141 if((round_idx & 3u) == 3)
// Zero-based number of the four-round group that has just finished.
143 unsigned int inject_idx = round_idx / 4;
// Add two ks words to the state; the ks indices advance with inject_idx and
// wrap modulo 3.
145 X.x += ks[(1 + inject_idx) % 3];
146 X.y += ks[(2 + inject_idx) % 3];
// Additionally add the one-based group number to X.y.
147 X.y += 1 + inject_idx;
// Overload of rounds_2 that participates in overload resolution only when
// Nrounds is a multiple of 4 (std::enable_if_t on Nrounds % 4 == 0).
//   X  - state taken by value; the statements below use its .x and .y members.
//   ks - array of three words of type value, indexed modulo 3
//        (presumably the key-schedule words - confirm against the caller).
// The declared return type is state_value.
153 template<
typename state_value,
typename value,
unsigned int Nrounds>
154 __forceinline__ __device__ __host__
155 auto rounds_2(state_value X, value ks[3]) -> std::enable_if_t<Nrounds % 4 == 0, state_value>
// Outer loop: one iteration per group of four rounds.
158 for(
unsigned int i = 0; i < Nrounds / 4; i++)
// Number of the first round in this group.
160 unsigned int round_idx = 4 * i;
// Inner loop: the four rounds of the group.
161 for(
unsigned int j = 0; j < 4; j++)
// Rotate X.y left by the distance selected by the absolute round number modulo 8.
164 X.y = rotl<value>(X.y, threefry_rotation_array<value>((round_idx + j) & 7u));
// Add two ks words to the state; the ks indices advance with the group
// number i and wrap modulo 3.
170 X.x += ks[(1 + i) % 3];
171 X.y += ks[(2 + i) % 3];
// Two-index overload of threefry_rotation_array, used by rounds_4.
// Only the explicit specializations below (unsigned int and unsigned long long)
// provide a table; each table is declared as int[8][2].
//   indexX - first subscript (row) into the table; rounds_4 passes the round
//            number masked with 7u.
//   indexY - second subscript (column); rounds_4 passes 0 or 1.
// Returns the table entry at [indexX][indexY].
// NOTE(review): neither subscript is range-checked here, so callers must keep
// indexX in 0..7 and indexY in 0..1.
177 template<
class value>
178 __forceinline__ __device__ __host__
179 int threefry_rotation_array(
int indexX,
int indexY)
// Specialization for 32-bit words.
183 __forceinline__ __device__ __host__
184 int threefry_rotation_array<unsigned int>(
int indexX,
int indexY)
191 static constexpr
int THREEFRY_ROTATION_32_4[8][2] = {
201 return THREEFRY_ROTATION_32_4[indexX][indexY];
// Specialization for 64-bit words.
205 __forceinline__ __device__ __host__
206 int threefry_rotation_array<unsigned long long>(
int indexX,
int indexY)
210 static constexpr
int THREEFRY_ROTATION_64_4[8][2] = {
220 return THREEFRY_ROTATION_64_4[indexX][indexY];
// Overload of rounds_4 that participates in overload resolution only when
// Nrounds is not a multiple of 4 (std::enable_if_t on Nrounds % 4 != 0).
//   X  - state taken by value; the statements below use its .x, .y, .z and .w
//        members.
//   ks - array of five words of type value, indexed modulo 5
//        (presumably the key-schedule words - confirm against the caller).
// The declared return type is state_value.
223 template<
typename state_value,
typename value,
unsigned int Nrounds>
224 __forceinline__ __device__ __host__
225 auto rounds_4(state_value X, value ks[5]) -> std::enable_if_t<Nrounds % 4 != 0, state_value>
// One iteration per round.
227 for(
unsigned int round_idx = 0; round_idx < Nrounds; round_idx++)
// Fetch both rotation distances for this round (row = round number modulo 8).
229 int rot_0 = threefry_rotation_array<value>(round_idx & 7u, 0);
230 int rot_1 = threefry_rotation_array<value>(round_idx & 7u, 1);
// The test looks at bit 1 of round_idx, so its result changes every two rounds.
// NOTE(review): the reference Threefry-4 round function (Random123) swaps the
// rotated words on every round, which would be a test on bit 0 - confirm that
// 2u is intended here.
231 if((round_idx & 2u) == 0)
// Here rot_0 is applied to X.y and rot_1 to X.w.
234 X.y = rotl<value>(X.y, rot_0);
237 X.w = rotl<value>(X.w, rot_1);
// Here the roles are swapped: rot_0 is applied to X.w and rot_1 to X.y.
243 X.w = rotl<value>(X.w, rot_0);
246 X.y = rotl<value>(X.y, rot_1);
// True for round_idx 3, 7, 11, ... i.e. on the last round of each group of four.
250 if((round_idx & 3u) == 3)
// Zero-based number of the four-round group that has just finished.
252 unsigned int inject_idx = round_idx / 4;
// Add one ks word to each state word; the ks indices advance with inject_idx
// and wrap modulo 5.
254 X.x += ks[(1 + inject_idx) % 5];
255 X.y += ks[(2 + inject_idx) % 5];
256 X.z += ks[(3 + inject_idx) % 5];
257 X.w += ks[(4 + inject_idx) % 5];
// Additionally add the one-based group number to X.w.
258 X.w += 1 + inject_idx;
264 template<
typename state_value,
typename value,
unsigned int Nrounds>
265 __forceinline__ __device__ __host__
266 auto rounds_4(state_value X, value ks[5]) -> std::enable_if_t<Nrounds % 4 == 0, state_value>
269 for(
unsigned int i = 0; i < Nrounds / 4; i++)
271 unsigned int round_idx = 4 * i;
278 X.y = rotl<value>(X.y, threefry_rotation_array<value>((round_idx + j) & 7u, 0));
281 X.w = rotl<value>(X.w, threefry_rotation_array<value>((round_idx + j) & 7u, 1));
289 X.w = rotl<value>(X.w, threefry_rotation_array<value>((round_idx + j) & 7u, 0));
292 X.y = rotl<value>(X.y, threefry_rotation_array<value>((round_idx + j) & 7u, 1));
298 X.x += ks[(1 + i) % 5];
299 X.y += ks[(2 + i) % 5];
300 X.z += ks[(3 + i) % 5];
301 X.w += ks[(4 + i) % 5];