53 #ifndef ROCRAND_THREEFRY2_IMPL_H_ 
   54 #define ROCRAND_THREEFRY2_IMPL_H_ 
   56 #include "rocrand/rocrand_threefry_common.h" 
   57 #include <rocrand/rocrand_common.h> 
   59 #ifndef THREEFRY2x32_DEFAULT_ROUNDS 
   60     #define THREEFRY2x32_DEFAULT_ROUNDS 20 
   63 #ifndef THREEFRY2x64_DEFAULT_ROUNDS 
   64     #define THREEFRY2x64_DEFAULT_ROUNDS 20 
   67 namespace rocrand_device
 
   71 __forceinline__ __device__ __host__ 
int threefry_rotation_array(
int index) = 
delete;
 
   74 __forceinline__ __device__ __host__ 
int threefry_rotation_array<unsigned int>(
int index)
 
   80     static constexpr 
int THREEFRY_ROTATION_32_2[8] = {13, 15, 26, 6, 17, 29, 16, 24};
 
   81     return THREEFRY_ROTATION_32_2[index];
 
   85 __forceinline__ __device__ __host__ 
int threefry_rotation_array<unsigned long long>(
int index)
 
   91     static constexpr 
int THREEFRY_ROTATION_64_2[8] = {16, 42, 12, 31, 16, 32, 24, 21};
 
   92     return THREEFRY_ROTATION_64_2[index];
 
   95 template<
typename state_value, 
typename value, 
unsigned int Nrounds>
 
   96 class threefry_engine2_base
 
   99     struct threefry_state_2
 
  104         unsigned int substate;
 
  106     using state_type        = threefry_state_2;
 
  107     using state_vector_type = state_value;
 
  109     __forceinline__ __device__ __host__ 
void discard(
unsigned long long offset)
 
  111         this->discard_impl(offset);
 
  112         m_state.result = this->threefry_rounds(m_state.counter, m_state.key);
 
  115     __forceinline__ __device__ __host__ 
void discard()
 
  117         m_state.result = this->threefry_rounds(m_state.counter, m_state.key);
 
  125     __forceinline__ __device__ __host__ 
void discard_subsequence(
unsigned long long subsequence)
 
  127         this->discard_subsequence_impl(subsequence);
 
  128         m_state.result = this->threefry_rounds(m_state.counter, m_state.key);
 
  131     __forceinline__ __device__ __host__ value operator()()
 
  136     __forceinline__ __device__ __host__ value next()
 
  138 #if defined(__HIP_PLATFORM_AMD__) 
  139         value ret = m_state.result.data[m_state.substate];
 
  141         value ret = (&m_state.result.x)[m_state.substate];
 
  144         if(m_state.substate == 2)
 
  146             m_state.substate = 0;
 
  147             m_state.counter  = this->bump_counter(m_state.counter);
 
  148             m_state.result   = this->threefry_rounds(m_state.counter, m_state.key);
 
  153     __forceinline__ __device__ __host__ state_value next2()
 
  155         state_value ret = m_state.result;
 
  156         m_state.counter = this->bump_counter(m_state.counter);
 
  157         m_state.result  = this->threefry_rounds(m_state.counter, m_state.key);
 
  159         return this->interleave(ret, m_state.result);
 
  163     __forceinline__ __device__ __host__ 
static state_value threefry_rounds(state_value counter,
 
  169         static_assert(Nrounds <= 32, 
"32 or less only supported in threefry rounds");
 
  171         ks[2] = skein_ks_parity<value>();
 
  186         for(
unsigned int round_idx = 0; round_idx < Nrounds; round_idx++)
 
  189             X.y = rotl<value>(X.y, threefry_rotation_array<value>(round_idx & 7u));
 
  192             if((round_idx & 3u) == 3)
 
  194                 unsigned int inject_idx = round_idx / 4;
 
  196                 X.x += ks[(1 + inject_idx) % 3];
 
  197                 X.y += ks[(2 + inject_idx) % 3];
 
  198                 X.y += 1 + inject_idx;
 
  207     __forceinline__ __device__ __host__ 
void discard_impl(
unsigned long long offset)
 
  210         m_state.substate += offset & 1;
 
  211         unsigned long long counter_offset = offset / 2;
 
  212         counter_offset += m_state.substate < 2 ? 0 : 1;
 
  213         m_state.substate += m_state.substate < 2 ? 0 : -2;
 
  215         this->discard_state(counter_offset);
 
  219     __forceinline__ __device__ __host__ 
void 
  220         discard_subsequence_impl(
unsigned long long subsequence)
 
  222         m_state.counter.y += subsequence;
 
  227     __forceinline__ __device__ __host__ 
void discard_state(
unsigned long long offset)
 
  230         ::rocrand_device::detail::split_ull(lo, hi, offset);
 
  232         value old_counter = m_state.counter.x;
 
  233         m_state.counter.x += lo;
 
  234         m_state.counter.y += hi + (m_state.counter.x < old_counter ? 1 : 0);
 
  237     __forceinline__ __device__ __host__ 
static state_value bump_counter(state_value counter)
 
  240         value add = counter.x == 0 ? 1 : 0;
 
  245     __forceinline__ __device__ __host__ state_value interleave(
const state_value prev,
 
  246                                                                const state_value next)
 const 
  248         switch(m_state.substate)
 
  251             case 1: 
return state_value{prev.y, next.x};
 
  253         __builtin_unreachable();
 
  257     threefry_state_2 m_state;