/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/host/permute_pk_int4.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/host/permute_pk_int4.hpp Source File#

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/develop/include/ck_tile/host/permute_pk_int4.hpp Source File
permute_pk_int4.hpp
Go to the documentation of this file.
1 // Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
2 // SPDX-License-Identifier: MIT
3 #pragma once
5 namespace ck_tile {
6 
27 template <typename Tensor>
28 void permute_vectors_i4x4_b(Tensor& tensor)
29 {
30  auto tensor_row_buf = tensor.data();
31  for(size_t idx = 0; idx < tensor.size(); idx += 4)
32  {
33  int8_t input[8];
34 
35  for(int k = 0; k < 4; k++)
36  {
37  int8_t i4x2 = bit_cast<int8_t>(tensor_row_buf[idx + k]);
38  input[k * 2 + 0] = (i4x2 >> 4) & 0xf;
39  input[k * 2 + 1] = (i4x2 >> 0) & 0xf;
40  }
41 
42  // permute 0x76543210 => 0x75316420
43  {
44  int8_t hi = input[2];
45  int8_t lo = input[0];
46  int8_t i4x2 = (hi << 4) | lo;
47 
48  tensor_row_buf[idx + 0] = bit_cast<pk_int4_t>(i4x2);
49  }
50 
51  {
52  int8_t hi = input[6];
53  int8_t lo = input[4];
54  int8_t i4x2 = (hi << 4) | lo;
55 
56  tensor_row_buf[idx + 1] = bit_cast<pk_int4_t>(i4x2);
57  }
58 
59  {
60  int8_t hi = input[3];
61  int8_t lo = input[1];
62  int8_t i4x2 = (hi << 4) | lo;
63 
64  tensor_row_buf[idx + 2] = bit_cast<pk_int4_t>(i4x2);
65  }
66 
67  {
68  int8_t hi = input[7];
69  int8_t lo = input[5];
70  int8_t i4x2 = (hi << 4) | lo;
71 
72  tensor_row_buf[idx + 3] = bit_cast<pk_int4_t>(i4x2);
73  }
74  }
75 }
76 
77 } // namespace ck_tile
ck_tile
Definition: cluster_descriptor.hpp:13
int8_t int8_t
Definition: int8.hpp:20
void permute_vectors_i4x4_b(Tensor &tensor)
Permute packed int4 vectors for device implementation compatibility.
Definition: permute_pk_int4.hpp:28
Tensor wrapper that performs static and dynamic buffer logic. The tensor is based on a descriptor sto...
Definition: tensor.hpp:214