hipBLASLtExt API reference#

hipBLASLt contains extension APIs in the hipblaslt_ext namespace. They are C++ compatible only. The extensions support the following:

  1. GEMM

  2. Grouped GEMM

  3. Get all algorithms

hipBLASLtExt datatypes reference#

GemmType#

enum class hipblaslt_ext::GemmType#

This is an enumerated type used to specify the type of the GEMM problem in hipBLASLtExt APIs.

Values:

enumerator HIPBLASLT_GEMM#
enumerator HIPBLASLT_GROUPED_GEMM#

GemmProblemType#

class GemmProblemType#

hipBLASLt extension for the ProblemType for GEMM problems.

This structure sets the problem type of a GEMM problem.

Public Functions

void setOpA(hipblasOperation_t op)#

Set the A matrix transpose.

void setOpB(hipblasOperation_t op)#

Set the B matrix transpose.

void setTypeA(hipDataType type)#

Set the A matrix data type.

void setTypeB(hipDataType type)#

Set the B matrix data type.

void setTypeC(hipDataType type)#

Set the C matrix data type.

void setTypeD(hipDataType type)#

Set the D matrix data type.

void setTypeCompute(hipblasComputeType_t type)#

Set the compute data type.

void setOrderA(hipblasLtOrder_t order)#

Set the A matrix data order.

void setOrderB(hipblasLtOrder_t order)#

Set the B matrix data order.

hipblasOperation_t getOpA() const#

The A matrix transpose.

hipblasOperation_t getOpB() const#

The B matrix transpose.

hipDataType getTypeA() const#

The A matrix data type.

hipDataType getTypeB() const#

The B matrix data type.

hipDataType getTypeC() const#

The C matrix data type.

hipDataType getTypeD() const#

The D matrix data type.

hipblasComputeType_t getTypeCompute() const#

The compute data type.

hipblasLtOrder_t getOrderA() const#

The A matrix data order.

hipblasLtOrder_t getOrderB() const#

The B matrix data order.
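
For example, here is a minimal sketch of describing an FP16 GEMM with FP32 accumulation using the setters above (the operation, data type, and compute type enums come from the hipBLAS and HIP headers):

hipblaslt_ext::GemmProblemType problem;
problem.setOpA(HIPBLAS_OP_N);
problem.setOpB(HIPBLAS_OP_T);
problem.setTypeA(HIP_R_16F);
problem.setTypeB(HIP_R_16F);
problem.setTypeC(HIP_R_16F);
problem.setTypeD(HIP_R_16F);
problem.setTypeCompute(HIPBLAS_COMPUTE_32F);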

GemmEpilogue#

class GemmEpilogue#

hipBLASLt extension for the epilogue for GEMM problems.

This class sets the epilogue of a GEMM problem.

Public Functions

void setMode(hipblasLtEpilogue_t mode)#

Set the mode of the epilogue. The default is a plain GEMM (HIPBLASLT_EPILOGUE_DEFAULT).

void setBiasDataType(hipDataType biasDataType)#

Set the bias data type. Only works if the mode is set to bias-related epilogues.

void setAuxDataType(hipDataType auxDataType)#

Set the aux data type. Only works if the mode is set to aux-related epilogues.

void setAuxLeadingDimension(int auxLeadingDimension)#

Set the aux leading dimension. Only works if the mode is set to aux-related epilogues.

void setAuxBatchStride(int auxBatchStride)#

Set the aux batch stride. Only works if the mode is set to aux-related epilogues.

void setScalingAType(hipblasLtMatmulMatrixScale_t scalingAType)#

Set the A matrix scaling type (0 is scalar, 1 is vector). Only works if DataTypeA = DataTypeB = FP8.

void setScalingBType(hipblasLtMatmulMatrixScale_t scalingBType)#

Set the B matrix scaling type (0 is scalar, 1 is vector). Only works if DataTypeA = DataTypeB = FP8.

void setAct0(float act0)#

Set the first extra argument for the activation function.

void setAct1(float act1)#

Set the second extra argument for the activation function.

hipblasLtEpilogue_t getMode() const#

The mode of the epilogue. The default is a plain GEMM (HIPBLASLT_EPILOGUE_DEFAULT).

hipDataType getBiasDataType() const#

The bias data type. Only works if the mode is set to bias-related epilogues.

hipDataType getAuxDataType() const#

The aux data type. Only works if the mode is set to aux-related epilogues.

int getAuxLeadingDimension() const#

The aux leading dimension. Only works if the mode is set to aux-related epilogues.

int getAuxBatchStride() const#

The aux batch stride. Only works if the mode is set to aux-related epilogues.

hipblasLtMatmulMatrixScale_t getScalingAType() const#

0 is scalar. 1 is vector. Only works if DataTypeA = DataTypeB = FP8.

hipblasLtMatmulMatrixScale_t getScalingBType() const#

0 is scalar. 1 is vector. Only works if DataTypeA = DataTypeB = FP8.

float getAct0()#

First extra argument for the activation function.

float getAct1()#

Second extra argument for the activation function.
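
For example, here is a minimal sketch of configuring a bias epilogue, assuming the HIPBLASLT_EPILOGUE_BIAS mode from hipblaslt.h and an FP16 bias vector:

hipblaslt_ext::GemmEpilogue epilogue;
epilogue.setMode(HIPBLASLT_EPILOGUE_BIAS); // a bias-related mode
epilogue.setBiasDataType(HIP_R_16F);       // only applies to bias-related modes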

GemmInputs#

class GemmInputs#

hipBLASLt extension for inputs for GEMM problems.

This class sets the input pointers of a GEMM problem.

Public Functions

void setA(const void *a)#

Set the A matrix input pointer.

void setB(const void *b)#

Set the B matrix input pointer.

void setC(const void *c)#

Set the C matrix input pointer.

void setD(const void *d)#

Set the D matrix input pointer.

void setAlpha(const void *alpha)#

Set the alpha value.

void setBeta(const void *beta)#

Set the beta value.

void setBias(const void *bias)#

Set the bias input pointer.

void setScaleA(const void *scaleA)#

Set the scale A input pointer.

void setScaleB(const void *scaleB)#

Set the scale B input pointer.

void setScaleC(const void *scaleC)#

Set the scale C input pointer.

void setScaleD(const void *scaleD)#

Set the scale D input pointer.

void setScaleAux(const void *scaleAux)#

Set the scale aux input pointer.

void setScaleAlphaVec(const void *scaleAlphaVec)#

Set the scaleAlpha vector input pointer.

void setAux(const void *aux)#

Set the aux input pointer.

void setAmaxD(const void *amaxD)#

Set the AmaxD input pointer.

const void *getA() const#

The A matrix input pointer.

const void *getB() const#

The B matrix input pointer.

const void *getC() const#

The C matrix input pointer.

const void *getD() const#

The D matrix input pointer.

const void *getAlpha() const#

The alpha value.

const void *getBeta() const#

The beta value.

const void *getBias() const#

The bias input pointer.

const void *getScaleA() const#

The scale A input pointer.

const void *getScaleB() const#

The scale B input pointer.

const void *getScaleC() const#

The scale C input pointer.

const void *getScaleD() const#

The scale D input pointer.

const void *getScaleAux() const#

The scale aux input pointer.

const void *getScaleAlphaVec() const#

The scaleAlpha vector input pointer.

const void *getAux() const#

The aux input pointer.

const void *getAmaxD() const#

The AmaxD input pointer.
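
For example, here is a minimal sketch of wiring up the pointers for D = alpha * (A * B) + beta * C with a bias vector, where d_a, d_b, d_c, d_d, and d_bias are assumed device allocations and alpha and beta are host scalars:

hipblaslt_ext::GemmInputs inputs;
inputs.setA(d_a);
inputs.setB(d_b);
inputs.setC(d_c);
inputs.setD(d_d);
inputs.setAlpha(&alpha);
inputs.setBeta(&beta);
inputs.setBias(d_bias); // only used with bias-related epilogue modes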

hipBLASLtExt GEMM class reference#

GemmPreference#

class GemmPreference#

hipBLASLt extension for the preference for GEMM problems.

Currently only supports setting the maximum workspace size.

Public Functions

void setMaxWorkspaceBytes(size_t workspaceBytes)#

This function sets the maximum workspace size.

Parameters:

workspaceBytes[in] The maximum workspace size in bytes.

const size_t getMaxWorkspaceBytes() const#

This function returns the maximum workspace size that was set.

Return values:

size_t – The maximum workspace size that was set.

GemmInstance#

class GemmInstance#

hipBLASLt extension for an instance of a GEMM problem.

Subclassed by hipblaslt_ext::Gemm, hipblaslt_ext::GroupedGemm

Public Functions

hipblasStatus_t algoGetHeuristic(const int requestedAlgoCount, const GemmPreference &pref, std::vector<hipblasLtMatmulHeuristicResult_t> &heuristicResults)#

Retrieve the possible algorithms.

This function retrieves the possible algorithms for the hipblasLtMatmul() matrix multiply operation with the given data and compute types. The output is placed in heuristicResults in order of increasing estimated compute time.

Parameters:
  • requestedAlgoCount[in] The number of requested algorithms.

  • pref[in] hipBLASLt extension preference for GEMM problems.

  • heuristicResults[out] The algorithm heuristic vector.

Return values:
  • HIPBLAS_STATUS_SUCCESS – If the query was successful. Check that heuristicResults.size() > 0. Note that heuristicResults.size() < requestedAlgoCount is still a valid state for this status.

  • HIPBLAS_STATUS_NOT_SUPPORTED – If no heuristic function is available for the current configuration.

  • HIPBLAS_STATUS_INVALID_VALUE – If no solution is found.
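
For example, here is a minimal sketch of requesting up to 10 candidates, where gemm is assumed to be a hipblaslt_ext::Gemm instance whose problem has already been set:

hipblaslt_ext::GemmPreference pref;
pref.setMaxWorkspaceBytes(32 * 1024 * 1024);
std::vector<hipblasLtMatmulHeuristicResult_t> results;
if(gemm.algoGetHeuristic(10, pref, results) == HIPBLAS_STATUS_SUCCESS
   && !results.empty())
{
    // results.size() can be smaller than 10; results[0] is the
    // estimated-fastest candidate.
}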

hipblasStatus_t isAlgoSupported(hipblasLtMatmulAlgo_t &algo, size_t &workspaceSizeInBytes)#

Check whether the algorithm supports the problem (for the hipBLASLt extension API).

This function updates the problem saved inside the algorithm if the problem is supported. The required workspaceSizeInBytes is also returned.

Parameters:
  • algo[in] The algorithm heuristic.

  • workspaceSizeInBytes[out] Returns the required workspace size.

Return values:
  • HIPBLAS_STATUS_SUCCESS – If the query was successful. The problem is supported by the algorithm.

  • HIPBLAS_STATUS_INVALID_VALUE – The problem is not supported.

hipblasStatus_t isAlgoSupported(hipblasLtMatmulAlgo_t &algo, GemmTuning &tuning, size_t &workspaceSizeInBytes)#

Check whether the algorithm supports the problem (for the hipBLASLt extension API).

This function updates the problem saved inside the algorithm if the problem is supported. The required workspaceSizeInBytes is also returned.

Parameters:
  • algo[in] The algorithm heuristic.

  • tuning[in] The tuning parameters.

  • workspaceSizeInBytes[out] Returns the required workspace size.

Return values:
  • HIPBLAS_STATUS_SUCCESS – If the query was successful. The problem is supported by the algorithm.

  • HIPBLAS_STATUS_INVALID_VALUE – The problem is not supported.

void setMaxWorkspaceBytes(size_t workspaceBytes)#

This function sets the maximum workspace size.

Parameters:

workspaceBytes[in] The maximum workspace size in bytes.

const size_t getMaxWorkspaceBytes() const#

This function returns the maximum workspace size that was set.

Return values:

size_t – Returns the maximum workspace size that was set.

hipblasStatus_t initialize(const hipblasLtMatmulAlgo_t &algo, void *workspace, bool useUserArgs = true, hipStream_t stream = 0)#

Create kernel arguments from a given hipblaslt_ext::GemmInstance.

This function creates kernel arguments from a given hipblaslt_ext::GemmInstance, then saves the arguments inside the instance.

Parameters:
  • algo[in] Handle for the matrix multiplication algorithm to be used. See hipblasLtMatmulAlgo_t in hipblaslt.h. When NULL, an implicit heuristics query with default search preferences will be performed to determine the actual algorithm to use.

  • workspace[in] Pointer to the workspace buffer allocated in the GPU memory. The pointer must be 16B aligned (that is, the lowest 4 bits of the address must be 0).

  • useUserArgs[in] Use user args. This does not affect standard GEMM. (This parameter might be deprecated in the future.)

  • stream[in] The HIP stream where all the GPU work will be submitted. (This parameter might be deprecated in the future.)

Return values:
  • HIPBLAS_STATUS_SUCCESS – If the operation completed successfully.

  • HIPBLAS_STATUS_INVALID_VALUE – If gemm_count is 0, or if workspace is null while workspaceBytes is greater than zero. Note that workspaceBytes should be set with setMaxWorkspaceBytes.

hipblasStatus_t initialize(const hipblasLtMatmulAlgo_t &algo, GemmTuning &tuning, void *workspace, bool useUserArgs = true, hipStream_t stream = 0)#

Create kernel arguments from a given hipblaslt_ext::GemmInstance.

This function creates kernel arguments from a given hipblaslt_ext::GemmInstance, then saves the arguments inside the instance.

Parameters:
  • algo[in] Handle for matrix multiplication algorithm to be used. See hipblasLtMatmulAlgo_t in hipblaslt.h. When NULL, an implicit heuristics query with default search preferences will be performed to determine the actual algorithm to use.

  • tuning[in] Structure with user tuning parameters. Note that not every algorithm supports user tuning parameters. Will return HIPBLAS_STATUS_INVALID_VALUE if not supported.

  • workspace[in] Pointer to the workspace buffer allocated in the GPU memory. Pointer must be 16B aligned (that is, the lowest 4 bits of the address must be 0).

  • useUserArgs[in] Use user args. This does not affect standard GEMM. (This parameter might be deprecated in the future.)

  • stream[in] The HIP stream where all the GPU work will be submitted. (This parameter might be deprecated in the future.)

Return values:
  • HIPBLAS_STATUS_SUCCESS – If the operation completed successfully.

  • HIPBLAS_STATUS_INVALID_VALUE – If gemm_count is 0, or if workspace is null while workspaceBytes is greater than zero. Note that workspaceBytes should be set with setMaxWorkspaceBytes.

hipblasStatus_t run(hipStream_t stream, hipEvent_t start = nullptr, hipEvent_t stop = nullptr)#

Execute the kernel arguments stored inside hipblaslt_ext::GemmInstance.

Parameters:
  • stream[in] The HIP stream where all the GPU work will take place.

  • start[in] The HIP event which will record the start of the kernel.

  • stop[in] The HIP event which will record the end of the submitted kernel.

Return values:

HIPBLAS_STATUS_SUCCESS – If the operation completed successfully.
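
For example, here is a minimal sketch of timing a launch with the optional events, using the standard HIP event APIs (gemm is assumed to be an initialized instance):

hipEvent_t start, stop;
hipEventCreate(&start);
hipEventCreate(&stop);
gemm.run(stream, start, stop);
hipEventSynchronize(stop);
float ms = 0.0f;
hipEventElapsedTime(&ms, start, stop); // elapsed kernel time in milliseconds
hipEventDestroy(start);
hipEventDestroy(stop);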

Protected Functions

explicit GemmInstance(hipblasLtHandle_t handle, GemmType type)#

Constructor of GemmInstance.

Gemm#

class Gemm : public hipblaslt_ext::GemmInstance#

hipBLASLt extension for a GEMM instance.

The instance can be used to create arguments to compute the matrix multiplication of matrices A and B to produce the output matrix D, according to the operation D = alpha * (A * B) + beta * C, where A, B, and C are input matrices, and alpha and beta are input scalars.

Public Functions

explicit Gemm(hipblasLtHandle_t handle, hipblasOperation_t opA, hipblasOperation_t opB, hipDataType typeA, hipDataType typeB, hipDataType typeC, hipDataType typeD, hipblasComputeType_t typeCompute)#

Constructor.

This constructor sets up the problem from hipBLASLt structures. For more information about the structures, see hipblasLtMatmul.

Parameters:
  • handle[in] The handle from hipBLASLt.

  • opA, opB[in] The transpose type of matrices A and B.

  • typeA, typeB, typeC, typeD[in] The data type of matrices A, B, C, and D.

  • typeCompute[in] The compute type of the GEMM problem.

explicit Gemm(hipblasLtHandle_t handle, hipblasLtMatmulDesc_t matmul_descr, const void *alpha, const void *A, hipblasLtMatrixLayout_t matA, const void *B, hipblasLtMatrixLayout_t matB, const void *beta, const void *C, hipblasLtMatrixLayout_t matC, void *D, hipblasLtMatrixLayout_t matD)#

Constructor that sets the GEMM problem from hipBLASLt structures.

This constructor sets up the problem from hipBLASLt structures. For more information about the structures, see hipblasLtMatmul.

Parameters:
  • handle[in] The handle from hipBLASLt.

  • matmul_descr[in] Handle to a previously created matrix multiplication descriptor of type hipblasLtMatmulDesc_t.

  • alpha, beta[in] Pointers to the scalars used in the multiplication.

  • matA, matB, matC, matD[in] Handles to the previously created matrix layout descriptors of type hipblasLtMatrixLayout_t.

  • A, B, C[in] Pointers to the GPU memory associated with the corresponding descriptors matA, matB, and matC.

  • D[out] Pointer to the GPU memory associated with the descriptor matD.

hipblasStatus_t setProblem(int64_t m, int64_t n, int64_t k, int64_t batch_count, GemmEpilogue &epilogue, GemmInputs &inputs)#

Sets up a GEMM problem.

This function sets up the problem using m, n, k, and batch_count. It uses the problem type variables from the constructor.

Parameters:
  • m, n, k[in] The problem size.

  • batch_count[in] The batch count.

  • epilogue[in] The class that controls the epilogue.

  • inputs[in] The inputs of the problem.

Return values:
  • HIPBLAS_STATUS_SUCCESS – If the operation completed successfully.

  • HIPBLAS_STATUS_EXECUTION_FAILED – If HIP reported an execution error from the device.

  • HIPBLAS_STATUS_ARCH_MISMATCH – If the configured operation cannot be run using the selected device.

  • HIPBLAS_STATUS_NOT_SUPPORTED – If the current implementation on the selected device doesn’t support the configured operation.

  • HIPBLAS_STATUS_INVALID_VALUE – If the parameters are unexpectedly NULL, in conflict, or in an impossible configuration.

  • HIPBLAS_STATUS_NOT_INITIALIZED – If the hipBLASLt handle has not been initialized.

hipblasStatus_t setProblem(int64_t m, int64_t n, int64_t k, int64_t batch_count, int64_t lda, int64_t ldb, int64_t ldc, int64_t ldd, int64_t strideA, int64_t strideB, int64_t strideC, int64_t strideD, GemmEpilogue &epilogue, GemmInputs &inputs, GemmProblemType &problemtype)#

Sets up a GEMM problem.

This function sets up the problem using m, n, k, and batch_count. It uses a larger collection of problem type variables from the constructor.

Parameters:
  • m, n, k[in] The problem size.

  • batch_count[in] The batch count.

  • lda, ldb, ldc, ldd[in] The leading dimensions of the matrix.

  • strideA, strideB, strideC, strideD[in] The batch stride of the matrix.

  • epilogue[in] The structure that controls the epilogue.

  • inputs[in] The inputs of the problem.

  • problemtype[in] The structure that sets the problem type of a GEMM problem.

Return values:
  • HIPBLAS_STATUS_SUCCESS – If the operation completed successfully.

  • HIPBLAS_STATUS_EXECUTION_FAILED – If HIP reported an execution error from the device.

  • HIPBLAS_STATUS_ARCH_MISMATCH – If the configured operation cannot be run using the selected device.

  • HIPBLAS_STATUS_NOT_SUPPORTED – If the current implementation on the selected device doesn’t support the configured operation.

  • HIPBLAS_STATUS_INVALID_VALUE – If the parameters are unexpectedly NULL, in conflict, or in an impossible configuration.

  • HIPBLAS_STATUS_NOT_INITIALIZED – If the hipBLASLt handle has not been initialized.

hipblasStatus_t setProblem(hipblasLtMatmulDesc_t matmul_descr, const void *alpha, const void *A, hipblasLtMatrixLayout_t matA, const void *B, hipblasLtMatrixLayout_t matB, const void *beta, const void *C, hipblasLtMatrixLayout_t matC, void *D, hipblasLtMatrixLayout_t matD)#

Sets up a GEMM problem from the hipBLASLt structures.

This function sets up the GEMM problem using the hipBLASLt structures. For more information about the structures, see hipblasLtMatmul.

Parameters:
  • matmul_descr[in] Handle to a previously created matrix multiplication descriptor of type hipblasLtMatmulDesc_t.

  • alpha, beta[in] Pointers to the scalars used in the multiplication.

  • matA, matB, matC, matD[in] Handles to the previously created matrix layout descriptors of type hipblasLtMatrixLayout_t.

  • A, B, C[in] Pointers to the GPU memory associated with the corresponding descriptors matA, matB, and matC.

  • D[out] Pointer to the GPU memory associated with the descriptor matD.

Return values:
  • HIPBLAS_STATUS_SUCCESS – If the operation completed successfully.

  • HIPBLAS_STATUS_EXECUTION_FAILED – If HIP reported an execution error from the device.

  • HIPBLAS_STATUS_ARCH_MISMATCH – If the configured operation cannot be run using the selected device.

  • HIPBLAS_STATUS_NOT_SUPPORTED – If the current implementation on the selected device doesn’t support the configured operation.

  • HIPBLAS_STATUS_INVALID_VALUE – If the parameters are unexpectedly NULL, in conflict, or in an impossible configuration.

  • HIPBLAS_STATUS_NOT_INITIALIZED – If the hipBLASLt handle has not been initialized.

GroupedGemm#

class GroupedGemm : public hipblaslt_ext::GemmInstance#

hipBLASLt extension for a grouped GEMM instance.

The instance can be used to create arguments to compute the matrix multiplication of matrices A and B to produce the output matrix D, according to the operation D = alpha * (A * B) + beta * C, where A, B, and C are input matrices, and alpha and beta are input scalars.

Public Functions

explicit GroupedGemm(hipblasLtHandle_t handle, hipblasOperation_t opA, hipblasOperation_t opB, hipDataType typeA, hipDataType typeB, hipDataType typeC, hipDataType typeD, hipblasComputeType_t typeCompute)#

Constructor.

This function sets up the grouped GEMM problem from hipBLASLt structures. For more information about the structures, see hipblasLtMatmul.

Parameters:
  • handle[in] The handle from hipBLASLt.

  • opA, opB[in] The transpose type of matrices A and B.

  • typeA, typeB, typeC, typeD[in] The data type of matrices A, B, C, and D.

  • typeCompute[in] The compute type of the GEMM problem.

explicit GroupedGemm(hipblasLtHandle_t handle, std::vector<hipblasLtMatmulDesc_t> &matmul_descr, std::vector<void*> &alpha, std::vector<void*> &A, std::vector<hipblasLtMatrixLayout_t> &matA, std::vector<void*> &B, std::vector<hipblasLtMatrixLayout_t> &matB, std::vector<void*> &beta, std::vector<void*> &C, std::vector<hipblasLtMatrixLayout_t> &matC, std::vector<void*> &D, std::vector<hipblasLtMatrixLayout_t> &matD)#

Constructor that sets up the grouped GEMM problem using hipBLASLt structures.

This constructor sets up the grouped GEMM problem from hipBLASLt structures. For more information about the structures, see hipblasLtMatmul.

Parameters:
  • handle[in] The handle from hipBLASLt.

  • matmul_descr[in] Vectors of handles to a previously created matrix multiplication descriptor of type hipblasLtMatmulDesc_t.

  • alpha, beta[in] Vectors of type float used in the multiplication.

  • matA, matB, matC, matD[in] Vectors of handles to the previously created matrix layout descriptors of type hipblasLtMatrixLayout_t.

  • A, B, C[in] Vectors of pointers to the GPU memory associated with the corresponding descriptors matA, matB, and matC .

  • D[out] Vector of pointers to the GPU memory associated with the descriptor matD.

hipblasStatus_t setProblem(std::vector<int64_t> &m, std::vector<int64_t> &n, std::vector<int64_t> &k, std::vector<int64_t> &batch_count, std::vector<GemmEpilogue> &epilogue, std::vector<GemmInputs> &inputs)#

Sets up a GEMM problem.

This function sets up the problem using m, n, k, and batch_count. It uses the problem type variables from the constructor.

Parameters:
  • m, n, k[in] The problem size vector.

  • batch_count[in] The batch count vector.

  • epilogue[in] The structure vector that controls the epilogue.

  • inputs[in] The inputs vector of the problem.

Return values:
  • HIPBLAS_STATUS_SUCCESS – If the operation completed successfully.

  • HIPBLAS_STATUS_EXECUTION_FAILED – If HIP reported an execution error from the device.

  • HIPBLAS_STATUS_ARCH_MISMATCH – If the configured operation cannot be run using the selected device.

  • HIPBLAS_STATUS_NOT_SUPPORTED – If the current implementation on the selected device doesn’t support the configured operation.

  • HIPBLAS_STATUS_INVALID_VALUE – If the parameters are unexpectedly NULL, in conflict, or in an impossible configuration.

  • HIPBLAS_STATUS_NOT_INITIALIZED – If the hipBLASLt handle has not been initialized.

hipblasStatus_t setProblem(std::vector<int64_t> &m, std::vector<int64_t> &n, std::vector<int64_t> &k, std::vector<int64_t> &batch_count, std::vector<int64_t> &lda, std::vector<int64_t> &ldb, std::vector<int64_t> &ldc, std::vector<int64_t> &ldd, std::vector<int64_t> &strideA, std::vector<int64_t> &strideB, std::vector<int64_t> &strideC, std::vector<int64_t> &strideD, std::vector<GemmEpilogue> &epilogue, std::vector<GemmInputs> &inputs, GemmProblemType &problemtype)#

Sets up a GEMM problem.

This function sets up the problem using m, n, k, and batch_count. It uses the problem type variables from the constructor.

Parameters:
  • m, n, k[in] The problem size vector.

  • batch_count[in] The batch count vector.

  • lda, ldb, ldc, ldd[in] The leading dimensions vector of the matrix.

  • strideA, strideB, strideC, strideD[in] The batch stride vector of the matrix.

  • epilogue[in] The structure vector that controls the epilogue.

  • inputs[in] The inputs vector of the problem.

  • problemtype[in] The structure that sets the problem type of a GEMM problem.

Return values:
  • HIPBLAS_STATUS_SUCCESS – If the operation completed successfully.

  • HIPBLAS_STATUS_EXECUTION_FAILED – If HIP reported an execution error from the device.

  • HIPBLAS_STATUS_ARCH_MISMATCH – If the configured operation cannot be run using the selected device.

  • HIPBLAS_STATUS_NOT_SUPPORTED – If the current implementation on the selected device doesn’t support the configured operation.

  • HIPBLAS_STATUS_INVALID_VALUE – If the parameters are unexpectedly NULL, in conflict, or in an impossible configuration.

  • HIPBLAS_STATUS_NOT_INITIALIZED – If the hipBLASLt handle has not been initialized.

hipblasStatus_t setProblem(std::vector<hipblasLtMatmulDesc_t> &matmul_descr, std::vector<void*> &alpha, std::vector<void*> &A, std::vector<hipblasLtMatrixLayout_t> &matA, std::vector<void*> &B, std::vector<hipblasLtMatrixLayout_t> &matB, std::vector<void*> &beta, std::vector<void*> &C, std::vector<hipblasLtMatrixLayout_t> &matC, std::vector<void*> &D, std::vector<hipblasLtMatrixLayout_t> &matD)#

Sets up the grouped GEMM problem from hipBLASLt structures.

This function sets up the grouped GEMM problem from hipBLASLt structures. For more information about the structures, see hipblasLtMatmul.

Parameters:
  • matmul_descr[in] Vectors of handle to a previously created matrix multiplication descriptor of type hipblasLtMatmulDesc_t.

  • alpha, beta[in] Vectors of type float used in the multiplication.

  • matA, matB, matC, matD[in] Vectors of handles to the previously created matrix layout descriptors of type hipblasLtMatrixLayout_t.

  • A, B, C[in] Vectors of pointers to the GPU memory associated with the corresponding descriptors matA, matB, and matC.

  • D[out] Vector of pointers to the GPU memory associated with the descriptor matD.

Return values:
  • HIPBLAS_STATUS_SUCCESS – If the operation completed successfully.

  • HIPBLAS_STATUS_EXECUTION_FAILED – If HIP reported an execution error from the device.

  • HIPBLAS_STATUS_ARCH_MISMATCH – If the configured operation cannot be run using the selected device.

  • HIPBLAS_STATUS_NOT_SUPPORTED – If the current implementation on the selected device doesn’t support the configured operation.

  • HIPBLAS_STATUS_INVALID_VALUE – If the parameters are unexpectedly NULL, in conflict, or in an impossible configuration.

  • HIPBLAS_STATUS_NOT_INITIALIZED – If the hipBLASLt handle has not been initialized.

hipblasStatus_t getDefaultValueForDeviceUserArguments(void *hostDeviceUserArgs)#

A helper function to initialize DeviceUserArguments using the set problems saved in the GEMM object.

Parameters:

hostDeviceUserArgs[in] The DeviceUserArguments structure allocated in the host. The correct type must be used for DeviceUserArguments.

Return values:

HIPBLAS_STATUS_SUCCESS – If the operation completed successfully.

hipblasStatus_t run(void *deviceUserArgs, hipStream_t stream)#

Run the kernel using DeviceUserArguments.

Parameters:
  • deviceUserArgs[in] Pointer to the DeviceUserArguments buffer allocated in the GPU memory. The pointer must be 16B aligned (that is, the lowest 4 bits of the address must be 0).

  • stream[in] The HIP stream where all the GPU work will be submitted.

Return values:
  • HIPBLAS_STATUS_SUCCESS – If the operation completed successfully.

  • HIPBLAS_STATUS_INVALID_VALUE – If gemm_count is 0.

hipblasStatus_t run(hipStream_t stream, hipEvent_t start = nullptr, hipEvent_t stop = nullptr)#

Execute the kernel arguments stored inside hipblaslt_ext::GemmInstance.

Parameters:
  • stream[in] The HIP stream where all the GPU work will take place.

  • start[in] The HIP event which will record the start of the kernel.

  • stop[in] The HIP event which will record the end of the submitted kernel.

Return values:

HIPBLAS_STATUS_SUCCESS – If the operation completed successfully.

hipBLASLtExt function reference#

getAllAlgos()#

hipblasStatus_t hipblaslt_ext::getAllAlgos(hipblasLtHandle_t handle, GemmType typeGemm, hipblasOperation_t opA, hipblasOperation_t opB, hipDataType typeA, hipDataType typeB, hipDataType typeC, hipDataType typeD, hipblasComputeType_t typeCompute, std::vector<hipblasLtMatmulHeuristicResult_t> &heuristicResults)#

Retrieve the possible algorithms.

This function retrieves the possible algorithms for the hipblasLtMatmul() matrix multiply operation with the given data and compute types. The output is placed in heuristicResults in order of increasing estimated compute time. Use matmulIsAlgoSupported() to check whether an algorithm supports the problem before executing hipblasLtMatmul().

Parameters:
  • handle[in] Pointer to the allocated hipBLASLt handle for the hipBLASLt context. See hipblasLtHandle_t.

  • typeGemm[in] Gemm type, for instance, GEMM or GROUPED_GEMM.

  • opA, opB[in] Transpose settings for A and B.

  • typeA, typeB, typeC, typeD[in] The data type of matrices A, B, C, and D.

  • typeCompute[in] The compute type.

  • heuristicResults[out] The algorithm heuristic vector.

Return values:
  • HIPBLAS_STATUS_SUCCESS – If the query was successful. Check that heuristicResults.size() > 0 to determine the status of the results.

  • HIPBLAS_STATUS_NOT_SUPPORTED – If no heuristic function is available for the current configuration.

  • HIPBLAS_STATUS_INVALID_VALUE – If no solution is found.

getIndexFromAlgo()#

int hipblaslt_ext::getIndexFromAlgo(hipblasLtMatmulAlgo_t &algo)#

Retrieve the algorithm index.

Parameters:

algo[in] The algorithm.

Return values:

int – The index of the algorithm, which can be used to get the heuristic results from getAlgosFromIndex. Note that the index might not be valid if the algorithm struct is not initialized properly.

getAlgosFromIndex()#

hipblasStatus_t hipblaslt_ext::getAlgosFromIndex(hipblasLtHandle_t handle, std::vector<int> &algoIndex, std::vector<hipblasLtMatmulHeuristicResult_t> &heuristicResults)#

Retrieve the possible algorithms.

This function retrieves the possible algorithms for the hipblasLtMatmul() matrix multiply operation with the given indices. The output is placed in heuristicResults in order of increasing estimated compute time. A specific solution index cannot be used across different versions of the library. Use matmulIsAlgoSupported() to check whether the algorithm supports the problem before executing hipblasLtMatmul().

Parameters:
  • handle[in] Pointer to the allocated hipBLASLt handle for the hipBLASLt context. See hipblasLtHandle_t.

  • algoIndex[in] The algorithm index vector.

  • heuristicResults[out] The algorithm heuristic vector.

Return values:
  • HIPBLAS_STATUS_SUCCESS – If the query was successful. Check that heuristicResults.size() > 0 to determine the status of the results.

  • HIPBLAS_STATUS_NOT_SUPPORTED – If no heuristic function is available for the current configuration.

  • HIPBLAS_STATUS_INVALID_VALUE – If the queried indices are all out of bounds of the solution map.

matmulIsAlgoSupported()#

hipblasStatus_t hipblaslt_ext::matmulIsAlgoSupported(hipblasLtHandle_t handle, hipblasLtMatmulDesc_t matmulDesc, const void *alpha, hipblasLtMatrixLayout_t Adesc, hipblasLtMatrixLayout_t Bdesc, const void *beta, hipblasLtMatrixLayout_t Cdesc, hipblasLtMatrixLayout_t Ddesc, hipblasLtMatmulAlgo_t &algo, size_t &workspaceSizeInBytes)#

Check whether the algorithm supports the problem for the hipBLASLt API.

This function updates the problem saved inside the algorithm if the problem is supported. The required workspaceSizeInBytes is also returned.

Parameters:
  • handle[in] Pointer to the allocated hipBLASLt handle for the hipBLASLt context. See hipblasLtHandle_t.

  • matmulDesc[in] Handle to a previously created matrix multiplication descriptor of type hipblasLtMatmulDesc_t.

  • alpha, beta[in] Pointers to the scalars used in the multiplication.

  • Adesc, Bdesc, Cdesc, Ddesc[in] Handles to the previously created matrix layout descriptors of type hipblasLtMatrixLayout_t.

  • algo[in] The algorithm heuristic.

  • workspaceSizeInBytes[out] Returns the required workspace size.

Return values:
  • HIPBLAS_STATUS_SUCCESS – If the query was successful. The problem is supported by the algorithm.

  • HIPBLAS_STATUS_INVALID_VALUE – The problem is not supported.

hipBLASLtExt usage#

Here are the three use cases supported by the hipBLASLtExt APIs.

GEMM#

The GEMM extension uses its own instance class. You must assign the problem type when constructing the instance or when importing the problem from the hipBLASLt API.

HIPBLASLT_EXPORT explicit Gemm(hipblasLtHandle_t      handle,
                               hipblasOperation_t     opA,
                               hipblasOperation_t     opB,
                               hipDataType      typeA,
                               hipDataType      typeB,
                               hipDataType      typeC,
                               hipDataType      typeD,
                               hipblasComputeType_t typeCompute);

HIPBLASLT_EXPORT explicit Gemm(hipblasLtHandle_t       handle,
                               hipblasLtMatmulDesc_t   matmul_descr,
                               const void*             alpha,
                               const void*             A,
                               hipblasLtMatrixLayout_t matA,
                               const void*             B,
                               hipblasLtMatrixLayout_t matB,
                               const void*             beta,
                               const void*             C,
                               hipblasLtMatrixLayout_t matC,
                               void*                   D,
                               hipblasLtMatrixLayout_t matD);

After the instance is created, you can set the problem with the setProblem APIs. Some of them take the following structures:

struct GemmEpilogue
{
    hipblasLtEpilogue_t mode = HIPBLASLT_EPILOGUE_DEFAULT;
    hipDataType         bias_data_type;
    int                 aux_ld;
    int                 aux_stride;
};

The basic setProblem API:

HIPBLASLT_EXPORT hipblasStatus_t setProblem(
    int64_t m, int64_t n, int64_t k, int64_t batch_count, GemmEpilogue& epilogue, GemmInputs& inputs);

You can set the leading dimensions and strides and reassign the data type using the following API:

HIPBLASLT_EXPORT hipblasStatus_t setProblem(int64_t            m,
                                            int64_t            n,
                                            int64_t            k,
                                            int64_t            batch_count,
                                            int64_t            lda,
                                            int64_t            ldb,
                                            int64_t            ldc,
                                            int64_t            ldd,
                                            int64_t            strideA,
                                            int64_t            strideB,
                                            int64_t            strideC,
                                            int64_t            strideD,
                                            GemmEpilogue&    epilogue,
                                            GemmInputs&      inputs,
                                            GemmProblemType& problemtype);

You can import problems from the hipBLASLt APIs after the instance is created.

Note

This can overwrite the problem type of the instance.

HIPBLASLT_EXPORT hipblasStatus_t setProblem(hipblasLtMatmulDesc_t   matmul_descr,
                                            const void*             alpha,
                                            const void*             A,
                                            hipblasLtMatrixLayout_t matA,
                                            const void*             B,
                                            hipblasLtMatrixLayout_t matB,
                                            const void*             beta,
                                            const void*             C,
                                            hipblasLtMatrixLayout_t matC,
                                            void*                   D,
                                            hipblasLtMatrixLayout_t matD);

You can retrieve heuristics and set kernel arguments with the instance. If the properties of the GEMM and the inputs don’t change, you can call the run API to launch the kernel directly.

// Pseudo code
hipblaslt_ext::GemmPreference pref;
pref.setMaxWorkspaceBytes(1000000);
// Default epilogue mode is HIPBLASLT_EPILOGUE_DEFAULT
hipblaslt_ext::GemmEpilogue epilogue;
hipblaslt_ext::GemmInputs inputs;
inputs.setA(d_a);
inputs.setB(d_b);
inputs.setC(d_c);
inputs.setD(d_d);
inputs.setAlpha(&alpha);
inputs.setBeta(&beta);

hipblaslt_ext::Gemm gemm(handle,
                         HIPBLAS_OP_N,
                         HIPBLAS_OP_N,
                         HIP_R_16F,
                         HIP_R_16F,
                         HIP_R_16F,
                         HIP_R_16F,
                         HIPBLAS_COMPUTE_32F);
std::vector<hipblasLtMatmulHeuristicResult_t> heuristic;
gemm.setProblem(1, 1, 1, 1, epilogue, inputs); // m, n, k, batch
gemm.algoGetHeuristic(1, pref, heuristic); // request one algorithm
gemm.initialize(heuristic[0].algo, d_workspace, true, stream);
for(int i = 0; i < 10; i++)
{
    gemm.run(stream);
}

Grouped GEMM#

hipBLASLtExt supports grouped GEMM. It shares the same base class (GemmInstance) with normal GEMM.

After the problem is set, you can check the problem type using the function getGemmType().

enum class GemmType
{
    HIPBLASLT_GEMM             = 1,
    HIPBLASLT_GROUPED_GEMM     = 2
};
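
A minimal sketch of such a check, assuming getGemmType() returns a GemmType value as described above:

if(groupedGemm.getGemmType() == hipblaslt_ext::GemmType::HIPBLASLT_GROUPED_GEMM)
{
    // grouped GEMM path
}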

The grouped GEMM class also includes the setProblem APIs.

HIPBLASLT_EXPORT hipblasStatus_t setProblem(
    int64_t m, int64_t n, int64_t k, int64_t batch_count, GemmEpilogue& epilogue, GemmInputs& inputs);

HIPBLASLT_EXPORT hipblasStatus_t setProblem(std::vector<int64_t>&        m,
                                            std::vector<int64_t>&        n,
                                            std::vector<int64_t>&        k,
                                            std::vector<int64_t>&        batch_count,
                                            std::vector<GemmEpilogue>& epilogue,
                                            std::vector<GemmInputs>&   inputs);

HIPBLASLT_EXPORT hipblasStatus_t setProblem(std::vector<int64_t>&        m,
                                            std::vector<int64_t>&        n,
                                            std::vector<int64_t>&        k,
                                            std::vector<int64_t>&        batch_count,
                                            std::vector<int64_t>&        lda,
                                            std::vector<int64_t>&        ldb,
                                            std::vector<int64_t>&        ldc,
                                            std::vector<int64_t>&        ldd,
                                            std::vector<int64_t>&        strideA,
                                            std::vector<int64_t>&        strideB,
                                            std::vector<int64_t>&        strideC,
                                            std::vector<int64_t>&        strideD,
                                            std::vector<GemmEpilogue>& epilogue,
                                            std::vector<GemmInputs>&   inputs,
                                            GemmProblemType&           problemtype);

HIPBLASLT_EXPORT hipblasStatus_t setProblem(std::vector<hipblasLtMatmulDesc_t>&   matmul_descr,
                                            std::vector<void*>&                   alpha,
                                            std::vector<void*>&                   A,
                                            std::vector<hipblasLtMatrixLayout_t>& matA,
                                            std::vector<void*>&                   B,
                                            std::vector<hipblasLtMatrixLayout_t>& matB,
                                            std::vector<void*>&                   beta,
                                            std::vector<void*>&                   C,
                                            std::vector<hipblasLtMatrixLayout_t>& matC,
                                            std::vector<void*>&                   D,
                                            std::vector<hipblasLtMatrixLayout_t>& matD);

For the following API, the epilogue argument supports broadcasting to the length of the problem size by duplicating the last element.

HIPBLASLT_EXPORT hipblasStatus_t setProblem(std::vector<int64_t>&        m,
                                            std::vector<int64_t>&        n,
                                            std::vector<int64_t>&        k,
                                            std::vector<int64_t>&        batch_count,
                                            std::vector<int64_t>&        lda,
                                            std::vector<int64_t>&        ldb,
                                            std::vector<int64_t>&        ldc,
                                            std::vector<int64_t>&        ldd,
                                            std::vector<int64_t>&        strideA,
                                            std::vector<int64_t>&        strideB,
                                            std::vector<int64_t>&        strideC,
                                            std::vector<int64_t>&        strideD,
                                            std::vector<GemmEpilogue>& epilogue,
                                            std::vector<GemmInputs>&   inputs,
                                            GemmProblemType&           problemtype);

Note

Only a problemtype size equal to 1 is currently supported. (This means only one GemmProblemType for all problems.)

// Pseudo code
std::vector<int64_t> m, n, k;
// ...
for(size_t i = 0; i < problem_size; i++)
{
    // ...
}
std::vector<GemmProblemType> problemtypes;
problemtypes.push_back(problemtype);
groupedgemm.setProblem(m, n, k, batch_count, lda, ldb, ldc, ldd, strideA, strideB, strideC, strideD, epilogue, inputs, problemtypes);

The UserArguments structure#

Grouped GEMM supports the use of external device memory to run the kernel. This is helpful if some of the arguments come from the output of the previous kernel. To change the size-related arguments m, n, k, and batch, see Fixed MK.

struct UserArguments
{
    uint32_t m; //!< size m
    uint32_t n; //!< size n
    uint32_t batch; //!< size batch
    uint32_t k; //!< size k
    void*    d; //!< The d matrix input pointer.
    void*    c; //!< The c matrix input pointer.
    void*    a; //!< The a matrix input pointer.
    void*    b; //!< The b matrix input pointer.
    uint32_t strideD1; //!< The d leading dimension.
    uint32_t strideD2; //!< The d batch stride
    uint32_t strideC1; //!< The c leading dimension.
    uint32_t strideC2; //!< The c batch stride
    uint32_t strideA1; //!< The a leading dimension.
    uint32_t strideA2; //!< The a batch stride
    uint32_t strideB1; //!< The b leading dimension.
    uint32_t strideB2; //!< The b batch stride
    int8_t   alpha[16]; //!< The alpha value.
    int8_t   beta[16]; //!< The beta value.
    // Epilogue inputs
    void*    bias; //!< The bias input pointer.
    int      biasType; //!< The bias data type. Only works if the mode is set to bias-related epilogues.
    uint32_t reserved;
    void*    e; //!< The aux input pointer. Only works if the mode is set to aux-related epilogues.
    uint32_t strideE1; //!< The aux leading dimension. Only works if the mode is set to aux-related epilogues.
    uint32_t strideE2; //!< The aux batch stride. Only works if the mode is set to aux-related epilogues.
    float    act0; //!< The activation value 1. Some activations might use it.
    float    act1; //!< The activation value 2.
    int      activationType; //!< The activation type. Only works if the mode is set to activation-related epilogues.
} __attribute__((packed));
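
The alpha and beta fields are raw byte storage; the scalar type they hold follows the compute type. Here is a hypothetical helper for overwriting alpha on the host, assuming HIPBLAS_COMPUTE_32F so that the scalar is a float stored in the first four bytes:

#include <cstring>

// Hypothetical helper: pack an FP32 scalar into the packed alpha bytes,
// assuming the compute type is HIPBLAS_COMPUTE_32F.
void setAlphaF32(hipblaslt_ext::UserArguments& ua, float alpha)
{
    std::memcpy(ua.alpha, &alpha, sizeof(alpha));
}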

hipBLASLt provides two UserArguments-related functions. The first is a helper that initializes the UserArguments structure from the problems saved inside the grouped GEMM object. The second is an overload of run that takes an additional UserArguments device pointer as input.

HIPBLASLT_EXPORT hipblasStatus_t getDefaultValueForDeviceUserArguments(void* hostDeviceUserArgs);

HIPBLASLT_EXPORT hipblasStatus_t run(void* deviceUserArgs, hipStream_t stream);

Here is a simple example that shows how this API works.

// Pseudo code
// Step 1: Get all algorithms
std::vector<hipblasLtMatmulHeuristicResult_t> heuristicResult;
CHECK_HIPBLASLT_ERROR(hipblaslt_ext::getAllAlgos(handle,
                                                 HIPBLASLT_GROUPED_GEMM,
                                                 HIPBLAS_OP_N,
                                                 HIPBLAS_OP_N,
                                                 in_out_datatype,
                                                 in_out_datatype,
                                                 in_out_datatype,
                                                 in_out_datatype,
                                                 HIPBLAS_COMPUTE_32F,
                                                 heuristicResult));

hipblaslt_ext::GemmPreference pref;
pref.setMaxWorkspaceBytes(1000000);
// Step 2: Setup problem
std::vector<int64_t> m(gemm_count);
std::vector<int64_t> n(gemm_count);
std::vector<int64_t> k(gemm_count);
std::vector<int64_t> batch_count(gemm_count);
std::vector<hipblaslt_ext::GemmEpilogue> epilogue(gemm_count);
std::vector<hipblaslt_ext::GemmInputs> inputs(gemm_count);
for(int i = 0; i < gemm_count; i++)
{
    m[i] = 1;
    n[i] = 1;
    k[i] = 1;
    batch_count[i] = 1;
    epilogue[i].setMode(HIPBLASLT_EPILOGUE_GELU);
    inputs[i].setA(d_a[i]);
    inputs[i].setB(d_b[i]);
    inputs[i].setC(d_c[i]);
    inputs[i].setD(d_d[i]);
    inputs[i].setAlpha(&alpha[i]);
    inputs[i].setBeta(&beta[i]);
}

// Step 3: Create grouped gemm instance
hipblaslt_ext::GroupedGemm groupedGemm(handle,
                                       HIPBLAS_OP_N,
                                       HIPBLAS_OP_N,
                                       HIP_R_16F,
                                       HIP_R_16F,
                                       HIP_R_16F,
                                       HIP_R_16F,
                                       HIPBLAS_COMPUTE_32F);

// Step 4: Set problem
groupedGemm.setProblem(m, n, k, batch_count, epilogue, inputs); // m, n, k, batch

// Step 5: Get default value from the instance
hipblaslt_ext::UserArguments* dUAFloat = new hipblaslt_ext::UserArguments[gemm_count];
groupedGemm.getDefaultValueForDeviceUserArguments((void*)dUAFloat);
// Once you get the default value here, you can make several copies and change the values
// from the host

// Next copy them to the device memory
hipblaslt_ext::UserArguments* d_dUAFloat = nullptr;
hipMalloc(&d_dUAFloat, sizeof(hipblaslt_ext::UserArguments) * gemm_count);
hipMemcpy(d_dUAFloat, dUAFloat, sizeof(hipblaslt_ext::UserArguments) * gemm_count, hipMemcpyHostToDevice);

validIdx.clear();
for(int j = 0; j < heuristicResult.size(); j++)
{
    size_t workspace_size = 0;
    if(groupedGemm.isAlgoSupported(heuristicResult[j].algo, workspace_size)
       == HIPBLAS_STATUS_SUCCESS)
    {
        validIdx.push_back(j);
    }
}

// Step 6: Initialize and run
if(validIdx.size() > 0)
{
    groupedGemm.initialize(heuristicResult[validIdx[0]].algo, d_workspace);
    for(int i = 0; i < 10; i++)
    {
        groupedGemm.run(d_dUAFloat, stream);
    }
}

The base class (GemmInstance)#

This is the base class for Gemm and GroupedGemm.

// Gets heuristic from the instance.
HIPBLASLT_EXPORT hipblasStatus_t algoGetHeuristic(const int                                      requestedAlgoCount,
                                                  const GemmPreference&                        pref,
                                                  std::vector<hipblasLtMatmulHeuristicResult_t>& heuristicResults);

// Returns SUCCESS if the algo is supported, also returns the required workspace size in bytes.
HIPBLASLT_EXPORT hipblasStatus_t isAlgoSupported(hipblasLtMatmulAlgo_t& algo, size_t& workspaceSizeInBytes);

// Initializes the instance before calling run. Required every time the problem is set.
HIPBLASLT_EXPORT hipblasStatus_t initialize(const hipblasLtMatmulAlgo_t& algo, void* workspace, bool useUserArgs = true, hipStream_t stream = 0);

// Runs the problem.
HIPBLASLT_EXPORT hipblasStatus_t run(hipStream_t stream, hipEvent_t start = nullptr, hipEvent_t stop = nullptr);

Get all algorithms#

Get all algorithms lets you retrieve all the algorithms for a specific problem type. It requires the transpose settings of A and B, the data types of the inputs, and the compute type.

HIPBLASLT_EXPORT
hipblasStatus_t hipblaslt_ext::getAllAlgos(hipblasLtHandle_t                              handle,
                                           GemmType                                       typeGemm,
                                           hipblasOperation_t                             opA,
                                           hipblasOperation_t                             opB,
                                           hipDataType                              typeA,
                                           hipDataType                              typeB,
                                           hipDataType                              typeC,
                                           hipDataType                              typeD,
                                           hipblasComputeType_t                         typeCompute,
                                           std::vector<hipblasLtMatmulHeuristicResult_t>& heuristicResults);

This API doesn’t require a problem size or epilogue as input. Instead, use the isAlgoSupported APIs to check whether an algorithm supports a given problem:

hipblaslt_ext::matmulIsAlgoSupported()
gemm.isAlgoSupported()

The API returns the required workspace size in bytes upon successful completion.

// Get all algorithms
CHECK_HIPBLASLT_ERROR(hipblaslt_ext::getAllAlgos(handle,
                                                 HIPBLASLT_GEMM,
                                                 HIPBLAS_OP_N,
                                                 HIPBLAS_OP_N,
                                                 in_out_datatype,
                                                 in_out_datatype,
                                                 in_out_datatype,
                                                 in_out_datatype,
                                                 HIPBLAS_COMPUTE_32F,
                                                 heuristicResult));

validIdx.clear();
for(int j = 0; j < heuristicResult.size(); j++)
{
    size_t workspace_size = 0;
    if(hipblaslt_ext::matmulIsAlgoSupported(handle,
                                            matmul,
                                            &(alpha),
                                            matA,
                                            matB,
                                            &(beta),
                                            matC,
                                            matD,
                                            heuristicResult[j].algo,
                                            workspace_size)
       == HIPBLAS_STATUS_SUCCESS)
    {
        validIdx.push_back(j);
        heuristicResult[j].workspaceSize = workspace_size;
    }
    else
    {
        heuristicResult[j].workspaceSize = 0;
    }
}

Algorithm index#

This extension API lets you get the algorithm index from a hipblasLtMatmulAlgo_t.

HIPBLASLT_EXPORT int getIndexFromAlgo(hipblasLtMatmulAlgo_t& algo);

You can use an index vector to retrieve the heuristic results.

HIPBLASLT_EXPORT
hipblasStatus_t
    getAlgosFromIndex(hipblasLtHandle_t                              handle,
                      std::vector<int>&                              algoIndex,
                      std::vector<hipblasLtMatmulHeuristicResult_t>& heuristicResults);

Sample code#

This section contains some code samples that demonstrate the use cases of the extension APIs.

GEMM#

// Pseudo code for gemm problem
// Get all algorithms
std::vector<hipblasLtMatmulHeuristicResult_t> heuristicResult;
CHECK_HIPBLASLT_ERROR(hipblaslt_ext::getAllAlgos(handle,
                                                 HIPBLASLT_GEMM,
                                                 HIPBLAS_OP_N,
                                                 HIPBLAS_OP_N,
                                                 in_out_datatype,
                                                 in_out_datatype,
                                                 in_out_datatype,
                                                 in_out_datatype,
                                                 HIPBLAS_COMPUTE_32F,
                                                 heuristicResult));

hipblaslt_ext::GemmPreference pref;
pref.setMaxWorkspaceBytes(1000000);
hipblaslt_ext::GemmEpilogue epilogue;
epilogue.setMode(HIPBLASLT_EPILOGUE_GELU);
hipblaslt_ext::GemmInputs inputs;
inputs.setA(d_a);
inputs.setB(d_b);
inputs.setC(d_c);
inputs.setD(d_d);
inputs.setAlpha(&alpha);
inputs.setBeta(&beta);

hipblaslt_ext::Gemm gemm(handle,
                         HIPBLAS_OP_N,
                         HIPBLAS_OP_N,
                         HIP_R_16F,
                         HIP_R_16F,
                         HIP_R_16F,
                         HIP_R_16F,
                         HIPBLAS_COMPUTE_32F);

gemm.setProblem(1, 1, 1, 1, epilogue, inputs); // m, n, k, batch

validIdx.clear();
for(int j = 0; j < heuristicResult.size(); j++)
{
    size_t workspace_size = 0;
    if(gemm.isAlgoSupported(heuristicResult[j].algo, workspace_size)
       == HIPBLAS_STATUS_SUCCESS)
    {
        validIdx.push_back(j);
        heuristicResult[j].workspaceSize = workspace_size;
    }
    else
    {
        heuristicResult[j].workspaceSize = 0;
    }
}

if(validIdx.size() > 0)
{
    gemm.initialize(heuristicResult[validIdx[0]].algo, d_workspace, stream);
    for(int i = 0; i < 10; i++)
    {
        gemm.run(stream);
    }
}

Grouped GEMM#

// Pseudo code for grouped gemm problem
// Get all algorithms
std::vector<hipblasLtMatmulHeuristicResult_t> heuristicResult;
CHECK_HIPBLASLT_ERROR(hipblaslt_ext::getAllAlgos(handle,
                                                 HIPBLASLT_GROUPED_GEMM,
                                                 HIPBLAS_OP_N,
                                                 HIPBLAS_OP_N,
                                                 in_out_datatype,
                                                 in_out_datatype,
                                                 in_out_datatype,
                                                 in_out_datatype,
                                                 HIPBLAS_COMPUTE_32F,
                                                 heuristicResult));

hipblaslt_ext::GemmPreference pref;
pref.setMaxWorkspaceBytes(1000000);

std::vector<int64_t> m(gemm_count);
std::vector<int64_t> n(gemm_count);
std::vector<int64_t> k(gemm_count);
std::vector<int64_t> batch_count(gemm_count);
std::vector<hipblaslt_ext::GemmEpilogue> epilogue(gemm_count);
std::vector<hipblaslt_ext::GemmInputs> inputs(gemm_count);
for(int i = 0; i < gemm_count; i++)
{
    m[i] = 1;
    n[i] = 1;
    k[i] = 1;
    batch_count[i] = 1;
    epilogue[i].setMode(HIPBLASLT_EPILOGUE_GELU);
    inputs[i].setA(d_a[i]);
    inputs[i].setB(d_b[i]);
    inputs[i].setC(d_c[i]);
    inputs[i].setD(d_d[i]);
    inputs[i].setAlpha(&alpha[i]);
    inputs[i].setBeta(&beta[i]);
}


hipblaslt_ext::GroupedGemm groupedGemm(handle,
                                       HIPBLAS_OP_N,
                                       HIPBLAS_OP_N,
                                       HIP_R_16F,
                                       HIP_R_16F,
                                       HIP_R_16F,
                                       HIP_R_16F,
                                       HIPBLAS_COMPUTE_32F);

groupedGemm.setProblem(m, n, k, batch_count, epilogue, inputs); // m, n, k, batch

validIdx.clear();
for(int j = 0; j < heuristicResult.size(); j++)
{
    size_t workspace_size = 0;
    if(groupedGemm.isAlgoSupported(heuristicResult[j].algo, workspace_size)
       == HIPBLAS_STATUS_SUCCESS)
    {
        validIdx.push_back(j);
    }
}

if(validIdx.size() > 0)
{
    groupedGemm.initialize(heuristicResult[validIdx[0]].algo, d_workspace, stream);
    for(int i = 0; i < 10; i++)
    {
        groupedGemm.run(stream);
    }
}

Algorithm index#

int index = hipblaslt_ext::getIndexFromAlgo(testResults[i].algo);
// Save the index to disk or somewhere else for later use.

// Get the index from previous state.
std::vector<int> algoIndex{index};
std::vector<hipblasLtMatmulHeuristicResult_t> heuristicResults;
// If the indices are all out of bounds of the solution map, getAlgosFromIndex will return HIPBLAS_STATUS_INVALID_VALUE
if(HIPBLAS_STATUS_INVALID_VALUE
    == hipblaslt_ext::getAlgosFromIndex(handle, algoIndex, heuristicResults))
{
    std::cout << "Indexes are all out of bound." << std::endl;
    break;
}

Grouped GEMM: Fixed MK#

The hipBLASLt extension supports changing the sizes (m, n, k, and batch) from the device memory UserArguments. However, the setup differs slightly from the normal routine.

Sum of N#

A sum of N needs to be used as an input for the grouped GEMM instance.

{1000, 1, 1, 1}; // The array of N; the first element is the sum of N

// Below are the values stored in "UserArguments"
{256, 256, 1, 1}; // This is a valid configuration because 256 + 256 + 1 + 1 < 1000
{512, 512, 1, 1}; // This is NOT a valid configuration because 512 + 512 + 1 + 1 > 1000

For example, consider a grouped GEMM with gemm_count = 4. In this mode, the first element in the array of Ns is the “sum of N”, and the sum of the Ns stored in UserArguments must not exceed the “sum of N” set in the setProblem API.

// Pseudo code
// Step 1: Get all algorithms
std::vector<hipblasLtMatmulHeuristicResult_t> heuristicResult;
CHECK_HIPBLASLT_ERROR(hipblaslt_ext::getAllAlgos(handle,
                                                 HIPBLASLT_GROUPED_GEMM,
                                                 HIPBLAS_OP_N,
                                                 HIPBLAS_OP_N,
                                                 in_out_datatype,
                                                 in_out_datatype,
                                                 in_out_datatype,
                                                 in_out_datatype,
                                                 HIPBLAS_COMPUTE_32F,
                                                 heuristicResult));

hipblaslt_ext::GemmPreference pref;
pref.setMaxWorkspaceBytes(1000000);
// Step 2: Setup problem
std::vector<int64_t> m(gemm_count);
std::vector<int64_t> n(gemm_count);
std::vector<int64_t> k(gemm_count);
std::vector<int64_t> batch_count(gemm_count);
std::vector<hipblaslt_ext::GemmEpilogue> epilogue(gemm_count);
std::vector<hipblaslt_ext::GemmInputs> inputs(gemm_count);

// Step 2.1: Calculate sum of n
int64_t sum_of_n = 0;
for(int i = 0; i < gemm_count; i++)
{
    sum_of_n += n_arr[i];
}

// {sum_of_n, 1, 1, 1, ...}; // The array of N, the first element is the sum of N
for(int i = 0; i < gemm_count; i++)
{
    m[i] = m_arr[i];
    if(i == 0)
        n[i] = sum_of_n;
    else
        n[i] = 1;
    k[i] = k_arr[i];
    batch_count[i] = 1;
    inputs[i].setA(d_a[i]);
    inputs[i].setB(d_b[i]);
    inputs[i].setC(d_c[i]);
    inputs[i].setD(d_d[i]);
    inputs[i].setAlpha(&alpha[i]);
    inputs[i].setBeta(&beta[i]);
}

// Step 3: Create grouped gemm instance
hipblaslt_ext::GroupedGemm groupedGemm(handle,
                                       HIPBLAS_OP_N,
                                       HIPBLAS_OP_N,
                                       HIP_R_16F,
                                       HIP_R_16F,
                                       HIP_R_16F,
                                       HIP_R_16F,
                                       HIPBLAS_COMPUTE_32F);

// Step 4: Set problem
groupedGemm.setProblem(m, n, k, batch_count, epilogue, inputs); // m, n, k, batch

// Step 5: Get default value from the instance
hipblaslt_ext::UserArguments* dUAFloat = new hipblaslt_ext::UserArguments[gemm_count];
groupedGemm.getDefaultValueForDeviceUserArguments((void*)dUAFloat);
// Once you get the default value here, you can make several copies and change the values
// from the host

// Next Copy them to the device memory
hipblaslt_ext::UserArguments* d_dUAFloat = nullptr;
hipMalloc(&d_dUAFloat, sizeof(hipblaslt_ext::UserArguments) * gemm_count);
hipMemcpy(d_dUAFloat, dUAFloat, sizeof(hipblaslt_ext::UserArguments) * gemm_count, hipMemcpyHostToDevice);

validIdx.clear();
for(int j = 0; j < heuristicResult.size(); j++)
{
    size_t workspace_size = 0;
    if(groupedGemm.isAlgoSupported(heuristicResult[j].algo, workspace_size)
       == HIPBLAS_STATUS_SUCCESS)
    {
        validIdx.push_back(j);
    }
}

int threads = 256;
int blocks  = ceil((double)gemm_count / threads);

// Step 6: Initialize and run
if(validIdx.size() > 0)
{
    groupedGemm.initialize(heuristicResult[validIdx[0]].algo, d_workspace);
    for(int i = 0; i < 10; i++)
    {
        hipLaunchKernelGGL(kernelUpdateN,
                            dim3(blocks),
                            dim3(threads),
                            0,
                            stream,
                            gemm_count,
                            d_dUAFloat,
                            d_n_vec);  // d_n_vec is a device pointer with Ns
        groupedGemm.run(d_dUAFloat, stream);
    }
}

// .....

__global__ void kernelUpdateN(uint32_t gemm_count, void* userArgs, int32_t* sizes_n)
{
    uint64_t id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

    if(id >= gemm_count)
        return;

    hipblaslt_ext::UserArguments* dUAFloat = static_cast<hipblaslt_ext::UserArguments*>(userArgs);
    dUAFloat[id].n                         = sizes_n[id];
}