tf::cublasFlowCapturer class

class to construct a cuBLAS task graph

cublasFlowCapturer provides a higher-level interface over the cuBLAS library and hides concurrency details from users. It inherits methods from tf::cudaFlowCapturerBase and must be used from a tf::cudaFlowCapturer object. All pointers passed to cublasFlowCapturer methods must reside in GPU memory space or managed memory (i.e., allocated via cudaMallocManaged); this includes the scalars alpha and beta as well as input and output data pointers. The following example uses cublas<t>amax to find the smallest index of the element with the maximum absolute magnitude in a vector.

#include <taskflow/cublasflow.hpp>

int main() {
  tf::Executor executor;
  tf::Taskflow taskflow;
  
  size_t N = 1024;
  float *x = nullptr;
  int *d_res;
  int  h_res;
  
  std::vector<float> host(N, 0.0f);
  host[512] = 100.0f;  // artificially set the mid-position to the largest
  
  cudaMalloc(&x, N*sizeof(float));
  cudaMalloc(&d_res, sizeof(int));
  
  taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
    auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();
  
    tf::cudaTask h2d      = capturer.copy(x, host.data(), N);
    tf::cudaTask find_max = cublas->amax(N, x, 1, d_res);  
    tf::cudaTask d2h      = capturer.copy(&h_res, d_res, 1);
    
    h2d.precede(find_max);  // host-to-device copy runs before amax
    find_max.precede(d2h);  // amax runs before device-to-host copy
  });
  
  executor.run(taskflow).wait();
  
  assert(h_res == 513);  // cuBLAS amax reports a 1-based index
}

Currently, cublasFlowCapturer supports only float and double data types.

We design most tf::cublasFlowCapturer methods on top of the native, high-performance cuBLAS library. You may refer to the cuBLAS documentation for more details.

Base classes

class cudaFlowCapturerBase
base class to construct a CUDA task graph through stream capture

Constructors, destructors, conversion operators

cublasFlowCapturer() defaulted
constructs a cublas flow capturer

Public functions

auto native_handle() -> cublasHandle_t
gets the native cublas handle associated with this cublasFlowCapturer
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
auto vset(size_t n, const T* h, int inch, T* d, int incd) -> cudaTask
copies vector data from host to device
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
auto vget(size_t n, const T* d, int incd, T* h, int inch) -> cudaTask
copies vector data from device to host
template<typename T>
auto amax(int n, const T* x, int incx, int* result) -> cudaTask
finds the smallest index of the element of the maximum absolute magnitude
template<typename T>
auto amin(int n, const T* x, int incx, int* result) -> cudaTask
finds the smallest index of the element of the minimum absolute magnitude
template<typename T>
auto asum(int n, const T* x, int incx, T* result) -> cudaTask
finds the sum of absolute values of the elements over a vector
template<typename T>
auto axpy(int n, const T* alpha, const T* x, int incx, T* y, int incy) -> cudaTask
multiplies a vector by a scalar and adds it to a vector
template<typename T>
auto vcopy(int n, const T* x, int incx, T* y, int incy) -> cudaTask
copies a vector to another vector
template<typename T>
auto dot(int n, const T* x, int incx, const T* y, int incy, T* result) -> cudaTask
computes the dot product of two vectors
template<typename T>
auto nrm2(int n, const T* x, int incx, T* result) -> cudaTask
computes the Euclidean norm of a vector
template<typename T>
auto scal(int n, const T* scalar, T* x, int incx) -> cudaTask
scales a vector by a scalar
template<typename T>
auto swap(int n, T* x, int incx, T* y, int incy) -> cudaTask
swaps elements between two vectors
template<typename T>
auto gemv(cublasOperation_t trans, int m, int n, const T* alpha, const T* A, int lda, const T* x, int incx, const T* beta, T* y, int incy) -> cudaTask
performs matrix-vector multiplication
template<typename T>
auto c_gemv(cublasOperation_t trans, int m, int n, const T* alpha, const T* A, int lda, const T* x, int incx, const T* beta, T* y, int incy) -> cudaTask
similar to tf::cublasFlowCapturer::gemv but operates on C-styled row-major layout
template<typename T>
auto symv(cublasFillMode_t uplo, int n, const T* alpha, const T* A, int lda, const T* x, int incx, const T* beta, T* y, int incy) -> cudaTask
performs symmetric matrix-vector multiplication
template<typename T>
auto c_symv(cublasFillMode_t uplo, int n, const T* alpha, const T* A, int lda, const T* x, int incx, const T* beta, T* y, int incy) -> cudaTask
similar to tf::cublasFlowCapturer::symv but operates on C-styled row-major layout
template<typename T>
auto syr(cublasFillMode_t uplo, int n, const T* alpha, const T* x, int incx, T* A, int lda) -> cudaTask
performs symmetric rank-1 update
template<typename T>
auto c_syr(cublasFillMode_t uplo, int n, const T* alpha, const T* x, int incx, T* A, int lda) -> cudaTask
similar to tf::cublasFlowCapturer::syr but operates on C-styled row-major layout
template<typename T>
auto syr2(cublasFillMode_t uplo, int n, const T* alpha, const T* x, int incx, const T* y, int incy, T* A, int lda) -> cudaTask
performs symmetric rank-2 update
template<typename T>
auto c_syr2(cublasFillMode_t uplo, int n, const T* alpha, const T* x, int incx, const T* y, int incy, T* A, int lda) -> cudaTask
similar to tf::cublasFlowCapturer::syr2 but operates on C-styled row-major layout
template<typename T>
auto trmv(cublasFillMode_t uplo, cublasOperation_t tran, cublasDiagType_t diag, int n, const T* A, int lda, T* x, int incx) -> cudaTask
performs the triangular matrix-vector multiplication
template<typename T>
auto c_trmv(cublasFillMode_t uplo, cublasOperation_t tran, cublasDiagType_t diag, int n, const T* A, int lda, T* x, int incx) -> cudaTask
similar to tf::cublasFlowCapturer::trmv but operates on C-styled row-major layout
template<typename T>
auto trsv(cublasFillMode_t uplo, cublasOperation_t tran, cublasDiagType_t diag, int n, const T* A, int lda, T* x, int incx) -> cudaTask
solves the triangular linear system with a single right-hand-side
template<typename T>
auto c_trsv(cublasFillMode_t uplo, cublasOperation_t tran, cublasDiagType_t diag, int n, const T* A, int lda, T* x, int incx) -> cudaTask
similar to tf::cublasFlowCapturer::trsv but operates on C-styled row-major layout
template<typename T>
auto geam(cublasOperation_t ta, cublasOperation_t tb, int m, int n, const T* alpha, const T* A, int lda, const T* beta, const T* B, int ldb, T* C, int ldc) -> cudaTask
performs matrix-matrix addition and transposition
template<typename T>
auto c_geam(cublasOperation_t ta, cublasOperation_t tb, int m, int n, const T* alpha, const T* A, int lda, const T* beta, const T* B, int ldb, T* C, int ldc) -> cudaTask
similar to tf::cublasFlowCapturer::geam but on row-major layout
template<typename T>
auto gemm(cublasOperation_t ta, cublasOperation_t tb, int m, int n, int k, const T* alpha, const T* A, int lda, const T* B, int ldb, const T* beta, T* C, int ldc) -> cudaTask
performs matrix-matrix multiplication
template<typename T>
auto c_gemm(cublasOperation_t ta, cublasOperation_t tb, int m, int n, int k, const T* alpha, const T* A, int lda, const T* B, int ldb, const T* beta, T* C, int ldc) -> cudaTask
similar to tf::cublasFlowCapturer::gemm but operates on C-styled row-major layout
template<typename T>
auto gemm_batched(cublasOperation_t ta, cublasOperation_t tb, int m, int n, int k, const T* alpha, const T* A[], int lda, const T* B[], int ldb, const T* beta, T* C[], int ldc, int bc) -> cudaTask
performs matrix-matrix multiplication over a batch of matrices
template<typename T>
auto c_gemm_batched(cublasOperation_t ta, cublasOperation_t tb, int m, int n, int k, const T* alpha, const T* A[], int lda, const T* B[], int ldb, const T* beta, T* C[], int ldc, int bc) -> cudaTask
similar to tf::cublasFlowCapturer::gemm_batched but operates on C-styled row-major layout
template<typename T>
auto gemm_sbatched(cublasOperation_t ta, cublasOperation_t tb, int m, int n, int k, const T* alpha, const T* A, int lda, long long int sA, const T* B, int ldb, long long int sB, const T* beta, T* C, int ldc, long long int sC, int bc) -> cudaTask
performs matrix-matrix multiplication over a batch of matrices with strided memory access
template<typename T>
auto c_gemm_sbatched(cublasOperation_t ta, cublasOperation_t tb, int m, int n, int k, const T* alpha, const T* A, int lda, long long int sA, const T* B, int ldb, long long int sB, const T* beta, T* C, int ldc, long long int sC, int bc) -> cudaTask
similar to tf::cublasFlowCapturer::gemm_sbatched but operates on C-styled row-major layout
template<typename T>
auto symm(cublasSideMode_t side, cublasFillMode_t uplo, int m, int n, const T* alpha, const T* A, int lda, const T* B, int ldb, const T* beta, T* C, int ldc) -> cudaTask
performs the symmetric matrix-matrix multiplication
template<typename T>
auto c_symm(cublasSideMode_t side, cublasFillMode_t uplo, int m, int n, const T* alpha, const T* A, int lda, const T* B, int ldb, const T* beta, T* C, int ldc) -> cudaTask
similar to tf::cublasFlowCapturer::symm but operates on C-styled row-major layout
template<typename T>
auto syrk(cublasFillMode_t uplo, cublasOperation_t tran, int n, int k, const T* alpha, const T* A, int lda, const T* beta, T* C, int ldc) -> cudaTask
performs the symmetric rank-k update
template<typename T>
auto c_syrk(cublasFillMode_t uplo, cublasOperation_t tran, int n, int k, const T* alpha, const T* A, int lda, const T* beta, T* C, int ldc) -> cudaTask
similar to tf::cublasFlowCapturer::syrk but operates on C-styled row-major layout
template<typename T>
auto syr2k(cublasFillMode_t uplo, cublasOperation_t tran, int n, int k, const T* alpha, const T* A, int lda, const T* B, int ldb, const T* beta, T* C, int ldc) -> cudaTask
performs the symmetric rank-2k update
template<typename T>
auto c_syr2k(cublasFillMode_t uplo, cublasOperation_t tran, int n, int k, const T* alpha, const T* A, int lda, const T* B, int ldb, const T* beta, T* C, int ldc) -> cudaTask
similar to tf::cublasFlowCapturer::syr2k but operates on C-styled row-major layout
template<typename T>
auto syrkx(cublasFillMode_t uplo, cublasOperation_t tran, int n, int k, const T* alpha, const T* A, int lda, const T* B, int ldb, const T* beta, T* C, int ldc) -> cudaTask
performs a variation of the symmetric rank-k update
template<typename T>
auto c_syrkx(cublasFillMode_t uplo, cublasOperation_t tran, int n, int k, const T* alpha, const T* A, int lda, const T* B, int ldb, const T* beta, T* C, int ldc) -> cudaTask
similar to tf::cublasFlowCapturer::syrkx but operates on C-styled row-major layout
template<typename T>
auto trmm(cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t tran, cublasDiagType_t diag, int m, int n, const T* alpha, const T* A, int lda, const T* B, int ldb, T* C, int ldc) -> cudaTask
performs triangular matrix-matrix multiplication
template<typename T>
auto c_trmm(cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t tran, cublasDiagType_t diag, int m, int n, const T* alpha, const T* A, int lda, const T* B, int ldb, T* C, int ldc) -> cudaTask
similar to tf::cublasFlowCapturer::trmm but operates on C-styled row-major layout
template<typename T>
auto trsm(cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t tran, cublasDiagType_t diag, int m, int n, const T* alpha, const T* A, int lda, T* B, int ldb) -> cudaTask
solves the triangular linear system with multiple right-hand-sides
template<typename T>
auto c_trsm(cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t tran, cublasDiagType_t diag, int m, int n, const T* alpha, const T* A, int lda, T* B, int ldb) -> cudaTask
similar to tf::cublasFlowCapturer::trsm but operates on C-styled row-major layout

Function documentation

cublasHandle_t tf::cublasFlowCapturer::native_handle()

gets the native cublas handle associated with this cublasFlowCapturer

Returns a native cublas handle of type cublasHandle_t

template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
cudaTask tf::cublasFlowCapturer::vset(size_t n, const T* h, int inch, T* d, int incd)

copies vector data from host to device

Template parameters
T data type
Parameters
n number of elements
h source host pointer
inch spacing between consecutive elements in h
d target device pointer
incd spacing between consecutive elements in d
Returns a tf::cudaTask handle

This method copies n elements from a vector h in host memory space to a vector d in GPU memory space. The storage spacing between consecutive elements is given by inch for the source vector h and by incd for the destination vector d.

This method calls native cublasSetVectorAsync with packed parameters, (handle, args...), where handle is managed by the cublasFlowCapturer and args... are the given arguments.

template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
cudaTask tf::cublasFlowCapturer::vget(size_t n, const T* d, int incd, T* h, int inch)

copies vector data from device to host

Template parameters
T data type
Parameters
n number of elements
d source device pointer
incd spacing between consecutive elements in d
h target host pointer
inch spacing between consecutive elements in h
Returns a tf::cudaTask handle

This method copies n elements from a vector d in GPU memory space to a vector h in host memory space. The storage spacing between consecutive elements is given by inch for the target vector h and by incd for the source vector d.

This method calls native cublasGetVectorAsync with packed parameters, (handle, args...), where handle is managed by the cublasFlowCapturer and args... are the given arguments.
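
As a minimal sketch of these two methods, the fragment below round-trips a vector through the device. It assumes the same setup as the opening example, with hypothetical buffers h_in and h_out (host arrays of n floats) and d_x (a device array of n floats) allocated beforehand.

taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
  auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();

  // h_in, h_out: hypothetical host arrays of n floats; d_x: device array of n floats
  tf::cudaTask h2d = cublas->vset(n, h_in, 1, d_x, 1);   // host-to-device copy
  tf::cudaTask d2h = cublas->vget(n, d_x, 1, h_out, 1);  // device-to-host copy

  h2d.precede(d2h);  // read back only after the upload completes
});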

template<typename T>
cudaTask tf::cublasFlowCapturer::amax(int n, const T* x, int incx, int* result)

finds the smallest index of the element of the maximum absolute magnitude

Template parameters
T data type
Parameters
n number of elements in vector x
x pointer to the memory address of the vector
incx stride between consecutive elements of x
result the resulting index (1-based indexing)
Returns a tf::cudaTask handle

This method calls native cublas<t>amax with packed parameters, (handle, args...), where handle is managed by the cublasFlowCapturer and args... are the given arguments.

template<typename T>
cudaTask tf::cublasFlowCapturer::amin(int n, const T* x, int incx, int* result)

finds the smallest index of the element of the minimum absolute magnitude

Template parameters
T data type
Parameters
n number of elements in vector x
x pointer to the memory address of the vector
incx stride between consecutive elements of x
result the resulting index (1-based indexing)
Returns a tf::cudaTask handle

This method calls native cublas<t>amin with packed parameters, (handle, args...), where handle is managed by the cublasFlowCapturer and args... are the given arguments.

template<typename T>
cudaTask tf::cublasFlowCapturer::asum(int n, const T* x, int incx, T* result)

finds the sum of absolute values of the elements over a vector

Template parameters
T data type
Parameters
n number of elements in vector x
x pointer to the memory address of the vector
incx stride between consecutive elements of x
result the result
Returns a tf::cudaTask handle

This method calls native cublas<t>asum with packed parameters, (handle, args...), where handle is managed by the cublasFlowCapturer and args... are the given arguments.

template<typename T>
cudaTask tf::cublasFlowCapturer::axpy(int n, const T* alpha, const T* x, int incx, T* y, int incy)

multiplies a vector by a scalar and adds it to a vector

Template parameters
T data type
Parameters
n number of elements in vectors x and y
alpha scalar used for multiplication
x pointer to the memory address of the vector x
incx stride between consecutive elements of x
y pointer to the memory address of the vector y
incy stride between consecutive elements of y
Returns a tf::cudaTask handle

This function multiplies the vector x by the scalar alpha and adds it to the vector y, overwriting the latter vector with the result. Hence, the performed operation is:

y[j] = alpha * x[k] + y[j],

where j and k are indices of n elements with step sizes incy and incx.

This method calls native cublas<t>axpy with packed parameters, (handle, args...), where handle is managed by the cublasFlowCapturer and args... are the given arguments.
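
As a minimal sketch, the capturer fragment below computes y = 2 * x + y over two device vectors. It assumes hypothetical device arrays d_x and d_y of n floats and a managed scalar d_alpha (e.g., allocated with cudaMallocManaged) set to 2.0f before the taskflow runs.

auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();

// d_x, d_y: hypothetical device arrays of n floats; d_alpha: managed float set to 2.0f
// y[i] = (*d_alpha) * x[i] + y[i] over all n elements (unit strides)
tf::cudaTask saxpy = cublas->axpy(n, d_alpha, d_x, 1, d_y, 1);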

template<typename T>
cudaTask tf::cublasFlowCapturer::vcopy(int n, const T* x, int incx, T* y, int incy)

copies a vector to another vector

Template parameters
T data type
Parameters
n number of elements to copy
x pointer to the memory address of the vector x
incx stride between consecutive elements of x
y pointer to the memory address of the vector y
incy stride between consecutive elements of y
Returns a tf::cudaTask handle

This function copies n elements from a vector x of a step size incx to another vector y of step size incy.

Hence, the performed operation is:

y[j] = x[k],

where j and k are indices of n elements with step sizes incy and incx.

This method calls native cublas<t>copy with packed parameters, (handle, args...), where handle is managed by the cublasFlowCapturer and args... are the given arguments.

template<typename T>
cudaTask tf::cublasFlowCapturer::dot(int n, const T* x, int incx, const T* y, int incy, T* result)

computes the dot product of two vectors

Template parameters
T data type
Parameters
n number of elements to perform the dot product
x pointer to the memory address of the vector x
incx stride between consecutive elements of x
y pointer to the memory address of the vector y
incy stride between consecutive elements of y
result the resulting dot product
Returns a tf::cudaTask handle

This function computes the dot product of vectors x and y:

result = x[0] * y[0] + x[1] * y[1] + ... + x[n-1] * y[n-1],

where consecutive elements of x and y are accessed with step sizes incx and incy.

This method calls native cublas<t>dot with packed parameters, (handle, args...), where handle is managed by the cublasFlowCapturer and args... are the given arguments.
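
For illustration, the sketch below computes the dot product on the device and copies the scalar result back to the host. It assumes hypothetical device vectors d_x and d_y of n floats, a device float d_dot for the result, and a host float h_dot.

auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();

tf::cudaTask dot = cublas->dot(n, d_x, 1, d_y, 1, d_dot);  // d_dot = sum of x[i]*y[i]
tf::cudaTask d2h = capturer.copy(&h_dot, d_dot, 1);        // bring the result back

dot.precede(d2h);  // copy the result only after the dot product completes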

template<typename T>
cudaTask tf::cublasFlowCapturer::nrm2(int n, const T* x, int incx, T* result)

computes the Euclidean norm of a vector

Template parameters
T data type
Parameters
n number of elements in vector x
x pointer to the memory address of the vector
incx stride between consecutive elements of x
result the result
Returns a tf::cudaTask handle

This method calls native cublas<t>nrm2 with packed parameters, (handle, args...), where handle is managed by the cublasFlowCapturer and args... are the given arguments.

template<typename T>
cudaTask tf::cublasFlowCapturer::scal(int n, const T* scalar, T* x, int incx)

scales a vector by a scalar

Template parameters
T data type
Parameters
n number of elements in vector x
scalar scalar used for multiplication
x pointer to the memory address of the vector
incx stride between consecutive elements of x
Returns a tf::cudaTask handle

This method calls native cublas<t>scal with packed parameters, (handle, args...), where handle is managed by the cublasFlowCapturer and args... are the given arguments.
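
As a minimal sketch, the fragment below halves every element of a device vector. It assumes a hypothetical device array d_x of n floats and a managed scalar d_half set to 0.5f.

auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();

// d_x: hypothetical device array of n floats; d_half: managed float set to 0.5f
tf::cudaTask halve = cublas->scal(n, d_half, d_x, 1);  // x[i] = 0.5 * x[i]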

template<typename T>
cudaTask tf::cublasFlowCapturer::swap(int n, T* x, int incx, T* y, int incy)

swaps elements between two vectors

Template parameters
T data type
Parameters
n number of elements to swap between vectors x and y
x pointer to the memory address of the vector x
incx stride between consecutive elements of x
y pointer to the memory address of the vector y
incy stride between consecutive elements of y
Returns a tf::cudaTask handle

This function interchanges the elements of vectors x and y. Hence, the performed operation is:

y[j] <-> x[k],

where j is the index of element in y with a step size incy and k is the index of element in x with a step size incx.

This method calls native cublas<t>swap with packed parameters, (handle, args...), where handle is managed by the cublasFlowCapturer and args... are the given arguments.

template<typename T>
cudaTask tf::cublasFlowCapturer::gemv(cublasOperation_t trans, int m, int n, const T* alpha, const T* A, int lda, const T* x, int incx, const T* beta, T* y, int incy)

performs matrix-vector multiplication

Template parameters
T data type
Parameters
trans transpose operation op(A)
m number of rows of matrix A
n number of columns of matrix A
alpha pointer to the alpha scalar
A pointer to the address of A
lda leading dimension of 2D array used to store the matrix A
x pointer to the address of x of at least (1 + (n - 1) * abs(incx)) elements if no transposition, or (1 + (m - 1) * abs(incx)) elements otherwise.
incx stride between consecutive elements of x
beta pointer to the beta scalar
y pointer to the address of y
incy stride between consecutive elements of y
Returns a tf::cudaTask handle

This function performs matrix-vector multiplication:

y = alpha * op(A) * x + beta * y,

where alpha and beta are scalars, A is a 2D matrix stored in column-major format, and x, y are vectors.

The input matrices are in column-major storage.

This method calls native cublas<t>gemv with packed parameters, (handle, args...), where handle is managed by the cublasFlowCapturer and args... are the given arguments.
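
As a minimal sketch, the fragment below forms y = alpha * A * x + beta * y without transposition. It assumes a hypothetical m-by-n column-major matrix d_A stored with leading dimension m, device vectors d_x (n floats) and d_y (m floats), and managed scalars d_alpha and d_beta.

auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();

// A is m x n in column-major with lda = m; y = alpha * A * x + beta * y
tf::cudaTask gemv = cublas->gemv(
  CUBLAS_OP_N, m, n, d_alpha, d_A, m, d_x, 1, d_beta, d_y, 1
);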

template<typename T>
cudaTask tf::cublasFlowCapturer::symv(cublasFillMode_t uplo, int n, const T* alpha, const T* A, int lda, const T* x, int incx, const T* beta, T* y, int incy)

performs symmetric matrix-vector multiplication

Template parameters
T data type
Parameters
uplo indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements
n number of rows and columns of matrix A
alpha pointer to the alpha scalar
A pointer to the address of A
lda leading dimension of 2D array used to store the matrix A
x pointer to the address of x
incx stride between consecutive elements of x
beta pointer to the beta scalar
y pointer to the address of y
incy stride between consecutive elements of y
Returns a tf::cudaTask handle

This function performs symmetric matrix-vector multiplication:

y = alpha * A * x + beta * y,

where alpha and beta are scalars, A is a 2D symmetric matrix stored in column-major format, and x, y are vectors

This method calls native cublas<t>symv with packed parameters, (handle, args...), where handle is managed by the cublasFlowCapturer and args... are the given arguments.

template<typename T>
cudaTask tf::cublasFlowCapturer::syr(cublasFillMode_t uplo, int n, const T* alpha, const T* x, int incx, T* A, int lda)

performs symmetric rank-1 update

Template parameters
T data type
Parameters
uplo indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements
n number of rows and columns of matrix A
alpha pointer to the alpha scalar
x pointer to the address of x
incx stride between consecutive elements of x
A pointer to the address of A
lda leading dimension of 2D array used to store the matrix A
Returns a tf::cudaTask handle

This function performs symmetric rank-1 update:

A = alpha * x * x^T + A,

where alpha is a scalar, A is a 2D symmetric matrix stored in column-major format, and x is a vector.

The result is also symmetric and is stored in the uplo part of A.

This method calls native cublas<t>syr with packed parameters, (handle, args...), where handle is managed by the cublasFlowCapturer and args... are the given arguments.

template<typename T>
cudaTask tf::cublasFlowCapturer::syr2(cublasFillMode_t uplo, int n, const T* alpha, const T* x, int incx, const T* y, int incy, T* A, int lda)

performs symmetric rank-2 update

Template parameters
T data type
Parameters
uplo indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements
n number of rows and columns of matrix A
alpha pointer to the alpha scalar
x pointer to the address of x
incx stride between consecutive elements of x
y pointer to the address of y
incy stride between consecutive elements of y
A pointer to the address of A
lda leading dimension of 2D array used to store the matrix A
Returns a tf::cudaTask handle

This function performs symmetric rank-2 update:

A = alpha * x * y^T + y * x^T + A,

where alpha is a scalar, A is a 2D symmetric matrix stored in column-major format, and x and y are vectors.

The result is also symmetric and is stored in the uplo part of A.

This method calls native cublas<t>syr2 with packed parameters, (handle, args...), where handle is managed by the cublasFlowCapturer and args... are the given arguments.

template<typename T>
cudaTask tf::cublasFlowCapturer::trmv(cublasFillMode_t uplo, cublasOperation_t tran, cublasDiagType_t diag, int n, const T* A, int lda, T* x, int incx)

performs the triangular matrix-vector multiplication

Template parameters
T data type
Parameters
uplo indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements
tran transpose operation op(A)
diag indicates if the elements on the main diagonal of matrix A are unity (i.e., all 1s) and need not be accessed
n number of rows and columns of matrix A
A pointer to the address of A
lda leading dimension of 2D array used to store matrix A
x pointer to the address of x; it holds the input vector on entry and is overwritten with the result on exit
incx stride between consecutive elements of x

This method performs the triangular matrix-vector multiplication:

x = op(A) * x,

where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, and x is a vector.

template<typename T>
cudaTask tf::cublasFlowCapturer::trsv(cublasFillMode_t uplo, cublasOperation_t tran, cublasDiagType_t diag, int n, const T* A, int lda, T* x, int incx)

solves the triangular linear system with a single right-hand-side

Template parameters
T data type
Parameters
uplo indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements
tran transpose operation op(A)
diag indicates if the elements on the main diagonal of matrix A are unity (i.e., all 1s) and need not be accessed
n number of rows and columns of matrix A
A pointer to the address of A
lda leading dimension of 2D array used to store matrix A
x holds the right-hand-side vector b on entry and the solution on exit
incx stride between consecutive elements of x

This method solves the triangular linear system with a single right-hand-side:

op(A) x = b,

where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, and x and b are vectors.
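
As a minimal sketch, the fragment below solves a lower-triangular system A * x = b in place. It assumes a hypothetical n-by-n lower-triangular matrix d_A stored in column-major format with leading dimension n, and a device vector d_x that holds b on entry and receives the solution.

auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();

// solve A * x = b with A lower triangular and a non-unit diagonal;
// d_x holds b on entry and the solution on exit
tf::cudaTask solve = cublas->trsv(
  CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, d_A, n, d_x, 1
);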

template<typename T>
cudaTask tf::cublasFlowCapturer::geam(cublasOperation_t ta, cublasOperation_t tb, int m, int n, const T* alpha, const T* A, int lda, const T* beta, const T* B, int ldb, T* C, int ldc)

performs matrix-matrix addition and transposition

Template parameters
T data type
Parameters
ta transpose operation op(A)
tb transpose operation op(B)
m number of rows of matrix C and op(A)
n number of columns of matrix C and op(B)
alpha pointer to the alpha scalar
A pointer to the address of A
lda leading dimension of 2D array used to store the matrix A
beta pointer to the beta scalar
B pointer to the address of B
ldb leading dimension of 2D array used to store the matrix B
C pointer to the address of C
ldc leading dimension of 2D array used to store the matrix C
Returns a tf::cudaTask handle

This method performs the matrix-matrix addition/transposition:

C = alpha * op(A) + beta * op(B),

where alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions op(A) as m by n, op(B) as m by n and C as m by n, respectively.

The operation is out-of-place if C does not overlap A or B.

The in-place mode supports the following two operations:

  1. C = alpha * C + beta * op(B)
  2. C = alpha * op(A) + beta * C

For in-place mode, if C equals A, then ldc must equal lda and ta must be CUBLAS_OP_N; if C equals B, then ldc must equal ldb and tb must be CUBLAS_OP_N.

The operation includes the following special cases:

  1. the user can reset matrix C to zero by setting alpha and beta to 0
  2. the user can transpose matrix A by setting alpha to 1 and beta to 0

The input matrices are in column-major storage.

This method calls native cublas<t>geam with packed parameters, (handle, args...), where handle is managed by the cublasFlowCapturer and args... are the given arguments.
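
As a minimal sketch, the fragment below uses the second special case to materialize the transpose of a matrix. It assumes a hypothetical m-by-n column-major matrix d_A (leading dimension m), an n-by-m output matrix d_C (leading dimension n), and managed scalars d_one and d_zero holding 1 and 0.

auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();

// C (n x m) = 1 * A^T + 0 * C; passing d_C for B uses the in-place form
// C = alpha * op(A) + beta * C, so B contributes nothing with beta = 0
tf::cudaTask transpose = cublas->geam(
  CUBLAS_OP_T, CUBLAS_OP_N, n, m,
  d_one, d_A, m, d_zero, d_C, n, d_C, n
);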

template<typename T>
cudaTask tf::cublasFlowCapturer::gemm(cublasOperation_t ta, cublasOperation_t tb, int m, int n, int k, const T* alpha, const T* A, int lda, const T* B, int ldb, const T* beta, T* C, int ldc)

performs matrix-matrix multiplication

Template parameters
T data type
Parameters
ta transpose operation op(A)
tb transpose operation op(B)
m number of rows of matrix C and op(A)
n number of columns of matrix C and op(B)
k number of columns of op(A) and rows of op(B)
alpha pointer to the alpha scalar
A pointer to the address of A
lda leading dimension of 2D array used to store the matrix A
B pointer to the address of B
ldb leading dimension of 2D array used to store the matrix B
beta pointer to the beta scalar
C pointer to the address of C
ldc leading dimension of 2D array used to store the matrix C
Returns a tf::cudaTask handle

This function performs matrix-matrix multiplication:

C = alpha * op (A) * op (B) + beta * C,

where alpha and beta are scalars, and A, B, and C are 2D matrices stored in column-major format with dimension op(A) as m by k, dimension op(B) as k by n, and C as m by n.

The input matrices are in column-major storage.

This method calls native cublas<t>gemm with packed parameters, (handle, args...), where handle is managed by the cublasFlowCapturer and args... are the given arguments.
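
As a minimal sketch, the fragment below multiplies two column-major matrices. It assumes hypothetical device matrices d_A (m-by-k, leading dimension m), d_B (k-by-n, leading dimension k), d_C (m-by-n, leading dimension m), and managed scalars d_alpha and d_beta.

auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();

// C = alpha * A * B + beta * C, all matrices tightly packed in column-major format
tf::cudaTask gemm = cublas->gemm(
  CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
  d_alpha, d_A, m, d_B, k, d_beta, d_C, m
);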

template<typename T>
cudaTask tf::cublasFlowCapturer::gemm_batched(cublasOperation_t ta, cublasOperation_t tb, int m, int n, int k, const T* alpha, const T* A[], int lda, const T* B[], int ldb, const T* beta, T* C[], int ldc, int bc)

performs matrix-matrix multiplication over a batch of matrices

Template parameters
T data type
Parameters
ta transpose operation op(A[i])
tb transpose operation op(B[i])
m number of rows of matrix C[i] and op(A[i])
n number of columns of matrix C[i] and op(B[i])
k number of columns of op(A[i]) and rows of op(B[i])
alpha pointer to the alpha scalar
A array pointer to A batch
lda leading dimension of 2D array used to store the matrix A[i]
B array pointer to B batch
ldb leading dimension of 2D array used to store the matrix B[i]
beta pointer to the beta scalar
C array pointer to C batch
ldc leading dimension of 2D array used to store the matrix C[i]
bc batch size (number of matrices)
Returns a tf::cudaTask handle

The batch must be uniform. All instances in the batch must have the same dimensions (m, n, k), leading dimensions (lda, ldb, ldc) and transpositions (ta, tb) for their respective A, B and C matrices. The address of the input matrices and the output matrix of each instance of the batch are read from arrays of pointers passed to the function by the caller.

C[i]= alpha * op (A[i]) * op (B[i]) + beta * C[i], i in [0, bc),

where alpha and beta are scalars, and A[i], B[i], and C[i] are 2D matrices stored in column-major format with dimension op(A) as m by k, dimension op(B) as k by n, and C as m by n.

The input matrices are in column-major storage.

This method calls native cublas<t>gemmBatched with packed parameters, (handle, args...), where handle is managed by the cublasFlowCapturer and args... are the given arguments.
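
As a minimal sketch, the fragment below multiplies bc pairs of matrices in one task. It assumes the bc instances of A, B, and C are packed back-to-back in hypothetical device allocations d_A, d_B, and d_C, and it builds the required pointer arrays in managed memory since, like every other pointer argument, they must be accessible from the GPU.

// hypothetical pointer arrays in managed memory, one entry per batch instance
const float** pA;  const float** pB;  float** pC;
cudaMallocManaged(&pA, bc * sizeof(float*));
cudaMallocManaged(&pB, bc * sizeof(float*));
cudaMallocManaged(&pC, bc * sizeof(float*));
for(int i = 0; i < bc; ++i) {
  pA[i] = d_A + i*m*k;  // the i-th m x k matrix
  pB[i] = d_B + i*k*n;  // the i-th k x n matrix
  pC[i] = d_C + i*m*n;  // the i-th m x n matrix
}

taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
  auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();
  // C[i] = alpha * A[i] * B[i] + beta * C[i] for each of the bc instances
  cublas->gemm_batched(
    CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
    d_alpha, pA, m, pB, k, d_beta, pC, m, bc
  );
});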

template<typename T>
cudaTask tf::cublasFlowCapturer::gemm_sbatched(cublasOperation_t ta, cublasOperation_t tb, int m, int n, int k, const T* alpha, const T* A, int lda, long long int sA, const T* B, int ldb, long long int sB, const T* beta, T* C, int ldc, long long int sC, int bc)

performs matrix-matrix multiplication over a batch of matrices with strided memory access

Template parameters
T data type
Parameters
ta transpose operation op(A[i])
tb transpose operation op(B[i])
m number of rows of matrix C[i] and op(A[i])
n number of columns of matrix C[i] and op(B[i])
k number of columns of op(A[i]) and rows of op(B[i])
alpha pointer to the alpha scalar
A pointer to A batch
lda leading dimension of 2D array used to store the matrix A[i]
sA address offset between A[i] and A[i+1]
B pointer to B batch
ldb leading dimension of 2D array used to store the matrix B[i]
sB address offset between B[i] and B[i+1]
beta pointer to the beta scalar
C pointer to C batch
ldc leading dimension of 2D array used to store the matrix C[i]
sC address offset between C[i] and C[i+1]
bc batch size (number of matrices)
Returns a tf::cudaTask handle

Here, we use A[i], B[i], C[i] as notation for A, B and C matrices in the i-th instance of the batch, implicitly assuming they are respectively address offsets sA, sB, sC away from A[i-1], B[i-1], C[i-1].

The input matrices are in column-major storage.

This method calls native cublas<t>gemmStridedBatched with packed parameters, (handle, args...), where handle is managed by the cublasFlowCapturer and args... are the given arguments.

The batch must be uniform. All instances in the batch must have the same dimensions (m, n, k), leading dimensions (lda, ldb, ldc) and transpositions (ta, tb) for their respective A, B and C matrices. Input matrices A, B and output matrix C for each instance of the batch are located at fixed address offsets from their locations in the previous instance. Pointers to A, B and C matrices for the first instance are passed to the function by the user along with the address offsets - sA, sB and sC that determine the locations of input and output matrices in future instances.

C + i*sC = alpha * op (A + i*sA) * op (B + i*sB) + beta * (C + i*sC), i in [0, bc),

where alpha and beta are scalars, and A[i], B[i], and C[i] are 2D matrices stored in column-major format with dimension op(A) as m by k, dimension op(B) as k by n, and C as m by n.

For certain problem sizes, it might be advantageous to create multiple gemm tasks and take advantage of concurrent kernels, rather than using this method.
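
As a minimal sketch, the fragment below expresses the same batched multiplication through strided access, which avoids building pointer arrays. It assumes the bc instances of A, B, and C are packed back-to-back in hypothetical device allocations d_A, d_B, and d_C, with managed scalars d_alpha and d_beta.

auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();

// the i-th instance starts i*m*k, i*k*n, and i*m*n elements into d_A, d_B, d_C
cublas->gemm_sbatched(
  CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
  d_alpha, d_A, m, (long long int)m*k,
           d_B, k, (long long int)k*n,
  d_beta,  d_C, m, (long long int)m*n,
  bc
);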

template<typename T>
cudaTask tf::cublasFlowCapturer::symm(cublasSideMode_t side, cublasFillMode_t uplo, int m, int n, const T* alpha, const T* A, int lda, const T* B, int ldb, const T* beta, T* C, int ldc)

performs the symmetric matrix-matrix multiplication

Template parameters
T data type
Parameters
side indicates if matrix A is on the left or right of B.
uplo indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements.
m number of rows of matrix C and B, with matrix A sized accordingly
n number of columns of matrix C and B, with matrix A sized accordingly
alpha scalar used for multiplication
A pointer to the address of matrix A
lda leading dimension of the 2D array used to store A
B pointer to the address of matrix B
ldb leading dimension of the 2D array used to store B
beta scalar used for multiplication
C pointer to the address of matrix C
ldc leading dimension of the 2D array used to store C

The method performs symmetric matrix-matrix multiplication:

C = alpha * A * B + beta * C, if side == CUBLAS_SIDE_LEFT, or

C = alpha * B * A + beta * C, if side == CUBLAS_SIDE_RIGHT.

A is a symmetric matrix stored in lower or upper mode, B and C are m by n matrices, and alpha and beta are scalars.

This method calls native cublas<t>symm with packed parameters, (handle, args...), where handle is managed by the cublasFlowCapturer and args... are the given arguments.

template<typename T>
cudaTask tf::cublasFlowCapturer::syrk(cublasFillMode_t uplo, cublasOperation_t tran, int n, int k, const T* alpha, const T* A, int lda, const T* beta, T* C, int ldc)

performs the symmetric rank-k update

Template parameters
T data type
Parameters
uplo indicates if matrix C lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements.
tran transposition operation to apply to A
n number of rows of matrix C and op(A)
k number of columns of matrix op(A)
alpha scalar used for multiplication
A pointer to the address of A
lda leading dimension of the 2D array used to store A
beta scalar used for multiplication
C pointer to the address of C
ldc leading dimension of the 2D array used to store C

This method performs the symmetric rank-k update :

C = alpha * op(A) * op(A)^T + beta * C,

where alpha and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A is a matrix with dimension op(A) n by k.

The result is stored to uplo part of C.

This method calls native cublas<t>syrk with packed parameters, (handle, args...), where handle is managed by the cublasFlowCapturer and args... are the given arguments.

template<typename T>
cudaTask tf::cublasFlowCapturer::syr2k(cublasFillMode_t uplo, cublasOperation_t tran, int n, int k, const T* alpha, const T* A, int lda, const T* B, int ldb, const T* beta, T* C, int ldc)

performs the symmetric rank-2k update

Template parameters
T data type
Parameters
uplo indicates if matrix C lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements.
tran transposition operation to apply to A
n number of rows of matrix C and op(A)
k number of columns of matrix op(A)
alpha scalar used for multiplication
A pointer to the address of A
lda leading dimension of the 2D array used to store A
B pointer to the address of B
ldb leading dimension of the 2D array used to store B
beta scalar used for multiplication
C pointer to the address of C
ldc leading dimension of the 2D array used to store C

This method performs the symmetric rank-2k update :

C = alpha * (op(A) * op(B)^T + op(B) * op(A)^T) + beta * C,

where alpha and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A and B are two matrices with dimensions op(A) and op(B) n by k.

The result is stored to uplo part of C.

This method calls native cublas<t>syr2k with packed parameters, (handle, args...), where handle is managed by the cublasFlowCapturer and args... are the given arguments.

template<typename T>
cudaTask tf::cublasFlowCapturer::syrkx(cublasFillMode_t uplo, cublasOperation_t tran, int n, int k, const T* alpha, const T* A, int lda, const T* B, int ldb, const T* beta, T* C, int ldc)

performs a variation of the symmetric rank-k update

Template parameters
T data type
Parameters
uplo indicates if matrix C lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements.
tran transposition operation to apply to A
n number of rows of matrix C and op(A)
k number of columns of matrix op(A)
alpha scalar used for multiplication
A pointer to the address of A
lda leading dimension of the 2D array used to store A
B pointer to the address of B
ldb leading dimension of the 2D array used to store B
beta scalar used for multiplication
C pointer to the address of C
ldc leading dimension of the 2D array used to store C

This method performs a variation of the symmetric rank-k update:

C = alpha * op(A) * op(B)^T + beta * C,

where alpha and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A and B are two matrices with dimensions op(A) and op(B) n by k.

The result is stored to uplo part of C.

This method calls native cublas<t>syrkx with packed parameters, (handle, args...), where handle is managed by the cublasFlowCapturer and args... are the given arguments.

template<typename T>
cudaTask tf::cublasFlowCapturer::trmm(cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t tran, cublasDiagType_t diag, int m, int n, const T* alpha, const T* A, int lda, const T* B, int ldb, T* C, int ldc)

performs triangular matrix-matrix multiplication

Template parameters
T data type
Parameters
side indicates if matrix A is on the left or right of B
uplo indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements
tran transposition operation to apply to A
diag indicates if the elements on the main diagonal of matrix A are unity and should not be accessed.
m number of rows of matrix B, with matrix A sized accordingly
n number of columns of matrix B, with matrix A sized accordingly
alpha scalar used for multiplication
A pointer to the address of matrix A
lda leading dimension of the 2D array used to store A
B pointer to the address of matrix B
ldb leading dimension of the 2D array used to store B
C pointer to the address of matrix C
ldc leading dimension of the 2D array used to store C

This method performs triangular matrix-matrix multiplication:

C = alpha * op(A) * B, if side == CUBLAS_SIDE_LEFT, or

C = alpha * B * op(A), if side == CUBLAS_SIDE_RIGHT,

where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, B and C are m by n matrices, and alpha is a scalar.

This method calls native cublas<t>trmm with packed parameters, (handle, args...), where handle is managed by the cublasFlowCapturer and args... are the given arguments.

Notice that in this method, B and C can point to the same address in which case the in-place implementation is performed (with results written back to B).

template<typename T>
cudaTask tf::cublasFlowCapturer::trsm(cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t tran, cublasDiagType_t diag, int m, int n, const T* alpha, const T* A, int lda, T* B, int ldb)

solves the triangular linear system with multiple right-hand-sides

Template parameters
T data type
Parameters
side indicates if A is on the left or right side of X
uplo indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements
tran transposition operation to apply to A
diag indicates if the elements on the main diagonal of matrix A are unity and should not be accessed
m number of rows in matrix B, with matrix A sized accordingly
n number of columns in matrix B, with matrix A sized accordingly
alpha scalar to apply to B
A pointer to the address of matrix A
lda leading dimension of the 2D array used to store A
B pointer to the address of matrix B
ldb leading dimension of the 2D array used to store B

This method solves the triangular linear system with multiple right-hand-sides:

op(A) * X = alpha * B, if side == CUBLAS_SIDE_LEFT, or

X * op(A) = alpha * B, if side == CUBLAS_SIDE_RIGHT,

where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, X and B are m by n matrices, and alpha is a scalar.

The solution X overwrites the right-hand-sides B on exit.

This method calls native cublas<t>trsm with packed parameters, (handle, args...), where handle is managed by the cublasFlowCapturer and args... are the given arguments.
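
As a minimal sketch, the fragment below solves A * X = B for X with a lower-triangular A. It assumes a hypothetical m-by-m lower-triangular column-major matrix d_A (leading dimension m), an m-by-n right-hand-side matrix d_B (leading dimension m) that is overwritten with X, and a managed scalar d_one set to 1.

auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();

// solve A * X = 1 * B with A lower triangular and a non-unit diagonal;
// the solution X overwrites d_B
tf::cudaTask solve = cublas->trsm(
  CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
  m, n, d_one, d_A, m, d_B, m
);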