class
cublasFlowCapturer: class to construct a cuBLAS task graph
Contents
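cublasFlowCapturer provides a higher-level interface over the cuBLAS library and hides concurrency details from users. It inherits methods from tf::cudaFlowCapturerBase. All pointers passed to its methods must be in GPU memory space or managed (i.e., cudaMallocManaged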
), including scalars, alpha
and beta
, input data and output data pointers. The following example uses cublas<t>amax
to find the minimum index of the element of the maximum absolute magnitude in a vector.
#include <taskflow/cublasflow.hpp>

int main() {
  tf::Executor executor;
  tf::Taskflow taskflow;

  size_t N = 1024;
  float *x = nullptr;
  int *d_res;
  int h_res;

  std::vector<float> host(N, 0.0f);
  host[512] = 100.0f;  // artificially set the mid-position to the largest

  cudaMalloc(&x, N*sizeof(float));
  cudaMalloc(&d_res, sizeof(int));

  taskflow.emplace([&](tf::cudaFlowCapturer& capturer){
    auto* cublas = capturer.make_capturer<tf::cublasFlowCapturer>();
    tf::cudaTask h2d      = capturer.copy(x, host.data(), N);
    tf::cudaTask find_max = cublas->amax(N, x, 1, d_res);
    tf::cudaTask d2h      = capturer.copy(&h_res, d_res, 1);
    h2d.precede(find_max);  // amax runs after host-to-device copy
    find_max.precede(d2h);  // amax runs before device-to-host copy
  });

  executor.run(taskflow).wait();

  assert(h_res == 513);  // amax reports a 1-based index, so host[512] is position 513
}
Currently, cublasFlowCapturer supports only float
and double
data types.
We design most tf::cublasFlowCapturer methods on top of the native, high-performance cuBLAS library.
Base classes
- class cudaFlowCapturerBase
- base class to construct a CUDA task graph through stream capture
Constructors, destructors, conversion operators
- cublasFlowCapturer() defaulted
- constructs a cublas flow capturer
Public functions
- auto native_handle() -> cublasHandle_t
- gets the native cublas handle associated with this cublasFlowCapturer
-
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>auto vset(size_t n, const T* h, int inch, T* d, int incd) -> cudaTask
- copies vector data from host to device
-
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>auto vget(size_t n, const T* d, int incd, T* h, int inch) -> cudaTask
- copies vector data from device to host
-
template<typename T>auto amax(int n, const T* x, int incx, int* result) -> cudaTask
- finds the smallest index of the element of the maximum absolute magnitude
-
template<typename T>auto amin(int n, const T* x, int incx, int* result) -> cudaTask
- finds the smallest index of the element of the minimum absolute magnitude
-
template<typename T>auto asum(int n, const T* x, int incx, T* result) -> cudaTask
- finds the sum of absolute values of the elements over a vector
-
template<typename T>auto axpy(int n, const T* alpha, const T* x, int incx, T* y, int incy) -> cudaTask
- multiplies a vector by a scalar and adds it to a vector
-
template<typename T>auto vcopy(int n, const T* x, int incx, T* y, int incy) -> cudaTask
- copies a vector to another vector
-
template<typename T>auto dot(int n, const T* x, int incx, const T* y, int incy, T* result) -> cudaTask
- computes the dot product of two vectors
-
template<typename T>auto nrm2(int n, const T* x, int incx, T* result) -> cudaTask
- computes the Euclidean norm of a vector
-
template<typename T>auto scal(int n, const T* scalar, T* x, int incx) -> cudaTask
- scales a vector by a scalar
-
template<typename T>auto swap(int n, T* x, int incx, T* y, int incy) -> cudaTask
- swaps elements between two vectors
-
template<typename T>auto gemv(cublasOperation_t trans, int m, int n, const T* alpha, const T* A, int lda, const T* x, int incx, const T* beta, T* y, int incy) -> cudaTask
- performs matrix-vector multiplication
-
template<typename T>auto c_gemv(cublasOperation_t trans, int m, int n, const T* alpha, const T* A, int lda, const T* x, int incx, const T* beta, T* y, int incy) -> cudaTask
- similar to tf::
cublasFlowCapturer:: gemv but operates on C-styled row-major layout -
template<typename T>auto symv(cublasFillMode_t uplo, int n, const T* alpha, const T* A, int lda, const T* x, int incx, const T* beta, T* y, int incy) -> cudaTask
- performs symmetric matrix-vector multiplication
-
template<typename T>auto c_symv(cublasFillMode_t uplo, int n, const T* alpha, const T* A, int lda, const T* x, int incx, const T* beta, T* y, int incy) -> cudaTask
- similar to tf::
cublasFlowCapturer:: symv but operates on C-styled row-major layout -
template<typename T>auto syr(cublasFillMode_t uplo, int n, const T* alpha, const T* x, int incx, T* A, int lda) -> cudaTask
- performs symmetric rank-1 update
-
template<typename T>auto c_syr(cublasFillMode_t uplo, int n, const T* alpha, const T* x, int incx, T* A, int lda) -> cudaTask
- similar to tf::
cublasFlowCapturer:: syr but operates on C-styled row-major layout -
template<typename T>auto syr2(cublasFillMode_t uplo, int n, const T* alpha, const T* x, int incx, const T* y, int incy, T* A, int lda) -> cudaTask
- performs symmetric rank-2 update
-
template<typename T>auto c_syr2(cublasFillMode_t uplo, int n, const T* alpha, const T* x, int incx, const T* y, int incy, T* A, int lda) -> cudaTask
- similar to tf::
cublasFlowCapturer:: syr2 but operates on C-styled row-major layout -
template<typename T>auto trmv(cublasFillMode_t uplo, cublasOperation_t tran, cublasDiagType_t diag, int n, const T* A, int lda, T* x, int incx) -> cudaTask
- performs the triangular matrix-vector multiplication
-
template<typename T>auto c_trmv(cublasFillMode_t uplo, cublasOperation_t tran, cublasDiagType_t diag, int n, const T* A, int lda, T* x, int incx) -> cudaTask
- similar to tf::
cublasFlowCapturer:: trmv but operates on C-styled row-major layout -
template<typename T>auto trsv(cublasFillMode_t uplo, cublasOperation_t tran, cublasDiagType_t diag, int n, const T* A, int lda, T* x, int incx) -> cudaTask
- solves the triangular linear system with a single right-hand-side
-
template<typename T>auto c_trsv(cublasFillMode_t uplo, cublasOperation_t tran, cublasDiagType_t diag, int n, const T* A, int lda, T* x, int incx) -> cudaTask
- similar to tf::
cublasFlowCapturer:: trsv but operates on C-styled row-major layout -
template<typename T>auto geam(cublasOperation_t ta, cublasOperation_t tb, int m, int n, const T* alpha, const T* A, int lda, const T* beta, const T* B, int ldb, T* C, int ldc) -> cudaTask
- performs matrix-matrix addition and transposition
-
template<typename T>auto c_geam(cublasOperation_t ta, cublasOperation_t tb, int m, int n, const T* alpha, const T* A, int lda, const T* beta, const T* B, int ldb, T* C, int ldc) -> cudaTask
- similar to tf::
cublasFlowCapturer:: geam but operates on C-styled row-major layout -
template<typename T>auto gemm(cublasOperation_t ta, cublasOperation_t tb, int m, int n, int k, const T* alpha, const T* A, int lda, const T* B, int ldb, const T* beta, T* C, int ldc) -> cudaTask
- performs matrix-matrix multiplication
-
template<typename T>auto c_gemm(cublasOperation_t ta, cublasOperation_t tb, int m, int n, int k, const T* alpha, const T* A, int lda, const T* B, int ldb, const T* beta, T* C, int ldc) -> cudaTask
- similar to tf::
cublasFlowCapturer:: gemm but operates on C-styled row-major layout -
template<typename T>auto gemm_batched(cublasOperation_t ta, cublasOperation_t tb, int m, int n, int k, const T* alpha, const T* A[], int lda, const T* B[], int ldb, const T* beta, T* C[], int ldc, int bc) -> cudaTask
- performs matrix-matrix multiplication over a batch of matrices
-
template<typename T>auto c_gemm_batched(cublasOperation_t ta, cublasOperation_t tb, int m, int n, int k, const T* alpha, const T* A[], int lda, const T* B[], int ldb, const T* beta, T* C[], int ldc, int bc) -> cudaTask
- similar to tf::
cublasFlowCapturer:: gemm_batched but operates on C-styled row-major layout -
template<typename T>auto gemm_sbatched(cublasOperation_t ta, cublasOperation_t tb, int m, int n, int k, const T* alpha, const T* A, int lda, long long int sA, const T* B, int ldb, long long int sB, const T* beta, T* C, int ldc, long long int sC, int bc) -> cudaTask
- performs matrix-matrix multiplication over a batch of matrices with strided memory access
-
template<typename T>auto c_gemm_sbatched(cublasOperation_t ta, cublasOperation_t tb, int m, int n, int k, const T* alpha, const T* A, int lda, long long int sA, const T* B, int ldb, long long int sB, const T* beta, T* C, int ldc, long long int sC, int bc) -> cudaTask
- similar to tf::
cublasFlowCapturer:: gemm_sbatched but operates on C-styled row-major layout -
template<typename T>auto symm(cublasSideMode_t side, cublasFillMode_t uplo, int m, int n, const T* alpha, const T* A, int lda, const T* B, int ldb, const T* beta, T* C, int ldc) -> cudaTask
- performs the symmetric matrix-matrix multiplication
-
template<typename T>auto c_symm(cublasSideMode_t side, cublasFillMode_t uplo, int m, int n, const T* alpha, const T* A, int lda, const T* B, int ldb, const T* beta, T* C, int ldc) -> cudaTask
- similar to tf::
cublasFlowCapturer:: symm but operates on C-styled row-major layout -
template<typename T>auto syrk(cublasFillMode_t uplo, cublasOperation_t tran, int n, int k, const T* alpha, const T* A, int lda, const T* beta, T* C, int ldc) -> cudaTask
- performs the symmetric rank-k update
-
template<typename T>auto c_syrk(cublasFillMode_t uplo, cublasOperation_t tran, int n, int k, const T* alpha, const T* A, int lda, const T* beta, T* C, int ldc) -> cudaTask
- similar to tf::
cublasFlowCapturer:: syrk but operates on C-styled row-major layout -
template<typename T>auto syr2k(cublasFillMode_t uplo, cublasOperation_t tran, int n, int k, const T* alpha, const T* A, int lda, const T* B, int ldb, const T* beta, T* C, int ldc) -> cudaTask
- performs the symmetric rank-2k update
-
template<typename T>auto c_syr2k(cublasFillMode_t uplo, cublasOperation_t tran, int n, int k, const T* alpha, const T* A, int lda, const T* B, int ldb, const T* beta, T* C, int ldc) -> cudaTask
- similar to tf::
cublasFlowCapturer:: syr2k but operates on C-styled row-major layout -
template<typename T>auto syrkx(cublasFillMode_t uplo, cublasOperation_t tran, int n, int k, const T* alpha, const T* A, int lda, const T* B, int ldb, const T* beta, T* C, int ldc) -> cudaTask
- performs a variation of the symmetric rank-k update
-
template<typename T>auto c_syrkx(cublasFillMode_t uplo, cublasOperation_t tran, int n, int k, const T* alpha, const T* A, int lda, const T* B, int ldb, const T* beta, T* C, int ldc) -> cudaTask
- similar to tf::
cublasFlowCapturer:: syrkx but operates on C-styled row-major layout -
template<typename T>auto trmm(cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t tran, cublasDiagType_t diag, int m, int n, const T* alpha, const T* A, int lda, const T* B, int ldb, T* C, int ldc) -> cudaTask
- performs triangular matrix-matrix multiplication
-
template<typename T>auto c_trmm(cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t tran, cublasDiagType_t diag, int m, int n, const T* alpha, const T* A, int lda, const T* B, int ldb, T* C, int ldc) -> cudaTask
- similar to tf::
cublasFlowCapturer:: trmm but operates on C-styled row-major layout -
template<typename T>auto trsm(cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t tran, cublasDiagType_t diag, int m, int n, const T* alpha, const T* A, int lda, T* B, int ldb) -> cudaTask
- solves the triangular linear system with multiple right-hand-sides
-
template<typename T>auto c_trsm(cublasSideMode_t side, cublasFillMode_t uplo, cublasOperation_t tran, cublasDiagType_t diag, int m, int n, const T* alpha, const T* A, int lda, T* B, int ldb) -> cudaTask
- similar to tf::
cublasFlowCapturer:: trsm but operates on C-styled row-major layout
Function documentation
cublasHandle_t tf:: cublasFlowCapturer:: native_handle()
gets the native cublas handle associated with this cublasFlowCapturer
Returns | a native cublas handle of type cublasHandle_t |
---|
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
cudaTask tf:: cublasFlowCapturer:: vset(size_t n,
const T* h,
int inch,
T* d,
int incd)
copies vector data from host to device
Template parameters | |
---|---|
T | data type |
Parameters | |
n | number of elements |
h | source host pointer |
inch | spacing between consecutive elements in h |
d | target device pointer |
incd | spacing between consecutive elements in d |
Returns | a tf::cudaTask handle |
This method copies n
elements from a vector h
in host memory space to a vector d
in GPU memory space. The storage spacing between consecutive elements is given by inch
for the source vector h
and by incd
for the destination vector d
.
This method calls native cublasSetVectorAsync
with packed parameters, (handle, args...)
, where handle
is managed by the cublasFlowCapturer and args
... are the given arguments.
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
cudaTask tf:: cublasFlowCapturer:: vget(size_t n,
const T* d,
int incd,
T* h,
int inch)
copies vector data from device to host
Template parameters | |
---|---|
T | data type |
Parameters | |
n | number of elements |
d | source device pointer |
incd | spacing between consecutive elements in d |
h | target host pointer |
inch | spacing between consecutive elements in h |
Returns | a tf::cudaTask handle |
This method copies n
elements from a vector d
in GPU memory space to a vector h
in host memory space. The storage spacing between consecutive elements is given by inch
for the target vector h
and by incd
for the source vector d
.
This method calls native cublasGetVectorAsync
with packed parameters, (handle, args...)
, where handle
is managed by the cublasFlowCapturer and args
... are the given arguments.
template<typename T>
cudaTask tf:: cublasFlowCapturer:: amax(int n,
const T* x,
int incx,
int* result)
finds the smallest index of the element of the maximum absolute magnitude
Template parameters | |
---|---|
T | data type |
Parameters | |
n | number of elements in vector x |
x | pointer to the memory address of the vector |
incx | stride between consecutive elements of x |
result | the resulting index (1-based indexing) |
Returns | a tf::cudaTask handle |
This method calls native cublas<t>amax
with packed parameters, (handle, args...)
, where handle
is managed by the cublasFlowCapturer and args
... are the given arguments.
template<typename T>
cudaTask tf:: cublasFlowCapturer:: amin(int n,
const T* x,
int incx,
int* result)
finds the smallest index of the element of the minimum absolute magnitude
Template parameters | |
---|---|
T | data type |
Parameters | |
n | number of elements in vector x |
x | pointer to the memory address of the vector |
incx | stride between consecutive elements of x |
result | the resulting index (1-based indexing) |
Returns | a tf::cudaTask handle |
This method calls native cublas<t>amin
with packed parameters, (handle, args...)
, where handle
is managed by the cublasFlowCapturer and args
... are the given arguments.
template<typename T>
cudaTask tf:: cublasFlowCapturer:: asum(int n,
const T* x,
int incx,
T* result)
finds the sum of absolute values of the elements over a vector
Template parameters | |
---|---|
T | data type |
Parameters | |
n | number of elements in vector x |
x | pointer to the memory address of the vector |
incx | stride between consecutive elements of x |
result | the result |
Returns | a tf::cudaTask handle |
This method calls native cublas<t>asum
with packed parameters, (handle, args...)
, where handle
is managed by the cublasFlowCapturer and args
... are the given arguments.
template<typename T>
cudaTask tf:: cublasFlowCapturer:: axpy(int n,
const T* alpha,
const T* x,
int incx,
T* y,
int incy)
multiplies a vector by a scalar and adds it to a vector
Template parameters | |
---|---|
T | data type |
Parameters | |
n | number of elements in vectors x and y |
alpha | scalar used for multiplication |
x | pointer to the memory address of the vector x |
incx | stride between consecutive elements of x |
y | pointer to the memory address of the vector y |
incy | stride between consecutive elements of y |
Returns | a tf::cudaTask handle |
This function multiplies the vector x
by the scalar alpha
and adds it to the vector y
overwriting the latter with the result. Hence, the performed operation is:
y[j] = alpha * x[k] + y[j]
,
where j
and k
are indices of n
elements with step sizes incy
and incx
.
This method calls native cublas<t>axpy
with packed parameters, (handle, args...)
, where handle
is managed by the cublasFlowCapturer and args
... are the given arguments.
template<typename T>
cudaTask tf:: cublasFlowCapturer:: vcopy(int n,
const T* x,
int incx,
T* y,
int incy)
copies a vector to another vector
Template parameters | |
---|---|
T | data type |
Parameters | |
n | number of elements to copy |
x | pointer to the memory address of the vector x |
incx | stride between consecutive elements of x |
y | pointer to the memory address of the vector y |
incy | stride between consecutive elements of y |
Returns | a tf::cudaTask handle |
This function copies n
elements from a vector x
of a step size incx
to another vector y
of step size incy
.
Each destination element of y is overwritten with the corresponding element of x. Hence, the performed operation is:
y[j] = x[k]
,
where j
and k
are indices of n
elements with step sizes incy
and incx
.
This method calls native cublas<t>copy
with packed parameters, (handle, args...)
, where handle
is managed by the cublasFlowCapturer and args
... are the given arguments.
template<typename T>
cudaTask tf:: cublasFlowCapturer:: dot(int n,
const T* x,
int incx,
const T* y,
int incy,
T* result)
computes the dot product of two vectors
Template parameters | |
---|---|
T | data type |
Parameters | |
n | number of elements to perform the dot product |
x | pointer to the memory address of the vector x |
incx | stride between consecutive elements of x |
y | pointer to the memory address of the vector y |
incy | stride between consecutive elements of y |
result | the resulting dot product |
Returns | a tf::cudaTask handle |
This function computes the dot product of vectors x and y:
sum += x[i] * y[i]
This method calls native cublas<t>dot
with packed parameters, (handle, args...)
, where handle
is managed by the cublasFlowCapturer and args
... are the given arguments.
template<typename T>
cudaTask tf:: cublasFlowCapturer:: nrm2(int n,
const T* x,
int incx,
T* result)
computes the Euclidean norm of a vector
Template parameters | |
---|---|
T | data type |
Parameters | |
n | number of elements in vector x |
x | pointer to the memory address of the vector |
incx | stride between consecutive elements of x |
result | the result |
Returns | a tf::cudaTask handle |
This method calls native cublas<t>nrm2
with packed parameters, (handle, args...)
, where handle
is managed by the cublasFlowCapturer and args
... are the given arguments.
template<typename T>
cudaTask tf:: cublasFlowCapturer:: scal(int n,
const T* scalar,
T* x,
int incx)
scales a vector by a scalar
Template parameters | |
---|---|
T | data type |
Parameters | |
n | number of elements in vector x |
scalar | scalar used for multiplication |
x | pointer to the memory address of the vector |
incx | stride between consecutive elements of x |
Returns | a tf::cudaTask handle |
This method calls native cublas<t>scal
with packed parameters, (handle, args...)
, where handle
is managed by the cublasFlowCapturer and args
... are the given arguments.
template<typename T>
cudaTask tf:: cublasFlowCapturer:: swap(int n,
T* x,
int incx,
T* y,
int incy)
swaps elements between two vectors
Template parameters | |
---|---|
T | data type |
Parameters | |
n | number of elements to swap |
x | pointer to the memory address of the vector x |
incx | stride between consecutive elements of x |
y | pointer to the memory address of the vector y |
incy | stride between consecutive elements of y |
Returns | a tf::cudaTask handle |
This function interchanges the elements of vectors x
and y
. Hence, the performed operation is:
y[j] <-> x[k]
,
where j
is the index of element in y
with a step size incy
and k
is the index of element in x
with a step size incx
.
This method calls native cublas<t>swap
with packed parameters, (handle, args...)
, where handle
is managed by the cublasFlowCapturer and args
... are the given arguments.
template<typename T>
cudaTask tf:: cublasFlowCapturer:: gemv(cublasOperation_t trans,
int m,
int n,
const T* alpha,
const T* A,
int lda,
const T* x,
int incx,
const T* beta,
T* y,
int incy)
performs matrix-vector multiplication
Template parameters | |
---|---|
T | data type |
Parameters | |
trans | transpose operation op(A) |
m | number of rows of matrix A |
n | number of columns of matrix A |
alpha | pointer to the alpha scalar |
A | pointer to the address of A |
lda | leading dimension of 2D array used to store the matrix A |
x | pointer to the address of x of at least (1 + (n - 1) * abs(incx)) elements if no transposition, or (1 + (m - 1) * abs(incx)) elements otherwise. |
incx | stride between consecutive elements of x |
beta | pointer to the beta scalar |
y | pointer to the address of y |
incy | stride between consecutive elements of y |
Returns | a tf::cudaTask handle |
This function performs matrix-vector multiplication:
y = alpha * op(A) * x + beta * y
,
where alpha
and beta
are scalars, A
is a 2D matrix stored in column-major format, and x
, y
are vectors.
The input matrices are in column-major storage.
This method calls native cublas<t>gemv
with packed parameters, (handle, args...)
, where handle
is managed by the cublasFlowCapturer and args
... are the given arguments.
template<typename T>
cudaTask tf:: cublasFlowCapturer:: symv(cublasFillMode_t uplo,
int n,
const T* alpha,
const T* A,
int lda,
const T* x,
int incx,
const T* beta,
T* y,
int incy)
performs symmetric matrix-vector multiplication
Template parameters | |
---|---|
T | data type |
Parameters | |
uplo | indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements |
n | number of rows and columns of matrix A |
alpha | pointer to the alpha scalar |
A | pointer to the address of A |
lda | leading dimension of 2D array used to store the matrix A |
x | pointer to the address of x |
incx | stride between consecutive elements of x |
beta | pointer to the beta scalar |
y | pointer to the address of y |
incy | stride between consecutive elements of y |
Returns | a tf::cudaTask handle |
This function performs symmetric matrix-vector multiplication:
y = alpha * A * x + beta * y
,
where alpha
and beta
are scalars, A
is a 2D symmetric matrix stored in column-major format, and x
, y
are vectors
This method calls native cublas<t>symv
with packed parameters, (handle, args...)
, where handle
is managed by the cublasFlowCapturer and args
... are the given arguments.
template<typename T>
cudaTask tf:: cublasFlowCapturer:: syr(cublasFillMode_t uplo,
int n,
const T* alpha,
const T* x,
int incx,
T* A,
int lda)
performs symmetric rank-1 update
Template parameters | |
---|---|
T | data type |
Parameters | |
uplo | indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements |
n | number of rows and columns of matrix A |
alpha | pointer to the alpha scalar |
x | pointer to the address of x |
incx | stride between consecutive elements of x |
A | pointer to the address of A |
lda | leading dimension of 2D array used to store the matrix A |
Returns | a tf::cudaTask handle |
This function performs symmetric rank-1 update:
A = alpha * x * x^T + A
,
where alpha
is a scalar, A
is a 2D symmetric matrix stored in column-major format, and x
is a vector.
The result is also symmetric and is stored in the uplo
part of A
.
This method calls native cublas<t>syr
with packed parameters, (handle, args...)
, where handle
is managed by the cublasFlowCapturer and args
... are the given arguments.
template<typename T>
cudaTask tf:: cublasFlowCapturer:: syr2(cublasFillMode_t uplo,
int n,
const T* alpha,
const T* x,
int incx,
const T* y,
int incy,
T* A,
int lda)
performs symmetric rank-2 update
Template parameters | |
---|---|
T | data type |
Parameters | |
uplo | indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements |
n | number of rows and columns of matrix A |
alpha | pointer to the alpha scalar |
x | pointer to the address of x |
incx | stride between consecutive elements of x |
y | pointer to the address of y |
incy | stride between consecutive elements of y |
A | pointer to the address of A |
lda | leading dimension of 2D array used to store the matrix A |
Returns | a tf::cudaTask handle |
This function performs symmetric rank-2 update:
A = alpha * (x * y^T + y * x^T) + A
,
where alpha
is a scalar, A
is a 2D symmetric matrix stored in column-major format, and x
and y
are vectors.
The result is also symmetric and is stored in the uplo
part of A
.
This method calls native cublas<t>syr2
with packed parameters, (handle, args...)
, where handle
is managed by the cublasFlowCapturer and args
... are the given arguments.
template<typename T>
cudaTask tf:: cublasFlowCapturer:: trmv(cublasFillMode_t uplo,
cublasOperation_t tran,
cublasDiagType_t diag,
int n,
const T* A,
int lda,
T* x,
int incx)
performs the triangular matrix-vector multiplication
Template parameters | |
---|---|
T | data type |
Parameters | |
uplo | indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements |
tran | transpose operation op(A) |
diag | indicates if the elements on the main diagonal of matrix A are unity (i.e., all 1s) and of no need to be accessed |
n | number of rows and columns of matrix A |
A | pointer to the address of A |
lda | leading dimension of 2D array used to store matrix A |
x | input vector to multiply, overwritten with the result on exit |
incx | stride between consecutive elements of x |
This method performs the triangular matrix-vector multiplication:
x = op(A) * x
,
where A
is a triangular matrix stored in lower or upper mode with or without the main diagonal, and x
is a vector.
template<typename T>
cudaTask tf:: cublasFlowCapturer:: trsv(cublasFillMode_t uplo,
cublasOperation_t tran,
cublasDiagType_t diag,
int n,
const T* A,
int lda,
T* x,
int incx)
solves the triangular linear system with a single right-hand-side
Template parameters | |
---|---|
T | data type |
Parameters | |
uplo | indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements |
tran | transpose operation op(A) |
diag | indicates if the elements on the main diagonal of matrix A are unity (i.e., all 1s) and of no need to be accessed |
n | number of rows and columns of matrix A |
A | pointer to the address of A |
lda | leading dimension of 2D array used to store matrix A |
x | input of vector b and output of the solution on exit |
incx | stride between consecutive elements of x |
This method solves the triangular linear system with a single right-hand-side
op(A) x = b
,
where A
is a triangular matrix stored in lower or upper mode with or without the main diagonal, and x
and b
are vectors.
template<typename T>
cudaTask tf:: cublasFlowCapturer:: geam(cublasOperation_t ta,
cublasOperation_t tb,
int m,
int n,
const T* alpha,
const T* A,
int lda,
const T* beta,
const T* B,
int ldb,
T* C,
int ldc)
performs matrix-matrix addition and transposition
Template parameters | |
---|---|
T | data type |
Parameters | |
ta | transpose operation op(A) |
tb | transpose operation op(B) |
m | number of rows of matrix C and op(A) |
n | number of columns of matrix C and op(B) |
alpha | pointer to the alpha scalar |
A | pointer to the address of A |
lda | leading dimension of 2D array used to store the matrix A |
beta | pointer to the beta scalar |
B | pointer to the address of B |
ldb | leading dimension of 2D array used to store the matrix B |
C | pointer to the address of C |
ldc | leading dimension of 2D array used to store the matrix C |
Returns | a tf::cudaTask handle |
This method performs the matrix-matrix addition/transposition:
C = alpha * op(A) + beta * op(B)
,
where alpha
and beta
are scalars, and A
, B
and C
are matrices stored in column-major format with dimensions op(A)
as m
by n
, op(B)
as m
by n
and C
as m
by n
, respectively.
The operation is out-of-place if C
does not overlap A
or B
.
The in-place mode supports the following two operations:
C = alpha * C + beta * op(B)
C = alpha * op(A) + beta * C
For in-place mode, if C
equals A
, ldc
equals lda
and ta
equals CUBLAS_OP_N
. If C
equals B
, ldc
equals ldb
and tb
equals CUBLAS_OP_N.
The operation includes the following special cases:
- the user can reset matrix
C
to zero by setting alpha
and beta
to 0
- the user can transpose matrix
A
by setting alpha
to 1 and beta
to 0
The input matrices are in column-major storage.
This method calls native cublas<t>geam
with packed parameters, (handle, args...)
, where handle
is managed by the cublasFlowCapturer and args
... are the given arguments.
template<typename T>
cudaTask tf:: cublasFlowCapturer:: gemm(cublasOperation_t ta,
cublasOperation_t tb,
int m,
int n,
int k,
const T* alpha,
const T* A,
int lda,
const T* B,
int ldb,
const T* beta,
T* C,
int ldc)
performs matrix-matrix multiplication
Template parameters | |
---|---|
T | data type |
Parameters | |
ta | transpose operation op(A) |
tb | transpose operation op(B) |
m | number of rows of matrix C and op(A) |
n | number of columns of matrix C and op(B) |
k | number of columns of op(A) and rows of op(B) |
alpha | pointer to the alpha scalar |
A | pointer to the address of A |
lda | leading dimension of 2D array used to store the matrix A |
B | pointer to the address of B |
ldb | leading dimension of 2D array used to store the matrix B |
beta | pointer to the beta scalar |
C | pointer to the address of C |
ldc | leading dimension of 2D array used to store the matrix C |
Returns | a tf::cudaTask handle |
This function performs matrix-matrix multiplication:
C = alpha * op (A) * op (B) + beta * C
,
where alpha
and beta
are scalars, and A
, B
, and C
are 2D matrices stored in column-major format with dimension op(A)
as m
by k
, dimension op(B)
as k
by n
, and C
as m
by n
.
The input matrices are in column-major storage.
This method calls native cublas<t>gemm
with packed parameters, (handle, args...)
, where handle
is managed by the cublasFlowCapturer and args
... are the given arguments.
template<typename T>
cudaTask tf:: cublasFlowCapturer:: gemm_batched(cublasOperation_t ta,
cublasOperation_t tb,
int m,
int n,
int k,
const T* alpha,
const T* A[],
int lda,
const T* B[],
int ldb,
const T* beta,
T* C[],
int ldc,
int bc)
performs matrix-matrix multiplication over a batch of matrices
Template parameters | |
---|---|
T | data type |
Parameters | |
ta | transpose operation op(A[i]) |
tb | transpose operation op(B[i]) |
m | number of rows of matrix C [i] and op(A[i]) |
n | number of columns of matrix C [i] and op(B[i]) |
k | number of columns of op(A[i]) and rows of op(B[i]) |
alpha | pointer to the alpha scalar |
A | array pointer to A batch |
lda | leading dimension of 2D array used to store the matrix A [i] |
B | array pointer to B batch |
ldb | leading dimension of 2D array used to store the matrix B [i] |
beta | pointer to the beta scalar |
C | array pointer to C batch |
ldc | leading dimension of 2D array used to store the matrix C [i] |
bc | batch size (number of matrices) |
Returns | a tf::cudaTask handle |
The batch must be uniform. All instances in the batch must have the same dimensions (m, n, k)
, leading dimensions (lda, ldb, ldc)
and transpositions (ta, tb)
for their respective A
, B
and C
matrices. The address of the input matrices and the output matrix of each instance of the batch are read from arrays of pointers passed to the function by the caller.
C[i]= alpha * op (A[i]) * op (B[i]) + beta * C[i], i in [0, bc)
,
where alpha
and beta
are scalars, and A
[i], B
[i], and C
[i] are 2D matrices stored in column-major format with dimension op(A)
as m
by k
, dimension op(B)
as k
by n
, and C
as m
by n
.
The input matrices are in column-major storage.
This method calls native cublas<t>gemmBatched
with packed parameters, (handle, args...)
, where handle
is managed by the cublasFlowCapturer and args
... are the given arguments.
template<typename T>
cudaTask tf:: cublasFlowCapturer:: gemm_sbatched(cublasOperation_t ta,
cublasOperation_t tb,
int m,
int n,
int k,
const T* alpha,
const T* A,
int lda,
long long int sA,
const T* B,
int ldb,
long long int sB,
const T* beta,
T* C,
int ldc,
long long int sC,
int bc)
performs matrix-matrix multiplication over a batch of matrices with strided memory access
Template parameters | |
---|---|
T | data type |
Parameters | |
ta | transpose operation op(A[i]) |
tb | transpose operation op(B[i]) |
m | number of rows of matrix C [i] and op(A[i]) |
n | number of columns of matrix C [i] and op(B[i]) |
k | number of columns of op(A[i]) and rows of op(B[i]) |
alpha | pointer to the alpha scalar |
A | pointer to A batch |
lda | leading dimension of 2D array used to store the matrix A [i] |
sA | address offset between A [i] and A [i+1] |
B | pointer to B batch |
ldb | leading dimension of 2D array used to store the matrix B [i] |
sB | address offset between B [i] and B [i+1] |
beta | pointer to the beta scalar |
C | pointer to C batch |
ldc | leading dimension of 2D array used to store the matrix C [i] |
sC | address offset between C [i] and C [i+1] |
bc | batch size (number of matrices) |
Returns | a tf::cudaTask handle |
Here, we use A
[i], B
[i], C
[i] as notation for A, B and C matrices in the i-th
instance of the batch, implicitly assuming they are respectively address offsets sA
, sB
, sC
away from A
[i-1], B
[i-1], C
[i-1].
The input matrices are in column-major storage.
This method calls native cublas<t>gemmStridedBatched
with packed parameters, (handle, args...)
, where handle
is managed by the cublasFlowCapturer and args
... are the given arguments.
The batch must be uniform. All instances in the batch must have the same dimensions (m, n, k)
, leading dimensions (lda, ldb, ldc)
and transpositions (ta, tb)
for their respective A
, B
and C
matrices. Input matrices A
, B
and output matrix C
for each instance of the batch are located at fixed address offsets from their locations in the previous instance. Pointers to A
, B
and C
matrices for the first instance are passed to the function by the user along with the address offsets - sA
, sB
and sC
that determine the locations of input and output matrices in future instances.
C + i*sC = alpha * op (A + i*sA) * op (B + i*sB) + beta * (C + i*sC), i in [0, bc)
,
where alpha
and beta
are scalars, and A
[i], B
[i], and C
[i] are 2D matrices stored in column-major format with dimension op(A)
as m
by k
, dimension op(B)
as k
by n
, and C
as m
by n
.
On certain problem sizes, it might be advantageous to create multiple gemm tasks to take advantage of concurrent kernels, rather than this method.
template<typename T>
cudaTask tf:: cublasFlowCapturer:: symm(cublasSideMode_t side,
cublasFillMode_t uplo,
int m,
int n,
const T* alpha,
const T* A,
int lda,
const T* B,
int ldb,
const T* beta,
T* C,
int ldc)
performs the symmetric matrix-matrix multiplication
Template parameters | |
---|---|
T | data type |
Parameters | |
side | indicates if matrix A is on the left or right of B . |
uplo | indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. |
m | number of rows of matrix C and B , with matrix A sized accordingly |
n | number of columns of matrix C and B , with matrix A sized accordingly |
alpha | scalar used for multiplication |
A | pointer to the address of matrix A |
lda | leading dimension of the 2D array used to store A |
B | pointer to the address of matrix B |
ldb | leading dimension of the 2D array used to store B |
beta | scalar used for multiplication |
C | pointer to the address of matrix C |
ldc | leading dimension of the 2D array used to store C |
The method performs symmetric matrix-matrix multiplication:
C = alpha * A * B + beta * C, if side == CUBLAS_SIDE_LEFT
, or
C = alpha * B * A + beta * C, if side == CUBLAS_SIDE_RIGHT
.
A
is a symmetric matrix stored in lower or upper mode, B
and C
are m
by n
matrices, and alpha
and beta
are scalars.
This method calls native cublas<t>symm
with packed parameters, (handle, args...)
, where handle
is managed by the cublasFlowCapturer and args
... are the given arguments.
template<typename T>
cudaTask tf:: cublasFlowCapturer:: syrk(cublasFillMode_t uplo,
cublasOperation_t tran,
int n,
int k,
const T* alpha,
const T* A,
int lda,
const T* beta,
T* C,
int ldc)
performs the symmetric rank-k update
Template parameters | |
---|---|
T | data type |
Parameters | |
uplo | indicates if matrix C lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. |
tran | transposition operation to apply to A |
n | number of rows of matrix C and op(A) |
k | number of columns of matrix op(A) |
alpha | scalar used for multiplication |
A | pointer to the address of A |
lda | leading dimension of the 2D array used to store A |
beta | scalar used for multiplication |
C | pointer to the address of C |
ldc | leading dimension of the 2D array used to store C |
This method performs the symmetric rank-k update:
C = alpha * op(A) * op(A)^T + beta * C
,
where alpha
and beta
are scalars, C
is a symmetric matrix stored in lower or upper mode, and A
is a matrix with dimension op(A)
n
by k
.
The result is stored to uplo
part of C
.
This method calls native cublas<t>syrk
with packed parameters, (handle, args...)
, where handle
is managed by the cublasFlowCapturer and args
... are the given arguments.
template<typename T>
cudaTask tf:: cublasFlowCapturer:: syr2k(cublasFillMode_t uplo,
cublasOperation_t tran,
int n,
int k,
const T* alpha,
const T* A,
int lda,
const T* B,
int ldb,
const T* beta,
T* C,
int ldc)
performs the symmetric rank-2k update
Template parameters | |
---|---|
T | data type |
Parameters | |
uplo | indicates if matrix C lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. |
tran | transposition operation to apply to A and B |
n | number of rows of matrix C , op(A) , and op(B) |
k | number of columns of matrices op(A) and op(B) |
alpha | scalar used for multiplication |
A | pointer to the address of A |
lda | leading dimension of the 2D array used to store A |
B | pointer to the address of B |
ldb | leading dimension of the 2D array used to store B |
beta | scalar used for multiplication |
C | pointer to the address of C |
ldc | leading dimension of the 2D array used to store C |
This method performs the symmetric rank-2k update:
C = alpha * (op(A) * op(B)^T + op(B) * op(A)^T) + beta * C
,
where alpha
and beta
are scalars, C
is a symmetric matrix stored in lower or upper mode, and A
and B
are two matrices with dimensions op(A)
and op(B) n
by k
.
The result is stored to uplo
part of C
.
This method calls native cublas<t>syr2k
with packed parameters, (handle, args...)
, where handle
is managed by the cublasFlowCapturer and args
... are the given arguments.
template<typename T>
cudaTask tf:: cublasFlowCapturer:: syrkx(cublasFillMode_t uplo,
cublasOperation_t tran,
int n,
int k,
const T* alpha,
const T* A,
int lda,
const T* B,
int ldb,
const T* beta,
T* C,
int ldc)
performs a variation of the symmetric rank-k update
Template parameters | |
---|---|
T | data type |
Parameters | |
uplo | indicates if matrix C lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. |
tran | transposition operation to apply to A and B |
n | number of rows of matrix C , op(A) , and op(B) |
k | number of columns of matrices op(A) and op(B) |
alpha | scalar used for multiplication |
A | pointer to the address of A |
lda | leading dimension of the 2D array used to store A |
B | pointer to the address of B |
ldb | leading dimension of the 2D array used to store B |
beta | scalar used for multiplication |
C | pointer to the address of C |
ldc | leading dimension of the 2D array used to store C |
This method performs a variation of the symmetric rank-k update:
C = alpha * op(A) * op(B)^T + beta * C
,
where alpha
and beta
are scalars, C
is a symmetric matrix stored in lower or upper mode, and A
and B
are two matrices with dimensions op(A)
and op(B) n
by k
.
The result is stored to uplo
part of C
.
This method calls native cublas<t>syrkx
with packed parameters, (handle, args...)
, where handle
is managed by the cublasFlowCapturer and args
... are the given arguments.
template<typename T>
cudaTask tf:: cublasFlowCapturer:: trmm(cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t tran,
cublasDiagType_t diag,
int m,
int n,
const T* alpha,
const T* A,
int lda,
const T* B,
int ldb,
T* C,
int ldc)
performs triangular matrix-matrix multiplication
Template parameters | |
---|---|
T | data type |
Parameters | |
side | indicates if matrix A is on the left or right of B |
uplo | indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements |
tran | transposition operation to apply to A |
diag | indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. |
m | number of rows of matrix B , with matrix A sized accordingly |
n | number of columns of matrix B , with matrix A sized accordingly |
alpha | scalar used for multiplication |
A | pointer to the address of matrix A |
lda | leading dimension of the 2D array used to store A |
B | pointer to the address of matrix B |
ldb | leading dimension of the 2D array used to store B |
C | pointer to the address of matrix C |
ldc | leading dimension of the 2D array used to store C |
This method performs triangular matrix-matrix multiplication:
C = alpha * op(A) * B
, if side == CUBLAS_SIDE_LEFT
, or
C = alpha * B * op(A)
, if side == CUBLAS_SIDE_RIGHT
,
where A
is a triangular matrix stored in lower or upper mode with or without the main diagonal, B
and C
are m
by n
matrices, and alpha
is a scalar.
This method calls native cublas<t>trmm
with packed parameters, (handle, args...)
, where handle
is managed by the cublasFlowCapturer and args
... are the given arguments.
Notice that in this method, B
and C
can point to the same address in which case the in-place implementation is performed (with results written back to B
).
template<typename T>
cudaTask tf:: cublasFlowCapturer:: trsm(cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t tran,
cublasDiagType_t diag,
int m,
int n,
const T* alpha,
const T* A,
int lda,
T* B,
int ldb)
solves the triangular linear system with multiple right-hand-sides
Template parameters | |
---|---|
T | data type |
Parameters | |
side | indicates if A is on the left or right side of X |
uplo | indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements |
tran | transposition operation to apply to A |
diag | indicates if the elements on the main diagonal of matrix A are unity and should not be accessed |
m | number of rows in matrix B , with matrix A sized accordingly |
n | number of columns in matrix B , with matrix A sized accordingly |
alpha | scalar to apply to B |
A | pointer to the address of matrix A |
lda | leading dimension of the 2D array used to store A |
B | pointer to the address of matrix B |
ldb | leading dimension of the 2D array used to store B |
This method solves the triangular linear system with multiple right-hand-sides:
op(A) * X = alpha * B
, if side == CUBLAS_SIDE_LEFT
, or
X * op(A) = alpha * B
, if side == CUBLAS_SIDE_RIGHT
,
where A
is a triangular matrix stored in lower or upper mode with or without the main diagonal, X
and B
are m
by n
matrices, and alpha
is a scalar.
The solution X
overwrites the right-hand-sides B
on exit.
This method calls native cublas<t>trsm
with packed parameters, (handle, args...)
, where handle
is managed by the cublasFlowCapturer and args
... are the given arguments.