tf namespace

taskflow namespace

Classes

class ChromeObserver
observer interface based on Chrome tracing format
class CriticalSection
class to create a critical region of limited workers to run tasks
class cublasFlowCapturer
class to construct a cuBLAS task graph
class cublasScopedPerThreadHandle
class to provide RAII-styled guard of cublas handle acquisition
class cudaBLAF
basic linear algebra flow on top of cudaFlow
class cudaFlow
class for building a CUDA task dependency graph
class cudaFlowCapturer
class for building a CUDA task dependency graph through stream capture
class cudaFlowCapturerBase
base class to construct a CUDA task graph through stream capture
template<typename H, typename C, typename D>
class cudaPerThreadDeviceObjectPool
per-thread object pool to manage CUDA device object
class cudaRoundRobinCapturing
class to capture the described graph into a native cudaGraph using a greedy round-robin algorithm on a fixed number of streams
class cudaScopedDevice
RAII-styled device context switch.
class cudaScopedPerThreadEvent
class that provides RAII-styled guard of event acquisition
class cudaScopedPerThreadStream
class that provides RAII-styled guard of stream acquisition
class cudaSequentialCapturing
class to capture the described graph into a native cudaGraph using a single stream
class cudaTask
handle to a node of the internal CUDA graph
class Executor
execution interface for running a taskflow graph
class FlowBuilder
building methods of a task dependency graph
template<typename T>
class Future
class to access the result of task execution
class ObserverInterface
The interface class for creating an executor observer.
class Semaphore
class to create a semaphore object for building a concurrency constraint
template<typename T>
class Singleton
class template to create a thread-safe singleton object
class Subflow
class to construct a subflow graph from the execution of a dynamic task
class Task
handle to a node in a task dependency graph
class Taskflow
main entry to create a task dependency graph
template<typename T>
class TaskQueue
Lock-free unbounded single-producer multiple-consumer queue.
class TaskView
class to access task information from the observer interface
template<typename T>
class Tensor
a tensor contains arithmetic data in N dimensions
template<typename T>
class TensorExpr
handle to a tensor expression created by a tensorframe
class TFProfObserver
observer interface based on the built-in taskflow profiler format
class WorkerView
class to create an immutable view of a worker in an executor

Enums

enum class ObserverType: int { TFPROF = 0, CHROME, UNDEFINED }
enumeration of all observer types
enum class TaskType: int { PLACEHOLDER = 0, CUDAFLOW, STATIC, DYNAMIC, CONDITION, MODULE, ASYNC, UNDEFINED }
enumeration of all task types
enum class cudaTaskType: int { EMPTY = 0, HOST, MEMSET, MEMCPY, KERNEL, SUBFLOW, CAPTURE, UNDEFINED }
enumeration of all cudaTask types

Typedefs

using observer_stamp_t = std::chrono::time_point<std::chrono::steady_clock>
default time point type of observers
using cublasPerThreadHandlePool = cudaPerThreadDeviceObjectPool<cublasHandle_t, cublasHandleCreator, cublasHandleDeleter>
alias of per-thread cublas handle pool type
using cudaPerThreadStreamPool = cudaPerThreadDeviceObjectPool<cudaStream_t, cudaStreamCreator, cudaStreamDeleter>
alias of per-thread stream pool type
using cudaPerThreadEventPool = cudaPerThreadDeviceObjectPool<cudaEvent_t, cudaEventCreator, cudaEventDeleter>
alias of per-thread event pool type

Functions

auto to_string(ObserverType type) -> const char*
convert an observer type to a human-readable string
auto to_string(TaskType type) -> const char*
convert a task type to a human-readable string
auto operator<<(std::ostream& os, const Task& task) -> std::ostream&
overload of ostream inserter operator for Task
auto cublas_per_thread_handle_pool() -> cublasPerThreadHandlePool&
acquires the per-thread cublas handle pool
auto cuda_default_max_threads_per_block() -> size_t constexpr
queries the maximum threads allowed per block
auto cuda_default_threads_per_block(size_t N) -> size_t constexpr
queries the default number of threads per block in a 1D vector of N elements
auto cuda_get_num_devices() -> size_t
queries the number of available devices
auto cuda_get_device() -> int
gets the current device associated with the caller thread
void cuda_set_device(int id)
switches to a given device context
void cuda_get_device_property(int i, cudaDeviceProp& p)
obtains the device property
auto cuda_get_device_property(int i) -> cudaDeviceProp
obtains the device property
void cuda_dump_device_property(std::ostream& os, const cudaDeviceProp& p)
dumps the device property
auto cuda_get_device_max_threads_per_block(int d) -> size_t
queries the maximum threads per block on a device
auto cuda_get_device_max_x_dim_per_block(int d) -> size_t
queries the maximum x-dimension per block on a device
auto cuda_get_device_max_y_dim_per_block(int d) -> size_t
queries the maximum y-dimension per block on a device
auto cuda_get_device_max_z_dim_per_block(int d) -> size_t
queries the maximum z-dimension per block on a device
auto cuda_get_device_max_x_dim_per_grid(int d) -> size_t
queries the maximum x-dimension per grid on a device
auto cuda_get_device_max_y_dim_per_grid(int d) -> size_t
queries the maximum y-dimension per grid on a device
auto cuda_get_device_max_z_dim_per_grid(int d) -> size_t
queries the maximum z-dimension per grid on a device
auto cuda_get_device_max_shm_per_block(int d) -> size_t
queries the maximum shared memory size in bytes per block on a device
auto cuda_get_device_warp_size(int d) -> size_t
queries the warp size on a device
auto cuda_get_device_compute_capability_major(int d) -> int
queries the major number of compute capability of a device
auto cuda_get_device_compute_capability_minor(int d) -> int
queries the minor number of compute capability of a device
auto cuda_get_device_unified_addressing(int d) -> bool
queries if the device supports unified addressing
auto cuda_get_driver_version() -> int
queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver
auto cuda_get_runtime_version() -> int
queries the CUDA Runtime version (1000 * major + 10 * minor)
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>
auto cuda_get_copy_parms(T* tgt, const T* src, size_t num) -> cudaMemcpy3DParms
gets the memcpy node parameter of a copy task
auto cuda_get_memcpy_parms(void* tgt, const void* src, size_t bytes) -> cudaMemcpy3DParms
gets the memcpy node parameter of a memcpy task (untyped)
auto cuda_get_memset_parms(void* dst, int ch, size_t count) -> cudaMemsetParams
gets the memset node parameter of a memset task (untyped)
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
auto cuda_get_fill_parms(T* dst, T value, size_t count) -> cudaMemsetParams
gets the memset node parameter of a fill task (typed)
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>
auto cuda_get_zero_parms(T* dst, size_t count) -> cudaMemsetParams
gets the memset node parameter of a zero task (typed)
auto cuda_get_graph_num_root_nodes(cudaGraph_t graph) -> size_t
queries the number of root nodes in a native CUDA graph
auto cuda_get_graph_num_nodes(cudaGraph_t graph) -> size_t
queries the number of nodes in a native CUDA graph
auto cuda_get_graph_num_edges(cudaGraph_t graph) -> size_t
queries the number of edges in a native CUDA graph
auto cuda_get_graph_nodes(cudaGraph_t graph) -> std::vector<cudaGraphNode_t>
acquires the nodes in a native CUDA graph
auto cuda_get_graph_root_nodes(cudaGraph_t graph) -> std::vector<cudaGraphNode_t>
acquires the root nodes in a native CUDA graph
auto cuda_get_graph_edges(cudaGraph_t graph) -> std::vector<std::pair<cudaGraphNode_t, cudaGraphNode_t>>
acquires the edges in a native CUDA graph
auto cuda_get_graph_node_type(cudaGraphNode_t node) -> cudaGraphNodeType
queries the type of a native CUDA graph node
auto cuda_graph_node_type_to_string(cudaGraphNodeType type) -> const char*
convert the type of a native CUDA graph node to a readable string
template<typename T>
void cuda_dump_graph(T& os, cudaGraph_t graph)
dumps a native CUDA graph and all associated child graphs to a DOT format
auto cuda_get_free_mem(int d) -> size_t
queries the free memory (expensive call)
auto cuda_get_total_mem(int d) -> size_t
queries the total available memory (expensive call)
template<typename T>
auto cuda_malloc_device(size_t N, int d) -> T*
allocates memory on the given device for holding N elements of type T
template<typename T>
auto cuda_malloc_device(size_t N) -> T*
allocates memory on the current device associated with the caller
template<typename T>
auto cuda_malloc_shared(size_t N) -> T*
allocates shared memory for holding N elements of type T
template<typename T>
void cuda_free(T* ptr, int d)
frees memory on the GPU device
template<typename T>
void cuda_free(T* ptr)
frees memory on the GPU device
void cuda_memcpy_async(cudaStream_t stream, void* dst, const void* src, size_t count)
copies data between host and device asynchronously through a stream
void cuda_memset_async(cudaStream_t stream, void* devPtr, int value, size_t count)
initializes or sets GPU memory to the given value byte by byte
auto cuda_per_thread_stream_pool() -> cudaPerThreadStreamPool&
acquires the per-thread cuda stream pool
auto cuda_per_thread_event_pool() -> cudaPerThreadEventPool&
acquires the per-thread cuda event pool
auto to_string(cudaTaskType type) -> const char* constexpr
convert a cuda_task type to a human-readable string
auto operator<<(std::ostream& os, const cudaTask& ct) -> std::ostream&
overload of ostream inserter operator for cudaTask
auto version() -> const char* constexpr
queries the version information in a string format major.minor.patch
template<typename T>
auto log2(T n) -> int constexpr
returns floor(log2(n)), assumes n > 0
template<typename RandItr, typename C>
auto median_of_three(RandItr l, RandItr m, RandItr r, C cmp) -> RandItr
finds the median of three numbers of dereferenced iterators using the given comparator
template<typename RandItr, typename C>
auto pseudo_median_of_nine(RandItr beg, RandItr end, C cmp) -> RandItr
finds the pseudo median of a range of items using nine numbers spread across the range
template<typename Iter, typename Compare>
void sort2(Iter a, Iter b, Compare comp)
sorts two elements of dereferenced iterators using the given comparison function
template<typename Iter, typename Compare>
void sort3(Iter a, Iter b, Iter c, Compare comp)
sorts three elements of dereferenced iterators using the given comparison function
template<typename T, std::enable_if_t<std::is_integral_v<T>, void>* = nullptr>
auto unique_id() -> T
generates a program-wide unique id of the given type (thread-safe)

Variables

std::array<TaskType, 7> TASK_TYPES constexpr
array of all task types (used for iterating task types)
template<typename C>
bool is_static_task_v constexpr
determines if a callable is a static task
template<typename C>
bool is_dynamic_task_v constexpr
determines if a callable is a dynamic task
template<typename C>
bool is_condition_task_v constexpr
determines if a callable is a condition task
template<typename C>
bool is_cudaflow_task_v constexpr
determines if a callable is a cudaflow task

Enum documentation

enum class tf::ObserverType: int

enumeration of all observer types

enum class tf::TaskType: int

enumeration of all task types

enum class tf::cudaTaskType: int

enumeration of all cudaTask types

Typedef documentation

using tf::observer_stamp_t = std::chrono::time_point<std::chrono::steady_clock>

default time point type of observers

using tf::cublasPerThreadHandlePool = cudaPerThreadDeviceObjectPool<cublasHandle_t, cublasHandleCreator, cublasHandleDeleter>

alias of per-thread cublas handle pool type

using tf::cudaPerThreadStreamPool = cudaPerThreadDeviceObjectPool<cudaStream_t, cudaStreamCreator, cudaStreamDeleter>

alias of per-thread stream pool type

using tf::cudaPerThreadEventPool = cudaPerThreadDeviceObjectPool<cudaEvent_t, cudaEventCreator, cudaEventDeleter>

alias of per-thread event pool type

Function documentation

const char* tf::to_string(ObserverType type)

convert an observer type to a human-readable string

const char* tf::to_string(TaskType type)

convert a task type to a human-readable string

std::ostream& tf::operator<<(std::ostream& os, const Task& task)

overload of ostream inserter operator for Task

cublasPerThreadHandlePool& tf::cublas_per_thread_handle_pool() private

acquires the per-thread cublas handle pool

size_t tf::cuda_default_max_threads_per_block() constexpr

queries the maximum threads allowed per block

size_t tf::cuda_default_threads_per_block(size_t N) constexpr

queries the default number of threads per block in a 1D vector of N elements

size_t tf::cuda_get_num_devices()

queries the number of available devices

int tf::cuda_get_device()

gets the current device associated with the caller thread

void tf::cuda_set_device(int id)

switches to a given device context

void tf::cuda_get_device_property(int i, cudaDeviceProp& p)

obtains the device property

cudaDeviceProp tf::cuda_get_device_property(int i)

obtains the device property

void tf::cuda_dump_device_property(std::ostream& os, const cudaDeviceProp& p)

dumps the device property

size_t tf::cuda_get_device_max_threads_per_block(int d)

queries the maximum threads per block on a device

size_t tf::cuda_get_device_max_x_dim_per_block(int d)

queries the maximum x-dimension per block on a device

size_t tf::cuda_get_device_max_y_dim_per_block(int d)

queries the maximum y-dimension per block on a device

size_t tf::cuda_get_device_max_z_dim_per_block(int d)

queries the maximum z-dimension per block on a device

size_t tf::cuda_get_device_max_x_dim_per_grid(int d)

queries the maximum x-dimension per grid on a device

size_t tf::cuda_get_device_max_y_dim_per_grid(int d)

queries the maximum y-dimension per grid on a device

size_t tf::cuda_get_device_max_z_dim_per_grid(int d)

queries the maximum z-dimension per grid on a device

size_t tf::cuda_get_device_max_shm_per_block(int d)

queries the maximum shared memory size in bytes per block on a device

size_t tf::cuda_get_device_warp_size(int d)

queries the warp size on a device

int tf::cuda_get_device_compute_capability_major(int d)

queries the major number of compute capability of a device

int tf::cuda_get_device_compute_capability_minor(int d)

queries the minor number of compute capability of a device

bool tf::cuda_get_device_unified_addressing(int d)

queries if the device supports unified addressing

int tf::cuda_get_driver_version()

queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver

int tf::cuda_get_runtime_version()

queries the CUDA Runtime version (1000 * major + 10 * minor)

cudaGraphNodeType tf::cuda_get_graph_node_type(cudaGraphNode_t node)

queries the type of a native CUDA graph node

valid type values are:

  • cudaGraphNodeTypeKernel = 0x00
  • cudaGraphNodeTypeMemcpy = 0x01
  • cudaGraphNodeTypeMemset = 0x02
  • cudaGraphNodeTypeHost = 0x03
  • cudaGraphNodeTypeGraph = 0x04
  • cudaGraphNodeTypeEmpty = 0x05
  • cudaGraphNodeTypeWaitEvent = 0x06
  • cudaGraphNodeTypeEventRecord = 0x07

template<typename T>
void tf::cuda_dump_graph(T& os, cudaGraph_t graph)

dumps a native CUDA graph and all associated child graphs to a DOT format

Template parameters
T output stream target
Parameters
os target output stream
graph native CUDA graph

size_t tf::cuda_get_free_mem(int d)

queries the free memory (expensive call)

size_t tf::cuda_get_total_mem(int d)

queries the total available memory (expensive call)

template<typename T>
T* tf::cuda_malloc_device(size_t N, int d)

allocates memory on the given device for holding N elements of type T

The function calls cudaMalloc to allocate N*sizeof(T) bytes of memory on the given device d and returns a pointer to the starting address of the device memory.

template<typename T>
T* tf::cuda_malloc_device(size_t N)

allocates memory on the current device associated with the caller

The function calls cuda_malloc_device from the current device associated with the caller.

template<typename T>
T* tf::cuda_malloc_shared(size_t N)

allocates shared memory for holding N elements of type T

The function calls cudaMallocManaged to allocate N*sizeof(T) bytes of memory and returns a pointer to the starting address of the shared memory.

template<typename T>
void tf::cuda_free(T* ptr, int d)

frees memory on the GPU device

Template parameters
T pointer type
Parameters
ptr device pointer to memory to free
d device context identifier

This method calls cudaFree to free the memory space pointed to by ptr using the given device context.

template<typename T>
void tf::cuda_free(T* ptr)

frees memory on the GPU device

Template parameters
T pointer type
Parameters
ptr device pointer to memory to free

This method calls cudaFree to free the memory space pointed to by ptr using the current device context of the caller.

void tf::cuda_memcpy_async(cudaStream_t stream, void* dst, const void* src, size_t count)

copies data between host and device asynchronously through a stream

Parameters
stream stream identifier
dst destination memory address
src source memory address
count size in bytes to copy

The method calls cudaMemcpyAsync with the given stream using cudaMemcpyDefault to infer the memory space of the source and the destination pointers. The memory areas must not overlap.

void tf::cuda_memset_async(cudaStream_t stream, void* devPtr, int value, size_t count)

initializes or sets GPU memory to the given value byte by byte

Parameters
stream stream identifier
devPtr pointer to GPU memory
value value to set for each byte of the specified memory
count size in bytes to set

The method calls cudaMemsetAsync with the given stream to fill the first count bytes of the memory area pointed to by devPtr with the constant byte value value.

cudaPerThreadStreamPool& tf::cuda_per_thread_stream_pool()

acquires the per-thread cuda stream pool

const char* tf::to_string(cudaTaskType type) constexpr

convert a cuda_task type to a human-readable string

std::ostream& tf::operator<<(std::ostream& os, const cudaTask& ct)

overload of ostream inserter operator for cudaTask

const char* tf::version() constexpr

queries the version information in a string format major.minor.patch

Variable documentation

std::array<TaskType, 7> tf::TASK_TYPES constexpr

array of all task types (used for iterating task types)

template<typename C>
bool tf::is_static_task_v constexpr

determines if a callable is a static task

A static task is a callable object constructible from std::function<void()>.

template<typename C>
bool tf::is_dynamic_task_v constexpr

determines if a callable is a dynamic task

A dynamic task is a callable object constructible from std::function<void(Subflow&)>.

template<typename C>
bool tf::is_condition_task_v constexpr

determines if a callable is a condition task

A condition task is a callable object constructible from std::function<int()>.

template<typename C>
bool tf::is_cudaflow_task_v constexpr

determines if a callable is a cudaflow task

A cudaFlow task is a callable object constructible from std::function<void(tf::cudaFlow&)> or std::function<void(tf::cudaFlowCapturer&)>.