tf namespace
taskflow namespace
Classes
- class ChromeObserver
- observer interface based on Chrome tracing format
- class CriticalSection
- class to create a critical region of limited workers to run tasks
- class cublasFlowCapturer
- class to construct a cuBLAS task graph
- class cublasScopedPerThreadHandle
- class to provide RAII-styled guard of cublas handle acquisition
- class cudaBLAF
- basic linear algebra flow on top of cudaFlow
- class cudaFlow
- class for building a CUDA task dependency graph
- class cudaFlowCapturer
- class for building a CUDA task dependency graph through stream capture
- class cudaFlowCapturerBase
- base class to construct a CUDA task graph through stream capture
-
template<typename H, typename C, typename D>class cudaPerThreadDeviceObjectPool
- per-thread object pool to manage CUDA device object
- class cudaRoundRobinCapturing
- class to capture the described graph into a native cudaGraph using a greedy round-robin algorithm on a fixed number of streams
- class cudaScopedDevice
- RAII-styled device context switch.
- class cudaScopedPerThreadEvent
- class that provides RAII-styled guard of event acquisition
- class cudaScopedPerThreadStream
- class that provides RAII-styled guard of stream acquisition
- class cudaSequentialCapturing
- class to capture the described graph into a native cudaGraph using a single stream
- class cudaTask
- handle to a node of the internal CUDA graph
- class Executor
- execution interface for running a taskflow graph
- class FlowBuilder
- building methods of a task dependency graph
-
template<typename T>class Future
- class to access the result of task execution
- class ObserverInterface
- The interface class for creating an executor observer.
- class Semaphore
- class to create a semaphore object for building a concurrency constraint
-
template<typename T>class Singleton
- class template to create a thread-safe singleton object
- class Subflow
- class to construct a subflow graph from the execution of a dynamic task
- class Task
- handle to a node in a task dependency graph
- class Taskflow
- main entry to create a task dependency graph
-
template<typename T>class TaskQueue
- Lock-free unbounded single-producer multiple-consumer queue.
- class TaskView
- class to access task information from the observer interface
-
template<typename T>class Tensor
- a tensor contains arithmetic data in N dimensions
-
template<typename T>class TensorExpr
- handle to a tensor expression created by a tensorframe
- class TFProfObserver
- observer interface based on the built-in taskflow profiler format
- class WorkerView
- class to create an immutable view of a worker in an executor
Enums
- enum class ObserverType: int { TFPROF = 0, CHROME, UNDEFINED }
- enumeration of all observer types
- enum class TaskType: int { PLACEHOLDER = 0, CUDAFLOW, STATIC, DYNAMIC, CONDITION, MODULE, ASYNC, UNDEFINED }
- enumeration of all task types
- enum class cudaTaskType: int { EMPTY = 0, HOST, MEMSET, MEMCPY, KERNEL, SUBFLOW, CAPTURE, UNDEFINED }
- enumeration of all cudaTask types
Typedefs
- using observer_stamp_t = std::chrono::time_point<std::chrono::steady_clock>
- default time point type of observers
- using cublasPerThreadHandlePool = cudaPerThreadDeviceObjectPool<cublasHandle_t, cublasHandleCreator, cublasHandleDeleter>
- using cudaPerThreadStreamPool = cudaPerThreadDeviceObjectPool<cudaStream_t, cudaStreamCreator, cudaStreamDeleter>
- alias of per-thread stream pool type
- using cudaPerThreadEventPool = cudaPerThreadDeviceObjectPool<cudaEvent_t, cudaEventCreator, cudaEventDeleter>
- alias of per-thread event pool type
Functions
- auto to_string(ObserverType type) -> const char*
- convert an observer type to a human-readable string
- auto to_string(TaskType type) -> const char*
- convert a task type to a human-readable string
- auto operator<<(std::ostream& os, const Task& task) -> std::ostream&
- overload of ostream inserter operator for Task
- auto cublas_per_thread_handle_pool() -> cublasPerThreadHandlePool&
- auto cuda_default_max_threads_per_block() -> size_t constexpr
- queries the maximum threads allowed per block
- auto cuda_default_threads_per_block(size_t N) -> size_t constexpr
- queries the default number of threads per block in a 1D vector of N elements
- auto cuda_get_num_devices() -> size_t
- queries the number of available devices
- auto cuda_get_device() -> int
- gets the current device associated with the caller thread
- void cuda_set_device(int id)
- switches to a given device context
- void cuda_get_device_property(int i, cudaDeviceProp& p)
- obtains the device property
- auto cuda_get_device_property(int i) -> cudaDeviceProp
- obtains the device property
- void cuda_dump_device_property(std::ostream& os, const cudaDeviceProp& p)
- dumps the device property
- auto cuda_get_device_max_threads_per_block(int d) -> size_t
- queries the maximum threads per block on a device
- auto cuda_get_device_max_x_dim_per_block(int d) -> size_t
- queries the maximum x-dimension per block on a device
- auto cuda_get_device_max_y_dim_per_block(int d) -> size_t
- queries the maximum y-dimension per block on a device
- auto cuda_get_device_max_z_dim_per_block(int d) -> size_t
- queries the maximum z-dimension per block on a device
- auto cuda_get_device_max_x_dim_per_grid(int d) -> size_t
- queries the maximum x-dimension per grid on a device
- auto cuda_get_device_max_y_dim_per_grid(int d) -> size_t
- queries the maximum y-dimension per grid on a device
- auto cuda_get_device_max_z_dim_per_grid(int d) -> size_t
- queries the maximum z-dimension per grid on a device
- auto cuda_get_device_max_shm_per_block(int d) -> size_t
- queries the maximum shared memory size in bytes per block on a device
- auto cuda_get_device_warp_size(int d) -> size_t
- queries the warp size on a device
- auto cuda_get_device_compute_capability_major(int d) -> int
- queries the major number of compute capability of a device
- auto cuda_get_device_compute_capability_minor(int d) -> int
- queries the minor number of compute capability of a device
- auto cuda_get_device_unified_addressing(int d) -> bool
- queries if the device supports unified addressing
- auto cuda_get_driver_version() -> int
- queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver
- auto cuda_get_runtime_version() -> int
- queries the CUDA Runtime version (1000 * major + 10 * minor)
-
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr>auto cuda_get_copy_parms(T* tgt, const T* src, size_t num) -> cudaMemcpy3DParms
- gets the memcpy node parameter of a copy task
- auto cuda_get_memcpy_parms(void* tgt, const void* src, size_t bytes) -> cudaMemcpy3DParms
- gets the memcpy node parameter of a memcpy task (untyped)
- auto cuda_get_memset_parms(void* dst, int ch, size_t count) -> cudaMemsetParams
- gets the memset node parameter of a memset task (untyped)
-
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>auto cuda_get_fill_parms(T* dst, T value, size_t count) -> cudaMemsetParams
- gets the memset node parameter of a fill task (typed)
-
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr>auto cuda_get_zero_parms(T* dst, size_t count) -> cudaMemsetParams
- gets the memset node parameter of a zero task (typed)
- auto cuda_get_graph_num_root_nodes(cudaGraph_t graph) -> size_t
- queries the number of root nodes in a native CUDA graph
- auto cuda_get_graph_num_nodes(cudaGraph_t graph) -> size_t
- queries the number of nodes in a native CUDA graph
- auto cuda_get_graph_num_edges(cudaGraph_t graph) -> size_t
- queries the number of edges in a native CUDA graph
- auto cuda_get_graph_nodes(cudaGraph_t graph) -> std::vector<cudaGraphNode_t>
- acquires the nodes in a native CUDA graph
- auto cuda_get_graph_root_nodes(cudaGraph_t graph) -> std::vector<cudaGraphNode_t>
- acquires the root nodes in a native CUDA graph
- auto cuda_get_graph_edges(cudaGraph_t graph) -> std::vector<std::pair<cudaGraphNode_t, cudaGraphNode_t>>
- acquires the edges in a native CUDA graph
- auto cuda_get_graph_node_type(cudaGraphNode_t node) -> cudaGraphNodeType
- queries the type of a native CUDA graph node
- auto cuda_graph_node_type_to_string(cudaGraphNodeType type) -> const char*
- convert the type of a native CUDA graph node to a readable string
-
template<typename T>void cuda_dump_graph(T& os, cudaGraph_t graph)
- dumps a native CUDA graph and all associated child graphs to a DOT format
- auto cuda_get_free_mem(int d) -> size_t
- queries the free memory (expensive call)
- auto cuda_get_total_mem(int d) -> size_t
- queries the total available memory (expensive call)
-
template<typename T>auto cuda_malloc_device(size_t N, int d) -> T*
- allocates memory on the given device for holding N elements of type T
-
template<typename T>auto cuda_malloc_device(size_t N) -> T*
- allocates memory on the current device associated with the caller
-
template<typename T>auto cuda_malloc_shared(size_t N) -> T*
- allocates shared memory for holding N elements of type T
-
template<typename T>void cuda_free(T* ptr, int d)
- frees memory on the GPU device
-
template<typename T>void cuda_free(T* ptr)
- frees memory on the GPU device
- void cuda_memcpy_async(cudaStream_t stream, void* dst, const void* src, size_t count)
- copies data between host and device asynchronously through a stream
- void cuda_memset_async(cudaStream_t stream, void* devPtr, int value, size_t count)
- initializes or sets GPU memory to the given value byte by byte
- auto cuda_per_thread_stream_pool() -> cudaPerThreadStreamPool&
- acquires the per-thread cuda stream pool
- auto cuda_per_thread_event_pool() -> cudaPerThreadEventPool&
- per-thread cuda event pool
- auto to_string(cudaTaskType type) -> const char* constexpr
- convert a cuda_task type to a human-readable string
- auto operator<<(std::ostream& os, const cudaTask& ct) -> std::ostream&
- overload of ostream inserter operator for cudaTask
- auto version() -> const char* constexpr
- queries the version information in a string format major.minor.patch
-
template<typename T>auto log2(T n) -> int constexpr
- returns floor(log2(n)), assumes n > 0
-
template<typename RandItr, typename C>auto median_of_three(RandItr l, RandItr m, RandItr r, C cmp) -> RandItr
- finds the median of three numbers of dereferenced iterators using the given comparator
-
template<typename RandItr, typename C>auto pseudo_median_of_nine(RandItr beg, RandItr end, C cmp) -> RandItr
- finds the pseudo median of a range of items using nine spread-out numbers
-
template<typename Iter, typename Compare>void sort2(Iter a, Iter b, Compare comp)
- sorts two elements of dereferenced iterators using the given comparison function
-
template<typename Iter, typename Compare>void sort3(Iter a, Iter b, Iter c, Compare comp)
- sorts three elements of dereferenced iterators using the given comparison function
-
template<typename T, std::enable_if_t<std::is_integral_v<T>, void>* = nullptr>auto unique_id() -> T
- generates a program-wide unique id of the given type (thread-safe)
Variables
- std::array<TaskType, 7> TASK_TYPES constexpr
- array of all task types (used for iterating task types)
-
template<typename C>bool is_static_task_v constexpr
- determines if a callable is a static task
-
template<typename C>bool is_dynamic_task_v constexpr
- determines if a callable is a dynamic task
-
template<typename C>bool is_condition_task_v constexpr
- determines if a callable is a condition task
-
template<typename C>bool is_cudaflow_task_v constexpr
- determines if a callable is a cudaflow task
Enum documentation
enum class tf:: ObserverType: int
#include <src/taskflow/core/observer.hpp>
enumeration of all observer types
enum class tf:: TaskType: int
#include <src/taskflow/core/task.hpp>
enumeration of all task types
enum class tf:: cudaTaskType: int
#include <src/taskflow/cuda/cuda_task.hpp>
enumeration of all cudaTask types
Typedef documentation
using tf:: observer_stamp_t = std::chrono::time_point<std::chrono::steady_clock>
#include <src/taskflow/core/observer.hpp>
default time point type of observers
using tf:: cublasPerThreadHandlePool = cudaPerThreadDeviceObjectPool<cublasHandle_t, cublasHandleCreator, cublasHandleDeleter>
alias of per-thread cublas handle pool type
using tf:: cudaPerThreadStreamPool = cudaPerThreadDeviceObjectPool<cudaStream_t, cudaStreamCreator, cudaStreamDeleter>
#include <src/taskflow/cuda/cuda_stream.hpp>
alias of per-thread stream pool type
using tf:: cudaPerThreadEventPool = cudaPerThreadDeviceObjectPool<cudaEvent_t, cudaEventCreator, cudaEventDeleter>
#include <src/taskflow/cuda/cuda_stream.hpp>
alias of per-thread event pool type
Function documentation
const char* tf:: to_string(ObserverType type)
#include <src/taskflow/core/observer.hpp>
convert an observer type to a human-readable string
const char* tf:: to_string(TaskType type)
#include <src/taskflow/core/task.hpp>
convert a task type to a human-readable string
std::ostream& tf:: operator<<(std::ostream& os,
const Task& task)
#include <src/taskflow/core/task.hpp>
overload of ostream inserter operator for Task
cublasPerThreadHandlePool& tf:: cublas_per_thread_handle_pool() private
acquires the per-thread cublas handle pool
size_t tf:: cuda_default_max_threads_per_block() constexpr
queries the maximum threads allowed per block
size_t tf:: cuda_default_threads_per_block(size_t N) constexpr
queries the default number of threads per block in a 1D vector of N elements
size_t tf:: cuda_get_num_devices()
#include <src/taskflow/cuda/cuda_device.hpp>
queries the number of available devices
int tf:: cuda_get_device()
#include <src/taskflow/cuda/cuda_device.hpp>
gets the current device associated with the caller thread
void tf:: cuda_set_device(int id)
#include <src/taskflow/cuda/cuda_device.hpp>
switches to a given device context
void tf:: cuda_get_device_property(int i,
cudaDeviceProp& p)
#include <src/taskflow/cuda/cuda_device.hpp>
obtains the device property
cudaDeviceProp tf:: cuda_get_device_property(int i)
#include <src/taskflow/cuda/cuda_device.hpp>
obtains the device property
void tf:: cuda_dump_device_property(std::ostream& os,
const cudaDeviceProp& p)
#include <src/taskflow/cuda/cuda_device.hpp>
dumps the device property
size_t tf:: cuda_get_device_max_threads_per_block(int d)
#include <src/taskflow/cuda/cuda_device.hpp>
queries the maximum threads per block on a device
size_t tf:: cuda_get_device_max_x_dim_per_block(int d)
#include <src/taskflow/cuda/cuda_device.hpp>
queries the maximum x-dimension per block on a device
size_t tf:: cuda_get_device_max_y_dim_per_block(int d)
#include <src/taskflow/cuda/cuda_device.hpp>
queries the maximum y-dimension per block on a device
size_t tf:: cuda_get_device_max_z_dim_per_block(int d)
#include <src/taskflow/cuda/cuda_device.hpp>
queries the maximum z-dimension per block on a device
size_t tf:: cuda_get_device_max_x_dim_per_grid(int d)
#include <src/taskflow/cuda/cuda_device.hpp>
queries the maximum x-dimension per grid on a device
size_t tf:: cuda_get_device_max_y_dim_per_grid(int d)
#include <src/taskflow/cuda/cuda_device.hpp>
queries the maximum y-dimension per grid on a device
size_t tf:: cuda_get_device_max_z_dim_per_grid(int d)
#include <src/taskflow/cuda/cuda_device.hpp>
queries the maximum z-dimension per grid on a device
size_t tf:: cuda_get_device_max_shm_per_block(int d)
#include <src/taskflow/cuda/cuda_device.hpp>
queries the maximum shared memory size in bytes per block on a device
size_t tf:: cuda_get_device_warp_size(int d)
#include <src/taskflow/cuda/cuda_device.hpp>
queries the warp size on a device
int tf:: cuda_get_device_compute_capability_major(int d)
#include <src/taskflow/cuda/cuda_device.hpp>
queries the major number of compute capability of a device
int tf:: cuda_get_device_compute_capability_minor(int d)
#include <src/taskflow/cuda/cuda_device.hpp>
queries the minor number of compute capability of a device
bool tf:: cuda_get_device_unified_addressing(int d)
#include <src/taskflow/cuda/cuda_device.hpp>
queries if the device supports unified addressing
int tf:: cuda_get_driver_version()
#include <src/taskflow/cuda/cuda_device.hpp>
queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver
int tf:: cuda_get_runtime_version()
#include <src/taskflow/cuda/cuda_device.hpp>
queries the CUDA Runtime version (1000 * major + 10 * minor)
cudaGraphNodeType tf:: cuda_get_graph_node_type(cudaGraphNode_t node)
queries the type of a native CUDA graph node
valid type values are:
- cudaGraphNodeTypeKernel = 0x00
- cudaGraphNodeTypeMemcpy = 0x01
- cudaGraphNodeTypeMemset = 0x02
- cudaGraphNodeTypeHost = 0x03
- cudaGraphNodeTypeGraph = 0x04
- cudaGraphNodeTypeEmpty = 0x05
- cudaGraphNodeTypeWaitEvent = 0x06
- cudaGraphNodeTypeEventRecord = 0x07
template<typename T>
void tf:: cuda_dump_graph(T& os,
cudaGraph_t graph)
dumps a native CUDA graph and all associated child graphs to a DOT format
Template parameters | |
---|---|
T | output stream target |
Parameters | |
os | target output stream |
graph | native CUDA graph |
size_t tf:: cuda_get_free_mem(int d)
#include <src/taskflow/cuda/cuda_memory.hpp>
queries the free memory (expensive call)
size_t tf:: cuda_get_total_mem(int d)
#include <src/taskflow/cuda/cuda_memory.hpp>
queries the total available memory (expensive call)
#include <src/taskflow/cuda/cuda_memory.hpp>
template<typename T>
T* tf:: cuda_malloc_device(size_t N,
int d)
allocates memory on the given device for holding N
elements of type T
The function calls cudaMalloc
to allocate N*sizeof(T)
bytes of memory on the given device d
and returns a pointer to the starting address of the device memory.
#include <src/taskflow/cuda/cuda_memory.hpp>
template<typename T>
T* tf:: cuda_malloc_device(size_t N)
allocates memory on the current device associated with the caller
The function calls cuda_malloc_device from the current device associated with the caller.
#include <src/taskflow/cuda/cuda_memory.hpp>
template<typename T>
T* tf:: cuda_malloc_shared(size_t N)
allocates shared memory for holding N
elements of type T
The function calls cudaMallocManaged
to allocate N*sizeof(T)
bytes of memory and returns a pointer to the starting address of the shared memory.
#include <src/taskflow/cuda/cuda_memory.hpp>
template<typename T>
void tf:: cuda_free(T* ptr,
int d)
frees memory on the GPU device
Template parameters | |
---|---|
T | pointer type |
Parameters | |
ptr | device pointer to memory to free |
d | device context identifier |
This method calls cudaFree
to free the memory space pointed to by ptr
using the given device context.
#include <src/taskflow/cuda/cuda_memory.hpp>
template<typename T>
void tf:: cuda_free(T* ptr)
frees memory on the GPU device
Template parameters | |
---|---|
T | pointer type |
Parameters | |
ptr | device pointer to memory to free |
This method calls cudaFree
to free the memory space pointed to by ptr
using the current device context of the caller.
void tf:: cuda_memcpy_async(cudaStream_t stream,
void* dst,
const void* src,
size_t count)
#include <src/taskflow/cuda/cuda_memory.hpp>
copies data between host and device asynchronously through a stream
Parameters | |
---|---|
stream | stream identifier |
dst | destination memory address |
src | source memory address |
count | size in bytes to copy |
The method calls cudaMemcpyAsync
with the given stream
using cudaMemcpyDefault
to infer the memory space of the source and the destination pointers. The memory areas may not overlap.
void tf:: cuda_memset_async(cudaStream_t stream,
void* devPtr,
int value,
size_t count)
#include <src/taskflow/cuda/cuda_memory.hpp>
initializes or sets GPU memory to the given value byte by byte
Parameters | |
---|---|
stream | stream identifier |
devPtr | pointer to GPU memory |
value | value to set for each byte of the specified memory |
count | size in bytes to set |
The method calls cudaMemsetAsync
with the given stream
to fill the first count
bytes of the memory area pointed to by devPtr
with the constant byte value value.
cudaPerThreadStreamPool& tf:: cuda_per_thread_stream_pool()
#include <src/taskflow/cuda/cuda_stream.hpp>
acquires the per-thread cuda stream pool
cudaPerThreadEventPool& tf:: cuda_per_thread_event_pool()
#include <src/taskflow/cuda/cuda_stream.hpp>
per-thread cuda event pool
const char* tf:: to_string(cudaTaskType type) constexpr
#include <src/taskflow/cuda/cuda_task.hpp>
convert a cuda_task type to a human-readable string
std::ostream& tf:: operator<<(std::ostream& os,
const cudaTask& ct)
#include <src/taskflow/cuda/cuda_task.hpp>
overload of ostream inserter operator for cudaTask
const char* tf:: version() constexpr
#include <src/taskflow/taskflow.hpp>
queries the version information in a string format major.minor.patch
Variable documentation
std::array<TaskType, 7> tf:: TASK_TYPES constexpr
#include <src/taskflow/core/task.hpp>
array of all task types (used for iterating task types)
#include <src/taskflow/core/task.hpp>
template<typename C>
bool tf:: is_static_task_v constexpr
determines if a callable is a static task
A static task is a callable object constructible from std::function<void()>.
#include <src/taskflow/core/task.hpp>
template<typename C>
bool tf:: is_dynamic_task_v constexpr
determines if a callable is a dynamic task
A dynamic task is a callable object constructible from std::function<void(Subflow&)>.
#include <src/taskflow/core/task.hpp>
template<typename C>
bool tf:: is_condition_task_v constexpr
determines if a callable is a condition task
A condition task is a callable object constructible from std::function<int()>.
#include <src/taskflow/core/task.hpp>
template<typename C>
bool tf:: is_cudaflow_task_v constexpr
determines if a callable is a cudaflow task
A cudaFlow task is a callable object constructible from std::function<void(tf::cudaFlow&)> or std::function<void(tf::cudaFlowCapturer&)>.