tf namespace

taskflow namespace

Reference
- Classes
- Enums
- Typedefs
- Functions
- Variables

Classes

class ChromeObserver: observer interface based on Chrome tracing format
class CriticalSection: class to create a critical region of limited workers to run tasks
class cublasFlowCapturer: class to construct a cuBLAS task graph
class cublasScopedPerThreadHandle: class to provide RAII-styled guard of cublas handle acquisition
class cudaBLAF: basic linear algebra flow on top of cudaFlow
class cudaFlow: class for building a CUDA task dependency graph
class cudaFlowCapturer: class for building a CUDA task dependency graph through stream capture
class cudaFlowCapturerBase: base class to construct a CUDA task graph through stream capture
template<typename H, typename C, typename D> class cudaPerThreadDeviceObjectPool: per-thread object pool to manage CUDA device object
class cudaRoundRobinCapturing: class to capture the described graph into a native cudaGraph using a greedy round-robin algorithm on a fixed number of streams
class cudaScopedDevice: RAII-styled device context switch.
class cudaScopedPerThreadEvent: class that provides RAII-styled guard of event acquisition
class cudaScopedPerThreadStream: class that provides RAII-styled guard of stream acquisition
class cudaSequentialCapturing: class to capture the described graph into a native cudaGraph using a single stream
class cudaTask: handle to a node of the internal CUDA graph
class Executor: execution interface for running a taskflow graph
class FlowBuilder: building methods of a task dependency graph
template<typename T> class Future: class to access the result of task execution
class ObserverInterface: The interface class for creating an executor observer.
class Semaphore: class to create a semaphore object for building a concurrency constraint
template<typename T> class Singleton: class template to create a thread-safe singleton object
class Subflow: class to construct a subflow graph from the execution of a dynamic task
class Task: handle to a node in a task dependency graph
class Taskflow: main entry to create a task dependency graph
template<typename T> class TaskQueue: Lock-free unbounded single-producer multiple-consumer queue.
class TaskView: class to access task information from the observer interface
template<typename T> class Tensor: a tensor contains arithmetic data in N dimensions
template<typename T> class TensorExpr: handle to a tensor expression created by a tensorframe
class TFProfObserver: observer interface based on the built-in taskflow profiler format
class WorkerView: class to create an immutable view of a worker in an executor

Enums

enum class ObserverType: int { TFPROF = 0, CHROME, UNDEFINED }: enumeration of all observer types
enum class TaskType: int { PLACEHOLDER = 0, CUDAFLOW, STATIC, DYNAMIC, CONDITION, MODULE, ASYNC, UNDEFINED }: enumeration of all task types
enum class cudaTaskType: int { EMPTY = 0, HOST, MEMSET, MEMCPY, KERNEL, SUBFLOW, CAPTURE, UNDEFINED }: enumeration of all cudaTask types

Typedefs

using observer_stamp_t = std::chrono::time_point<std::chrono::steady_clock>: default time point type of observers
using cublasPerThreadHandlePool = cudaPerThreadDeviceObjectPool<cublasHandle_t, cublasHandleCreator, cublasHandleDeleter>: alias of per-thread cublas handle pool type
using cudaPerThreadStreamPool = cudaPerThreadDeviceObjectPool<cudaStream_t, cudaStreamCreator, cudaStreamDeleter>: alias of per-thread stream pool type
using cudaPerThreadEventPool = cudaPerThreadDeviceObjectPool<cudaEvent_t, cudaEventCreator, cudaEventDeleter>: alias of per-thread event pool type

Functions

auto to_string(ObserverType type) -> const char*: convert an observer type to a human-readable string
auto to_string(TaskType type) -> const char*: convert a task type to a human-readable string
auto operator<<(std::ostream& os, const Task& task) -> std::ostream&: overload of ostream inserter operator for Task
auto cublas_per_thread_handle_pool() -> cublasPerThreadHandlePool&: acquires the per-thread cublas handle pool
auto cuda_default_max_threads_per_block() -> size_t constexpr: queries the maximum threads allowed per block
auto cuda_default_threads_per_block(size_t N) -> size_t constexpr: queries the default number of threads per block in a 1D vector of N elements
auto cuda_get_num_devices() -> size_t: queries the number of available devices
auto cuda_get_device() -> int: gets the current device associated with the caller thread
void cuda_set_device(int id): switches to a given device context
void cuda_get_device_property(int i, cudaDeviceProp& p): obtains the device property
auto cuda_get_device_property(int i) -> cudaDeviceProp: obtains the device property
void cuda_dump_device_property(std::ostream& os, const cudaDeviceProp& p): dumps the device property
auto cuda_get_device_max_threads_per_block(int d) -> size_t: queries the maximum threads per block on a device
auto cuda_get_device_max_x_dim_per_block(int d) -> size_t: queries the maximum x-dimension per block on a device
auto cuda_get_device_max_y_dim_per_block(int d) -> size_t: queries the maximum y-dimension per block on a device
auto cuda_get_device_max_z_dim_per_block(int d) -> size_t: queries the maximum z-dimension per block on a device
auto cuda_get_device_max_x_dim_per_grid(int d) -> size_t: queries the maximum x-dimension per grid on a device
auto cuda_get_device_max_y_dim_per_grid(int d) -> size_t: queries the maximum y-dimension per grid on a device
auto cuda_get_device_max_z_dim_per_grid(int d) -> size_t: queries the maximum z-dimension per grid on a device
auto cuda_get_device_max_shm_per_block(int d) -> size_t: queries the maximum shared memory size in bytes per block on a device
auto cuda_get_device_warp_size(int d) -> size_t: queries the warp size on a device
auto cuda_get_device_compute_capability_major(int d) -> int: queries the major number of compute capability of a device
auto cuda_get_device_compute_capability_minor(int d) -> int: queries the minor number of compute capability of a device
auto cuda_get_device_unified_addressing(int d) -> bool: queries if the device supports unified addressing
auto cuda_get_driver_version() -> int: queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver
auto cuda_get_runtime_version() -> int: queries the CUDA Runtime version (1000 * major + 10 * minor)
template<typename T, std::enable_if_t<!std::is_same_v<T, void>, void>* = nullptr> auto cuda_get_copy_parms(T* tgt, const T* src, size_t num) -> cudaMemcpy3DParms: gets the memcpy node parameter of a copy task
auto cuda_get_memcpy_parms(void* tgt, const void* src, size_t bytes) -> cudaMemcpy3DParms: gets the memcpy node parameter of a memcpy task (untyped)
auto cuda_get_memset_parms(void* dst, int ch, size_t count) -> cudaMemsetParams: gets the memset node parameter of a memset task (untyped)
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr> auto cuda_get_fill_parms(T* dst, T value, size_t count) -> cudaMemsetParams: gets the memset node parameter of a fill task (typed)
template<typename T, std::enable_if_t<is_pod_v<T> && (sizeof(T)==1||sizeof(T)==2||sizeof(T)==4), void>* = nullptr> auto cuda_get_zero_parms(T* dst, size_t count) -> cudaMemsetParams: gets the memset node parameter of a zero task (typed)
auto cuda_get_graph_num_root_nodes(cudaGraph_t graph) -> size_t: queries the number of root nodes in a native CUDA graph
auto cuda_get_graph_num_nodes(cudaGraph_t graph) -> size_t: queries the number of nodes in a native CUDA graph
auto cuda_get_graph_num_edges(cudaGraph_t graph) -> size_t: queries the number of edges in a native CUDA graph
auto cuda_get_graph_nodes(cudaGraph_t graph) -> std::vector<cudaGraphNode_t>: acquires the nodes in a native CUDA graph
auto cuda_get_graph_root_nodes(cudaGraph_t graph) -> std::vector<cudaGraphNode_t>: acquires the root nodes in a native CUDA graph
auto cuda_get_graph_edges(cudaGraph_t graph) -> std::vector<std::pair<cudaGraphNode_t, cudaGraphNode_t>>: acquires the edges in a native CUDA graph
auto cuda_get_graph_node_type(cudaGraphNode_t node) -> cudaGraphNodeType: queries the type of a native CUDA graph node
auto cuda_graph_node_type_to_string(cudaGraphNodeType type) -> const char*: convert the type of a native CUDA graph node to a readable string
template<typename T> void cuda_dump_graph(T& os, cudaGraph_t graph): dumps a native CUDA graph and all associated child graphs to a DOT format
auto cuda_get_free_mem(int d) -> size_t: queries the free memory (expensive call)
auto cuda_get_total_mem(int d) -> size_t: queries the total available memory (expensive call)
template<typename T> auto cuda_malloc_device(size_t N, int d) -> T*: allocates memory on the given device for holding N elements of type T
template<typename T> auto cuda_malloc_device(size_t N) -> T*: allocates memory on the current device associated with the caller
template<typename T> auto cuda_malloc_shared(size_t N) -> T*: allocates shared memory for holding N elements of type T
template<typename T> void cuda_free(T* ptr, int d): frees memory on the GPU device
template<typename T> void cuda_free(T* ptr): frees memory on the GPU device
void cuda_memcpy_async(cudaStream_t stream, void* dst, const void* src, size_t count): copies data between host and device asynchronously through a stream
void cuda_memset_async(cudaStream_t stream, void* devPtr, int value, size_t count): initializes or sets GPU memory to the given value byte by byte
auto cuda_per_thread_stream_pool() -> cudaPerThreadStreamPool&: acquires the per-thread cuda stream pool
auto cuda_per_thread_event_pool() -> cudaPerThreadEventPool&: acquires the per-thread cuda event pool
auto to_string(cudaTaskType type) -> const char* constexpr: convert a cuda_task type to a human-readable string
auto operator<<(std::ostream& os, const cudaTask& ct) -> std::ostream&: overload of ostream inserter operator for cudaTask
auto version() -> const char* constexpr: queries the version information in a string format major.minor.patch
template<typename T> auto log2(T n) -> int constexpr: returns floor(log2(n)), assumes n > 0
template<typename RandItr, typename C> auto median_of_three(RandItr l, RandItr m, RandItr r, C cmp) -> RandItr: finds the median of three numbers of dereferenced iterators using the given comparator
template<typename RandItr, typename C> auto pseudo_median_of_nine(RandItr beg, RandItr end, C cmp) -> RandItr: finds the pseudo median of a range of items using nine evenly spread samples
template<typename Iter, typename Compare> void sort2(Iter a, Iter b, Compare comp): sorts two elements of dereferenced iterators using the given comparison function
template<typename Iter, typename Compare> void sort3(Iter a, Iter b, Iter c, Compare comp): sorts three elements of dereferenced iterators using the given comparison function
template<typename T, std::enable_if_t<std::is_integral_v<T>, void>* = nullptr> auto unique_id() -> T: generates a program-wide unique id of the given type (thread-safe)

Variables

std::array<TaskType, 7> TASK_TYPES constexpr: array of all task types (used for iterating task types)
template<typename C> bool is_static_task_v constexpr: determines if a callable is a static task
template<typename C> bool is_dynamic_task_v constexpr: determines if a callable is a dynamic task
template<typename C> bool is_condition_task_v constexpr: determines if a callable is a condition task
template<typename C> bool is_cudaflow_task_v constexpr: determines if a callable is a cudaflow task

Enum documentation

enum class tf::ObserverType: int
#include <src/taskflow/core/observer.hpp>

enumeration of all observer types

enum class tf::TaskType: int
#include <src/taskflow/core/task.hpp>

enumeration of all task types

enum class tf::cudaTaskType: int
#include <src/taskflow/cuda/cuda_task.hpp>

enumeration of all cudaTask types

Typedef documentation

using tf::observer_stamp_t = std::chrono::time_point<std::chrono::steady_clock>
#include <src/taskflow/core/observer.hpp>

default time point type of observers

using tf::cublasPerThreadHandlePool = cudaPerThreadDeviceObjectPool<cublasHandle_t, cublasHandleCreator, cublasHandleDeleter>

alias of per-thread cublas handle pool type

using tf::cudaPerThreadStreamPool = cudaPerThreadDeviceObjectPool<cudaStream_t, cudaStreamCreator, cudaStreamDeleter>
#include <src/taskflow/cuda/cuda_stream.hpp>

alias of per-thread stream pool type

using tf::cudaPerThreadEventPool = cudaPerThreadDeviceObjectPool<cudaEvent_t, cudaEventCreator, cudaEventDeleter>
#include <src/taskflow/cuda/cuda_stream.hpp>

alias of per-thread event pool type

Function documentation

const char* tf::to_string(ObserverType type)
#include <src/taskflow/core/observer.hpp>

convert an observer type to a human-readable string

const char* tf::to_string(TaskType type)
#include <src/taskflow/core/task.hpp>

convert a task type to a human-readable string

std::ostream& tf::operator<<(std::ostream& os, const Task& task)
#include <src/taskflow/core/task.hpp>

overload of ostream inserter operator for Task

cublasPerThreadHandlePool& tf::cublas_per_thread_handle_pool() private

acquires the per-thread cublas handle pool

size_t tf::cuda_default_max_threads_per_block() constexpr
#include <src/taskflow/cuda/cuda_capturer.hpp>

queries the maximum threads allowed per block

size_t tf::cuda_default_threads_per_block(size_t N) constexpr
#include <src/taskflow/cuda/cuda_capturer.hpp>

queries the default number of threads per block in a 1D vector of N elements

size_t tf::cuda_get_num_devices()
#include <src/taskflow/cuda/cuda_device.hpp>

queries the number of available devices

int tf::cuda_get_device()
#include <src/taskflow/cuda/cuda_device.hpp>

gets the current device associated with the caller thread

void tf::cuda_set_device(int id)
#include <src/taskflow/cuda/cuda_device.hpp>

switches to a given device context

void tf::cuda_get_device_property(int i, cudaDeviceProp& p)
#include <src/taskflow/cuda/cuda_device.hpp>

obtains the device property

cudaDeviceProp tf::cuda_get_device_property(int i)
#include <src/taskflow/cuda/cuda_device.hpp>

obtains the device property

void tf::cuda_dump_device_property(std::ostream& os, const cudaDeviceProp& p)
#include <src/taskflow/cuda/cuda_device.hpp>

dumps the device property

size_t tf::cuda_get_device_max_threads_per_block(int d)
#include <src/taskflow/cuda/cuda_device.hpp>

queries the maximum threads per block on a device

size_t tf::cuda_get_device_max_x_dim_per_block(int d)
#include <src/taskflow/cuda/cuda_device.hpp>

queries the maximum x-dimension per block on a device

size_t tf::cuda_get_device_max_y_dim_per_block(int d)
#include <src/taskflow/cuda/cuda_device.hpp>

queries the maximum y-dimension per block on a device

size_t tf::cuda_get_device_max_z_dim_per_block(int d)
#include <src/taskflow/cuda/cuda_device.hpp>

queries the maximum z-dimension per block on a device

size_t tf::cuda_get_device_max_x_dim_per_grid(int d)
#include <src/taskflow/cuda/cuda_device.hpp>

queries the maximum x-dimension per grid on a device

size_t tf::cuda_get_device_max_y_dim_per_grid(int d)
#include <src/taskflow/cuda/cuda_device.hpp>

queries the maximum y-dimension per grid on a device

size_t tf::cuda_get_device_max_z_dim_per_grid(int d)
#include <src/taskflow/cuda/cuda_device.hpp>

queries the maximum z-dimension per grid on a device

size_t tf::cuda_get_device_max_shm_per_block(int d)
#include <src/taskflow/cuda/cuda_device.hpp>

queries the maximum shared memory size in bytes per block on a device

size_t tf::cuda_get_device_warp_size(int d)
#include <src/taskflow/cuda/cuda_device.hpp>

queries the warp size on a device

int tf::cuda_get_device_compute_capability_major(int d)
#include <src/taskflow/cuda/cuda_device.hpp>

queries the major number of compute capability of a device

int tf::cuda_get_device_compute_capability_minor(int d)
#include <src/taskflow/cuda/cuda_device.hpp>

queries the minor number of compute capability of a device

bool tf::cuda_get_device_unified_addressing(int d)
#include <src/taskflow/cuda/cuda_device.hpp>

queries if the device supports unified addressing

int tf::cuda_get_driver_version()
#include <src/taskflow/cuda/cuda_device.hpp>

queries the latest CUDA version (1000 * major + 10 * minor) supported by the driver

int tf::cuda_get_runtime_version()
#include <src/taskflow/cuda/cuda_device.hpp>

queries the CUDA Runtime version (1000 * major + 10 * minor)

cudaGraphNodeType tf::cuda_get_graph_node_type(cudaGraphNode_t node)

queries the type of a native CUDA graph node

valid type values are:

cudaGraphNodeTypeKernel = 0x00
cudaGraphNodeTypeMemcpy = 0x01
cudaGraphNodeTypeMemset = 0x02
cudaGraphNodeTypeHost = 0x03
cudaGraphNodeTypeGraph = 0x04
cudaGraphNodeTypeEmpty = 0x05
cudaGraphNodeTypeWaitEvent = 0x06
cudaGraphNodeTypeEventRecord = 0x07

template<typename T>
void tf::cuda_dump_graph(T& os, cudaGraph_t graph)

dumps a native CUDA graph and all associated child graphs to a DOT format

Template parameters
T	output stream target
Parameters
os	target output stream
graph	native CUDA graph

size_t tf::cuda_get_free_mem(int d)
#include <src/taskflow/cuda/cuda_memory.hpp>

queries the free memory (expensive call)

size_t tf::cuda_get_total_mem(int d)
#include <src/taskflow/cuda/cuda_memory.hpp>

queries the total available memory (expensive call)

#include <src/taskflow/cuda/cuda_memory.hpp>

template<typename T>
T* tf::cuda_malloc_device(size_t N, int d)

allocates memory on the given device for holding N elements of type T

The function calls cudaMalloc to allocate N*sizeof(T) bytes of memory on the given device d and returns a pointer to the starting address of the device memory.

#include <src/taskflow/cuda/cuda_memory.hpp>

template<typename T>
T* tf::cuda_malloc_device(size_t N)

allocates memory on the current device associated with the caller

The function calls cuda_malloc_device from the current device associated with the caller.

#include <src/taskflow/cuda/cuda_memory.hpp>

template<typename T>
T* tf::cuda_malloc_shared(size_t N)

allocates shared memory for holding N elements of type T

The function calls cudaMallocManaged to allocate N*sizeof(T) bytes of memory and returns a pointer to the starting address of the shared memory.

#include <src/taskflow/cuda/cuda_memory.hpp>

template<typename T>
void tf::cuda_free(T* ptr, int d)

frees memory on the GPU device

Template parameters
T	pointer type
Parameters
ptr	device pointer to memory to free
d	device context identifier

This method calls cudaFree to free the memory space pointed to by ptr using the given device context.

#include <src/taskflow/cuda/cuda_memory.hpp>

template<typename T>
void tf::cuda_free(T* ptr)

frees memory on the GPU device

Template parameters
T	pointer type
Parameters
ptr	device pointer to memory to free

This method calls cudaFree to free the memory space pointed to by ptr using the current device context of the caller.

void tf::cuda_memcpy_async(cudaStream_t stream, void* dst, const void* src, size_t count)
#include <src/taskflow/cuda/cuda_memory.hpp>

copies data between host and device asynchronously through a stream

Parameters
stream	stream identifier
dst	destination memory address
src	source memory address
count	size in bytes to copy

The method calls cudaMemcpyAsync with the given stream using cudaMemcpyDefault to infer the memory space of the source and the destination pointers. The memory areas may not overlap.

void tf::cuda_memset_async(cudaStream_t stream, void* devPtr, int value, size_t count)
#include <src/taskflow/cuda/cuda_memory.hpp>

initializes or sets GPU memory to the given value byte by byte

Parameters
stream	stream identifier
devPtr	pointer to GPU memory
value	value to set for each byte of the specified memory
count	size in bytes to set

The method calls cudaMemsetAsync with the given stream to fill the first count bytes of the memory area pointed to by devPtr with the constant byte given by value.

cudaPerThreadStreamPool& tf::cuda_per_thread_stream_pool()
#include <src/taskflow/cuda/cuda_stream.hpp>

acquires the per-thread cuda stream pool

cudaPerThreadEventPool& tf::cuda_per_thread_event_pool()
#include <src/taskflow/cuda/cuda_stream.hpp>

acquires the per-thread cuda event pool

const char* tf::to_string(cudaTaskType type) constexpr
#include <src/taskflow/cuda/cuda_task.hpp>

convert a cuda_task type to a human-readable string

std::ostream& tf::operator<<(std::ostream& os, const cudaTask& ct)
#include <src/taskflow/cuda/cuda_task.hpp>

overload of ostream inserter operator for cudaTask

const char* tf::version() constexpr
#include <src/taskflow/taskflow.hpp>

queries the version information in a string format major.minor.patch

Variable documentation

std::array<TaskType, 7> tf::TASK_TYPES constexpr
#include <src/taskflow/core/task.hpp>

array of all task types (used for iterating task types)

#include <src/taskflow/core/task.hpp>

template<typename C>
bool tf::is_static_task_v constexpr

determines if a callable is a static task

A static task is a callable object constructible from std::function<void()>.

#include <src/taskflow/core/task.hpp>

template<typename C>
bool tf::is_dynamic_task_v constexpr

determines if a callable is a dynamic task

A dynamic task is a callable object constructible from std::function<void(Subflow&)>.

#include <src/taskflow/core/task.hpp>

template<typename C>
bool tf::is_condition_task_v constexpr

determines if a callable is a condition task

A condition task is a callable object constructible from std::function<int()>.

#include <src/taskflow/core/task.hpp>

template<typename C>
bool tf::is_cudaflow_task_v constexpr

determines if a callable is a cudaflow task

A cudaFlow task is a callable object constructible from std::function<void(tf::cudaFlow&)> or std::function<void(tf::cudaFlowCapturer&)>.

tf namespace

Contents

Classes

Enums

Typedefs

Functions

Variables

Enum documentation

enum class tf::ObserverType: int #include <src/taskflow/core/observer.hpp>

enum class tf::TaskType: int #include <src/taskflow/core/task.hpp>

enum class tf::cudaTaskType: int #include <src/taskflow/cuda/cuda_task.hpp>

Typedef documentation

using tf::observer_stamp_t = std::chrono::time_point<std::chrono::steady_clock> #include <src/taskflow/core/observer.hpp>

using tf::cublasPerThreadHandlePool = cudaPerThreadDeviceObjectPool<cublasHandle_t, cublasHandleCreator, cublasHandleDeleter>

using tf::cudaPerThreadStreamPool = cudaPerThreadDeviceObjectPool<cudaStream_t, cudaStreamCreator, cudaStreamDeleter> #include <src/taskflow/cuda/cuda_stream.hpp>

using tf::cudaPerThreadEventPool = cudaPerThreadDeviceObjectPool<cudaEvent_t, cudaEventCreator, cudaEventDeleter> #include <src/taskflow/cuda/cuda_stream.hpp>

Function documentation

const char* tf::to_string(ObserverType type) #include <src/taskflow/core/observer.hpp>

const char* tf::to_string(TaskType type) #include <src/taskflow/core/task.hpp>

std::ostream& tf::operator<<(std::ostream& os, const Task& task) #include <src/taskflow/core/task.hpp>

cublasPerThreadHandlePool& tf::cublas_per_thread_handle_pool() private

size_t tf::cuda_default_max_threads_per_block() constexpr #include <src/taskflow/cuda/cuda_capturer.hpp>

size_t tf::cuda_default_threads_per_block(size_t N) constexpr #include <src/taskflow/cuda/cuda_capturer.hpp>

size_t tf::cuda_get_num_devices() #include <src/taskflow/cuda/cuda_device.hpp>

int tf::cuda_get_device() #include <src/taskflow/cuda/cuda_device.hpp>

void tf::cuda_set_device(int id) #include <src/taskflow/cuda/cuda_device.hpp>

void tf::cuda_get_device_property(int i, cudaDeviceProp& p) #include <src/taskflow/cuda/cuda_device.hpp>

cudaDeviceProp tf::cuda_get_device_property(int i) #include <src/taskflow/cuda/cuda_device.hpp>

void tf::cuda_dump_device_property(std::ostream& os, const cudaDeviceProp& p) #include <src/taskflow/cuda/cuda_device.hpp>

size_t tf::cuda_get_device_max_threads_per_block(int d) #include <src/taskflow/cuda/cuda_device.hpp>

size_t tf::cuda_get_device_max_x_dim_per_block(int d) #include <src/taskflow/cuda/cuda_device.hpp>

size_t tf::cuda_get_device_max_y_dim_per_block(int d) #include <src/taskflow/cuda/cuda_device.hpp>

size_t tf::cuda_get_device_max_z_dim_per_block(int d) #include <src/taskflow/cuda/cuda_device.hpp>

size_t tf::cuda_get_device_max_x_dim_per_grid(int d) #include <src/taskflow/cuda/cuda_device.hpp>

size_t tf::cuda_get_device_max_y_dim_per_grid(int d) #include <src/taskflow/cuda/cuda_device.hpp>

size_t tf::cuda_get_device_max_z_dim_per_grid(int d) #include <src/taskflow/cuda/cuda_device.hpp>

size_t tf::cuda_get_device_max_shm_per_block(int d) #include <src/taskflow/cuda/cuda_device.hpp>

size_t tf::cuda_get_device_warp_size(int d) #include <src/taskflow/cuda/cuda_device.hpp>

int tf::cuda_get_device_compute_capability_major(int d) #include <src/taskflow/cuda/cuda_device.hpp>

int tf::cuda_get_device_compute_capability_minor(int d) #include <src/taskflow/cuda/cuda_device.hpp>

bool tf::cuda_get_device_unified_addressing(int d) #include <src/taskflow/cuda/cuda_device.hpp>

int tf::cuda_get_driver_version() #include <src/taskflow/cuda/cuda_device.hpp>

int tf::cuda_get_runtime_version() #include <src/taskflow/cuda/cuda_device.hpp>