Doxygen 1.9.1
Toolkit for Adaptive Stochastic Modeling and Non-Intrusive ApproximatioN: Tasmanian v8.2 (development)
Classes and functions used for acceleration methods
Collaboration diagram for Classes and functions used for acceleration methods:

Files

file  tsgAcceleratedDataStructures.hpp
 Data structures for interacting with CUDA and MAGMA environments.
 
file  tsgCacheLagrange.hpp
 Cache data structure for evaluations with Global grids.
 

Classes

struct  TasGrid::HandleDeleter< ehandle >
 Deleter template for the GPU handles, e.g., cuBlas and rocBlas. More...
 
class  TasGrid::GpuVector< T >
 Template class that wraps around a single GPU array, providing functionality that mimics std::vector. More...
 
struct  TasGrid::GpuEngine
 Wrapper class around calls to GPU accelerated linear algebra libraries. More...
 
class  TasGrid::AccelerationDomainTransform
 Implements the domain transform algorithms in case the user data is provided on the GPU. More...
 
struct  TasGrid::AccelerationContext
 Wrapper class around GPU device ID, acceleration type and GpuEngine. More...
 
class  TasGrid::CacheLagrange< T >
 Cache that holds the values of 1D Lagrange polynomials. More...
 
class  TasGrid::CacheLagrangeDerivative< T >
 Cache that holds the derivatives of 1D Lagrange polynomials. Uses the same interface as CacheLagrange. More...
 

Functions

template<typename >
void TasGrid::deleteHandle (int *)
 Deletes the handle, specialized for each TPL backend and tag in the TasGrid::AccHandle namespace. More...
 
template<typename T >
void TasGrid::TasGpu::dtrans2can (AccelerationContext const *acc, bool use01, int dims, int num_x, int pad_size, const double *gpu_trans_a, const double *gpu_trans_b, const T *gpu_x_transformed, T *gpu_x_canonical)
 Uses a custom kernel to convert transformed points to canonical points; all arrays live on the CUDA device. More...
 
template<typename T >
void TasGrid::TasGpu::devalpwpoly (AccelerationContext const *acc, int order, TypeOneDRule rule, int num_dimensions, int num_x, int num_basis, const T *gpu_x, const T *gpu_nodes, const T *gpu_support, T *gpu_y)
 Evaluate the basis functions for a local polynomial grid using the DENSE algorithm. More...
 
template<typename T >
void TasGrid::TasGpu::devalpwpoly_sparse (AccelerationContext const *acc, int order, TypeOneDRule rule, int dims, int num_x, const T *gpu_x, const GpuVector< T > &gpu_nodes, const GpuVector< T > &gpu_support, const GpuVector< int > &gpu_hpntr, const GpuVector< int > &gpu_hindx, const GpuVector< int > &gpu_hroots, GpuVector< int > &gpu_spntr, GpuVector< int > &gpu_sindx, GpuVector< T > &gpu_svals)
 Evaluate the basis functions for a local polynomial grid using the SPARSE algorithm. More...
 
template<typename T >
void TasGrid::TasGpu::devalseq (AccelerationContext const *acc, int dims, int num_x, const std::vector< int > &max_levels, const T *gpu_x, const GpuVector< int > &num_nodes, const GpuVector< int > &points, const GpuVector< T > &nodes, const GpuVector< T > &coeffs, T *gpu_result)
 Evaluate the basis for a Sequence grid. More...
 
template<typename T >
void TasGrid::TasGpu::devalfor (AccelerationContext const *acc, int dims, int num_x, const std::vector< int > &max_levels, const T *gpu_x, const GpuVector< int > &num_nodes, const GpuVector< int > &points, T *gpu_wreal, typename GpuVector< T >::value_type *gpu_wimag)
 Evaluate the basis for a Fourier grid. More...
 
template<typename T >
void TasGrid::TasGpu::devalglo (AccelerationContext const *acc, bool is_nested, bool is_clenshawcurtis0, int dims, int num_x, int num_p, int num_basis, T const *gpu_x, GpuVector< T > const &nodes, GpuVector< T > const &coeff, GpuVector< T > const &tensor_weights, GpuVector< int > const &nodes_per_level, GpuVector< int > const &offset_per_level, GpuVector< int > const &map_dimension, GpuVector< int > const &map_level, GpuVector< int > const &active_tensors, GpuVector< int > const &active_num_points, GpuVector< int > const &dim_offsets, GpuVector< int > const &map_tensor, GpuVector< int > const &map_index, GpuVector< int > const &map_reference, T *gpu_result)
 Evaluate the basis for a Global grid. More...
 
void TasGrid::TasGpu::fillDataGPU (AccelerationContext const *acc, double value, long long N, long long stride, double data[])
 Fills the data with the provided real number at the given stride.
 
template<typename T >
void TasGrid::TasGpu::load_n (AccelerationContext const *acc, T const *cpu_data, size_t num_entries, T *gpu_data)
 Similar to copy_n, copies the data from the CPU to the GPU.
 
template<typename T , typename U >
Utils::use_if<!std::is_same< U, T >::value > TasGrid::TasGpu::load_n (AccelerationContext const *acc, U const *cpu_data, size_t num_entries, T *gpu_data)
 Similar to copy_n, copies the data from the CPU to the GPU.
 
TypeAcceleration TasGrid::AccelerationMeta::getIOAccelerationString (const char *name)
 Convert the string (coming from C or Python) into an enumerated type. More...
 
const char * TasGrid::AccelerationMeta::getIOAccelerationString (TypeAcceleration accel)
 Convert the enumerated type to a string, the inverse of getIOAccelerationString() More...
 
int TasGrid::AccelerationMeta::getIOAccelerationInt (TypeAcceleration accel)
 Convert the enumerated type into an integer (used by the Fortran interface). More...
 
TypeAcceleration TasGrid::AccelerationMeta::getIOIntAcceleration (int accel)
 Convert the integer (coming from Fortran) into an enumerated type, the inverse of getIOAccelerationInt() More...
 
bool TasGrid::AccelerationMeta::isAccTypeGPU (TypeAcceleration accel)
 Returns true if accel is cuda, cublas or magma. More...
 
TypeAcceleration TasGrid::AccelerationMeta::getAvailableFallback (TypeAcceleration accel)
 Implements fallback logic, if accel has been enabled through CMake then this returns accel, otherwise it returns the "next-best-thing". More...
 
int TasGrid::AccelerationMeta::getNumGpuDevices ()
 Return the number of visible GPU devices. More...
 
void TasGrid::AccelerationMeta::setDefaultGpuDevice (int deviceID)
 Selects the active device for this CPU thread, not supported for DPC++. More...
 
unsigned long long TasGrid::AccelerationMeta::getTotalGPUMemory (int deviceID)
 Return the total memory of the device (in units of bytes). More...
 
std::string TasGrid::AccelerationMeta::getGpuDeviceName (int deviceID)
 Returns the name of the selected GPU device, empty string if no device is available or the index is out of bounds. More...
 
template<typename T >
void TasGrid::AccelerationMeta::recvGpuArray (AccelerationContext const *, size_t num_entries, const T *gpu_data, std::vector< T > &cpu_data)
 Copy a device array to the main memory, used for testing only, always favor using GpuVector (if possible). More...
 
template<typename T >
void TasGrid::AccelerationMeta::delGpuArray (AccelerationContext const *, T *x)
 Deallocate device array, used primarily for testing, always favor using GpuVector (if possible). More...
 
void * TasGrid::AccelerationMeta::createCublasHandle ()
 Creates a new cuBlas handle, used in unit-testing only.
 
void TasGrid::AccelerationMeta::deleteCublasHandle (void *)
 Destroys the cuBlas handle, used in unit-testing only.
 

Detailed Description

RAII Memory Management
CUDA uses C-style memory management with cudaMalloc(), cudaMemcpy(), and cudaFree(), but a templated C++ std::vector-style class is far handier and more fail-safe. The GpuVector template class guards against memory leaks and offers seamless integration between CPU and GPU data structures. See the GpuVector documentation for details.
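The RAII idea behind GpuVector can be illustrated with a CPU-only sketch. This is not the real GpuVector API; the class name and the load/unload helpers below are illustrative, and malloc/free stand in for cudaMalloc/cudaFree:

```cpp
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <new>
#include <vector>

// Minimal RAII wrapper illustrating the GpuVector pattern: the destructor
// releases the buffer, so no manual free() (or cudaFree()) call is needed.
template<typename T>
class DeviceVectorSketch {
public:
    DeviceVectorSketch() : data_(nullptr), size_(0) {}
    ~DeviceVectorSketch() { clear(); }
    DeviceVectorSketch(const DeviceVectorSketch&) = delete;            // no accidental copies
    DeviceVectorSketch& operator=(const DeviceVectorSketch&) = delete;

    void resize(size_t n) {
        clear();
        data_ = static_cast<T*>(std::malloc(n * sizeof(T)));  // stand-in for cudaMalloc
        if (data_ == nullptr && n > 0) throw std::bad_alloc();
        size_ = n;
    }
    void clear() { std::free(data_); data_ = nullptr; size_ = 0; }  // stand-in for cudaFree

    // load/unload mimic CPU <-> GPU transfers (stand-ins for cudaMemcpy).
    void load(const std::vector<T>& cpu) {
        resize(cpu.size());
        if (!cpu.empty()) std::memcpy(data_, cpu.data(), cpu.size() * sizeof(T));
    }
    std::vector<T> unload() const {
        std::vector<T> cpu(size_);
        if (size_ > 0) std::memcpy(cpu.data(), data_, size_ * sizeof(T));
        return cpu;
    }

    size_t size() const { return size_; }
    T* data() { return data_; }

private:
    T* data_;
    size_t size_;
};
```

The deleted copy operations mirror the fact that duplicating a raw device pointer would lead to a double free; ownership is unique.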
Streams and Handles Encapsulation
CUDA linear algebra libraries (as well as MAGMA) use streams and handles for all their calls. The handles have to be allocated, deleted, and passed around, which causes unnecessary code clutter. Encapsulating the handles in a single GpuEngine class greatly simplifies the workflow. Furthermore, some (sparse) linear operations require multiple calls to the CUDA/MAGMA libraries, and it is easier to combine those into a single call to a GpuEngine method.
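The HandleDeleter/GpuEngine combination can be sketched with std::unique_ptr and a custom deleter. The fake handle type and create/destroy functions below are purely illustrative stand-ins for a TPL handle such as cublasHandle_t; the lazy creation on first use mirrors the encapsulation described above but is an assumption about the design:

```cpp
#include <cassert>
#include <memory>

// Fake TPL handle standing in for, e.g., a cuBlas handle.
struct FakeBlasHandle { int id; };

int handles_alive = 0;  // visible side effect so the RAII behavior can be observed

FakeBlasHandle* fake_create_handle() { ++handles_alive; return new FakeBlasHandle{42}; }
void fake_destroy_handle(FakeBlasHandle* h) { --handles_alive; delete h; }

// Custom deleter, analogous to the TasGrid::HandleDeleter specializations.
struct FakeHandleDeleter {
    void operator()(FakeBlasHandle* h) const { fake_destroy_handle(h); }
};

// Engine that owns the handle and creates it lazily on first use,
// so callers never allocate, delete, or pass handles around.
class EngineSketch {
public:
    FakeBlasHandle* handle() {
        if (!h_) h_.reset(fake_create_handle());
        return h_.get();
    }
private:
    std::unique_ptr<FakeBlasHandle, FakeHandleDeleter> h_;
};
```

When the engine goes out of scope, the unique_ptr invokes the deleter and the handle is destroyed automatically.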
Acceleration Metadata
The AccelerationMeta namespace offers several methods used throughout the library and in the testing:
  • Tasmanian specific acceleration fallback logic
  • Reading CUDA device properties, e.g., number of devices or total memory
  • Error handling for common CUDA/cuBlas/cuSparse calls
C++ Wrappers to Fortran BLAS API
The standard BLAS API follows Fortran calling conventions, e.g., all arguments are passed by reference and function names carry a trailing underscore. A C++ wrapper is provided that handles the Tasmanian-specific cases of dense matrix-matrix and matrix-vector multiplication through a C++-compatible API.
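The wrapper idea can be sketched as follows. The dgemm_ signature is the standard Fortran BLAS convention (everything by pointer, column-major storage), but the body here is a hand-written reference implementation rather than a link against a real BLAS, and it handles only the no-transpose case; the gemm wrapper name is an assumption:

```cpp
#include <cassert>

// Stand-in with the Fortran BLAS calling convention: call by pointer,
// trailing underscore, column-major storage. A real build would instead
// declare the symbol as extern "C" and link against a BLAS library.
void dgemm_(const char* transa, const char* transb,
            const int* m, const int* n, const int* k,
            const double* alpha, const double* A, const int* lda,
            const double* B, const int* ldb,
            const double* beta, double* C, const int* ldc) {
    (void)transa; (void)transb;  // sketch handles only the 'N','N' case
    for (int j = 0; j < *n; j++)
        for (int i = 0; i < *m; i++) {
            double sum = 0.0;
            for (int l = 0; l < *k; l++)
                sum += A[i + (*lda) * l] * B[l + (*ldb) * j];
            C[i + (*ldc) * j] = (*beta) * C[i + (*ldc) * j] + (*alpha) * sum;
        }
}

// C++-friendly wrapper: scalars are passed by value and the
// pointer plumbing is hidden, which is the point of the wrapper layer.
void gemm(int m, int n, int k, double alpha, const double* A,
          const double* B, double beta, double* C) {
    char no = 'N';
    dgemm_(&no, &no, &m, &n, &k, &alpha, A, &m, B, &k, &beta, C, &m);
}
```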

Function Documentation

◆ deleteHandle()

template<typename >
void TasGrid::deleteHandle ( int *  )

Deletes the handle, specialized for each TPL backend and tag in the TasGrid::AccHandle namespace.

◆ dtrans2can()

template<typename T >
void TasGrid::TasGpu::dtrans2can ( AccelerationContext const *  acc,
bool  use01,
int  dims,
int  num_x,
int  pad_size,
const double *  gpu_trans_a,
const double *  gpu_trans_b,
const T *  gpu_x_transformed,
T *  gpu_x_canonical 
)

Uses a custom kernel to convert transformed points to canonical points; all arrays live on the CUDA device.

The array gpu_x_transformed is provided by the user and must have size num_x times dims. The points are stored contiguously in strides of dims (identical to all other calls). In order to facilitate contiguous memory access, it is most efficient to assign each thread to a different dimension, but there are far fewer dimensions than threads. Thus, we pad the transforms gpu_trans_a and gpu_trans_b as if they were applied to much larger vectors (of dimension pad_size). The pad_size must be a multiple of dims, although num_x does not have to divide pad_size. The gpu_trans_a and gpu_trans_b arrays must have length pad_size and must contain the rate and shift of the transform (not the upper/lower limits).

The use01 flag indicates whether to use the canonical interval (0, 1) or (-1, 1). This function is called from AccelerationDomainTransform::getCanonicalPoints().

See the implementation of AccelerationDomainTransform::load() for more details.
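A CPU analogue of the padding logic might look as follows. The exact affine form, canonical = rate * x - shift, is an assumption for illustration (the description above only states that the arrays hold a rate and a shift), as is the helper name:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// CPU analogue of the padded transform: rate and shift have length pad_size,
// a multiple of dims, so a flat index into the point array maps to its
// transform entry with a single modulus, mimicking the GPU access pattern.
std::vector<double> trans2can_sketch(int dims, int num_x, int pad_size,
                                     const std::vector<double>& rate,
                                     const std::vector<double>& shift,
                                     const std::vector<double>& x_transformed) {
    (void)num_x;  // sizes are implied by x_transformed = dims * num_x entries
    std::vector<double> x_canonical(x_transformed.size());
    for (size_t i = 0; i < x_transformed.size(); i++)
        x_canonical[i] = rate[i % pad_size] * x_transformed[i] - shift[i % pad_size];
    return x_canonical;
}
```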

◆ devalpwpoly()

template<typename T >
void TasGrid::TasGpu::devalpwpoly ( AccelerationContext const *  acc,
int  order,
TypeOneDRule  rule,
int  num_dimensions,
int  num_x,
int  num_basis,
const T *  gpu_x,
const T *  gpu_nodes,
const T *  gpu_support,
T *  gpu_y 
)

Evaluate the basis functions for a local polynomial grid using the DENSE algorithm.

Used for basis evaluations in GridLocalPolynomial with order (0, 1, or 2) and rule. The grid has num_dimensions and num_basis functions (equal to the number of points in the grid). The number of locations to evaluate is num_x; gpu_x must have size num_x times num_dimensions and must hold points on a canonical interval. The output matrix gpu_y has dimensions num_basis (contiguous dimension) times num_x.

The grid nodes and the associated support are "encoded" in gpu_nodes and gpu_support, where negative support is used to handle special cases such as global support on level 1 (rule semi-localp). The interpretation of the negative support must be synchronized between the kernel tasgpu_devalpwpoly_feval() and GridLocalPolynomial::encodeSupportForGPU().
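For intuition, a CPU sketch of the DENSE evaluation for an order-1 (hat function) basis is shown below. The negative-support special cases mentioned above are deliberately ignored, and the per-basis node/support layout is an assumption:

```cpp
#include <algorithm>
#include <cassert>
#include <cmath>
#include <vector>

// CPU sketch of the DENSE algorithm for order-1 local polynomials:
// each basis value is a product over dimensions of hat functions
// max(0, 1 - |x - node| / support). Output y is num_basis (contiguous)
// by num_x, matching the layout described above.
std::vector<double> devalpwpoly_sketch(int num_dimensions, int num_x, int num_basis,
                                       const std::vector<double>& x,
                                       const std::vector<double>& nodes,
                                       const std::vector<double>& support) {
    std::vector<double> y(static_cast<size_t>(num_basis) * num_x, 1.0);
    for (int i = 0; i < num_x; i++)
        for (int b = 0; b < num_basis; b++)
            for (int d = 0; d < num_dimensions; d++) {
                double t = 1.0 - std::fabs(x[i * num_dimensions + d]
                                           - nodes[b * num_dimensions + d])
                                 / support[b * num_dimensions + d];
                y[i * num_basis + b] *= std::max(0.0, t);
            }
    return y;
}
```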

◆ devalpwpoly_sparse()

template<typename T >
void TasGrid::TasGpu::devalpwpoly_sparse ( AccelerationContext const *  acc,
int  order,
TypeOneDRule  rule,
int  dims,
int  num_x,
const T *  gpu_x,
const GpuVector< T > &  gpu_nodes,
const GpuVector< T > &  gpu_support,
const GpuVector< int > &  gpu_hpntr,
const GpuVector< int > &  gpu_hindx,
const GpuVector< int > &  gpu_hroots,
GpuVector< int > &  gpu_spntr,
GpuVector< int > &  gpu_sindx,
GpuVector< T > &  gpu_svals 
)

Evaluate the basis functions for a local polynomial grid using the SPARSE algorithm.

The inputs are identical to devalpwpoly() with the addition of the hierarchy vectors gpu_hpntr, gpu_hindx, and gpu_hroots. The hierarchy vectors define a series of trees, one for each entry in gpu_hroots, and all nodes combined are indexed consecutively starting from 0. The gpu_hpntr vector holds the offsets of the children of each node, and the indexes of the children are stored in gpu_hindx. The format is identical to that of a compressed sparse row matrix.

The output vectors gpu_spntr, gpu_sindx, and gpu_svals form a row-compressed sparse matrix, i.e., a format that can directly interface with the Nvidia cusparseDcsrmm2().
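The compressed sparse row format used for both the hierarchy inputs and the output can be illustrated with a tiny helper that expands a CSR triple back to a dense matrix, which is a convenient way to check such structures (this helper is not part of Tasmanian):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Expands a CSR triple (pntr, indx, vals) with num_cols columns into a
// dense row-major matrix. pntr has one entry per row plus one: row r owns
// the index range [pntr[r], pntr[r+1]), exactly the layout described above
// for gpu_spntr / gpu_sindx / gpu_svals.
std::vector<double> csr_to_dense(const std::vector<int>& pntr,
                                 const std::vector<int>& indx,
                                 const std::vector<double>& vals,
                                 int num_cols) {
    int num_rows = static_cast<int>(pntr.size()) - 1;
    std::vector<double> dense(static_cast<size_t>(num_rows) * num_cols, 0.0);
    for (int r = 0; r < num_rows; r++)
        for (int j = pntr[r]; j < pntr[r + 1]; j++)
            dense[r * num_cols + indx[j]] = vals[j];
    return dense;
}
```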

◆ devalseq()

template<typename T >
void TasGrid::TasGpu::devalseq ( AccelerationContext const *  acc,
int  dims,
int  num_x,
const std::vector< int > &  max_levels,
const T *  gpu_x,
const GpuVector< int > &  num_nodes,
const GpuVector< int > &  points,
const GpuVector< T > &  nodes,
const GpuVector< T > &  coeffs,
T *  gpu_result 
)

Evaluate the basis for a Sequence grid.

The evaluation points are defined on a canonical interval and given in gpu_x with size dims times num_x. The max_levels vector indicates the maximum level in each direction (one more than the vector stored in the GridSequence data structure); it is required to compute the offsets of the intermediate cache for the Newton polynomials. The points vector holds the same information as the MultiIndexSet, but in transposed order, i.e., the dimensions of the multi-indexes are contiguous. The kernel handles one dimension at a time, hence the switched order compared to the CPU, which handles one multi-index at a time. The nodes vector has the cached 1D nodes and coeffs holds the cached coefficients of the Newton polynomials.

The output is gpu_result which must have dimension num_x by num_nodes.size() / dims.
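The Newton polynomial cache mentioned above can be sketched in one dimension: entry j of the cache is the product of (x - node_k) over k < j, scaled by a cached coefficient. The exact normalization used by Tasmanian is an assumption here; only the incremental-product structure is the point:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// One-dimensional sketch of a Newton polynomial cache: entry j holds
// prod_{k < j} (x - nodes[k]) / coeffs[j]. Each entry reuses the running
// product from the previous one, which is what makes caching worthwhile.
std::vector<double> newton_cache_sketch(double x, const std::vector<double>& nodes,
                                        const std::vector<double>& coeffs) {
    std::vector<double> cache(nodes.size());
    double prod = 1.0;
    for (size_t j = 0; j < nodes.size(); j++) {
        cache[j] = prod / coeffs[j];
        prod *= (x - nodes[j]);
    }
    return cache;
}
```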

◆ devalfor()

template<typename T >
void TasGrid::TasGpu::devalfor ( AccelerationContext const *  acc,
int  dims,
int  num_x,
const std::vector< int > &  max_levels,
const T *  gpu_x,
const GpuVector< int > &  num_nodes,
const GpuVector< int > &  points,
T *  gpu_wreal,
typename GpuVector< T >::value_type *  gpu_wimag 
)

Evaluate the basis for a Fourier grid.

The logic is identical to devalseq(), except the Fourier polynomials do not require nodes or coefficients. The output consists of two real arrays of size num_x by num_nodes.size() / dims, corresponding to the real and imaginary parts of the basis.
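In one dimension the Fourier basis is the trigonometric family exp(2*pi*i*k*x), which splits into cos and sin for the real and imaginary parts. The sketch below illustrates that split; the frequency ordering k = 0, 1, ..., num_nodes-1 is an assumption, not Tasmanian's actual ordering:

```cpp
#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// One-dimensional sketch of the Fourier basis split into two real arrays:
// exp(2*pi*i*k*x) = cos(2*pi*k*x) + i*sin(2*pi*k*x).
// wreal/wimag are num_x by num_nodes, mirroring gpu_wreal / gpu_wimag.
void devalfor_sketch(int num_x, int num_nodes, const std::vector<double>& x,
                     std::vector<double>& wreal, std::vector<double>& wimag) {
    const double two_pi = 2.0 * 3.14159265358979323846;
    wreal.assign(static_cast<size_t>(num_x) * num_nodes, 0.0);
    wimag.assign(static_cast<size_t>(num_x) * num_nodes, 0.0);
    for (int i = 0; i < num_x; i++)
        for (int k = 0; k < num_nodes; k++) {
            wreal[i * num_nodes + k] = std::cos(two_pi * k * x[i]);
            wimag[i * num_nodes + k] = std::sin(two_pi * k * x[i]);
        }
}
```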

◆ devalglo()

template<typename T >
void TasGrid::TasGpu::devalglo ( AccelerationContext const *  acc,
bool  is_nested,
bool  is_clenshawcurtis0,
int  dims,
int  num_x,
int  num_p,
int  num_basis,
T const *  gpu_x,
GpuVector< T > const &  nodes,
GpuVector< T > const &  coeff,
GpuVector< T > const &  tensor_weights,
GpuVector< int > const &  nodes_per_level,
GpuVector< int > const &  offset_per_level,
GpuVector< int > const &  map_dimension,
GpuVector< int > const &  map_level,
GpuVector< int > const &  active_tensors,
GpuVector< int > const &  active_num_points,
GpuVector< int > const &  dim_offsets,
GpuVector< int > const &  map_tensor,
GpuVector< int > const &  map_index,
GpuVector< int > const &  map_reference,
T *  gpu_result 
)

Evaluate the basis for a Global grid.

The logic is more complicated due to the more general nature of the grid.

◆ getIOAccelerationString() [1/2]

TypeAcceleration TasGrid::AccelerationMeta::getIOAccelerationString ( const char *  name)

Convert the string (coming from C or Python) into an enumerated type.

◆ getIOAccelerationString() [2/2]

const char* TasGrid::AccelerationMeta::getIOAccelerationString ( TypeAcceleration  accel)

Convert the enumerated type to a string, the inverse of getIOAccelerationString()

◆ getIOAccelerationInt()

int TasGrid::AccelerationMeta::getIOAccelerationInt ( TypeAcceleration  accel)

Convert the enumerated type into an integer (used by the Fortran interface).

◆ getIOIntAcceleration()

TypeAcceleration TasGrid::AccelerationMeta::getIOIntAcceleration ( int  accel)

Convert the integer (coming from Fortran) into an enumerated type, the inverse of getIOAccelerationInt()

◆ isAccTypeGPU()

bool TasGrid::AccelerationMeta::isAccTypeGPU ( TypeAcceleration  accel)

Returns true if accel is cuda, cublas or magma.

◆ getAvailableFallback()

TypeAcceleration TasGrid::AccelerationMeta::getAvailableFallback ( TypeAcceleration  accel)

Implements fallback logic; if accel has been enabled through CMake then this returns accel, otherwise it returns the "next-best-thing".

This function always returns a valid acceleration type. The fallback logic is documented with the enumerated type TasGrid::TypeAcceleration.

◆ getNumGpuDevices()

int TasGrid::AccelerationMeta::getNumGpuDevices ( )

Return the number of visible GPU devices.

◆ setDefaultGpuDevice()

void TasGrid::AccelerationMeta::setDefaultGpuDevice ( int  deviceID)

Selects the active device for this CPU thread, not supported for DPC++.

The deviceID must be a valid ID (between 0 and getNumGpuDevices() - 1).

◆ getTotalGPUMemory()

unsigned long long TasGrid::AccelerationMeta::getTotalGPUMemory ( int  deviceID)

Return the total memory of the device (in units of bytes).

The deviceID must be a valid ID (between 0 and getNumGpuDevices() - 1).

◆ getGpuDeviceName()

std::string TasGrid::AccelerationMeta::getGpuDeviceName ( int  deviceID)

Returns the name of the selected GPU device, empty string if no device is available or the index is out of bounds.

The deviceID must be a valid ID (between 0 and getNumGpuDevices() - 1).

◆ recvGpuArray()

template<typename T >
void TasGrid::AccelerationMeta::recvGpuArray ( AccelerationContext const *  ,
size_t  num_entries,
const T *  gpu_data,
std::vector< T > &  cpu_data 
)

Copy a device array to the main memory, used for testing only, always favor using GpuVector (if possible).

◆ delGpuArray()

template<typename T >
void TasGrid::AccelerationMeta::delGpuArray ( AccelerationContext const *  ,
T *  x 
)

Deallocate device array, used primarily for testing, always favor using GpuVector (if possible).