Wrappers around custom CUDA kernels to handle domain transforms and basis evaluations; the kernels are instantiated in tsgCudaKernels.cu. More...

Functions
template<typename T >
void	dtrans2can (AccelerationContext const acc, bool use01, int dims, int num_x, int pad_size, const double gpu_trans_a, const double gpu_trans_b, const T gpu_x_transformed, T *gpu_x_canonical)
	Uses a custom kernel to convert transformed points to canonical points; all arrays live on the CUDA device. More...

template<typename T >
void	devalpwpoly (AccelerationContext const acc, int order, TypeOneDRule rule, int num_dimensions, int num_x, int num_basis, const T gpu_x, const T gpu_nodes, const T gpu_support, T *gpu_y)
	Evaluate the basis functions for a local polynomial grid using the DENSE algorithm. More...

template<typename T >
void	devalpwpoly_sparse (AccelerationContext const acc, int order, TypeOneDRule rule, int dims, int num_x, const T gpu_x, const GpuVector< T > &gpu_nodes, const GpuVector< T > &gpu_support, const GpuVector< int > &gpu_hpntr, const GpuVector< int > &gpu_hindx, const GpuVector< int > &gpu_hroots, GpuVector< int > &gpu_spntr, GpuVector< int > &gpu_sindx, GpuVector< T > &gpu_svals)
	Evaluate the basis functions for a local polynomial grid using the SPARSE algorithm. More...

template<typename T >
void	devalseq (AccelerationContext const acc, int dims, int num_x, const std::vector< int > &max_levels, const T gpu_x, const GpuVector< int > &num_nodes, const GpuVector< int > &points, const GpuVector< T > &nodes, const GpuVector< T > &coeffs, T *gpu_result)
	Evaluate the basis for a Sequence grid. More...

template<typename T >
void	devalfor (AccelerationContext const acc, int dims, int num_x, const std::vector< int > &max_levels, const T gpu_x, const GpuVector< int > &num_nodes, const GpuVector< int > &points, T gpu_wreal, typename GpuVector< T >::value_type gpu_wimag)
	Evaluate the basis for a Fourier grid. More...

template<typename T >
void	devalglo (AccelerationContext const acc, bool is_nested, bool is_clenshawcurtis0, int dims, int num_x, int num_p, int num_basis, T const gpu_x, GpuVector< T > const &nodes, GpuVector< T > const &coeff, GpuVector< T > const &tensor_weights, GpuVector< int > const &nodes_per_level, GpuVector< int > const &offset_per_level, GpuVector< int > const &map_dimension, GpuVector< int > const &map_level, GpuVector< int > const &active_tensors, GpuVector< int > const &active_num_points, GpuVector< int > const &dim_offsets, GpuVector< int > const &map_tensor, GpuVector< int > const &map_index, GpuVector< int > const &map_reference, T *gpu_result)
	Evaluate the basis for Global grid. More...

void	fillDataGPU (AccelerationContext const *acc, double value, long long N, long long stride, double data[])
	Fills the data with the provided real number at the given stride.

template<typename T >
void	load_n (AccelerationContext const acc, T const cpu_data, size_t num_entries, T *gpu_data)
	Similar to copy_n, copies the data from the CPU to the GPU.

template<typename T , typename U >
Utils::use_if<!std::is_same< U, T >::value >	load_n (AccelerationContext const acc, U const cpu_data, size_t num_entries, T *gpu_data)
	Similar to copy_n, copies the data from the CPU to the GPU.

template<typename scalar_type >
void	solveLSmultiGPU (AccelerationContext const *acceleration, int n, int m, scalar_type A[], int nrhs, scalar_type B[])
	Least squares solver with data sitting on the gpu device. More...

template<typename scalar_type >
void	solveLSmultiOOC (AccelerationContext const *acceleration, int n, int m, scalar_type A[], int nrhs, scalar_type B[])
	Identical to TasGpu::solveLSmultiGPU() but the arrays are on the CPU and the MAGMA out-of-core implementation is used.

template<typename scalar_type >
void	solveLSmulti (AccelerationContext const *acceleration, int n, int m, scalar_type A[], int nrhs, scalar_type B[])
	Identical to TasGpu::solveLSmultiGPU() but the data starts on the CPU and gets uploaded to the GPU first.

void	factorizePLU (AccelerationContext const *acceleration, int n, double A[], int_gpu_lapack ipiv[])
	Factorize $A = P L U$; arrays are on the GPU.

void	solvePLU (AccelerationContext const *acceleration, char trans, int n, double const A[], int_gpu_lapack const ipiv[], double b[])
	Solve A x = b using a PLU factorization.

void	solvePLU (AccelerationContext const *acceleration, char trans, int n, double const A[], int_gpu_lapack const ipiv[], int nrhs, double B[])
	Solve A X = B using a PLU factorization; B is in row-major format.

template<typename scalar_type >
void	denseMultiply (AccelerationContext const *acceleration, int M, int N, int K, typename GpuVector< scalar_type >::value_type alpha, GpuVector< scalar_type > const &A, GpuVector< scalar_type > const &B, typename GpuVector< scalar_type >::value_type beta, scalar_type C[])
	Wrapper to GPU BLAS that multiplies dense matrices (e.g., cuBlas, MAGMA). More...

template<typename scalar_type >
void	denseMultiplyMixed (AccelerationContext const *acceleration, int M, int N, int K, typename GpuVector< scalar_type >::value_type alpha, GpuVector< scalar_type > const &A, scalar_type const B[], typename GpuVector< scalar_type >::value_type beta, scalar_type C[])
	Identical to TasGpu::denseMultiply() but both B and C are arrays in CPU memory.

template<typename scalar_type >
void	sparseMultiply (AccelerationContext const *acceleration, int M, int N, int K, typename GpuVector< scalar_type >::value_type alpha, const GpuVector< scalar_type > &A, const GpuVector< int > &pntr, const GpuVector< int > &indx, const GpuVector< scalar_type > &vals, scalar_type C[])
	Wrapper to GPU methods that multiplies a sparse and a dense matrix. More...

template<typename T >
void	sparseMultiplyMixed (AccelerationContext const *acceleration, int M, int N, int K, typename GpuVector< T >::value_type alpha, const GpuVector< T > &A, const std::vector< int > &pntr, const std::vector< int > &indx, const std::vector< T > &vals, T C[])
	Identical to TasGpu::sparseMultiply() but the sparse matrix and the result C are in CPU memory.

Detailed Description

Wrappers around custom CUDA kernels to handle domain transforms and basis evaluations; the kernels are instantiated in tsgCudaKernels.cu.

Function Documentation

◆ solveLSmultiGPU()

template<typename scalar_type >

void TasGrid::TasGpu::solveLSmultiGPU	(	AccelerationContext const *	acceleration,
		int	n,
		int	m,
		scalar_type	A[],
		int	nrhs,
		scalar_type	B[]
	)

Least squares solver with data sitting on the gpu device.

Solves the least-squares problem $\min_x \| A x - B \|^2_2$ where A has n rows and m columns (n >= m) and B has n rows and nrhs columns. Both matrices are stored in row-major format on the GPU device associated with the given acceleration. The matrix B will be overwritten with the solution.

◆ denseMultiply()

template<typename scalar_type >

void TasGrid::TasGpu::denseMultiply	(	AccelerationContext const *	acceleration,
		int	M,
		int	N,
		int	K,
		typename GpuVector< scalar_type >::value_type	alpha,
		GpuVector< scalar_type > const &	A,
		GpuVector< scalar_type > const &	B,
		typename GpuVector< scalar_type >::value_type	beta,
		scalar_type	C[]
	)

Wrapper to GPU BLAS that multiplies dense matrices (e.g., cuBlas, MAGMA).

Computes $C = \alpha A B + \beta C$ where A is M by K, B is K by N, and C is M by N, all stored in column-major format on the GPU device associated with the acceleration. The signature is nearly identical to BLAS sgemm() or dgemm(), but there are no transpose variants and the leading dimensions are inferred as if the matrices have no padding.

◆ sparseMultiply()

template<typename scalar_type >

void TasGrid::TasGpu::sparseMultiply	(	AccelerationContext const *	acceleration,
		int	M,
		int	N,
		int	K,
		typename GpuVector< scalar_type >::value_type	alpha,
		const GpuVector< scalar_type > &	A,
		const GpuVector< int > &	pntr,
		const GpuVector< int > &	indx,
		const GpuVector< scalar_type > &	vals,
		scalar_type	C[]
	)

Wrapper to GPU methods that multiplies a sparse and a dense matrix.

Computes $C = \alpha A B$ where A is M by K, B is K by N, and C is M by N; the matrices are in column-major format, with B being sparse in column-compressed format.

Functions

Detailed Description

Function Documentation

◆ solveLSmultiGPU()

◆ denseMultiply()

◆ sparseMultiply()