Doxygen
1.9.1
|
Wrappers around custom CUDA kernels to handle domain transforms and basis evaluations, the kernels are instantiated in tsgCudaKernels.cu. More...
Functions | |
template<typename T > | |
void | dtrans2can (AccelerationContext const *acc, bool use01, int dims, int num_x, int pad_size, const double *gpu_trans_a, const double *gpu_trans_b, const T *gpu_x_transformed, T *gpu_x_canonical) |
Uses custom kernel to convert transformed points to canonical points, all arrays live on the CUDA device. More... | |
template<typename T > | |
void | devalpwpoly (AccelerationContext const *acc, int order, TypeOneDRule rule, int num_dimensions, int num_x, int num_basis, const T *gpu_x, const T *gpu_nodes, const T *gpu_support, T *gpu_y) |
Evaluate the basis functions for a local polynomial grid using the DENSE algorithm. More... | |
template<typename T > | |
void | devalpwpoly_sparse (AccelerationContext const *acc, int order, TypeOneDRule rule, int dims, int num_x, const T *gpu_x, const GpuVector< T > &gpu_nodes, const GpuVector< T > &gpu_support, const GpuVector< int > &gpu_hpntr, const GpuVector< int > &gpu_hindx, const GpuVector< int > &gpu_hroots, GpuVector< int > &gpu_spntr, GpuVector< int > &gpu_sindx, GpuVector< T > &gpu_svals) |
Evaluate the basis functions for a local polynomial grid using the SPARSE algorithm. More... | |
template<typename T > | |
void | devalseq (AccelerationContext const *acc, int dims, int num_x, const std::vector< int > &max_levels, const T *gpu_x, const GpuVector< int > &num_nodes, const GpuVector< int > &points, const GpuVector< T > &nodes, const GpuVector< T > &coeffs, T *gpu_result) |
Evaluate the basis for a Sequence grid. More... | |
template<typename T > | |
void | devalfor (AccelerationContext const *acc, int dims, int num_x, const std::vector< int > &max_levels, const T *gpu_x, const GpuVector< int > &num_nodes, const GpuVector< int > &points, T *gpu_wreal, typename GpuVector< T >::value_type *gpu_wimag) |
Evaluate the basis for a Fourier grid. More... | |
template<typename T > | |
void | devalglo (AccelerationContext const *acc, bool is_nested, bool is_clenshawcurtis0, int dims, int num_x, int num_p, int num_basis, T const *gpu_x, GpuVector< T > const &nodes, GpuVector< T > const &coeff, GpuVector< T > const &tensor_weights, GpuVector< int > const &nodes_per_level, GpuVector< int > const &offset_per_level, GpuVector< int > const &map_dimension, GpuVector< int > const &map_level, GpuVector< int > const &active_tensors, GpuVector< int > const &active_num_points, GpuVector< int > const &dim_offsets, GpuVector< int > const &map_tensor, GpuVector< int > const &map_index, GpuVector< int > const &map_reference, T *gpu_result) |
Evaluate the basis for Global grid. More... | |
void | fillDataGPU (AccelerationContext const *acc, double value, long long N, long long stride, double data[]) |
Fills the data with the provided real number at the given stride. | |
template<typename T > | |
void | load_n (AccelerationContext const *acc, T const *cpu_data, size_t num_entries, T *gpu_data) |
Similar to copy_n, copies the data from the CPU to the GPU. | |
template<typename T , typename U > | |
Utils::use_if<!std::is_same< U, T >::value > | load_n (AccelerationContext const *acc, U const *cpu_data, size_t num_entries, T *gpu_data) |
Similar to copy_n, copies the data from the CPU to the GPU. | |
template<typename scalar_type > | |
void | solveLSmultiGPU (AccelerationContext const *acceleration, int n, int m, scalar_type A[], int nrhs, scalar_type B[]) |
Least squares solver with data sitting on the gpu device. More... | |
template<typename scalar_type > | |
void | solveLSmultiOOC (AccelerationContext const *acceleration, int n, int m, scalar_type A[], int nrhs, scalar_type B[]) |
Identical to TasGpu::solveLSmultiGPU() but the arrays are on the CPU and the MAGMA out-of-core implementation is used. | |
template<typename scalar_type > | |
void | solveLSmulti (AccelerationContext const *acceleration, int n, int m, scalar_type A[], int nrhs, scalar_type B[]) |
Identical to TasGpu::solveLSmultiGPU() but the data starts with the CPU and gets uploaded to the GPU first. | |
void | factorizePLU (AccelerationContext const *acceleration, int n, double A[], int_gpu_lapack ipiv[]) |
Factorize $A = P L U$ (PLU factorization of the n-by-n matrix A with pivots stored in ipiv). | |
void | solvePLU (AccelerationContext const *acceleration, char trans, int n, double const A[], int_gpu_lapack const ipiv[], double b[]) |
Solve A x = b using a PLU factorization. | |
void | solvePLU (AccelerationContext const *acceleration, char trans, int n, double const A[], int_gpu_lapack const ipiv[], int nrhs, double B[]) |
Solve A x = b using a PLU factorization, B is in row-major format. | |
template<typename scalar_type > | |
void | denseMultiply (AccelerationContext const *acceleration, int M, int N, int K, typename GpuVector< scalar_type >::value_type alpha, GpuVector< scalar_type > const &A, GpuVector< scalar_type > const &B, typename GpuVector< scalar_type >::value_type beta, scalar_type C[]) |
Wrapper to GPU BLAS that multiplies dense matrices (e.g., cuBlas, MAGMA). More... | |
template<typename scalar_type > | |
void | denseMultiplyMixed (AccelerationContext const *acceleration, int M, int N, int K, typename GpuVector< scalar_type >::value_type alpha, GpuVector< scalar_type > const &A, scalar_type const B[], typename GpuVector< scalar_type >::value_type beta, scalar_type C[]) |
Identical to TasGpu::denseMultiply() but both B and C are arrays in CPU memory. | |
template<typename scalar_type > | |
void | sparseMultiply (AccelerationContext const *acceleration, int M, int N, int K, typename GpuVector< scalar_type >::value_type alpha, const GpuVector< scalar_type > &A, const GpuVector< int > &pntr, const GpuVector< int > &indx, const GpuVector< scalar_type > &vals, scalar_type C[]) |
Wrapper to GPU methods that multiplies a sparse and a dense matrix. More... | |
template<typename T > | |
void | sparseMultiplyMixed (AccelerationContext const *acceleration, int M, int N, int K, typename GpuVector< T >::value_type alpha, const GpuVector< T > &A, const std::vector< int > &pntr, const std::vector< int > &indx, const std::vector< T > &vals, T C[]) |
Identical to TasGpu::sparseMultiply() but the sparse matrix and the result C are in CPU memory. | |
Wrappers around custom CUDA kernels to handle domain transforms and basis evaluations, the kernels are instantiated in tsgCudaKernels.cu.
void TasGrid::TasGpu::solveLSmultiGPU | ( | AccelerationContext const * | acceleration, |
int | n, | ||
int | m, | ||
scalar_type | A[], | ||
int | nrhs, | ||
scalar_type | B[] | ||
) |
Least squares solver with data sitting on the gpu device.
Solves the least squares problem $\min_X \| A X - B \|_2$ where A has n rows and m columns (n >= m) and B has n rows and nrhs columns. Both matrices are stored in row-major format on the GPU device associated with the given acceleration. The matrix B will be overwritten with the solution.
void TasGrid::TasGpu::denseMultiply | ( | AccelerationContext const * | acceleration, |
int | M, | ||
int | N, | ||
int | K, | ||
typename GpuVector< scalar_type >::value_type | alpha, | ||
GpuVector< scalar_type > const & | A, | ||
GpuVector< scalar_type > const & | B, | ||
typename GpuVector< scalar_type >::value_type | beta, | ||
scalar_type | C[] | ||
) |
Wrapper to GPU BLAS that multiplies dense matrices (e.g., cuBlas, MAGMA).
Computes $C = \alpha A B + \beta C$ where A is M by K, B is K by N, and C is M by N, all stored in column-major format on the GPU device associated with the acceleration. The signature is nearly identical to BLAS sgemm() or dgemm(), but there are no transpose variants and the leading dimensions are inferred as if the matrices have no padding.
void TasGrid::TasGpu::sparseMultiply | ( | AccelerationContext const * | acceleration, |
int | M, | ||
int | N, | ||
int | K, | ||
typename GpuVector< scalar_type >::value_type | alpha, | ||
const GpuVector< scalar_type > & | A, | ||
const GpuVector< int > & | pntr, | ||
const GpuVector< int > & | indx, | ||
const GpuVector< scalar_type > & | vals, | ||
scalar_type | C[] | ||
) |
Wrapper to GPU methods that multiplies a sparse and a dense matrix.
Computes $C = \alpha A B$ where A is M by K, B is K by N, and C is M by N; the matrices are in column-major format, with B being sparse in column-compressed format.