31#ifndef __TASMANIAN_SPARSE_GRID_ACCELERATED_DATA_STRUCTURES_HPP
32#define __TASMANIAN_SPARSE_GRID_ACCELERATED_DATA_STRUCTURES_HPP
34#include "tsgAcceleratedHandles.hpp"
81struct AccelerationContext;
103 GpuVector(
GpuVector<T> &&other) : num_entries(Utils::exchange(other.num_entries, 0)), gpu_data(Utils::exchange(other.gpu_data, nullptr))
104 #ifdef Tasmanian_ENABLE_DPCPP
105 , sycl_queue(other.sycl_queue)
111 std::swap(num_entries, temp.num_entries);
112 std::swap(gpu_data, temp.gpu_data);
113 #ifdef Tasmanian_ENABLE_DPCPP
114 std::swap(sycl_queue, temp.sycl_queue);
143 size_t size()
const{
return num_entries; }
147 const T*
data()
const{
return gpu_data; }
154 bool empty()
const{
return (num_entries == 0); }
160 template<
typename IteratorLike>
162 load(acc, std::distance(ibegin, iend), &*ibegin);
172 load(acc, cpu_data.size(), cpu_data.data());
191 std::vector<T> converted(count);
192 std::transform(cpu_data, cpu_data + count, converted.begin(), [](U
const &x)->T{ return static_cast<T>(x); });
193 load(acc, converted);
197 cpu_data.resize(num_entries);
198 unload(acc, cpu_data.data());
213 T* external = gpu_data;
225 #ifdef Tasmanian_ENABLE_DPCPP
237 #ifdef Tasmanian_ENABLE_CUDA
253 #ifdef Tasmanian_ENABLE_HIP
255 void setRocBlasHandle(
void *handle);
257 void setRocSparseHandle(
void *handle);
260 std::unique_ptr<int, HandleDeleter<AccHandle::Rocblas>> rblas_handle;
262 std::unique_ptr<int, HandleDeleter<AccHandle::Rocsparse>> rsparse_handle;
265 #ifdef Tasmanian_ENABLE_DPCPP
267 void setSyclQueue(
void *queue);
269 std::unique_ptr<int, HandleDeleter<AccHandle::Syclqueue>> internal_queue;
302 int num_dimensions, padded_size;
327 void dtrans2can(
AccelerationContext const *acc,
bool use01,
int dims,
int num_x,
int pad_size,
328 const double *gpu_trans_a,
const double *gpu_trans_b,
329 const T *gpu_x_transformed, T *gpu_x_canonical);
344 void devalpwpoly(
AccelerationContext const *acc,
int order,
TypeOneDRule rule,
int num_dimensions,
int num_x,
int num_basis,
const T *gpu_x,
const T *gpu_nodes,
const T *gpu_support, T *gpu_y);
376 void devalseq(
AccelerationContext const *acc,
int dims,
int num_x,
const std::vector<int> &max_levels,
const T *gpu_x,
398 void devalglo(
AccelerationContext const *acc,
bool is_nested,
bool is_clenshawcurtis0,
int dims,
int num_x,
int num_p,
int num_basis,
409 void fillDataGPU(
AccelerationContext const *acc,
double value,
long long N,
long long stride,
double data[]);
415 template<
typename T>
void load_n(
AccelerationContext const *acc, T
const *cpu_data,
size_t num_entries, T *gpu_data);
421 template<
typename T,
typename U>
423 std::vector<T> converted(num_entries);
424 std::transform(cpu_data, cpu_data + num_entries, converted.begin(), [](U
const &x)->T{ return static_cast<T>(x); });
425 load_n(acc, converted.data(), num_entries, gpu_data);
429 #ifdef __TASMANIAN_COMPILE_FALLBACK_CUDA_KERNELS__
435 void cudaDgemm(
int M,
int N,
int K,
const double *gpu_a,
const double *gpu_b,
double *gpu_c);
440 void cudaSparseMatmul(
int M,
int N,
int num_nz,
const int* gpu_spntr,
const int* gpu_sindx,
const double* gpu_svals,
const double *gpu_B,
double *gpu_C);
445 void cudaSparseVecDenseMat(
int M,
int N,
int num_nz,
const double *A,
const int *indx,
const double *vals,
double *C);
450 void convert_sparse_to_dense(
int num_rows,
int num_columns,
const int *gpu_pntr,
const int *gpu_indx,
const double *gpu_vals,
double *gpu_destination);
458namespace AccelerationMeta{
464 std::map<std::string, TypeAcceleration> getStringToAccelerationMap();
485 #ifdef Tasmanian_ENABLE_MAGMA
488 #ifdef Tasmanian_ENABLE_CUDA
492 #ifdef Tasmanian_ENABLE_HIP
496 #ifdef Tasmanian_ENABLE_DPCPP
500 #ifdef Tasmanian_ENABLE_BLAS
521 int getNumGpuDevices();
528 void setDefaultGpuDevice(
int deviceID);
535 unsigned long long getTotalGPUMemory(
int deviceID);
542 std::string getGpuDeviceName(
int deviceID);
547 template<
typename T>
void recvGpuArray(
AccelerationContext const*,
size_t num_entries,
const T *gpu_data, std::vector<T> &cpu_data);
558 void *createCublasHandle();
563 void deleteCublasHandle(
void *);
614 mutable std::unique_ptr<GpuEngine>
engine;
618 #ifdef Tasmanian_ENABLE_BLAS
626 #ifdef Tasmanian_ENABLE_DPCPP
638 #if __cplusplus > 201103L
665 #ifdef Tasmanian_ENABLE_BLAS
674 #if defined(Tasmanian_ENABLE_CUDA) || defined(Tasmanian_ENABLE_HIP)
683 TypeAcceleration effective_acc = AccelerationMeta::getAvailableFallback(acc);
684 #ifdef Tasmanian_ENABLE_DPCPP
685 if (AccelerationMeta::isAccTypeGPU(effective_acc) and ((new_gpu_id < -1 or new_gpu_id >= AccelerationMeta::getNumGpuDevices())))
686 throw std::runtime_error(
"Invalid GPU device ID, see ./tasgrid -v for list of detected devices.");
688 if (AccelerationMeta::isAccTypeGPU(effective_acc) and ((new_gpu_id < 0 or new_gpu_id >= AccelerationMeta::getNumGpuDevices())))
689 throw std::runtime_error(
"Invalid GPU device ID, see ./tasgrid -v for list of detected devices.");
695 return (AccelerationMeta::isAccTypeGPU(effective_acc)) ? device_change :
change_gpu_device;
697 return (AccelerationMeta::isAccTypeGPU(effective_acc)) ?
change_gpu_enabled : mode_change;
704 TypeAcceleration effective_acc = AccelerationMeta::getAvailableFallback(acc);
706 #ifdef Tasmanian_ENABLE_DPCPP
707 if (AccelerationMeta::isAccTypeGPU(effective_acc) and ((new_gpu_id < -1 or new_gpu_id >= AccelerationMeta::getNumGpuDevices())))
708 throw std::runtime_error(
"Invalid GPU device ID, see ./tasgrid -v for list of detected devices.");
710 if (AccelerationMeta::isAccTypeGPU(effective_acc) and ((new_gpu_id < 0 or new_gpu_id >= AccelerationMeta::getNumGpuDevices())))
711 throw std::runtime_error(
"Invalid GPU device ID, see ./tasgrid -v for list of detected devices.");
713 if (AccelerationMeta::isAccTypeGPU(effective_acc)){
717 engine = Utils::make_unique<GpuEngine>();
723 mode = effective_acc;
734#ifdef Tasmanian_ENABLE_DPCPP
748struct InternalSyclQueue{
750 InternalSyclQueue() : use_testing(false){}
752 void init_testing(
int gpuid);
754 operator std::unique_ptr<int, HandleDeleter<AccHandle::Syclqueue>> (){
755 return std::unique_ptr<int, HandleDeleter<AccHandle::Syclqueue>>(test_queue.get(),
756 HandleDeleter<AccHandle::Syclqueue>(
false));
761 std::unique_ptr<int, HandleDeleter<AccHandle::Syclqueue>> test_queue;
770extern InternalSyclQueue test_queue;
Implements the domain transform algorithms in case the user data is provided on the GPU.
Definition tsgAcceleratedDataStructures.hpp:284
AccelerationDomainTransform(AccelerationContext const *, std::vector< double > const &transform_a, std::vector< double > const &transform_b)
Constructor, load the transform data to the GPU, the vectors are the same as used in the TasmanianSpa...
void getCanonicalPoints(bool use01, T const gpu_transformed_x[], int num_x, GpuVector< T > &gpu_canonical_x)
Transform a set of points, used in the calls to evaluateHierarchicalFunctionsGPU() Takes the user pro...
Template class that wraps around a single GPU array, providing functionality that mimics std::vector.
Definition tsgAcceleratedDataStructures.hpp:95
~GpuVector()
Destructor, release all allocated memory.
Definition tsgAcceleratedDataStructures.hpp:140
bool empty() const
Return true if the size() is zero.
Definition tsgAcceleratedDataStructures.hpp:154
void unload(AccelerationContext const *acc, std::vector< T > &cpu_data) const
Copy the data from the GPU array to cpu_data, the cpu_data will be resized and overwritten.
Definition tsgAcceleratedDataStructures.hpp:196
GpuVector(AccelerationContext const *acc, IteratorLike ibegin, IteratorLike iend)
Construct a vector by loading from a given range.
Definition tsgAcceleratedDataStructures.hpp:138
GpuVector(GpuVector< T > const &)=delete
Delete the copy-constructor.
Utils::use_if<!std::is_same< U, T >::value > load(AccelerationContext const *acc, size_t count, const U *cpu_data)
Takes a vector with entries of different precision, converts and loads.
Definition tsgAcceleratedDataStructures.hpp:190
std::vector< T > unload(AccelerationContext const *acc) const
Return a CPU vector holding the data of the GPU.
Definition tsgAcceleratedDataStructures.hpp:201
T * eject()
Move the data to the external array, the vector is set to empty (unlike move command on std::vector).
Definition tsgAcceleratedDataStructures.hpp:212
GpuVector()
Default constructor, creates an empty (null) array.
Definition tsgAcceleratedDataStructures.hpp:120
void load(AccelerationContext const *acc, IteratorLike ibegin, IteratorLike iend)
Load from a range defined by the begin and end, converts if necessary.
Definition tsgAcceleratedDataStructures.hpp:161
void unload(AccelerationContext const *acc, size_t num, T *cpu_data) const
Copy the first num entries to the cpu_data buffer, assumes that the buffer is sufficiently large.
GpuVector(AccelerationContext const *acc, int dim1, int dim2)
Same as GpuVector(dim1 * dim2), but guards against overflow.
Definition tsgAcceleratedDataStructures.hpp:132
GpuVector(AccelerationContext const *acc, size_t count)
Construct a vector with count number of entries.
Definition tsgAcceleratedDataStructures.hpp:122
void load(AccelerationContext const *acc, size_t count, const T *cpu_data)
Copy the first count entries of cpu_data to the GPU device.
void unload(AccelerationContext const *acc, T *cpu_data) const
Copy the data from the GPU array to the cpu_data buffer, assumes that the buffer is sufficiently larg...
Definition tsgAcceleratedDataStructures.hpp:209
T value_type
The data-type of the vector entries.
Definition tsgAcceleratedDataStructures.hpp:220
void clear()
Delete all allocated memory and reset the array to empty.
GpuVector< T > & operator=(GpuVector< T > const &)=delete
Delete the copy-assignment.
size_t size() const
Return the current size of the GPU array.
Definition tsgAcceleratedDataStructures.hpp:143
const T * data() const
Get a const-reference to the GPU array, which can be used as input to GPU libraries and kernels.
Definition tsgAcceleratedDataStructures.hpp:147
void resize(AccelerationContext const *acc, size_t count)
Clear all data currently stored in the vector and allocate a new array (unlike std::vector this does ...
GpuVector(AccelerationContext const *acc, const std::vector< T > &cpu_data)
Create a vector with size that matches cpu_data and copy the data to the GPU device.
Definition tsgAcceleratedDataStructures.hpp:134
GpuVector(AccelerationContext const *acc, int dim1, int dim2, T const *cpu_data)
Construct a vector and load with data provided on the cpu.
Definition tsgAcceleratedDataStructures.hpp:136
GpuVector(GpuVector< T > &&other)
Allow for move-construction.
Definition tsgAcceleratedDataStructures.hpp:103
T * data()
Get a reference to the GPU array, which can be used as input to GPU libraries and kernels.
Definition tsgAcceleratedDataStructures.hpp:145
Utils::use_if<!std::is_same< U, T >::value > load(AccelerationContext const *acc, const std::vector< U > &cpu_data)
Takes a vector with entries of different precision, converts and loads.
Definition tsgAcceleratedDataStructures.hpp:171
void load(AccelerationContext const *acc, const std::vector< T > &cpu_data)
Copy the content of cpu_data to the GPU device, all pre-existing data is deleted and the vector is re...
Definition tsgAcceleratedDataStructures.hpp:157
TypeOneDRule
Used to specify the one dimensional family of rules that induces the sparse grid.
Definition tsgEnumerates.hpp:285
constexpr TypeAcceleration accel_gpu_rocblas
At the front API, the HIP and CUDA options are equivalent, see TasGrid::TypeAcceleration.
Definition tsgEnumerates.hpp:575
constexpr TypeAcceleration accel_gpu_hip
At the front API, the HIP and CUDA options are equivalent, see TasGrid::TypeAcceleration.
Definition tsgEnumerates.hpp:570
TypeAcceleration
Modes of acceleration.
Definition tsgEnumerates.hpp:551
@ accel_cpu_blas
Default (if available), uses both BLAS and LAPACK libraries.
Definition tsgEnumerates.hpp:555
@ accel_none
Usually the slowest mode, uses only OpenMP multi-threading, but optimized for memory and could be the...
Definition tsgEnumerates.hpp:553
@ accel_gpu_magma
Same as the CUDA option but uses the UTK MAGMA library for the linear algebra operations.
Definition tsgEnumerates.hpp:563
@ accel_gpu_cublas
Mixed usage of the CPU (OpenMP) and GPU libraries.
Definition tsgEnumerates.hpp:559
@ accel_gpu_cuda
Similar to the cuBLAS option but also uses a set of Tasmanian custom GPU kernels.
Definition tsgEnumerates.hpp:561
size_t size_mult(IntA a, IntB b)
Converts two integer-like variables to size_t and returns the product.
Definition tsgUtils.hpp:82
typename std::enable_if< condition, void >::type use_if
Equivalent to C++14 enable_if_t<condition, void>
Definition tsgUtils.hpp:147
Encapsulates the Tasmanian Sparse Grid module.
Definition TasmanianSparseGrid.hpp:68
Wrapper class around GPU device ID, acceleration type and GpuEngine.
Definition tsgAcceleratedDataStructures.hpp:576
ChangeType testEnable(TypeAcceleration acc, int new_gpu_id) const
Returns the ChangeType if enable() is called, but does not change the acceleration.
Definition tsgAcceleratedDataStructures.hpp:682
AlgorithmPreference algorithm_select
The preference to use dense or sparse algorithms.
Definition tsgAcceleratedDataStructures.hpp:609
void enable(TypeAcceleration acc, int new_gpu_id)
Accepts parameters directly from TasmanianSparseGrid::enableAcceleration()
Definition tsgAcceleratedDataStructures.hpp:702
static constexpr int getDefaultAccDevice()
Returns the default acceleration device, CUDA/HIP use GPU 0, SYCL uses -1 which uses sycl::default_se...
Definition tsgAcceleratedDataStructures.hpp:625
ChangeType favorSparse(bool favor)
Sets algorithm affinity in the direction of sparse.
Definition tsgAcceleratedDataStructures.hpp:637
TypeAcceleration mode
The current active acceleration mode.
Definition tsgAcceleratedDataStructures.hpp:607
bool on_gpu() const
Returns true if any of the GPU-based acceleration modes have been enabled.
Definition tsgAcceleratedDataStructures.hpp:731
std::unique_ptr< GpuEngine > engine
Holds the context to the GPU TPL handles, e.g., MAGMA queue.
Definition tsgAcceleratedDataStructures.hpp:614
AlgorithmPreference
Defines the sparse-dense algorithm flavors, whenever applicable.
Definition tsgAcceleratedDataStructures.hpp:578
@ algorithm_dense
Use dense algorithm.
Definition tsgAcceleratedDataStructures.hpp:580
@ algorithm_sparse
Use sparse algorithm.
Definition tsgAcceleratedDataStructures.hpp:582
@ algorithm_autoselect
Automatically select based on heuristics.
Definition tsgAcceleratedDataStructures.hpp:584
bool blasCompatible() const
Returns true if BLAS is enabled and the current mode is not none.
Definition tsgAcceleratedDataStructures.hpp:664
static constexpr TypeAcceleration getDefaultAccMode()
Returns the default acceleration mode, cpu_blas if BLAS is enabled and none otherwise.
Definition tsgAcceleratedDataStructures.hpp:617
int device
If using a GPU acceleration mode, holds the active device.
Definition tsgAcceleratedDataStructures.hpp:611
void setDevice() const
Set default device.
Definition tsgAcceleratedDataStructures.hpp:727
ChangeType
Defines the types of acceleration context updates so they can be linked to acceleration cache updates...
Definition tsgAcceleratedDataStructures.hpp:593
@ change_none
No change, do nothing.
Definition tsgAcceleratedDataStructures.hpp:595
@ change_sparse_dense
Change the sparse-dense AlgorithmPreference.
Definition tsgAcceleratedDataStructures.hpp:603
@ change_gpu_device
Change the associated GPU device.
Definition tsgAcceleratedDataStructures.hpp:597
@ change_cpu_blas
Change BLAS to none or none to BLAS.
Definition tsgAcceleratedDataStructures.hpp:601
@ change_gpu_enabled
Change from BLAS or none to a GPU acceleration mode.
Definition tsgAcceleratedDataStructures.hpp:599
AccelerationContext()
Creates a default context, the device id is set to 0 and acceleration is BLAS (if available) or none.
Definition tsgAcceleratedDataStructures.hpp:634
bool useKernels() const
Returns true if the current mode implies the use of custom GPU kernels.
Definition tsgAcceleratedDataStructures.hpp:673
Wrapper class around calls GPU accelerated linear algebra libraries.
Definition tsgAcceleratedDataStructures.hpp:236
std::unique_ptr< int > called_magma_init
Avoids an empty engine when no acceleration is enabled, allows for default constructor/move/copy,...
Definition tsgAcceleratedDataStructures.hpp:273
void setCuSolverDnHandle(void *handle)
Manually sets the cuSparse handle, handle must be a valid cusolverDnHandle_t associated with this CUD...
std::unique_ptr< int, HandleDeleter< AccHandle::Cusolver > > cusolver_handle
Holds the cuSolver handle.
Definition tsgAcceleratedDataStructures.hpp:250
void setCuBlasHandle(void *handle)
Manually sets the cuBlas handle, handle must be a valid cublasHandle_t associated with this CUDA devi...
void setCuSparseHandle(void *handle)
Manually sets the cuSparse handle, handle must be a valid cusparseHandle_t associated with this CUDA ...
std::unique_ptr< int, HandleDeleter< AccHandle::Cublas > > cublas_handle
Holds the cuBlas handle.
Definition tsgAcceleratedDataStructures.hpp:246
std::unique_ptr< int, HandleDeleter< AccHandle::Cusparse > > cusparse_handle
Holds the cuSparse handle.
Definition tsgAcceleratedDataStructures.hpp:248