31 #ifndef __TASMANIAN_SPARSE_GRID_ACCELERATED_DATA_STRUCTURES_HPP
32 #define __TASMANIAN_SPARSE_GRID_ACCELERATED_DATA_STRUCTURES_HPP
34 #include "tsgAcceleratedHandles.hpp"
81 struct AccelerationContext;
104 #ifdef Tasmanian_ENABLE_DPCPP
105 , sycl_queue(other.sycl_queue)
111 std::swap(num_entries, temp.num_entries);
112 std::swap(gpu_data, temp.gpu_data);
113 #ifdef Tasmanian_ENABLE_DPCPP
114 std::swap(sycl_queue, temp.sycl_queue);
143 size_t size()
const{
return num_entries; }
147 const T*
data()
const{
return gpu_data; }
154 bool empty()
const{
return (num_entries == 0); }
160 template<
typename IteratorLike>
162 load(acc, std::distance(ibegin, iend), &*ibegin);
172 load(acc, cpu_data.size(), cpu_data.data());
191 std::vector<T> converted(count);
192 std::transform(cpu_data, cpu_data + count, converted.begin(), [](U
const &x)->T{ return static_cast<T>(x); });
193 load(acc, converted);
197 cpu_data.resize(num_entries);
198 unload(acc, cpu_data.data());
213 T* external = gpu_data;
225 #ifdef Tasmanian_ENABLE_DPCPP
237 #ifdef Tasmanian_ENABLE_CUDA
253 #ifdef Tasmanian_ENABLE_HIP
255 void setRocBlasHandle(
void *handle);
257 void setRocSparseHandle(
void *handle);
260 std::unique_ptr<int, HandleDeleter<AccHandle::Rocblas>> rblas_handle;
262 std::unique_ptr<int, HandleDeleter<AccHandle::Rocsparse>> rsparse_handle;
265 #ifdef Tasmanian_ENABLE_DPCPP
267 void setSyclQueue(
void *queue);
269 std::unique_ptr<int, HandleDeleter<AccHandle::Syclqueue>> internal_queue;
302 int num_dimensions, padded_size;
328 const double *gpu_trans_a,
const double *gpu_trans_b,
329 const T *gpu_x_transformed, T *gpu_x_canonical);
421 template<
typename T,
typename U>
423 std::vector<T> converted(num_entries);
424 std::transform(cpu_data, cpu_data + num_entries, converted.begin(), [](U
const &x)->T{ return static_cast<T>(x); });
425 load_n(acc, converted.data(), num_entries, gpu_data);
429 #ifdef __TASMANIAN_COMPILE_FALLBACK_CUDA_KERNELS__
435 void cudaDgemm(
int M,
int N,
int K,
const double *gpu_a,
const double *gpu_b,
double *gpu_c);
440 void cudaSparseMatmul(
int M,
int N,
int num_nz,
const int* gpu_spntr,
const int* gpu_sindx,
const double* gpu_svals,
const double *gpu_B,
double *gpu_C);
445 void cudaSparseVecDenseMat(
int M,
int N,
int num_nz,
const double *A,
const int *indx,
const double *vals,
double *C);
450 void convert_sparse_to_dense(
int num_rows,
int num_columns,
const int *gpu_pntr,
const int *gpu_indx,
const double *gpu_vals,
double *gpu_destination);
458 namespace AccelerationMeta{
485 #ifdef Tasmanian_ENABLE_MAGMA
488 #ifdef Tasmanian_ENABLE_CUDA
492 #ifdef Tasmanian_ENABLE_HIP
496 #ifdef Tasmanian_ENABLE_DPCPP
500 #ifdef Tasmanian_ENABLE_BLAS
614 mutable std::unique_ptr<GpuEngine>
engine;
618 #ifdef Tasmanian_ENABLE_BLAS
626 #ifdef Tasmanian_ENABLE_DPCPP
655 #ifdef Tasmanian_ENABLE_BLAS
664 #if defined(Tasmanian_ENABLE_CUDA) || defined(Tasmanian_ENABLE_HIP)
674 #ifdef Tasmanian_ENABLE_DPCPP
676 throw std::runtime_error(
"Invalid GPU device ID, see ./tasgrid -v for list of detected devices.");
679 throw std::runtime_error(
"Invalid GPU device ID, see ./tasgrid -v for list of detected devices.");
696 #ifdef Tasmanian_ENABLE_DPCPP
698 throw std::runtime_error(
"Invalid GPU device ID, see ./tasgrid -v for list of detected devices.");
701 throw std::runtime_error(
"Invalid GPU device ID, see ./tasgrid -v for list of detected devices.");
707 engine = Utils::make_unique<GpuEngine>();
713 mode = effective_acc;
724 #ifdef Tasmanian_ENABLE_DPCPP
738 struct InternalSyclQueue{
740 InternalSyclQueue() : use_testing(false){}
742 void init_testing(
int gpuid);
744 operator std::unique_ptr<int, HandleDeleter<AccHandle::Syclqueue>> (){
745 return std::unique_ptr<int, HandleDeleter<AccHandle::Syclqueue>>(test_queue.get(),
746 HandleDeleter<AccHandle::Syclqueue>(
false));
751 std::unique_ptr<int, HandleDeleter<AccHandle::Syclqueue>> test_queue;
760 extern InternalSyclQueue test_queue;
Implements the domain transform algorithms in case the user data is provided on the GPU.
Definition: tsgAcceleratedDataStructures.hpp:284
AccelerationDomainTransform(AccelerationContext const *, std::vector< double > const &transform_a, std::vector< double > const &transform_b)
Constructor, load the transform data to the GPU, the vectors are the same as used in the TasmanianSpa...
void getCanonicalPoints(bool use01, T const gpu_transformed_x[], int num_x, GpuVector< T > &gpu_canonical_x)
Transform a set of points, used in the calls to evaluateHierarchicalFunctionsGPU() Takes the user pro...
Template class that wraps around a single GPU array, providing functionality that mimics std::vector.
Definition: tsgAcceleratedDataStructures.hpp:95
std::vector< T > unload(AccelerationContext const *acc) const
Return a CPU vector holding the data of the GPU.
Definition: tsgAcceleratedDataStructures.hpp:201
~GpuVector()
Destructor, release all allocated memory.
Definition: tsgAcceleratedDataStructures.hpp:140
Utils::use_if<!std::is_same< U, T >::value > load(AccelerationContext const *acc, size_t count, const U *cpu_data)
Takes a vector with entries of different precision, converts and loads.
Definition: tsgAcceleratedDataStructures.hpp:190
const T * data() const
Get a const-reference to the GPU array, which can be used as input to GPU libraries and kernels.
Definition: tsgAcceleratedDataStructures.hpp:147
bool empty() const
Return true if the size() is zero.
Definition: tsgAcceleratedDataStructures.hpp:154
void unload(AccelerationContext const *acc, std::vector< T > &cpu_data) const
Copy the data from the GPU array to cpu_data, the cpu_data will be resized and overwritten.
Definition: tsgAcceleratedDataStructures.hpp:196
GpuVector(AccelerationContext const *acc, IteratorLike ibegin, IteratorLike iend)
Construct a vector by loading from a given range.
Definition: tsgAcceleratedDataStructures.hpp:138
GpuVector(GpuVector< T > const &)=delete
Delete the copy-constructor.
T * eject()
Move the data to the external array, the vector is set to empty (unlike move command on std::vector).
Definition: tsgAcceleratedDataStructures.hpp:212
T * data()
Get a reference to the GPU array, which can be used as input to GPU libraries and kernels.
Definition: tsgAcceleratedDataStructures.hpp:145
Utils::use_if<!std::is_same< U, T >::value > load(AccelerationContext const *acc, const std::vector< U > &cpu_data)
Takes a vector with entries of different precision, converts and loads.
Definition: tsgAcceleratedDataStructures.hpp:171
GpuVector()
Default constructor, creates an empty (null) array.
Definition: tsgAcceleratedDataStructures.hpp:120
void load(AccelerationContext const *acc, IteratorLike ibegin, IteratorLike iend)
Load from a range defined by the begin and end, converts if necessary.
Definition: tsgAcceleratedDataStructures.hpp:161
void unload(AccelerationContext const *acc, size_t num, T *cpu_data) const
Copy the first num entries to the cpu_data buffer, assumes that the buffer is sufficiently large.
GpuVector(AccelerationContext const *acc, int dim1, int dim2)
Same as GpuVector(dim1 * dim2), but guards against overflow.
Definition: tsgAcceleratedDataStructures.hpp:132
GpuVector(AccelerationContext const *acc, size_t count)
Construct a vector with count number of entries.
Definition: tsgAcceleratedDataStructures.hpp:122
GpuVector< T > & operator=(GpuVector< T > const &)=delete
Delete the copy-assignment.
void load(AccelerationContext const *acc, size_t count, const T *cpu_data)
Copy the first count entries of cpu_data to the GPU device.
void unload(AccelerationContext const *acc, T *cpu_data) const
Copy the data from the GPU array to the cpu_data buffer, assumes that the buffer is sufficiently larg...
Definition: tsgAcceleratedDataStructures.hpp:209
T value_type
The data-type of the vector entries.
Definition: tsgAcceleratedDataStructures.hpp:220
void clear()
Delete all allocated memory and reset the array to empty.
size_t size() const
Return the current size of the GPU array.
Definition: tsgAcceleratedDataStructures.hpp:143
void resize(AccelerationContext const *acc, size_t count)
Clear all data currently stored in the vector and allocate a new array (unlike std::vector this does ...
GpuVector(AccelerationContext const *acc, const std::vector< T > &cpu_data)
Create a vector with size that matches cpu_data and copy the data to the GPU device.
Definition: tsgAcceleratedDataStructures.hpp:134
GpuVector(AccelerationContext const *acc, int dim1, int dim2, T const *cpu_data)
Construct a vector and load it with data provided on the CPU.
Definition: tsgAcceleratedDataStructures.hpp:136
GpuVector(GpuVector< T > &&other)
Allow for move-construction.
Definition: tsgAcceleratedDataStructures.hpp:103
void load(AccelerationContext const *acc, const std::vector< T > &cpu_data)
Copy the content of cpu_data to the GPU device, all pre-existing data is deleted and the vector is re...
Definition: tsgAcceleratedDataStructures.hpp:157
TypeOneDRule
Used to specify the one dimensional family of rules that induces the sparse grid.
Definition: tsgEnumerates.hpp:285
constexpr TypeAcceleration accel_gpu_rocblas
At the front API, the HIP and CUDA options are equivalent, see TasGrid::TypeAcceleration.
Definition: tsgEnumerates.hpp:575
constexpr TypeAcceleration accel_gpu_hip
At the front API, the HIP and CUDA options are equivalent, see TasGrid::TypeAcceleration.
Definition: tsgEnumerates.hpp:570
TypeAcceleration
Modes of acceleration.
Definition: tsgEnumerates.hpp:551
@ accel_cpu_blas
Default (if available), uses both BLAS and LAPACK libraries.
Definition: tsgEnumerates.hpp:555
@ accel_none
Usually the slowest mode, uses only OpenMP multi-threading, but optimized for memory and could be the...
Definition: tsgEnumerates.hpp:553
@ accel_gpu_magma
Same as the CUDA option but uses the UTK MAGMA library for the linear algebra operations.
Definition: tsgEnumerates.hpp:563
@ accel_gpu_cublas
Mixed usage of the CPU (OpenMP) and GPU libraries.
Definition: tsgEnumerates.hpp:559
@ accel_gpu_cuda
Similar to the cuBLAS option but also uses a set of Tasmanian custom GPU kernels.
Definition: tsgEnumerates.hpp:561
void devalseq(AccelerationContext const *acc, int dims, int num_x, const std::vector< int > &max_levels, const T *gpu_x, const GpuVector< int > &num_nodes, const GpuVector< int > &points, const GpuVector< T > &nodes, const GpuVector< T > &coeffs, T *gpu_result)
Evaluate the basis for a Sequence grid.
int getIOAccelerationInt(TypeAcceleration accel)
Convert the integer (coming from Fortran) into an enumerated type.
void setDefaultGpuDevice(int deviceID)
Selects the active device for this CPU thread, not supported for DPC++.
std::string getGpuDeviceName(int deviceID)
Returns the name of the selected GPU device, empty string if no device is available or the index is o...
void dtrans2can(AccelerationContext const *acc, bool use01, int dims, int num_x, int pad_size, const double *gpu_trans_a, const double *gpu_trans_b, const T *gpu_x_transformed, T *gpu_x_canonical)
Uses custom kernel to convert transformed points to canonical points, all arrays live on the CUDA dev...
void load_n(AccelerationContext const *acc, T const *cpu_data, size_t num_entries, T *gpu_data)
Similar to copy_n, copies the data from the CPU to the GPU.
void devalfor(AccelerationContext const *acc, int dims, int num_x, const std::vector< int > &max_levels, const T *gpu_x, const GpuVector< int > &num_nodes, const GpuVector< int > &points, T *gpu_wreal, typename GpuVector< T >::value_type *gpu_wimag)
Evaluate the basis for a Fourier grid.
void delGpuArray(AccelerationContext const *, T *x)
Deallocate device array, used primarily for testing, always favor using GpuVector (if possible).
bool isAccTypeGPU(TypeAcceleration accel)
Returns true if accel is cuda, cublas or magma.
TypeAcceleration getIOIntAcceleration(int accel)
Convert the enumerated type to an integer, the inverse of getIOAccelerationInt()
void recvGpuArray(AccelerationContext const *, size_t num_entries, const T *gpu_data, std::vector< T > &cpu_data)
Copy a device array to the main memory, used for testing only, always favor using GpuVector (if possi...
void devalglo(AccelerationContext const *acc, bool is_nested, bool is_clenshawcurtis0, int dims, int num_x, int num_p, int num_basis, T const *gpu_x, GpuVector< T > const &nodes, GpuVector< T > const &coeff, GpuVector< T > const &tensor_weights, GpuVector< int > const &nodes_per_level, GpuVector< int > const &offset_per_level, GpuVector< int > const &map_dimension, GpuVector< int > const &map_level, GpuVector< int > const &active_tensors, GpuVector< int > const &active_num_points, GpuVector< int > const &dim_offsets, GpuVector< int > const &map_tensor, GpuVector< int > const &map_index, GpuVector< int > const &map_reference, T *gpu_result)
Evaluate the basis for Global grid.
TypeAcceleration getIOAccelerationString(const char *name)
Convert the string (coming from C or Python) into an enumerated type.
void * createCublasHandle()
Creates a new cuBlas handle, used in unit-testing only.
void fillDataGPU(AccelerationContext const *acc, double value, long long N, long long stride, double data[])
Fills the data with the provided real number at the given stride.
void deleteCublasHandle(void *)
Destroys the cuBlas handle, used in unit-testing only.
int getNumGpuDevices()
Return the number of visible GPU devices.
void devalpwpoly_sparse(AccelerationContext const *acc, int order, TypeOneDRule rule, int dims, int num_x, const T *gpu_x, const GpuVector< T > &gpu_nodes, const GpuVector< T > &gpu_support, const GpuVector< int > &gpu_hpntr, const GpuVector< int > &gpu_hindx, const GpuVector< int > &gpu_hroots, GpuVector< int > &gpu_spntr, GpuVector< int > &gpu_sindx, GpuVector< T > &gpu_svals)
Evaluate the basis functions for a local polynomial grid using the SPARSE algorithm.
TypeAcceleration getAvailableFallback(TypeAcceleration accel)
Implements fallback logic, if accel has been enabled through CMake then this returns accel,...
void devalpwpoly(AccelerationContext const *acc, int order, TypeOneDRule rule, int num_dimensions, int num_x, int num_basis, const T *gpu_x, const T *gpu_nodes, const T *gpu_support, T *gpu_y)
Evaluate the basis functions for a local polynomial grid using the DENSE algorithm.
unsigned long long getTotalGPUMemory(int deviceID)
Return the memory available in the device (in units of bytes).
size_t size_mult(IntA a, IntB b)
Converts two integer-like variables to size_t and returns the product.
Definition: tsgUtils.hpp:82
T exchange(T &x, U new_x)
Equivalent to C++14 exchange, but works with simpler types (int, double, float*).
Definition: tsgUtils.hpp:153
typename std::enable_if< condition, void >::type use_if
Equivalent to C++14 enable_if_t<condition, void>
Definition: tsgUtils.hpp:147
Encapsulates the Tasmanian Sparse Grid module.
Definition: TasmanianSparseGrid.hpp:68
Wrapper class around GPU device ID, acceleration type and GpuEngine.
Definition: tsgAcceleratedDataStructures.hpp:576
ChangeType testEnable(TypeAcceleration acc, int new_gpu_id) const
Returns the ChangeType if enable() is called, but does not change the acceleration.
Definition: tsgAcceleratedDataStructures.hpp:672
AlgorithmPreference algorithm_select
The preference to use dense or sparse algorithms.
Definition: tsgAcceleratedDataStructures.hpp:609
void enable(TypeAcceleration acc, int new_gpu_id)
Accepts parameters directly from TasmanianSparseGrid::enableAcceleration()
Definition: tsgAcceleratedDataStructures.hpp:692
static constexpr int getDefaultAccDevice()
Returns the default acceleration device, CUDA/HIP use GPU 0, SYCL uses -1 which uses sycl::default_se...
Definition: tsgAcceleratedDataStructures.hpp:625
ChangeType favorSparse(bool favor)
Sets algorithm affinity in the direction of sparse.
Definition: tsgAcceleratedDataStructures.hpp:637
TypeAcceleration mode
The current active acceleration mode.
Definition: tsgAcceleratedDataStructures.hpp:607
bool on_gpu() const
Returns true if any of the GPU-based acceleration modes have been enabled.
Definition: tsgAcceleratedDataStructures.hpp:721
std::unique_ptr< GpuEngine > engine
Holds the context to the GPU TPL handles, e.g., MAGMA queue.
Definition: tsgAcceleratedDataStructures.hpp:614
AlgorithmPreference
Defines the sparse-dense algorithm flavors, whenever applicable.
Definition: tsgAcceleratedDataStructures.hpp:578
@ algorithm_dense
Use dense algorithm.
Definition: tsgAcceleratedDataStructures.hpp:580
@ algorithm_sparse
Use sparse algorithm.
Definition: tsgAcceleratedDataStructures.hpp:582
@ algorithm_autoselect
Automatically select the algorithm based on heuristics.
Definition: tsgAcceleratedDataStructures.hpp:584
bool blasCompatible() const
Returns true if BLAS is enabled and the current mode is not none.
Definition: tsgAcceleratedDataStructures.hpp:654
static constexpr TypeAcceleration getDefaultAccMode()
Returns the default acceleration mode, cpu_blas if BLAS is enabled and none otherwise.
Definition: tsgAcceleratedDataStructures.hpp:617
int device
If using a GPU acceleration mode, holds the active device.
Definition: tsgAcceleratedDataStructures.hpp:611
void setDevice() const
Set default device.
Definition: tsgAcceleratedDataStructures.hpp:717
ChangeType
Defines the types of acceleration context updates so they can be linked to acceleration cache updates...
Definition: tsgAcceleratedDataStructures.hpp:593
@ change_none
No change, do nothing.
Definition: tsgAcceleratedDataStructures.hpp:595
@ change_sparse_dense
Change the sparse-dense AlgorithmPreference.
Definition: tsgAcceleratedDataStructures.hpp:603
@ change_gpu_device
Change the associated GPU device.
Definition: tsgAcceleratedDataStructures.hpp:597
@ change_cpu_blas
Change BLAS to none or none to BLAS.
Definition: tsgAcceleratedDataStructures.hpp:601
@ change_gpu_enabled
Change from BLAS or none to a GPU acceleration mode.
Definition: tsgAcceleratedDataStructures.hpp:599
AccelerationContext()
Creates a default context, the device id is set to 0 and acceleration is BLAS (if available) or none.
Definition: tsgAcceleratedDataStructures.hpp:634
bool useKernels() const
Returns true if the current mode implies the use of custom GPU kernels.
Definition: tsgAcceleratedDataStructures.hpp:663
Wrapper class around calls GPU accelerated linear algebra libraries.
Definition: tsgAcceleratedDataStructures.hpp:236
std::unique_ptr< int > called_magma_init
Avoids an empty engine when no acceleration is enabled, allows for default constructor/move/copy,...
Definition: tsgAcceleratedDataStructures.hpp:273
void setCuSolverDnHandle(void *handle)
Manually sets the cuSparse handle, handle must be a valid cusolverDnHandle_t associated with this CUD...
std::unique_ptr< int, HandleDeleter< AccHandle::Cusolver > > cusolver_handle
Holds the cuSolver handle.
Definition: tsgAcceleratedDataStructures.hpp:250
void setCuBlasHandle(void *handle)
Manually sets the cuBlas handle, handle must be a valid cublasHandle_t associated with this CUDA devi...
void setCuSparseHandle(void *handle)
Manually sets the cuSparse handle, handle must be a valid cusparseHandle_t associated with this CUDA ...
std::unique_ptr< int, HandleDeleter< AccHandle::Cublas > > cublas_handle
Holds the cuBlas handle.
Definition: tsgAcceleratedDataStructures.hpp:246
std::unique_ptr< int, HandleDeleter< AccHandle::Cusparse > > cusparse_handle
Holds the cuSparse handle.
Definition: tsgAcceleratedDataStructures.hpp:248