Toolkit for Adaptive Stochastic Modeling and Non-Intrusive ApproximatioN: Tasmanian v8.2
tsgAcceleratedDataStructures.hpp
1/*
2 * Copyright (c) 2017, Miroslav Stoyanov
3 *
4 * This file is part of
5 * Toolkit for Adaptive Stochastic Modeling And Non-Intrusive ApproximatioN: TASMANIAN
6 *
7 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions
12 * and the following disclaimer in the documentation and/or other materials provided with the distribution.
13 *
14 * 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse
15 * or promote products derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
18 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
20 * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
21 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
22 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 *
24 * UT-BATTELLE, LLC AND THE UNITED STATES GOVERNMENT MAKE NO REPRESENTATIONS AND DISCLAIM ALL WARRANTIES, BOTH EXPRESSED AND IMPLIED.
25 * THERE ARE NO EXPRESS OR IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF THE SOFTWARE WILL NOT INFRINGE ANY PATENT,
26 * COPYRIGHT, TRADEMARK, OR OTHER PROPRIETARY RIGHTS, OR THAT THE SOFTWARE WILL ACCOMPLISH THE INTENDED RESULTS OR THAT THE SOFTWARE OR ITS USE WILL NOT RESULT IN INJURY OR DAMAGE.
27 * THE USER ASSUMES RESPONSIBILITY FOR ALL LIABILITIES, PENALTIES, FINES, CLAIMS, CAUSES OF ACTION, AND COSTS AND EXPENSES, CAUSED BY, RESULTING FROM OR ARISING OUT OF,
28 * IN WHOLE OR IN PART THE USE, STORAGE OR DISPOSAL OF THE SOFTWARE.
29 */
30
31#ifndef __TASMANIAN_SPARSE_GRID_ACCELERATED_DATA_STRUCTURES_HPP
32#define __TASMANIAN_SPARSE_GRID_ACCELERATED_DATA_STRUCTURES_HPP
33
34#include "tsgAcceleratedHandles.hpp"
35
45
79namespace TasGrid{
80
81struct AccelerationContext; // forward declaration, CUDA and HIP GpuVector do not use the context, but DPC++ needs it
82
94template<typename T>
95class GpuVector{
96public:
98 GpuVector(GpuVector<T> const &) = delete;
100 GpuVector<T>& operator=(GpuVector<T> const &) = delete;
101
103 GpuVector(GpuVector<T> &&other) : num_entries(Utils::exchange(other.num_entries, 0)), gpu_data(Utils::exchange(other.gpu_data, nullptr))
104 #ifdef Tasmanian_ENABLE_DPCPP
105 , sycl_queue(other.sycl_queue)
106 #endif
107 {}
109 GpuVector<T>& operator=(GpuVector<T> &&other){
110 GpuVector<T> temp(std::move(other));
111 std::swap(num_entries, temp.num_entries);
112 std::swap(gpu_data, temp.gpu_data);
113 #ifdef Tasmanian_ENABLE_DPCPP
114 std::swap(sycl_queue, temp.sycl_queue);
115 #endif
116 return *this;
117 }
118
120 GpuVector() : num_entries(0), gpu_data(nullptr){}
122 GpuVector(AccelerationContext const *acc, size_t count) : num_entries(0), gpu_data(nullptr){ resize(acc, count); }
123
132 GpuVector(AccelerationContext const *acc, int dim1, int dim2) : num_entries(0), gpu_data(nullptr){ resize(acc, Utils::size_mult(dim1, dim2)); }
134 GpuVector(AccelerationContext const *acc, const std::vector<T> &cpu_data) : num_entries(0), gpu_data(nullptr){ load(acc, cpu_data); }
136 GpuVector(AccelerationContext const *acc, int dim1, int dim2, T const *cpu_data) : num_entries(0), gpu_data(nullptr){ load(acc, Utils::size_mult(dim1, dim2), cpu_data); }
138 template<typename IteratorLike> GpuVector(AccelerationContext const *acc, IteratorLike ibegin, IteratorLike iend) : GpuVector(){ load(acc, ibegin, iend); }
140 ~GpuVector(){ clear(); }
141
143 size_t size() const{ return num_entries; }
145 T* data(){ return gpu_data; }
147 const T* data() const{ return gpu_data; }
148
150 void resize(AccelerationContext const *acc, size_t count);
152 void clear();
154 bool empty() const{ return (num_entries == 0); }
155
157 void load(AccelerationContext const *acc, const std::vector<T> &cpu_data){ load(acc, cpu_data.size(), cpu_data.data()); }
158
160 template<typename IteratorLike>
161 void load(AccelerationContext const *acc, IteratorLike ibegin, IteratorLike iend){
162 load(acc, std::distance(ibegin, iend), &*ibegin);
163 }
164
170 template<typename U>
171 Utils::use_if<!std::is_same<U, T>::value> load(AccelerationContext const *acc, const std::vector<U> &cpu_data){
172 load(acc, cpu_data.size(), cpu_data.data());
173 }
174
183 void load(AccelerationContext const *acc, size_t count, const T* cpu_data);
189 template<typename U>
190 Utils::use_if<!std::is_same<U, T>::value> load(AccelerationContext const *acc, size_t count, const U* cpu_data){
191 std::vector<T> converted(count);
192 std::transform(cpu_data, cpu_data + count, converted.begin(), [](U const &x)->T{ return static_cast<T>(x); });
193 load(acc, converted);
194 }
196 void unload(AccelerationContext const *acc, std::vector<T> &cpu_data) const{
197 cpu_data.resize(num_entries);
198 unload(acc, cpu_data.data());
199 }
201 std::vector<T> unload(AccelerationContext const *acc) const{
202 std::vector<T> y;
203 unload(acc, y);
204 return y;
205 }
207 void unload(AccelerationContext const *acc, size_t num, T* cpu_data) const;
209 void unload(AccelerationContext const *acc, T* cpu_data) const{ unload(acc, num_entries, cpu_data); }
210
212 T* eject(){
213 T* external = gpu_data;
214 gpu_data = nullptr;
215 num_entries = 0;
216 return external;
217 }
218
220 using value_type = T;
221
222private:
223 size_t num_entries; // keep track of the size, update on every call that changes the gpu_data
224 T *gpu_data; // the GPU array
225 #ifdef Tasmanian_ENABLE_DPCPP
226 void* sycl_queue;
227 #endif
228};
229
236struct GpuEngine{
237 #ifdef Tasmanian_ENABLE_CUDA
239 void setCuBlasHandle(void *handle);
241 void setCuSparseHandle(void *handle);
243 void setCuSolverDnHandle(void *handle);
244
246 std::unique_ptr<int, HandleDeleter<AccHandle::Cublas>> cublas_handle;
248 std::unique_ptr<int, HandleDeleter<AccHandle::Cusparse>> cusparse_handle;
250 std::unique_ptr<int, HandleDeleter<AccHandle::Cusolver>> cusolver_handle;
251 #endif
252
253 #ifdef Tasmanian_ENABLE_HIP
255 void setRocBlasHandle(void *handle);
257 void setRocSparseHandle(void *handle);
258
260 std::unique_ptr<int, HandleDeleter<AccHandle::Rocblas>> rblas_handle;
262 std::unique_ptr<int, HandleDeleter<AccHandle::Rocsparse>> rsparse_handle;
263 #endif
264
265 #ifdef Tasmanian_ENABLE_DPCPP
267 void setSyclQueue(void *queue);
269 std::unique_ptr<int, HandleDeleter<AccHandle::Syclqueue>> internal_queue;
270 #endif
271
273 std::unique_ptr<int> called_magma_init;
274};
275
279
284class AccelerationDomainTransform{
285public:
287 AccelerationDomainTransform(AccelerationContext const *, std::vector<double> const &transform_a, std::vector<double> const &transform_b);
288
296 template<typename T>
297 void getCanonicalPoints(bool use01, T const gpu_transformed_x[], int num_x, GpuVector<T> &gpu_canonical_x);
298
299private:
300 // these actually store the rate and shift and not the hard upper/lower limits
301 GpuVector<double> gpu_trans_a, gpu_trans_b;
302 int num_dimensions, padded_size;
303 AccelerationContext const *acceleration;
304};
305
309namespace TasGpu{
313
326 template<typename T>
327 void dtrans2can(AccelerationContext const *acc, bool use01, int dims, int num_x, int pad_size,
328 const double *gpu_trans_a, const double *gpu_trans_b,
329 const T *gpu_x_transformed, T *gpu_x_canonical);
330
334
343 template<typename T>
344 void devalpwpoly(AccelerationContext const *acc, int order, TypeOneDRule rule, int num_dimensions, int num_x, int num_basis, const T *gpu_x, const T *gpu_nodes, const T *gpu_support, T *gpu_y);
345
349
357 template<typename T>
358 void devalpwpoly_sparse(AccelerationContext const *acc, int order, TypeOneDRule rule, int dims, int num_x, const T *gpu_x,
359 const GpuVector<T> &gpu_nodes, const GpuVector<T> &gpu_support,
360 const GpuVector<int> &gpu_hpntr, const GpuVector<int> &gpu_hindx, const GpuVector<int> &gpu_hroots,
361 GpuVector<int> &gpu_spntr, GpuVector<int> &gpu_sindx, GpuVector<T> &gpu_svals);
362
366
375 template<typename T>
376 void devalseq(AccelerationContext const *acc, int dims, int num_x, const std::vector<int> &max_levels, const T *gpu_x,
377 const GpuVector<int> &num_nodes,
378 const GpuVector<int> &points, const GpuVector<T> &nodes, const GpuVector<T> &coeffs, T *gpu_result);
379
383
386 template<typename T>
387 void devalfor(AccelerationContext const *acc, int dims, int num_x, const std::vector<int> &max_levels, const T *gpu_x, const GpuVector<int> &num_nodes, const GpuVector<int> &points, T *gpu_wreal, typename GpuVector<T>::value_type *gpu_wimag);
388
397 template<typename T>
398 void devalglo(AccelerationContext const *acc, bool is_nested, bool is_clenshawcurtis0, int dims, int num_x, int num_p, int num_basis,
399 T const *gpu_x, GpuVector<T> const &nodes, GpuVector<T> const &coeff, GpuVector<T> const &tensor_weights,
400 GpuVector<int> const &nodes_per_level, GpuVector<int> const &offset_per_level, GpuVector<int> const &map_dimension, GpuVector<int> const &map_level,
401 GpuVector<int> const &active_tensors, GpuVector<int> const &active_num_points, GpuVector<int> const &dim_offsets,
402 GpuVector<int> const &map_tensor, GpuVector<int> const &map_index, GpuVector<int> const &map_reference, T *gpu_result);
403
404
409 void fillDataGPU(AccelerationContext const *acc, double value, long long N, long long stride, double data[]);
410
415 template<typename T> void load_n(AccelerationContext const *acc, T const *cpu_data, size_t num_entries, T *gpu_data);
416
421 template<typename T, typename U>
422 Utils::use_if<!std::is_same<U, T>::value> load_n(AccelerationContext const *acc, U const *cpu_data, size_t num_entries, T *gpu_data){
423 std::vector<T> converted(num_entries);
424 std::transform(cpu_data, cpu_data + num_entries, converted.begin(), [](U const &x)->T{ return static_cast<T>(x); });
425 load_n(acc, converted.data(), num_entries, gpu_data);
426 }
427
428 // #define __TASMANIAN_COMPILE_FALLBACK_CUDA_KERNELS__ // uncomment to compile a bunch of custom CUDA kernels that provide some functionality similar to cuBlas
429 #ifdef __TASMANIAN_COMPILE_FALLBACK_CUDA_KERNELS__
430 // CUDA kernels that provide essentially the same functionality as cuBlas and MAGMA, but nowhere near as optimal
431 // these functions should not be used in Release or production builds
432 // the kernels are useful because they are simple and do not depend on potentially poorly documented 3rd party libraries
433 // since the kernels are useful for testing and some debugging, the code should not be deleted (for now), but there is no need to compile it in most cases
434
435 void cudaDgemm(int M, int N, int K, const double *gpu_a, const double *gpu_b, double *gpu_c);
436 // lazy cuda dgemm, nowhere near as powerful as cuBlas, but does not depend on cuBlas
437 // gpu_a is M by K, gpu_b is K by N, gpu_c is M by N, all in column-major format
438 // on exit gpu_c = gpu_a * gpu_b
439
440 void cudaSparseMatmul(int M, int N, int num_nz, const int* gpu_spntr, const int* gpu_sindx, const double* gpu_svals, const double *gpu_B, double *gpu_C);
441 // lazy cuda sparse dgemm, less efficient (especially for large N), but more memory conservative than cuSparse since there is no need for a transpose
442 // C is M x N, B is K x N (K is max(gpu_sindx)), both are given in row-major format, num_nz/spntr/sindx/svals describe row compressed A which is M by K
443 // on exit C = A * B
444
445 void cudaSparseVecDenseMat(int M, int N, int num_nz, const double *A, const int *indx, const double *vals, double *C);
446 // dense matrix A (column major) times a sparse vector defined by num_nz, indx, and vals
447 // A is M by N, C is M by 1,
448 // on exit C = A * (indx, vals)
449
450 void convert_sparse_to_dense(int num_rows, int num_columns, const int *gpu_pntr, const int *gpu_indx, const double *gpu_vals, double *gpu_destination);
451 // converts a sparse matrix to a dense representation (all data sits on the gpu and is pre-allocated)
452 #endif
453}
454
458namespace AccelerationMeta{
462 TypeAcceleration getIOAccelerationString(const char * name);
464 std::map<std::string, TypeAcceleration> getStringToAccelerationMap();
468 const char* getIOAccelerationString(TypeAcceleration accel);
472 int getIOAccelerationInt(TypeAcceleration accel);
476 TypeAcceleration getIOIntAcceleration(int accel);
480 bool isAccTypeGPU(TypeAcceleration accel);
481
483 inline bool isAvailable(TypeAcceleration accel){
484 switch(accel){
485 #ifdef Tasmanian_ENABLE_MAGMA
486 case accel_gpu_magma: return true;
487 #endif
488 #ifdef Tasmanian_ENABLE_CUDA
489 case accel_gpu_cuda: return true;
490 case accel_gpu_cublas: return true;
491 #endif
492 #ifdef Tasmanian_ENABLE_HIP
493 case accel_gpu_hip: return true;
494 case accel_gpu_rocblas: return true;
495 #endif
496 #ifdef Tasmanian_ENABLE_DPCPP
497 case accel_gpu_cuda: return true;
498 case accel_gpu_cublas: return true;
499 #endif
500 #ifdef Tasmanian_ENABLE_BLAS
501 case accel_cpu_blas: return true;
502 #endif
503 case accel_none:
504 return true;
505 default:
506 return false;
507 }
508 }
509
513
516 TypeAcceleration getAvailableFallback(TypeAcceleration accel);
517
521 int getNumGpuDevices();
522
526
528 void setDefaultGpuDevice(int deviceID);
529
533
535 unsigned long long getTotalGPUMemory(int deviceID);
536
540
542 std::string getGpuDeviceName(int deviceID);
543
547 template<typename T> void recvGpuArray(AccelerationContext const*, size_t num_entries, const T *gpu_data, std::vector<T> &cpu_data);
548
552 template<typename T> void delGpuArray(AccelerationContext const*, T *x);
553
558 void *createCublasHandle();
563 void deleteCublasHandle(void *);
564}
565
576struct AccelerationContext{
578 enum AlgorithmPreference{
580 algorithm_dense,
582 algorithm_sparse,
584 algorithm_autoselect
585 };
586
593 enum ChangeType{
595 change_none,
597 change_gpu_device,
599 change_gpu_enabled,
601 change_cpu_blas,
603 change_sparse_dense
604 };
605
607 TypeAcceleration mode;
609 AlgorithmPreference algorithm_select;
611 int device;
612
614 mutable std::unique_ptr<GpuEngine> engine;
615
617 inline static constexpr TypeAcceleration getDefaultAccMode() {
618 #ifdef Tasmanian_ENABLE_BLAS
619 return accel_cpu_blas;
620 #else
621 return accel_none;
622 #endif
623 }
625 inline static constexpr int getDefaultAccDevice() {
626 #ifdef Tasmanian_ENABLE_DPCPP
627 return -1;
628 #else
629 return 0;
630 #endif
631 }
632
635
637 ChangeType favorSparse(bool favor){
638 #if __cplusplus > 201103L
639 AlgorithmPreference new_preference = [as=algorithm_select, favor]()->AlgorithmPreference{
640 if (favor){
641 return (as == algorithm_dense) ? algorithm_autoselect : algorithm_sparse;
642 }else{
643 return (as == algorithm_sparse) ? algorithm_autoselect : algorithm_dense;
644 }
645 }();
646 #else
647 AlgorithmPreference new_preference = [&]()->AlgorithmPreference{
648 if (favor){
649 return (algorithm_select == algorithm_dense) ? algorithm_autoselect : algorithm_sparse;
650 }else{
651 return (algorithm_select == algorithm_sparse) ? algorithm_autoselect : algorithm_dense;
652 }
653 }();
654 #endif
655 if (new_preference != algorithm_select){
656 algorithm_select = new_preference;
657 return change_sparse_dense;
658 }else{
659 return change_none;
660 }
661 }
662
664 bool blasCompatible() const{
665 #ifdef Tasmanian_ENABLE_BLAS
666 return (mode != accel_none);
667 #else
668 return false;
669 #endif
670 }
671
673 bool useKernels() const{
674 #if defined(Tasmanian_ENABLE_CUDA) || defined(Tasmanian_ENABLE_HIP)
675 return ((mode == accel_gpu_cuda) or (mode == accel_gpu_magma));
676 #else
677 return false;
678 #endif
679 }
680
682 ChangeType testEnable(TypeAcceleration acc, int new_gpu_id) const{
683 TypeAcceleration effective_acc = AccelerationMeta::getAvailableFallback(acc);
684 #ifdef Tasmanian_ENABLE_DPCPP
685 if (AccelerationMeta::isAccTypeGPU(effective_acc) and ((new_gpu_id < -1 or new_gpu_id >= AccelerationMeta::getNumGpuDevices())))
686 throw std::runtime_error("Invalid GPU device ID, see ./tasgrid -v for list of detected devices.");
687 #else
688 if (AccelerationMeta::isAccTypeGPU(effective_acc) and ((new_gpu_id < 0 or new_gpu_id >= AccelerationMeta::getNumGpuDevices())))
689 throw std::runtime_error("Invalid GPU device ID, see ./tasgrid -v for list of detected devices.");
690 #endif
691 ChangeType mode_change = (effective_acc == mode) ? change_none : change_cpu_blas;
692 ChangeType device_change = (device == new_gpu_id) ? change_none : change_gpu_device;
693
694 if (on_gpu()){
695 return (AccelerationMeta::isAccTypeGPU(effective_acc)) ? device_change : change_gpu_device;
696 }else{
697 return (AccelerationMeta::isAccTypeGPU(effective_acc)) ? change_gpu_enabled : mode_change;
698 }
699 }
700
702 void enable(TypeAcceleration acc, int new_gpu_id){
703 // get the effective new acceleration mode (use the fallback if acc is not enabled)
704 TypeAcceleration effective_acc = AccelerationMeta::getAvailableFallback(acc);
705 // if switching to a GPU mode, check if the device id is valid
706 #ifdef Tasmanian_ENABLE_DPCPP
707 if (AccelerationMeta::isAccTypeGPU(effective_acc) and ((new_gpu_id < -1 or new_gpu_id >= AccelerationMeta::getNumGpuDevices())))
708 throw std::runtime_error("Invalid GPU device ID, see ./tasgrid -v for list of detected devices.");
709 #else
710 if (AccelerationMeta::isAccTypeGPU(effective_acc) and ((new_gpu_id < 0 or new_gpu_id >= AccelerationMeta::getNumGpuDevices())))
711 throw std::runtime_error("Invalid GPU device ID, see ./tasgrid -v for list of detected devices.");
712 #endif
713 if (AccelerationMeta::isAccTypeGPU(effective_acc)){
714 // if the new mode is GPU-based, make an engine or reset the engine if the device has changed
715 // if the engine exists and the device is not changed, then keep the existing engine
716 if (!engine or new_gpu_id != device)
717 engine = Utils::make_unique<GpuEngine>();
718 }else{
719 engine.reset();
720 }
721
722 // assign the new values for the mode and device
723 mode = effective_acc;
724 device = new_gpu_id;
725 }
727 void setDevice() const{ AccelerationMeta::setDefaultGpuDevice(device); }
729 operator GpuEngine* () const{ return engine.get(); }
731 bool on_gpu() const{ return !!engine; }
732};
733
734#ifdef Tasmanian_ENABLE_DPCPP
748struct InternalSyclQueue{
750 InternalSyclQueue() : use_testing(false){}
752 void init_testing(int gpuid);
754 operator std::unique_ptr<int, HandleDeleter<AccHandle::Syclqueue>> (){
755 return std::unique_ptr<int, HandleDeleter<AccHandle::Syclqueue>>(test_queue.get(),
756 HandleDeleter<AccHandle::Syclqueue>(false));
757 }
759 bool use_testing;
761 std::unique_ptr<int, HandleDeleter<AccHandle::Syclqueue>> test_queue;
762};
770extern InternalSyclQueue test_queue;
771#endif
772
773}
774
775#endif // __TASMANIAN_SPARSE_GRID_ACCELERATED_DATA_STRUCTURES_HPP
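The GpuVector class declared above mimics std::vector for a single GPU array. A minimal usage sketch, assuming a build with a GPU backend enabled and an AccelerationContext pointer obtained from the owning grid; the function round_trip and its argument names are illustrative, not part of the library:

#include <vector>
#include "tsgAcceleratedDataStructures.hpp"

// Illustrative sketch: round-trip a CPU vector through GPU memory using the interface above.
std::vector<double> round_trip(TasGrid::AccelerationContext const *acc, std::vector<double> const &cpu_values){
    TasGrid::GpuVector<double> gpu_values(acc, cpu_values); // allocate on the device and copy the CPU data
    // gpu_values.data() can be passed to GPU kernels and library calls
    return gpu_values.unload(acc);                          // copy back into a freshly allocated std::vector
}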
Implements the domain transform algorithms in case the user data is provided on the GPU.
Definition tsgAcceleratedDataStructures.hpp:284
AccelerationDomainTransform(AccelerationContext const *, std::vector< double > const &transform_a, std::vector< double > const &transform_b)
Constructor, load the transform data to the GPU, the vectors are the same as used in the TasmanianSpa...
void getCanonicalPoints(bool use01, T const gpu_transformed_x[], int num_x, GpuVector< T > &gpu_canonical_x)
Transform a set of points, used in the calls to evaluateHierarchicalFunctionsGPU(). Takes the user pro...
Template class that wraps around a single GPU array, providing functionality that mimics std::vector.
Definition tsgAcceleratedDataStructures.hpp:95
~GpuVector()
Destructor, release all allocated memory.
Definition tsgAcceleratedDataStructures.hpp:140
bool empty() const
Return true if the size() is zero.
Definition tsgAcceleratedDataStructures.hpp:154
void unload(AccelerationContext const *acc, std::vector< T > &cpu_data) const
Copy the data from the GPU array to cpu_data, the cpu_data will be resized and overwritten.
Definition tsgAcceleratedDataStructures.hpp:196
GpuVector(AccelerationContext const *acc, IteratorLike ibegin, IteratorLike iend)
Construct a vector by loading from a given range.
Definition tsgAcceleratedDataStructures.hpp:138
GpuVector(GpuVector< T > const &)=delete
Delete the copy-constructor.
Utils::use_if<!std::is_same< U, T >::value > load(AccelerationContext const *acc, size_t count, const U *cpu_data)
Takes a vector with entries of different precision, converts and loads.
Definition tsgAcceleratedDataStructures.hpp:190
std::vector< T > unload(AccelerationContext const *acc) const
Return a CPU vector holding the data of the GPU.
Definition tsgAcceleratedDataStructures.hpp:201
T * eject()
Move the data to the external array, the vector is set to empty (unlike move command on std::vector).
Definition tsgAcceleratedDataStructures.hpp:212
GpuVector()
Default constructor, creates an empty (null) array.
Definition tsgAcceleratedDataStructures.hpp:120
void load(AccelerationContext const *acc, IteratorLike ibegin, IteratorLike iend)
Load from a range defined by the begin and end, converts if necessary.
Definition tsgAcceleratedDataStructures.hpp:161
void unload(AccelerationContext const *acc, size_t num, T *cpu_data) const
Copy the first num entries to the cpu_data buffer, assumes that the buffer is sufficiently large.
GpuVector(AccelerationContext const *acc, int dim1, int dim2)
Same as GpuVector(dim1 * dim2), but guards against overflow.
Definition tsgAcceleratedDataStructures.hpp:132
GpuVector(AccelerationContext const *acc, size_t count)
Construct a vector with count number of entries.
Definition tsgAcceleratedDataStructures.hpp:122
void load(AccelerationContext const *acc, size_t count, const T *cpu_data)
Copy the first count entries of cpu_data to the GPU device.
void unload(AccelerationContext const *acc, T *cpu_data) const
Copy the data from the GPU array to the cpu_data buffer, assumes that the buffer is sufficiently larg...
Definition tsgAcceleratedDataStructures.hpp:209
T value_type
The data-type of the vector entries.
Definition tsgAcceleratedDataStructures.hpp:220
void clear()
Delete all allocated memory and reset the array to empty.
GpuVector< T > & operator=(GpuVector< T > const &)=delete
Delete the copy-assignment.
size_t size() const
Return the current size of the GPU array.
Definition tsgAcceleratedDataStructures.hpp:143
const T * data() const
Get a const-reference to the GPU array, which can be used as input to GPU libraries and kernels.
Definition tsgAcceleratedDataStructures.hpp:147
void resize(AccelerationContext const *acc, size_t count)
Clear all data currently stored in the vector and allocate a new array (unlike std::vector this does ...
GpuVector(AccelerationContext const *acc, const std::vector< T > &cpu_data)
Create a vector with size that matches cpu_data and copy the data to the GPU device.
Definition tsgAcceleratedDataStructures.hpp:134
GpuVector(AccelerationContext const *acc, int dim1, int dim2, T const *cpu_data)
Construct a vector and load it with data provided on the CPU.
Definition tsgAcceleratedDataStructures.hpp:136
GpuVector(GpuVector< T > &&other)
Allow for move-construction.
Definition tsgAcceleratedDataStructures.hpp:103
T * data()
Get a reference to the GPU array, which can be used as input to GPU libraries and kernels.
Definition tsgAcceleratedDataStructures.hpp:145
Utils::use_if<!std::is_same< U, T >::value > load(AccelerationContext const *acc, const std::vector< U > &cpu_data)
Takes a vector with entries of different precision, converts and loads.
Definition tsgAcceleratedDataStructures.hpp:171
void load(AccelerationContext const *acc, const std::vector< T > &cpu_data)
Copy the content of cpu_data to the GPU device, all pre-existing data is deleted and the vector is re...
Definition tsgAcceleratedDataStructures.hpp:157
TypeOneDRule
Used to specify the one dimensional family of rules that induces the sparse grid.
Definition tsgEnumerates.hpp:285
constexpr TypeAcceleration accel_gpu_rocblas
At the front API, the HIP and CUDA options are equivalent, see TasGrid::TypeAcceleration.
Definition tsgEnumerates.hpp:575
constexpr TypeAcceleration accel_gpu_hip
At the front API, the HIP and CUDA options are equivalent, see TasGrid::TypeAcceleration.
Definition tsgEnumerates.hpp:570
TypeAcceleration
Modes of acceleration.
Definition tsgEnumerates.hpp:551
@ accel_cpu_blas
Default (if available), uses both BLAS and LAPACK libraries.
Definition tsgEnumerates.hpp:555
@ accel_none
Usually the slowest mode, uses only OpenMP multi-threading, but optimized for memory and could be the...
Definition tsgEnumerates.hpp:553
@ accel_gpu_magma
Same as the CUDA option but uses the UTK MAGMA library for the linear algebra operations.
Definition tsgEnumerates.hpp:563
@ accel_gpu_cublas
Mixed usage of the CPU (OpenMP) and GPU libraries.
Definition tsgEnumerates.hpp:559
@ accel_gpu_cuda
Similar to the cuBLAS option but also uses a set of Tasmanian custom GPU kernels.
Definition tsgEnumerates.hpp:561
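The AccelerationMeta declarations in the header allow a caller to check which of these modes were compiled in before requesting one. A small sketch; resolve_mode is an illustrative helper, not part of the library:

#include "tsgAcceleratedDataStructures.hpp"

// Illustrative sketch: map a requested acceleration mode to one that is actually available.
TasGrid::TypeAcceleration resolve_mode(TasGrid::TypeAcceleration requested){
    if (TasGrid::AccelerationMeta::isAvailable(requested))
        return requested;                                              // the requested mode is compiled in
    return TasGrid::AccelerationMeta::getAvailableFallback(requested); // otherwise use an enabled substitute
}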
size_t size_mult(IntA a, IntB b)
Converts two integer-like variables to size_t and returns the product.
Definition tsgUtils.hpp:82
typename std::enable_if< condition, void >::type use_if
Equivalent to C++14 enable_if_t<condition, void>
Definition tsgUtils.hpp:147
Encapsulates the Tasmanian Sparse Grid module.
Definition TasmanianSparseGrid.hpp:68
Wrapper class around GPU device ID, acceleration type and GpuEngine.
Definition tsgAcceleratedDataStructures.hpp:576
ChangeType testEnable(TypeAcceleration acc, int new_gpu_id) const
Returns the ChangeType if enable() is called, but does not change the acceleration.
Definition tsgAcceleratedDataStructures.hpp:682
AlgorithmPreference algorithm_select
The preference to use dense or sparse algorithms.
Definition tsgAcceleratedDataStructures.hpp:609
void enable(TypeAcceleration acc, int new_gpu_id)
Accepts parameters directly from TasmanianSparseGrid::enableAcceleration()
Definition tsgAcceleratedDataStructures.hpp:702
static constexpr int getDefaultAccDevice()
Returns the default acceleration device, CUDA/HIP use GPU 0, SYCL uses -1 which uses sycl::default_se...
Definition tsgAcceleratedDataStructures.hpp:625
ChangeType favorSparse(bool favor)
Sets algorithm affinity in the direction of sparse.
Definition tsgAcceleratedDataStructures.hpp:637
TypeAcceleration mode
The current active acceleration mode.
Definition tsgAcceleratedDataStructures.hpp:607
bool on_gpu() const
Returns true if any of the GPU-based acceleration modes have been enabled.
Definition tsgAcceleratedDataStructures.hpp:731
std::unique_ptr< GpuEngine > engine
Holds the context to the GPU TPL handles, e.g., MAGMA queue.
Definition tsgAcceleratedDataStructures.hpp:614
AlgorithmPreference
Defines the sparse-dense algorithm flavors, whenever applicable.
Definition tsgAcceleratedDataStructures.hpp:578
@ algorithm_dense
Use dense algorithm.
Definition tsgAcceleratedDataStructures.hpp:580
@ algorithm_sparse
Use sparse algorithm.
Definition tsgAcceleratedDataStructures.hpp:582
@ algorithm_autoselect
Automatically select based on heuristics.
Definition tsgAcceleratedDataStructures.hpp:584
bool blasCompatible() const
Returns true if BLAS is enabled and the current mode is not none.
Definition tsgAcceleratedDataStructures.hpp:664
static constexpr TypeAcceleration getDefaultAccMode()
Returns the default acceleration mode, cpu_blas if BLAS is enabled and none otherwise.
Definition tsgAcceleratedDataStructures.hpp:617
int device
If using a GPU acceleration mode, holds the active device.
Definition tsgAcceleratedDataStructures.hpp:611
void setDevice() const
Set default device.
Definition tsgAcceleratedDataStructures.hpp:727
ChangeType
Defines the types of acceleration context updates so they can be linked to acceleration cache updates...
Definition tsgAcceleratedDataStructures.hpp:593
@ change_none
No change, do nothing.
Definition tsgAcceleratedDataStructures.hpp:595
@ change_sparse_dense
Change the sparse-dense AlgorithmPreference.
Definition tsgAcceleratedDataStructures.hpp:603
@ change_gpu_device
Change the associated GPU device.
Definition tsgAcceleratedDataStructures.hpp:597
@ change_cpu_blas
Change BLAS to none or none to BLAS.
Definition tsgAcceleratedDataStructures.hpp:601
@ change_gpu_enabled
Change from BLAS or none to a GPU acceleration mode.
Definition tsgAcceleratedDataStructures.hpp:599
AccelerationContext()
Creates a default context, the device id is set to 0 and acceleration is BLAS (if available) or none.
Definition tsgAcceleratedDataStructures.hpp:634
bool useKernels() const
Returns true if the current mode implies the use of custom GPU kernels.
Definition tsgAcceleratedDataStructures.hpp:673
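The AccelerationContext members documented above tie together the mode, the device ID and the GpuEngine; in normal use the context is driven by TasmanianSparseGrid::enableAcceleration(). A direct-use sketch, assuming a GPU-enabled build; switch_to_gpu is an illustrative name:

#include "tsgAcceleratedDataStructures.hpp"

// Illustrative sketch: query the cost of a mode switch, then perform it.
void switch_to_gpu(TasGrid::AccelerationContext &acc){
    TasGrid::AccelerationContext::ChangeType change = acc.testEnable(TasGrid::accel_gpu_cuda, 0); // what would enable() change?
    if (change != TasGrid::AccelerationContext::change_none)
        acc.enable(TasGrid::accel_gpu_cuda, 0); // switch to the CUDA/HIP mode on device 0, falling back if not compiled in
    if (acc.on_gpu())
        acc.setDevice();                        // make the selected device the active GPU
}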
Wrapper class around calls to GPU accelerated linear algebra libraries.
Definition tsgAcceleratedDataStructures.hpp:236
std::unique_ptr< int > called_magma_init
Avoids an empty engine when no acceleration is enabled, allows for default constructor/move/copy,...
Definition tsgAcceleratedDataStructures.hpp:273
void setCuSolverDnHandle(void *handle)
Manually sets the cuSolver handle, handle must be a valid cusolverDnHandle_t associated with this CUD...
std::unique_ptr< int, HandleDeleter< AccHandle::Cusolver > > cusolver_handle
Holds the cuSolver handle.
Definition tsgAcceleratedDataStructures.hpp:250
void setCuBlasHandle(void *handle)
Manually sets the cuBlas handle, handle must be a valid cublasHandle_t associated with this CUDA devi...
void setCuSparseHandle(void *handle)
Manually sets the cuSparse handle, handle must be a valid cusparseHandle_t associated with this CUDA ...
std::unique_ptr< int, HandleDeleter< AccHandle::Cublas > > cublas_handle
Holds the cuBlas handle.
Definition tsgAcceleratedDataStructures.hpp:246
std::unique_ptr< int, HandleDeleter< AccHandle::Cusparse > > cusparse_handle
Holds the cuSparse handle.
Definition tsgAcceleratedDataStructures.hpp:248