Toolkit for Adaptive Stochastic Modeling and Non-Intrusive ApproximatioN: Tasmanian v8.2 (development)
tsgAcceleratedDataStructures.hpp
1 /*
2  * Copyright (c) 2017, Miroslav Stoyanov
3  *
4  * This file is part of
5  * Toolkit for Adaptive Stochastic Modeling And Non-Intrusive ApproximatioN: TASMANIAN
6  *
7  * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions
12  * and the following disclaimer in the documentation and/or other materials provided with the distribution.
13  *
14  * 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse
15  * or promote products derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
18  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
20  * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
21  * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
22  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  * UT-BATTELLE, LLC AND THE UNITED STATES GOVERNMENT MAKE NO REPRESENTATIONS AND DISCLAIM ALL WARRANTIES, BOTH EXPRESSED AND IMPLIED.
25  * THERE ARE NO EXPRESS OR IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT THE USE OF THE SOFTWARE WILL NOT INFRINGE ANY PATENT,
26  * COPYRIGHT, TRADEMARK, OR OTHER PROPRIETARY RIGHTS, OR THAT THE SOFTWARE WILL ACCOMPLISH THE INTENDED RESULTS OR THAT THE SOFTWARE OR ITS USE WILL NOT RESULT IN INJURY OR DAMAGE.
27  * THE USER ASSUMES RESPONSIBILITY FOR ALL LIABILITIES, PENALTIES, FINES, CLAIMS, CAUSES OF ACTION, AND COSTS AND EXPENSES, CAUSED BY, RESULTING FROM OR ARISING OUT OF,
28  * IN WHOLE OR IN PART THE USE, STORAGE OR DISPOSAL OF THE SOFTWARE.
29  */
30 
31 #ifndef __TASMANIAN_SPARSE_GRID_ACCELERATED_DATA_STRUCTURES_HPP
32 #define __TASMANIAN_SPARSE_GRID_ACCELERATED_DATA_STRUCTURES_HPP
33 
34 #include "tsgAcceleratedHandles.hpp"
35 
45 
79 namespace TasGrid{
80 
81 struct AccelerationContext; // forward declaration, CUDA and HIP GpuVector do not use the context, but DPC++ needs it
82 
94 template<typename T>
95 class GpuVector{
96 public:
98  GpuVector(GpuVector<T> const &) = delete;
101 
103  GpuVector(GpuVector<T> &&other) : num_entries(Utils::exchange(other.num_entries, 0)), gpu_data(Utils::exchange(other.gpu_data, nullptr))
104  #ifdef Tasmanian_ENABLE_DPCPP
105  , sycl_queue(other.sycl_queue)
106  #endif
107  {}
109  GpuVector<T>& operator=(GpuVector<T> &&other){
110  GpuVector<T> temp(std::move(other));
111  std::swap(num_entries, temp.num_entries);
112  std::swap(gpu_data, temp.gpu_data);
113  #ifdef Tasmanian_ENABLE_DPCPP
114  std::swap(sycl_queue, temp.sycl_queue);
115  #endif
116  return *this;
117  }
118 
120  GpuVector() : num_entries(0), gpu_data(nullptr){}
122  GpuVector(AccelerationContext const *acc, size_t count) : num_entries(0), gpu_data(nullptr){ resize(acc, count); }
123 
132  GpuVector(AccelerationContext const *acc, int dim1, int dim2) : num_entries(0), gpu_data(nullptr){ resize(acc, Utils::size_mult(dim1, dim2)); }
134  GpuVector(AccelerationContext const *acc, const std::vector<T> &cpu_data) : num_entries(0), gpu_data(nullptr){ load(acc, cpu_data); }
136  GpuVector(AccelerationContext const *acc, int dim1, int dim2, T const *cpu_data) : num_entries(0), gpu_data(nullptr){ load(acc, Utils::size_mult(dim1, dim2), cpu_data); }
138  template<typename IteratorLike> GpuVector(AccelerationContext const *acc, IteratorLike ibegin, IteratorLike iend) : GpuVector(){ load(acc, ibegin, iend); }
141 
143  size_t size() const{ return num_entries; }
145  T* data(){ return gpu_data; }
147  const T* data() const{ return gpu_data; }
148 
150  void resize(AccelerationContext const *acc, size_t count);
152  void clear();
154  bool empty() const{ return (num_entries == 0); }
155 
157  void load(AccelerationContext const *acc, const std::vector<T> &cpu_data){ load(acc, cpu_data.size(), cpu_data.data()); }
158 
160  template<typename IteratorLike>
161  void load(AccelerationContext const *acc, IteratorLike ibegin, IteratorLike iend){
162  load(acc, std::distance(ibegin, iend), &*ibegin);
163  }
164 
170  template<typename U>
171  Utils::use_if<!std::is_same<U, T>::value> load(AccelerationContext const *acc, const std::vector<U> &cpu_data){
172  load(acc, cpu_data.size(), cpu_data.data());
173  }
174 
183  void load(AccelerationContext const *acc, size_t count, const T* cpu_data);
189  template<typename U>
190  Utils::use_if<!std::is_same<U, T>::value> load(AccelerationContext const *acc, size_t count, const U* cpu_data){
191  std::vector<T> converted(count);
192  std::transform(cpu_data, cpu_data + count, converted.begin(), [](U const &x)->T{ return static_cast<T>(x); });
193  load(acc, converted);
194  }
196  void unload(AccelerationContext const *acc, std::vector<T> &cpu_data) const{
197  cpu_data.resize(num_entries);
198  unload(acc, cpu_data.data());
199  }
201  std::vector<T> unload(AccelerationContext const *acc) const{
202  std::vector<T> y;
203  unload(acc, y);
204  return y;
205  }
207  void unload(AccelerationContext const *acc, size_t num, T* cpu_data) const;
209  void unload(AccelerationContext const *acc, T* cpu_data) const{ unload(acc, num_entries, cpu_data); }
210 
212  T* eject(){
213  T* external = gpu_data;
214  gpu_data = nullptr;
215  num_entries = 0;
216  return external;
217  }
218 
220  using value_type = T;
221 
222 private:
223  size_t num_entries; // keep track of the size, update on every call that changes the gpu_data
224  T *gpu_data; // the GPU array
225  #ifdef Tasmanian_ENABLE_DPCPP
226  void* sycl_queue;
227  #endif
228 };
229 
236 struct GpuEngine{
237  #ifdef Tasmanian_ENABLE_CUDA
239  void setCuBlasHandle(void *handle);
241  void setCuSparseHandle(void *handle);
243  void setCuSolverDnHandle(void *handle);
244 
246  std::unique_ptr<int, HandleDeleter<AccHandle::Cublas>> cublas_handle;
248  std::unique_ptr<int, HandleDeleter<AccHandle::Cusparse>> cusparse_handle;
250  std::unique_ptr<int, HandleDeleter<AccHandle::Cusolver>> cusolver_handle;
251  #endif
252 
253  #ifdef Tasmanian_ENABLE_HIP
255  void setRocBlasHandle(void *handle);
257  void setRocSparseHandle(void *handle);
258 
260  std::unique_ptr<int, HandleDeleter<AccHandle::Rocblas>> rblas_handle;
262  std::unique_ptr<int, HandleDeleter<AccHandle::Rocsparse>> rsparse_handle;
263  #endif
264 
265  #ifdef Tasmanian_ENABLE_DPCPP
267  void setSyclQueue(void *queue);
269  std::unique_ptr<int, HandleDeleter<AccHandle::Syclqueue>> internal_queue;
270  #endif
271 
273  std::unique_ptr<int> called_magma_init;
274 };
275 
279 
284 class AccelerationDomainTransform{
285 public:
287  AccelerationDomainTransform(AccelerationContext const *, std::vector<double> const &transform_a, std::vector<double> const &transform_b);
288 
296  template<typename T>
297  void getCanonicalPoints(bool use01, T const gpu_transformed_x[], int num_x, GpuVector<T> &gpu_canonical_x);
298 
299 private:
300  // these actually store the rate and shift and not the hard upper/lower limits
301  GpuVector<double> gpu_trans_a, gpu_trans_b;
302  int num_dimensions, padded_size;
303  AccelerationContext const *acceleration;
304 };
305 
309 namespace TasGpu{
313 
326  template<typename T>
327  void dtrans2can(AccelerationContext const *acc, bool use01, int dims, int num_x, int pad_size,
328  const double *gpu_trans_a, const double *gpu_trans_b,
329  const T *gpu_x_transformed, T *gpu_x_canonical);
330 
334 
343  template<typename T>
344  void devalpwpoly(AccelerationContext const *acc, int order, TypeOneDRule rule, int num_dimensions, int num_x, int num_basis, const T *gpu_x, const T *gpu_nodes, const T *gpu_support, T *gpu_y);
345 
349 
357  template<typename T>
358  void devalpwpoly_sparse(AccelerationContext const *acc, int order, TypeOneDRule rule, int dims, int num_x, const T *gpu_x,
359  const GpuVector<T> &gpu_nodes, const GpuVector<T> &gpu_support,
360  const GpuVector<int> &gpu_hpntr, const GpuVector<int> &gpu_hindx, const GpuVector<int> &gpu_hroots,
361  GpuVector<int> &gpu_spntr, GpuVector<int> &gpu_sindx, GpuVector<T> &gpu_svals);
362 
366 
375  template<typename T>
376  void devalseq(AccelerationContext const *acc, int dims, int num_x, const std::vector<int> &max_levels, const T *gpu_x,
377  const GpuVector<int> &num_nodes,
378  const GpuVector<int> &points, const GpuVector<T> &nodes, const GpuVector<T> &coeffs, T *gpu_result);
379 
383 
386  template<typename T>
387  void devalfor(AccelerationContext const *acc, int dims, int num_x, const std::vector<int> &max_levels, const T *gpu_x, const GpuVector<int> &num_nodes, const GpuVector<int> &points, T *gpu_wreal, typename GpuVector<T>::value_type *gpu_wimag);
388 
397  template<typename T>
398  void devalglo(AccelerationContext const *acc, bool is_nested, bool is_clenshawcurtis0, int dims, int num_x, int num_p, int num_basis,
399  T const *gpu_x, GpuVector<T> const &nodes, GpuVector<T> const &coeff, GpuVector<T> const &tensor_weights,
400  GpuVector<int> const &nodes_per_level, GpuVector<int> const &offset_per_level, GpuVector<int> const &map_dimension, GpuVector<int> const &map_level,
401  GpuVector<int> const &active_tensors, GpuVector<int> const &active_num_points, GpuVector<int> const &dim_offsets,
402  GpuVector<int> const &map_tensor, GpuVector<int> const &map_index, GpuVector<int> const &map_reference, T *gpu_result);
403 
404 
409  void fillDataGPU(AccelerationContext const *acc, double value, long long N, long long stride, double data[]);
410 
415  template<typename T> void load_n(AccelerationContext const *acc, T const *cpu_data, size_t num_entries, T *gpu_data);
416 
421  template<typename T, typename U>
422  Utils::use_if<!std::is_same<U, T>::value> load_n(AccelerationContext const *acc, U const *cpu_data, size_t num_entries, T *gpu_data){
423  std::vector<T> converted(num_entries);
424  std::transform(cpu_data, cpu_data + num_entries, converted.begin(), [](U const &x)->T{ return static_cast<T>(x); });
425  load_n(acc, converted.data(), num_entries, gpu_data);
426  }
427 
428  // #define __TASMANIAN_COMPILE_FALLBACK_CUDA_KERNELS__ // uncomment to compile a bunch of custom CUDA kernels that provide some functionality similar to cuBlas
429  #ifdef __TASMANIAN_COMPILE_FALLBACK_CUDA_KERNELS__
430  // CUDA kernels that provide essentially the same functionality as cuBlas and MAGMA, but nowhere near as optimal
431  // these functions should not be used in Release or production builds
432  // the kernels are useful because they are simple and do not depend on potentially poorly documented third-party libraries
433  // since the kernels are useful for testing and some debugging, the code should not be deleted (for now), but there is no reason to compile it in most cases
434 
435  void cudaDgemm(int M, int N, int K, const double *gpu_a, const double *gpu_b, double *gpu_c);
436  // lazy cuda dgemm, nowhere near as powerful as cuBlas, but does not depend on cuBlas
437  // gpu_a is M by K, gpu_b is K by N, gpu_c is M by N, all in column-major format
438  // on exit gpu_c = gpu_a * gpu_b
439 
440  void cudaSparseMatmul(int M, int N, int num_nz, const int* gpu_spntr, const int* gpu_sindx, const double* gpu_svals, const double *gpu_B, double *gpu_C);
441  // lazy cuda sparse dgemm, less efficient (especially for large N), but more memory conservative than cuSparse as there is no need for a transpose
442  // C is M x N, B is K x N (K is max(gpu_sindx)), both are given in row-major format, num_nz/spntr/sindx/svals describe row compressed A which is M by K
443  // on exit C = A * B
444 
445  void cudaSparseVecDenseMat(int M, int N, int num_nz, const double *A, const int *indx, const double *vals, double *C);
446  // dense matrix A (column major) times a sparse vector defined by num_nz, indx, and vals
447  // A is M by N, C is M by 1,
448  // on exit C = A * (indx, vals)
449 
450  void convert_sparse_to_dense(int num_rows, int num_columns, const int *gpu_pntr, const int *gpu_indx, const double *gpu_vals, double *gpu_destination);
451  // converts a sparse matrix to a dense representation (all data sits on the gpu and is pre-allocated)
452  #endif
453 }
454 
458 namespace AccelerationMeta{
464  std::map<std::string, TypeAcceleration> getStringToAccelerationMap();
481 
483  inline bool isAvailable(TypeAcceleration accel){
484  switch(accel){
485  #ifdef Tasmanian_ENABLE_MAGMA
486  case accel_gpu_magma: return true;
487  #endif
488  #ifdef Tasmanian_ENABLE_CUDA
489  case accel_gpu_cuda: return true;
490  case accel_gpu_cublas: return true;
491  #endif
492  #ifdef Tasmanian_ENABLE_HIP
493  case accel_gpu_hip: return true;
494  case accel_gpu_rocblas: return true;
495  #endif
496  #ifdef Tasmanian_ENABLE_DPCPP
497  case accel_gpu_cuda: return true;
498  case accel_gpu_cublas: return true;
499  #endif
500  #ifdef Tasmanian_ENABLE_BLAS
501  case accel_cpu_blas: return true;
502  #endif
503  case accel_none:
504  return true;
505  default:
506  return false;
507  }
508  }
509 
513 
517 
522 
526 
528  void setDefaultGpuDevice(int deviceID);
529 
533 
535  unsigned long long getTotalGPUMemory(int deviceID);
536 
540 
542  std::string getGpuDeviceName(int deviceID);
543 
547  template<typename T> void recvGpuArray(AccelerationContext const*, size_t num_entries, const T *gpu_data, std::vector<T> &cpu_data);
548 
552  template<typename T> void delGpuArray(AccelerationContext const*, T *x);
553 
563  void deleteCublasHandle(void *);
564 }
565 
576 struct AccelerationContext{
578  enum AlgorithmPreference{
580   algorithm_dense,
582   algorithm_sparse,
584   algorithm_autoselect
585  };
586 
593  enum ChangeType{
595   change_none,
597   change_gpu_device,
599   change_gpu_enabled,
601   change_cpu_blas,
603   change_sparse_dense
604  };
605 
607  TypeAcceleration mode;
609  AlgorithmPreference algorithm_select;
611  int device;
612 
614  mutable std::unique_ptr<GpuEngine> engine;
615 
617  inline static constexpr TypeAcceleration getDefaultAccMode() {
618  #ifdef Tasmanian_ENABLE_BLAS
619  return accel_cpu_blas;
620  #else
621  return accel_none;
622  #endif
623  }
625  inline static constexpr int getDefaultAccDevice() {
626  #ifdef Tasmanian_ENABLE_DPCPP
627  return -1;
628  #else
629  return 0;
630  #endif
631  }
632 
635 
637  ChangeType favorSparse(bool favor){
638  AlgorithmPreference new_preference = [=]()->AlgorithmPreference{
639  if (favor){
641  }else{
643  }
644  }();
645  if (new_preference != algorithm_select){
646  algorithm_select = new_preference;
647  return change_sparse_dense;
648  }else{
649  return change_none;
650  }
651  }
652 
654  bool blasCompatible() const{
655  #ifdef Tasmanian_ENABLE_BLAS
656  return (mode != accel_none);
657  #else
658  return false;
659  #endif
660  }
661 
663  bool useKernels() const{
664  #if defined(Tasmanian_ENABLE_CUDA) || defined(Tasmanian_ENABLE_HIP)
665  return ((mode == accel_gpu_cuda) or (mode == accel_gpu_magma));
666  #else
667  return false;
668  #endif
669  }
670 
672  ChangeType testEnable(TypeAcceleration acc, int new_gpu_id) const{
673  TypeAcceleration effective_acc = AccelerationMeta::getAvailableFallback(acc);
674  #ifdef Tasmanian_ENABLE_DPCPP
675  if (AccelerationMeta::isAccTypeGPU(effective_acc) and ((new_gpu_id < -1 or new_gpu_id >= AccelerationMeta::getNumGpuDevices())))
676  throw std::runtime_error("Invalid GPU device ID, see ./tasgrid -v for list of detected devices.");
677  #else
678  if (AccelerationMeta::isAccTypeGPU(effective_acc) and ((new_gpu_id < 0 or new_gpu_id >= AccelerationMeta::getNumGpuDevices())))
679  throw std::runtime_error("Invalid GPU device ID, see ./tasgrid -v for list of detected devices.");
680  #endif
681  ChangeType mode_change = (effective_acc == mode) ? change_none : change_cpu_blas;
682  ChangeType device_change = (device == new_gpu_id) ? change_none : change_gpu_device;
683 
684  if (on_gpu()){
685  return (AccelerationMeta::isAccTypeGPU(effective_acc)) ? device_change : change_gpu_device;
686  }else{
687  return (AccelerationMeta::isAccTypeGPU(effective_acc)) ? change_gpu_enabled : mode_change;
688  }
689  }
690 
692  void enable(TypeAcceleration acc, int new_gpu_id){
693  // get the effective new acceleration mode (use the fallback if acc is not enabled)
694  TypeAcceleration effective_acc = AccelerationMeta::getAvailableFallback(acc);
695  // if switching to a GPU mode, check if the device id is valid
696  #ifdef Tasmanian_ENABLE_DPCPP
697  if (AccelerationMeta::isAccTypeGPU(effective_acc) and ((new_gpu_id < -1 or new_gpu_id >= AccelerationMeta::getNumGpuDevices())))
698  throw std::runtime_error("Invalid GPU device ID, see ./tasgrid -v for list of detected devices.");
699  #else
700  if (AccelerationMeta::isAccTypeGPU(effective_acc) and ((new_gpu_id < 0 or new_gpu_id >= AccelerationMeta::getNumGpuDevices())))
701  throw std::runtime_error("Invalid GPU device ID, see ./tasgrid -v for list of detected devices.");
702  #endif
703  if (AccelerationMeta::isAccTypeGPU(effective_acc)){
704  // if the new mode is GPU-based, make an engine or reset the engine if the device has changed
705  // if the engine exists and the device is not changed, then keep the existing engine
706  if (!engine or new_gpu_id != device)
707  engine = Utils::make_unique<GpuEngine>();
708  }else{
709  engine.reset();
710  }
711 
712  // assign the new values for the mode and device
713  mode = effective_acc;
714  device = new_gpu_id;
715  }
719  operator GpuEngine* () const{ return engine.get(); }
721  bool on_gpu() const{ return !!engine; }
722 };
723 
724 #ifdef Tasmanian_ENABLE_DPCPP
738 struct InternalSyclQueue{
740  InternalSyclQueue() : use_testing(false){}
742  void init_testing(int gpuid);
744  operator std::unique_ptr<int, HandleDeleter<AccHandle::Syclqueue>> (){
745  return std::unique_ptr<int, HandleDeleter<AccHandle::Syclqueue>>(test_queue.get(),
746  HandleDeleter<AccHandle::Syclqueue>(false));
747  }
749  bool use_testing;
751  std::unique_ptr<int, HandleDeleter<AccHandle::Syclqueue>> test_queue;
752 };
760 extern InternalSyclQueue test_queue;
761 #endif
762 
763 }
764 
765 #endif // __TASMANIAN_SPARSE_GRID_ACCELERATED_DATA_STRUCTURES_HPP
Implements the domain transform algorithms in case the user data is provided on the GPU.
Definition: tsgAcceleratedDataStructures.hpp:284
AccelerationDomainTransform(AccelerationContext const *, std::vector< double > const &transform_a, std::vector< double > const &transform_b)
Constructor, load the transform data to the GPU, the vectors are the same as used in the TasmanianSpa...
void getCanonicalPoints(bool use01, T const gpu_transformed_x[], int num_x, GpuVector< T > &gpu_canonical_x)
Transform a set of points, used in the calls to evaluateHierarchicalFunctionsGPU() Takes the user pro...
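Illustrative sketch (not part of the header): mapping two user-domain points in 2D back to canonical coordinates. The lower/upper meaning of transform_a/transform_b, the point ordering, and the use01 = false convention are assumptions made only for this example.
TasGrid::AccelerationContext acc;
acc.enable(TasGrid::accel_gpu_cuda, 0);                                          // assumed GPU build, device 0
TasGrid::AccelerationDomainTransform domain(&acc, {0.0, -1.0}, {2.0, 3.0});      // transform_a (lower), transform_b (upper) assumed
TasGrid::GpuVector<double> gpu_x(&acc, std::vector<double>{1.0, 0.0, 0.5, 2.0}); // two 2D points in user coordinates
TasGrid::GpuVector<double> canonical;
domain.getCanonicalPoints(false, gpu_x.data(), 2, canonical);                    // use01 = false, canonical interval [-1, 1] assumed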
Template class that wraps around a single GPU array, providing functionality that mimics std::vector.
Definition: tsgAcceleratedDataStructures.hpp:95
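A minimal usage sketch (not from the header itself): round-trip a std::vector through device memory using the members declared above; the CUDA mode and device 0 are assumptions.
#include "tsgAcceleratedDataStructures.hpp"
#include <vector>

void round_trip_example(){
    TasGrid::AccelerationContext acc;             // defaults to BLAS (if available) or none
    acc.enable(TasGrid::accel_gpu_cuda, 0);       // request a GPU mode on device 0, falls back if CUDA is not enabled

    std::vector<double> cpu = {1.0, 2.0, 3.0, 4.0};
    TasGrid::GpuVector<double> gpu(&acc, cpu);    // allocates on the device and copies the data

    std::vector<double> back = gpu.unload(&acc);  // copies the data back, back == cpu and gpu.size() == 4
}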
std::vector< T > unload(AccelerationContext const *acc) const
Return a CPU vector holding the data of the GPU.
Definition: tsgAcceleratedDataStructures.hpp:201
~GpuVector()
Destructor, release all allocated memory.
Definition: tsgAcceleratedDataStructures.hpp:140
Utils::use_if<!std::is_same< U, T >::value > load(AccelerationContext const *acc, size_t count, const U *cpu_data)
Takes a vector with entries of different precision, converts and loads.
Definition: tsgAcceleratedDataStructures.hpp:190
const T * data() const
Get a const-reference to the GPU array, which can be used as input to GPU libraries and kernels.
Definition: tsgAcceleratedDataStructures.hpp:147
bool empty() const
Return true if the size() is zero.
Definition: tsgAcceleratedDataStructures.hpp:154
void unload(AccelerationContext const *acc, std::vector< T > &cpu_data) const
Copy the data from the GPU array to cpu_data, the cpu_data will be resized and overwritten.
Definition: tsgAcceleratedDataStructures.hpp:196
GpuVector(AccelerationContext const *acc, IteratorLike ibegin, IteratorLike iend)
Construct a vector by loading from a given range.
Definition: tsgAcceleratedDataStructures.hpp:138
GpuVector(GpuVector< T > const &)=delete
Delete the copy-constructor.
T * eject()
Move the data to the external array, the vector is set to empty (unlike move command on std::vector).
Definition: tsgAcceleratedDataStructures.hpp:212
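A short sketch of the ownership transfer performed by eject(); the ejected pointer is released here with AccelerationMeta::delGpuArray(), declared later in this header. The GPU mode is an assumption.
TasGrid::AccelerationContext acc;
acc.enable(TasGrid::accel_gpu_cuda, 0);
TasGrid::GpuVector<double> vec(&acc, 10);           // 10 entries on the device
double *raw = vec.eject();                          // vec.empty() is now true, raw keeps the device allocation
TasGrid::AccelerationMeta::delGpuArray(&acc, raw);  // manual cleanup of the ejected array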
T * data()
Get a reference to the GPU array, which can be used as input to GPU libraries and kernels.
Definition: tsgAcceleratedDataStructures.hpp:145
Utils::use_if<!std::is_same< U, T >::value > load(AccelerationContext const *acc, const std::vector< U > &cpu_data)
Takes a vector with entries of different precision, converts and loads.
Definition: tsgAcceleratedDataStructures.hpp:171
GpuVector()
Default constructor, creates an empty (null) array.
Definition: tsgAcceleratedDataStructures.hpp:120
void load(AccelerationContext const *acc, IteratorLike ibegin, IteratorLike iend)
Load from a range defined by the begin and end, converts if necessary.
Definition: tsgAcceleratedDataStructures.hpp:161
void unload(AccelerationContext const *acc, size_t num, T *cpu_data) const
Copy the first num entries to the cpu_data buffer, assumes that the buffer is sufficiently large.
GpuVector(AccelerationContext const *acc, int dim1, int dim2)
Same as GpuVector(dim1 * dim2), but guards against overflow.
Definition: tsgAcceleratedDataStructures.hpp:132
GpuVector(AccelerationContext const *acc, size_t count)
Construct a vector with count number of entries.
Definition: tsgAcceleratedDataStructures.hpp:122
GpuVector< T > & operator=(GpuVector< T > const &)=delete
Delete the copy-assignment.
void load(AccelerationContext const *acc, size_t count, const T *cpu_data)
Copy the first count entries of cpu_data to the GPU device.
void unload(AccelerationContext const *acc, T *cpu_data) const
Copy the data from the GPU array to the cpu_data buffer, assumes that the buffer is sufficiently larg...
Definition: tsgAcceleratedDataStructures.hpp:209
T value_type
The data-type of the vector entries.
Definition: tsgAcceleratedDataStructures.hpp:220
void clear()
Delete all allocated memory and reset the array to empty.
size_t size() const
Return the current size of the GPU array.
Definition: tsgAcceleratedDataStructures.hpp:143
void resize(AccelerationContext const *acc, size_t count)
Clear all data currently stored in the vector and allocate a new array (unlike std::vector this does ...
GpuVector(AccelerationContext const *acc, const std::vector< T > &cpu_data)
Create a vector with size that matches cpu_data and copy the data to the GPU device.
Definition: tsgAcceleratedDataStructures.hpp:134
GpuVector(AccelerationContext const *acc, int dim1, int dim2, T const *cpu_data)
Construct a vector and load with data provided on the CPU.
Definition: tsgAcceleratedDataStructures.hpp:136
GpuVector(GpuVector< T > &&other)
Allow for move-construction.
Definition: tsgAcceleratedDataStructures.hpp:103
void load(AccelerationContext const *acc, const std::vector< T > &cpu_data)
Copy the content of cpu_data to the GPU device, all pre-existing data is deleted and the vector is re...
Definition: tsgAcceleratedDataStructures.hpp:157
TypeOneDRule
Used to specify the one dimensional family of rules that induces the sparse grid.
Definition: tsgEnumerates.hpp:285
constexpr TypeAcceleration accel_gpu_rocblas
At the front-end API, the HIP and CUDA options are equivalent, see TasGrid::TypeAcceleration.
Definition: tsgEnumerates.hpp:575
constexpr TypeAcceleration accel_gpu_hip
At the front-end API, the HIP and CUDA options are equivalent, see TasGrid::TypeAcceleration.
Definition: tsgEnumerates.hpp:570
TypeAcceleration
Modes of acceleration.
Definition: tsgEnumerates.hpp:551
@ accel_cpu_blas
Default (if available), uses both BLAS and LAPACK libraries.
Definition: tsgEnumerates.hpp:555
@ accel_none
Usually the slowest mode, uses only OpenMP multi-threading, but optimized for memory and could be the...
Definition: tsgEnumerates.hpp:553
@ accel_gpu_magma
Same as the CUDA option but uses the UTK MAGMA library for the linear algebra operations.
Definition: tsgEnumerates.hpp:563
@ accel_gpu_cublas
Mixed usage of the CPU (OpenMP) and GPU libraries.
Definition: tsgEnumerates.hpp:559
@ accel_gpu_cuda
Similar to the cuBLAS option but also uses a set of Tasmanian custom GPU kernels.
Definition: tsgEnumerates.hpp:561
void devalseq(AccelerationContext const *acc, int dims, int num_x, const std::vector< int > &max_levels, const T *gpu_x, const GpuVector< int > &num_nodes, const GpuVector< int > &points, const GpuVector< T > &nodes, const GpuVector< T > &coeffs, T *gpu_result)
Evaluate the basis for a Sequence grid.
int getIOAccelerationInt(TypeAcceleration accel)
Convert the enumerated type to an integer (used by the Fortran interface), the inverse of getIOIntAcceleration().
void setDefaultGpuDevice(int deviceID)
Selects the active device for this CPU thread, not supported for DPC++.
std::string getGpuDeviceName(int deviceID)
Returns the name of the selected GPU device, empty string if no device is available or the index is o...
void dtrans2can(AccelerationContext const *acc, bool use01, int dims, int num_x, int pad_size, const double *gpu_trans_a, const double *gpu_trans_b, const T *gpu_x_transformed, T *gpu_x_canonical)
Uses custom kernel to convert transformed points to canonical points, all arrays live on the CUDA dev...
void load_n(AccelerationContext const *acc, T const *cpu_data, size_t num_entries, T *gpu_data)
Similar to copy_n, copies the data from the CPU to the GPU.
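Hypothetical sketch of load_n, including the converting overload shown above, filling a device buffer of doubles from single-precision CPU data; the GPU mode and device 0 are assumptions.
TasGrid::AccelerationContext acc;
acc.enable(TasGrid::accel_gpu_cuda, 0);
std::vector<float> cpu_single = {1.0f, 2.0f, 3.0f};
TasGrid::GpuVector<double> buffer(&acc, cpu_single.size());                          // device buffer of doubles
TasGrid::TasGpu::load_n(&acc, cpu_single.data(), cpu_single.size(), buffer.data());  // converts float to double, then copies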
void devalfor(AccelerationContext const *acc, int dims, int num_x, const std::vector< int > &max_levels, const T *gpu_x, const GpuVector< int > &num_nodes, const GpuVector< int > &points, T *gpu_wreal, typename GpuVector< T >::value_type *gpu_wimag)
Evaluate the basis for a Fourier grid.
void delGpuArray(AccelerationContext const *, T *x)
Deallocate device array, used primarily for testing, always favor using GpuVector (if possible).
bool isAccTypeGPU(TypeAcceleration accel)
Returns true if accel is cuda, cublas or magma.
TypeAcceleration getIOIntAcceleration(int accel)
Convert the integer (coming from Fortran) into an enumerated type.
void recvGpuArray(AccelerationContext const *, size_t num_entries, const T *gpu_data, std::vector< T > &cpu_data)
Copy a device array to the main memory, used for testing only, always favor using GpuVector (if possi...
void devalglo(AccelerationContext const *acc, bool is_nested, bool is_clenshawcurtis0, int dims, int num_x, int num_p, int num_basis, T const *gpu_x, GpuVector< T > const &nodes, GpuVector< T > const &coeff, GpuVector< T > const &tensor_weights, GpuVector< int > const &nodes_per_level, GpuVector< int > const &offset_per_level, GpuVector< int > const &map_dimension, GpuVector< int > const &map_level, GpuVector< int > const &active_tensors, GpuVector< int > const &active_num_points, GpuVector< int > const &dim_offsets, GpuVector< int > const &map_tensor, GpuVector< int > const &map_index, GpuVector< int > const &map_reference, T *gpu_result)
Evaluate the basis for Global grid.
TypeAcceleration getIOAccelerationString(const char *name)
Convert the string (coming from C or Python) into an enumerated type.
void * createCublasHandle()
Creates a new cuBlas handle, used in unit-testing only.
void fillDataGPU(AccelerationContext const *acc, double value, long long N, long long stride, double data[])
Fills the data with the provided real number at the given stride.
void deleteCublasHandle(void *)
Destroys the cuBlas handle, used in unit-testing only.
int getNumGpuDevices()
Return the number of visible GPU devices.
void devalpwpoly_sparse(AccelerationContext const *acc, int order, TypeOneDRule rule, int dims, int num_x, const T *gpu_x, const GpuVector< T > &gpu_nodes, const GpuVector< T > &gpu_support, const GpuVector< int > &gpu_hpntr, const GpuVector< int > &gpu_hindx, const GpuVector< int > &gpu_hroots, GpuVector< int > &gpu_spntr, GpuVector< int > &gpu_sindx, GpuVector< T > &gpu_svals)
Evaluate the basis functions for a local polynomial grid using the SPARSE algorithm.
TypeAcceleration getAvailableFallback(TypeAcceleration accel)
Implements fallback logic, if accel has been enabled through CMake then this returns accel,...
void devalpwpoly(AccelerationContext const *acc, int order, TypeOneDRule rule, int num_dimensions, int num_x, int num_basis, const T *gpu_x, const T *gpu_nodes, const T *gpu_support, T *gpu_y)
Evaluate the basis functions for a local polynomial grid using the DENSE algorithm.
unsigned long long getTotalGPUMemory(int deviceID)
Return the total memory available on the device (in units of bytes).
size_t size_mult(IntA a, IntB b)
Converts two integer-like variables to size_t and returns the product.
Definition: tsgUtils.hpp:82
T exchange(T &x, U new_x)
Equivalent to C++14 exchange, but works with simpler types (int, double, float*).
Definition: tsgUtils.hpp:153
typename std::enable_if< condition, void >::type use_if
Equivalent to C++14 enable_if_t<condition, void>
Definition: tsgUtils.hpp:147
std::map< std::string, TypeAcceleration > getStringToAccelerationMap()
Creates a map with std::string rule names (used by C/Python/CLI) mapped to TypeAcceleration enums.
bool isAvailable(TypeAcceleration accel)
Identifies whether the acceleration mode is available.
Definition: tsgAcceleratedDataStructures.hpp:483
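A sketch of runtime mode selection built on isAvailable(); the preference order below is only an example.
TasGrid::TypeAcceleration mode = TasGrid::accel_none;
for(auto candidate : {TasGrid::accel_gpu_magma, TasGrid::accel_gpu_cuda, TasGrid::accel_cpu_blas}){
    if (TasGrid::AccelerationMeta::isAvailable(candidate)){ mode = candidate; break; }
}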
Encapsulates the Tasmanian Sparse Grid module.
Definition: TasmanianSparseGrid.hpp:68
Wrapper class around GPU device ID, acceleration type and GpuEngine.
Definition: tsgAcceleratedDataStructures.hpp:576
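Illustrative sketch (device id 0 is an assumption): query the pending ChangeType with testEnable() before committing with enable().
TasGrid::AccelerationContext acc;                    // BLAS (if available) or none, default device
TasGrid::AccelerationContext::ChangeType pending = acc.testEnable(TasGrid::accel_gpu_cuda, 0);
if (pending != TasGrid::AccelerationContext::change_none)
    acc.enable(TasGrid::accel_gpu_cuda, 0);          // creates the GpuEngine when switching to a GPU mode
bool gpu_active = acc.on_gpu();                      // true once a GPU engine has been created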
ChangeType testEnable(TypeAcceleration acc, int new_gpu_id) const
Returns the ChangeType if enable() is called, but does not change the acceleration.
Definition: tsgAcceleratedDataStructures.hpp:672
AlgorithmPreference algorithm_select
The preference to use dense or sparse algorithms.
Definition: tsgAcceleratedDataStructures.hpp:609
void enable(TypeAcceleration acc, int new_gpu_id)
Accepts parameters directly from TasmanianSparseGrid::enableAcceleration()
Definition: tsgAcceleratedDataStructures.hpp:692
static constexpr int getDefaultAccDevice()
Returns the default acceleration device, CUDA/HIP use GPU 0, SYCL uses -1 which uses sycl::default_se...
Definition: tsgAcceleratedDataStructures.hpp:625
ChangeType favorSparse(bool favor)
Sets algorithm affinity in the direction of sparse.
Definition: tsgAcceleratedDataStructures.hpp:637
TypeAcceleration mode
The current active acceleration mode.
Definition: tsgAcceleratedDataStructures.hpp:607
bool on_gpu() const
Returns true if any of the GPU-based acceleration modes have been enabled.
Definition: tsgAcceleratedDataStructures.hpp:721
std::unique_ptr< GpuEngine > engine
Holds the context to the GPU TPL handles, e.g., MAGMA queue.
Definition: tsgAcceleratedDataStructures.hpp:614
AlgorithmPreference
Defines the sparse-dense algorithm flavors, whenever applicable.
Definition: tsgAcceleratedDataStructures.hpp:578
@ algorithm_dense
Use dense algorithm.
Definition: tsgAcceleratedDataStructures.hpp:580
@ algorithm_sparse
Use sparse algorithm.
Definition: tsgAcceleratedDataStructures.hpp:582
@ algorithm_autoselect
Automatically select based on heuristics.
Definition: tsgAcceleratedDataStructures.hpp:584
bool blasCompatible() const
Returns true if BLAS is enabled and the current mode is not none.
Definition: tsgAcceleratedDataStructures.hpp:654
static constexpr TypeAcceleration getDefaultAccMode()
Returns the default acceleration mode, cpu_blas if BLAS is enabled and none otherwise.
Definition: tsgAcceleratedDataStructures.hpp:617
int device
If using a GPU acceleration mode, holds the active device.
Definition: tsgAcceleratedDataStructures.hpp:611
void setDevice() const
Set default device.
Definition: tsgAcceleratedDataStructures.hpp:717
ChangeType
Defines the types of acceleration context updates so they can be linked to acceleration cache updates...
Definition: tsgAcceleratedDataStructures.hpp:593
@ change_none
No change, do nothing.
Definition: tsgAcceleratedDataStructures.hpp:595
@ change_sparse_dense
Change the sparse-dense AlgorithmPreference.
Definition: tsgAcceleratedDataStructures.hpp:603
@ change_gpu_device
Change the associated GPU device.
Definition: tsgAcceleratedDataStructures.hpp:597
@ change_cpu_blas
Change BLAS to none or none to BLAS.
Definition: tsgAcceleratedDataStructures.hpp:601
@ change_gpu_enabled
Change from BLAS or none to a GPU acceleration mode.
Definition: tsgAcceleratedDataStructures.hpp:599
AccelerationContext()
Creates a default context, the device id is set to 0 and acceleration is BLAS (if available) or none.
Definition: tsgAcceleratedDataStructures.hpp:634
bool useKernels() const
Returns true if the current mode implies the use of custom GPU kernels.
Definition: tsgAcceleratedDataStructures.hpp:663
Wrapper class around calls GPU accelerated linear algebra libraries.
Definition: tsgAcceleratedDataStructures.hpp:236
std::unique_ptr< int > called_magma_init
Avoids an empty engine when no acceleration is enabled, allows for default constructor/move/copy,...
Definition: tsgAcceleratedDataStructures.hpp:273
void setCuSolverDnHandle(void *handle)
Manually sets the cuSolver handle, handle must be a valid cusolverDnHandle_t associated with this CUD...
std::unique_ptr< int, HandleDeleter< AccHandle::Cusolver > > cusolver_handle
Holds the cuSolver handle.
Definition: tsgAcceleratedDataStructures.hpp:250
void setCuBlasHandle(void *handle)
Manually sets the cuBlas handle, handle must be a valid cublasHandle_t associated with this CUDA devi...
void setCuSparseHandle(void *handle)
Manually sets the cuSparse handle, handle must be a valid cusparseHandle_t associated with this CUDA ...
std::unique_ptr< int, HandleDeleter< AccHandle::Cublas > > cublas_handle
Holds the cuBlas handle.
Definition: tsgAcceleratedDataStructures.hpp:246
std::unique_ptr< int, HandleDeleter< AccHandle::Cusparse > > cusparse_handle
Holds the cuSparse handle.
Definition: tsgAcceleratedDataStructures.hpp:248
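A hypothetical sketch for a CUDA build: handing an externally created cublasHandle_t to the engine so Tasmanian reuses it; error checking is omitted and the handle/engine names are illustrative only.
#include <cublas_v2.h>

cublasHandle_t external = nullptr;
cublasCreate(&external);                 // create a cuBLAS handle on the active device
TasGrid::GpuEngine engine;
engine.setCuBlasHandle(external);        // cublasHandle_t converts to void*, the engine now uses this handle for its cuBLAS calls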