31 #ifndef __TASMANIAN_SPARSE_GRID_ACCELERATED_DATA_STRUCTURES_HPP
32 #define __TASMANIAN_SPARSE_GRID_ACCELERATED_DATA_STRUCTURES_HPP
34 #include "tsgAcceleratedHandles.hpp"
81 struct AccelerationContext;
104 #ifdef Tasmanian_ENABLE_DPCPP
105 , sycl_queue(other.sycl_queue)
111 std::swap(num_entries, temp.num_entries);
112 std::swap(gpu_data, temp.gpu_data);
113 #ifdef Tasmanian_ENABLE_DPCPP
114 std::swap(sycl_queue, temp.sycl_queue);
143 size_t size()
const{
return num_entries; }
147 const T*
data()
const{
return gpu_data; }
154 bool empty()
const{
return (num_entries == 0); }
160 template<
typename IteratorLike>
162 load(acc, std::distance(ibegin, iend), &*ibegin);
172 load(acc, cpu_data.size(), cpu_data.data());
191 std::vector<T> converted(count);
192 std::transform(cpu_data, cpu_data + count, converted.begin(), [](U
const &x)->T{ return static_cast<T>(x); });
193 load(acc, converted);
197 cpu_data.resize(num_entries);
198 unload(acc, cpu_data.data());
213 T* external = gpu_data;
225 #ifdef Tasmanian_ENABLE_DPCPP
237 #ifdef Tasmanian_ENABLE_CUDA
253 #ifdef Tasmanian_ENABLE_HIP
255 void setRocBlasHandle(
void *handle);
257 void setRocSparseHandle(
void *handle);
260 std::unique_ptr<int, HandleDeleter<AccHandle::Rocblas>> rblas_handle;
262 std::unique_ptr<int, HandleDeleter<AccHandle::Rocsparse>> rsparse_handle;
265 #ifdef Tasmanian_ENABLE_DPCPP
267 void setSyclQueue(
void *queue);
269 std::unique_ptr<int, HandleDeleter<AccHandle::Syclqueue>> internal_queue;
302 int num_dimensions, padded_size;
328 const double *gpu_trans_a,
const double *gpu_trans_b,
329 const T *gpu_x_transformed, T *gpu_x_canonical);
421 template<
typename T,
typename U>
423 std::vector<T> converted(num_entries);
424 std::transform(cpu_data, cpu_data + num_entries, converted.begin(), [](U
const &x)->T{ return static_cast<T>(x); });
425 load_n(acc, converted.data(), num_entries, gpu_data);
429 #ifdef __TASMANIAN_COMPILE_FALLBACK_CUDA_KERNELS__
435 void cudaDgemm(
int M,
int N,
int K,
const double *gpu_a,
const double *gpu_b,
double *gpu_c);
440 void cudaSparseMatmul(
int M,
int N,
int num_nz,
const int* gpu_spntr,
const int* gpu_sindx,
const double* gpu_svals,
const double *gpu_B,
double *gpu_C);
445 void cudaSparseVecDenseMat(
int M,
int N,
int num_nz,
const double *A,
const int *indx,
const double *vals,
double *C);
450 void convert_sparse_to_dense(
int num_rows,
int num_columns,
const int *gpu_pntr,
const int *gpu_indx,
const double *gpu_vals,
double *gpu_destination);
458 namespace AccelerationMeta{
485 #ifdef Tasmanian_ENABLE_MAGMA
488 #ifdef Tasmanian_ENABLE_CUDA
492 #ifdef Tasmanian_ENABLE_HIP
496 #ifdef Tasmanian_ENABLE_DPCPP
500 #ifdef Tasmanian_ENABLE_BLAS
614 mutable std::unique_ptr<GpuEngine>
engine;
618 #ifdef Tasmanian_ENABLE_BLAS
626 #ifdef Tasmanian_ENABLE_DPCPP
655 #ifdef Tasmanian_ENABLE_BLAS
664 #if defined(Tasmanian_ENABLE_CUDA) || defined(Tasmanian_ENABLE_HIP)
674 #ifdef Tasmanian_ENABLE_DPCPP
676 throw std::runtime_error(
"Invalid GPU device ID, see ./tasgrid -v for list of detected devices.");
679 throw std::runtime_error(
"Invalid GPU device ID, see ./tasgrid -v for list of detected devices.");
696 #ifdef Tasmanian_ENABLE_DPCPP
698 throw std::runtime_error(
"Invalid GPU device ID, see ./tasgrid -v for list of detected devices.");
701 throw std::runtime_error(
"Invalid GPU device ID, see ./tasgrid -v for list of detected devices.");
707 engine = Utils::make_unique<GpuEngine>();
713 mode = effective_acc;
724 #ifdef Tasmanian_ENABLE_DPCPP
738 struct InternalSyclQueue{
740 InternalSyclQueue() : use_testing(false){}
742 void init_testing(
int gpuid);
744 operator std::unique_ptr<int, HandleDeleter<AccHandle::Syclqueue>> (){
745 return std::unique_ptr<int, HandleDeleter<AccHandle::Syclqueue>>(test_queue.get(),
746 HandleDeleter<AccHandle::Syclqueue>(
false));
751 std::unique_ptr<int, HandleDeleter<AccHandle::Syclqueue>> test_queue;
760 extern InternalSyclQueue test_queue;
Implements the domain transform algorithms in case the user data is provided on the GPU.
Definition: tsgAcceleratedDataStructures.hpp:284
AccelerationDomainTransform(AccelerationContext const *, std::vector< double > const &transform_a, std::vector< double > const &transform_b)
Constructor, load the transform data to the GPU, the vectors are the same as used in the TasmanianSpa...
void getCanonicalPoints(bool use01, T const gpu_transformed_x[], int num_x, GpuVector< T > &gpu_canonical_x)
Transform a set of points, used in the calls to evaluateHierarchicalFunctionsGPU() Takes the user pro...
Template class that wraps around a single GPU array, providing functionality that mimics std::vector.
Definition: tsgAcceleratedDataStructures.hpp:95
std::vector< T > unload(AccelerationContext const *acc) const
Return a CPU vector holding the data of the GPU.
Definition: tsgAcceleratedDataStructures.hpp:201
~GpuVector()
Destructor, release all allocated memory.
Definition: tsgAcceleratedDataStructures.hpp:140
Utils::use_if<!std::is_same< U, T >::value > load(AccelerationContext const *acc, size_t count, const U *cpu_data)
Takes a vector with entries of different precision, converts and loads.
Definition: tsgAcceleratedDataStructures.hpp:190
const T * data() const
Get a const-reference to the GPU array, which can be used as input to GPU libraries and kernels.
Definition: tsgAcceleratedDataStructures.hpp:147
bool empty() const
Return true if the size() is zero.
Definition: tsgAcceleratedDataStructures.hpp:154
void unload(AccelerationContext const *acc, std::vector< T > &cpu_data) const
Copy the data from the GPU array to cpu_data, the cpu_data will be resized and overwritten.
Definition: tsgAcceleratedDataStructures.hpp:196
GpuVector(AccelerationContext const *acc, IteratorLike ibegin, IteratorLike iend)
Construct a vector by loading from a given range.
Definition: tsgAcceleratedDataStructures.hpp:138
GpuVector(GpuVector< T > const &)=delete
Delete the copy-constructor.
T * eject()
Move the data to the external array, the vector is set to empty (unlike move command on std::vector).
Definition: tsgAcceleratedDataStructures.hpp:212
T * data()
Get a reference to the GPU array, which can be used as input to GPU libraries and kernels.
Definition: tsgAcceleratedDataStructures.hpp:145
Utils::use_if<!std::is_same< U, T >::value > load(AccelerationContext const *acc, const std::vector< U > &cpu_data)
Takes a vector with entries of different precision, converts and loads.
Definition: tsgAcceleratedDataStructures.hpp:171
GpuVector()
Default constructor, creates an empty (null) array.
Definition: tsgAcceleratedDataStructures.hpp:120
void load(AccelerationContext const *acc, IteratorLike ibegin, IteratorLike iend)
Load from a range defined by the begin and end, converts if necessary.
Definition: tsgAcceleratedDataStructures.hpp:161
void unload(AccelerationContext const *acc, size_t num, T *cpu_data) const
Copy the first num entries to the cpu_data buffer, assumes that the buffer is sufficiently large.
GpuVector(AccelerationContext const *acc, int dim1, int dim2)
Same as GpuVector(dim1 * dim2), but guards against overflow.
Definition: tsgAcceleratedDataStructures.hpp:132
GpuVector(AccelerationContext const *acc, size_t count)
Construct a vector with count number of entries.
Definition: tsgAcceleratedDataStructures.hpp:122
GpuVector< T > & operator=(GpuVector< T > const &)=delete
Delete the copy-assignment.
void load(AccelerationContext const *acc, size_t count, const T *cpu_data)
Copy the first count entries of cpu_data to the GPU device.
void unload(AccelerationContext const *acc, T *cpu_data) const
Copy the data from the GPU array to the cpu_data buffer, assumes that the buffer is sufficiently larg...
Definition: tsgAcceleratedDataStructures.hpp:209
T value_type
The data-type of the vector entries.
Definition: tsgAcceleratedDataStructures.hpp:220
void clear()
Delete all allocated memory and reset the array to empty.
size_t size() const
Return the current size of the GPU array.
Definition: tsgAcceleratedDataStructures.hpp:143
void resize(AccelerationContext const *acc, size_t count)
Clear all data currently stored in the vector and allocate a new array (unlike std::vector this does ...
GpuVector(AccelerationContext const *acc, const std::vector< T > &cpu_data)
Create a vector with size that matches cpu_data and copy the data to the GPU device.
Definition: tsgAcceleratedDataStructures.hpp:134
GpuVector(AccelerationContext const *acc, int dim1, int dim2, T const *cpu_data)
Construct a vector and load it with data provided on the CPU.
Definition: tsgAcceleratedDataStructures.hpp:136
GpuVector(GpuVector< T > &&other)
Allow for move-construction.
Definition: tsgAcceleratedDataStructures.hpp:103
void load(AccelerationContext const *acc, const std::vector< T > &cpu_data)
Copy the content of cpu_data to the GPU device, all pre-existing data is deleted and the vector is re...
Definition: tsgAcceleratedDataStructures.hpp:157
TypeOneDRule
Used to specify the one dimensional family of rules that induces the sparse grid.
Definition: tsgEnumerates.hpp:285
constexpr TypeAcceleration accel_gpu_rocblas
At the front API, the HIP and CUDA options are equivalent, see TasGrid::TypeAcceleration.
Definition: tsgEnumerates.hpp:575
constexpr TypeAcceleration accel_gpu_hip
At the front API, the HIP and CUDA options are equivalent, see TasGrid::TypeAcceleration.
Definition: tsgEnumerates.hpp:570
TypeAcceleration
Modes of acceleration.
Definition: tsgEnumerates.hpp:551
@ accel_cpu_blas
Default (if available), uses both BLAS and LAPACK libraries.
Definition: tsgEnumerates.hpp:555
@ accel_none
Usually the slowest mode, uses only OpenMP multi-threading, but optimized for memory and could be the...
Definition: tsgEnumerates.hpp:553
@ accel_gpu_magma
Same as the CUDA option but uses the UTK MAGMA library for the linear algebra operations.
Definition: tsgEnumerates.hpp:563
@ accel_gpu_cublas
Mixed usage of the CPU (OpenMP) and GPU libraries.
Definition: tsgEnumerates.hpp:559
@ accel_gpu_cuda
Similar to the cuBLAS option but also uses a set of Tasmanian custom GPU kernels.
Definition: tsgEnumerates.hpp:561
void devalseq(AccelerationContext const *acc, int dims, int num_x, const std::vector< int > &max_levels, const T *gpu_x, const GpuVector< int > &num_nodes, const GpuVector< int > &points, const GpuVector< T > &nodes, const GpuVector< T > &coeffs, T *gpu_result)
Evaluate the basis for a Sequence grid.
int getIOAccelerationInt(TypeAcceleration accel)
Convert the integer (coming from Fortran) into an enumerated type.
void setDefaultGpuDevice(int deviceID)
Selects the active device for this CPU thread, not supported for DPC++.
std::string getGpuDeviceName(int deviceID)
Returns the name of the selected GPU device, empty string if no device is available or the index is o...
void dtrans2can(AccelerationContext const *acc, bool use01, int dims, int num_x, int pad_size, const double *gpu_trans_a, const double *gpu_trans_b, const T *gpu_x_transformed, T *gpu_x_canonical)
Uses custom kernel to convert transformed points to canonical points, all arrays live on the CUDA dev...
void load_n(AccelerationContext const *acc, T const *cpu_data, size_t num_entries, T *gpu_data)
Similar to copy_n, copies the data from the CPU to the GPU.
void devalfor(AccelerationContext const *acc, int dims, int num_x, const std::vector< int > &max_levels, const T *gpu_x, const GpuVector< int > &num_nodes, const GpuVector< int > &points, T *gpu_wreal, typename GpuVector< T >::value_type *gpu_wimag)
Evaluate the basis for a Fourier grid.
void delGpuArray(AccelerationContext const *, T *x)
Deallocate device array, used primarily for testing, always favor using GpuVector (if possible).
bool isAccTypeGPU(TypeAcceleration accel)
Returns true if accel is cuda, cublas or magma.
TypeAcceleration getIOIntAcceleration(int accel)
Convert the enumerated type to an integer, the inverse of getIOAccelerationInt()
void recvGpuArray(AccelerationContext const *, size_t num_entries, const T *gpu_data, std::vector< T > &cpu_data)
Copy a device array to the main memory, used for testing only, always favor using GpuVector (if possi...
void devalglo(AccelerationContext const *acc, bool is_nested, bool is_clenshawcurtis0, int dims, int num_x, int num_p, int num_basis, T const *gpu_x, GpuVector< T > const &nodes, GpuVector< T > const &coeff, GpuVector< T > const &tensor_weights, GpuVector< int > const &nodes_per_level, GpuVector< int > const &offset_per_level, GpuVector< int > const &map_dimension, GpuVector< int > const &map_level, GpuVector< int > const &active_tensors, GpuVector< int > const &active_num_points, GpuVector< int > const &dim_offsets, GpuVector< int > const &map_tensor, GpuVector< int > const &map_index, GpuVector< int > const &map_reference, T *gpu_result)
Evaluate the basis for Global grid.
TypeAcceleration getIOAccelerationString(const char *name)
Convert the string (coming from C or Python) into an enumerated type.
void * createCublasHandle()
Creates a new cuBlas handle, used in unit-testing only.
void fillDataGPU(AccelerationContext const *acc, double value, long long N, long long stride, double data[])
Fills the data with the provided real number at the given stride.
void deleteCublasHandle(void *)
Destroys the cuBlas handle, used in unit-testing only.
int getNumGpuDevices()
Return the number of visible GPU devices.
void devalpwpoly_sparse(AccelerationContext const *acc, int order, TypeOneDRule rule, int dims, int num_x, const T *gpu_x, const GpuVector< T > &gpu_nodes, const GpuVector< T > &gpu_support, const GpuVector< int > &gpu_hpntr, const GpuVector< int > &gpu_hindx, const GpuVector< int > &gpu_hroots, GpuVector< int > &gpu_spntr, GpuVector< int > &gpu_sindx, GpuVector< T > &gpu_svals)
Evaluate the basis functions for a local polynomial grid using the SPARSE algorithm.
TypeAcceleration getAvailableFallback(TypeAcceleration accel)
Implements fallback logic, if accel has been enabled through CMake then this returns accel,...
void devalpwpoly(AccelerationContext const *acc, int order, TypeOneDRule rule, int num_dimensions, int num_x, int num_basis, const T *gpu_x, const T *gpu_nodes, const T *gpu_support, T *gpu_y)
Evaluate the basis functions for a local polynomial grid using the DENSE algorithm.
unsigned long long getTotalGPUMemory(int deviceID)
Return the memory available in the device (in units of bytes).
size_t size_mult(IntA a, IntB b)
Converts two integer-like variables to size_t and returns the product.
Definition: tsgUtils.hpp:82
T exchange(T &x, U new_x)
Equivalent to C++14 exchange, but works with simpler types (int, double, float*).
Definition: tsgUtils.hpp:153
typename std::enable_if< condition, void >::type use_if
Equivalent to C++14 enable_if_t<condition, void>
Definition: tsgUtils.hpp:147
Encapsulates the Tasmanian Sparse Grid module.
Definition: TasmanianSparseGrid.hpp:68
Wrapper class around GPU device ID, acceleration type and GpuEngine.
Definition: tsgAcceleratedDataStructures.hpp:576
ChangeType testEnable(TypeAcceleration acc, int new_gpu_id) const
Returns the ChangeType if enable() is called, but does not change the acceleration.
Definition: tsgAcceleratedDataStructures.hpp:672
AlgorithmPreference algorithm_select
The preference to use dense or sparse algorithms.
Definition: tsgAcceleratedDataStructures.hpp:609
void enable(TypeAcceleration acc, int new_gpu_id)
Accepts parameters directly from TasmanianSparseGrid::enableAcceleration()
Definition: tsgAcceleratedDataStructures.hpp:692
static constexpr int getDefaultAccDevice()
Returns the default acceleration device, CUDA/HIP use GPU 0, SYCL uses -1 which uses sycl::default_se...
Definition: tsgAcceleratedDataStructures.hpp:625
ChangeType favorSparse(bool favor)
Sets algorithm affinity in the direction of sparse.
Definition: tsgAcceleratedDataStructures.hpp:637
TypeAcceleration mode
The current active acceleration mode.
Definition: tsgAcceleratedDataStructures.hpp:607
bool on_gpu() const
Returns true if any of the GPU-based acceleration modes have been enabled.
Definition: tsgAcceleratedDataStructures.hpp:721
std::unique_ptr< GpuEngine > engine
Holds the context to the GPU TPL handles, e.g., MAGMA queue.
Definition: tsgAcceleratedDataStructures.hpp:614
AlgorithmPreference
Defines the sparse-dense algorithm flavors, whenever applicable.
Definition: tsgAcceleratedDataStructures.hpp:578
@ algorithm_dense
Use dense algorithm.
Definition: tsgAcceleratedDataStructures.hpp:580
@ algorithm_sparse
Use sparse algorithm.
Definition: tsgAcceleratedDataStructures.hpp:582
@ algorithm_autoselect
Automatically select the algorithm based on heuristics.
Definition: tsgAcceleratedDataStructures.hpp:584
bool blasCompatible() const
Returns true if BLAS is enabled and the current mode is not none.
Definition: tsgAcceleratedDataStructures.hpp:654
static constexpr TypeAcceleration getDefaultAccMode()
Returns the default acceleration mode, cpu_blas if BLAS is enabled and none otherwise.
Definition: tsgAcceleratedDataStructures.hpp:617
int device
If using a GPU acceleration mode, holds the active device.
Definition: tsgAcceleratedDataStructures.hpp:611
void setDevice() const
Set default device.
Definition: tsgAcceleratedDataStructures.hpp:717
ChangeType
Defines the types of acceleration context updates so they can be linked to acceleration cache updates...
Definition: tsgAcceleratedDataStructures.hpp:593
@ change_none
No change, do nothing.
Definition: tsgAcceleratedDataStructures.hpp:595
@ change_sparse_dense
Change the sparse-dense AlgorithmPreference.
Definition: tsgAcceleratedDataStructures.hpp:603
@ change_gpu_device
Change the associated GPU device.
Definition: tsgAcceleratedDataStructures.hpp:597
@ change_cpu_blas
Change BLAS to none or none to BLAS.
Definition: tsgAcceleratedDataStructures.hpp:601
@ change_gpu_enabled
Change from BLAS or none to a GPU acceleration mode.
Definition: tsgAcceleratedDataStructures.hpp:599
AccelerationContext()
Creates a default context, the device id is set to 0 and acceleration is BLAS (if available) or none.
Definition: tsgAcceleratedDataStructures.hpp:634
bool useKernels() const
Returns true if the current mode implies the use of custom GPU kernels.
Definition: tsgAcceleratedDataStructures.hpp:663
Wrapper class around calls GPU accelerated linear algebra libraries.
Definition: tsgAcceleratedDataStructures.hpp:236
std::unique_ptr< int > called_magma_init
Avoids an empty engine when no acceleration is enabled, allows for default constructor/move/copy,...
Definition: tsgAcceleratedDataStructures.hpp:273
void setCuSolverDnHandle(void *handle)
Manually sets the cuSparse handle, handle must be a valid cusolverDnHandle_t associated with this CUD...
std::unique_ptr< int, HandleDeleter< AccHandle::Cusolver > > cusolver_handle
Holds the cuSolver handle.
Definition: tsgAcceleratedDataStructures.hpp:250
void setCuBlasHandle(void *handle)
Manually sets the cuBlas handle, handle must be a valid cublasHandle_t associated with this CUDA devi...
void setCuSparseHandle(void *handle)
Manually sets the cuSparse handle, handle must be a valid cusparseHandle_t associated with this CUDA ...
std::unique_ptr< int, HandleDeleter< AccHandle::Cublas > > cublas_handle
Holds the cuBlas handle.
Definition: tsgAcceleratedDataStructures.hpp:246
std::unique_ptr< int, HandleDeleter< AccHandle::Cusparse > > cusparse_handle
Holds the cuSparse handle.
Definition: tsgAcceleratedDataStructures.hpp:248