35 #include <immintrin.h>
58 nAllocatedElements(0),
75 #pragma omp parallel for schedule (static)
90 #pragma omp parallel for schedule (static)
107 cudaMemcpyHostToDevice));
120 cudaMemcpyDeviceToHost));
142 throw std::bad_alloc();
146 checkCudaErrors(cudaHostRegister(hostData, sizeInBytes, cudaHostRegisterPortable));
151 throw std::bad_alloc();
float * hostData
Raw CPU matrix data.
virtual void FreeMemory()
Memory deallocation (both on CPU and GPU).
Abstract base class. The common ancestor defining the common interface and allowing derived classes t...
virtual void ZeroMatrix()
Zero all elements of the matrix (NUMA first touch).
virtual void AllocateMemory()
Memory allocation (both on CPU and GPU).
TBaseFloatMatrix()
Default constructor.
size_t nAllocatedElements
Total number of allocated elements (in terms of floats).
#define checkCudaErrors(val)
Macro checking CUDA errors and printing the file name and line number. Inspired by CUDA common checking rout...
The header file containing a class responsible for printing out info and error messages (stdout...
float * deviceData
Raw GPU matrix data.
The header file containing the structure with 3D dimension sizes.
The header file containing the base class for single-precision floating-point numbers (floats)...
const int DATA_ALIGNMENT
Memory alignment for SSE, SSE2, SSE3, SSE4 (16B).
virtual void CopyToDevice()
Copy data from CPU -> GPU (Host -> Device).
virtual void ScalarDividedBy(const float scalar)
Divide the scalar by each matrix element (scalar / matrix_element[i]).
virtual void CopyFromDevice()
Copy data from GPU -> CPU (Device -> Host).