kspaceFirstOrder3D-CUDA  1.1
The CUDA/C++ implementation of the k-Wave toolbox for the time-domain simulation of acoustic wave fields in 3D
BaseFloatMatrix.cpp
/**
 * @file BaseFloatMatrix.cpp
 *
 * @author Jiri Jaros \n
 * Faculty of Information Technology \n
 * Brno University of Technology \n
 * jarosjir@fit.vutbr.cz
 *
 * @brief The implementation file containing the base class for single precision floating
 * point numbers (floats).
 *
 * @version kspaceFirstOrder3D 3.4
 *
 * @date 11 July 2011, 12:13 (created) \n
 * 10 August 2016, 11:54 (revised)
 *
 * @section License
 * This file is part of the C++ extension of the k-Wave Toolbox
 * (http://www.k-wave.org).\n Copyright (C) 2016 Jiri Jaros and Bradley Treeby.
 *
 * This file is part of k-Wave. k-Wave is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later version.
 *
 * k-Wave is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
 * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
 * General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License along with k-Wave.
 * If not, see http://www.gnu.org/licenses/.
 */


#include <cstring>
#include <immintrin.h>
#include <cuda_runtime.h>   // cudaMemcpy, cudaMalloc, cudaHostRegister, cudaFree
#include <new>              // std::bad_alloc

#include <MatrixClasses/BaseFloatMatrix.h>
#include <Utils/DimensionSizes.h>
#include <Logger/Logger.h>


using std::string;

//------------------------------------------------------------------------------------------------//
//------------------------------------------ Constants -------------------------------------------//
//------------------------------------------------------------------------------------------------//


//------------------------------------------------------------------------------------------------//
//--------------------------------------- Public methods -----------------------------------------//
//------------------------------------------------------------------------------------------------//

/**
 * Default constructor.
 */
TBaseFloatMatrix::TBaseFloatMatrix() :
  nElements(0),
  nAllocatedElements(0),
  dimensionSizes(),
  dataRowSize(0),
  dataSlabSize(0),
  hostData(nullptr),
  deviceData(nullptr)
{

}// end of TBaseFloatMatrix
//--------------------------------------------------------------------------------------------------

/**
 * Zero all allocated elements in parallel. \n
 * Also works as the first-touch strategy on NUMA machines.
 */
void TBaseFloatMatrix::ZeroMatrix()
{
  #pragma omp parallel for schedule (static)
  for (size_t i = 0; i < nAllocatedElements; i++)
  {
    hostData[i] = 0.0f;
  }
}// end of ZeroMatrix
//--------------------------------------------------------------------------------------------------

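/*
 * Note on the first-touch strategy (an illustrative sketch, not part of the original file):
 * on NUMA machines a memory page is physically committed on the node of the thread that first
 * writes to it. Because the zeroing loop above uses a static OpenMP schedule, any later compute
 * loop with the same bounds and schedule lets each thread work on pages local to its node, e.g.:
 *
 *   #pragma omp parallel for schedule (static)   // same iteration split as in ZeroMatrix
 *   for (size_t i = 0; i < nAllocatedElements; i++)
 *   {
 *     hostData[i] *= 2.0f;   // hypothetical compute loop; touches node-local pages only
 *   }
 */
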
/**
 * Divide a scalar by the elements of the matrix: hostData[i] = scalar / hostData[i].
 *
 * @param [in] scalar - Scalar to be divided by every element of the array
 */
void TBaseFloatMatrix::ScalarDividedBy(const float scalar)
{
  #pragma omp parallel for schedule (static)
  for (size_t i = 0; i < nAllocatedElements; i++)
  {
    hostData[i] = scalar / hostData[i];
  }
}// end of ScalarDividedBy
//-------------------------------------------------------------------------------------------------

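/*
 * Usage sketch (hypothetical, not part of the original file): with scalar = 1.0f the method
 * computes the element-wise reciprocal, e.g. turning a density matrix into 1/rho for later
 * multiplications. Note that zero elements produce infinities under IEEE 754 float division.
 *
 *   matrix.ScalarDividedBy(1.0f);   // hostData[i] = 1.0f / hostData[i]
 */
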
/**
 * Copy data from CPU -> GPU (Host -> Device).
 * The transfer is synchronous (there is nothing to overlap with in the code).
 */
void TBaseFloatMatrix::CopyToDevice()
{
  checkCudaErrors(cudaMemcpy(deviceData,
                             hostData,
                             nAllocatedElements * sizeof(float),
                             cudaMemcpyHostToDevice));
}// end of CopyToDevice
//--------------------------------------------------------------------------------------------------

/**
 * Copy data from GPU -> CPU (Device -> Host).
 * The transfer is synchronous (there is nothing to overlap with in the code).
 */
void TBaseFloatMatrix::CopyFromDevice()
{
  checkCudaErrors(cudaMemcpy(hostData,
                             deviceData,
                             nAllocatedElements * sizeof(float),
                             cudaMemcpyDeviceToHost));
}// end of CopyFromDevice
//--------------------------------------------------------------------------------------------------

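/*
 * Aside (a sketch under assumptions, not part of the original file): because the host buffer is
 * pinned via cudaHostRegister in AllocateMemory, an asynchronous variant that overlaps the
 * transfer with independent host work would be possible if the code ever needed it:
 *
 *   cudaStream_t stream;
 *   checkCudaErrors(cudaStreamCreate(&stream));
 *   checkCudaErrors(cudaMemcpyAsync(hostData, deviceData,
 *                                   nAllocatedElements * sizeof(float),
 *                                   cudaMemcpyDeviceToHost, stream));
 *   // ... independent host work here ...
 *   checkCudaErrors(cudaStreamSynchronize(stream));
 *   checkCudaErrors(cudaStreamDestroy(stream));
 *
 * The synchronous cudaMemcpy above is deliberate; as the doc comment says, there is nothing
 * to overlap with in this code.
 */
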
125 //------------------------------------------------------------------------------------------------//
126 //-------------------------------------- Protected methods ---------------------------------------//
127 //------------------------------------------------------------------------------------------------//
128 
129 /**
130  * Memory allocation based on the total number of elements. \n
131  * CPU memory is aligned by the DATA_ALIGNMENT and then registered as pinned and zeroed.
132  */
134 {
135  // Size of memory to allocate
136  size_t sizeInBytes = nAllocatedElements * sizeof(float);
137 
138  // Allocate CPU memory
139  hostData = static_cast<float*> (_mm_malloc(sizeInBytes, DATA_ALIGNMENT));
140  if (!hostData)
141  {
142  throw std::bad_alloc();
143  }
144 
145  // Register Host memory (pin in memory)
146  checkCudaErrors(cudaHostRegister(hostData, sizeInBytes, cudaHostRegisterPortable));
147 
148  // Allocate memory on the GPU
149  if ((cudaMalloc<float>(&deviceData, sizeInBytes) != cudaSuccess) || (!deviceData))
150  {
151  throw std::bad_alloc();
152  }
153  // This has to be done for simulations based on input sources
154  ZeroMatrix();
155 }//end of AllocateMemory
156 //--------------------------------------------------------------------------------------------------
157 
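/*
 * Design note (an assumption-labelled sketch, not part of the original file): the pair
 * _mm_malloc(DATA_ALIGNMENT) + cudaHostRegister keeps the host pointer SSE-aligned (16 B)
 * while still giving the CUDA driver a pinned buffer for fast DMA transfers. A single-call
 * alternative would be cudaHostAlloc, which returns page-locked memory directly:
 *
 *   checkCudaErrors(cudaHostAlloc(reinterpret_cast<void**>(&hostData),
 *                                 sizeInBytes, cudaHostAllocPortable));
 *
 * That variant would have to be released with cudaFreeHost instead of the
 * cudaHostUnregister/_mm_free pair below; the register approach used here decouples the
 * alignment policy from pinning, at the cost of one extra call on each side.
 */
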
/**
 * Free memory.
 * Both on the CPU and GPU side.
 */
void TBaseFloatMatrix::FreeMemory()
{
  // Free CPU memory
  if (hostData)
  {
    cudaHostUnregister(hostData);
    _mm_free(hostData);
  }
  hostData = nullptr;

  // Free GPU memory
  if (deviceData)
  {
    checkCudaErrors(cudaFree(deviceData));
  }
  deviceData = nullptr;
}// end of FreeMemory
//--------------------------------------------------------------------------------------------------

//------------------------------------------------------------------------------------------------//
//--------------------------------------- Private methods ----------------------------------------//
//------------------------------------------------------------------------------------------------//

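/*
 * End-to-end usage sketch (hypothetical subclass and names, not part of the original file):
 * TBaseFloatMatrix is an abstract base, so a derived matrix class typically drives the
 * protected helpers in this order:
 *
 *   TRealMatrix matrix(dimensionSizes);   // hypothetical subclass; ctor calls AllocateMemory()
 *   // ... fill matrix's host data on the CPU ...
 *   matrix.CopyToDevice();                // push data to the GPU
 *   // ... launch CUDA kernels on the device data ...
 *   matrix.CopyFromDevice();              // pull results back
 *   // destructor calls FreeMemory(), releasing both the host and device side
 */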