kspaceFirstOrder3D-CUDA  1.1
The CUDA/C++ implementation of the k-wave toolbox for the time-domain simulation of acoustic wave fields in 3D
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
BaseOutputHDF5Stream.cpp
Go to the documentation of this file.
1 /**
2  * @file BaseOutputHDF5Stream.cpp
3  *
4  * @author Jiri Jaros \n
5  * Faculty of Information Technology \n
6  * Brno University of Technology \n
7  * jarosjir@fit.vutbr.cz
8  *
9  * @brief The implementation file of the class saving RealMatrix data into the output
10  * HDF5 file.
11  *
12  * @version kspaceFirstOrder3D 3.4
13  *
14  * @date 11 July 2012, 10:30 (created) \n
15  * 29 July 2016, 16:55 (revised)
16  *
17  * @section License
18  * This file is part of the C++ extension of the k-Wave Toolbox
19  * (http://www.k-wave.org).\n Copyright (C) 2016 Jiri Jaros and Bradley Treeby.
20  *
21  * This file is part of the k-Wave. k-Wave is free software: you can redistribute it and/or modify
22  * it under the terms of the GNU Lesser General Public License as published by the Free Software
23  * Foundation, either version 3 of the License, or (at your option) any later version.
24  *
25  * k-Wave is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
26  * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
27  * General Public License for more details.
28  *
29  * You should have received a copy of the GNU Lesser General Public License along with k-Wave.
30  * If not, see http://www.gnu.org/licenses/.
31  */
32 
33 #include <cmath>
34 #include <limits>
35 #include <immintrin.h>
36 
39 
40 #include <Logger/Logger.h>
41 #include <Parameters/Parameters.h>
42 
43 
44 //------------------------------------------------------------------------------------------------//
45 //------------------------------------------ Constants -------------------------------------------//
46 //------------------------------------------------------------------------------------------------//
47 
48 //------------------------------------------------------------------------------------------------//
49 //--------------------------------------- Public methods -----------------------------------------//
50 //------------------------------------------------------------------------------------------------//
51 
52 /**
53  * Constructor - there is no sensor mask by default!
54  * it links the HDF5 dataset, source (sampled matrix) and the reduce operator together.
55  * The constructor DOES NOT allocate memory because the size of the sensor mask is not known at
56  * the time the instance of the class is being created.
57  *
58  * @param [in] file - Handle to the HDF5 (output) file
59  * @param [in] rootObjectName - The root object that stores the sample data (dataset or group)
60  * @param [in] sourceMatrix - The source matrix (only real matrices are supported)
61  * @param [in] reduceOp - Reduce operator
62  */
64  TMatrixName& rootObjectName,
65  const TRealMatrix& sourceMatrix,
66  const TReduceOperator reduceOp)
67  : file(file),
68  rootObjectName(rootObjectName),
69  sourceMatrix(sourceMatrix),
70  reduceOp(reduceOp)
71 {
72 
73  }// end of TBaseOutputHDF5Stream
74 //--------------------------------------------------------------------------------------------------
75 
76 
77 /**
78  * Apply post-processing on the buffer (Done on the GPU side as well).
79  */
81 {
82  switch (reduceOp)
83  {
84  case NONE:
85  {
86  // do nothing
87  break;
88  }
89 
90  case RMS:
91  {
92  const float scalingCoeff = 1.0f / (TParameters::GetInstance().Get_nt() -
94 
96  break;
97  }
98 
99  case MAX:
100  {
101  // do nothing
102  break;
103  }
104 
105  case MIN:
106  {
107  // do nothing
108  break;
109  }
110  }// switch
111 
112 }// end of ApplyPostProcessing
113 //-------------------------------------------------------------------------------------------------
114 
115 //------------------------------------------------------------------------------------------------//
116 //-------------------------------------- Protected methods ---------------------------------------//
117 //------------------------------------------------------------------------------------------------//
118 
119 /**
120  * Allocate memory using proper memory alignment.
121  *
122  * @warning - This can routine is not used in the base class (should be used in derived ones).
123  */
125 {
126  // Allocate memory on the CPU side (always)
127  hostBuffer = (float*) _mm_malloc(bufferSize * sizeof (float), DATA_ALIGNMENT);
128 
129  if (!hostBuffer)
130  {
131  throw std::bad_alloc();
132  }
133 
134  // memory allocation done on core 0 - GPU is pinned to the first sockets
135  // we need different initialization for different reduce ops
136  switch (reduceOp)
137  {
138  case NONE :
139  {
140  // zero the matrix - on the CPU side and lock on core 0 (gpu pinned to 1st socket)
141  for (size_t i = 0; i < bufferSize; i++)
142  {
143  hostBuffer[i] = 0.0f;
144  }
145  break;
146  }
147 
148  case RMS :
149  {
150  // zero the matrix - on the CPU side and lock on core 0 (gpu pinned to 1st socket)
151  for (size_t i = 0; i < bufferSize; i++)
152  {
153  hostBuffer[i] = 0.0f;
154  }
155  break;
156  }
157 
158  case MAX :
159  {
160  // set the values to the highest negative float value - on the core 0
161  for (size_t i = 0; i < bufferSize; i++)
162  {
163  hostBuffer[i] = -1 * std::numeric_limits<float>::max();
164  }
165  break;
166  }
167 
168  case MIN :
169  {
170  // set the values to the highest float value - on the core 0
171  for (size_t i = 0; i < bufferSize; i++)
172  {
173  hostBuffer[i] = std::numeric_limits<float>::max();
174  }
175  break;
176  }
177  }// switch
178 
179  // Register Host memory (pin in memory only - no mapped data)
180  checkCudaErrors(cudaHostRegister(hostBuffer,
181  bufferSize * sizeof (float),
182  cudaHostRegisterPortable | cudaHostRegisterMapped));
183  // cudaHostAllocWriteCombined - cannot be used since GPU writes and CPU reads
184 
185  // Map CPU buffer to GPU memory (RAW data) or allocate a GPU buffer (aggregated)
186  if (reduceOp == NONE)
187  {
188  // Register CPU memory for zero-copy
189  checkCudaErrors(cudaHostGetDevicePointer<float>(&deviceBuffer, hostBuffer, 0));
190  }
191  else
192  {
193  // Allocate memory on the GPU side
194  if ((cudaMalloc<float>(&deviceBuffer, bufferSize * sizeof (float))!= cudaSuccess) || (!deviceBuffer))
195  {
196  throw std::bad_alloc();
197  }
198  // if doing aggregation copy initialised arrays on GPU
200  }
201 }// end of AllocateMemory
202 //--------------------------------------------------------------------------------------------------
203 
204 /**
205  * Free memory.
206  *
207  * @warning - This can routine is not used in the base class (should be used in derived ones).
208  */
210 {
211  // free host buffer
212  if (hostBuffer)
213  {
214  cudaHostUnregister(hostBuffer);
215  _mm_free(hostBuffer);
216  }
217  hostBuffer = nullptr;
218 
219  // Free GPU memory
220  if (reduceOp != NONE)
221  {
222  checkCudaErrors(cudaFree(deviceBuffer));
223  }
224  deviceBuffer = nullptr;
225 }// end of FreeMemory
226 //--------------------------------------------------------------------------------------------------
227 
228 /**
229  * Copy data hostBuffer -> deviceBuffer
230  */
232 {
233 
234  checkCudaErrors(cudaMemcpy(deviceBuffer,
235  hostBuffer,
236  bufferSize * sizeof(float),
237  cudaMemcpyHostToDevice));
238 
239 }// end of CopyDataToDevice
240 //--------------------------------------------------------------------------------------------------
241 
242 /**
243  * Copy data deviceBuffer -> hostBuffer
244  */
246 {
247  checkCudaErrors(cudaMemcpy(hostBuffer,
248  deviceBuffer,
249  bufferSize * sizeof(float),
250  cudaMemcpyDeviceToHost));
251 }// end of CopyDataFromDevice
252 //--------------------------------------------------------------------------------------------------
253 
254 
255 //------------------------------------------------------------------------------------------------//
256 //--------------------------------------- Private methods ----------------------------------------//
257 //------------------------------------------------------------------------------------------------//
258 
size_t GetStartTimeIndex() const
Get start time index for sensor recording.
Definition: Parameters.h:225
TBaseOutputHDF5Stream()
Default constructor not allowed.
The header file of the class saving RealMatrix data into the output HDF5 file.
virtual void CopyDataFromDevice()
Copy data deviceBuffer -> hostBuffer.
calculate root mean square
static TParameters & GetInstance()
Get instance of the singleton class.
Definition: Parameters.cpp:70
void PostProcessingRMS(float *samplingBuffer, const float scalingCoeff, const size_t nSamples)
Kernel to calculate post-processing for RMS.
const TReduceOperator reduceOp
Reduce operator.
The header file containing the parameters of the simulation.
The header file of cuda kernels used for data sampling (output streams).
const std::string TMatrixName
Datatype for matrix names.
Definition: MatrixNames.h:45
#define checkCudaErrors(val)
Macro checking cuda errors and printing the file name and line. Inspired by CUDA common checking rout...
Definition: Logger.h:209
virtual void PostProcess()
Apply post-processing on the buffer and flush it to the file.
virtual void AllocateMemory()
A generic function to allocate memory - not used in the base class.
The header file containing a class responsible for printing out info and error messages (stdout...
TReduceOperator
How to aggregate data.
float * deviceBuffer
Temporary buffer on the GPU side - only for aggregated quantities.
store actual data (time series)
virtual void FreeMemory()
A generic function to free memory - not used in the base class.
The class for real matrices.
Definition: RealMatrix.h:45
float * hostBuffer
Temporary buffer for store on the GPU side.
virtual void CopyDataToDevice()
Copy data hostBuffer-> deviceBuffer.
size_t bufferSize
Buffer size.
const int DATA_ALIGNMENT
memory alignment for SSE, SSE2, SSE3, SSE4 (16B)
size_t Get_nt() const
Get Nt value.
Definition: Parameters.h:100
Class wrapping the HDF5 routines.
Definition: HDF5_File.h:500