kspaceFirstOrder3D-CUDA  1.1
The CUDA/C++ implementation of the k-wave toolbox for the time-domain simulation of acoustic wave fields in 3D
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
OutputStreamsCUDAKernels.cu
Go to the documentation of this file.
1 /**
2  * @file OutputStreamsCUDAKernels.cu
3  *
4  * @author Jiri Jaros \n
5  * Faculty of Information Technology \n
6  * Brno University of Technology \n
7  * jarosjir@fit.vutbr.cz
8  *
9  * @brief The implementation file of cuda kernels used for data sampling (output streams).
10  *
11  * @version kspaceFirstOrder3D 3.4
12  *
13  * @date 27 January 2015, 17:21 (created) \n
14  * 10 August 2016, 12:03 (revised)
15  *
16  * @section License
17  * This file is part of the C++ extension of the k-Wave Toolbox
18  * (http://www.k-wave.org).\n Copyright (C) 2016 Jiri Jaros and Bradley Treeby.
19  *
20  * This file is part of the k-Wave. k-Wave is free software: you can redistribute it and/or modify
21  * it under the terms of the GNU Lesser General Public License as published by the Free Software
22  * Foundation, either version 3 of the License, or (at your option) any later version.
23  *
24  * k-Wave is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
25  * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
26  * General Public License for more details.
27  *
28  * You should have received a copy of the GNU Lesser General Public License along with k-Wave.
29  * If not, see http://www.gnu.org/licenses/.
30  */
31 
32 
33 #include <cuda.h>
34 #include <cuda_runtime.h>
35 
38 
39 #include <Parameters/Parameters.h>
40 #include <Logger/Logger.h>
41 #include <Utils/CUDAUtils.cuh>
42 
43 
44 
45 
46 //------------------------------------------------------------------------------------------------//
47 //------------------------------------------ Constants -------------------------------------------//
48 //------------------------------------------------------------------------------------------------//
49 
50 //------------------------------------------------------------------------------------------------//
51 //-------------------------------------- Global routines -----------------------------------------//
52 //------------------------------------------------------------------------------------------------//
53 
54 /**
55  * Get Sampler CUDA Block size.
56  *
57  * @return CUDA block size
58  */
60 {
62 }// end of GetSamplerBlockSize
63 //--------------------------------------------------------------------------------------------------
64 
65 
66 /**
67  * Get sampler CUDA grid size.
68  *
69  * @return CUDA grid size
70  */
72 {
74 }// end of GetSamplerGridSize
75 //--------------------------------------------------------------------------------------------------
76 
77 //------------------------------------------------------------------------------------------------//
78 //------------------------------------ Index mask sampling ---------------------------------------//
79 //------------------------------------------------------------------------------------------------//
80 
81 
82 /**
83  * CUDA kernel to sample data based on index sensor mask. The operator is given by
84  * the template parameter.
85  *
86  * @param [out] samplingBuffer - Buffer to sample data in
87  * @param [in] sourceData - Source matrix
88  * @param [in] sensorData - Sensor mask
89  * @param [in] nSamples - Number of sampled points
90  */
91 template <TBaseOutputHDF5Stream::TReduceOperator reduceOp>
92 __global__ void CUDASampleIndex(float* samplingBuffer,
93  const float* sourceData,
94  const size_t* sensorData,
95  const size_t nSamples)
96 {
97  for (auto i = GetIndex(); i < nSamples; i += GetStride())
98  {
99  switch (reduceOp)
100  {
101  case TBaseOutputHDF5Stream::TReduceOperator::NONE:
102  {
103  samplingBuffer[i] = sourceData[sensorData[i]];
104  break;
105  }
106  case TBaseOutputHDF5Stream::TReduceOperator::RMS:
107  {
108  samplingBuffer[i] += (sourceData[sensorData[i]] * sourceData[sensorData[i]]);
109  break;
110  }
111  case TBaseOutputHDF5Stream::TReduceOperator::MAX:
112  {
113  samplingBuffer[i] = max(samplingBuffer[i], sourceData[sensorData[i]]);
114  break;
115  }
116  case TBaseOutputHDF5Stream::TReduceOperator::MIN:
117  {
118  samplingBuffer[i] = min(samplingBuffer[i], sourceData[sensorData[i]]);
119  break;
120  }
121  }// switch
122  } // for
123 }// end of CUDASampleIndex
124 //--------------------------------------------------------------------------------------------------
125 
126 /**
127  * Sample the source matrix using the index sensor mask and store data in buffer.
128  *
129  * @param [out] samplingBuffer - Buffer to sample data in
130  * @param [in] sourceData - Source matrix
131  * @param [in] sensorData - Sensor mask
132  * @param [in] nSamples - Number of sampled points
133  */
134 template<TBaseOutputHDF5Stream::TReduceOperator reduceOp>
135 void OutputStreamsCUDAKernels::SampleIndex(float* samplingBuffer,
136  const float* sourceData,
137  const size_t* sensorData,
138  const size_t nSamples)
139 {
140  CUDASampleIndex<reduceOp>
142  (samplingBuffer,
143  sourceData,
144  sensorData,
145  nSamples);
146 
147  // check for errors
148  checkCudaErrors(cudaGetLastError());
149 }// end of SampleIndex
150 //--------------------------------------------------------------------------------------------------
151 
152 //------------------------------ Explicit instances of SampleIndex -------------------------------//
153 template
154 void OutputStreamsCUDAKernels::SampleIndex<TBaseOutputHDF5Stream::TReduceOperator::NONE>
155  (float* samplingBuffer,
156  const float* sourceData,
157  const size_t* sensorData,
158  const size_t nSamples);
159 
160 template
161 void OutputStreamsCUDAKernels::SampleIndex<TBaseOutputHDF5Stream::TReduceOperator::RMS>
162  (float* samplingBuffer,
163  const float* sourceData,
164  const size_t* sensorData,
165  const size_t nSamples);
166 
167 template
168 void OutputStreamsCUDAKernels::SampleIndex<TBaseOutputHDF5Stream::TReduceOperator::MAX>
169  (float* samplingBuffer,
170  const float* sourceData,
171  const size_t* sensorData,
172  const size_t nSamples);
173 
174 template
175 void OutputStreamsCUDAKernels::SampleIndex<TBaseOutputHDF5Stream::TReduceOperator::MIN>
176  (float* samplingBuffer,
177  const float* sourceData,
178  const size_t* sensorData,
179  const size_t nSamples);
180 
181 
182 //------------------------------------------------------------------------------------------------//
183 //----------------------------------- Cuboid mask sampling ---------------------------------------//
184 //------------------------------------------------------------------------------------------------//
185 
186 /**
187  * Transform 3D coordinates within the cuboid into 1D coordinates within the matrix being sampled.
188  *
189  * @param [in] cuboidIdx - Cuboid index
190  * @param [in] topLeftCorner - Top left corner
191  * @param [in] bottomRightCorner - Bottom right corner
192  * @param [in] matrixSize - Size of the matrix being sampled
193  * @return 1D index into the matrix being sampled
194  */
195 inline __device__ size_t TransformCoordinates(const size_t cuboidIdx,
196  const dim3& topLeftCorner,
197  const dim3& bottomRightCorner,
198  const dim3& matrixSize)
199 {
200  dim3 localPosition;
201  // calculate the cuboid size
202  dim3 cuboidSize(bottomRightCorner.x - topLeftCorner.x + 1,
203  bottomRightCorner.y - topLeftCorner.y + 1,
204  bottomRightCorner.z - topLeftCorner.z + 1);
205 
206  // find coordinates within the cuboid
207  size_t slabSize = cuboidSize.x * cuboidSize.y;
208  localPosition.z = cuboidIdx / slabSize;
209  localPosition.y = (cuboidIdx % slabSize) / cuboidSize.x;
210  localPosition.x = (cuboidIdx % slabSize) % cuboidSize.x;
211 
212  // transform the coordinates to the global dimensions
213  dim3 globalPosition(localPosition);
214  globalPosition.z += topLeftCorner.z;
215  globalPosition.y += topLeftCorner.y;
216  globalPosition.x += topLeftCorner.x;
217 
218  // calculate 1D index
219  return (globalPosition.z * matrixSize.x * matrixSize.y +
220  globalPosition.y * matrixSize.x +
221  globalPosition.x);
222 }// end of TransformCoordinates
223 //--------------------------------------------------------------------------------------------------
224 
225 /**
226  * CUDA kernel to sample data inside one cuboid, operation is selected by a template parameter.
227 
228  * @param [out] samplingBuffer - Buffer to sample data in
229  * @param [in] sourceData - Source matrix
230  * @param [in] topLeftCorner - Top left corner of the cuboid
231  * @param [in] bottomRightCorner - Bottom right corner of the cuboid
232  * @param [in] matrixSize - Dimension sizes of the matrix being sampled
233  * @param [in] nSamples - Number of grid points inside the cuboid
234  */
235 template <TBaseOutputHDF5Stream::TReduceOperator reduceOp>
236 __global__ void CUDASampleCuboid(float* samplingBuffer,
237  const float* sourceData,
238  const dim3 topLeftCorner,
239  const dim3 bottomRightCorner,
240  const dim3 matrixSize,
241  const size_t nSamples)
242 {
243  // iterate over all grid points
244  for (auto i = GetIndex(); i < nSamples; i += GetStride())
245  {
246  auto Position = TransformCoordinates(i, topLeftCorner, bottomRightCorner, matrixSize);
247  switch (reduceOp)
248  {
249  case TBaseOutputHDF5Stream::TReduceOperator::NONE:
250  {
251  samplingBuffer[i] = sourceData[Position];
252  break;
253  }
254  case TBaseOutputHDF5Stream::TReduceOperator::RMS:
255  {
256  samplingBuffer[i] += (sourceData[Position] * sourceData[Position]);
257  break;
258  }
259  case TBaseOutputHDF5Stream::TReduceOperator::MAX:
260  {
261  samplingBuffer[i] = max(samplingBuffer[i], sourceData[Position]);
262  break;
263  }
264  case TBaseOutputHDF5Stream::TReduceOperator::MIN:
265  {
266  samplingBuffer[i] = min(samplingBuffer[i], sourceData[Position]);
267  break;
268  }
269  }// switch
270  } // for
271 }// end of CUDASampleCuboid
272 //-------------------------------------------------------------------------------------------------
273 
274 
275 /**
276  * Sample data inside one cuboid and store it to buffer. The operation is given in the template
277  * parameter.
278  *
279  * @param [out] samplingBuffer - Buffer to sample data in
280  * @param [in] sourceData - Source matrix
281  * @param [in] topLeftCorner - Top left corner of the cuboid
282  * @param [in] bottomRightCorner - Bottom right corner of the cuboid
283  * @param [in] matrixSize - Size of the matrix being sampled
284  * @param [in] nSamples - Number of grid points inside the cuboid
285  */
286 template<TBaseOutputHDF5Stream::TReduceOperator reduceOp>
287 void OutputStreamsCUDAKernels::SampleCuboid(float* samplingBuffer,
288  const float* sourceData,
289  const dim3 topLeftCorner,
290  const dim3 bottomRightCorner,
291  const dim3 matrixSize,
292  const size_t nSamples)
293 {
294  CUDASampleCuboid<reduceOp>
296  (samplingBuffer,
297  sourceData,
298  topLeftCorner,
299  bottomRightCorner,
300  matrixSize,
301  nSamples);
302  // check for errors
303  checkCudaErrors(cudaGetLastError());
304 }// end of SampleCuboid
305 //--------------------------------------------------------------------------------------------------
306 
307 
308 //----------------------------- Explicit instances of SampleCuboid -------------------------------//
309 template
310 void OutputStreamsCUDAKernels::SampleCuboid<TBaseOutputHDF5Stream::TReduceOperator::NONE>
311  (float* samplingBuffer,
312  const float* sourceData,
313  const dim3 topLeftCorner,
314  const dim3 bottomRightCorner,
315  const dim3 matrixSize,
316  const size_t nSamples);
317 template
318 void OutputStreamsCUDAKernels::SampleCuboid<TBaseOutputHDF5Stream::TReduceOperator::RMS>
319  (float* samplingBuffer,
320  const float* sourceData,
321  const dim3 topLeftCorner,
322  const dim3 bottomRightCorner,
323  const dim3 matrixSize,
324  const size_t nSamples);
325 template
326 void OutputStreamsCUDAKernels::SampleCuboid<TBaseOutputHDF5Stream::TReduceOperator::MAX>
327  (float* samplingBuffer,
328  const float* sourceData,
329  const dim3 topLeftCorner,
330  const dim3 bottomRightCorner,
331  const dim3 matrixSize,
332  const size_t nSamples);
333 template
334 void OutputStreamsCUDAKernels::SampleCuboid<TBaseOutputHDF5Stream::TReduceOperator::MIN>
335  (float* samplingBuffer,
336  const float* sourceData,
337  const dim3 topLeftCorner,
338  const dim3 bottomRightCorner,
339  const dim3 matrixSize,
340  const size_t nSamples);
341 
342 //------------------------------------------------------------------------------------------------//
343 //--------------------------------- Whole domain based sampling ----------------------------------//
344 //------------------------------------------------------------------------------------------------//
345 
346 /**
347  * CUDA kernel to sample and aggregate the source matrix on the whole domain and apply a reduce
348  * operator.
349  *
350  * @param [in,out] samplingBuffer - Buffer to sample data in
351  * @param [in] sourceData - Source matrix
352  * @param [in] nSamples - Number of sampled points
353  */
354 template <TBaseOutputHDF5Stream::TReduceOperator reduceOp>
355 __global__ void CUDASampleAll(float* samplingBuffer,
356  const float* sourceData,
357  const size_t nSamples)
358 {
359  for (size_t i = GetIndex(); i < nSamples; i += GetStride())
360  {
361  switch (reduceOp)
362  {
363  case TBaseOutputHDF5Stream::TReduceOperator::RMS:
364  {
365  samplingBuffer[i] += (sourceData[i] * sourceData[i]);
366  break;
367  }
368  case TBaseOutputHDF5Stream::TReduceOperator::MAX:
369  {
370  samplingBuffer[i] = max(samplingBuffer[i], sourceData[i]);
371  break;
372  }
373  case TBaseOutputHDF5Stream::TReduceOperator::MIN:
374  {
375  samplingBuffer[i] = min(samplingBuffer[i], sourceData[i]);
376  break;
377  }
378  }
379  }
380 }// end of CUDASampleAll
381 //--------------------------------------------------------------------------------------------------
382 
383 
384 /**
385  * Sample and the whole domain and apply a defined operator.
386  *
387  * @param [in,out] samplingBuffer - Buffer to sample data in
388  * @param [in] sourceData - Source matrix
389  * @param [in] nSamples - Number of sampled points
390  */
391 template<TBaseOutputHDF5Stream::TReduceOperator reduceOp>
392 void OutputStreamsCUDAKernels::SampleAll(float* samplingBuffer,
393  const float* sourceData,
394  const size_t nSamples)
395 {
396  CUDASampleAll<reduceOp>
398  (samplingBuffer,
399  sourceData,
400  nSamples);
401  // check for errors
402  checkCudaErrors(cudaGetLastError());
403 }// end of SampleMaxAll
404 //--------------------------------------------------------------------------------------------------
405 
406 
407 //------------------------------ Explicit instances of SampleAll ---------------------------------//
408 template
409 void OutputStreamsCUDAKernels::SampleAll<TBaseOutputHDF5Stream::TReduceOperator::RMS>
410  (float* samplingBuffer,
411  const float* sourceData,
412  const size_t nSamples);
413 template
414 void OutputStreamsCUDAKernels::SampleAll<TBaseOutputHDF5Stream::TReduceOperator::MAX>
415  (float* samplingBuffer,
416  const float* sourceData,
417  const size_t nSamples);
418 template
419 void OutputStreamsCUDAKernels::SampleAll<TBaseOutputHDF5Stream::TReduceOperator::MIN>
420  (float* samplingBuffer,
421  const float* sourceData,
422  const size_t nSamples);
423 
424 
425 //----------------------------------------------------------------------------//
426 //------------------------------ Post-processing -----------------------------//
427 //----------------------------------------------------------------------------//
428 
429 
430 /**
431  * CUDA kernel to apply post-processing for RMS
432  * @param [in, out] samplingBuffer - Buffer to apply post-processing on
433  * @param [in] scalingCoeff - Scaling coeficinet for RMS
434  * @param [in] nSamples - Number of elements
435  */
436 __global__ void CUDAPostProcessingRMS(float* samplingBuffer,
437  const float scalingCoeff,
438  const size_t nSamples)
439 {
440  for (size_t i = GetIndex(); i < nSamples; i += GetStride())
441  {
442  samplingBuffer[i] = sqrt(samplingBuffer[i] * scalingCoeff);
443  }
444 }// end of CUDAPostProcessingRMS
445 //--------------------------------------------------------------------------------------------------
446 
447 
448 /**
449  * Calculate post-processing for RMS.
450  *
451  * @param [in, out] samplingBuffer - Buffer to apply post-processing on
452  * @param [in] scalingCoeff - Scaling coefficent
453  * @param [in] nSamples - Number of elements
454  */
456  const float scalingCoeff,
457  const size_t nSamples)
458 {
459  CUDAPostProcessingRMS<<<GetSamplerGridSize(),GetSamplerBlockSize()>>>
460  (samplingBuffer, scalingCoeff, nSamples);
461 
462  // check for errors
463  checkCudaErrors(cudaGetLastError());
464 }// end of PostProcessingRMS
465 //--------------------------------------------------------------------------------------------------
int GetSamplerGridSize1D() const
Get Number of blocks for the 1D data sampling kernels.
__global__ void CUDASampleIndex(float *samplingBuffer, const float *sourceData, const size_t *sensorData, const size_t nSamples)
__global__ void CUDAPostProcessingRMS(float *samplingBuffer, const float scalingCoeff, const size_t nSamples)
The header file of the class saving RealMatrix data into the output HDF5 file.
void SampleIndex(float *samplingBuffer, const float *sourceData, const size_t *sensorData, const size_t nSamples)
Kernel to sample quantities using an index sensor mask.
static TParameters & GetInstance()
Get instance of the singleton class.
Definition: Parameters.cpp:70
__device__ unsigned int GetIndex()
Get global 1D coordinate for 1D CUDA block.
Definition: CUDAUtils.cuh:57
void PostProcessingRMS(float *samplingBuffer, const float scalingCoeff, const size_t nSamples)
Kernel to calculate post-processing for RMS.
The header file containing the parameters of the simulation.
The header file of cuda kernels used for data sampling (output streams).
#define checkCudaErrors(val)
Macro checking cuda errors and printing the file name and line. Inspired by CUDA common checking rout...
Definition: Logger.h:209
The header file containing a class responsible for printing out info and error messages (stdout...
int GetSamplerGridSize()
const TCUDAParameters & GetCudaParameters() const
Get class with CUDA Parameters (runtime setup), cosnt version.
Definition: Parameters.h:76
int GetSamplerBlockSize1D() const
Get number of threads for the 1D data sampling kernels.
The header file with CUDA utility functions. These routines are to be inlined.
__global__ void CUDASampleCuboid(float *samplingBuffer, const float *sourceData, const dim3 topLeftCorner, const dim3 bottomRightCorner, const dim3 matrixSize, const size_t nSamples)
__device__ unsigned int GetStride()
Get x-stride for 3D CUDA block (for processing multiple grid points by a single thread).
Definition: CUDAUtils.cuh:69
int GetSamplerBlockSize()
void SampleCuboid(float *samplingBuffer, const float *sourceData, const dim3 topLeftCorner, const dim3 bottomRightCorner, const dim3 matrixSize, const size_t nSamples)
Kernel to sample quantities inside one cuboid.
__global__ void CUDASampleAll(float *samplingBuffer, const float *sourceData, const size_t nSamples)
__device__ size_t TransformCoordinates(const size_t cuboidIdx, const dim3 &topLeftCorner, const dim3 &bottomRightCorner, const dim3 &matrixSize)
void SampleAll(float *samplingBuffer, const float *sourceData, const size_t nSamples)
Kernel to sample of the quantity on the whole domain.