kspaceFirstOrder3D-CUDA 1.1
The CUDA/C++ implementation of the k-wave toolbox for the time-domain simulation of acoustic wave fields in 3D
SolverCUDAKernels.cu
/**
 * @file        SolverCUDAKernels.cu
 *
 * @author      Jiri Jaros \n
 *              Faculty of Information Technology \n
 *              Brno University of Technology \n
 *              jarosjir@fit.vutbr.cz
 *
 * @brief       The implementation file containing all CUDA kernels for the GPU implementation.
 *
 * @version     kspaceFirstOrder3D 3.4
 *
 * @date        11 March 2013, 13:10 (created) \n
 *              27 July 2016, 15:09 (revised)
 *
 * @section License
 * This file is part of the C++ extension of the k-Wave Toolbox
 * (http://www.k-wave.org).\n Copyright (C) 2016 Jiri Jaros and Bradley Treeby.
 *
 * This file is part of k-Wave. k-Wave is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later version.
 *
 * k-Wave is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
 * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
 * General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License along with k-Wave.
 * If not, see http://www.gnu.org/licenses/.
 */

#include <cuComplex.h>

// kernel declarations and the device constants structure
// (include paths assumed from the project layout used below)
#include <KSpaceSolver/SolverCUDAKernels.cuh>
#include <Parameters/CUDADeviceConstants.cuh>

#include <Logger/Logger.h>
#include <Utils/CUDAUtils.cuh>

//------------------------------------------------------------------------------------------------//
//------------------------------------------ Constants -------------------------------------------//
//------------------------------------------------------------------------------------------------//

//------------------------------------------------------------------------------------------------//
//------------------------------------------ Variables -------------------------------------------//
//------------------------------------------------------------------------------------------------//


/**
 * @var      cudaDeviceConstants
 * @brief    This variable holds the basic simulation constants for the GPU.
 * @details  This variable holds the necessary simulation constants in CUDA GPU constant memory.
 *           The variable is defined in TCUDADeviceConstants.cu.
 */
extern __constant__ TCUDADeviceConstants cudaDeviceConstants;


//------------------------------------------------------------------------------------------------//
//--------------------------------------- Global methods -----------------------------------------//
//------------------------------------------------------------------------------------------------//

/**
 * Get block size for 1D kernels.
 * @return 1D block size
 */
inline int GetSolverBlockSize1D()
{
  // assumption: the launch configuration is read from the parameter singleton
  // set up at solver initialisation
  return TParameters::GetInstance().GetCudaParameters().GetSolverBlockSize1D();
};// end of GetSolverBlockSize1D
//--------------------------------------------------------------------------------------------------

/**
 * Get grid size for 1D kernels.
 * @return 1D grid size
 */
inline int GetSolverGridSize1D()
{
  return TParameters::GetInstance().GetCudaParameters().GetSolverGridSize1D();
};// end of GetSolverGridSize1D
//--------------------------------------------------------------------------------------------------

/**
 * Get block size for the transposition kernels.
 * @return 3D block size
 */
inline dim3 GetSolverTransposeBlockSize()
{
  return TParameters::GetInstance().GetCudaParameters().GetSolverTransposeBlockSize();
};// end of GetSolverTransposeBlockSize()
//--------------------------------------------------------------------------------------------------

/**
 * Get grid size for complex 3D kernels.
 * @return 3D grid size
 */
inline dim3 GetSolverTransposeGirdSize()
{
  return TParameters::GetInstance().GetCudaParameters().GetSolverTransposeGirdSize();
};// end of GetSolverTransposeGirdSize()
//--------------------------------------------------------------------------------------------------

//------------------------------------------------------------------------------------------------//
//--------------------------------------- Public routines ----------------------------------------//
//------------------------------------------------------------------------------------------------//

/**
 * Kernel to find out the version of the code.
 * The list of GPUs can be found at https://en.wikipedia.org/wiki/CUDA
 *
 * @param [out] cudaCodeVersion
 */
__global__ void CUDAGetCUDACodeVersion(int* cudaCodeVersion)
{
  *cudaCodeVersion = -1;

  #if (__CUDA_ARCH__ == 530)
    *cudaCodeVersion = 53;
  #elif (__CUDA_ARCH__ == 520)
    *cudaCodeVersion = 52;
  #elif (__CUDA_ARCH__ == 500)
    *cudaCodeVersion = 50;
  #elif (__CUDA_ARCH__ == 370)
    *cudaCodeVersion = 37;
  #elif (__CUDA_ARCH__ == 350)
    *cudaCodeVersion = 35;
  #elif (__CUDA_ARCH__ == 320)
    *cudaCodeVersion = 32;
  #elif (__CUDA_ARCH__ == 300)
    *cudaCodeVersion = 30;
  #elif (__CUDA_ARCH__ == 210)
    *cudaCodeVersion = 21;
  #elif (__CUDA_ARCH__ == 200)
    *cudaCodeVersion = 20;
  #endif
}// end of CUDAGetCUDACodeVersion
//--------------------------------------------------------------------------------------------------

/**
 * Get the CUDA architecture and GPU code version the code was compiled with.
 * @return the CUDA code version the code was compiled for
 */
int SolverCUDAKernels::GetCUDACodeVersion()
{
  // host and device pointers, data copied over zero copy memory
  int* hCudaCodeVersion;
  int* dCudaCodeVersion;

  // returned value
  int cudaCodeVersion = 0;
  cudaError_t cudaError;

  // allocate for zero copy
  cudaError = cudaHostAlloc<int>(&hCudaCodeVersion,
                                 sizeof(int),
                                 cudaHostRegisterPortable | cudaHostRegisterMapped);

  // if the device is busy, return 0 - the GPU is not supported
  if (cudaError == cudaSuccess)
  {
    checkCudaErrors(cudaHostGetDevicePointer<int>(&dCudaCodeVersion, hCudaCodeVersion, 0));

    // find out the CUDA code version
    CUDAGetCUDACodeVersion<<<1,1>>>(dCudaCodeVersion);
    cudaDeviceSynchronize();
    if (cudaGetLastError() != cudaSuccess)
    {
      // the GPU architecture is not supported
      cudaCodeVersion = 0;
    }
    else
    {
      cudaCodeVersion = *hCudaCodeVersion;
    }

    checkCudaErrors(cudaFreeHost(hCudaCodeVersion));
  }

  return (cudaCodeVersion);
}// end of GetCUDACodeVersion
//--------------------------------------------------------------------------------------------------
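
/*
 * Usage sketch (illustrative only; names taken from this file): a host-side
 * caller can probe whether the current GPU can run the compiled kernels
 * before starting the simulation.
 *
 *   const int codeVersion = SolverCUDAKernels::GetCUDACodeVersion();
 *   if (codeVersion == 0)
 *   {
 *     // the probe kernel did not run - this GPU architecture is not supported
 *   }
 */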


/**
 * CUDA kernel to calculate ux_sgx, uy_sgy, uz_sgz. Default (heterogeneous case).
 *
 * @param [in, out] ux_sgx
 * @param [in, out] uy_sgy
 * @param [in, out] uz_sgz
 * @param [in]      ifft_x
 * @param [in]      ifft_y
 * @param [in]      ifft_z
 * @param [in]      dt_rho0_sgx
 * @param [in]      dt_rho0_sgy
 * @param [in]      dt_rho0_sgz
 * @param [in]      pml_x
 * @param [in]      pml_y
 * @param [in]      pml_z
 */
__global__ void CUDAComputeVelocity(float*       ux_sgx,
                                    float*       uy_sgy,
                                    float*       uz_sgz,
                                    const float* ifft_x,
                                    const float* ifft_y,
                                    const float* ifft_z,
                                    const float* dt_rho0_sgx,
                                    const float* dt_rho0_sgy,
                                    const float* dt_rho0_sgz,
                                    const float* pml_x,
                                    const float* pml_y,
                                    const float* pml_z)
{
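  // Grid-stride loop: the launch grid may hold fewer threads than nElements,
  // so each thread strides over the array until every element is processed.
  // The update applies the PML in two half steps around the pressure-gradient
  // term: u = pml .* (pml .* u - dt/rho0 .* ifft(grad p)).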
  for (auto i = GetIndex(); i < cudaDeviceConstants.nElements; i += GetStride())
  {
    const dim3 coords = GetReal3DCoords(i);

    const float ifft_x_el = cudaDeviceConstants.fftDivider * ifft_x[i] * dt_rho0_sgx[i];
    const float ifft_y_el = cudaDeviceConstants.fftDivider * ifft_y[i] * dt_rho0_sgy[i];
    const float ifft_z_el = cudaDeviceConstants.fftDivider * ifft_z[i] * dt_rho0_sgz[i];

    const float pml_x_data = pml_x[coords.x];
    const float pml_y_data = pml_y[coords.y];
    const float pml_z_data = pml_z[coords.z];

    ux_sgx[i] = (ux_sgx[i] * pml_x_data - ifft_x_el) * pml_x_data;
    uy_sgy[i] = (uy_sgy[i] * pml_y_data - ifft_y_el) * pml_y_data;
    uz_sgz[i] = (uz_sgz[i] * pml_z_data - ifft_z_el) * pml_z_data;
  }
}// end of CUDAComputeVelocity
//--------------------------------------------------------------------------------------------------

/**
 * Interface to the CUDA kernel computing new values of ux_sgx, uy_sgy, uz_sgz.
 * Default (heterogeneous case).
 *
 * @param [in, out] ux_sgx
 * @param [in, out] uy_sgy
 * @param [in, out] uz_sgz
 * @param [in]      ifft_x
 * @param [in]      ifft_y
 * @param [in]      ifft_z
 * @param [in]      dt_rho0_sgx
 * @param [in]      dt_rho0_sgy
 * @param [in]      dt_rho0_sgz
 * @param [in]      pml_x
 * @param [in]      pml_y
 * @param [in]      pml_z
 */
void SolverCUDAKernels::ComputeVelocity(TRealMatrix&       ux_sgx,
                                        TRealMatrix&       uy_sgy,
                                        TRealMatrix&       uz_sgz,
                                        const TRealMatrix& ifft_x,
                                        const TRealMatrix& ifft_y,
                                        const TRealMatrix& ifft_z,
                                        const TRealMatrix& dt_rho0_sgx,
                                        const TRealMatrix& dt_rho0_sgy,
                                        const TRealMatrix& dt_rho0_sgz,
                                        const TRealMatrix& pml_x,
                                        const TRealMatrix& pml_y,
                                        const TRealMatrix& pml_z)
{
  CUDAComputeVelocity<<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
                     (ux_sgx.GetDeviceData(),
                      uy_sgy.GetDeviceData(),
                      uz_sgz.GetDeviceData(),
                      ifft_x.GetDeviceData(),
                      ifft_y.GetDeviceData(),
                      ifft_z.GetDeviceData(),
                      dt_rho0_sgx.GetDeviceData(),
                      dt_rho0_sgy.GetDeviceData(),
                      dt_rho0_sgz.GetDeviceData(),
                      pml_x.GetDeviceData(),
                      pml_y.GetDeviceData(),
                      pml_z.GetDeviceData());

  // check for errors
  checkCudaErrors(cudaGetLastError());
}// end of ComputeVelocity
//--------------------------------------------------------------------------------------------------
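
/*
 * For reference (a sketch, not code from this file): the update above mirrors
 * the k-Wave MATLAB step
 *   ux_sgx = pml_x .* (pml_x .* ux_sgx - dt ./ rho0_sgx .* real(ifftn(...)))
 * with the inverse-FFT normalisation folded into cudaDeviceConstants.fftDivider.
 */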


/**
 * CUDA kernel to calculate ux_sgx, uy_sgy, uz_sgz.
 * This is the case for rho0 being a scalar and a uniform grid.
 *
 * @param [in, out] ux_sgx - new value of ux
 * @param [in, out] uy_sgy - new value of uy
 * @param [in, out] uz_sgz - new value of uz
 * @param [in]      ifft_x - gradient for X
 * @param [in]      ifft_y - gradient for Y
 * @param [in]      ifft_z - gradient for Z
 * @param [in]      pml_x
 * @param [in]      pml_y
 * @param [in]      pml_z
 */
__global__ void CUDAComputeVelocityScalarUniform(float*       ux_sgx,
                                                 float*       uy_sgy,
                                                 float*       uz_sgz,
                                                 const float* ifft_x,
                                                 const float* ifft_y,
                                                 const float* ifft_z,
                                                 const float* pml_x,
                                                 const float* pml_y,
                                                 const float* pml_z)
{
  // scalar dt/rho0 combined with the FFT normalisation (restored here on the
  // assumption that rho0_sg*_scalar holds dt/rho0 on the staggered grids)
  const float Divider_X = cudaDeviceConstants.fftDivider * cudaDeviceConstants.rho0_sgx_scalar;
  const float Divider_Y = cudaDeviceConstants.fftDivider * cudaDeviceConstants.rho0_sgy_scalar;
  const float Divider_Z = cudaDeviceConstants.fftDivider * cudaDeviceConstants.rho0_sgz_scalar;

  for (auto i = GetIndex(); i < cudaDeviceConstants.nElements; i += GetStride())
  {
    const dim3 coords = GetReal3DCoords(i);

    const float pml_x_el = pml_x[coords.x];
    const float pml_y_el = pml_y[coords.y];
    const float pml_z_el = pml_z[coords.z];

    ux_sgx[i] = (ux_sgx[i] * pml_x_el - Divider_X * ifft_x[i]) * pml_x_el;
    uy_sgy[i] = (uy_sgy[i] * pml_y_el - Divider_Y * ifft_y[i]) * pml_y_el;
    uz_sgz[i] = (uz_sgz[i] * pml_z_el - Divider_Z * ifft_z[i]) * pml_z_el;
  }// for
}// end of CUDAComputeVelocityScalarUniform
//--------------------------------------------------------------------------------------------------

/**
 * Interface to the CUDA kernel computing new values of ux_sgx, uy_sgy, uz_sgz.
 * This is the case for rho0 being a scalar and a uniform grid.
 *
 * @param [in, out] ux_sgx
 * @param [in, out] uy_sgy
 * @param [in, out] uz_sgz
 * @param [in]      ifft_x
 * @param [in]      ifft_y
 * @param [in]      ifft_z
 * @param [in]      pml_x
 * @param [in]      pml_y
 * @param [in]      pml_z
 */
void SolverCUDAKernels::ComputeVelocityScalarUniform(TRealMatrix&       ux_sgx,
                                                     TRealMatrix&       uy_sgy,
                                                     TRealMatrix&       uz_sgz,
                                                     const TRealMatrix& ifft_x,
                                                     const TRealMatrix& ifft_y,
                                                     const TRealMatrix& ifft_z,
                                                     const TRealMatrix& pml_x,
                                                     const TRealMatrix& pml_y,
                                                     const TRealMatrix& pml_z)
{
  CUDAComputeVelocityScalarUniform<<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
                                  (ux_sgx.GetDeviceData(),
                                   uy_sgy.GetDeviceData(),
                                   uz_sgz.GetDeviceData(),
                                   ifft_x.GetDeviceData(),
                                   ifft_y.GetDeviceData(),
                                   ifft_z.GetDeviceData(),
                                   pml_x.GetDeviceData(),
                                   pml_y.GetDeviceData(),
                                   pml_z.GetDeviceData());
  // check for errors
  checkCudaErrors(cudaGetLastError());
}// end of ComputeVelocityScalarUniform
//--------------------------------------------------------------------------------------------------


/**
 * CUDA kernel to calculate ux_sgx, uy_sgy and uz_sgz.
 * This is the case for rho0 being a scalar and a non-uniform grid.
 *
 * @param [in, out] ux_sgx     - updated value of ux_sgx
 * @param [in, out] uy_sgy     - updated value of uy_sgy
 * @param [in, out] uz_sgz     - updated value of uz_sgz
 * @param [in]      ifft_x     - gradient of X
 * @param [in]      ifft_y     - gradient of Y
 * @param [in]      ifft_z     - gradient of Z
 * @param [in]      dxudxn_sgx - matrix dx shift
 * @param [in]      dyudyn_sgy - matrix dy shift
 * @param [in]      dzudzn_sgz - matrix dz shift
 * @param [in]      pml_x      - matrix of pml_x
 * @param [in]      pml_y      - matrix of pml_y
 * @param [in]      pml_z      - matrix of pml_z
 */
__global__ void CUDAComputeVelocityScalarNonuniform(float*       ux_sgx,
                                                    float*       uy_sgy,
                                                    float*       uz_sgz,
                                                    const float* ifft_x,
                                                    const float* ifft_y,
                                                    const float* ifft_z,
                                                    const float* dxudxn_sgx,
                                                    const float* dyudyn_sgy,
                                                    const float* dzudzn_sgz,
                                                    const float* pml_x,
                                                    const float* pml_y,
                                                    const float* pml_z)
{
  // scalar dt/rho0 combined with the FFT normalisation (restored under the
  // same assumption as in the uniform-grid kernel above)
  const float Divider_X = cudaDeviceConstants.fftDivider * cudaDeviceConstants.rho0_sgx_scalar;
  const float Divider_Y = cudaDeviceConstants.fftDivider * cudaDeviceConstants.rho0_sgy_scalar;
  const float Divider_Z = cudaDeviceConstants.fftDivider * cudaDeviceConstants.rho0_sgz_scalar;

  for (auto i = GetIndex(); i < cudaDeviceConstants.nElements; i += GetStride())
  {
    const dim3 coords = GetReal3DCoords(i);

    const float pml_x_el = pml_x[coords.x];
    const float pml_y_el = pml_y[coords.y];
    const float pml_z_el = pml_z[coords.z];

    const float ifft_x_el = Divider_X * dxudxn_sgx[coords.x] * ifft_x[i];
    const float ifft_y_el = Divider_Y * dyudyn_sgy[coords.y] * ifft_y[i];
    const float ifft_z_el = Divider_Z * dzudzn_sgz[coords.z] * ifft_z[i];

    ux_sgx[i] = (ux_sgx[i] * pml_x_el - ifft_x_el) * pml_x_el;
    uy_sgy[i] = (uy_sgy[i] * pml_y_el - ifft_y_el) * pml_y_el;
    uz_sgz[i] = (uz_sgz[i] * pml_z_el - ifft_z_el) * pml_z_el;
  }// for
}// end of CUDAComputeVelocityScalarNonuniform
//--------------------------------------------------------------------------------------------------

/**
 * Interface to calculate ux_sgx, uy_sgy and uz_sgz.
 * This is the case for rho0 being a scalar and a non-uniform grid.
 * @param [in, out] ux_sgx     - updated value of ux_sgx
 * @param [in, out] uy_sgy     - updated value of uy_sgy
 * @param [in, out] uz_sgz     - updated value of uz_sgz
 * @param [in]      ifft_x     - gradient of X
 * @param [in]      ifft_y     - gradient of Y
 * @param [in]      ifft_z     - gradient of Z
 * @param [in]      dxudxn_sgx - matrix dx shift
 * @param [in]      dyudyn_sgy - matrix dy shift
 * @param [in]      dzudzn_sgz - matrix dz shift
 * @param [in]      pml_x      - matrix of pml_x
 * @param [in]      pml_y      - matrix of pml_y
 * @param [in]      pml_z      - matrix of pml_z
 */
void SolverCUDAKernels::ComputeVelocityScalarNonuniform(TRealMatrix&       ux_sgx,
                                                        TRealMatrix&       uy_sgy,
                                                        TRealMatrix&       uz_sgz,
                                                        const TRealMatrix& ifft_x,
                                                        const TRealMatrix& ifft_y,
                                                        const TRealMatrix& ifft_z,
                                                        const TRealMatrix& dxudxn_sgx,
                                                        const TRealMatrix& dyudyn_sgy,
                                                        const TRealMatrix& dzudzn_sgz,
                                                        const TRealMatrix& pml_x,
                                                        const TRealMatrix& pml_y,
                                                        const TRealMatrix& pml_z)
{
  CUDAComputeVelocityScalarNonuniform<<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
                                     (ux_sgx.GetDeviceData(),
                                      uy_sgy.GetDeviceData(),
                                      uz_sgz.GetDeviceData(),
                                      ifft_x.GetDeviceData(),
                                      ifft_y.GetDeviceData(),
                                      ifft_z.GetDeviceData(),
                                      dxudxn_sgx.GetDeviceData(),
                                      dyudyn_sgy.GetDeviceData(),
                                      dzudzn_sgz.GetDeviceData(),
                                      pml_x.GetDeviceData(),
                                      pml_y.GetDeviceData(),
                                      pml_z.GetDeviceData());

  // check for errors
  checkCudaErrors(cudaGetLastError());
}// end of ComputeVelocityScalarNonuniform
//--------------------------------------------------------------------------------------------------


/**
 * CUDA kernel adding transducer data to ux_sgx.
 *
 * @param [in, out] ux_sgx            - Here we add the signal
 * @param [in]      u_source_index    - Where to add the signal (source)
 * @param [in, out] delay_mask        - Delay mask to push the signal in the domain (incremented per invocation)
 * @param [in]      transducer_signal - Transducer signal
 */
__global__ void CUDAAddTransducerSource(float*        ux_sgx,
                                        const size_t* u_source_index,
                                        size_t*       delay_mask,
                                        const float*  transducer_signal)
{
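  // Each source point reads the transducer signal at its own offset given by
  // delay_mask; the mask is then advanced so the next invocation reads the
  // following sample of the signal.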
  for (auto i = GetIndex(); i < cudaDeviceConstants.u_source_index_size; i += GetStride())
  {
    ux_sgx[u_source_index[i]] += transducer_signal[delay_mask[i]];
    delay_mask[i]++;
  }
}// end of CUDAAddTransducerSource
//------------------------------------------------------------------------------

/**
 * Interface to kernel adding transducer data to ux_sgx.
 *
 * @param [in, out] ux_sgx            - Here we add the signal
 * @param [in]      u_source_index    - Where to add the signal (source)
 * @param [in, out] delay_mask        - Delay mask to push the signal in the domain (incremented per invocation)
 * @param [in]      transducer_signal - Transducer signal
 */
void SolverCUDAKernels::AddTransducerSource(TRealMatrix&        ux_sgx,
                                            const TIndexMatrix& u_source_index,
                                            TIndexMatrix&       delay_mask,
                                            const TRealMatrix&  transducer_signal)
{
  const auto u_source_index_size = u_source_index.GetElementCount();

  // Grid size is calculated based on the source size
  const int gridSize = (static_cast<int>(u_source_index_size) < (GetSolverGridSize1D() * GetSolverBlockSize1D()))
                       ? (u_source_index_size + GetSolverBlockSize1D() - 1) / GetSolverBlockSize1D()
                       : GetSolverGridSize1D();

  CUDAAddTransducerSource<<<gridSize, GetSolverBlockSize1D()>>>
                         (ux_sgx.GetDeviceData(),
                          u_source_index.GetDeviceData(),
                          delay_mask.GetDeviceData(),
                          transducer_signal.GetDeviceData());
  // check for errors
  checkCudaErrors(cudaGetLastError());
}// end of AddTransducerSource
//--------------------------------------------------------------------------------------------------


/**
 * CUDA kernel to add in velocity source terms.
 *
 * @param [in, out] uxyz_sgxyz     - velocity matrix to update
 * @param [in]      u_source_input - Source input to add
 * @param [in]      u_source_index - Index matrix
 * @param [in]      t_index        - Actual time step
 */
__global__ void CUDAAddVelocitySource(float*        uxyz_sgxyz,
                                      const float*  u_source_input,
                                      const size_t* u_source_index,
                                      const size_t  t_index)
{
  // Set 1D or 2D step for source
  auto index2D = (cudaDeviceConstants.u_source_many == 0) ? t_index : t_index * cudaDeviceConstants.u_source_index_size;
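  // u_source_many == 0: a single signal shared by all source points;
  // otherwise each source point has its own signal (2D source input).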

  // Dirichlet source (mode 0 assumed): overwrite the velocity at the source points
  if (cudaDeviceConstants.u_source_mode == 0)
  {
    for (auto i = GetIndex(); i < cudaDeviceConstants.u_source_index_size; i += GetStride())
    {
      uxyz_sgxyz[u_source_index[i]] = (cudaDeviceConstants.u_source_many == 0) ? u_source_input[index2D]
                                                                               : u_source_input[index2D + i];
    }// for
  }// end of Dirichlet

  // additive source (mode 1 assumed)
  if (cudaDeviceConstants.u_source_mode == 1)
  {
    for (auto i = GetIndex(); i < cudaDeviceConstants.u_source_index_size; i += GetStride())
    {
      uxyz_sgxyz[u_source_index[i]] += (cudaDeviceConstants.u_source_many == 0) ? u_source_input[index2D]
                                                                                : u_source_input[index2D + i];
    }
  }
}// end of CUDAAddVelocitySource
//--------------------------------------------------------------------------------------------------


/**
 * Interface to CUDA kernel adding in velocity source terms.
 *
 * @param [in, out] uxyz_sgxyz     - Velocity matrix to update
 * @param [in]      u_source_input - Source input to add
 * @param [in]      u_source_index - Index matrix
 * @param [in]      t_index        - Actual time step
 */
void SolverCUDAKernels::AddVelocitySource(TRealMatrix&        uxyz_sgxyz,
                                          const TRealMatrix&  u_source_input,
                                          const TIndexMatrix& u_source_index,
                                          const size_t        t_index)
{
  const auto u_source_index_size = u_source_index.GetElementCount();

  // Grid size is calculated based on the source size:
  // for small sources a custom number of thread blocks is created,
  // otherwise the standard number is used
  const int gridSize = (static_cast<int>(u_source_index_size) < (GetSolverGridSize1D() * GetSolverBlockSize1D()))
                       ? (u_source_index_size + GetSolverBlockSize1D() - 1) / GetSolverBlockSize1D()
                       : GetSolverGridSize1D();

  CUDAAddVelocitySource<<<gridSize, GetSolverBlockSize1D()>>>
                       (uxyz_sgxyz.GetDeviceData(),
                        u_source_input.GetDeviceData(),
                        u_source_index.GetDeviceData(),
                        t_index);

  // check for errors
  checkCudaErrors(cudaGetLastError());
}// end of AddVelocitySource
//-------------------------------------------------------------------------------------------------

/**
 * CUDA kernel to add p_source to acoustic density.
 *
 * @param [out] rhox           - Acoustic density
 * @param [out] rhoy           - Acoustic density
 * @param [out] rhoz           - Acoustic density
 * @param [in]  p_source_input - Source input to add
 * @param [in]  p_source_index - Index matrix with source
 * @param [in]  t_index        - Actual timestep
 */
__global__ void CUDAAddPressureSource(float*        rhox,
                                      float*        rhoy,
                                      float*        rhoz,
                                      const float*  p_source_input,
                                      const size_t* p_source_index,
                                      const size_t  t_index)
{
  // Set 1D or 2D step for source
  auto index2D = (cudaDeviceConstants.p_source_many == 0) ? t_index : t_index * cudaDeviceConstants.p_source_index_size;
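  // p_source_many == 0: a single signal shared by all source points;
  // otherwise each source point has its own signal (2D source input).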

  // Dirichlet source (mode 0 assumed): overwrite the density at the source points
  if (cudaDeviceConstants.p_source_mode == 0)
  {
    if (cudaDeviceConstants.p_source_many == 0)
    { // single signal
      for (auto i = GetIndex(); i < cudaDeviceConstants.p_source_index_size; i += GetStride())
      {
        rhox[p_source_index[i]] = p_source_input[index2D];
        rhoy[p_source_index[i]] = p_source_input[index2D];
        rhoz[p_source_index[i]] = p_source_input[index2D];
      }
    }
    else
    { // multiple signals
      for (auto i = GetIndex(); i < cudaDeviceConstants.p_source_index_size; i += GetStride())
      {
        rhox[p_source_index[i]] = p_source_input[index2D + i];
        rhoy[p_source_index[i]] = p_source_input[index2D + i];
        rhoz[p_source_index[i]] = p_source_input[index2D + i];
      }
    }
  }// end mode == 0 (Dirichlet)

  // additive source (mode 1 assumed)
  if (cudaDeviceConstants.p_source_mode == 1)
  {
    if (cudaDeviceConstants.p_source_many == 0)
    { // single signal
      for (auto i = GetIndex(); i < cudaDeviceConstants.p_source_index_size; i += GetStride())
      {
        rhox[p_source_index[i]] += p_source_input[index2D];
        rhoy[p_source_index[i]] += p_source_input[index2D];
        rhoz[p_source_index[i]] += p_source_input[index2D];
      }
    }
    else
    { // multiple signals
      for (auto i = GetIndex(); i < cudaDeviceConstants.p_source_index_size; i += GetStride())
      {
        rhox[p_source_index[i]] += p_source_input[index2D + i];
        rhoy[p_source_index[i]] += p_source_input[index2D + i];
        rhoz[p_source_index[i]] += p_source_input[index2D + i];
      }
    }
  }// end mode == 1 (additive)
}// end of CUDAAddPressureSource
//--------------------------------------------------------------------------------------------------

/**
 * Interface to kernel which adds in pressure source (to acoustic density).
 *
 * @param [out] rhox           - Acoustic density
 * @param [out] rhoy           - Acoustic density
 * @param [out] rhoz           - Acoustic density
 * @param [in]  p_source_input - Source input to add
 * @param [in]  p_source_index - Index matrix with source
 * @param [in]  t_index        - Actual timestep
 */
void SolverCUDAKernels::AddPressureSource(TRealMatrix&        rhox,
                                          TRealMatrix&        rhoy,
                                          TRealMatrix&        rhoz,
                                          const TRealMatrix&  p_source_input,
                                          const TIndexMatrix& p_source_index,
                                          const size_t        t_index)
{
  const auto p_source_index_size = p_source_index.GetElementCount();
  // Grid size is calculated based on the source size
  const int gridSize = (static_cast<int>(p_source_index_size) < (GetSolverGridSize1D() * GetSolverBlockSize1D()))
                       ? (p_source_index_size + GetSolverBlockSize1D() - 1) / GetSolverBlockSize1D()
                       : GetSolverGridSize1D();

  CUDAAddPressureSource<<<gridSize, GetSolverBlockSize1D()>>>
                       (rhox.GetDeviceData(),
                        rhoy.GetDeviceData(),
                        rhoz.GetDeviceData(),
                        p_source_input.GetDeviceData(),
                        p_source_index.GetDeviceData(),
                        t_index);

  // check for errors
  checkCudaErrors(cudaGetLastError());
}// end of AddPressureSource
//--------------------------------------------------------------------------------------------------

/**
 * CUDA kernel to compute u = dt ./ rho0_sgx .* u.
 *
 * @param [in, out] ux_sgx      - data stored in u matrix
 * @param [in, out] uy_sgy      - data stored in u matrix
 * @param [in, out] uz_sgz      - data stored in u matrix
 * @param [in]      dt_rho0_sgx - inner member of the equation
 * @param [in]      dt_rho0_sgy - inner member of the equation
 * @param [in]      dt_rho0_sgz - inner member of the equation
 */
template <bool Is_rho0_scalar>
__global__ void CUDACompute_p0_Velocity(float*       ux_sgx,
                                        float*       uy_sgy,
                                        float*       uz_sgz,
                                        const float* dt_rho0_sgx = nullptr,
                                        const float* dt_rho0_sgy = nullptr,
                                        const float* dt_rho0_sgz = nullptr)
{
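  // The rho0 branch is resolved at compile time through the template
  // parameter. The factor 0.5f appears because, with an initial pressure
  // source, the first velocity update spans only half a time step on the
  // staggered grid.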
  if (Is_rho0_scalar)
  {
    const float dividerX = cudaDeviceConstants.fftDivider * 0.5f * cudaDeviceConstants.rho0_sgx_scalar;
    const float dividerY = cudaDeviceConstants.fftDivider * 0.5f * cudaDeviceConstants.rho0_sgy_scalar;
    const float dividerZ = cudaDeviceConstants.fftDivider * 0.5f * cudaDeviceConstants.rho0_sgz_scalar;

    for (auto i = GetIndex(); i < cudaDeviceConstants.nElements; i += GetStride())
    {
      ux_sgx[i] *= dividerX;
      uy_sgy[i] *= dividerY;
      uz_sgz[i] *= dividerZ;
    }
  }
  else
  { // heterogeneous
    const float divider = cudaDeviceConstants.fftDivider * 0.5f;

    for (auto i = GetIndex(); i < cudaDeviceConstants.nElements; i += GetStride())
    {
      ux_sgx[i] *= dt_rho0_sgx[i] * divider;
      uy_sgy[i] *= dt_rho0_sgy[i] * divider;
      uz_sgz[i] *= dt_rho0_sgz[i] * divider;
    }
  }
}// end of CUDACompute_p0_Velocity
//-------------------------------------------------------------------------------------------------

/**
 * Interface to CUDA Compute u = dt ./ rho0_sgx .* ifft(FFT).
 *
 * @param [in, out] ux_sgx      - data stored in u matrix
 * @param [in, out] uy_sgy      - data stored in u matrix
 * @param [in, out] uz_sgz      - data stored in u matrix
 * @param [in]      dt_rho0_sgx - inner member of the equation
 * @param [in]      dt_rho0_sgy - inner member of the equation
 * @param [in]      dt_rho0_sgz - inner member of the equation
 */
void SolverCUDAKernels::Compute_p0_Velocity(TRealMatrix&       ux_sgx,
                                            TRealMatrix&       uy_sgy,
                                            TRealMatrix&       uz_sgz,
                                            const TRealMatrix& dt_rho0_sgx,
                                            const TRealMatrix& dt_rho0_sgy,
                                            const TRealMatrix& dt_rho0_sgz)
{
  CUDACompute_p0_Velocity<false>
                         <<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
                         (ux_sgx.GetDeviceData(),
                          uy_sgy.GetDeviceData(),
                          uz_sgz.GetDeviceData(),
                          dt_rho0_sgx.GetDeviceData(),
                          dt_rho0_sgy.GetDeviceData(),
                          dt_rho0_sgz.GetDeviceData());

  // check for errors
  checkCudaErrors(cudaGetLastError());
}// end of Compute_p0_Velocity
//--------------------------------------------------------------------------------------------------

/**
 * Interface to CUDA Compute u = dt ./ rho0_sgx .* ifft(FFT),
 * if rho0_sgx is scalar, uniform case.
 *
 * @param [in, out] ux_sgx - Data stored in u matrix
 * @param [in, out] uy_sgy - Data stored in u matrix
 * @param [in, out] uz_sgz - Data stored in u matrix
 */
void SolverCUDAKernels::Compute_p0_Velocity(TRealMatrix& ux_sgx,
                                            TRealMatrix& uy_sgy,
                                            TRealMatrix& uz_sgz)
{
  CUDACompute_p0_Velocity<true>
                         <<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
                         (ux_sgx.GetDeviceData(),
                          uy_sgy.GetDeviceData(),
                          uz_sgz.GetDeviceData());

  // check for errors
  checkCudaErrors(cudaGetLastError());
}// end of Compute_p0_Velocity
//--------------------------------------------------------------------------------------------------


/**
 * CUDA kernel to compute u = dt ./ rho0_sg .* ifft(FFT),
 * if rho0_sg is scalar, on a non-uniform grid.
 *
 * @param [in, out] ux_sgx
 * @param [in, out] uy_sgy
 * @param [in, out] uz_sgz
 * @param [in]      dxudxn_sgx
 * @param [in]      dyudyn_sgy
 * @param [in]      dzudzn_sgz
 */
__global__ void CUDACompute_p0_VelocityScalarNonUniform(float*       ux_sgx,
                                                        float*       uy_sgy,
                                                        float*       uz_sgz,
                                                        const float* dxudxn_sgx,
                                                        const float* dyudyn_sgy,
                                                        const float* dzudzn_sgz)
{
  const float dividerX = cudaDeviceConstants.fftDivider * 0.5f * cudaDeviceConstants.rho0_sgx_scalar;
  const float dividerY = cudaDeviceConstants.fftDivider * 0.5f * cudaDeviceConstants.rho0_sgy_scalar;
  const float dividerZ = cudaDeviceConstants.fftDivider * 0.5f * cudaDeviceConstants.rho0_sgz_scalar;

  for (auto i = GetIndex(); i < cudaDeviceConstants.nElements; i += GetStride())
  {
    const dim3 coords = GetReal3DCoords(i);

    ux_sgx[i] *= dividerX * dxudxn_sgx[coords.x];
    uy_sgy[i] *= dividerY * dyudyn_sgy[coords.y];
    uz_sgz[i] *= dividerZ * dzudzn_sgz[coords.z];
  }
}// end of CUDACompute_p0_VelocityScalarNonUniform
//--------------------------------------------------------------------------------------------------


/**
 * Interface to CUDA kernel to compute u = dt ./ rho0_sg .* ifft(FFT),
 * if rho0_sg is scalar, on a non-uniform grid.
 *
 * @param [in, out] ux_sgx
 * @param [in, out] uy_sgy
 * @param [in, out] uz_sgz
 * @param [in]      dxudxn_sgx
 * @param [in]      dyudyn_sgy
 * @param [in]      dzudzn_sgz
 */
void SolverCUDAKernels::Compute_p0_VelocityScalarNonUniform(TRealMatrix&       ux_sgx,
                                                            TRealMatrix&       uy_sgy,
                                                            TRealMatrix&       uz_sgz,
                                                            const TRealMatrix& dxudxn_sgx,
                                                            const TRealMatrix& dyudyn_sgy,
                                                            const TRealMatrix& dzudzn_sgz)
{
  CUDACompute_p0_VelocityScalarNonUniform<<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
                                         (ux_sgx.GetDeviceData(),
                                          uy_sgy.GetDeviceData(),
                                          uz_sgz.GetDeviceData(),
                                          dxudxn_sgx.GetDeviceData(),
                                          dyudyn_sgy.GetDeviceData(),
                                          dzudzn_sgz.GetDeviceData());
  // check for errors
  checkCudaErrors(cudaGetLastError());
}// end of Compute_p0_VelocityScalarNonUniform
//--------------------------------------------------------------------------------------------------


/**
 * Kernel which computes the spectral part of the new velocity term - the gradient
 * of p, represented by:
 * bsxfun(\@times, ddx_k_shift_pos, kappa .* p_k).
 *
 * @param [in, out] fft_x - on entry the FFT of p; on exit the input for iFFT (p) /dx
 * @param [out]     fft_y - matrix to store input for iFFT (p) /dy
 * @param [out]     fft_z - matrix to store input for iFFT (p) /dz
 *
 * @param [in]      kappa - Real matrix of kappa
 *
 * @param [in]      ddx   - precomputed value of ddx_k_shift_pos
 * @param [in]      ddy   - precomputed value of ddy_k_shift_pos
 * @param [in]      ddz   - precomputed value of ddz_k_shift_pos
 */
__global__ void CUDAComputePressurelGradient(cuFloatComplex*       fft_x,
                                             cuFloatComplex*       fft_y,
                                             cuFloatComplex*       fft_z,
                                             const float*          kappa,
                                             const cuFloatComplex* ddx,
                                             const cuFloatComplex* ddy,
                                             const cuFloatComplex* ddz)
{
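  // On entry fft_x holds p_k (the FFT of pressure); it is scaled by kappa once
  // and the product is then shifted by ddx/ddy/ddz for the three components.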
  for (auto i = GetIndex(); i < cudaDeviceConstants.nElementsComplex; i += GetStride())
  {
    const dim3 coords = GetComplex3DCoords(i);

    const cuFloatComplex p_k_el = fft_x[i] * kappa[i];

    fft_x[i] = cuCmulf(p_k_el, ddx[coords.x]);
    fft_y[i] = cuCmulf(p_k_el, ddy[coords.y]);
    fft_z[i] = cuCmulf(p_k_el, ddz[coords.z]);
  }
}// end of CUDAComputePressurelGradient
//--------------------------------------------------------------------------------------------------

/**
 * Interface to kernel which computes the spectral part of pressure gradient calculation:
 * bsxfun(\@times, ddx_k_shift_pos, kappa .* p_k).
 *
 * @param [in, out] fft_x - on entry the FFT of p; on exit the input for iFFT (p) /dx
 * @param [out]     fft_y - matrix to store input for iFFT (p) /dy
 * @param [out]     fft_z - matrix to store input for iFFT (p) /dz
 *
 * @param [in]      kappa - Real matrix of kappa
 *
 * @param [in]      ddx   - precomputed value of ddx_k_shift_pos
 * @param [in]      ddy   - precomputed value of ddy_k_shift_pos
 * @param [in]      ddz   - precomputed value of ddz_k_shift_pos
 */
void SolverCUDAKernels::ComputePressurelGradient(TCUFFTComplexMatrix&  fft_x,
                                                 TCUFFTComplexMatrix&  fft_y,
                                                 TCUFFTComplexMatrix&  fft_z,
                                                 const TRealMatrix&    kappa,
                                                 const TComplexMatrix& ddx,
                                                 const TComplexMatrix& ddy,
                                                 const TComplexMatrix& ddz)
{
  CUDAComputePressurelGradient<<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
                              (reinterpret_cast<cuFloatComplex*>(fft_x.GetDeviceData()),
                               reinterpret_cast<cuFloatComplex*>(fft_y.GetDeviceData()),
                               reinterpret_cast<cuFloatComplex*>(fft_z.GetDeviceData()),
                               kappa.GetDeviceData(),
                               reinterpret_cast<const cuFloatComplex*>(ddx.GetDeviceData()),
                               reinterpret_cast<const cuFloatComplex*>(ddy.GetDeviceData()),
                               reinterpret_cast<const cuFloatComplex*>(ddz.GetDeviceData()));

  // check for errors
  checkCudaErrors(cudaGetLastError());
}// end of ComputePressurelGradient
//--------------------------------------------------------------------------------------------------


/**
 * Kernel calculating the inner part of duxdx, duydy, duzdz on a uniform grid.
 * Complex numbers are passed as float2 structures.
 *
 * @param [in, out] fft_x   - FFT of ux
 * @param [in, out] fft_y   - FFT of uy
 * @param [in, out] fft_z   - FFT of uz
 * @param [in]      kappa
 * @param [in]      ddx_neg - ddx_k_shift_neg
 * @param [in]      ddy_neg - ddy_k_shift_neg
 * @param [in]      ddz_neg - ddz_k_shift_neg
 */
__global__ void CUDAComputeVelocityGradient(cuFloatComplex*       fft_x,
                                            cuFloatComplex*       fft_y,
                                            cuFloatComplex*       fft_z,
                                            const float*          kappa,
                                            const cuFloatComplex* ddx_neg,
                                            const cuFloatComplex* ddy_neg,
                                            const cuFloatComplex* ddz_neg)
{
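  // The inverse-FFT normalisation (fftDivider) is folded into the kappa factor
  // so every element is scaled exactly once before the spectral shift.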
  for (auto i = GetIndex(); i < cudaDeviceConstants.nElementsComplex; i += GetStride())
  {
    const dim3 coords = GetComplex3DCoords(i);

    const cuFloatComplex ddx_neg_el = ddx_neg[coords.x];
    const cuFloatComplex ddy_neg_el = ddy_neg[coords.y];
    const cuFloatComplex ddz_neg_el = ddz_neg[coords.z];

    const float kappa_el = kappa[i] * cudaDeviceConstants.fftDivider;

    const cuFloatComplex fft_x_el = fft_x[i] * kappa_el;
    const cuFloatComplex fft_y_el = fft_y[i] * kappa_el;
    const cuFloatComplex fft_z_el = fft_z[i] * kappa_el;

    fft_x[i] = cuCmulf(fft_x_el, ddx_neg_el);
    fft_y[i] = cuCmulf(fft_y_el, ddy_neg_el);
    fft_z[i] = cuCmulf(fft_z_el, ddz_neg_el);
  }// for
}// end of CUDAComputeVelocityGradient
//--------------------------------------------------------------------------------------------------

/**
 * Interface to kernel calculating the inner part of duxdx, duydy, duzdz on a uniform grid.
 * @param [in, out] fft_x - FFT of ux
 * @param [in, out] fft_y - FFT of uy
 * @param [in, out] fft_z - FFT of uz
 * @param [in]      kappa
 * @param [in]      ddx_k_shift_neg
 * @param [in]      ddy_k_shift_neg
 * @param [in]      ddz_k_shift_neg
 */
void SolverCUDAKernels::ComputeVelocityGradient(TCUFFTComplexMatrix&  fft_x,
                                                TCUFFTComplexMatrix&  fft_y,
                                                TCUFFTComplexMatrix&  fft_z,
                                                const TRealMatrix&    kappa,
                                                const TComplexMatrix& ddx_k_shift_neg,
                                                const TComplexMatrix& ddy_k_shift_neg,
                                                const TComplexMatrix& ddz_k_shift_neg)
{
  CUDAComputeVelocityGradient<<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
                             (reinterpret_cast<cuFloatComplex*>(fft_x.GetDeviceData()),
                              reinterpret_cast<cuFloatComplex*>(fft_y.GetDeviceData()),
                              reinterpret_cast<cuFloatComplex*>(fft_z.GetDeviceData()),
                              kappa.GetDeviceData(),
                              reinterpret_cast<const cuFloatComplex*>(ddx_k_shift_neg.GetDeviceData()),
                              reinterpret_cast<const cuFloatComplex*>(ddy_k_shift_neg.GetDeviceData()),
                              reinterpret_cast<const cuFloatComplex*>(ddz_k_shift_neg.GetDeviceData()));

  // check for errors
  checkCudaErrors(cudaGetLastError());
}// end of ComputeVelocityGradient
//--------------------------------------------------------------------------------------------------


/**
 * CUDA kernel to shift duxdx, duydy and duzdz on a non-uniform grid.
 *
 * @param [in, out] duxdx
 * @param [in, out] duydy
 * @param [in, out] duzdz
 * @param [in]      duxdxn
 * @param [in]      duydyn
 * @param [in]      duzdzn
 */
__global__ void CUDAComputeVelocityGradientNonuniform(float*       duxdx,
                                                      float*       duydy,
                                                      float*       duzdz,
                                                      const float* duxdxn,
                                                      const float* duydyn,
                                                      const float* duzdzn)
{
  for (auto i = GetIndex(); i < cudaDeviceConstants.nElements; i += GetStride())
  {
    const dim3 coords = GetReal3DCoords(i);

    duxdx[i] *= duxdxn[coords.x];
    duydy[i] *= duydyn[coords.y];
    duzdz[i] *= duzdzn[coords.z];
  }
}// end of CUDAComputeVelocityGradientNonuniform
//--------------------------------------------------------------------------------------------------

/**
 * Interface to CUDA kernel which shifts the new values of duxdx, duydy and duzdz on a non-uniform grid.
 *
 * @param [in, out] duxdx
 * @param [in, out] duydy
 * @param [in, out] duzdz
 * @param [in]      dxudxn
 * @param [in]      dyudyn
 * @param [in]      dzudzn
 */
void SolverCUDAKernels::ComputeVelocityGradientNonuniform(TRealMatrix&       duxdx,
                                                          TRealMatrix&       duydy,
                                                          TRealMatrix&       duzdz,
                                                          const TRealMatrix& dxudxn,
                                                          const TRealMatrix& dyudyn,
                                                          const TRealMatrix& dzudzn)
{
  CUDAComputeVelocityGradientNonuniform<<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
                                       (duxdx.GetDeviceData(),
                                        duydy.GetDeviceData(),
                                        duzdz.GetDeviceData(),
                                        dxudxn.GetDeviceData(),
                                        dyudyn.GetDeviceData(),
                                        dzudzn.GetDeviceData());

  // check for errors
  checkCudaErrors(cudaGetLastError());
}// end of ComputeVelocityGradientNonuniform
//--------------------------------------------------------------------------------------------------


/**
 * CUDA kernel to add initial pressure p0 into p, rhox, rhoy, rhoz.
 * c2 is either a scalar or a matrix; heterogeneity is treated by a template parameter.
 * @param [out] p    - pressure
 * @param [out] rhox
 * @param [out] rhoy
 * @param [out] rhoz
 * @param [in]  p0   - initial pressure
 * @param [in]  c2   - sound speed
 */
template<bool Is_c0_scalar>
__global__ void CUDACompute_p0_AddInitialPressure(float*       p,
                                                  float*       rhox,
                                                  float*       rhoy,
                                                  float*       rhoz,
                                                  const float* p0,
                                                  const float* c2 = nullptr)
{
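  // p0 is split evenly among the three density components (hence the division
  // by 3) so that p = c^2 * (rhox + rhoy + rhoz) reproduces the initial p0.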
  for (auto i = GetIndex(); i < cudaDeviceConstants.nElements; i += GetStride())
  {
    float tmp = p[i] = p0[i];

    tmp = (Is_c0_scalar) ? tmp / (3.0f * cudaDeviceConstants.c2) : tmp / (3.0f * c2[i]);

    rhox[i] = tmp;
    rhoy[i] = tmp;
    rhoz[i] = tmp;
  }
}// end of CUDACompute_p0_AddInitialPressure
//--------------------------------------------------------------------------------------------------


/**
 * Interface for kernel to add initial pressure p0 into p, rhox, rhoy, rhoz.
 *
 * @param [out] p            - Pressure
 * @param [out] rhox         - Density component
 * @param [out] rhoy         - Density component
 * @param [out] rhoz         - Density component
 * @param [in]  p0           - Initial pressure
 * @param [in]  Is_c2_scalar - Scalar or vector?
 * @param [in]  c2           - Sound speed
 */
void SolverCUDAKernels::Compute_p0_AddInitialPressure(TRealMatrix&       p,
                                                      TRealMatrix&       rhox,
                                                      TRealMatrix&       rhoy,
                                                      TRealMatrix&       rhoz,
                                                      const TRealMatrix& p0,
                                                      const bool         Is_c2_scalar,
                                                      const float*       c2)
{
  if (Is_c2_scalar)
  {
    CUDACompute_p0_AddInitialPressure<true>
                                     <<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
                                     (p.GetDeviceData(),
                                      rhox.GetDeviceData(),
                                      rhoy.GetDeviceData(),
                                      rhoz.GetDeviceData(),
                                      p0.GetDeviceData());
  }
  else
  {
    CUDACompute_p0_AddInitialPressure<false>
                                     <<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
                                     (p.GetDeviceData(),
                                      rhox.GetDeviceData(),
                                      rhoy.GetDeviceData(),
                                      rhoz.GetDeviceData(),
                                      p0.GetDeviceData(),
                                      c2);
  }
  // check for errors
  checkCudaErrors(cudaGetLastError());
}// end of Compute_p0_AddInitialPressure
//--------------------------------------------------------------------------------------------------

/**
 * CUDA kernel which calculates new values of rho (acoustic density).
 * Non-linear, homogeneous case.
 * @param [out] rhox  - density x
 * @param [out] rhoy  - density y
 * @param [out] rhoz  - density z
 * @param [in]  pml_x - pml x
 * @param [in]  pml_y - pml y
 * @param [in]  pml_z - pml z
 * @param [in]  duxdx - gradient of velocity x
 * @param [in]  duydy - gradient of velocity y
 * @param [in]  duzdz - gradient of velocity z
 */
__global__ void CUDAComputeDensityNonlinearHomogeneous(float*       rhox,
                                                       float*       rhoy,
                                                       float*       rhoz,
                                                       const float* pml_x,
                                                       const float* pml_y,
                                                       const float* pml_z,
                                                       const float* duxdx,
                                                       const float* duydy,
                                                       const float* duzdz)
{
  for (auto i = GetIndex(); i < cudaDeviceConstants.nElements; i += GetStride())
  {
    const dim3 coords = GetReal3DCoords(i);

    const float pml_x_el = pml_x[coords.x];
    const float pml_y_el = pml_y[coords.y];
    const float pml_z_el = pml_z[coords.z];

    const float dux = duxdx[i];
    const float duy = duydy[i];
    const float duz = duzdz[i];
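
    // Nonlinear mass-conservation update; dt2 presumably holds 2*dt, giving
    //   rho = pml .* ((pml .* rho - dt .* rho0 .* du) ./ (1 + 2*dt .* du))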
    rhox[i] = pml_x_el * ((pml_x_el * rhox[i] - cudaDeviceConstants.dt_rho0_scalar * dux) /
                          (1.0f + cudaDeviceConstants.dt2 * dux));
    rhoy[i] = pml_y_el * ((pml_y_el * rhoy[i] - cudaDeviceConstants.dt_rho0_scalar * duy) /
                          (1.0f + cudaDeviceConstants.dt2 * duy));
    rhoz[i] = pml_z_el * ((pml_z_el * rhoz[i] - cudaDeviceConstants.dt_rho0_scalar * duz) /
                          (1.0f + cudaDeviceConstants.dt2 * duz));
  }
}// end of CUDAComputeDensityNonlinearHomogeneous
//--------------------------------------------------------------------------------------------------

/**
 * Interface to kernel which calculates new values of rho (acoustic density).
 * Non-linear, homogeneous case.
 *
 * @param [out] rhox  - density x
 * @param [out] rhoy  - density y
 * @param [out] rhoz  - density z
 * @param [in]  pml_x - pml x
 * @param [in]  pml_y - pml y
 * @param [in]  pml_z - pml z
 * @param [in]  duxdx - gradient of velocity x
 * @param [in]  duydy - gradient of velocity y
 * @param [in]  duzdz - gradient of velocity z
 */
void SolverCUDAKernels::ComputeDensityNonlinearHomogeneous(TRealMatrix&       rhox,
                                                           TRealMatrix&       rhoy,
                                                           TRealMatrix&       rhoz,
                                                           const TRealMatrix& pml_x,
                                                           const TRealMatrix& pml_y,
                                                           const TRealMatrix& pml_z,
                                                           const TRealMatrix& duxdx,
                                                           const TRealMatrix& duydy,
                                                           const TRealMatrix& duzdz)
{
  CUDAComputeDensityNonlinearHomogeneous<<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
                                        (rhox.GetDeviceData(),
                                         rhoy.GetDeviceData(),
                                         rhoz.GetDeviceData(),
                                         pml_x.GetDeviceData(),
                                         pml_y.GetDeviceData(),
                                         pml_z.GetDeviceData(),
                                         duxdx.GetDeviceData(),
                                         duydy.GetDeviceData(),
                                         duzdz.GetDeviceData());
  // check for errors
  checkCudaErrors(cudaGetLastError());
}// end of ComputeDensityNonlinearHomogeneous
//--------------------------------------------------------------------------------------------------

/**
 * CUDA kernel which calculates new values of rho (acoustic density).
 * Non-linear, heterogeneous case.
 * @param [out] rhox  - density x
 * @param [out] rhoy  - density y
 * @param [out] rhoz  - density z
 * @param [in]  pml_x - pml x
 * @param [in]  pml_y - pml y
 * @param [in]  pml_z - pml z
 * @param [in]  duxdx - gradient of velocity x
 * @param [in]  duydy - gradient of velocity y
 * @param [in]  duzdz - gradient of velocity z
 * @param [in]  rho0  - initial density (matrix here)
 */
__global__ void CUDAComputeDensityNonlinearHeterogeneous(float*       rhox,
                                                         float*       rhoy,
                                                         float*       rhoz,
                                                         const float* pml_x,
                                                         const float* pml_y,
                                                         const float* pml_z,
                                                         const float* duxdx,
                                                         const float* duydy,
                                                         const float* duzdz,
                                                         const float* rho0)
{
  for (auto i = GetIndex(); i < cudaDeviceConstants.nElements; i += GetStride())
  {
    const dim3 coords = GetReal3DCoords(i);

    const float pml_x_el = pml_x[coords.x];
    const float pml_y_el = pml_y[coords.y];
    const float pml_z_el = pml_z[coords.z];

    const float dt_rho0 = cudaDeviceConstants.dt * rho0[i];

    const float dux = duxdx[i];
    const float duy = duydy[i];
    const float duz = duzdz[i];

    rhox[i] = pml_x_el * ((pml_x_el * rhox[i] - dt_rho0 * dux) / (1.0f + cudaDeviceConstants.dt2 * dux));
    rhoy[i] = pml_y_el * ((pml_y_el * rhoy[i] - dt_rho0 * duy) / (1.0f + cudaDeviceConstants.dt2 * duy));
    rhoz[i] = pml_z_el * ((pml_z_el * rhoz[i] - dt_rho0 * duz) / (1.0f + cudaDeviceConstants.dt2 * duz));
  }
}// end of CUDAComputeDensityNonlinearHeterogeneous
//--------------------------------------------------------------------------------------------------

/**
 * Interface to kernel which calculates new values of rho (acoustic density).
 * Non-linear, heterogeneous case.
 * @param [out] rhox  - density x
 * @param [out] rhoy  - density y
 * @param [out] rhoz  - density z
 * @param [in]  pml_x - pml x
 * @param [in]  pml_y - pml y
 * @param [in]  pml_z - pml z
 * @param [in]  duxdx - gradient of velocity x
 * @param [in]  duydy - gradient of velocity y
 * @param [in]  duzdz - gradient of velocity z
 * @param [in]  rho0  - initial density (matrix here)
 */
void SolverCUDAKernels::ComputeDensityNonlinearHeterogeneous(TRealMatrix&       rhox,
                                                             TRealMatrix&       rhoy,
                                                             TRealMatrix&       rhoz,
                                                             const TRealMatrix& pml_x,
                                                             const TRealMatrix& pml_y,
                                                             const TRealMatrix& pml_z,
                                                             const TRealMatrix& duxdx,
                                                             const TRealMatrix& duydy,
                                                             const TRealMatrix& duzdz,
                                                             const TRealMatrix& rho0)
{
  CUDAComputeDensityNonlinearHeterogeneous<<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
                                          (rhox.GetDeviceData(),
                                           rhoy.GetDeviceData(),
                                           rhoz.GetDeviceData(),
                                           pml_x.GetDeviceData(),
                                           pml_y.GetDeviceData(),
                                           pml_z.GetDeviceData(),
                                           duxdx.GetDeviceData(),
                                           duydy.GetDeviceData(),
                                           duzdz.GetDeviceData(),
                                           rho0.GetDeviceData());

  // check for errors
  checkCudaErrors(cudaGetLastError());
}// end of ComputeDensityNonlinearHeterogeneous
//--------------------------------------------------------------------------------------------------

/**
 * CUDA kernel which calculates new values of rho (acoustic density).
 * Linear, homogeneous case.
 *
 * @param [out] rhox  - Density x
 * @param [out] rhoy  - Density y
 * @param [out] rhoz  - Density z
 * @param [in]  pml_x - pml x
 * @param [in]  pml_y - pml y
 * @param [in]  pml_z - pml z
 * @param [in]  duxdx - Gradient of velocity x
 * @param [in]  duydy - Gradient of velocity y
 * @param [in]  duzdz - Gradient of velocity z
 */
__global__ void CUDAComputeDensityLinearHomogeneous(float*       rhox,
                                                    float*       rhoy,
                                                    float*       rhoz,
                                                    const float* pml_x,
                                                    const float* pml_y,
                                                    const float* pml_z,
                                                    const float* duxdx,
                                                    const float* duydy,
                                                    const float* duzdz)
{
  for (auto i = GetIndex(); i < cudaDeviceConstants.nElements; i += GetStride())
  {
    const dim3 coords = GetReal3DCoords(i);

    const float pml_x_el = pml_x[coords.x];
    const float pml_y_el = pml_y[coords.y];
    const float pml_z_el = pml_z[coords.z];

    rhox[i] = pml_x_el * (pml_x_el * rhox[i] - cudaDeviceConstants.dt_rho0_scalar * duxdx[i]);
    rhoy[i] = pml_y_el * (pml_y_el * rhoy[i] - cudaDeviceConstants.dt_rho0_scalar * duydy[i]);
    rhoz[i] = pml_z_el * (pml_z_el * rhoz[i] - cudaDeviceConstants.dt_rho0_scalar * duzdz[i]);
  }
}// end of CUDAComputeDensityLinearHomogeneous
//--------------------------------------------------------------------------------------------------

/**
 * Interface to kernel which calculates new values of rho (acoustic density).
 * Linear, homogeneous case.
 * @param [out] rhox  - density x
 * @param [out] rhoy  - density y
 * @param [out] rhoz  - density z
 * @param [in]  pml_x - pml x
 * @param [in]  pml_y - pml y
 * @param [in]  pml_z - pml z
 * @param [in]  duxdx - gradient of velocity x
 * @param [in]  duydy - gradient of velocity y
 * @param [in]  duzdz - gradient of velocity z
 */
void SolverCUDAKernels::ComputeDensityLinearHomogeneous(TRealMatrix&       rhox,
                                                        TRealMatrix&       rhoy,
                                                        TRealMatrix&       rhoz,
                                                        const TRealMatrix& pml_x,
                                                        const TRealMatrix& pml_y,
                                                        const TRealMatrix& pml_z,
                                                        const TRealMatrix& duxdx,
                                                        const TRealMatrix& duydy,
                                                        const TRealMatrix& duzdz)
{
  CUDAComputeDensityLinearHomogeneous<<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
                                     (rhox.GetDeviceData(),
                                      rhoy.GetDeviceData(),
                                      rhoz.GetDeviceData(),
                                      pml_x.GetDeviceData(),
                                      pml_y.GetDeviceData(),
                                      pml_z.GetDeviceData(),
                                      duxdx.GetDeviceData(),
                                      duydy.GetDeviceData(),
                                      duzdz.GetDeviceData());
  // check for errors
  checkCudaErrors(cudaGetLastError());
}// end of ComputeDensityLinearHomogeneous
//--------------------------------------------------------------------------------------------------

/**
 * CUDA kernel which calculates new values of rho (acoustic density).
 * Linear, heterogeneous case.
 *
 * @param [out] rhox  - density x
 * @param [out] rhoy  - density y
 * @param [out] rhoz  - density z
 * @param [in]  pml_x - pml x
 * @param [in]  pml_y - pml y
 * @param [in]  pml_z - pml z
 * @param [in]  duxdx - gradient of velocity x
 * @param [in]  duydy - gradient of velocity y
 * @param [in]  duzdz - gradient of velocity z
 * @param [in]  rho0  - initial density (matrix here)
 */
__global__ void CUDAComputeDensityLinearHeterogeneous(float*       rhox,
                                                      float*       rhoy,
                                                      float*       rhoz,
                                                      const float* pml_x,
                                                      const float* pml_y,
                                                      const float* pml_z,
                                                      const float* duxdx,
                                                      const float* duydy,
                                                      const float* duzdz,
                                                      const float* rho0)
{
  for (auto i = GetIndex(); i < cudaDeviceConstants.nElements; i += GetStride())
  {
    const dim3 coords = GetReal3DCoords(i);

    const float pml_x_el = pml_x[coords.x];
    const float pml_y_el = pml_y[coords.y];
    const float pml_z_el = pml_z[coords.z];

    const float dt_rho0 = cudaDeviceConstants.dt * rho0[i];

    rhox[i] = pml_x_el * (pml_x_el * rhox[i] - dt_rho0 * duxdx[i]);
    rhoy[i] = pml_y_el * (pml_y_el * rhoy[i] - dt_rho0 * duydy[i]);
    rhoz[i] = pml_z_el * (pml_z_el * rhoz[i] - dt_rho0 * duzdz[i]);
  }
}// end of CUDAComputeDensityLinearHeterogeneous
//--------------------------------------------------------------------------------------------------

/**
 * Interface to kernel which calculates new values of rho (acoustic density).
 * Linear, heterogeneous case.
 * @param [out] rhox  - Density x
 * @param [out] rhoy  - Density y
 * @param [out] rhoz  - Density z
 * @param [in]  pml_x - pml x
 * @param [in]  pml_y - pml y
 * @param [in]  pml_z - pml z
 * @param [in]  duxdx - Gradient of velocity x
 * @param [in]  duydy - Gradient of velocity y
 * @param [in]  duzdz - Gradient of velocity z
 * @param [in]  rho0  - initial density (matrix here)
 */
void SolverCUDAKernels::ComputeDensityLinearHeterogeneous(TRealMatrix&       rhox,
                                                          TRealMatrix&       rhoy,
                                                          TRealMatrix&       rhoz,
                                                          const TRealMatrix& pml_x,
                                                          const TRealMatrix& pml_y,
                                                          const TRealMatrix& pml_z,
                                                          const TRealMatrix& duxdx,
                                                          const TRealMatrix& duydy,
                                                          const TRealMatrix& duzdz,
                                                          const TRealMatrix& rho0)
{
  CUDAComputeDensityLinearHeterogeneous<<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
                                       (rhox.GetDeviceData(),
                                        rhoy.GetDeviceData(),
                                        rhoz.GetDeviceData(),
                                        pml_x.GetDeviceData(),
                                        pml_y.GetDeviceData(),
                                        pml_z.GetDeviceData(),
                                        duxdx.GetDeviceData(),
                                        duydy.GetDeviceData(),
                                        duzdz.GetDeviceData(),
                                        rho0.GetDeviceData());

  // check for errors
  checkCudaErrors(cudaGetLastError());
}// end of ComputeDensityLinearHeterogeneous
//--------------------------------------------------------------------------------------------------


/**
 * CUDA kernel which calculates three temporary sums in the new pressure formula, \n
 * non-linear absorbing case. Homogeneous and heterogeneous variants are treated using templates.
 * Homogeneous variables are in constant memory.
 *
 * @param [out] rho_sum     - rhox_sgx + rhoy_sgy + rhoz_sgz
 * @param [out] BonA_sum    - BonA * (rhox + rhoy + rhoz)^2 / (2 * rho0) + (rhox + rhoy + rhoz)
 * @param [out] du_sum      - rho0 * (duxdx + duydy + duzdz)
 * @param [in]  rhox        - Acoustic density X
 * @param [in]  rhoy        - Acoustic density Y
 * @param [in]  rhoz        - Acoustic density Z
 * @param [in]  duxdx       - Gradient of velocity in X
 * @param [in]  duydy       - Gradient of velocity in Y
 * @param [in]  duzdz       - Gradient of velocity in Z
 * @param [in]  BonA_matrix - Heterogeneous value for BonA
 * @param [in]  rho0_matrix - Heterogeneous value for rho0
 */
template <bool is_BonA_scalar, bool is_rho0_scalar>
__global__ void CUDAComputePressurePartsNonLinear(float*       rho_sum,
                                                  float*       BonA_sum,
                                                  float*       du_sum,
                                                  const float* rhox,
                                                  const float* rhoy,
                                                  const float* rhoz,
                                                  const float* duxdx,
                                                  const float* duydy,
                                                  const float* duzdz,
                                                  const float* BonA_matrix,
                                                  const float* rho0_matrix)
{
  for (auto i = GetIndex(); i < cudaDeviceConstants.nElements; i += GetStride())
  {
    const float BonA = (is_BonA_scalar) ? cudaDeviceConstants.BonA_scalar : BonA_matrix[i];
    const float rho0 = (is_rho0_scalar) ? cudaDeviceConstants.rho0_scalar : rho0_matrix[i];

    const float rho_xyz_el = rhox[i] + rhoy[i] + rhoz[i];

    rho_sum[i]  = rho_xyz_el;
    BonA_sum[i] = ((BonA * rho_xyz_el * rho_xyz_el) / (2.0f * rho0)) + rho_xyz_el;
    du_sum[i]   = rho0 * (duxdx[i] + duydy[i] + duzdz[i]);
  }
}// end of CUDAComputePressurePartsNonLinear
//--------------------------------------------------------------------------------------------------
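
/*
 * Note (sketch, not code from this file): the three sums computed above feed
 * the final pressure assembly for the non-linear absorbing case, where the new
 * pressure is formed as c^2 .* (BonA_sum + absorption terms built from the
 * spectrally filtered du_sum and rho_sum); see the summation kernel further
 * below.
 */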
1525 
1526 /**
1527  * Interface to kernel which calculates three temporary sums in the new pressure formula \n
1528  * non-linear absorbing case. Scalar values are in constant memory
1529  *
1530  * @param [out] rho_sum - rhox_sgx + rhoy_sgy + rhoz_sgz
1531  * @param [out] BonA_sum - BonA + rho ^2 / 2 rho0 + (rhox_sgx + rhoy_sgy + rhoz_sgz)
1532  * @param [out] du_sum - rho0* (duxdx + duydy + duzdz)
1533  * @param [in] rhox, - Acoustic density X
1534  * @param [in] rhoy, - Acoustic density Y
1535  * @param [in] rhoz, - Acoustic density Z
1536  * @param [in] duxdx - Gradient of velocity in X
1537  * @param [in] duydy - Gradient of velocity in X
1538  * @param [in] duzdz - Gradient of velocity in X
1539  * @param [in] is_BonA_scalar - Is BonA a scalar value (homogeneous)
1540  * @param [in] BonA_matrix - Heterogeneous value for BonA
1541  * @param [in] is_rho0_scalar - Is rho0 a scalar value (homogeneous)
1542  * @param [in] rho0_matrix - Heterogeneous value for rho0
1543  */
1545  TRealMatrix& BonA_sum,
1546  TRealMatrix& du_sum,
1547  const TRealMatrix& rhox,
1548  const TRealMatrix& rhoy,
1549  const TRealMatrix& rhoz,
1550  const TRealMatrix& duxdx,
1551  const TRealMatrix& duydy,
1552  const TRealMatrix& duzdz,
1553  const bool is_BonA_scalar,
1554  const float* BonA_matrix,
1555  const bool is_rho0_scalar,
1556  const float* rho0_matrix)
1557 {
1558  // all variants are treated by templates, here you can see all 4 variants
1559  if (is_BonA_scalar)
1560  {
1561  if (is_rho0_scalar)
1562  {
1563  CUDAComputePressurePartsNonLinear<true, true>
1565  (rho_sum.GetDeviceData(),
1566  BonA_sum.GetDeviceData(),
1567  du_sum.GetDeviceData(),
1568  rhox.GetDeviceData(),
1569  rhoy.GetDeviceData(),
1570  rhoz.GetDeviceData(),
1571  duxdx.GetDeviceData(),
1572  duydy.GetDeviceData(),
1573  duzdz.GetDeviceData(),
1574  BonA_matrix,
1575  rho0_matrix);
1576  }
1577  else
1578  {
1579  CUDAComputePressurePartsNonLinear<true, false>
1581  (rho_sum.GetDeviceData(),
1582  BonA_sum.GetDeviceData(),
1583  du_sum.GetDeviceData(),
1584  rhox.GetDeviceData(),
1585  rhoy.GetDeviceData(),
1586  rhoz.GetDeviceData(),
1587  duxdx.GetDeviceData(),
1588  duydy.GetDeviceData(),
1589  duzdz.GetDeviceData(),
1590  BonA_matrix,
1591  rho0_matrix);
1592  }
1593  }
1594  else // BonA is a matrix
1595  {
1596  if (is_rho0_scalar)
1597  {
1598  CUDAComputePressurePartsNonLinear<false, true>
1599  <<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
1600  (rho_sum.GetDeviceData(),
1601  BonA_sum.GetDeviceData(),
1602  du_sum.GetDeviceData(),
1603  rhox.GetDeviceData(),
1604  rhoy.GetDeviceData(),
1605  rhoz.GetDeviceData(),
1606  duxdx.GetDeviceData(),
1607  duydy.GetDeviceData(),
1608  duzdz.GetDeviceData(),
1609  BonA_matrix,
1610  rho0_matrix);
1611  }
1612  else
1613  {
1614  CUDAComputePressurePartsNonLinear<false, false>
1615  <<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
1616  (rho_sum.GetDeviceData(),
1617  BonA_sum.GetDeviceData(),
1618  du_sum.GetDeviceData(),
1619  rhox.GetDeviceData(),
1620  rhoy.GetDeviceData(),
1621  rhoz.GetDeviceData(),
1622  duxdx.GetDeviceData(),
1623  duydy.GetDeviceData(),
1624  duzdz.GetDeviceData(),
1625  BonA_matrix,
1626  rho0_matrix);
1627  }
1628  }
1629  // check for errors
1630  checkCudaErrors(cudaGetLastError());
1631 }// end of ComputePressurePartsNonLinear
1632 //--------------------------------------------------------------------------------------------------
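For reference, the three sums computed above can be expressed as a minimal host-side sketch (an illustration only, assuming plain float arrays and the homogeneous case with scalar BonA and rho0; the function name is hypothetical):

#include <cstddef>

// Host-side reference of CUDAComputePressurePartsNonLinear, scalar medium.
void referencePressurePartsNonLinear(float* rho_sum, float* BonA_sum, float* du_sum,
                                     const float* rhox,  const float* rhoy,  const float* rhoz,
                                     const float* duxdx, const float* duydy, const float* duzdz,
                                     const float BonA, const float rho0, const std::size_t nElements)
{
  for (std::size_t i = 0; i < nElements; i++)
  {
    const float rho = rhox[i] + rhoy[i] + rhoz[i];            // density sum

    rho_sum[i]  = rho;
    BonA_sum[i] = (BonA * rho * rho) / (2.0f * rho0) + rho;   // non-linear term
    du_sum[i]   = rho0 * (duxdx[i] + duydy[i] + duzdz[i]);    // velocity divergence term
  }
}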
1633 
1634 
1635 /**
1636  * CUDA kernel which computes the absorbing term with absorb_nabla1 and absorb_nabla2.
1637  * Calculate fft_1 = absorb_nabla1 .* fft_1 \n
1638  * Calculate fft_2 = absorb_nabla2 .* fft_2 \n
1639  *
1640  * @param [in,out] fft1 - Nabla1 part
1641  * @param [in,out] fft2 - Nabla2 part
1642  * @param [in] nabla1
1643  * @param [in] nabla2
1644  */
1645 __global__ void CUDACompute_Absorb_nabla1_2(cuFloatComplex* fft1,
1646  cuFloatComplex* fft2,
1647  const float* nabla1,
1648  const float* nabla2)
1649 {
1650  for(auto i = GetIndex(); i < cudaDeviceConstants.nElementsComplex; i += GetStride())
1651  {
1652  fft1[i] *= nabla1[i];
1653  fft2[i] *= nabla2[i];
1654  }
1655 }// end of CUDACompute_Absorb_nabla1_2
1656 //--------------------------------------------------------------------------------------------------
1657 
1658 /**
1659  * Interface to the kernel which computes the absorbing term with absorb_nabla1 and absorb_nabla2. \n
1660  * Calculate fft_1 = absorb_nabla1 .* fft_1 \n
1661  * Calculate fft_2 = absorb_nabla2 .* fft_2 \n
1662  *
1663  * @param [in,out] fft1 - Nabla1 part
1664  * @param [in,out] fft2 - Nabla2 part
1665  * @param [in] absorb_nabla1
1666  * @param [in] absorb_nabla2
1667  */
1668 void SolverCUDAKernels::ComputeAbsorbtionTerm(TCUFFTComplexMatrix& fft1,
1669  TCUFFTComplexMatrix& fft2,
1670  const TRealMatrix& absorb_nabla1,
1671  const TRealMatrix& absorb_nabla2)
1672 {
1673  CUDACompute_Absorb_nabla1_2<<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
1674  (reinterpret_cast<cuFloatComplex*> (fft1.GetDeviceData()),
1675  reinterpret_cast<cuFloatComplex*> (fft2.GetDeviceData()),
1676  absorb_nabla1.GetDeviceData(),
1677  absorb_nabla2.GetDeviceData());
1678 
1679  // check for errors
1680  checkCudaErrors(cudaGetLastError());
1681 }// end of ComputeAbsorbtionTerm
1682 //--------------------------------------------------------------------------------------------------
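The same element-wise scaling can be written on the host with std::complex (a sketch, not part of the solver; the GPU kernel does this in place on cuFloatComplex data):

#include <complex>
#include <cstddef>

// Host-side reference of CUDACompute_Absorb_nabla1_2.
void referenceAbsorbNabla(std::complex<float>* fft1, std::complex<float>* fft2,
                          const float* nabla1, const float* nabla2,
                          const std::size_t nElementsComplex)
{
  for (std::size_t i = 0; i < nElementsComplex; i++)
  {
    fft1[i] *= nabla1[i];   // fft_1 = absorb_nabla1 .* fft_1
    fft2[i] *= nabla2[i];   // fft_2 = absorb_nabla2 .* fft_2
  }
}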
1683 
1684 
1685 /**
1686  * CUDA kernel which sums sub-terms to calculate the new pressure, non-linear case.
1687  *
1688  * @param [out] p          - New value of pressure
1689  * @param [in] BonA_temp   - Non-linear term: BonA * rho^2 / (2 * rho0) + rho
1690  * @param [in] c2_matrix   - Heterogeneous value for c^2
1691  * @param [in] absorb_tau  - Absorbing sub-term filtered by absorb_nabla1
1692  * @param [in] tau_matrix  - Heterogeneous value for absorb_tau
1693  * @param [in] absorb_eta  - Absorbing sub-term filtered by absorb_nabla2
1694  * @param [in] eta_matrix  - Heterogeneous value for absorb_eta
1695  */
1696 template <bool is_c2_scalar, bool is_tau_eta_scalar>
1697 __global__ void CUDASumPressureTermsNonlinear(float* p,
1698  const float* BonA_temp,
1699  const float* c2_matrix,
1700  const float* absorb_tau,
1701  const float* tau_matrix,
1702  const float* absorb_eta,
1703  const float* eta_matrix)
1704 {
1705  for(auto i = GetIndex(); i < cudaDeviceConstants.nElements; i += GetStride())
1706  {
1707  const float c2 = (is_c2_scalar) ? cudaDeviceConstants.c2 : c2_matrix[i];
1708  const float tau = (is_tau_eta_scalar) ? cudaDeviceConstants.absorb_tau_scalar : tau_matrix[i];
1709  const float eta = (is_tau_eta_scalar) ? cudaDeviceConstants.absorb_eta_scalar : eta_matrix[i];
1710 
1711  p[i] = c2 * (BonA_temp[i] + (cudaDeviceConstants.fftDivider *
1712  ((absorb_tau[i] * tau) - (absorb_eta[i] * eta))));
1713  }
1714 }// end of CUDASumPressureTermsNonlinear
1715 //--------------------------------------------------------------------------------------------------
1716 
1717 /**
1718  * Interface to the kernel which sums sub-terms to calculate the new pressure, non-linear case.
1719  * @param [in,out] p             - New value of pressure
1720  * @param [in] BonA_temp         - Non-linear term: BonA * rho^2 / (2 * rho0) + rho
1721  * @param [in] is_c2_scalar      - Is c^2 a scalar value (homogeneous medium)?
1722  * @param [in] c2_matrix         - Heterogeneous value for c^2
1723  * @param [in] is_tau_eta_scalar - Are absorb_tau and absorb_eta scalar values?
1724  * @param [in] absorb_tau        - Absorbing sub-term filtered by absorb_nabla1
1725  * @param [in] tau_matrix        - Heterogeneous value for absorb_tau
1726  * @param [in] absorb_eta        - Absorbing sub-term filtered by absorb_nabla2
1727  * @param [in] eta_matrix        - Heterogeneous value for absorb_eta
1728  */
1729 void SolverCUDAKernels::SumPressureTermsNonlinear(TRealMatrix& p,
1730  const TRealMatrix& BonA_temp,
1731  const bool is_c2_scalar,
1732  const float* c2_matrix,
1733  const bool is_tau_eta_scalar,
1734  const float* absorb_tau,
1735  const float* tau_matrix,
1736  const float* absorb_eta,
1737  const float* eta_matrix)
1738 {
1739  if (is_c2_scalar)
1740  {
1741  if (is_tau_eta_scalar)
1742  {
1743  CUDASumPressureTermsNonlinear<true, true>
1744  <<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
1745  (p.GetDeviceData(),
1746  BonA_temp.GetDeviceData(),
1747  c2_matrix,
1748  absorb_tau,
1749  tau_matrix,
1750  absorb_eta,
1751  eta_matrix);
1752  }
1753  else
1754  {
1755  CUDASumPressureTermsNonlinear<true, false>
1756  <<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
1757  (p.GetDeviceData(),
1758  BonA_temp.GetDeviceData(),
1759  c2_matrix,
1760  absorb_tau,
1761  tau_matrix,
1762  absorb_eta,
1763  eta_matrix);
1764  }
1765  }
1766  else
1767  { // c2 is matrix
1768  if (is_tau_eta_scalar)
1769  {
1770  CUDASumPressureTermsNonlinear<false, true>
1771  <<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
1772  (p.GetDeviceData(),
1773  BonA_temp.GetDeviceData(),
1774  c2_matrix,
1775  absorb_tau,
1776  tau_matrix,
1777  absorb_eta,
1778  eta_matrix);
1779  }
1780  else
1781  {
1782  CUDASumPressureTermsNonlinear<false, false>
1783  <<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
1784  (p.GetDeviceData(),
1785  BonA_temp.GetDeviceData(),
1786  c2_matrix,
1787  absorb_tau,
1788  tau_matrix,
1789  absorb_eta,
1790  eta_matrix);
1791  }
1792  }
1793  // check for errors
1794  checkCudaErrors(cudaGetLastError());
1795 }// end of SumPressureTermsNonlinear
1796 //--------------------------------------------------------------------------------------------------
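In the notation of the kernel above, every grid point is updated as follows (tau and eta are either the scalar constants from constant memory or the per-element matrices, and fftDivider is the 3D FFT normalisation):

p_i = c_i^2 \left( \mathrm{BonA\_temp}_i + \mathrm{fftDivider} \left( \mathrm{absorb\_tau}_i \, \tau_i - \mathrm{absorb\_eta}_i \, \eta_i \right) \right)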
1797 
1798 
1799 /**
1800  * CUDA kernel that sums sub-terms to calculate new pressure, linear case.
1801  *
1802  * @param [out] p - new value of p
1803  * @param [in] absorb_tau_temp - sub-term with absorb_tau
1804  * @param [in] absorb_eta_temp - sub-term with absorb_eta
1805  * @param [in] sum_rhoxyz - rhox + rhoy + rhoz
1806  * @param [in] c2_matrix
1807  * @param [in] tau_matrix
1808  * @param [in] eta_matrix
1809  */
1810 template <bool is_c2_scalar, bool is_tau_eta_scalar>
1811 __global__ void CUDASumPressureTermsLinear(float* p,
1812  const float* absorb_tau_temp,
1813  const float* absorb_eta_temp,
1814  const float* sum_rhoxyz,
1815  const float* c2_matrix,
1816  const float* tau_matrix,
1817  const float* eta_matrix)
1818 {
1819  for(auto i = GetIndex(); i < cudaDeviceConstants.nElements; i += GetStride())
1820  {
1821  const float c2 = (is_c2_scalar) ? cudaDeviceConstants.c2 : c2_matrix[i];
1822  const float tau = (is_tau_eta_scalar) ? cudaDeviceConstants.absorb_tau_scalar : tau_matrix[i];
1823  const float eta = (is_tau_eta_scalar) ? cudaDeviceConstants.absorb_eta_scalar : eta_matrix[i];
1824 
1825  p[i] = c2 * (sum_rhoxyz[i] + (cudaDeviceConstants.fftDivider *
1826  (absorb_tau_temp[i] * tau - absorb_eta_temp[i] * eta)));
1827  }
1828 }// end of CUDASumPressureTermsLinear
1829 //--------------------------------------------------------------------------------------------------
1830 
1831 
1832 /**
1833  * Interface to kernel that sums sub-terms to calculate new pressure, linear case.
1834  * @param [out] p - New value of p
1835  * @param [in] absorb_tau_temp - Sub-term with absorb_tau
1836  * @param [in] absorb_eta_temp - Sub-term with absorb_eta
1837  * @param [in] sum_rhoxyz - rhox + rhoy + rhoz
1838  * @param [in] is_c2_scalar
1839  * @param [in] c2_matrix
1840  * @param [in] is_tau_eta_scalar
1841  * @param [in] tau_matrix
1842  * @param [in] eta_matrix
1843  */
1844 void SolverCUDAKernels::SumPressureTermsLinear(TRealMatrix& p,
1845  const TRealMatrix& absorb_tau_temp,
1846  const TRealMatrix& absorb_eta_temp,
1847  const TRealMatrix& sum_rhoxyz,
1848  const bool is_c2_scalar,
1849  const float* c2_matrix,
1850  const bool is_tau_eta_scalar,
1851  const float* tau_matrix,
1852  const float* eta_matrix)
1853 {
1854  if (is_c2_scalar)
1855  {
1856  if (is_tau_eta_scalar)
1857  {
1858  CUDASumPressureTermsLinear<true,true>
1859  <<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
1860  (p.GetDeviceData(),
1861  absorb_tau_temp.GetDeviceData(),
1862  absorb_eta_temp.GetDeviceData(),
1863  sum_rhoxyz.GetDeviceData(),
1864  c2_matrix,
1865  tau_matrix,
1866  eta_matrix);
1867  }
1868  else
1869  {
1870  CUDASumPressureTermsLinear<true,false>
1871  <<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
1872  (p.GetDeviceData(),
1873  absorb_tau_temp.GetDeviceData(),
1874  absorb_eta_temp.GetDeviceData(),
1875  sum_rhoxyz.GetDeviceData(),
1876  c2_matrix,
1877  tau_matrix,
1878  eta_matrix);
1879  }
1880  }
1881  else
1882  {
1883  if (is_tau_eta_scalar)
1884  {
1885  CUDASumPressureTermsLinear<false,true>
1886  <<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
1887  (p.GetDeviceData(),
1888  absorb_tau_temp.GetDeviceData(),
1889  absorb_eta_temp.GetDeviceData(),
1890  sum_rhoxyz.GetDeviceData(),
1891  c2_matrix,
1892  tau_matrix,
1893  eta_matrix);
1894  }
1895  else
1896  {
1897  CUDASumPressureTermsLinear<false,false>
1898  <<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
1899  (p.GetDeviceData(),
1900  absorb_tau_temp.GetDeviceData(),
1901  absorb_eta_temp.GetDeviceData(),
1902  sum_rhoxyz.GetDeviceData(),
1903  c2_matrix,
1904  tau_matrix,
1905  eta_matrix);
1906  }
1907  }
1908  // check for errors
1909  checkCudaErrors(cudaGetLastError());
1910 }// end of SumPressureTermsLinear
1911 //--------------------------------------------------------------------------------------------------
1912 
1913 
1914 /**
1915  * CUDA kernel that sums sub-terms for new p, non-linear lossless case.
1916  *
1917  * @param [out] p - New value of pressure
1918  * @param [in] rhox
1919  * @param [in] rhoy
1920  * @param [in] rhoz
1921  * @param [in] c2_matrix
1922  * @param [in] BonA_matrix
1923  * @param [in] rho0_matrix
1924  */
1925 template<bool is_c2_scalar, bool is_BonA_scalar, bool is_rho0_scalar>
1926 __global__ void CUDASumPressureNonlinearLossless(float* p,
1927  const float* rhox,
1928  const float* rhoy,
1929  const float* rhoz,
1930  const float* c2_matrix,
1931  const float* BonA_matrix,
1932  const float* rho0_matrix)
1933 {
1934  for(auto i = GetIndex(); i < cudaDeviceConstants.nElements; i += GetStride())
1935  {
1936  const float c2 = (is_c2_scalar) ? cudaDeviceConstants.c2 : c2_matrix[i];
1937  const float BonA = (is_BonA_scalar) ? cudaDeviceConstants.BonA_scalar : BonA_matrix[i];
1938  const float rho0 = (is_rho0_scalar) ? cudaDeviceConstants.rho0_scalar : rho0_matrix[i];
1939 
1940  const float sum_rho = rhox[i] + rhoy[i] + rhoz[i];
1941 
1942  p[i] = c2 * (sum_rho + (BonA * (sum_rho * sum_rho) / (2.0f * rho0)));
1943  }
1944 }// end of CUDASumPressureNonlinearLossless
1945 //--------------------------------------------------------------------------------------------------
1946 
1947 /**
1948  * Interface to kernel that sums sub-terms for new p, non-linear lossless case.
1949  * @param [out] p - New value of pressure
1950  * @param [in] rhox
1951  * @param [in] rhoy
1952  * @param [in] rhoz
1953  * @param [in] is_c2_scalar
1954  * @param [in] c2_matrix
1955  * @param [in] is_BonA_scalar
1956  * @param [in] BonA_matrix
1957  * @param [in] is_rho0_scalar
1958  * @param [in] rho0_matrix
1959  */
1960 void SolverCUDAKernels::SumPressureNonlinearLossless(TRealMatrix& p,
1961  const TRealMatrix& rhox,
1962  const TRealMatrix& rhoy,
1963  const TRealMatrix& rhoz,
1964  const bool is_c2_scalar,
1965  const float* c2_matrix,
1966  const bool is_BonA_scalar,
1967  const float* BonA_matrix,
1968  const bool is_rho0_scalar,
1969  const float* rho0_matrix)
1970 {
1971  if (is_c2_scalar)
1972  {
1973  if (is_BonA_scalar)
1974  {
1975  if (is_rho0_scalar)
1976  {
1977  CUDASumPressureNonlinearLossless<true, true, true>
1978  <<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
1979  (p.GetDeviceData(),
1980  rhox.GetDeviceData(),
1981  rhoy.GetDeviceData(),
1982  rhoz.GetDeviceData(),
1983  c2_matrix,
1984  BonA_matrix,
1985  rho0_matrix);
1986  }
1987  else
1988  {
1989  CUDASumPressureNonlinearLossless<true, true, false>
1990  <<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
1991  (p.GetDeviceData(),
1992  rhox.GetDeviceData(),
1993  rhoy.GetDeviceData(),
1994  rhoz.GetDeviceData(),
1995  c2_matrix,
1996  BonA_matrix,
1997  rho0_matrix);
1998  }
1999  }// end of is_BonA_scalar == true
2000  else
2001  {
2002  if (is_rho0_scalar)
2003  {
2004  CUDASumPressureNonlinearLossless<true, false, true>
2005  <<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
2006  (p.GetDeviceData(),
2007  rhox.GetDeviceData(),
2008  rhoy.GetDeviceData(),
2009  rhoz.GetDeviceData(),
2010  c2_matrix,
2011  BonA_matrix,
2012  rho0_matrix);
2013  }
2014  else
2015  {
2016  CUDASumPressureNonlinearLossless<true, false, false>
2017  <<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
2018  (p.GetDeviceData(),
2019  rhox.GetDeviceData(),
2020  rhoy.GetDeviceData(),
2021  rhoz.GetDeviceData(),
2022  c2_matrix,
2023  BonA_matrix,
2024  rho0_matrix);
2025  }
2026  }
2027  }
2028  else
2029  { // is_c2_scalar == false
2030  if (is_BonA_scalar)
2031  {
2032  if (is_rho0_scalar)
2033  {
2034  CUDASumPressureNonlinearLossless<false, true, true>
2035  <<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
2036  (p.GetDeviceData(),
2037  rhox.GetDeviceData(),
2038  rhoy.GetDeviceData(),
2039  rhoz.GetDeviceData(),
2040  c2_matrix,
2041  BonA_matrix,
2042  rho0_matrix);
2043  }
2044  else
2045  {
2046  CUDASumPressureNonlinearLossless<false, true, false>
2047  <<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
2048  (p.GetDeviceData(),
2049  rhox.GetDeviceData(),
2050  rhoy.GetDeviceData(),
2051  rhoz.GetDeviceData(),
2052  c2_matrix,
2053  BonA_matrix,
2054  rho0_matrix);
2055  }
2056  }// end of is_BonA_scalar == true
2057  else
2058  {
2059  if (is_rho0_scalar)
2060  {
2061  CUDASumPressureNonlinearLossless<false, false, true>
2062  <<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
2063  (p.GetDeviceData(),
2064  rhox.GetDeviceData(),
2065  rhoy.GetDeviceData(),
2066  rhoz.GetDeviceData(),
2067  c2_matrix,
2068  BonA_matrix,
2069  rho0_matrix);
2070  }
2071  else
2072  {
2073  CUDASumPressureNonlinearLossless<false, false, false>
2074  <<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
2075  (p.GetDeviceData(),
2076  rhox.GetDeviceData(),
2077  rhoy.GetDeviceData(),
2078  rhoz.GetDeviceData(),
2079  c2_matrix,
2080  BonA_matrix,
2081  rho0_matrix);
2082  }
2083  }
2084  }
2085 
2086  // check for errors
2087  checkCudaErrors(cudaGetLastError());
2088 }// end of SumPressureNonlinearLossless
2089 //--------------------------------------------------------------------------------------------------
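Per grid point, the lossless non-linear update computed above reads:

\rho_i = \rho_{x,i} + \rho_{y,i} + \rho_{z,i}, \qquad
p_i = c_i^2 \left( \rho_i + \frac{(B/A)_i \, \rho_i^2}{2 \rho_{0,i}} \right)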
2090 
2091 
2092 /**
2093  * CUDA kernel that calculates two temporary sums in the new pressure
2094  * formula, linear absorbing case.
2095  *
2096  * @param [out] sum_rhoxyz - rhox + rhoy + rhoz
2097  * @param [out] sum_rho0_du - rho0 * (duxdx + duydy + duzdz)
2098  * @param [in] rhox
2099  * @param [in] rhoy
2100  * @param [in] rhoz
2101  * @param [in] dux
2102  * @param [in] duy
2103  * @param [in] duz
2104  * @param [in] rho0_matrix
2105  */
2106 template<bool is_rho0_scalar>
2107 __global__ void CUDAComputePressurePartsLinear(float* sum_rhoxyz,
2108  float* sum_rho0_du,
2109  const float* rhox,
2110  const float* rhoy,
2111  const float* rhoz,
2112  const float* dux,
2113  const float* duy,
2114  const float* duz,
2115  const float* rho0_matrix)
2116 {
2117  for(auto i = GetIndex(); i < cudaDeviceConstants.nElements; i += GetStride())
2118  {
2119  const float rho0 = (is_rho0_scalar) ? cudaDeviceConstants.rho0_scalar : rho0_matrix[i];
2120 
2121  sum_rhoxyz[i] = rhox[i] + rhoy[i] + rhoz[i];
2122  sum_rho0_du[i] = rho0 * (dux[i] + duy[i] + duz[i]);
2123  }
2124 }// end of CUDAComputePressurePartsLinear
2125 //--------------------------------------------------------------------------------------------------
2126 
2127 /**
2128  * Interface to the kernel that calculates two temporary sums in the new pressure
2129  * formula, linear absorbing case.
2130  * @param [out] sum_rhoxyz - rhox + rhoy + rhoz
2131  * @param [out] sum_rho0_du - rho0 * (duxdx + duydy + duzdz)
2132  * @param [in] rhox
2133  * @param [in] rhoy
2134  * @param [in] rhoz
2135  * @param [in] duxdx
2136  * @param [in] duydy
2137  * @param [in] duzdz
2138  * @param [in] is_rho0_scalar
2139  * @param [in] rho0_matrix
2140  */
2141 void SolverCUDAKernels::ComputePressurePartsLinear(TRealMatrix& sum_rhoxyz,
2142  TRealMatrix& sum_rho0_du,
2143  const TRealMatrix& rhox,
2144  const TRealMatrix& rhoy,
2145  const TRealMatrix& rhoz,
2146  const TRealMatrix& duxdx,
2147  const TRealMatrix& duydy,
2148  const TRealMatrix& duzdz,
2149  const bool is_rho0_scalar,
2150  const float* rho0_matrix)
2151 {
2152  if (is_rho0_scalar)
2153  {
2154  CUDAComputePressurePartsLinear<true>
2155  <<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
2156  (sum_rhoxyz.GetDeviceData(),
2157  sum_rho0_du.GetDeviceData(),
2158  rhox.GetDeviceData(),
2159  rhoy.GetDeviceData(),
2160  rhoz.GetDeviceData(),
2161  duxdx.GetDeviceData(),
2162  duydy.GetDeviceData(),
2163  duzdz.GetDeviceData(),
2164  rho0_matrix);
2165  }
2166  else
2167  {
2168  CUDAComputePressurePartsLinear<false>
2169  <<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
2170  (sum_rhoxyz.GetDeviceData(),
2171  sum_rho0_du.GetDeviceData(),
2172  rhox.GetDeviceData(),
2173  rhoy.GetDeviceData(),
2174  rhoz.GetDeviceData(),
2175  duxdx.GetDeviceData(),
2176  duydy.GetDeviceData(),
2177  duzdz.GetDeviceData(),
2178  rho0_matrix);
2179  }
2180  // check for errors
2181  checkCudaErrors(cudaGetLastError());
2182 }// end of ComputePressurePartsLinear
2183 //--------------------------------------------------------------------------------------------------
2184 
2185 /**
2186  * CUDA kernel that sums sub-terms for new p, linear lossless case.
2187  *
2188  * @param [out] p
2189  * @param [in] rhox
2190  * @param [in] rhoy
2191  * @param [in] rhoz
2192  * @param [in] c2_matrix
2193 
2194  */
2195 template <bool is_c2_scalar>
2196 __global__ void CUDASum_new_p_linear_lossless(float* p,
2197  const float* rhox,
2198  const float* rhoy,
2199  const float* rhoz,
2200  const float* c2_matrix)
2201 {
2202  for(auto i = GetIndex(); i < cudaDeviceConstants.nElements; i += GetStride())
2203  {
2204  const float c2 = (is_c2_scalar) ? cudaDeviceConstants.c2 : c2_matrix[i];
2205  p[i] = c2 * (rhox[i] + rhoy[i] + rhoz[i]);
2206  }
2207 }// end of CUDASum_new_p_linear_lossless
2208 //--------------------------------------------------------------------------------------------------
2209 
2210 /**
2211  * Interface to kernel that sums sub-terms for new p, linear lossless case.
2212  * @param [out] p
2213  * @param [in] rhox
2214  * @param [in] rhoy
2215  * @param [in] rhoz
2216  * @param [in] is_c2_scalar
2217  * @param [in] c2_matrix
2218  */
2219 void SolverCUDAKernels::SumPressureLinearLossless(TRealMatrix& p,
2220  const TRealMatrix& rhox,
2221  const TRealMatrix& rhoy,
2222  const TRealMatrix& rhoz,
2223  const bool is_c2_scalar,
2224  const float* c2_matrix)
2225 {
2226  if (is_c2_scalar)
2227  {
2228  CUDASum_new_p_linear_lossless<true>
2229  <<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
2230  (p.GetDeviceData(),
2231  rhox.GetDeviceData(),
2232  rhoy.GetDeviceData(),
2233  rhoz.GetDeviceData(),
2234  c2_matrix);
2235  }
2236  else
2237  {
2238  CUDASum_new_p_linear_lossless<false>
2239  <<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
2240  (p.GetDeviceData(),
2241  rhox.GetDeviceData(),
2242  rhoy.GetDeviceData(),
2243  rhoz.GetDeviceData(),
2244  c2_matrix);
2245  }
2246  // check for errors
2247  checkCudaErrors(cudaGetLastError());
2248 }// end of SumPressureLinearLossless
2249 //--------------------------------------------------------------------------------------------------
2250 
2251 
2252 
2253 /**
2254  * CUDA kernel to transpose a 3D matrix in XY planes if the dim sizes are divisible by 32 in X and
2255  * Y axes.
2256  * Every block in a 1D grid transposes a few slabs.
2257  * Every block is composed of a 2D mesh of threads. The y dim is for up to 4 tiles.
2258  * Each tile is processed by a single 32-thread warp.
2259  * The shared memory is used to coalesce memory accesses and the padding is to eliminate bank
2260  * conflicts.
2261  *
2262  * @param [out] outputMatrix - Output matrix
2263  * @param [in] inputMatrix - Input matrix
2264  * @param [in] dimSizes - Dimension sizes of the original matrix
2265  *
2266  * @warning The X and Y dimension sizes have to be divisible by 32.
2267  *
2268  * @warning blockDim.x has to be 32 (one warp) \n
2269  * blockDim.y has to be between 1 and 4 (number of tiles processed at once) \n
2270  * blockDim.z must stay 1 \n
2271  * The grid has to be organized as (N, 1, 1).
2272  *
2273  */
2274 __global__ void CUDATrasnposeReal3DMatrixXYSquare(float* outputMatrix,
2275  const float* inputMatrix,
2276  const dim3 dimSizes)
2277 {
2278  // this size is fixed shared memory
2279  // we transpose 4 tiles of 32*32 at the same time, +1 solves bank conflicts
2280  ///@todo - What about Warp shuffle?
2281  ///@todo http://www.pixel.io/blog/2013/3/25/fast-matrix-transposition-on-kepler-without-using-shared-mem.html
2282  volatile __shared__ float sharedTile[4][32][32+1];
2283 
2284  // run over all slabs, one block per slab
2285  for (auto slabIdx = blockIdx.x; slabIdx < dimSizes.z; slabIdx += gridDim.x)
2286  {
2287  // calculate offset of the slab
2288  const float * inputSlab = inputMatrix + (dimSizes.x * dimSizes.y * slabIdx);
2289  float * outputSlab = outputMatrix + (dimSizes.x * dimSizes.y * slabIdx);
2290 
2291  dim3 tileIdx {0,0,0};
2292  dim3 tileCount {dimSizes.x >> 5, dimSizes.y >> 5, 1};
2293 
2294  // go over all tile rows, transposing up to 4 of them at the same time
2295  for (tileIdx.y = threadIdx.y; tileIdx.y < tileCount.y; tileIdx.y += blockDim.y)
2296  {
2297  // go over all tiles in the row
2298  for (tileIdx.x = 0; tileIdx.x < tileCount.x; tileIdx.x++)
2299  {
2300  // Go over one tile and load data, unroll does not help
2301  for (auto row = 0; row < 32; row++)
2302  {
2303  sharedTile[threadIdx.y][row][threadIdx.x]
2304  = inputSlab[(tileIdx.y * 32 + row) * dimSizes.x +
2305  (tileIdx.x * 32) + threadIdx.x];
2306  } // load data
2307 
2308  // no need for barrier - warp synchronous programming
2309 
2310  // Go over one tile and store data, unroll does not help
2311  for (auto row = 0; row < 32; row ++)
2312  {
2313  outputSlab[(tileIdx.x * 32 + row) * dimSizes.y +
2314  (tileIdx.y * 32) + threadIdx.x]
2315  = sharedTile[threadIdx.y][threadIdx.x][row];
2316 
2317  } // store data
2318  } // tile X
2319  }// tile Y
2320  } //slab
2321 }// end of CUDATrasnposeReal3DMatrixXYSquare
2322 //--------------------------------------------------------------------------------------------------
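A naive host-side transpose makes the index mapping of the tiled kernel explicit and can serve as a correctness reference (a sketch only; the function name is illustrative):

#include <cstddef>

// Reference XY transpose of each Z slab: out[z][x][y] = in[z][y][x].
void referenceTransposeXY(float* out, const float* in,
                          const std::size_t nx, const std::size_t ny, const std::size_t nz)
{
  for (std::size_t z = 0; z < nz; z++)
    for (std::size_t y = 0; y < ny; y++)
      for (std::size_t x = 0; x < nx; x++)
      {
        out[z * nx * ny + x * ny + y] = in[z * nx * ny + y * nx + x];
      }
}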
2323 
2324 
2325 /**
2326  * CUDA kernel to transpose a 3D matrix in XY planes of any dimension sizes
2327  * Every block in a 1D grid transposes a few slabs.
2328  * Every block is composed of a 2D mesh of threads. The y dim is for up to 4 tiles.
2329  * Each tile is processed by a single 32-thread warp.
2330  * The shared memory is used to coalesce memory accesses and the padding is to eliminate bank
2331  * conflicts. First the full tiles are transposed, then the remainder in the X, then Y and finally
2332  * the last bit in the bottom right corner.
2333  *
2334  * @param [out] outputMatrix - Output matrix
2335  * @param [in] inputMatrix - Input matrix
2336  * @param [in] dimSizes - Dimension sizes of the original matrix
2337  *
2338  * @warning Unlike the square variant, the X and Y dimension sizes may be arbitrary.
2339  *
2340  * @warning blockDim.x has to be 32 (one warp) \n
2341  * blockDim.y has to be between 1 and 4 (number of tiles processed at once) \n
2342  * blockDim.z must stay 1 \n
2343  * The grid has to be organized as (N, 1, 1).
2344  *
2345  */
2346 __global__ void CUDATrasnposeReal3DMatrixXYRect(float* outputMatrix,
2347  const float* inputMatrix,
2348  const dim3 dimSizes)
2349 {
2350  // this size is fixed shared memory
2351  // we transpose 4 tiles of 32*32 at the same time, +1 solves bank conflicts
2352  volatile __shared__ float sharedTile[4][32][32+1];
2353 
2354  // run over all slabs, one block per slab
2355  for (auto slabIdx = blockIdx.x; slabIdx < dimSizes.z; slabIdx += gridDim.x)
2356  {
2357  // calculate offset of the slab
2358  const float * inputSlab = inputMatrix + (dimSizes.x * dimSizes.y * slabIdx);
2359  float * outputSlab = outputMatrix + (dimSizes.x * dimSizes.y * slabIdx);
2360 
2361  dim3 tileIdx = {0,0,0};
2362  dim3 tileCount = {dimSizes.x >> 5, dimSizes.y >> 5, 1};
2363 
2364  // go over all tile rows, transposing up to 4 of them at the same time
2365  for (tileIdx.y = threadIdx.y; tileIdx.y < tileCount.y; tileIdx.y += blockDim.y)
2366  {
2367  //--------------------------- full tiles in X --------------------------//
2368  // go over all full tiles in the row
2369  for (tileIdx.x = 0; tileIdx.x < tileCount.x; tileIdx.x++)
2370  {
2371  // Go over one tile and load data, unroll does not help
2372  for (auto row = 0; row < 32; row++)
2373  {
2374  sharedTile[threadIdx.y][row][threadIdx.x]
2375  = inputSlab[(tileIdx.y * 32 + row) * dimSizes.x +
2376  (tileIdx.x * 32) + threadIdx.x];
2377  } // load data
2378  // no need for barrier - warp synchronous programming
2379  // Go over one tile and store data, unroll does not help
2380  for (auto row = 0; row < 32; row ++)
2381  {
2382  outputSlab[(tileIdx.x * 32 + row) * dimSizes.y +
2383  (tileIdx.y * 32) + threadIdx.x]
2384  = sharedTile[threadIdx.y][threadIdx.x][row];
2385 
2386  } // store data
2387  } // tile X
2388 
2389  //--------------------------- remainders in X --------------------------//
2390  // load the remainder tile in X (columns that cannot fill a full 32-thread warp)
2391  if ((tileCount.x * 32 + threadIdx.x) < dimSizes.x)
2392  {
2393  for (auto row = 0; row < 32; row++)
2394  {
2395  sharedTile[threadIdx.y][row][threadIdx.x]
2396  = inputSlab[(tileIdx.y * 32 + row) * dimSizes.x +
2397  (tileCount.x * 32) + threadIdx.x];
2398  }
2399  }// load
2400 
2401  // store the remainder tile in X (rows that cannot fill a full warp)
2402  for (auto row = 0; (tileCount.x * 32 + row) < dimSizes.x; row++)
2403  {
2404  outputSlab[(tileCount.x * 32 + row) * dimSizes.y +
2405  (tileIdx.y * 32) + threadIdx.x]
2406  = sharedTile[threadIdx.y][threadIdx.x][row];
2407  }// store
2408  }// tile Y
2409 
2410  //--------------------------- remainders in Y --------------------------//
2411  // go over the remainder tiles in Y (rows that cannot fill a full warp)
2412  // go over all full tiles in the row, processed in parallel
2413  for (tileIdx.x = threadIdx.y; tileIdx.x < tileCount.x; tileIdx.x += blockDim.y)
2414  {
2415  // go over the remainder tile in Y (only a few rows)
2416  for (auto row = 0; (tileCount.y * 32 + row) < dimSizes.y; row++)
2417  {
2418  sharedTile[threadIdx.y][row][threadIdx.x]
2419  = inputSlab[(tileCount.y * 32 + row) * dimSizes.x +
2420  (tileIdx.x * 32) + threadIdx.x];
2421  } // load
2422 
2423  // go over the remainder tile in Y (and store only columns)
2424  if ((tileCount.y * 32 + threadIdx.x) < dimSizes.y)
2425  {
2426  for (auto row = 0; row < 32 ; row++)
2427  {
2428  outputSlab[(tileIdx.x * 32 + row) * dimSizes.y +
2429  (tileCount.y * 32) + threadIdx.x]
2430  = sharedTile[threadIdx.y][threadIdx.x][row];
2431  }
2432  }// store
2433  }// remainder Y
2434 
2435  //------------------------ remainder in X and Y ----------------------------//
2436  if (threadIdx.y == 0)
2437  {
2438  // go over the remainder tile in X and Y (only a few rows and columns)
2439  if ((tileCount.x * 32 + threadIdx.x) < dimSizes.x)
2440  {
2441  for (auto row = 0; (tileCount.y * 32 + row) < dimSizes.y; row++)
2442  {
2443  sharedTile[threadIdx.y][row][threadIdx.x]
2444  = inputSlab[(tileCount.y * 32 + row) * dimSizes.x +
2445  (tileCount.x * 32) + threadIdx.x];
2446  } // load
2447  }
2448 
2449  // go over the remainder tile in Y (and store only columns)
2450  if ((tileCount.y * 32 + threadIdx.x) < dimSizes.y)
2451  {
2452  for (auto row = 0; (tileCount.x * 32 + row) < dimSizes.x ; row++)
2453  {
2454  outputSlab[(tileCount.x * 32 + row) * dimSizes.y +
2455  (tileCount.y * 32) + threadIdx.x]
2456  = sharedTile[threadIdx.y][threadIdx.x][row];
2457  }
2458  }// store
2459  }// remainder X and Y
2460  } //slab
2461 }// end of CUDATrasnposeReal3DMatrixXYRect
2462 //--------------------------------------------------------------------------------------------------
2463 
2464 
2465 /**
2466  * Transpose a real 3D matrix in the X-Y direction. It is done out-of-place.
2467  *
2468  * @param [out] outputMatrix - Output matrix data
2469  * @param [in] inputMatrix - Input matrix data
2470  * @param [in] dimSizes - Dimension sizes of the original matrix
2471  */
2472 void SolverCUDAKernels::TrasposeReal3DMatrixXY(float* outputMatrix,
2473  const float* inputMatrix,
2474  const dim3& dimSizes)
2475 {
2476  // fixed size at the moment, may be tuned based on the domain shape in the future
2477  if ((dimSizes.x % 32 == 0) && (dimSizes.y % 32 == 0))
2478  {
2479  // when the dims are multiples of 32, then use a faster implementation
2480  CUDATrasnposeReal3DMatrixXYSquare<<<GetSolverTransposeGirdSize(),GetSolverTransposeBlockSize() >>>
2481  (outputMatrix, inputMatrix, dimSizes);
2482  }
2483  else
2484  {
2485  CUDATrasnposeReal3DMatrixXYRect<<<GetSolverTransposeGirdSize(), GetSolverTransposeBlockSize() >>>
2486  (outputMatrix, inputMatrix, dimSizes);
2487  }
2488 
2489  // check for errors
2490  checkCudaErrors(cudaGetLastError());
2491 }// end of TrasposeReal3DMatrixXY
2492 //--------------------------------------------------------------------------------------------------
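A hypothetical call site (d_out and d_in are assumed to be device buffers of nx * ny * nz floats; all names are illustrative):

// Transpose the XY planes of a 3D device matrix, out of place.
const dim3 dimSizes(nx, ny, nz);
SolverCUDAKernels::TrasposeReal3DMatrixXY(d_out, d_in, dimSizes);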
2493 
2494 
2495 
2496 /**
2497  * CUDA kernel to transpose a 3D matrix in XZ planes if the dim sizes are divisible
2498  * by 32 in X and Z axes.
2499  * Every block in a 1D grid transposes a few slabs.
2500  * Every block is composed of a 2D mesh of threads. The y dim is for up to 4 tiles.
2501  * Each tile is processed by a single 32-thread warp.
2502  * The shared memory is used to coalesce memory accesses and the padding is to
2503  * eliminate bank conflicts.
2504  *
2505  * @param [out] outputMatrix - Output matrix
2506  * @param [in] inputMatrix - Input matrix
2507  * @param [in] dimSizes - Dimension sizes of the original matrix
2508  *
2509  * @warning The X and Z dimension sizes have to be divisible by 32.
2510  *
2511  * @warning blockDim.x has to be 32 (one warp) \n
2512  * blockDim.y has to be between 1 and 4 (number of tiles processed at once) \n
2513  * blockDim.z must stay 1 \n
2514  * The grid has to be organized as (N, 1, 1).
2515  *
2516  */
2517 __global__ void CUDATrasnposeReal3DMatrixXZSquare(float* outputMatrix,
2518  const float* inputMatrix,
2519  const dim3 dimSizes)
2520 {
2521  // this size is fixed shared memory
2522  // we transpose 4 tiles of 32*32 at the same time, +1 solves bank conflicts
2523  volatile __shared__ float shared_tile[4][32][32+1];
2524 
2525  // run over all XZ slabs, one block per slab
2526  for (auto row = blockIdx.x; row < dimSizes.y; row += gridDim.x)
2527  {
2528  dim3 tileIdx = {0,0,0};
2529  dim3 tileCount = {dimSizes.x >> 5, dimSizes.z >> 5, 1};
2530 
2531  // go over all tiles in the slab; multiple tile rows are transposed at the same time
2532  for (tileIdx.y = threadIdx.y; tileIdx.y < tileCount.y; tileIdx.y += blockDim.y)
2533  {
2534  // go over all tiles in the row
2535  for (tileIdx.x = 0; tileIdx.x < tileCount.x; tileIdx.x ++)
2536  {
2537  // Go over one tile and load data, unroll does not help
2538  for (auto slab = 0; slab < 32; slab++)
2539  {
2540  shared_tile[threadIdx.y][slab][threadIdx.x]
2541  = inputMatrix[(tileIdx.y * 32 + slab) * (dimSizes.x * dimSizes.y) +
2542  (row * dimSizes.x) +
2543  tileIdx.x * 32 + threadIdx.x];
2544  } // load data
2545  // no need for barrier - warp synchronous programming
2546 
2547  // Go over one tile and store data, unroll does not help
2548  for (auto slab = 0; slab < 32; slab++)
2549  {
2550  outputMatrix[(tileIdx.x * 32 + slab) * (dimSizes.y * dimSizes.z) +
2551  row * dimSizes.z +
2552  tileIdx.y * 32 + threadIdx.x]
2553  = shared_tile[threadIdx.y][threadIdx.x][slab];
2554  } // store data
2555  } // tile X
2556  }// tile Y
2557  } //slab
2558 }// end of CUDATrasnposeReal3DMatrixXZSquare
2559 //--------------------------------------------------------------------------------------------------
2560 
2561 
2562 
2563 /**
2564  * CUDA kernel to transpose a 3D matrix in XZ planes of any dimension sizes
2565  * Every block in a 1D grid transposes a few slabs.
2566  * Every block is composed of a 2D mesh of threads. The y dim is for up to 4 tiles.
2567  * Each tile is processed by a single 32-thread warp.
2568  * The shared memory is used to coalesce memory accesses and the padding is to
2569  * eliminate bank conflicts.
2570  * First the full tiles are transposed, then the remainder in the X, then Y and finally the last
2571  * bit in the bottom right corner.
2572  *
2573  * @param [out] outputMatrix - Output matrix
2574  * @param [in] inputMatrix - Input matrix
2575  * @param [in] dimSizes - Dimension sizes of the original matrix
2576  *
2577  * @warning Unlike the square variant, the X and Z dimension sizes may be arbitrary.
2578  *
2579  * @warning blockDim.x has to be 32 (one warp) \n
2580  * blockDim.y has to be between 1 and 4 (number of tiles processed at once) \n
2581  * blockDim.z must stay 1 \n
2582  * The grid has to be organized as (N, 1, 1).
2583  *
2584  */
2585 __global__ void CUDATrasnposeReal3DMatrixXZRect(float* outputMatrix,
2586  const float* inputMatrix,
2587  const dim3 dimSizes)
2588 {
2589  // this size is fixed shared memory
2590  // we transpose 4 tiles of 32*32 at the same time, +1 solves bank conflicts
2591  volatile __shared__ float shared_tile[4][32][32+1];
2592 
2593  // run over all XZ slabs, one block per slab
2594  for (auto row = blockIdx.x; row < dimSizes.y; row += gridDim.x)
2595  {
2596  dim3 tileIdx = {0,0,0};
2597  dim3 tileCount = {dimSizes.x >> 5, dimSizes.z >> 5, 1};
2598 
2599  // go over all tiles in the XZ slab, transposing multiple tile rows at the same time (one per Z)
2600  for (tileIdx.y = threadIdx.y; tileIdx.y < tileCount.y; tileIdx.y += blockDim.y)
2601  {
2602  // go over all tiles in the row
2603  for (tileIdx.x = 0; tileIdx.x < tileCount.x; tileIdx.x++)
2604  {
2605  // Go over one tile and load data, unroll does not help
2606  for (auto slab = 0; slab < 32; slab++)
2607  {
2608  shared_tile[threadIdx.y][slab][threadIdx.x]
2609  = inputMatrix[(tileIdx.y * 32 + slab) * (dimSizes.x * dimSizes.y) +
2610  row * dimSizes.x +
2611  tileIdx.x * 32 + threadIdx.x];
2612  } // load data
2613 
2614  // no need for barrier - warp synchronous programming
2615 
2616  // Go over one tile and store data, unroll does not help
2617  for (auto slab = 0; slab < 32; slab++)
2618  {
2619  outputMatrix[(tileIdx.x * 32 + slab) * (dimSizes.y * dimSizes.z) +
2620  row * dimSizes.z +
2621  tileIdx.y * 32 + threadIdx.x]
2622  = shared_tile[threadIdx.y][threadIdx.x][slab];
2623  } // store data
2624  } // tile X
2625 
2626  //--------------------------- remainders in X --------------------------//
2627  // load the remainder tile in X (columns that cannot fill a full 32-thread warp)
2628  if ((tileCount.x * 32 + threadIdx.x) < dimSizes.x)
2629  {
2630  for (auto slab = 0; slab < 32; slab++)
2631  {
2632  shared_tile[threadIdx.y][slab][threadIdx.x]
2633  = inputMatrix[(tileIdx.y * 32 + slab) * (dimSizes.x * dimSizes.y) +
2634  row * dimSizes.x +
2635  tileCount.x * 32 + threadIdx.x];
2636  }
2637  }// load
2638 
2639  // store the remainder tile in X (slabs that cannot fill a full warp)
2640  for (auto slab = 0; (tileCount.x * 32 + slab) < dimSizes.x; slab++)
2641  {
2642  outputMatrix[(tileCount.x * 32 + slab) * (dimSizes.y * dimSizes.z) +
2643  row * dimSizes.z +
2644  tileIdx.y * 32 + threadIdx.x]
2645  = shared_tile[threadIdx.y][threadIdx.x][slab];
2646  }// store
2647  }// tile Y
2648 
2649  //--------------------------- remainders in Z ----------------------------//
2650  // go over the remainder tiles in Z (slabs that cannot fill a full warp)
2651  // go over all full tiles in the row, processed in parallel
2652  for (tileIdx.x = threadIdx.y; tileIdx.x < tileCount.x; tileIdx.x += blockDim.y)
2653  {
2654  // load the remainder tile in Z (only a few slabs)
2655  for (auto slab = 0; (tileCount.y * 32 + slab) < dimSizes.z; slab++)
2656  {
2657  shared_tile[threadIdx.y][slab][threadIdx.x]
2658  = inputMatrix[(tileCount.y * 32 + slab) * (dimSizes.x * dimSizes.y) +
2659  row * dimSizes.x +
2660  tileIdx.x * 32 + threadIdx.x];
2661 
2662  } // load
2663 
2664  // go over the remainder tile in Z (and store only columns)
2665  if ((tileCount.y * 32 + threadIdx.x) < dimSizes.z)
2666  {
2667  for (auto slab = 0; slab < 32; slab++)
2668  {
2669  outputMatrix[(tileIdx.x * 32 + slab) * (dimSizes.y * dimSizes.z) +
2670  row * dimSizes.z +
2671  tileCount.y * 32 + threadIdx.x]
2672  = shared_tile[threadIdx.y][threadIdx.x][slab];
2673  }
2674  }// store
2675  }// remainder Z
2676 
2677  //------------------------ remainder in X and Z ----------------------------//
2678  if (threadIdx.y == 0)
2679  {
2680  // load the remainder tile in X and Z (only a few slabs and columns)
2681  if ((tileCount.x * 32 + threadIdx.x) < dimSizes.x)
2682  {
2683  for (auto slab = 0; (tileCount.y * 32 + slab) < dimSizes.z; slab++)
2684  {
2685  shared_tile[threadIdx.y][slab][threadIdx.x]
2686  = inputMatrix[(tileCount.y * 32 + slab) * (dimSizes.x * dimSizes.y) +
2687  row * dimSizes.x +
2688  tileCount.x * 32 + threadIdx.x];
2689  } // load
2690  }
2691 
2692  // go over the remainder tile in Z (and store only columns)
2693  if ((tileCount.y * 32 + threadIdx.x) < dimSizes.z)
2694  {
2695  for (auto slab = 0; (tileCount.x * 32 + slab) < dimSizes.x ; slab++)
2696  {
2697  outputMatrix[(tileCount.x * 32 + slab) * (dimSizes.y * dimSizes.z) +
2698  row * dimSizes.z +
2699  tileCount.y * 32 + threadIdx.x]
2700  = shared_tile[threadIdx.y][threadIdx.x][slab];
2701  }
2702  }// store
2703  }// remainder in X and Z
2704  } //slab
2705 }// end of CUDATrasnposeReal3DMatrixXZRect
2706 //--------------------------------------------------------------------------------------------------
2707 
2708 
2709 
2710 /**
2711  * Transpose a real 3D matrix in the X-Z direction. It is done out-of-place.
2712  * @param [out] outputMatrix - Output matrix
2713  * @param [in] inputMatrix - Input matrix
2714  * @param [in] dimSizes - Dimension sizes of the original matrix
2715  */
2716 void SolverCUDAKernels::TrasposeReal3DMatrixXZ(float* outputMatrix,
2717  const float* inputMatrix,
2718  const dim3& dimSizes)
2719 {
2720  if ((dimSizes.x % 32 == 0) && (dimSizes.z % 32 == 0))
2721  {
2722  // when the dims are multiples of 32, then use a faster implementation
2723  CUDATrasnposeReal3DMatrixXZSquare<<<GetSolverTransposeGirdSize(), GetSolverTransposeBlockSize() >>>
2724  (outputMatrix, inputMatrix, dimSizes);
2725  }
2726  else
2727  {
2728  CUDATrasnposeReal3DMatrixXZRect<<<GetSolverTransposeGirdSize(), GetSolverTransposeBlockSize() >>>
2729  (outputMatrix, inputMatrix, dimSizes);
2730  }
2731 
2732  // check for errors
2733  checkCudaErrors(cudaGetLastError());
2734 }// end of TrasposeReal3DMatrixXZ
2735 //--------------------------------------------------------------------------------------------------
2736 
2737 
2738 /**
2739  * CUDA kernel to compute velocity shift in the X direction.
2740  *
2741  * @param [in, out] cufft_shift_temp - Matrix to calculate 1D FFT to
2742  * @param [in] x_shift_neg_r
2743  */
2744 __global__ void CUDAComputeVelocityShiftInX(cuFloatComplex* cufft_shift_temp,
2745  const cuFloatComplex* x_shift_neg_r)
2746 {
2747  for (auto i = GetIndex(); i < cudaDeviceConstants.nElementsComplex; i += GetStride())
2748  {
2749  const auto x = i % cudaDeviceConstants.nxComplex;
2750 
2751  cufft_shift_temp[i] = cuCmulf(cufft_shift_temp[i], x_shift_neg_r[x]) * cudaDeviceConstants.fftDividerX;
2752  }
2753 }// end of CUDAComputeVelocityShiftInX
2754 //--------------------------------------------------------------------------------------------------
2755 
2756 
2757 /**
2758  * Compute the velocity shift in Fourier space over the X axis.
2760  * This kernel works with the original (non-transposed) data layout.
2760  *
2761  * @param [in,out] cufft_shift_temp - Matrix to calculate 1D FFT to
2762  * @param [in] x_shift_neg_r
2763  */
2764 void SolverCUDAKernels::ComputeVelocityShiftInX(TCUFFTComplexMatrix& cufft_shift_temp,
2765  const TComplexMatrix& x_shift_neg_r)
2766 {
2767  CUDAComputeVelocityShiftInX<<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
2768  (reinterpret_cast<cuFloatComplex*> (cufft_shift_temp.GetDeviceData()),
2769  reinterpret_cast<const cuFloatComplex*> (x_shift_neg_r.GetDeviceData()));
2770  // check for errors
2771  checkCudaErrors(cudaGetLastError());
2772 }// end of ComputeVelocityShiftInX
2773 //--------------------------------------------------------------------------------------------------
2774 
2775 
2776 
2777 /**
2778  * CUDA kernel to compute velocity shift in Y. The matrix is XY transposed.
2779  * @param [in, out] cufft_shift_temp - Matrix to calculate 1D FFT to
2780  * @param [in] y_shift_neg_r
2781  */
2782 __global__ void CUDAComputeVelocityShiftInY(cuFloatComplex* cufft_shift_temp,
2783  const cuFloatComplex* y_shift_neg_r)
2784 {
2785  const auto ny_2 = cudaDeviceConstants.ny / 2 + 1;
2786  const auto nElements = cudaDeviceConstants.nx * ny_2 * cudaDeviceConstants.nz;
2787 
2788  for (auto i = GetIndex(); i < nElements; i += GetStride())
2789  {
2790  // rotated dimensions
2791  const auto y = i % ny_2;
2792 
2793  cufft_shift_temp[i] = cuCmulf(cufft_shift_temp[i], y_shift_neg_r[y]) * cudaDeviceConstants.fftDividerY;
2794  }
2795 }// end of CUDAComputeVelocityShiftInY
2796 //--------------------------------------------------------------------------------------------------
2797 
2798 /**
2799  * Compute the velocity shift in Fourier space over the Y axis.
2800  * This kernel works with the XY-transposed data layout.
2801  *
2802  * @param [in,out] cufft_shift_temp - Matrix to calculate 1D FFT to
2803  * @param [in] y_shift_neg_r
2804  */
2805 void SolverCUDAKernels::ComputeVelocityShiftInY(TCUFFTComplexMatrix& cufft_shift_temp,
2806  const TComplexMatrix& y_shift_neg_r)
2807 {
2808  CUDAComputeVelocityShiftInY<<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
2809  (reinterpret_cast<cuFloatComplex*> (cufft_shift_temp.GetDeviceData()),
2810  reinterpret_cast<const cuFloatComplex*> (y_shift_neg_r.GetDeviceData()));
2811  // check for errors
2812  checkCudaErrors(cudaGetLastError());
2813 }// end of ComputeVelocityShiftInY
2814 //--------------------------------------------------------------------------------------------------
2815 
2816 
2817 /**
2818  * CUDA kernel to compute velocity shift in Z. The matrix is XZ transposed.
2819  *
2820  * @param [in, out] cufft_shift_temp - Matrix to calculate 1D FFT to
2821  * @param [in] z_shift_neg_r
2822  */
2823 __global__ void CUDAComputeVelocityShiftInZ(cuFloatComplex* cufft_shift_temp,
2824  const cuFloatComplex* z_shift_neg_r)
2825 {
2826  const auto nz_2 = cudaDeviceConstants.nz / 2 + 1;
2827  const auto nElements = cudaDeviceConstants.nx * cudaDeviceConstants.ny * nz_2;
2828 
2829  for (auto i = GetIndex(); i < nElements; i += GetStride())
2830  {
2831  // rotated dimensions
2832  const auto z = i % nz_2;
2833 
2834  cufft_shift_temp[i] = cuCmulf(cufft_shift_temp[i], z_shift_neg_r[z]) * cudaDeviceConstants.fftDividerZ;
2835  }
2836 }// end of CUDAComputeVelocityShiftInZ
2837 //---------------------------------------------------------------------------------------------------
2838 
2839 /**
2840  * Compute the velocity shift in Fourier space over the Z axis.
2841  * This kernel works with the XZ-transposed data layout.
2842  *
2843  * @param [in,out] cufft_shift_temp - Matrix to calculate 1D FFT to
2844  * @param [in] z_shift_neg_r
2845  */
2846 void SolverCUDAKernels::ComputeVelocityShiftInZ(TCUFFTComplexMatrix& cufft_shift_temp,
2847  const TComplexMatrix& z_shift_neg_r)
2848 {
2849  CUDAComputeVelocityShiftInZ<<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
2850  (reinterpret_cast<cuFloatComplex*> (cufft_shift_temp.GetDeviceData()),
2851  reinterpret_cast<const cuFloatComplex*> (z_shift_neg_r.GetDeviceData()));
2852  // check for errors
2853  checkCudaErrors(cudaGetLastError());
2854 }// end of ComputeVelocityShiftInZ
2855 //--------------------------------------------------------------------------------------------------
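Taken together, the three shift kernels perform the same element-wise spectral multiplication; for the X axis, with n_x^c the complex X dimension size and fftDividerX the 1D FFT normalisation, each complex element is updated as:

\widehat{u}_i \leftarrow \widehat{u}_i \cdot \mathrm{x\_shift\_neg\_r}\left[ i \bmod n_x^c \right] \cdot \mathrm{fftDividerX}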
__global__ void CUDACompute_Absorb_nabla1_2(cuFloatComplex *fft1, cuFloatComplex *fft2, const float *nabla1, const float *nabla2)
__global__ void CUDAComputePressurePartsLinear(float *sum_rhoxyz, float *sum_rho0_du, const float *rhox, const float *rhoy, const float *rhoz, const float *dux, const float *duy, const float *duz, const float *rho0_matrix)
__global__ void CUDAComputeVelocity(float *ux_sgx, float *uy_sgy, float *uz_sgz, const float *ifft_x, const float *ifft_y, const float *ifft_z, const float *dt_rho0_sgx, const float *dt_rho0_sgy, const float *dt_rho0_sgz, const float *pml_x, const float *pml_y, const float *pml_z)
unsigned int u_source_index_size
size of the u source index
__global__ void CUDAComputeVelocityScalarNonuniform(float *ux_sgx, float *uy_sgy, float *uz_sgz, const float *ifft_x, const float *ifft_y, const float *ifft_z, const float *dxudxn_sgx, const float *dyudyn_sgy, const float *dzudzn_sgz, const float *pml_x, const float *pml_y, const float *pml_z)
void AddVelocitySource(TRealMatrix &uxyz_sgxyz, const TRealMatrix &u_source_input, const TIndexMatrix &u_source_index, const size_t t_index)
Add in velocity source terms.
__global__ void CUDAComputeVelocityGradient(cuFloatComplex *fft_x, cuFloatComplex *fft_y, cuFloatComplex *fft_z, const float *kappa, const cuFloatComplex *ddx_neg, const cuFloatComplex *ddy_neg, const cuFloatComplex *ddz_neg)
dim3 GetSolverTransposeGirdSize()
__global__ void CUDATrasnposeReal3DMatrixXZRect(float *outputMatrix, const float *inputMatrix, const dim3 dimSizes)
unsigned int nx
size of X dimension.
float fftDivider
normalization constant for 3D FFT.
int GetCUDACodeVersion()
Get the CUDA architecture and GPU code version the code was compiled with.
__global__ void CUDATrasnposeReal3DMatrixXZSquare(float *outputMatrix, const float *inputMatrix, const dim3 dimSizes)
float BonA_scalar
BonA value for homogeneous case.
__global__ void CUDATrasnposeReal3DMatrixXYRect(float *outputMatrix, const float *inputMatrix, const dim3 dimSizes)
The header file for the class for storing constants residing in CUDA constant memory.
virtual size_t * GetDeviceData()
Get raw GPU data out of the class (for direct GPU kernel access).
Structure for CUDA parameters to be placed in constant memory. Only 32b values are used...
unsigned int nxComplex
size of complex X dimension.
__global__ void CUDAComputeVelocityGradientNonuniform(float *duxdx, float *duydy, float *duzdz, const float *duxdxn, const float *duydyn, const float *duzdzn)
__global__ void CUDAAddTransducerSource(float *ux_sgx, const size_t *u_source_index, size_t *delay_mask, const float *transducer_signal)
__global__ void CUDACompute_p0_VelocityScalarNonUniform(float *ux_sgx, float *uy_sgy, float *uz_sgz, const float *dxudxn_sgx, const float *dyudyn_sgy, const float *dzudzn_sgz)
__global__ void CUDAComputeDensityLinearHeterogeneous(float *rhox, float *rhoy, float *rhoz, const float *pml_x, const float *pml_y, const float *pml_z, const float *duxdx, const float *duydy, const float *duzdz, const float *rho0)
__global__ void CUDAGetCUDACodeVersion(int *cudaCodeVersion)
void SumPressureLinearLossless(TRealMatrix &p, const TRealMatrix &rhox, const TRealMatrix &rhoy, const TRealMatrix &rhoz, const bool is_c2_scalar, const float *c2_matrix)
Sum sub-terms for new p, linear lossless case.
__global__ void CUDATrasnposeReal3DMatrixXYSquare(float *outputMatrix, const float *inputMatrix, const dim3 dimSizes)
void ComputeVelocityGradientNonuniform(TRealMatrix &duxdx, TRealMatrix &duydy, TRealMatrix &duzdz, const TRealMatrix &dxudxn, const TRealMatrix &dyudyn, const TRealMatrix &dzudzn)
Shift gradient of acoustic velocity on non-uniform grid.
static TParameters & GetInstance()
Get instance of the singleton class.
Definition: Parameters.cpp:70
__global__ void CUDASumPressureTermsNonlinear(float *p, const float *BonA_temp, const float *c2_matrix, const float *absorb_tau, const float *tau_matrix, const float *absorb_eta, const float *eta_matrix)
__device__ unsigned int GetIndex()
Get global 1D coordinate for 1D CUDA block.
Definition: CUDAUtils.cuh:57
__global__ void CUDAComputePressurelGradient(cuFloatComplex *fft_x, cuFloatComplex *fft_y, cuFloatComplex *fft_z, const float *kappa, const cuFloatComplex *ddx, const cuFloatComplex *ddy, const cuFloatComplex *ddz)
float rho0_sgx_scalar
dt / rho0_sgx in homogeneous case
__global__ void CUDAComputeDensityNonlinearHomogeneous(float *rhox, float *rhoy, float *rhoz, const float *pml_x, const float *pml_y, const float *pml_z, const float *duxdx, const float *duydy, const float *duzdz)
void ComputePressurePartsNonLinear(TRealMatrix &rho_sum, TRealMatrix &BonA_sum, TRealMatrix &du_sum, const TRealMatrix &rhox, const TRealMatrix &rhoy, const TRealMatrix &rhoz, const TRealMatrix &duxdx, const TRealMatrix &duydy, const TRealMatrix &duzdz, const bool is_BonA_scalar, const float *BonA_matrix, const bool is_rho0_scalar, const float *rho0_matrix)
Calculate three temporary sums in the new pressure formula, non-linear absorbing case.
void AddPressureSource(TRealMatrix &rhox, TRealMatrix &rhoy, TRealMatrix &rhoz, const TRealMatrix &p_source_input, const TIndexMatrix &p_source_index, const size_t t_index)
Add in pressure source term.
dim3 GetSolverTransposeBlockSize() const
Get block size for the transposition kernels.
void ComputeVelocityScalarNonuniform(TRealMatrix &ux_sgx, TRealMatrix &uy_sgy, TRealMatrix &uz_sgz, const TRealMatrix &ifft_x, const TRealMatrix &ifft_y, const TRealMatrix &ifft_z, const TRealMatrix &dxudxn_sgx, const TRealMatrix &dyudyn_sgy, const TRealMatrix &dzudzn_sgz, const TRealMatrix &pml_x, const TRealMatrix &pml_y, const TRealMatrix &pml_z)
Compute acoustic velocity, scalar, non-uniform case.
__global__ void CUDAComputeVelocityShiftInY(cuFloatComplex *cufft_shift_temp, const cuFloatComplex *y_shift_neg_r)
void AddTransducerSource(TRealMatrix &ux_sgx, const TIndexMatrix &u_source_index, TIndexMatrix &delay_mask, const TRealMatrix &transducer_signal)
Add transducer data source to X component.
void SumPressureTermsNonlinear(TRealMatrix &p, const TRealMatrix &BonA_temp, const bool is_c2_scalar, const float *c2_matrix, const bool is_tau_eta_scalar, const float *absorb_tau, const float *tau_matrix, const float *absorb_eta, const float *eta_matrix)
Sum sub-terms to calculate new pressure, non-linear case.
#define checkCudaErrors(val)
Macro checking cuda errors and printing the file name and line. Inspired by CUDA common checking rout...
Definition: Logger.h:209
unsigned int nz
size of Z dimension.
__global__ void CUDASumPressureNonlinearLossless(float *p, const float *rhox, const float *rhoy, const float *rhoz, const float *c2_matrix, const float *BonA_matrix, const float *rho0_matrix)
float fftDividerX
normalization constant for 1D FFT over X.
__device__ dim3 GetReal3DCoords(const unsigned int i)
Get 3D coordinates for a real matrix form a 1D index.
Definition: CUDAUtils.cuh:82
void Compute_p0_AddInitialPressure(TRealMatrix &p, TRealMatrix &rhox, TRealMatrix &rhoy, TRealMatrix &rhoz, const TRealMatrix &p0, const bool Is_c2_scalar, const float *c2)
Add initial pressure to p0 (as p0 source).
float rho0_scalar
rho0 in homogeneous case
void ComputePressurelGradient(TCUFFTComplexMatrix &fft_x, TCUFFTComplexMatrix &fft_y, TCUFFTComplexMatrix &fft_z, const TRealMatrix &kappa, const TComplexMatrix &ddx, const TComplexMatrix &ddy, const TComplexMatrix &ddz)
Compute part of the new velocity - gradient of p.
The header file containing a class responsible for printing out info and error messages (stdout...
unsigned int p_source_many
p source many
void ComputeDensityNonlinearHeterogeneous(TRealMatrix &rhox, TRealMatrix &rhoy, TRealMatrix &rhoz, const TRealMatrix &pml_x, const TRealMatrix &pml_y, const TRealMatrix &pml_z, const TRealMatrix &duxdx, const TRealMatrix &duydy, const TRealMatrix &duzdz, const TRealMatrix &rho0)
Calculate acoustic density for non-linear case, heterogenous case.
__global__ void CUDAComputeDensityLinearHomogeneous(float *rhox, float *rhoy, float *rhoz, const float *pml_x, const float *pml_y, const float *pml_z, const float *duxdx, const float *duydy, const float *duzdz)
void ComputeVelocityGradient(TCUFFTComplexMatrix &fft_x, TCUFFTComplexMatrix &fft_y, TCUFFTComplexMatrix &fft_z, const TRealMatrix &kappa, const TComplexMatrix &ddx_k_shift_neg, const TComplexMatrix &ddy_k_shift_neg, const TComplexMatrix &ddz_k_shift_neg)
Compute gradient of acoustic velocity on uniform grid.
unsigned int u_source_mode
u source mode
const TCUDAParameters & GetCudaParameters() const
Get class with CUDA Parameters (runtime setup), cosnt version.
Definition: Parameters.h:76
void ComputeDensityLinearHomogeneous(TRealMatrix &rhox, TRealMatrix &rhoy, TRealMatrix &rhoz, const TRealMatrix &pml_x, const TRealMatrix &pml_y, const TRealMatrix &pml_z, const TRealMatrix &duxdx, const TRealMatrix &duydy, const TRealMatrix &duzdz)
Calculate acoustic density for linear case, homogenous case.
unsigned int nElementsComplex
complex number of elements.
dim3 GetSolverTransposeBlockSize()
The header file with CUDA utility functions. These routines are to be inlined.
unsigned int p_source_index_size
size of the p_source mask
__global__ void CUDACompute_p0_AddInitialPressure(float *p, float *rhox, float *rhoy, float *rhoz, const float *p0, const float *c2=nullptr)
__global__ void CUDAAddVelocitySource(float *uxyz_sgxyz, const float *u_source_input, const size_t *u_source_index, const size_t t_index)
void ComputeVelocity(TRealMatrix &ux_sgx, TRealMatrix &uy_sgy, TRealMatrix &uz_sgz, const TRealMatrix &ifft_x, const TRealMatrix &ifft_y, const TRealMatrix &ifft_z, const TRealMatrix &dt_rho0_sgx, const TRealMatrix &dt_rho0_sgy, const TRealMatrix &dt_rho0_sgz, const TRealMatrix &pml_x, const TRealMatrix &pml_y, const TRealMatrix &pml_z)
Compute acoustic velocity for default case (heterogeneous).
__device__ unsigned int GetStride()
Get x-stride for 3D CUDA block (for processing multiple grid points by a single thread).
Definition: CUDAUtils.cuh:69
int GetSolverGridSize1D() const
Get number of block for 1D grid used by kSpaceSolver.
dim3 GetSolverTransposeGirdSize() const
Get grid size for the transposition kernels.
void ComputeDensityNonlinearHomogeneous(TRealMatrix &rhox, TRealMatrix &rhoy, TRealMatrix &rhoz, const TRealMatrix &pml_x, const TRealMatrix &pml_y, const TRealMatrix &pml_z, const TRealMatrix &duxdx, const TRealMatrix &duydy, const TRealMatrix &duzdz)
Calculate acoustic density for non-linear case, homogenous case.
virtual size_t GetElementCount() const
Get total element count of the matrix.
The class for real matrices.
Definition: RealMatrix.h:45
void ComputePressurePartsLinear(TRealMatrix &sum_rhoxyz, TRealMatrix &sum_rho0_du, const TRealMatrix &rhox, const TRealMatrix &rhoy, const TRealMatrix &rhoz, const TRealMatrix &duxdx, const TRealMatrix &duydy, const TRealMatrix &duzdz, const bool is_rho0_scalar, const float *rho0_matrix)
Calculate two temporary sums in the new pressure formula, linear absorbing case.
__global__ void CUDASum_new_p_linear_lossless(float *p, const float *rhox, const float *rhoy, const float *rhoz, const float *c2_matrix)
void ComputeVelocityShiftInX(TCUFFTComplexMatrix &cufft_shift_temp, const TComplexMatrix &x_shift_neg_r)
Compute the velocity shift in Fourier space over the X axis.
void SumPressureTermsLinear(TRealMatrix &p, const TRealMatrix &absorb_tau_temp, const TRealMatrix &absorb_eta_temp, const TRealMatrix &sum_rhoxyz, const bool is_c2_scalar, const float *c2_matrix, const bool is_tau_eta_scalar, const float *tau_matrix, const float *eta_matrix)
Sum sub-terms to calculate new pressure, linear case.
__global__ void CUDAComputePressurePartsNonLinear(float *rho_sum, float *BonA_sum, float *du_sum, const float *rhox, const float *rhoy, const float *rhoz, const float *duxdx, const float *duydy, const float *duzdz, const float *BonA_matrix, const float *rho0_matrix)
float fftDividerY
normalization constant for 1D FFT over Y.
The class for 64b unsigned integers (indices). It is used for sensor_mask_index or sensor_corners_mas...
Definition: IndexMatrix.h:46
__global__ void CUDAComputeVelocityShiftInX(cuFloatComplex *cufft_shift_temp, const cuFloatComplex *x_shift_neg_r)
__global__ void CUDAAddPressureSource(float *rhox, float *rhoy, float *rhoz, const float *p_source_input, const size_t *p_source_index, const size_t t_index)
float rho0_sgy_scalar
dt / rho0_sgy in homogeneous case
void ComputeDensityLinearHeterogeneous(TRealMatrix &rhox, TRealMatrix &rhoy, TRealMatrix &rhoz, const TRealMatrix &pml_x, const TRealMatrix &pml_y, const TRealMatrix &pml_z, const TRealMatrix &duxdx, const TRealMatrix &duydy, const TRealMatrix &duzdz, const TRealMatrix &rho0)
Calculate acoustic density for linear case, heterogeneous case.
unsigned int p_source_mode
p source mode
unsigned int u_source_many
u source many
__constant__ TCUDADeviceConstants cudaDeviceConstants
This variable holds basic simulation constants for GPU.
void ComputeAbsorbtionTerm(TCUFFTComplexMatrix &fft1, TCUFFTComplexMatrix &fft2, const TRealMatrix &absorb_nabla1, const TRealMatrix &absorb_nabla2)
Compute absorbing term with absorb_nabla1 and absorb_nabla2.
void Compute_p0_VelocityScalarNonUniform(TRealMatrix &ux_sgx, TRealMatrix &uy_sgy, TRealMatrix &uz_sgz, const TRealMatrix &dxudxn_sgx, const TRealMatrix &dyudyn_sgy, const TRealMatrix &dzudzn_sgz)
Compute acoustic velocity for the initial pressure problem when rho0_sgx is scalar, on a non-uniform grid.
void Compute_p0_Velocity(TRealMatrix &ux_sgx, TRealMatrix &uy_sgy, TRealMatrix &uz_sgz, const TRealMatrix &dt_rho0_sgx, const TRealMatrix &dt_rho0_sgy, const TRealMatrix &dt_rho0_sgz)
Compute velocity for the initial pressure problem.
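For an initial pressure source the particle velocity is seeded as u = dt/(2 rho0) times the spatial pressure gradient; the factor of one half splits p0 between the forward- and backward-travelling waves. A hedged per-element sketch of the x-component, assuming the inverse FFT of the gradient is already stored in ux_sgx and FFT normalization is handled elsewhere:

// Illustrative sketch only: seed the x velocity from an initial pressure p0.
__global__ void cudaSketchP0VelocityX(float*             ux_sgx,
                                      const float*       dt_rho0_sgx,
                                      const unsigned int nElements)
{
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
       i < nElements;
       i += blockDim.x * gridDim.x)
  {
    // ux_sgx holds the inverse FFT of the pressure gradient on entry.
    ux_sgx[i] *= 0.5f * dt_rho0_sgx[i];
  }
}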
Namespace for all CUDA kernels used in the 3D solver.
void TrasposeReal3DMatrixXY(float *outputMatrix, const float *inputMatrix, const dim3 &dimSizes)
Transpose a real 3D matrix in the X-Y direction.
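XY transposition is bandwidth-bound, so such kernels are typically staged through a padded shared-memory tile so that both the load and the store stay coalesced. A generic, self-contained sketch of the technique for a single Z slab (the tile size and padding are conventional choices, not the toolbox's tuning):

#include <cuda_runtime.h>

#define TILE_DIM 32

// Transpose one ny x nx slab: read coalesced along x, write coalesced along y.
// The +1 padding removes shared-memory bank conflicts on the transposed read.
// Launch with dim3 block(TILE_DIM, TILE_DIM) and a grid covering the slab.
__global__ void transposeXYSlab(float* out, const float* in,
                                const unsigned int nx, const unsigned int ny)
{
  __shared__ float tile[TILE_DIM][TILE_DIM + 1];

  unsigned int x = blockIdx.x * TILE_DIM + threadIdx.x;
  unsigned int y = blockIdx.y * TILE_DIM + threadIdx.y;
  if (x < nx && y < ny) tile[threadIdx.y][threadIdx.x] = in[y * nx + x];

  __syncthreads();

  x = blockIdx.y * TILE_DIM + threadIdx.x;   // transposed block origin
  y = blockIdx.x * TILE_DIM + threadIdx.y;
  if (x < ny && y < nx) out[y * ny + x] = tile[threadIdx.x][threadIdx.y];
}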
int GetSolverBlockSize1D() const
Get the number of threads per 1D block used by kSpaceSolver.
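Together with GetSolverGridSize1D(), this getter fixes the launch configuration for every element-wise kernel, so the call sites collapse to a single pattern. A hedged usage sketch (the kernel name is a placeholder, not an actual kernel on this page):

// Typical 1D element-wise launch; CUDASomeKernel is a stand-in name.
CUDASomeKernel<<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
              (ux_sgx.GetDeviceData(), ifft_x.GetDeviceData());
cudaError_t lastError = cudaGetLastError();  // verify the launch itself succeeded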
unsigned int nElements
total number of elements.
void ComputeVelocityScalarUniform(TRealMatrix &ux_sgx, TRealMatrix &uy_sgy, TRealMatrix &uz_sgz, const TRealMatrix &ifft_x, const TRealMatrix &ifft_y, const TRealMatrix &ifft_z, const TRealMatrix &pml_x, const TRealMatrix &pml_y, const TRealMatrix &pml_z)
Compute acoustic velocity, scalar and uniform case.
__global__ void CUDAComputeVelocityShiftInZ(cuFloatComplex *cufft_shift_temp, const cuFloatComplex *z_shift_neg_r)
unsigned int ny
size of Y dimension.
__device__ dim3 GetComplex3DCoords(const unsigned int i)
Get the 3D coordinates of a complex matrix element from a 1D index.
Definition: CUDAUtils.cuh:97
int GetSolverGridSize1D()
virtual float * GetDeviceData()
Get raw GPU data out of the class (for direct GPU kernel access).
void ComputeVelocityShiftInZ(TCUFFTComplexMatrix &cufft_shift_temp, const TComplexMatrix &z_shift_neg_r)
Compute the velocity shift in Fourier space over the Z axis.
float rho0_sgz_scalar
dt / rho0_sgz in homogeneous case
float fftDividerZ
normalization constant for 1D FFT over Z.
__global__ void CUDAComputeDensityNonlinearHeterogeneous(float *rhox, float *rhoy, float *rhoz, const float *pml_x, const float *pml_y, const float *pml_z, const float *duxdx, const float *duydy, const float *duzdz, const float *rho0)
The class for complex matrices.
Definition: ComplexMatrix.h:54
float dt_rho0_scalar
dt * rho0 in homogeneous case
float absorb_tau_scalar
Absorb_tau value for homogeneous case.
float absorb_eta_scalar
Absorb_eta value for homogeneous case.
void TrasposeReal3DMatrixXZ(float *outputMatrix, const float *inputMatrix, const dim3 &dimSizes)
Transpose a real 3D matrix in the X-Z direction.
Class implementing 3D Real-To-Complex and Complex-To-Real transforms using the cuFFT interface.
void SumPressureNonlinearLossless(TRealMatrix &p, const TRealMatrix &rhox, const TRealMatrix &rhoy, const TRealMatrix &rhoz, const bool is_c2_scalar, const float *c2_matrix, const bool is_BonA_scalar, const float *BonA_matrix, const bool is_rho0_scalar, const float *rho0_matrix)
Sum sub-terms for the new pressure, non-linear lossless case.
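In the non-linear lossless case the equation of state gains the B/A term: p = c^2 (rho_sum + (B/A) rho_sum^2 / (2 rho0)) with rho_sum = rhox + rhoy + rhoz. A hedged sketch of the fully heterogeneous branch (the scalar branches would read c2, B/A, and rho0 from constant memory instead; names are illustrative):

// Illustrative sketch only: new pressure in the non-linear lossless case.
__global__ void cudaSketchSumPressureNonlinearLossless(float*             p,
                                                       const float*       rhox,
                                                       const float*       rhoy,
                                                       const float*       rhoz,
                                                       const float*       c2_matrix,
                                                       const float*       BonA_matrix,
                                                       const float*       rho0_matrix,
                                                       const unsigned int nElements)
{
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
       i < nElements;
       i += blockDim.x * gridDim.x)
  {
    const float rhoSum = rhox[i] + rhoy[i] + rhoz[i];
    p[i] = c2_matrix[i] * (rhoSum + (BonA_matrix[i] * rhoSum * rhoSum)
                                    / (2.0f * rho0_matrix[i]));
  }
}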
__global__ void CUDACompute_p0_Velocity(float *ux_sgx, float *uy_sgy, float *uz_sgz, const float *dt_rho0_sgx=nullptr, const float *dt_rho0_sgy=nullptr, const float *dt_rho0_sgz=nullptr)
int GetSolverBlockSize1D()
__global__ void CUDASumPressureTermsLinear(float *p, const float *absorb_tau_temp, const float *absorb_eta_temp, const float *sum_rhoxyz, const float *c2_matrix, const float *tau_matrix, const float *eta_matrix)
void ComputeVelocityShiftInY(TCUFFTComplexMatrix &cufft_shift_temp, const TComplexMatrix &y_shift_neg_r)
Compute the velocity shift in Fourier space over the Y axis.
__global__ void CUDAComputeVelocityScalarUniform(float *ux_sgx, float *uy_sgy, float *uz_sgz, const float *ifft_x, const float *ifft_y, const float *ifft_z, const float *pml_x, const float *pml_y, const float *pml_z)