kspaceFirstOrder3D-CUDA  1.1
The CUDA/C++ implementation of the k-wave toolbox for the time-domain simulation of acoustic wave fields in 3D
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
SolverCUDAKernels.cu File Reference

The implementation file containing the all CUDA kernels for the GPU implementation. More...

#include <cuComplex.h>
#include <KSpaceSolver/SolverCUDAKernels.cuh>
#include <Parameters/CUDADeviceConstants.cuh>
#include <Logger/Logger.h>
#include <Utils/CUDAUtils.cuh>
Include dependency graph for SolverCUDAKernels.cu:

Go to the source code of this file.

Functions

int GetSolverBlockSize1D ()
 
int GetSolverGridSize1D ()
 
dim3 GetSolverTransposeBlockSize ()
 
dim3 GetSolverTransposeGirdSize ()
 
__global__ void CUDAGetCUDACodeVersion (int *cudaCodeVersion)
 
__global__ void CUDAComputeVelocity (float *ux_sgx, float *uy_sgy, float *uz_sgz, const float *ifft_x, const float *ifft_y, const float *ifft_z, const float *dt_rho0_sgx, const float *dt_rho0_sgy, const float *dt_rho0_sgz, const float *pml_x, const float *pml_y, const float *pml_z)
 
__global__ void CUDAComputeVelocityScalarUniform (float *ux_sgx, float *uy_sgy, float *uz_sgz, const float *ifft_x, const float *ifft_y, const float *ifft_z, const float *pml_x, const float *pml_y, const float *pml_z)
 
__global__ void CUDAComputeVelocityScalarNonuniform (float *ux_sgx, float *uy_sgy, float *uz_sgz, const float *ifft_x, const float *ifft_y, const float *ifft_z, const float *dxudxn_sgx, const float *dyudyn_sgy, const float *dzudzn_sgz, const float *pml_x, const float *pml_y, const float *pml_z)
 
__global__ void CUDAAddTransducerSource (float *ux_sgx, const size_t *u_source_index, size_t *delay_mask, const float *transducer_signal)
 
__global__ void CUDAAddVelocitySource (float *uxyz_sgxyz, const float *u_source_input, const size_t *u_source_index, const size_t t_index)
 
__global__ void CUDAAddPressureSource (float *rhox, float *rhoy, float *rhoz, const float *p_source_input, const size_t *p_source_index, const size_t t_index)
 
template<bool Is_rho0_scalar>
__global__ void CUDACompute_p0_Velocity (float *ux_sgx, float *uy_sgy, float *uz_sgz, const float *dt_rho0_sgx=nullptr, const float *dt_rho0_sgy=nullptr, const float *dt_rho0_sgz=nullptr)
 
__global__ void CUDACompute_p0_VelocityScalarNonUniform (float *ux_sgx, float *uy_sgy, float *uz_sgz, const float *dxudxn_sgx, const float *dyudyn_sgy, const float *dzudzn_sgz)
 
__global__ void CUDAComputePressurelGradient (cuFloatComplex *fft_x, cuFloatComplex *fft_y, cuFloatComplex *fft_z, const float *kappa, const cuFloatComplex *ddx, const cuFloatComplex *ddy, const cuFloatComplex *ddz)
 
__global__ void CUDAComputeVelocityGradient (cuFloatComplex *fft_x, cuFloatComplex *fft_y, cuFloatComplex *fft_z, const float *kappa, const cuFloatComplex *ddx_neg, const cuFloatComplex *ddy_neg, const cuFloatComplex *ddz_neg)
 
__global__ void CUDAComputeVelocityGradientNonuniform (float *duxdx, float *duydy, float *duzdz, const float *duxdxn, const float *duydyn, const float *duzdzn)
 
template<bool Is_c0_scalar>
__global__ void CUDACompute_p0_AddInitialPressure (float *p, float *rhox, float *rhoy, float *rhoz, const float *p0, const float *c2=nullptr)
 
__global__ void CUDAComputeDensityNonlinearHomogeneous (float *rhox, float *rhoy, float *rhoz, const float *pml_x, const float *pml_y, const float *pml_z, const float *duxdx, const float *duydy, const float *duzdz)
 
__global__ void CUDAComputeDensityNonlinearHeterogeneous (float *rhox, float *rhoy, float *rhoz, const float *pml_x, const float *pml_y, const float *pml_z, const float *duxdx, const float *duydy, const float *duzdz, const float *rho0)
 
__global__ void CUDAComputeDensityLinearHomogeneous (float *rhox, float *rhoy, float *rhoz, const float *pml_x, const float *pml_y, const float *pml_z, const float *duxdx, const float *duydy, const float *duzdz)
 
__global__ void CUDAComputeDensityLinearHeterogeneous (float *rhox, float *rhoy, float *rhoz, const float *pml_x, const float *pml_y, const float *pml_z, const float *duxdx, const float *duydy, const float *duzdz, const float *rho0)
 
template<bool is_BonA_scalar, bool is_rho0_scalar>
__global__ void CUDAComputePressurePartsNonLinear (float *rho_sum, float *BonA_sum, float *du_sum, const float *rhox, const float *rhoy, const float *rhoz, const float *duxdx, const float *duydy, const float *duzdz, const float *BonA_matrix, const float *rho0_matrix)
 
__global__ void CUDACompute_Absorb_nabla1_2 (cuFloatComplex *fft1, cuFloatComplex *fft2, const float *nabla1, const float *nabla2)
 
template<bool is_c2_scalar, bool is_tau_eta_scalar>
__global__ void CUDASumPressureTermsNonlinear (float *p, const float *BonA_temp, const float *c2_matrix, const float *absorb_tau, const float *tau_matrix, const float *absorb_eta, const float *eta_matrix)
 
template<bool is_c2_scalar, bool is_tau_eta_scalar>
__global__ void CUDASumPressureTermsLinear (float *p, const float *absorb_tau_temp, const float *absorb_eta_temp, const float *sum_rhoxyz, const float *c2_matrix, const float *tau_matrix, const float *eta_matrix)
 
template<bool is_c2_scalar, bool is_BonA_scalar, bool is_rho0_scalar>
__global__ void CUDASumPressureNonlinearLossless (float *p, const float *rhox, const float *rhoy, const float *rhoz, const float *c2_matrix, const float *BonA_matrix, const float *rho0_matrix)
 
template<bool is_rho0_scalar>
__global__ void CUDAComputePressurePartsLinear (float *sum_rhoxyz, float *sum_rho0_du, const float *rhox, const float *rhoy, const float *rhoz, const float *dux, const float *duy, const float *duz, const float *rho0_matrix)
 
template<bool is_c2_scalar>
__global__ void CUDASum_new_p_linear_lossless (float *p, const float *rhox, const float *rhoy, const float *rhoz, const float *c2_matrix)
 
__global__ void CUDATrasnposeReal3DMatrixXYSquare (float *outputMatrix, const float *inputMatrix, const dim3 dimSizes)
 
__global__ void CUDATrasnposeReal3DMatrixXYRect (float *outputMatrix, const float *inputMatrix, const dim3 dimSizes)
 
__global__ void CUDATrasnposeReal3DMatrixXZSquare (float *outputMatrix, const float *inputMatrix, const dim3 dimSizes)
 
__global__ void CUDATrasnposeReal3DMatrixXZRect (float *outputMatrix, const float *inputMatrix, const dim3 dimSizes)
 
__global__ void CUDAComputeVelocityShiftInX (cuFloatComplex *cufft_shift_temp, const cuFloatComplex *x_shift_neg_r)
 
__global__ void CUDAComputeVelocityShiftInY (cuFloatComplex *cufft_shift_temp, const cuFloatComplex *y_shift_neg_r)
 
__global__ void CUDAComputeVelocityShiftInZ (cuFloatComplex *cufft_shift_temp, const cuFloatComplex *z_shift_neg_r)
 

Variables

__constant__ TCUDADeviceConstants cudaDeviceConstants
 This variable holds basic simulation constants for GPU. More...
 

Detailed Description

Author
Jiri Jaros
Faculty of Information Technology
Brno University of Technology
jaros.nosp@m.jir@.nosp@m.fit.v.nosp@m.utbr.nosp@m..cz
Version
kspaceFirstOrder3D 3.4
Date
11 March 2013, 13:10 (created)
27 July 2016, 15:09 (revised)

License

This file is part of the C++ extension of the k-Wave Toolbox (http://www.k-wave.org).
Copyright (C) 2016 Jiri Jaros and Bradley Treeby.

This file is part of the k-Wave. k-Wave is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

k-Wave is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public License along with k-Wave. If not, see http://www.gnu.org/licenses/.

Definition in file SolverCUDAKernels.cu.

Function Documentation

__global__ void CUDAAddPressureSource ( float *  rhox,
float *  rhoy,
float *  rhoz,
const float *  p_source_input,
const size_t *  p_source_index,
const size_t  t_index 
)

CUDA kernel to add p_source to acoustic density.

Parameters
[out]rhox- Acoustic density
[out]rhoy- Acoustic density
[out]rhoz- Acoustic density
[in]p_source_input- Source input to add
[in]p_source_index- Index matrix with source
[in]t_index- Actual timestep

Definition at line 600 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

__global__ void CUDAAddTransducerSource ( float *  ux_sgx,
const size_t *  u_source_index,
size_t *  delay_mask,
const float *  transducer_signal 
)

CUDA kernel adding transducer data to ux_sgx

Parameters
[in,out]ux_sgx- Here we add the signal
[in]u_source_index- Where to add the signal (source)
[in,out]delay_mask- Delay mask to push the signal in the domain (incremented per invocation)
[in]transducer_signal- Transducer signal

Definition at line 474 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

__global__ void CUDAAddVelocitySource ( float *  uxyz_sgxyz,
const float *  u_source_input,
const size_t *  u_source_index,
const size_t  t_index 
)

CUDA kernel to add in velocity source terms.

Parameters
[in,out]uxyz_sgxyz- velocity matrix to update
[in]u_source_input- Source input to add
[in]u_source_index- Index matrix
[in]t_index- Actual time step

Definition at line 526 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

__global__ void CUDACompute_Absorb_nabla1_2 ( cuFloatComplex *  fft1,
cuFloatComplex *  fft2,
const float *  nabla1,
const float *  nabla2 
)

CUDA kernel which computes absorbing term with abosrb_nabla1 and absorb_nabla2. Calculate fft_1 = absorb_nabla1 .* fft_1
Calculate fft_2 = absorb_nabla2 .* fft_2

Parameters
[in,out]fft1- Nabla1 part
[in,out]fft2- Nabla2 part
[in]nabla1
[in]nabla2

Definition at line 1645 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

template<bool Is_c0_scalar>
__global__ void CUDACompute_p0_AddInitialPressure ( float *  p,
float *  rhox,
float *  rhoy,
float *  rhoz,
const float *  p0,
const float *  c2 = nullptr 
)

CUDA kernel to add initial pressure p0 into p, rhox, rhoy, rhoz. c is a matrix. Heterogeneity is treated by a template

Parameters
[out]p- pressure
[out]rhox
[out]rhoy
[out]rhoz
[in]p0- intial pressure
[in]c2- sound speed

Definition at line 1079 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

template<bool Is_rho0_scalar>
__global__ void CUDACompute_p0_Velocity ( float *  ux_sgx,
float *  uy_sgy,
float *  uz_sgz,
const float *  dt_rho0_sgx = nullptr,
const float *  dt_rho0_sgy = nullptr,
const float *  dt_rho0_sgz = nullptr 
)

CUDA kernel Compute u = dt ./ rho0_sgx .* u.

Parameters
[in,out]ux_sgx- data stored in u matrix
[in,out]uy_sgy- data stored in u matrix
[in,out]uz_sgz- data stored in u matrix
[in]dt_rho0_sgx- inner member of the equation
[in]dt_rho0_sgy- inner member of the equation
[in]dt_rho0_sgz- inner member of the equation

Definition at line 704 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

__global__ void CUDACompute_p0_VelocityScalarNonUniform ( float *  ux_sgx,
float *  uy_sgy,
float *  uz_sgz,
const float *  dxudxn_sgx,
const float *  dyudyn_sgy,
const float *  dzudzn_sgz 
)

CUDA kernel to Compute u = dt./rho0_sgy .* ifft (FFT). if rho0_sg is scalar, nonuniform non uniform grid, y component.

Parameters
[in,out]ux_sgx
[in,out]uy_sgy
[in,out]uz_sgz
[in]dxudxn_sgx
[in]dyudyn_sgy
[in]dzudzn_sgz

Definition at line 807 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

__global__ void CUDAComputeDensityLinearHeterogeneous ( float *  rhox,
float *  rhoy,
float *  rhoz,
const float *  pml_x,
const float *  pml_y,
const float *  pml_z,
const float *  duxdx,
const float *  duydy,
const float *  duzdz,
const float *  rho0 
)

CUDA kernel which calculate new values of rho (acoustic density). Linear, heterogenous case.

Parameters
[out]rhox- density x
[out]rhoy- density y
[out]rhoz- density y
[in]pml_x- pml x
[in]pml_y- pml y
[in]pml_z- pml z
[in]duxdx- gradient of velocity x
[in]duydy- gradient of velocity x
[in]duzdz- gradient of velocity z
[in]rho0- initial density (matrix here)

Definition at line 1409 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

__global__ void CUDAComputeDensityLinearHomogeneous ( float *  rhox,
float *  rhoy,
float *  rhoz,
const float *  pml_x,
const float *  pml_y,
const float *  pml_z,
const float *  duxdx,
const float *  duydy,
const float *  duzdz 
)

Interface to kernel which calculate new values of rho (acoustic density). Linear, homogenous case.

Parameters
[out]rhox- Density x
[out]rhoy- Density y
[out]rhoz- Density y
[in]pml_x- pml x
[in]pml_y- pml y
[in]pml_z- pml z
[in]duxdx- Gradient of velocity x
[in]duydy- Gradient of velocity x
[in]duzdz- Gradient of velocity z

Definition at line 1331 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

__global__ void CUDAComputeDensityNonlinearHeterogeneous ( float *  rhox,
float *  rhoy,
float *  rhoz,
const float *  pml_x,
const float *  pml_y,
const float *  pml_z,
const float *  duxdx,
const float *  duydy,
const float *  duzdz,
const float *  rho0 
)

CUDA kernel which calculate new values of rho (acoustic density). Non-linear, heterogenous case.

Parameters
[out]rhox- density x
[out]rhoy- density y
[out]rhoz- density y
[in]pml_x- pml x
[in]pml_y- pml y
[in]pml_z- pml z
[in]duxdx- gradient of velocity x
[in]duydy- gradient of velocity x
[in]duzdz- gradient of velocity z
[in]rho0- initial density (matrix here)

Definition at line 1243 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

__global__ void CUDAComputeDensityNonlinearHomogeneous ( float *  rhox,
float *  rhoy,
float *  rhoz,
const float *  pml_x,
const float *  pml_y,
const float *  pml_z,
const float *  duxdx,
const float *  duydy,
const float *  duzdz 
)

Interface to kernel which calculate new values of rho (acoustic density). Non-linear, homogenous case.

Parameters
[out]rhox- density x
[out]rhoy- density y
[out]rhoz- density y
[in]pml_x- pml x
[in]pml_y- pml y
[in]pml_z- pml z
[in]duxdx- gradient of velocity x
[in]duydy- gradient of velocity x
[in]duzdz- gradient of velocity z

Definition at line 1158 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

__global__ void CUDAComputePressurelGradient ( cuFloatComplex *  fft_x,
cuFloatComplex *  fft_y,
cuFloatComplex *  fft_z,
const float *  kappa,
const cuFloatComplex *  ddx,
const cuFloatComplex *  ddy,
const cuFloatComplex *  ddz 
)

kernel which compute part of the new velocity term - gradient of p represented by: bsxfun(@times, ddx_k_shift_pos, kappa .* p_k).

Parameters
[in,out]fft_x- matrix to store input for iFFT (p) /dx
[out]fft_y- matrix to store input for iFFT (p) /dy
[out]fft_z- matrix to store input for iFFT (p) /dz
[in]kappa- Real matrix of kappa
[in]ddx- precomputed value of ddx_k_shift_pos
[in]ddy- precomputed value of ddy_k_shift_pos
[in]ddz- precomputed value of ddz_k_shift_pos

Definition at line 877 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

template<bool is_rho0_scalar>
__global__ void CUDAComputePressurePartsLinear ( float *  sum_rhoxyz,
float *  sum_rho0_du,
const float *  rhox,
const float *  rhoy,
const float *  rhoz,
const float *  dux,
const float *  duy,
const float *  duz,
const float *  rho0_matrix 
)

CUDA kernel that Calculates two temporary sums in the new pressure formula, linear absorbing case.

Parameters
[out]sum_rhoxyz- rhox_sgx + rhoy_sgy + rhoz_sgz
[out]sum_rho0_du- rho0* (duxdx + duydy + duzdz);
[in]rhox
[in]rhoy
[in]rhoz
[in]dux
[in]duy
[in]duz
[in]rho0_matrix

Definition at line 2107 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

template<bool is_BonA_scalar, bool is_rho0_scalar>
__global__ void CUDAComputePressurePartsNonLinear ( float *  rho_sum,
float *  BonA_sum,
float *  du_sum,
const float *  rhox,
const float *  rhoy,
const float *  rhoz,
const float *  duxdx,
const float *  duydy,
const float *  duzdz,
const float *  BonA_matrix,
const float *  rho0_matrix 
)

CUDA kernel which calculates three temporary sums in the new pressure formula
non-linear absorbing case. Homogeneous and heterogenous variants are treated using templates. Homogeneous variables are in constant memory.

Parameters
[out]rho_sum- rhox_sgx + rhoy_sgy + rhoz_sgz
[out]BonA_sum- BonA + rho ^2 / 2 rho0 + (rhox_sgx + rhoy_sgy + rhoz_sgz)
[out]du_sum- rho0* (duxdx + duydy + duzdz)
[in]rhox,-Acoustic density X
[in]rhoy,-Acoustic density Y
[in]rhoz,-Acoustic density Z
[in]duxdx- Gradient of velocity in X
[in]duydy- Gradient of velocity in X
[in]duzdz- Gradient of velocity in X
[in]BonA_matrix- Heterogeneous value for BonA
[in]rho0_matrix- Heterogeneous value for rho0

Definition at line 1500 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

__global__ void CUDAComputeVelocity ( float *  ux_sgx,
float *  uy_sgy,
float *  uz_sgz,
const float *  ifft_x,
const float *  ifft_y,
const float *  ifft_z,
const float *  dt_rho0_sgx,
const float *  dt_rho0_sgy,
const float *  dt_rho0_sgz,
const float *  pml_x,
const float *  pml_y,
const float *  pml_z 
)

CUDA kernel to calculate ux_sgx, uy_sgy, uz_sgz. Default (heterogeneous case).

Parameters
[in,out]ux_sgx
[in,out]uy_sgy
[in,out]uz_sgz
[in]ifft_x
[in]ifft_y
[in]ifft_z
[in]dt_rho0_sgx
[in]dt_rho0_sgy
[in]dt_rho0_sgz
[in]pml_x
[in]pml_y
[in]pml_z

Definition at line 199 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

__global__ void CUDAComputeVelocityGradient ( cuFloatComplex *  fft_x,
cuFloatComplex *  fft_y,
cuFloatComplex *  fft_z,
const float *  kappa,
const cuFloatComplex *  ddx_neg,
const cuFloatComplex *  ddy_neg,
const cuFloatComplex *  ddz_neg 
)

Kernel calculating the inner part of du, dy, dz on uniform grid. Complex numbers are passed as float2 structures.

Parameters
[in,out]fft_x- FFT of ux
[in,out]fft_y- FFT of uy
[in,out]fft_z- FFT of uz
[in]kappa
[in]ddx_neg- ddx_k_shift_neg
[in]ddy_neg- ddy_k_shift_neg
[in]ddz_neg- ddz_k_shift_neg

Definition at line 947 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

__global__ void CUDAComputeVelocityGradientNonuniform ( float *  duxdx,
float *  duydy,
float *  duzdz,
const float *  duxdxn,
const float *  duydyn,
const float *  duzdzn 
)

CUDA kernel to shift du, dy and dz on non-uniform grid.

Parameters
[in,out]duxdx
[in,out]duydy
[in,out]duzdz
[in]duxdxn
[in]duydyn
[in]duzdzn

Definition at line 1019 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

__global__ void CUDAComputeVelocityScalarNonuniform ( float *  ux_sgx,
float *  uy_sgy,
float *  uz_sgz,
const float *  ifft_x,
const float *  ifft_y,
const float *  ifft_z,
const float *  dxudxn_sgx,
const float *  dyudyn_sgy,
const float *  dzudzn_sgz,
const float *  pml_x,
const float *  pml_y,
const float *  pml_z 
)

CUDA kernel to calculate ux_sgx, uy_sgy and uz_sgz. This is the case for rho0 being a scalar and a non-uniform grid.

Parameters
[in,out]ux_sgx- updated value of ux_sgx
[in,out]uy_sgy- updated value of ux_sgx
[in,out]uz_sgz- updated value of ux_sgx
[in]ifft_x- gradient of X
[in]ifft_y- gradient of X
[in]ifft_z- gradient of X
[in]dxudxn_sgx- matrix dx shift
[in]dyudyn_sgy- matrix dy shift
[in]dzudzn_sgz- matrix dz shift
[in]pml_x- matrix of pml_x
[in]pml_y- matrix of pml_x
[in]pml_z- matrix of pml_x

Definition at line 381 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

__global__ void CUDAComputeVelocityScalarUniform ( float *  ux_sgx,
float *  uy_sgy,
float *  uz_sgz,
const float *  ifft_x,
const float *  ifft_y,
const float *  ifft_z,
const float *  pml_x,
const float *  pml_y,
const float *  pml_z 
)

CUDA kernel to calculate ux_sgx, uy_sgy, uz_sgz. This is the case for rho0 being a scalar and a uniform grid.

Parameters
[in,out]ux_sgx- new value of ux
[in,out]uy_sgy- new value of uy
[in,out]uz_sgz- new value of ux
[in]ifft_x- gradient for X
[in]ifft_y- gradient for Y
[in]ifft_z- gradient for Z
[in]pml_x
[in]pml_y
[in]pml_z

Definition at line 295 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

__global__ void CUDAComputeVelocityShiftInX ( cuFloatComplex *  cufft_shift_temp,
const cuFloatComplex *  x_shift_neg_r 
)

CUDA kernel to compute velocity shift in the X direction.

Parameters
[in,out]cufft_shift_temp- Matrix to calculate 1D FFT to
[in]x_shift_neg_r

Definition at line 2744 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

__global__ void CUDAComputeVelocityShiftInY ( cuFloatComplex *  cufft_shift_temp,
const cuFloatComplex *  y_shift_neg_r 
)

CUDA kernel to compute velocity shift in Y. The matrix is XY transposed.

Parameters
[in,out]cufft_shift_temp- Matrix to calculate 1D FFT to
[in]y_shift_neg_r

Definition at line 2782 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

__global__ void CUDAComputeVelocityShiftInZ ( cuFloatComplex *  cufft_shift_temp,
const cuFloatComplex *  z_shift_neg_r 
)

CUDA kernel to compute velocity shift in Z. The matrix is XZ transposed.

Parameters
[in,out]cufft_shift_temp- Matrix to calculate 1D FFT to
[in]z_shift_neg_r

Definition at line 2823 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

__global__ void CUDAGetCUDACodeVersion ( int *  cudaCodeVersion)

Kernel to find out the version of the code. The list of GPUs can be found at https://en.wikipedia.org/wiki/CUDA

Parameters
[out]cudaCodeVersion

Definition at line 112 of file SolverCUDAKernels.cu.

template<bool is_c2_scalar>
__global__ void CUDASum_new_p_linear_lossless ( float *  p,
const float *  rhox,
const float *  rhoy,
const float *  rhoz,
const float *  c2_matrix 
)

CUDA kernel that sums sub-terms for new p, linear lossless case.

Parameters
[out]p
[in]rhox
[in]rhoy
[in]rhoz
[in]c2_matrix

Definition at line 2196 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

template<bool is_c2_scalar, bool is_BonA_scalar, bool is_rho0_scalar>
__global__ void CUDASumPressureNonlinearLossless ( float *  p,
const float *  rhox,
const float *  rhoy,
const float *  rhoz,
const float *  c2_matrix,
const float *  BonA_matrix,
const float *  rho0_matrix 
)

CUDA kernel that sums sub-terms for new p, non-linear lossless case.

Parameters
[out]p- New value of pressure
[in]rhox
[in]rhoy
[in]rhoz
[in]c2_matrix
[in]BonA_matrix
[in]rho0_matrix

Definition at line 1926 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

template<bool is_c2_scalar, bool is_tau_eta_scalar>
__global__ void CUDASumPressureTermsLinear ( float *  p,
const float *  absorb_tau_temp,
const float *  absorb_eta_temp,
const float *  sum_rhoxyz,
const float *  c2_matrix,
const float *  tau_matrix,
const float *  eta_matrix 
)

CUDA kernel that sums sub-terms to calculate new pressure, linear case.

Parameters
[out]p- new value of p
[in]absorb_tau_temp- sub-term with absorb_tau
[in]absorb_eta_temp- sub-term with absorb_eta
[in]sum_rhoxyz- rhox_sgx + rhoy_sgy + rhoz_sgz
[in]c2_matrix
[in]tau_matrix
[in]eta_matrix

Definition at line 1811 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

template<bool is_c2_scalar, bool is_tau_eta_scalar>
__global__ void CUDASumPressureTermsNonlinear ( float *  p,
const float *  BonA_temp,
const float *  c2_matrix,
const float *  absorb_tau,
const float *  tau_matrix,
const float *  absorb_eta,
const float *  eta_matrix 
)

CUDA Sum sub-terms to calculate new pressure, non-linear case.

Parameters
[out]p- new value of pressure
[in]BonA_temp- rho0 * (duxdx + duydy + duzdz)
[in]c2_matrix
[in]absorb_tau
[in]tau_matrix
[in]absorb_eta- BonA + rho ^2 / 2 rho0 + (rhox_sgx + rhoy_sgy + rhoz_sgz)
[in]eta_matrix

Definition at line 1697 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

__global__ void CUDATrasnposeReal3DMatrixXYRect ( float *  outputMatrix,
const float *  inputMatrix,
const dim3  dimSizes 
)

CUDA kernel to transpose a 3D matrix in XY planes of any dimension sizes Every block in a 1D grid transposes a few slabs. Every block is composed of a 2D mesh of threads. The y dim is for up to 4 tiles. Each tile is processed by a single 32-thread warp. The shared memory is used to coalesce memory accesses and the padding is to eliminate bank conflicts. First the full tiles are transposed, then the remainder in the X, then Y and finally the last bit in the bottom right corner.

Parameters
[out]outputMatrix- Output matrix
[in]inputMatrix- Input matrix
[in]dimSizes- Dimension sizes of the original matrix
Warning
The size X and Y dimensions have to be divisible by 32
A blockDim.x has to be 32 (one warp)
blockDim.y has to between 1 and 4 (for tiles at once)
blockDim.z must stay 1
Grid has to be organized (N, 1 ,1)

Definition at line 2346 of file SolverCUDAKernels.cu.

__global__ void CUDATrasnposeReal3DMatrixXYSquare ( float *  outputMatrix,
const float *  inputMatrix,
const dim3  dimSizes 
)

CUDA kernel to transpose a 3D matrix in XY planes if the dim sizes are divisible by 32 in X and Y axes. Every block in a 1D grid transposes a few slabs. Every block is composed of a 2D mesh of threads. The y dim is for up to 4 tiles. Each tile is processed by a single 32-thread warp. The shared memory is used to coalesce memory accesses and the padding is to eliminate bank conflicts.

Parameters
[out]outputMatrix- Output matrix
[in]inputMatrix- Input matrix
[in]dimSizes- Dimension sizes of the original matrix
Warning
The size X and Y dimensions have to be divisible by 32
A blockDim.x has to be 32 (one warp)
blockDim.y has to between 1 and 4 (for tiles at once)
blockDim.z must stay 1
Grid has to be organized (N, 1 ,1)
Todo:
  • What about Warp shuffle?
Todo:
http://www.pixel.io/blog/2013/3/25/fast-matrix-transposition-on-kepler-without-using-shared-mem.html

Definition at line 2274 of file SolverCUDAKernels.cu.

__global__ void CUDATrasnposeReal3DMatrixXZRect ( float *  outputMatrix,
const float *  inputMatrix,
const dim3  dimSizes 
)

CUDA kernel to transpose a 3D matrix in XZ planes of any dimension sizes Every block in a 1D grid transposes a few slabs. Every block is composed of a 2D mesh of threads. The y dim is for up to 4 tiles. Each tile is processed by a single 32-thread warp. The shared memory is used to coalesce memory accesses and the padding is to eliminate bank conflicts. First the full tiles are transposed, then the remainder in the X, then Y and finally the last bit in the bottom right corner.

Parameters
[out]outputMatrix- Output matrix
[in]inputMatrix- Input matrix
[in]dimSizes- Dimension sizes of the original matrix
Warning
The size X and Z dimensions have to be divisible by 32
A blockDim.x has to be 32 (one warp)
blockDim.y has to between 1 and 4 (for tiles at once)
blockDim.z must stay 1
Grid has to be organized (N, 1 ,1)

Definition at line 2585 of file SolverCUDAKernels.cu.

__global__ void CUDATrasnposeReal3DMatrixXZSquare ( float *  outputMatrix,
const float *  inputMatrix,
const dim3  dimSizes 
)

CUDA kernel to transpose a 3D matrix in XZ planes if the dim sizes are divisible by 32 in X and Z axes. Every block in a 1D grid transposes a few slabs. Every block is composed of a 2D mesh of threads. The y dim is for up to 4 tiles. Each tile is processed by a single 32-thread warp. The shared memory is used to coalesce memory accesses and the padding is to eliminate bank conflicts.

Parameters
[out]outputMatrix- Output matrix
[in]inputMatrix- Input matrix
[in]dimSizes- Dimension sizes of the original matrix
Warning
The size X and Z dimensions have to be divisible by 32
A blockDim.x has to be 32 (one warp)
blockDim.y has to between 1 and 4 (for tiles at once)
blockDim.z must stay 1
Grid has to be organized (N, 1 ,1)

Definition at line 2517 of file SolverCUDAKernels.cu.

int GetSolverBlockSize1D ( )
inline

Get block size for 1D kernels.

Returns
1D block size

Definition at line 66 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

Here is the caller graph for this function:

int GetSolverGridSize1D ( )
inline

Get grid size for 1D kernels.

Returns
1D grid size

Definition at line 76 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

Here is the caller graph for this function:

dim3 GetSolverTransposeBlockSize ( )
inline

Get block size for the transposition kernels.

Returns
3D grid size

Definition at line 86 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

dim3 GetSolverTransposeGirdSize ( )
inline

Get grid size for complex 3D kernels

Returns
3D grid size

Definition at line 96 of file SolverCUDAKernels.cu.

Here is the call graph for this function:

Variable Documentation

cudaDeviceConstants

This variable holds necessary simulation constants in the CUDA GPU memory. The variable is defined in TCUDADeviceConstants.cu

This variable holds necessary simulation constants in the CUDA GPU. memory. This variable is imported as extern into other CUDA units

Definition at line 54 of file CUDADeviceConstants.cu.