32 #include <cuComplex.h>
114 *cudaCodeVersion = -1;
116 #if (__CUDA_ARCH__ == 530)
117 *cudaCodeVersion = 53;
118 #elif (__CUDA_ARCH__ == 520)
119 *cudaCodeVersion = 52;
120 #elif (__CUDA_ARCH__ == 500)
121 *cudaCodeVersion = 50;
122 #elif (__CUDA_ARCH__ == 370)
123 *cudaCodeVersion = 37;
124 #elif (__CUDA_ARCH__ == 350)
125 *cudaCodeVersion = 35;
126 #elif (__CUDA_ARCH__ == 320)
127 *cudaCodeVersion = 32;
128 #elif (__CUDA_ARCH__ == 300)
129 *cudaCodeVersion = 30;
130 #elif (__CUDA_ARCH__ == 210)
131 *cudaCodeVersion = 21;
132 #elif (__CUDA_ARCH__ == 200)
133 *cudaCodeVersion = 20;
145 int* hCudaCodeVersion;
146 int* dCudaCodeVersion;
149 int cudaCodeVersion = 0;
150 cudaError_t cudaError;
153 cudaError = cudaHostAlloc<int>(&hCudaCodeVersion,
155 cudaHostRegisterPortable | cudaHostRegisterMapped);
158 if (cudaError == cudaSuccess)
160 checkCudaErrors(cudaHostGetDevicePointer<int>(&dCudaCodeVersion, hCudaCodeVersion, 0));
163 CUDAGetCUDACodeVersion<<<1,1>>>(dCudaCodeVersion);
164 cudaDeviceSynchronize();
165 if (cudaGetLastError() != cudaSuccess)
172 cudaCodeVersion = *hCudaCodeVersion;
178 return (cudaCodeVersion);
205 const float* dt_rho0_sgx,
206 const float* dt_rho0_sgy,
207 const float* dt_rho0_sgz,
220 const float pml_x_data = pml_x[coords.x];
221 const float pml_y_data = pml_y[coords.y];
222 const float pml_z_data = pml_z[coords.z];
224 ux_sgx[i] = (ux_sgx[i] * pml_x_data - ifft_x_el) * pml_x_data;
225 uy_sgy[i] = (uy_sgy[i] * pml_y_data - ifft_y_el) * pml_y_data;
226 uz_sgz[i] = (uz_sgz[i] * pml_z_data - ifft_z_el) * pml_z_data;
260 CUDAComputeVelocity<<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
313 const float pml_x_el = pml_x[coords.x];
314 const float pml_y_el = pml_y[coords.y];
315 const float pml_z_el = pml_z[coords.z];
317 ux_sgx[i] = (ux_sgx[i] * pml_x_el - Divider_X * ifft_x[i]) * pml_x_el;
318 uy_sgy[i] = (uy_sgy[i] * pml_y_el - Divider_Y * ifft_y[i]) * pml_y_el;
319 uz_sgz[i] = (uz_sgz[i] * pml_z_el - Divider_Z * ifft_z[i]) * pml_z_el;
348 CUDAComputeVelocityScalarUniform<<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
387 const float* dxudxn_sgx,
388 const float* dyudyn_sgy,
389 const float* dzudzn_sgz,
402 const float pml_x_el = pml_x[coords.x];
403 const float pml_y_el = pml_y[coords.y];
404 const float pml_z_el = pml_z[coords.z];
406 const float ifft_x_el = Divider_X * dxudxn_sgx[coords.x] * ifft_x[i];
407 const float ifft_y_el = Divider_Y * dyudyn_sgy[coords.y] * ifft_y[i];
408 const float ifft_z_el = Divider_Z * dzudzn_sgz[coords.z] * ifft_z[i];
410 ux_sgx[i] = (ux_sgx[i] * pml_x_el - ifft_x_el) * pml_x_el;
411 uy_sgy[i] = (uy_sgy[i] * pml_y_el - ifft_y_el) * pml_y_el;
412 uz_sgz[i] = (uz_sgz[i] * pml_z_el - ifft_z_el) * pml_z_el;
446 CUDAComputeVelocityScalarNonuniform<<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
475 const size_t* u_source_index,
477 const float* transducer_signal)
481 ux_sgx[u_source_index[i]] += transducer_signal[delay_mask[i]];
507 CUDAAddTransducerSource<<<gridSize, GetSolverBlockSize1D()>>>
527 const float* u_source_input,
528 const size_t* u_source_index,
529 const size_t t_index)
539 u_source_input[index2D + i];
548 u_source_input[index2D + i];
566 const size_t t_index)
578 CUDAAddVelocitySource<<< gridSize, GetSolverBlockSize1D()>>>
603 const float* p_source_input,
604 const size_t* p_source_index,
605 const size_t t_index)
616 rhox[p_source_index[i]] = p_source_input[index2D];
617 rhoy[p_source_index[i]] = p_source_input[index2D];
618 rhoz[p_source_index[i]] = p_source_input[index2D];
625 rhox[p_source_index[i]] = p_source_input[index2D + i];
626 rhoy[p_source_index[i]] = p_source_input[index2D + i];
627 rhoz[p_source_index[i]] = p_source_input[index2D + i];
638 rhox[p_source_index[i]] += p_source_input[index2D];
639 rhoy[p_source_index[i]] += p_source_input[index2D];
640 rhoz[p_source_index[i]] += p_source_input[index2D];
647 rhox[p_source_index[i]] += p_source_input[index2D + i];
648 rhoy[p_source_index[i]] += p_source_input[index2D + i];
649 rhoz[p_source_index[i]] += p_source_input[index2D + i];
671 const size_t t_index)
679 CUDAAddPressureSource<<<gridSize,GetSolverBlockSize1D()>>>
703 template <
bool Is_rho0_scalar>
707 const float* dt_rho0_sgx =
nullptr,
708 const float* dt_rho0_sgy =
nullptr,
709 const float* dt_rho0_sgz =
nullptr)
720 ux_sgx[i] *= dividerX;
721 uy_sgy[i] *= dividerY;
722 uz_sgz[i] *= dividerZ;
731 ux_sgx[i] *= dt_rho0_sgx[i] * divider;
732 uy_sgy[i] *= dt_rho0_sgy[i] * divider;
733 uz_sgz[i] *= dt_rho0_sgz[i] * divider;
757 CUDACompute_p0_Velocity<false>
783 CUDACompute_p0_Velocity<true>
810 const float* dxudxn_sgx,
811 const float* dyudyn_sgy,
812 const float* dzudzn_sgz)
822 ux_sgx[i] *= dividerX * dxudxn_sgx[coords.x];
823 uy_sgy[i] *= dividerY * dyudyn_sgy[coords.y];
824 uz_sgz[i] *= dividerZ * dzudzn_sgz[coords.z];
848 CUDACompute_p0_VelocityScalarNonUniform<<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
878 cuFloatComplex* fft_y,
879 cuFloatComplex* fft_z,
881 const cuFloatComplex* ddx,
882 const cuFloatComplex* ddy,
883 const cuFloatComplex* ddz)
889 const cuFloatComplex p_k_el = fft_x[i] * kappa[i];
891 fft_x[i] = cuCmulf(p_k_el, ddx[coords.x]);
892 fft_y[i] = cuCmulf(p_k_el, ddy[coords.y]);
893 fft_z[i] = cuCmulf(p_k_el, ddz[coords.z]);
920 CUDAComputePressurelGradient<<<GetSolverGridSize1D(),GetSolverBlockSize1D()>>>
925 reinterpret_cast<const cuFloatComplex*
>(ddx.
GetDeviceData()),
926 reinterpret_cast<const cuFloatComplex*>(ddy.
GetDeviceData()),
927 reinterpret_cast<const cuFloatComplex*>(ddz.
GetDeviceData()));
948 cuFloatComplex* fft_y,
949 cuFloatComplex* fft_z,
951 const cuFloatComplex* ddx_neg,
952 const cuFloatComplex* ddy_neg,
953 const cuFloatComplex* ddz_neg)
959 const cuFloatComplex ddx_neg_el = ddx_neg[coords.x];
960 const cuFloatComplex ddz_neg_el = ddz_neg[coords.z];
961 const cuFloatComplex ddy_neg_el = ddy_neg[coords.y];
965 const cuFloatComplex fft_x_el = fft_x[i] * kappa_el;
966 const cuFloatComplex fft_y_el = fft_y[i] * kappa_el;
967 const cuFloatComplex fft_z_el = fft_z[i] * kappa_el;
969 fft_x[i] = cuCmulf(fft_x_el, ddx_neg_el);
970 fft_y[i] = cuCmulf(fft_y_el, ddy_neg_el);
971 fft_z[i] = cuCmulf(fft_z_el, ddz_neg_el);
994 CUDAComputeVelocityGradient<<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
999 reinterpret_cast<const cuFloatComplex *
>(ddx_k_shift_neg.
GetDeviceData()),
1000 reinterpret_cast<const cuFloatComplex *>(ddy_k_shift_neg.
GetDeviceData()),
1001 reinterpret_cast<const cuFloatComplex *>(ddz_k_shift_neg.
GetDeviceData()));
1022 const float* duxdxn,
1023 const float* duydyn,
1024 const float* duzdzn)
1030 duxdx[i] *= duxdxn[coords.x];
1031 duydy[i] *= duydyn[coords.y];
1032 duzdz[i] *= duzdzn[coords.z];
1054 CUDAComputeVelocityGradientNonuniform<<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
1078 template<
bool Is_c0_scalar>
1084 const float* c2 =
nullptr)
1088 float tmp = p[i] = p0[i];
1116 const bool Is_c2_scalar,
1121 CUDACompute_p0_AddInitialPressure<true>
1131 CUDACompute_p0_AddInitialPressure<false>
1172 const float pml_x_el = pml_x[coords.x];
1173 const float pml_y_el = pml_y[coords.y];
1174 const float pml_z_el = pml_z[coords.z];
1176 const float dux = duxdx[i];
1177 const float duy = duydy[i];
1178 const float duz = duzdz[i];
1214 CUDAComputeDensityNonlinearHomogeneous<<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
1258 const float pml_x_el = pml_x[coords.x];
1259 const float pml_y_el = pml_y[coords.y];
1260 const float pml_z_el = pml_z[coords.z];
1264 const float dux = duxdx[i];
1265 const float duy = duydy[i];
1266 const float duz = duzdz[i];
1300 CUDAComputeDensityNonlinearHeterogeneous<<<GetSolverGridSize1D(), GetSolverBlockSize1D() >>>
1345 const float pml_x_el = pml_x[coords.x];
1346 const float pml_y_el = pml_y[coords.y];
1347 const float pml_z_el = pml_z[coords.z];
1379 CUDAComputeDensityLinearHomogeneous<<<GetSolverGridSize1D(), GetSolverBlockSize1D() >>>
1424 const float pml_x_el = pml_x[coords.x];
1425 const float pml_y_el = pml_y[coords.y];
1426 const float pml_z_el = pml_z[coords.z];
1430 rhox[i] = pml_x_el * (pml_x_el * rhox[i] - dt_rho0 * duxdx[i]);
1431 rhoy[i] = pml_y_el * (pml_y_el * rhoy[i] - dt_rho0 * duydy[i]);
1432 rhoz[i] = pml_z_el * (pml_z_el * rhoz[i] - dt_rho0 * duzdz[i]);
1462 CUDAComputeDensityLinearHeterogeneous<<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
1499 template <
bool is_BonA_scalar,
bool is_rho0_scalar>
1509 const float* BonA_matrix,
1510 const float* rho0_matrix)
1517 const float rho_xyz_el = rhox[i] + rhoy[i] + rhoz[i];
1519 rho_sum[i] = rho_xyz_el;
1520 BonA_sum[i] = ((BonA * rho_xyz_el * rho_xyz_el) / (2.0f * rho0)) + rho_xyz_el;
1521 du_sum[i] = rho0 * (duxdx[i] + duydy[i] + duzdz[i]);
1553 const bool is_BonA_scalar,
1554 const float* BonA_matrix,
1555 const bool is_rho0_scalar,
1556 const float* rho0_matrix)
1563 CUDAComputePressurePartsNonLinear<true, true>
1579 CUDAComputePressurePartsNonLinear<true, false>
1598 CUDAComputePressurePartsNonLinear<false, true>
1614 CUDAComputePressurePartsNonLinear<false, false>
1646 cuFloatComplex* fft2,
1647 const float* nabla1,
1648 const float* nabla2)
1652 fft1[i] *= nabla1[i];
1653 fft2[i] *= nabla2[i];
1673 CUDACompute_Absorb_nabla1_2<<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
1696 template <
bool is_c2_scalar,
bool is_tau_eta_scalar>
1698 const float* BonA_temp,
1699 const float* c2_matrix,
1700 const float* absorb_tau,
1701 const float* tau_matrix,
1702 const float* absorb_eta,
1703 const float* eta_matrix)
1712 ((absorb_tau[i] * tau) - (absorb_eta[i] * eta))));
1731 const bool is_c2_scalar,
1732 const float* c2_matrix,
1733 const bool is_tau_eta_scalar,
1734 const float* absorb_tau,
1735 const float* tau_matrix,
1736 const float* absorb_eta,
1737 const float* eta_matrix)
1741 if (is_tau_eta_scalar)
1743 CUDASumPressureTermsNonlinear<true, true>
1755 CUDASumPressureTermsNonlinear<true, false>
1768 if (is_tau_eta_scalar)
1770 CUDASumPressureTermsNonlinear<false, true>
1782 CUDASumPressureTermsNonlinear<false, false>
1810 template <
bool is_c2_scalar,
bool is_tau_eta_scalar>
1812 const float* absorb_tau_temp,
1813 const float* absorb_eta_temp,
1814 const float* sum_rhoxyz,
1815 const float* c2_matrix,
1816 const float* tau_matrix,
1817 const float* eta_matrix)
1826 (absorb_tau_temp[i] * tau - absorb_eta_temp[i] * eta)));
1848 const bool is_c2_scalar,
1849 const float* c2_matrix,
1850 const bool is_tau_eta_scalar,
1851 const float* tau_matrix,
1852 const float* eta_matrix)
1856 if (is_tau_eta_scalar)
1858 CUDASumPressureTermsLinear<true,true>
1870 CUDASumPressureTermsLinear<true,false>
1883 if (is_tau_eta_scalar)
1885 CUDASumPressureTermsLinear<false,true>
1897 CUDASumPressureTermsLinear<false,false>
1925 template<
bool is_c2_scalar,
bool is_BonA_scalar,
bool is_rho0_scalar>
1930 const float* c2_matrix,
1931 const float* BonA_matrix,
1932 const float* rho0_matrix)
1940 const float sum_rho = rhox[i] + rhoy[i] + rhoz[i];
1942 p[i] = c2 * (sum_rho + (BonA * (sum_rho * sum_rho) / (2.0f * rho0)));
1964 const bool is_c2_scalar,
1965 const float* c2_matrix,
1966 const bool is_BonA_scalar,
1967 const float* BonA_matrix,
1968 const bool is_rho0_scalar,
1969 const float* rho0_matrix)
1977 CUDASumPressureNonlinearLossless<true, true, true>
1989 CUDASumPressureNonlinearLossless<true, true, false>
2004 CUDASumPressureNonlinearLossless<true, false, true>
2016 CUDASumPressureNonlinearLossless<true, false, false>
2034 CUDASumPressureNonlinearLossless<false, true, true>
2046 CUDASumPressureNonlinearLossless<false, true, false>
2061 CUDASumPressureNonlinearLossless<false, false, true>
2073 CUDASumPressureNonlinearLossless<false, false, false>
2106 template<
bool is_rho0_scalar>
2115 const float* rho0_matrix)
2121 sum_rhoxyz[i] = rhox[i] + rhoy[i] + rhoz[i];
2122 sum_rho0_du[i] = rho0 * (dux[i] + duy[i] + duz[i]);
2149 const bool is_rho0_scalar,
2150 const float* rho0_matrix)
2154 CUDAComputePressurePartsLinear<true>
2168 CUDAComputePressurePartsLinear<false>
2195 template <
bool is_c2_scalar>
2200 const float* c2_matrix)
2205 p[i] = c2 * (rhox[i] + rhoy[i] + rhoz[i]);
2223 const bool is_c2_scalar,
2224 const float* c2_matrix)
2228 CUDASum_new_p_linear_lossless<true>
2238 CUDASum_new_p_linear_lossless<false>
2275 const float* inputMatrix,
2276 const dim3 dimSizes)
2282 volatile __shared__
float sharedTile[4][32][32+1];
2285 for (
auto slabIdx = blockIdx.x; slabIdx < dimSizes.z; slabIdx += gridDim.x)
2288 const float * inputSlab = inputMatrix + (dimSizes.x * dimSizes.y * slabIdx);
2289 float * outputSlab = outputMatrix + (dimSizes.x * dimSizes.y * slabIdx);
2291 dim3 tileIdx {0,0,0};
2292 dim3 tileCount {dimSizes.x >> 5, dimSizes.y >> 5, 1};
2295 for (tileIdx.y = threadIdx.y; tileIdx.y < tileCount.y; tileIdx.y += blockDim.y)
2298 for (tileIdx.x = 0; tileIdx.x < tileCount.x; tileIdx.x++)
2301 for (
auto row = 0; row < 32; row++)
2303 sharedTile[threadIdx.y][row][threadIdx.x]
2304 = inputSlab[(tileIdx.y * 32 + row) * dimSizes.x +
2305 (tileIdx.x * 32) + threadIdx.x];
2311 for (
auto row = 0; row < 32; row ++)
2313 outputSlab[(tileIdx.x * 32 + row) * dimSizes.y +
2314 (tileIdx.y * 32) + threadIdx.x]
2315 = sharedTile[threadIdx.y][threadIdx.x][row];
2347 const float* inputMatrix,
2348 const dim3 dimSizes)
2352 volatile __shared__
float sharedTile[4][32][32+1];
2355 for (
auto slabIdx = blockIdx.x; slabIdx < dimSizes.z; slabIdx += gridDim.x)
2358 const float * inputSlab = inputMatrix + (dimSizes.x * dimSizes.y * slabIdx);
2359 float * outputSlab = outputMatrix + (dimSizes.x * dimSizes.y * slabIdx);
2361 dim3 tileIdx = {0,0,0};
2362 dim3 tileCount = {dimSizes.x >> 5, dimSizes.y >> 5, 1};
2365 for (tileIdx.y = threadIdx.y; tileIdx.y < tileCount.y; tileIdx.y += blockDim.y)
2369 for (tileIdx.x = 0; tileIdx.x < tileCount.x; tileIdx.x++)
2372 for (
auto row = 0; row < 32; row++)
2374 sharedTile[threadIdx.y][row][threadIdx.x]
2375 = inputSlab[(tileIdx.y * 32 + row) * dimSizes.x +
2376 (tileIdx.x * 32) + threadIdx.x];
2380 for (
auto row = 0; row < 32; row ++)
2382 outputSlab[(tileIdx.x * 32 + row) * dimSizes.y +
2383 (tileIdx.y * 32) + threadIdx.x]
2384 = sharedTile[threadIdx.y][threadIdx.x][row];
2391 if ((tileCount.x * 32 + threadIdx.x) < dimSizes.x)
2393 for (
auto row = 0; row < 32; row++)
2395 sharedTile[threadIdx.y][row][threadIdx.x]
2396 = inputSlab[(tileIdx.y * 32 + row) * dimSizes.x +
2397 (tileCount.x * 32) + threadIdx.x];
2402 for (
auto row = 0; (tileCount.x * 32 + row) < dimSizes.x; row++)
2404 outputSlab[(tileCount.x * 32 + row) * dimSizes.y +
2405 (tileIdx.y * 32) + threadIdx.x]
2406 = sharedTile[threadIdx.y][threadIdx.x][row];
2413 for (tileIdx.x = threadIdx.y; tileIdx.x < tileCount.x; tileIdx.x += blockDim.y)
2416 for (
auto row = 0; (tileCount.y * 32 + row) < dimSizes.y; row++)
2418 sharedTile[threadIdx.y][row][threadIdx.x]
2419 = inputSlab[(tileCount.y * 32 + row) * dimSizes.x +
2420 (tileIdx.x * 32) + threadIdx.x];
2424 if ((tileCount.y * 32 + threadIdx.x) < dimSizes.y)
2426 for (
auto row = 0; row < 32 ; row++)
2428 outputSlab[(tileIdx.x * 32 + row) * dimSizes.y +
2429 (tileCount.y * 32) + threadIdx.x]
2430 = sharedTile[threadIdx.y][threadIdx.x][row];
2436 if (threadIdx.y == 0)
2439 if ((tileCount.x * 32 + threadIdx.x) < dimSizes.x)
2441 for (
auto row = 0; (tileCount.y * 32 + row) < dimSizes.y; row++)
2443 sharedTile[threadIdx.y][row][threadIdx.x]
2444 = inputSlab[(tileCount.y * 32 + row) * dimSizes.x +
2445 (tileCount.x * 32) + threadIdx.x];
2450 if ((tileCount.y * 32 + threadIdx.x) < dimSizes.y)
2452 for (
auto row = 0; (tileCount.x * 32 + row) < dimSizes.x ; row++)
2454 outputSlab[(tileCount.x * 32 + row) * dimSizes.y +
2455 (tileCount.y * 32) + threadIdx.x]
2456 = sharedTile[threadIdx.y][threadIdx.x][row];
2473 const float* inputMatrix,
2474 const dim3& dimSizes)
2477 if ((dimSizes.x % 32 == 0) && (dimSizes.y % 32 == 0))
2480 CUDATrasnposeReal3DMatrixXYSquare<<<GetSolverTransposeGirdSize(),GetSolverTransposeBlockSize() >>>
2481 (outputMatrix, inputMatrix, dimSizes);
2485 CUDATrasnposeReal3DMatrixXYRect<<<GetSolverTransposeGirdSize(), GetSolverTransposeBlockSize() >>>
2486 (outputMatrix, inputMatrix, dimSizes);
2518 const float* inputMatrix,
2519 const dim3 dimSizes)
2523 volatile __shared__
float shared_tile[4][32][32+1];
2526 for (
auto row = blockIdx.x; row < dimSizes.y; row += gridDim.x)
2528 dim3 tileIdx = {0,0,0};
2529 dim3 tileCount = {dimSizes.x >> 5, dimSizes.z >> 5, 1};
2532 for (tileIdx.y = threadIdx.y; tileIdx.y < tileCount.y; tileIdx.y += blockDim.y)
2535 for (tileIdx.x = 0; tileIdx.x < tileCount.x; tileIdx.x ++)
2538 for (
auto slab = 0; slab < 32; slab++)
2540 shared_tile[threadIdx.y][slab][threadIdx.x]
2541 = inputMatrix[(tileIdx.y * 32 + slab) * (dimSizes.x * dimSizes.y) +
2542 (row * dimSizes.x) +
2543 tileIdx.x * 32 + threadIdx.x];
2548 for (
auto slab = 0; slab < 32; slab++)
2550 outputMatrix[(tileIdx.x * 32 + slab) * (dimSizes.y * dimSizes.z) +
2552 tileIdx.y * 32 + threadIdx.x]
2553 = shared_tile[threadIdx.y][threadIdx.x][slab];
2586 const float* inputMatrix,
2587 const dim3 dimSizes)
2591 volatile __shared__
float shared_tile[4][32][32+1];
2594 for (
auto row = blockIdx.x; row < dimSizes.y; row += gridDim.x)
2596 dim3 tileIdx = {0,0,0};
2597 dim3 tileCount = {dimSizes.x >> 5, dimSizes.z >> 5, 1};
2600 for (tileIdx.y = threadIdx.y; tileIdx.y < tileCount.y; tileIdx.y += blockDim.y)
2603 for (tileIdx.x = 0; tileIdx.x < tileCount.x; tileIdx.x++)
2606 for (
auto slab = 0; slab < 32; slab++)
2608 shared_tile[threadIdx.y][slab][threadIdx.x]
2609 = inputMatrix[(tileIdx.y * 32 + slab) * (dimSizes.x * dimSizes.y) +
2611 tileIdx.x * 32 + threadIdx.x];
2617 for (
auto slab = 0; slab < 32; slab++)
2619 outputMatrix[(tileIdx.x * 32 + slab) * (dimSizes.y * dimSizes.z) +
2621 tileIdx.y * 32 + threadIdx.x]
2622 = shared_tile[threadIdx.y][threadIdx.x][slab];
2628 if ((tileCount.x * 32 + threadIdx.x) < dimSizes.x)
2630 for (
auto slab = 0; slab < 32; slab++)
2632 shared_tile[threadIdx.y][slab][threadIdx.x]
2633 = inputMatrix[(tileIdx.y * 32 + slab) * (dimSizes.x * dimSizes.y) +
2635 tileCount.x * 32 + threadIdx.x];
2640 for (
auto slab = 0; (tileCount.x * 32 + slab) < dimSizes.x; slab++)
2642 outputMatrix[(tileCount.x * 32 + slab) * (dimSizes.y * dimSizes.z) +
2644 tileIdx.y * 32 + threadIdx.x]
2645 = shared_tile[threadIdx.y][threadIdx.x][slab];
2652 for (tileIdx.x = threadIdx.y; tileIdx.x < tileCount.x; tileIdx.x += blockDim.y)
2655 for (
auto slab = 0; (tileCount.y * 32 + slab) < dimSizes.z; slab++)
2657 shared_tile[threadIdx.y][slab][threadIdx.x]
2658 = inputMatrix[(tileCount.y * 32 + slab) * (dimSizes.x * dimSizes.y) +
2660 tileIdx.x * 32 + threadIdx.x];
2665 if ((tileCount.y * 32 + threadIdx.x) < dimSizes.z)
2667 for (
auto slab = 0; slab < 32; slab++)
2669 outputMatrix[(tileIdx.x * 32 + slab) * (dimSizes.y * dimSizes.z) +
2671 tileCount.y * 32 + threadIdx.x]
2672 = shared_tile[threadIdx.y][threadIdx.x][slab];
2678 if (threadIdx.y == 0)
2681 if ((tileCount.x * 32 + threadIdx.x) < dimSizes.x)
2683 for (
auto slab = 0; (tileCount.y * 32 + slab) < dimSizes.z; slab++)
2685 shared_tile[threadIdx.y][slab][threadIdx.x]
2686 = inputMatrix[(tileCount.y * 32 + slab) * (dimSizes.x * dimSizes.y) +
2688 tileCount.x * 32 + threadIdx.x];
2693 if ((tileCount.y * 32 + threadIdx.x) < dimSizes.z)
2695 for (
auto slab = 0; (tileCount.x * 32 + slab) < dimSizes.x ; slab++)
2697 outputMatrix[(tileCount.x * 32 + slab) * (dimSizes.y * dimSizes.z) +
2699 tileCount.y * 32 + threadIdx.x]
2700 = shared_tile[threadIdx.y][threadIdx.x][slab];
2717 const float* inputMatrix,
2718 const dim3& dimSizes)
2720 if ((dimSizes.x % 32 == 0) && (dimSizes.y % 32 == 0))
2723 CUDATrasnposeReal3DMatrixXZSquare<<<GetSolverTransposeGirdSize(), GetSolverTransposeBlockSize() >>>
2724 (outputMatrix, inputMatrix, dimSizes);
2728 CUDATrasnposeReal3DMatrixXZRect<<<GetSolverTransposeGirdSize(), GetSolverTransposeBlockSize() >>>
2729 (outputMatrix, inputMatrix, dimSizes);
2745 const cuFloatComplex* x_shift_neg_r)
2767 CUDAComputeVelocityShiftInX<<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
2768 (
reinterpret_cast<cuFloatComplex*
> (cufft_shift_temp.
GetDeviceData()),
2769 reinterpret_cast<const cuFloatComplex*> (x_shift_neg_r.
GetDeviceData()));
2783 const cuFloatComplex* y_shift_neg_r)
2791 const auto y = i % ny_2;
2808 CUDAComputeVelocityShiftInY<<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
2809 (
reinterpret_cast<cuFloatComplex*
> (cufft_shift_temp.
GetDeviceData()),
2810 reinterpret_cast<const cuFloatComplex*> (y_shift_neg_r.
GetDeviceData()));
2824 const cuFloatComplex* z_shift_neg_r)
2832 const auto z = i % nz_2;
2849 CUDAComputeVelocityShiftInZ<<<GetSolverGridSize1D(), GetSolverBlockSize1D()>>>
2850 (
reinterpret_cast<cuFloatComplex*
> (cufft_shift_temp.
GetDeviceData()),
2851 reinterpret_cast<const cuFloatComplex*> (z_shift_neg_r.
GetDeviceData()));
__global__ void CUDACompute_Absorb_nabla1_2(cuFloatComplex *fft1, cuFloatComplex *fft2, const float *nabla1, const float *nabla2)
__global__ void CUDAComputePressurePartsLinear(float *sum_rhoxyz, float *sum_rho0_du, const float *rhox, const float *rhoy, const float *rhoz, const float *dux, const float *duy, const float *duz, const float *rho0_matrix)
__global__ void CUDAComputeVelocity(float *ux_sgx, float *uy_sgy, float *uz_sgz, const float *ifft_x, const float *ifft_y, const float *ifft_z, const float *dt_rho0_sgx, const float *dt_rho0_sgy, const float *dt_rho0_sgz, const float *pml_x, const float *pml_y, const float *pml_z)
unsigned int u_source_index_size
size of the u source index
__global__ void CUDAComputeVelocityScalarNonuniform(float *ux_sgx, float *uy_sgy, float *uz_sgz, const float *ifft_x, const float *ifft_y, const float *ifft_z, const float *dxudxn_sgx, const float *dyudyn_sgy, const float *dzudzn_sgz, const float *pml_x, const float *pml_y, const float *pml_z)
void AddVelocitySource(TRealMatrix &uxyz_sgxyz, const TRealMatrix &u_source_input, const TIndexMatrix &u_source_index, const size_t t_index)
Add in velocity source terms.
__global__ void CUDAComputeVelocityGradient(cuFloatComplex *fft_x, cuFloatComplex *fft_y, cuFloatComplex *fft_z, const float *kappa, const cuFloatComplex *ddx_neg, const cuFloatComplex *ddy_neg, const cuFloatComplex *ddz_neg)
dim3 GetSolverTransposeGirdSize()
__global__ void CUDATrasnposeReal3DMatrixXZRect(float *outputMatrix, const float *inputMatrix, const dim3 dimSizes)
unsigned int nx
size of X dimension.
float fftDivider
normalization constant for 3D FFT.
int GetCUDACodeVersion()
Get the CUDA architecture and GPU code version the code was compiled with.
__global__ void CUDATrasnposeReal3DMatrixXZSquare(float *outputMatrix, const float *inputMatrix, const dim3 dimSizes)
float BonA_scalar
BonA value for homogeneous case.
__global__ void CUDATrasnposeReal3DMatrixXYRect(float *outputMatrix, const float *inputMatrix, const dim3 dimSizes)
The header file for the class for storing constants residing in CUDA constant memory.
virtual size_t * GetDeviceData()
Get raw GPU data out of the class (for direct GPU kernel access).
Structure for CUDA parameters to be placed in constant memory. Only 32b values are used...
unsigned int nxComplex
size of complex X dimension.
__global__ void CUDAComputeVelocityGradientNonuniform(float *duxdx, float *duydy, float *duzdz, const float *duxdxn, const float *duydyn, const float *duzdzn)
__global__ void CUDAAddTransducerSource(float *ux_sgx, const size_t *u_source_index, size_t *delay_mask, const float *transducer_signal)
__global__ void CUDACompute_p0_VelocityScalarNonUniform(float *ux_sgx, float *uy_sgy, float *uz_sgz, const float *dxudxn_sgx, const float *dyudyn_sgy, const float *dzudzn_sgz)
__global__ void CUDAComputeDensityLinearHeterogeneous(float *rhox, float *rhoy, float *rhoz, const float *pml_x, const float *pml_y, const float *pml_z, const float *duxdx, const float *duydy, const float *duzdz, const float *rho0)
__global__ void CUDAGetCUDACodeVersion(int *cudaCodeVersion)
void SumPressureLinearLossless(TRealMatrix &p, const TRealMatrix &rhox, const TRealMatrix &rhoy, const TRealMatrix &rhoz, const bool is_c2_scalar, const float *c2_matrix)
Sum sub-terms for new p, linear lossless case.
__global__ void CUDATrasnposeReal3DMatrixXYSquare(float *outputMatrix, const float *inputMatrix, const dim3 dimSizes)
void ComputeVelocityGradientNonuniform(TRealMatrix &duxdx, TRealMatrix &duydy, TRealMatrix &duzdz, const TRealMatrix &dxudxn, const TRealMatrix &dyudyn, const TRealMatrix &dzudzn)
Shift gradient of acoustic velocity on non-uniform grid.
static TParameters & GetInstance()
Get instance of the singleton class.
__global__ void CUDASumPressureTermsNonlinear(float *p, const float *BonA_temp, const float *c2_matrix, const float *absorb_tau, const float *tau_matrix, const float *absorb_eta, const float *eta_matrix)
__device__ unsigned int GetIndex()
Get global 1D coordinate for 1D CUDA block.
__global__ void CUDAComputePressurelGradient(cuFloatComplex *fft_x, cuFloatComplex *fft_y, cuFloatComplex *fft_z, const float *kappa, const cuFloatComplex *ddx, const cuFloatComplex *ddy, const cuFloatComplex *ddz)
float rho0_sgx_scalar
dt / rho0_sgx in homogeneous case
__global__ void CUDAComputeDensityNonlinearHomogeneous(float *rhox, float *rhoy, float *rhoz, const float *pml_x, const float *pml_y, const float *pml_z, const float *duxdx, const float *duydy, const float *duzdz)
void ComputePressurePartsNonLinear(TRealMatrix &rho_sum, TRealMatrix &BonA_sum, TRealMatrix &du_sum, const TRealMatrix &rhox, const TRealMatrix &rhoy, const TRealMatrix &rhoz, const TRealMatrix &duxdx, const TRealMatrix &duydy, const TRealMatrix &duzdz, const bool is_BonA_scalar, const float *BonA_matrix, const bool is_rho0_scalar, const float *rho0_matrix)
Calculate three temporary sums in the new pressure formula, non-linear absorbing case.
void AddPressureSource(TRealMatrix &rhox, TRealMatrix &rhoy, TRealMatrix &rhoz, const TRealMatrix &p_source_input, const TIndexMatrix &p_source_index, const size_t t_index)
Add in pressure source term.
dim3 GetSolverTransposeBlockSize() const
Get block size for the transposition kernels.
void ComputeVelocityScalarNonuniform(TRealMatrix &ux_sgx, TRealMatrix &uy_sgy, TRealMatrix &uz_sgz, const TRealMatrix &ifft_x, const TRealMatrix &ifft_y, const TRealMatrix &ifft_z, const TRealMatrix &dxudxn_sgx, const TRealMatrix &dyudyn_sgy, const TRealMatrix &dzudzn_sgz, const TRealMatrix &pml_x, const TRealMatrix &pml_y, const TRealMatrix &pml_z)
Compute acoustic velocity, scalar, non-uniform case.
__global__ void CUDAComputeVelocityShiftInY(cuFloatComplex *cufft_shift_temp, const cuFloatComplex *y_shift_neg_r)
void AddTransducerSource(TRealMatrix &ux_sgx, const TIndexMatrix &u_source_index, TIndexMatrix &delay_mask, const TRealMatrix &transducer_signal)
Add transducer data source to X component.
void SumPressureTermsNonlinear(TRealMatrix &p, const TRealMatrix &BonA_temp, const bool is_c2_scalar, const float *c2_matrix, const bool is_tau_eta_scalar, const float *absorb_tau, const float *tau_matrix, const float *absorb_eta, const float *eta_matrix)
Sum sub-terms to calculate new pressure, non-linear case.
#define checkCudaErrors(val)
Macro checking cuda errors and printing the file name and line. Inspired by CUDA common checking rout...
unsigned int nz
size of Z dimension.
__global__ void CUDASumPressureNonlinearLossless(float *p, const float *rhox, const float *rhoy, const float *rhoz, const float *c2_matrix, const float *BonA_matrix, const float *rho0_matrix)
float fftDividerX
normalization constant for 1D FFT over X.
__device__ dim3 GetReal3DCoords(const unsigned int i)
Get 3D coordinates for a real matrix form a 1D index.
void Compute_p0_AddInitialPressure(TRealMatrix &p, TRealMatrix &rhox, TRealMatrix &rhoy, TRealMatrix &rhoz, const TRealMatrix &p0, const bool Is_c2_scalar, const float *c2)
Add initial pressure to p0 (as p0 source).
float rho0_scalar
rho0 in homogeneous case
void ComputePressurelGradient(TCUFFTComplexMatrix &fft_x, TCUFFTComplexMatrix &fft_y, TCUFFTComplexMatrix &fft_z, const TRealMatrix &kappa, const TComplexMatrix &ddx, const TComplexMatrix &ddy, const TComplexMatrix &ddz)
Compute part of the new velocity - gradient of p.
The header file containing a class responsible for printing out info and error messages (stdout...
unsigned int p_source_many
p source many
void ComputeDensityNonlinearHeterogeneous(TRealMatrix &rhox, TRealMatrix &rhoy, TRealMatrix &rhoz, const TRealMatrix &pml_x, const TRealMatrix &pml_y, const TRealMatrix &pml_z, const TRealMatrix &duxdx, const TRealMatrix &duydy, const TRealMatrix &duzdz, const TRealMatrix &rho0)
Calculate acoustic density for non-linear case, heterogenous case.
__global__ void CUDAComputeDensityLinearHomogeneous(float *rhox, float *rhoy, float *rhoz, const float *pml_x, const float *pml_y, const float *pml_z, const float *duxdx, const float *duydy, const float *duzdz)
void ComputeVelocityGradient(TCUFFTComplexMatrix &fft_x, TCUFFTComplexMatrix &fft_y, TCUFFTComplexMatrix &fft_z, const TRealMatrix &kappa, const TComplexMatrix &ddx_k_shift_neg, const TComplexMatrix &ddy_k_shift_neg, const TComplexMatrix &ddz_k_shift_neg)
Compute gradient of acoustic velocity on uniform grid.
unsigned int u_source_mode
u source mode
const TCUDAParameters & GetCudaParameters() const
Get class with CUDA Parameters (runtime setup), cosnt version.
void ComputeDensityLinearHomogeneous(TRealMatrix &rhox, TRealMatrix &rhoy, TRealMatrix &rhoz, const TRealMatrix &pml_x, const TRealMatrix &pml_y, const TRealMatrix &pml_z, const TRealMatrix &duxdx, const TRealMatrix &duydy, const TRealMatrix &duzdz)
Calculate acoustic density for linear case, homogenous case.
unsigned int nElementsComplex
complex number of elements.
dim3 GetSolverTransposeBlockSize()
The header file with CUDA utility functions. These routines are to be inlined.
unsigned int p_source_index_size
size of the p_source mask
__global__ void CUDACompute_p0_AddInitialPressure(float *p, float *rhox, float *rhoy, float *rhoz, const float *p0, const float *c2=nullptr)
__global__ void CUDAAddVelocitySource(float *uxyz_sgxyz, const float *u_source_input, const size_t *u_source_index, const size_t t_index)
void ComputeVelocity(TRealMatrix &ux_sgx, TRealMatrix &uy_sgy, TRealMatrix &uz_sgz, const TRealMatrix &ifft_x, const TRealMatrix &ifft_y, const TRealMatrix &ifft_z, const TRealMatrix &dt_rho0_sgx, const TRealMatrix &dt_rho0_sgy, const TRealMatrix &dt_rho0_sgz, const TRealMatrix &pml_x, const TRealMatrix &pml_y, const TRealMatrix &pml_z)
Compute acoustic velocity for default case (heterogeneous).
__device__ unsigned int GetStride()
Get x-stride for 3D CUDA block (for processing multiple grid points by a single thread).
int GetSolverGridSize1D() const
Get number of block for 1D grid used by kSpaceSolver.
dim3 GetSolverTransposeGirdSize() const
Get grid size for the transposition kernels.
void ComputeDensityNonlinearHomogeneous(TRealMatrix &rhox, TRealMatrix &rhoy, TRealMatrix &rhoz, const TRealMatrix &pml_x, const TRealMatrix &pml_y, const TRealMatrix &pml_z, const TRealMatrix &duxdx, const TRealMatrix &duydy, const TRealMatrix &duzdz)
Calculate acoustic density for non-linear case, homogenous case.
virtual size_t GetElementCount() const
Get total element count of the matrix.
The class for real matrices.
void ComputePressurePartsLinear(TRealMatrix &sum_rhoxyz, TRealMatrix &sum_rho0_du, const TRealMatrix &rhox, const TRealMatrix &rhoy, const TRealMatrix &rhoz, const TRealMatrix &duxdx, const TRealMatrix &duydy, const TRealMatrix &duzdz, const bool is_rho0_scalar, const float *rho0_matrix)
Calculate two temporary sums in the new pressure formula, linear absorbing case.
__global__ void CUDASum_new_p_linear_lossless(float *p, const float *rhox, const float *rhoy, const float *rhoz, const float *c2_matrix)
void ComputeVelocityShiftInX(TCUFFTComplexMatrix &cufft_shift_temp, const TComplexMatrix &x_shift_neg_r)
Compute the velocity shift in Fourier space over the X axis.
void SumPressureTermsLinear(TRealMatrix &p, const TRealMatrix &absorb_tau_temp, const TRealMatrix &absorb_eta_temp, const TRealMatrix &sum_rhoxyz, const bool is_c2_scalar, const float *c2_matrix, const bool is_tau_eta_scalar, const float *tau_matrix, const float *eta_matrix)
Sum sub-terms to calculate new pressure, linear case.
__global__ void CUDAComputePressurePartsNonLinear(float *rho_sum, float *BonA_sum, float *du_sum, const float *rhox, const float *rhoy, const float *rhoz, const float *duxdx, const float *duydy, const float *duzdz, const float *BonA_matrix, const float *rho0_matrix)
float fftDividerY
normalization constant for 1D FFT over Y.
The class for 64b unsigned integers (indices). It is used for sensor_mask_index or sensor_corners_mas...
__global__ void CUDAComputeVelocityShiftInX(cuFloatComplex *cufft_shift_temp, const cuFloatComplex *x_shift_neg_r)
__global__ void CUDAAddPressureSource(float *rhox, float *rhoy, float *rhoz, const float *p_source_input, const size_t *p_source_index, const size_t t_index)
float rho0_sgy_scalar
dt / rho0_sgy in homogeneous case
void ComputeDensityLinearHeterogeneous(TRealMatrix &rhox, TRealMatrix &rhoy, TRealMatrix &rhoz, const TRealMatrix &pml_x, const TRealMatrix &pml_y, const TRealMatrix &pml_z, const TRealMatrix &duxdx, const TRealMatrix &duydy, const TRealMatrix &duzdz, const TRealMatrix &rho0)
Calculate acoustic density for linear case, heterogeneous case.
unsigned int p_source_mode
p source mode
unsigned int u_source_many
u source many
__constant__ TCUDADeviceConstants cudaDeviceConstants
This variable holds basic simulation constants for GPU.
void ComputeAbsorbtionTerm(TCUFFTComplexMatrix &fft1, TCUFFTComplexMatrix &fft2, const TRealMatrix &absorb_nabla1, const TRealMatrix &absorb_nabla2)
Compute absorbing term with abosrb_nabla1 and absorb_nabla2.
void Compute_p0_VelocityScalarNonUniform(TRealMatrix &ux_sgx, TRealMatrix &uy_sgy, TRealMatrix &uz_sgz, const TRealMatrix &dxudxn_sgx, const TRealMatrix &dyudyn_sgy, const TRealMatrix &dzudzn_sgz)
Compute acoustic velocity for initial pressure problem, if rho0_sgx is scalar, non uniform grid...
void Compute_p0_Velocity(TRealMatrix &ux_sgx, TRealMatrix &uy_sgy, TRealMatrix &uz_sgz, const TRealMatrix &dt_rho0_sgx, const TRealMatrix &dt_rho0_sgy, const TRealMatrix &dt_rho0_sgz)
Compute velocity for the initial pressure problem.
Name space for all CUDA kernels used in the 3D solver.
void TrasposeReal3DMatrixXY(float *outputMatrix, const float *inputMatrix, const dim3 &dimSizes)
Transpose a real 3D matrix in the X-Y direction.
int GetSolverBlockSize1D() const
Get number of threads for 1D block used by kSpaceSolver.
unsigned int nElements
total number of elements.
void ComputeVelocityScalarUniform(TRealMatrix &ux_sgx, TRealMatrix &uy_sgy, TRealMatrix &uz_sgz, const TRealMatrix &ifft_x, const TRealMatrix &ifft_y, const TRealMatrix &ifft_z, const TRealMatrix &pml_x, const TRealMatrix &pml_y, const TRealMatrix &pml_z)
Compute acoustic velocity, scalar and uniform case.
__global__ void CUDAComputeVelocityShiftInZ(cuFloatComplex *cufft_shift_temp, const cuFloatComplex *z_shift_neg_r)
unsigned int ny
size of Y dimension.
__device__ dim3 GetComplex3DCoords(const unsigned int i)
Get a 3D coordinates for a complex matrix form a 1D index.
int GetSolverGridSize1D()
virtual float * GetDeviceData()
Get raw GPU data out of the class (for direct GPU kernel access).
void ComputeVelocityShiftInZ(TCUFFTComplexMatrix &cufft_shift_temp, const TComplexMatrix &z_shift_neg_r)
Compute the velocity shift in Fourier space over the Z axis.
float rho0_sgz_scalar
dt / rho0_sgz in homogeneous case
float fftDividerZ
normalization constant for 1D FFT over Z.
__global__ void CUDAComputeDensityNonlinearHeterogeneous(float *rhox, float *rhoy, float *rhoz, const float *pml_x, const float *pml_y, const float *pml_z, const float *duxdx, const float *duydy, const float *duzdz, const float *rho0)
The class for complex matrices.
float dt_rho0_scalar
dt * rho0 in homogeneous case
float absorb_tau_scalar
Absorb_tau value for homogeneous case.
float absorb_eta_scalar
Absorb_eta value for homogeneous case.
void TrasposeReal3DMatrixXZ(float *outputMatrix, const float *inputMatrix, const dim3 &dimSizes)
Transpose a real 3D matrix in the X-Y direction.
Class implementing 3D Real-To-Complex and Complex-To-Real transforms using CUDA FFT interface...
void SumPressureNonlinearLossless(TRealMatrix &p, const TRealMatrix &rhox, const TRealMatrix &rhoy, const TRealMatrix &rhoz, const bool is_c2_scalar, const float *c2_matrix, const bool is_BonA_scalar, const float *BonA_matrix, const bool is_rho0_scalar, const float *rho0_matrix)
Sum sub-terms for new p, linear lossless case.
__global__ void CUDACompute_p0_Velocity(float *ux_sgx, float *uy_sgy, float *uz_sgz, const float *dt_rho0_sgx=nullptr, const float *dt_rho0_sgy=nullptr, const float *dt_rho0_sgz=nullptr)
int GetSolverBlockSize1D()
__global__ void CUDASumPressureTermsLinear(float *p, const float *absorb_tau_temp, const float *absorb_eta_temp, const float *sum_rhoxyz, const float *c2_matrix, const float *tau_matrix, const float *eta_matrix)
void ComputeVelocityShiftInY(TCUFFTComplexMatrix &cufft_shift_temp, const TComplexMatrix &y_shift_neg_r)
Compute the velocity shift in Fourier space over the Y axis.
__global__ void CUDAComputeVelocityScalarUniform(float *ux_sgx, float *uy_sgy, float *uz_sgz, const float *ifft_x, const float *ifft_y, const float *ifft_z, const float *pml_x, const float *pml_y, const float *pml_z)