kspaceFirstOrder3D-CUDA  1.1
The CUDA/C++ implementation of the k-wave toolbox for the time-domain simulation of acoustic wave fields in 3D
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
CUDAParameters.cpp
Go to the documentation of this file.
1 /**
2  * @file CUDAParameters.cpp
3  *
4  * @author Jiri Jaros \n
5  * Faculty of Information Technology \n
6  * Brno University of Technology \n
7  * jarosjir@fit.vutbr.cz
8  *
9  * @brief The implementation file for the class for setting CUDA kernel parameters.
10  *
11  * @version kspaceFirstOrder3D 3.4
12  *
13  * @date 12 November 2015, 16:49 (created) \n
14  * 10 August 2016, 12:21 (revised)
15  *
16  * @section License
17  * This file is part of the C++ extension of the k-Wave Toolbox
18  * (http://www.k-wave.org).\n Copyright (C) 2016 Jiri Jaros and Bradley Treeby.
19  *
20  * This file is part of the k-Wave. k-Wave is free software: you can redistribute it and/or modify
21  * it under the terms of the GNU Lesser General Public License as published by the Free Software
22  * Foundation, either version 3 of the License, or (at your option) any later version.
23  *
24  * k-Wave is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
25  * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
26  * General Public License for more details.
27  *
28  * You should have received a copy of the GNU Lesser General Public License along with k-Wave.
29  * If not, see http://www.gnu.org/licenses/.
30  */
31 
32 #include <stdexcept>
33 #include <cuda_runtime.h>
34 
37 #include <Parameters/Parameters.h>
38 
39 #include <Logger/Logger.h>
40 
42 
43 
44 //------------------------------------------------------------------------------------------------//
45 //------------------------------------------ CONSTANTS -------------------------------------------//
46 //------------------------------------------------------------------------------------------------//
47 
48 /**
49  * Constructor.
50  */
52  deviceIdx(DEFAULT_DEVICE_IDX),
53  solverBlockSize1D(UNDEFINDED_SIZE), solverGridSize1D(UNDEFINDED_SIZE),
54  solverTransposeBlockSize(UNDEFINDED_SIZE), solverTransposeGirdSize(UNDEFINDED_SIZE),
55  samplerBlockSize1D(UNDEFINDED_SIZE), samplerGridSize1D(UNDEFINDED_SIZE),
56  deviceProperties()
57 {
58 }// end of default constructor
59 //--------------------------------------------------------------------------------------------------
60 
61 
62 
63 
64 //------------------------------------------------------------------------------------------------//
65 //--------------------------------------- Public methods -----------------------------------------//
66 //------------------------------------------------------------------------------------------------//
67 
68 /**
69  * Return the name of device used.
70  *
71  * @return device name of the selected GPU
72  */
73 std::string TCUDAParameters::GetDeviceName() const
74 {
75  // the device name is empty when its very first character is the string terminator
76  const bool isNameEmpty = (deviceProperties.name[0] == '\0');
77 
78  // report a placeholder instead of an empty name
79  return (isNameEmpty) ? std::string("N/A") : std::string(deviceProperties.name);
80 }// end of GetDeviceName
81 //--------------------------------------------------------------------------------------------------
82 
83 /**
84  * Select CUDA device for execution. If no device is specified, the first free one is chosen. The
85  * routine also checks whether the CUDA runtime and driver versions match and whether the GPU is
86  * supported by the code. If no free device is present, a runtime_error is thrown.
87  *
88  * @param [in] deviceIdx - Device index (default DEFAULT_DEVICE_IDX)
89  */
90 void TCUDAParameters::SelectDevice(const int deviceIdx)
91 {
92  // check CUDA driver version and if not sufficient, terminate
94 
95  this->deviceIdx = deviceIdx;
96 
97  // read the number of CUDA devices available
98  int nDevices;
99  checkCudaErrors(cudaGetDeviceCount(&nDevices));
100  cudaGetLastError();
101 
102  cudaError_t lastError;
103  // if the user has not specified a GPU, use the first one that passes the checks below
104  if (deviceIdx == DEFAULT_DEVICE_IDX)
105  {
106  bool deviceFound = false;
107 
108  for (int testDevice = 0; testDevice < nDevices; testDevice++)
109  {
110  // try to set the GPU and reset it
111  cudaSetDevice(testDevice);
112  cudaDeviceReset();
113  lastError = cudaGetLastError();
114 
115  // Reset was done properly, test CUDA code version
116  if (lastError == cudaSuccess)
117  {
118  // Read the GPU SM version and the kernel version
119  bool cudaCodeVersionOK = CheckCUDACodeVersion();
120  lastError = cudaGetLastError();
121 
122  if (cudaCodeVersionOK && (lastError == cudaSuccess))
123  {
124  // acquire the GPU
125  this->deviceIdx = testDevice;
126  deviceFound = true;
127  break;
128  }
129  }
130  // GPU was busy or did not pass the code version check, reset and continue
131  lastError = cudaDeviceReset();
132 
133  //clear last error
134  cudaGetLastError();
135  }
136 
137  if (!deviceFound)
138  {
139  throw std::runtime_error(ERR_FMT_NO_FREE_DEVICE);
140  }
141  }
142  else // select a device the user wants
143  {
144  // check if the specified device is acceptable -
145  // not busy, input parameter not out of bounds
146  if ((this->deviceIdx > nDevices - 1) || (this->deviceIdx < 0))
147  {
148  throw std::runtime_error(TLogger::FormatMessage(ERR_FMT_BAD_DEVICE_IDX,
149  this->deviceIdx,
150  nDevices-1));
151  }
152 
153  // set the device and reset it
154  cudaSetDevice(this->deviceIdx);
155  cudaDeviceReset();
156  lastError = cudaGetLastError();
157 
158  bool cudaCodeVersionOK = CheckCUDACodeVersion();
159  lastError = cudaGetLastError();
160 
161  if ((lastError != cudaSuccess) || (!cudaCodeVersionOK))
162  {
163  lastError = cudaDeviceReset();
164 
165  throw std::runtime_error(TLogger::FormatMessage(ERR_FMT_DEVICE_IS_BUSY, this->deviceIdx));
166  }
167  }
168 
169  // Read the device that was allocated
170  checkCudaErrors(cudaGetDevice(&this->deviceIdx));
171  checkCudaErrors(cudaGetLastError());
172 
173  // Reset the device to be able to set the flags
174  checkCudaErrors(cudaDeviceReset());
175  checkCudaErrors(cudaGetLastError());
176 
177  // Enable mapped memory
178  checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost));
179 
180  // Get the device properties (they include the device name)
181  checkCudaErrors(cudaGetDeviceProperties(&deviceProperties, this->deviceIdx));
182 
183  // Check the GPU version
184  if (!CheckCUDACodeVersion())
185  {
186  throw std::runtime_error(TLogger::FormatMessage(ERR_FMT_GPU_NOT_SUPPORTED, this->deviceIdx));
187  }
188 }// end of SelectDevice
189 //--------------------------------------------------------------------------------------------------
190 
191 
192 /**
193  * Set kernel configuration.
194  * Based on the dimension sizes, sensor masks, and the GPU architecture, adequate CUDA kernel
195  * configurations are selected.
196  */
198 {
199  const TParameters& params = TParameters::GetInstance();
200 
201  TDimensionSizes fullDims(params.GetFullDimensionSizes());
202 
203  // Set kernel configuration for 1D kernels
204  // The goal here is to have blocks of size 256 threads and at least 8 x times
205  // more blocks than SM processors - This gives us full potential on all
206  // Fermi, Kepler, Maxwell still not compromising the maximum number of blocks
207  // and threads.
208 
209  solverBlockSize1D = 256;
210  // Grid size is calculated based on the number of SM processors
211  solverGridSize1D = deviceProperties.multiProcessorCount * 8;
212 
213  // the domain is too small for the default grid size, use 1 grid point per thread
214  if ((size_t(solverGridSize1D) * size_t(solverBlockSize1D)) > fullDims.GetElementCount())
215  {
216  solverGridSize1D = int((fullDims.GetElementCount() + size_t(solverBlockSize1D) - 1 ) / size_t(solverBlockSize1D));
217  }
218 
219  // Transposition works by processing for tiles of 32x32 by 4 warps. Every block
220  // is responsible for one 2D slab.
221  // Block size for the transposition kernels (only 128 threads)
222  solverTransposeBlockSize = dim3(32, 4 , 1);
223  // Grid size for the transposition kernels
224  solverTransposeGirdSize = dim3(deviceProperties.multiProcessorCount * 16, 1, 1);
225 
226 
227  // Set configuration for Streaming kernels. We always use 1D kernels of 256 threads
228  // and create as many blocks as necessary to fully utilise the GPU.
229  // The size of the grid is only tuned for linear sensor mask,
230  // since in this execution phase, we don't
231  // know how many elements there are in the cuboid sensor mask
232  samplerBlockSize1D = 256;
233 
234  samplerGridSize1D = deviceProperties.multiProcessorCount * 8;
235 
236  // tune number of blocks for index based sensor mask
237  if (params.Get_sensor_mask_type() == TParameters::TSensorMaskType::INDEX)
238  {
239  // if the sensor mask is smaller than 2048 * SMs, then use a smaller number of blocks
240  if ((size_t(samplerGridSize1D) * size_t(samplerBlockSize1D)) > params.Get_sensor_mask_index_size())
241  {
242  samplerGridSize1D = int((params.Get_sensor_mask_index_size() + size_t(samplerBlockSize1D) - 1 )
243  / size_t(samplerBlockSize1D));
244  }
245  }
246 
247 }// end of SetKernelConfiguration
248 //--------------------------------------------------------------------------------------------------
249 
250 
251 /**
252  * Upload useful simulation constants into device constant memory.
253  */
255 {
256  TCUDADeviceConstants constantsToTransfer;
257 
259  TDimensionSizes fullDimSizes = params.GetFullDimensionSizes();
260  TDimensionSizes reducedDimSizes = params.GetReducedDimensionSizes();
261 
262  // Set values for constant memory
263  constantsToTransfer.nx = fullDimSizes.nx;
264  constantsToTransfer.ny = fullDimSizes.ny;
265  constantsToTransfer.nz = fullDimSizes.nz;
266  constantsToTransfer.nElements = fullDimSizes.GetElementCount();
267  constantsToTransfer.slabSize = fullDimSizes.nx * fullDimSizes.ny;
268 
269  constantsToTransfer.nxComplex = reducedDimSizes.nx;
270  constantsToTransfer.nyComplex = reducedDimSizes.ny;
271  constantsToTransfer.nzComplex = reducedDimSizes.nz;
272  constantsToTransfer.nElementsComplex = reducedDimSizes.GetElementCount();
273  constantsToTransfer.slabSizeComplex = reducedDimSizes.nx * reducedDimSizes.ny;
274 
275  constantsToTransfer.fftDivider = 1.0f / fullDimSizes.GetElementCount();
276  constantsToTransfer.fftDividerX = 1.0f / fullDimSizes.nx;
277  constantsToTransfer.fftDividerY = 1.0f / fullDimSizes.ny;
278  constantsToTransfer.fftDividerZ = 1.0f / fullDimSizes.nz;
279 
280  constantsToTransfer.dt = params.Get_dt();
281  constantsToTransfer.dt2 = params.Get_dt() * 2.0f;
282  constantsToTransfer.c2 = params.Get_c0_scalar();
283 
284  constantsToTransfer.rho0_scalar = params.Get_rho0_scalar();
285  constantsToTransfer.dt_rho0_scalar = params.Get_rho0_scalar() * params.Get_dt();
286  constantsToTransfer.rho0_sgx_scalar = params.Get_rho0_sgx_scalar();
287  constantsToTransfer.rho0_sgy_scalar = params.Get_rho0_sgy_scalar(),
288  constantsToTransfer.rho0_sgz_scalar = params.Get_rho0_sgz_scalar(),
289 
290  constantsToTransfer.BonA_scalar = params.Get_BonA_scalar();
291  constantsToTransfer.absorb_tau_scalar = params.Get_absorb_tau_scalar();
292  constantsToTransfer.absorb_eta_scalar = params.Get_absorb_eta_scalar();
293 
294 
295  // source masks
296  constantsToTransfer.p_source_index_size = params.Get_p_source_index_size();
297  constantsToTransfer.p_source_mode = params.Get_p_source_mode();
298  constantsToTransfer.p_source_many = params.Get_p_source_many();
299 
300  constantsToTransfer.u_source_index_size = params.Get_u_source_index_size();
301  constantsToTransfer.u_source_mode = params.Get_u_source_mode();
302  constantsToTransfer.u_source_many = params.Get_u_source_many();
303 
304  constantsToTransfer.SetUpCUDADeviceConstatns();
305 }// end of SetUpDeviceConstants
306 //--------------------------------------------------------------------------------------------------
307 
308 
309 //------------------------------------------------------------------------------------------------//
310 //-------------------------------------- Protected methods ---------------------------------------//
311 //------------------------------------------------------------------------------------------------//
312 
313 
314 
315 /**
316  * Check whether the CUDA driver version installed is sufficient for the code.
317  * If anything goes wrong, throw an exception and exit.
318  *
319  * @throw runtime_error when the CUDA driver is too old.
320  */
322 {
323  int cudaRuntimeVersion;
324  int cudaDriverVersion;
325 
326  if (cudaRuntimeGetVersion(&cudaRuntimeVersion) != cudaSuccess)
327  {
328  throw std::runtime_error(ERR_FM_CANNOT_READ_CUDA_VERSION);
329  }
330 
331  if (cudaDriverGetVersion(&cudaDriverVersion) != cudaSuccess)
332  {
333  throw std::runtime_error(ERR_FM_CANNOT_READ_CUDA_VERSION);
334  }
335 
336  if (cudaDriverVersion < cudaRuntimeVersion)
337  {
339  cudaRuntimeVersion / 1000, (cudaRuntimeVersion % 100) / 10,
340  cudaDriverVersion / 1000, (cudaDriverVersion % 100) / 10));
341  }
342 }// end of CheckCUDAVersion
343 //--------------------------------------------------------------------------------------------------
344 
345 /**
346  * Check whether the GPU has SM 2.0 at least.
347  *
348  * @return true if the CUDA code version is at least 20 (SM 2.0), false otherwise
349  */
351 {
352  return (SolverCUDAKernels::GetCUDACodeVersion() >= 20);
353 }// end of CheckCUDACodeVersion
354 //--------------------------------------------------------------------------------------------------
size_t nx
number of elements in the x direction
TDimensionSizes GetReducedDimensionSizes() const
Reduced dimension sizes of the simulation (complex classes).
Definition: Parameters.h:97
float & Get_c0_scalar()
Get c0_scalar value.
Definition: Parameters.h:187
int samplerGridSize1D
Number of blocks for the 1D data sampling kernels.
unsigned int u_source_index_size
size of the u source index
size_t Get_sensor_mask_index_size() const
Get sensor_mask_index_size value.
Definition: Parameters.h:168
dim3 solverTransposeBlockSize
Block size for the transposition kernels.
size_t Get_u_source_index_size() const
Get u_source_index_size value.
Definition: Parameters.h:173
int deviceIdx
Index of the device the code is being run on.
dim3 solverTransposeGirdSize
Grid size for the transposition kernels.
TErrorMessage ERR_FMT_NO_FREE_DEVICE
CUDATuner error message.
unsigned int nx
size of X dimension.
float fftDivider
normalization constant for 3D FFT.
int GetCUDACodeVersion()
Get the CUDA architecture and GPU code version the code was compiled with.
float BonA_scalar
BonA value for homogeneous case.
static const int DEFAULT_DEVICE_IDX
Default Device Index - no default GPU.
The header file for the class for storing constants residing in CUDA constant memory.
Structure for CUDA parameters to be placed in constant memory. Only 32b values are used...
unsigned int slabSize
2D Slab size
float & Get_BonA_scalar()
Get BonA_scalar value.
Definition: Parameters.h:197
unsigned int nxComplex
size of complex X dimension.
The header file for the class for setting CUDA kernel parameters.
static TParameters & GetInstance()
Get instance of the singleton class.
Definition: Parameters.cpp:70
int samplerBlockSize1D
Number of threads for the 1D data sampling kernels.
void SetUpDeviceConstants() const
Upload useful simulation constants into device constant memory.
float rho0_sgx_scalar
dt / rho0_sgx in homogeneous case
int solverBlockSize1D
Number of threads for 1D block used by kSpaceSolver.
size_t Get_u_source_many() const
Get u_source_many value.
Definition: Parameters.h:143
The header file containing the parameters of the simulation.
TErrorMessage ERR_FMT_BAD_DEVICE_IDX
CUDATuner error message.
#define checkCudaErrors(val)
Macro checking cuda errors and printing the file name and line. Inspired by CUDA common checking rout...
Definition: Logger.h:209
unsigned int nzComplex
size of complex Z dimension.
unsigned int nz
size of Z dimension.
std::string GetDeviceName() const
Get the name of the device used.
void SelectDevice(const int DeviceIdx=DEFAULT_DEVICE_IDX)
Select cuda device for execution.
float fftDividerX
normalization constant for 1D FFT over X.
float rho0_scalar
rho0 in homogeneous case
TDimensionSizes GetFullDimensionSizes() const
Full dimension sizes of the simulation (real classes).
Definition: Parameters.h:95
The header file containing a class responsible for printing out info and error messages (stdout...
unsigned int p_source_many
p source many
unsigned int u_source_mode
u source mode
TCUDAParameters()
Default constructor - only friend class can create an instance.
TErrorMessage ERR_FMT_DEVICE_IS_BUSY
CUDATuner error message.
Class storing all parameters of the simulation.
Definition: Parameters.h:49
size_t Get_p_source_mode() const
Get p_source_mode value.
Definition: Parameters.h:154
unsigned int nElementsComplex
complex number of elements.
size_t Get_p_source_many() const
Get p_source_many value.
Definition: Parameters.h:152
unsigned int p_source_index_size
size of the p_source mask
size_t GetElementCount() const
Get element count, in 3D only spatial domain, in 4D with time.
size_t ny
number of elements in the y direction
float Get_dt() const
Get dt value.
Definition: Parameters.h:109
float & Get_absorb_eta_scalar()
Get absorb_eta_scalar value.
Definition: Parameters.h:190
TErrorMessage ERR_FMT_INSUFFICIENT_CUDA_DRIVER
CUDAParameters error message.
float & Get_rho0_sgy_scalar()
Get rho0_sgy_scalar value.
Definition: Parameters.h:206
float & Get_rho0_scalar()
Get rho0_scalar value.
Definition: Parameters.h:202
unsigned int slabSizeComplex
complex slab size.
float fftDividerY
normalization constant for 1D FFT over Y.
float rho0_sgy_scalar
dt / rho0_sgy in homogeneous case
unsigned int p_source_mode
p source mode
unsigned int u_source_many
u source many
unsigned int nyComplex
size of complex Y dimension.
Name space for all CUDA kernels used in the 3D solver.
static std::string FormatMessage(const std::string &format, Args...args)
C++-11 replacement for sprintf that works with std::string instead of char *.
Definition: Logger.h:126
unsigned int nElements
total number of elements.
cudaDeviceProp deviceProperties
Device properties of the selected GPU.
size_t Get_u_source_mode() const
Get u_source_mode value.
Definition: Parameters.h:145
unsigned int ny
size of Y dimension.
int solverGridSize1D
Number of blocks for 1D grid used by kSpaceSolver.
size_t Get_p_source_index_size() const
Get p_source_index_size value.
Definition: Parameters.h:175
float & Get_absorb_tau_scalar()
Get absorb_tau_scalar value.
Definition: Parameters.h:192
float rho0_sgz_scalar
dt / rho0_sgz in homogeneous case
float fftDividerZ
normalization constant for 1D FFT over Z.
void CheckCUDAVersion()
Check whether the CUDA driver version installed is sufficient for the code.
void SetKernelConfiguration()
Set kernel configurations based on the simulation parameters.
size_t nz
number of elements in the z direction
float dt_rho0_scalar
dt * rho0 in homogeneous case
float absorb_tau_scalar
Absorb_tau value for homogeneous case.
float absorb_eta_scalar
Absorb_eta value for homogeneous case.
bool CheckCUDACodeVersion()
Check whether the code was compiled for a given SM model.
float & Get_rho0_sgx_scalar()
Get rho0_sgx_scalar value.
Definition: Parameters.h:204
TErrorMessage ERR_FM_CANNOT_READ_CUDA_VERSION
CUDAParameters error message.
TSensorMaskType Get_sensor_mask_type() const
Get sensor mask type (linear or corners).
Definition: Parameters.h:166
Structure with 4D dimension sizes (3 in space and 1 in time).
TErrorMessage ERR_FMT_GPU_NOT_SUPPORTED
CUDAParameters error message.
__host__ void SetUpCUDADeviceConstatns()
Set constant memory.
float & Get_rho0_sgz_scalar()
Get rho0_sgz_scalar value.
Definition: Parameters.h:208