summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- openmp/libomptarget/plugins/cuda/src/rtl.cpp 69
1 files changed, 37 insertions, 32 deletions
diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
index 872e7f02aed..d265a87b83d 100644
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -285,43 +285,48 @@ int32_t __tgt_rtl_init_device(int32_t device_id) {
return OFFLOAD_FAIL;
}
- // scan properties to determine number of threads/block and blocks/grid.
- CUdevprop Properties;
- err = cuDeviceGetProperties(&Properties, cuDevice);
+ // Query attributes to determine number of threads/block and blocks/grid.
+ int maxGridDimX;
+ err = cuDeviceGetAttribute(&maxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
+ cuDevice);
if (err != CUDA_SUCCESS) {
- DP("Error getting device Properties, use defaults\n");
+ DP("Error getting max grid dimension, use default\n");
DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
- DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
- DeviceInfo.WarpSize[device_id] = 32;
+ } else if (maxGridDimX <= RTLDeviceInfoTy::HardTeamLimit) {
+ DeviceInfo.BlocksPerGrid[device_id] = maxGridDimX;
+ DP("Using %d CUDA blocks per grid\n", maxGridDimX);
} else {
- // Get blocks per grid
- if (Properties.maxGridSize[0] <= RTLDeviceInfoTy::HardTeamLimit) {
- DeviceInfo.BlocksPerGrid[device_id] = Properties.maxGridSize[0];
- DP("Using %d CUDA blocks per grid\n", Properties.maxGridSize[0]);
- } else {
- DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
- DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
- "at the hard limit\n", Properties.maxGridSize[0],
- RTLDeviceInfoTy::HardTeamLimit);
- }
+ DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
+ DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
+ "at the hard limit\n",
+ maxGridDimX, RTLDeviceInfoTy::HardTeamLimit);
+ }
- // Get threads per block, exploit threads only along x axis
- if (Properties.maxThreadsDim[0] <= RTLDeviceInfoTy::HardThreadLimit) {
- DeviceInfo.ThreadsPerBlock[device_id] = Properties.maxThreadsDim[0];
- DP("Using %d CUDA threads per block\n", Properties.maxThreadsDim[0]);
- if (Properties.maxThreadsDim[0] < Properties.maxThreadsPerBlock) {
- DP("(fewer than max per block along all xyz dims %d)\n",
- Properties.maxThreadsPerBlock);
- }
- } else {
- DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
- DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
- "capping at the hard limit\n", Properties.maxThreadsDim[0],
- RTLDeviceInfoTy::HardThreadLimit);
- }
+ // We are only exploiting threads along the x axis.
+ int maxBlockDimX;
+ err = cuDeviceGetAttribute(&maxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
+ cuDevice);
+ if (err != CUDA_SUCCESS) {
+ DP("Error getting max block dimension, use default\n");
+ DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
+ } else if (maxBlockDimX <= RTLDeviceInfoTy::HardThreadLimit) {
+ DeviceInfo.ThreadsPerBlock[device_id] = maxBlockDimX;
+ DP("Using %d CUDA threads per block\n", maxBlockDimX);
+ } else {
+ DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
+ DP("Max CUDA threads per block %d exceeds the hard thread limit %d, capping "
+ "at the hard limit\n",
+ maxBlockDimX, RTLDeviceInfoTy::HardThreadLimit);
+ }
- // According to the documentation, SIMDWidth is "Warp size in threads".
- DeviceInfo.WarpSize[device_id] = Properties.SIMDWidth;
+ int warpSize;
+ err =
+ cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuDevice);
+ if (err != CUDA_SUCCESS) {
+ DP("Error getting warp size, assume default\n");
+ DeviceInfo.WarpSize[device_id] = 32;
+ } else {
+ DeviceInfo.WarpSize[device_id] = warpSize;
}
// Adjust teams to the env variables
OpenPOWER on IntegriCloud