Index: libomptarget/plugins/cuda/src/rtl.cpp =================================================================== --- libomptarget/plugins/cuda/src/rtl.cpp +++ libomptarget/plugins/cuda/src/rtl.cpp @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -280,9 +279,9 @@ } // scan properties to determine number of threads/block and blocks/grid. - struct cudaDeviceProp Properties; - cudaError_t error = cudaGetDeviceProperties(&Properties, device_id); - if (error != cudaSuccess) { + CUdevprop Properties; + err = cuDeviceGetProperties(&Properties, cuDevice); + if (err != CUDA_SUCCESS) { DP("Error getting device Properties, use defaults\n"); DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams; DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads; @@ -314,8 +313,8 @@ RTLDeviceInfoTy::HardThreadLimit); } - // Get warp size - DeviceInfo.WarpSize[device_id] = Properties.warpSize; + // According to the documentation, SIMDWidth is "Warp size in threads". + DeviceInfo.WarpSize[device_id] = Properties.SIMDWidth; } // Adjust teams to the env variables @@ -685,10 +684,12 @@ DP("Launch of entry point at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr)); - cudaError_t sync_error = cudaDeviceSynchronize(); - if (sync_error != cudaSuccess) { - DP("Kernel execution error at " DPxMOD ", %s.\n", DPxPTR(tgt_entry_ptr), - cudaGetErrorString(sync_error)); + CUresult sync_err = cuCtxSynchronize(); + if (sync_err != CUDA_SUCCESS) { + DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr)); + CUDA_ERR_STRING(sync_err); + assert(sync_err == CUDA_SUCCESS && + "Unable to synchronize target execution!"); return OFFLOAD_FAIL; } else { DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr));