Index: libomptarget/plugins/cuda/src/rtl.cpp =================================================================== --- libomptarget/plugins/cuda/src/rtl.cpp +++ libomptarget/plugins/cuda/src/rtl.cpp @@ -99,7 +99,7 @@ static const int HardTeamLimit = 1<<16; // 64k static const int HardThreadLimit = 1024; static const int DefaultNumTeams = 128; - static const int DefaultNumThreads = 1024; + static const int DefaultNumThreads = 128; // Record entry point associated with device void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) { @@ -583,6 +583,10 @@ DP("Setting CUDA threads per block to requested %d\n", thread_limit); } else { cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id]; + if (KernelInfo->ExecutionMode == GENERIC) { + // Leave room for the master warp which will be added below. + cudaThreadsPerBlock -= DeviceInfo.WarpSize[device_id]; + } DP("Setting CUDA threads per block to default %d\n", DeviceInfo.NumThreads[device_id]); } @@ -612,8 +616,12 @@ int cudaBlocksPerGrid; if (team_num <= 0) { if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) { - // round up to the nearest integer - cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1; + if (KernelInfo->ExecutionMode == SPMD) { + // round up to the nearest integer + cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1; + } else { + cudaBlocksPerGrid = loop_tripcount; + } DP("Using %d teams due to loop trip count %" PRIu64 " and number of " "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount, cudaThreadsPerBlock);