Index: openmp/trunk/libomptarget/plugins/cuda/src/rtl.cpp
===================================================================
--- openmp/trunk/libomptarget/plugins/cuda/src/rtl.cpp
+++ openmp/trunk/libomptarget/plugins/cuda/src/rtl.cpp
@@ -51,8 +51,9 @@
 };
 
 enum ExecutionModeType {
-  SPMD,
-  GENERIC,
+  SPMD,    // constructors, destructors,
+           // combined constructs (`teams distribute parallel for [simd]`)
+  GENERIC, // everything else
   NONE
 };
 
@@ -99,7 +100,7 @@
   static const int HardTeamLimit = 1<<16; // 64k
   static const int HardThreadLimit = 1024;
   static const int DefaultNumTeams = 128;
-  static const int DefaultNumThreads = 1024;
+  static const int DefaultNumThreads = 128;
 
   // Record entry point associated with device
   void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
@@ -581,18 +582,17 @@
   if (thread_limit > 0) {
     cudaThreadsPerBlock = thread_limit;
     DP("Setting CUDA threads per block to requested %d\n", thread_limit);
+    // Add master warp if necessary
+    if (KernelInfo->ExecutionMode == GENERIC) {
+      cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
+      DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
+    }
   } else {
     cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
     DP("Setting CUDA threads per block to default %d\n",
        DeviceInfo.NumThreads[device_id]);
   }
 
-  // Add master warp if necessary
-  if (KernelInfo->ExecutionMode == GENERIC) {
-    cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
-    DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
-  }
-
   if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
     cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
     DP("Threads per block capped at device limit %d\n",
@@ -612,8 +612,27 @@
   int cudaBlocksPerGrid;
   if (team_num <= 0) {
     if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
-      // round up to the nearest integer
-      cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
+      if (KernelInfo->ExecutionMode == SPMD) {
+        // We have a combined construct, i.e. `target teams distribute
+        // parallel for [simd]`. We launch enough teams so that each thread
+        // executes one iteration of the loop.
+        // round up to the nearest integer
+        cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
+      } else {
+        // If we reach this point, then we have a non-combined construct, i.e.
+        // `teams distribute` with a nested `parallel for`, and each team is
+        // assigned one iteration of the `distribute` loop. E.g.:
+        //
+        // #pragma omp target teams distribute
+        // for(...loop_tripcount...) {
+        //   #pragma omp parallel for
+        //   for(...) {}
+        // }
+        //
+        // Threads within a team will execute the iterations of the
+        // `parallel` loop.
+        cudaBlocksPerGrid = loop_tripcount;
+      }
       DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
         "threads per block %d\n",
         cudaBlocksPerGrid, loop_tripcount, cudaThreadsPerBlock);
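
For context, a minimal OpenMP example of the two source patterns the execution modes correspond to (illustrative, not part of the patch; the file name, array size, chunk size, and compile line are all assumptions):

  // example.c -- hypothetical test case; compile with e.g.:
  //   clang -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda example.c
  #include <stdio.h>

  #define N 4096

  int main(void) {
    double a[N];

    // Combined construct -> SPMD mode: teams are sized so that each
    // CUDA thread executes one loop iteration; no master warp is added.
    #pragma omp target teams distribute parallel for map(from: a[0:N])
    for (int i = 0; i < N; ++i)
      a[i] = 2.0 * i;

    // Non-combined construct -> GENERIC mode: one team per iteration of
    // the `distribute` loop; a master warp per team drives the nested
    // `parallel` region, which is why WarpSize is added on top of the
    // requested thread_limit.
    #pragma omp target teams distribute map(tofrom: a[0:N])
    for (int i = 0; i < N; i += 64) {
      #pragma omp parallel for
      for (int j = i; j < i + 64; ++j)
        a[j] += 1.0;
    }

    printf("a[0] = %f, a[N-1] = %f\n", a[0], a[N - 1]);
    return 0;
  }

With the patch, the first loop gets ceil(4096 / threads-per-block) teams, while the second gets one team per outer iteration (64 teams here).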
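
And a standalone sketch of the resulting launch-geometry arithmetic (a hypothetical simplification, not the plugin code: the helper names, the example thread_limit and trip-count values, and the elision of the hard device-limit clamping are all assumptions):

  #include <cstdint>
  #include <cstdio>

  enum ExecutionModeType { SPMD, GENERIC };

  static int threadsPerBlock(ExecutionModeType Mode, int ThreadLimit,
                             int DefaultNumThreads, int WarpSize) {
    if (ThreadLimit > 0) {
      int Threads = ThreadLimit;
      // After this patch, the master warp is added only when an explicit
      // thread_limit was requested, on top of the requested worker threads.
      if (Mode == GENERIC)
        Threads += WarpSize;
      return Threads;
    }
    return DefaultNumThreads; // now 128 instead of 1024
  }

  static int blocksPerGrid(ExecutionModeType Mode, uint64_t LoopTripcount,
                           int ThreadsPerBlock) {
    if (Mode == SPMD)
      // Combined construct: one loop iteration per thread, rounded up.
      return static_cast<int>((LoopTripcount - 1) / ThreadsPerBlock) + 1;
    // Non-combined construct: one team per `distribute` iteration.
    return static_cast<int>(LoopTripcount);
  }

  int main() {
    const int WarpSize = 32, DefaultNumThreads = 128;
    int SpmdThreads = threadsPerBlock(SPMD, 0, DefaultNumThreads, WarpSize);
    int GenThreads = threadsPerBlock(GENERIC, 96, DefaultNumThreads, WarpSize);
    printf("SPMD:    %d threads/block, %d teams\n", SpmdThreads,
           blocksPerGrid(SPMD, 4096, SpmdThreads));   // 128, 32
    printf("GENERIC: %d threads/block, %d teams\n", GenThreads,
           blocksPerGrid(GENERIC, 64, GenThreads));   // 96 + 32, 64
  }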