Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/libcall.cu
@@ -61,8 +61,8 @@
   omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
   int rc = 1; // default is 1 thread avail
   if (!currTaskDescr->InParallelRegion()) {
-    // not currently in a parallel region... all are available
-    rc = GetNumberOfProcsInTeam();
+    // Not currently in a parallel region, return what was set.
+    rc = currTaskDescr->NThreads();
     ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads");
   }
   PRINT(LD_IO, "call omp_get_max_threads() return %d\n", rc);
Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -193,25 +193,38 @@
 // support for parallel that goes parallel (1 static level only)
 ////////////////////////////////////////////////////////////////////////////////
 
-// return number of cuda threads that participate to parallel
-// calculation has to consider simd implementation in nvptx
-// i.e. (num omp threads * num lanes)
-//
-// cudathreads =
-//    if(num_threads != 0) {
-//      if(thread_limit > 0) {
-//        min (num_threads*numLanes ; thread_limit*numLanes);
-//      } else {
-//        min (num_threads*numLanes; blockDim.x)
-//      }
-//    } else {
-//      if (thread_limit != 0) {
-//        min (thread_limit*numLanes; blockDim.x)
-//      } else { // no thread_limit, no num_threads, use all cuda threads
-//        blockDim.x;
-//      }
-//    }
-//
+static INLINE uint16_t determineNumberOfThreads(uint16_t NumThreadsClause,
+                                                uint16_t NThreadsICV,
+                                                uint16_t ThreadLimit) {
+  uint16_t ThreadsRequested = NThreadsICV;
+  if (NumThreadsClause != 0) {
+    ThreadsRequested = NumThreadsClause;
+  }
+
+  uint16_t ThreadsAvailable = GetNumberOfWorkersInTeam();
+  if (ThreadLimit != 0 && ThreadLimit < ThreadsAvailable) {
+    ThreadsAvailable = ThreadLimit;
+  }
+
+  uint16_t NumThreads = ThreadsAvailable;
+  if (ThreadsRequested != 0 && ThreadsRequested < NumThreads) {
+    NumThreads = ThreadsRequested;
+  }
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  // On Volta and newer architectures we require that all lanes in
+  // a warp participate in the parallel region. Round down to a
+  // multiple of WARPSIZE since it is legal to do so in OpenMP.
+  if (NumThreads < WARPSIZE) {
+    NumThreads = 1;
+  } else {
+    NumThreads = (NumThreads & ~((uint16_t)WARPSIZE - 1));
+  }
+#endif
+
+  return NumThreads;
+}
+
 // This routine is always called by the team master..
 EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
                                            int16_t IsOMPRuntimeInitialized) {
@@ -234,78 +247,26 @@
     return;
   }
 
-  uint16_t CudaThreadsForParallel = 0;
-  uint16_t NumThreadsClause =
+  uint16_t &NumThreadsClause =
       omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
 
-  // we cannot have more than block size
-  uint16_t CudaThreadsAvail = GetNumberOfWorkersInTeam();
+  uint16_t NumThreads =
+      determineNumberOfThreads(NumThreadsClause, currTaskDescr->NThreads(),
+                               currTaskDescr->ThreadLimit());
 
-  // currTaskDescr->ThreadLimit(): If non-zero, this is the limit as
-  // specified by the thread_limit clause on the target directive.
-  // GetNumberOfWorkersInTeam(): This is the number of workers available
-  // in this kernel instance.
-  //
-  // E.g: If thread_limit is 33, the kernel is launched with 33+32=65
-  // threads. The last warp is the master warp so in this case
-  // GetNumberOfWorkersInTeam() returns 64.
-
-  // this is different from ThreadAvail of OpenMP because we may be
-  // using some of the CUDA threads as SIMD lanes
-  int NumLanes = 1;
   if (NumThreadsClause != 0) {
-    // reset request to avoid propagating to successive #parallel
-    omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
-        0;
-
-    // assume that thread_limit*numlanes is already <= CudaThreadsAvail
-    // because that is already checked on the host side (CUDA offloading rtl)
-    if (currTaskDescr->ThreadLimit() != 0)
-      CudaThreadsForParallel =
-          NumThreadsClause * NumLanes < currTaskDescr->ThreadLimit() * NumLanes
-              ? NumThreadsClause * NumLanes
-              : currTaskDescr->ThreadLimit() * NumLanes;
-    else {
-      CudaThreadsForParallel = (NumThreadsClause * NumLanes > CudaThreadsAvail)
-                                   ? CudaThreadsAvail
-                                   : NumThreadsClause * NumLanes;
-    }
-  } else {
-    if (currTaskDescr->ThreadLimit() != 0) {
-      CudaThreadsForParallel =
-          (currTaskDescr->ThreadLimit() * NumLanes > CudaThreadsAvail)
-              ? CudaThreadsAvail
-              : currTaskDescr->ThreadLimit() * NumLanes;
-    } else
-      CudaThreadsForParallel = CudaThreadsAvail;
-  }
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-  // On Volta and newer architectures we require that all lanes in
-  // a warp participate in the parallel region. Round down to a
-  // multiple of WARPSIZE since it is legal to do so in OpenMP.
-  // CudaThreadsAvail is the number of workers available in this
-  // kernel instance and is greater than or equal to
-  // currTaskDescr->ThreadLimit().
-  if (CudaThreadsForParallel < CudaThreadsAvail) {
-    CudaThreadsForParallel =
-        (CudaThreadsForParallel < WARPSIZE)
-            ? 1
-            : CudaThreadsForParallel & ~((uint16_t)WARPSIZE - 1);
+    // Reset request to avoid propagating to successive #parallel
+    NumThreadsClause = 0;
   }
-#endif
 
-  ASSERT(LT_FUSSY, CudaThreadsForParallel > 0,
-         "bad thread request of %d threads", CudaThreadsForParallel);
+  ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
+         NumThreads);
   ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
           "only team master can create parallel");
 
-  // set number of threads on work descriptor
-  // this is different from the number of cuda threads required for the parallel
-  // region
+  // Set number of threads on work descriptor.
   omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
-  workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr,
-                                             CudaThreadsForParallel / NumLanes);
+  workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr, NumThreads);
 }
 
 // All workers call this function. Deactivate those not needed.
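
The selection logic in determineNumberOfThreads() above can be checked in isolation on the host. The following standalone C sketch is not part of the patch: determineNumberOfThreadsHost(), WarpSize and WorkersInTeam are hypothetical stand-ins for the device-side routine, WARPSIZE and GetNumberOfWorkersInTeam() (64 workers is an assumed value). It mirrors the clamping order: the num_threads clause over the nthreads-var ICV, then thread_limit, then the workers available, then the Volta round-down to a multiple of the warp size.

#include <assert.h>
#include <stdint.h>

/* Stand-ins for device-side constants/queries (assumed values, not the RTL). */
enum { WarpSize = 32, WorkersInTeam = 64 };

/* Host-side mirror of determineNumberOfThreads() from parallel.cu. */
static uint16_t determineNumberOfThreadsHost(uint16_t NumThreadsClause,
                                             uint16_t NThreadsICV,
                                             uint16_t ThreadLimit,
                                             int IsVoltaOrNewer) {
  uint16_t ThreadsRequested = NThreadsICV;
  if (NumThreadsClause != 0)
    ThreadsRequested = NumThreadsClause;

  uint16_t ThreadsAvailable = WorkersInTeam;
  if (ThreadLimit != 0 && ThreadLimit < ThreadsAvailable)
    ThreadsAvailable = ThreadLimit;

  uint16_t NumThreads = ThreadsAvailable;
  if (ThreadsRequested != 0 && ThreadsRequested < NumThreads)
    NumThreads = ThreadsRequested;

  if (IsVoltaOrNewer)
    NumThreads = NumThreads < WarpSize
                     ? 1
                     : (uint16_t)(NumThreads & ~((uint16_t)WarpSize - 1));
  return NumThreads;
}

int main(void) {
  /* num_threads(33) with 64 workers: rounded down to 32 on Volta. */
  assert(determineNumberOfThreadsHost(33, 0, 0, 1) == 32);
  /* No clause, nthreads-var set to 64: all 64 workers are used. */
  assert(determineNumberOfThreadsHost(0, 64, 0, 1) == 64);
  /* thread_limit(48) caps a request of 64, then Volta rounds down to 32. */
  assert(determineNumberOfThreadsHost(64, 0, 48, 1) == 32);
  return 0;
}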
Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c
===================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c
@@ -0,0 +1,102 @@
+// RUN: %compile-run-and-check
+
+#include <omp.h>
+#include <stdio.h>
+
+const int WarpSize = 32;
+const int NumThreads1 = 1 * WarpSize;
+const int NumThreads2 = 2 * WarpSize;
+const int NumThreads3 = 3 * WarpSize;
+const int MaxThreads = 1024;
+
+int main(int argc, char *argv[]) {
+  int check1[MaxThreads];
+  int check2[MaxThreads];
+  int check3[MaxThreads];
+  int check4[MaxThreads];
+  for (int i = 0; i < MaxThreads; i++) {
+    check1[i] = check2[i] = check3[i] = check4[i] = 0;
+  }
+
+  int maxThreads1 = -1;
+  int maxThreads2 = -1;
+  int maxThreads3 = -1;
+
+  #pragma omp target map(check1[:], check2[:], check3[:], check4[:]) \
+                     map(maxThreads1, maxThreads2, maxThreads3)
+  {
+    #pragma omp parallel num_threads(NumThreads1)
+    {
+      check1[omp_get_thread_num()] += omp_get_num_threads();
+    }
+
+    // API method to set number of threads in parallel regions without
+    // num_threads() clause.
+    omp_set_num_threads(NumThreads2);
+    maxThreads1 = omp_get_max_threads();
+    #pragma omp parallel
+    {
+      check2[omp_get_thread_num()] += omp_get_num_threads();
+    }
+
+    maxThreads2 = omp_get_max_threads();
+
+    // num_threads() clause should override nthreads-var ICV.
+    #pragma omp parallel num_threads(NumThreads3)
+    {
+      check3[omp_get_thread_num()] += omp_get_num_threads();
+    }
+
+    maxThreads3 = omp_get_max_threads();
+
+    // Effect from omp_set_num_threads() should still be visible.
+    #pragma omp parallel
+    {
+      check4[omp_get_thread_num()] += omp_get_num_threads();
+    }
+  }
+
+  // CHECK: maxThreads1 = 64
+  printf("maxThreads1 = %d\n", maxThreads1);
+  // CHECK: maxThreads2 = 64
+  printf("maxThreads2 = %d\n", maxThreads2);
+  // CHECK: maxThreads3 = 64
+  printf("maxThreads3 = %d\n", maxThreads3);
+
+  // CHECK-NOT: invalid
+  for (int i = 0; i < MaxThreads; i++) {
+    if (i < NumThreads1) {
+      if (check1[i] != NumThreads1) {
+        printf("invalid: check1[%d] should be %d, is %d\n", i, NumThreads1, check1[i]);
+      }
+    } else if (check1[i] != 0) {
+      printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
+    }
+
+    if (i < NumThreads2) {
+      if (check2[i] != NumThreads2) {
+        printf("invalid: check2[%d] should be %d, is %d\n", i, NumThreads2, check2[i]);
+      }
+    } else if (check2[i] != 0) {
+      printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
+    }
+
+    if (i < NumThreads3) {
+      if (check3[i] != NumThreads3) {
+        printf("invalid: check3[%d] should be %d, is %d\n", i, NumThreads3, check3[i]);
+      }
+    } else if (check3[i] != 0) {
+      printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]);
+    }
+
+    if (i < NumThreads2) {
+      if (check4[i] != NumThreads2) {
+        printf("invalid: check4[%d] should be %d, is %d\n", i, NumThreads2, check4[i]);
+      }
+    } else if (check4[i] != 0) {
+      printf("invalid: check4[%d] should be 0, is %d\n", i, check4[i]);
+    }
+  }
+
+  return 0;
+}
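
For reference, the libcall.cu hunk makes omp_get_max_threads() on the device report the nthreads-var ICV (currTaskDescr->NThreads()) instead of GetNumberOfProcsInTeam(), which is what the CHECK lines for maxThreads1..3 in the new test rely on. Below is a minimal usage sketch of the observable behaviour; it is hypothetical and not part of the patch, and the concrete numbers depend on the launch configuration (the test above observes 64 after setting 2 * WarpSize).

#include <omp.h>
#include <stdio.h>

int main(void) {
  int maxDefault = -1, maxAfterSet = -1;

  #pragma omp target map(maxDefault, maxAfterSet)
  {
    // Reports the initial nthreads-var on the device (launch dependent).
    maxDefault = omp_get_max_threads();

    // After omp_set_num_threads(), omp_get_max_threads() returns what was
    // set; the test above checks 64 for a request of 2 * WarpSize.
    omp_set_num_threads(2 * 32);
    maxAfterSet = omp_get_max_threads();
  }

  // Expected with this patch: maxAfterSet == 64; maxDefault depends on the
  // initial ICV for the chosen launch.
  printf("maxDefault = %d, maxAfterSet = %d\n", maxDefault, maxAfterSet);
  return 0;
}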