Skip to content

Commit a743c04

Browse files
committed Sep 29, 2018
[libomptarget-nvptx] Fix number of threads in parallel
If there is no num_threads() clause we must consider the nthreads-var ICV. Its value is set by omp_set_num_threads() and can be queried using omp_get_max_threads(). The rewritten code now closely resembles the algorithm given in the OpenMP standard. Differential Revision: https://reviews.llvm.org/D51783 llvm-svn: 343380
1 parent 54d31ef commit a743c04

File tree

3 files changed

+147
-84
lines changed

3 files changed

+147
-84
lines changed
 

‎openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu

+2-2
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,8 @@ EXTERN int omp_get_max_threads(void) {
6161
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
6262
int rc = 1; // default is 1 thread avail
6363
if (!currTaskDescr->InParallelRegion()) {
64-
// not currently in a parallel region... all are available
65-
rc = GetNumberOfProcsInTeam();
64+
// Not currently in a parallel region, return what was set.
65+
rc = currTaskDescr->NThreads();
6666
ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads");
6767
}
6868
PRINT(LD_IO, "call omp_get_max_threads() return %d\n", rc);

‎openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu

+43-82
Original file line numberDiff line numberDiff line change
@@ -193,25 +193,38 @@ EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) {
193193
// support for parallel that goes parallel (1 static level only)
194194
////////////////////////////////////////////////////////////////////////////////
195195

196-
// return number of cuda threads that participate to parallel
197-
// calculation has to consider simd implementation in nvptx
198-
// i.e. (num omp threads * num lanes)
199-
//
200-
// cudathreads =
201-
// if(num_threads != 0) {
202-
// if(thread_limit > 0) {
203-
// min (num_threads*numLanes ; thread_limit*numLanes);
204-
// } else {
205-
// min (num_threads*numLanes; blockDim.x)
206-
// }
207-
// } else {
208-
// if (thread_limit != 0) {
209-
// min (thread_limit*numLanes; blockDim.x)
210-
// } else { // no thread_limit, no num_threads, use all cuda threads
211-
// blockDim.x;
212-
// }
213-
// }
214-
//
196+
// Compute the number of OpenMP threads that will execute the next parallel
// region, following the thread-count determination algorithm of the OpenMP
// standard: an explicit num_threads() clause wins over the nthreads-var ICV,
// and the result is capped by thread_limit and by the workers available in
// this kernel instance.
static INLINE uint16_t determineNumberOfThreads(uint16_t NumThreadsClause,
                                                uint16_t NThreadsICV,
                                                uint16_t ThreadLimit) {
  // The num_threads() clause, when present (non-zero), overrides the ICV.
  const uint16_t Requested =
      (NumThreadsClause != 0) ? NumThreadsClause : NThreadsICV;

  // Start from the workers available in this team and apply the
  // thread_limit clause (non-zero means "set") as an upper bound.
  uint16_t Available = GetNumberOfWorkersInTeam();
  if (ThreadLimit != 0 && ThreadLimit < Available)
    Available = ThreadLimit;

  // Grant the request only when it is set and does not exceed availability.
  uint16_t Result =
      (Requested != 0 && Requested < Available) ? Requested : Available;

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  // On Volta and newer architectures we require that all lanes in a warp
  // participate in the parallel region. Round down to a multiple of
  // WARPSIZE since it is legal to do so in OpenMP; anything smaller than
  // one warp degenerates to a single thread.
  if (Result < WARPSIZE)
    Result = 1;
  else
    Result = (uint16_t)(Result & ~((uint16_t)WARPSIZE - 1));
#endif

  return Result;
}
227+
215228
// This routine is always called by the team master..
216229
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
217230
int16_t IsOMPRuntimeInitialized) {
@@ -234,78 +247,26 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
234247
return;
235248
}
236249

237-
uint16_t CudaThreadsForParallel = 0;
238-
uint16_t NumThreadsClause =
250+
uint16_t &NumThreadsClause =
239251
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
240252

241-
// we cannot have more than block size
242-
uint16_t CudaThreadsAvail = GetNumberOfWorkersInTeam();
243-
244-
// currTaskDescr->ThreadLimit(): If non-zero, this is the limit as
245-
// specified by the thread_limit clause on the target directive.
246-
// GetNumberOfWorkersInTeam(): This is the number of workers available
247-
// in this kernel instance.
248-
//
249-
// E.g: If thread_limit is 33, the kernel is launched with 33+32=65
250-
// threads. The last warp is the master warp so in this case
251-
// GetNumberOfWorkersInTeam() returns 64.
252-
253-
// this is different from ThreadAvail of OpenMP because we may be
254-
// using some of the CUDA threads as SIMD lanes
255-
int NumLanes = 1;
256-
if (NumThreadsClause != 0) {
257-
// reset request to avoid propagating to successive #parallel
258-
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
259-
0;
260-
261-
// assume that thread_limit*numlanes is already <= CudaThreadsAvail
262-
// because that is already checked on the host side (CUDA offloading rtl)
263-
if (currTaskDescr->ThreadLimit() != 0)
264-
CudaThreadsForParallel =
265-
NumThreadsClause * NumLanes < currTaskDescr->ThreadLimit() * NumLanes
266-
? NumThreadsClause * NumLanes
267-
: currTaskDescr->ThreadLimit() * NumLanes;
268-
else {
269-
CudaThreadsForParallel = (NumThreadsClause * NumLanes > CudaThreadsAvail)
270-
? CudaThreadsAvail
271-
: NumThreadsClause * NumLanes;
272-
}
273-
} else {
274-
if (currTaskDescr->ThreadLimit() != 0) {
275-
CudaThreadsForParallel =
276-
(currTaskDescr->ThreadLimit() * NumLanes > CudaThreadsAvail)
277-
? CudaThreadsAvail
278-
: currTaskDescr->ThreadLimit() * NumLanes;
279-
} else
280-
CudaThreadsForParallel = CudaThreadsAvail;
281-
}
253+
uint16_t NumThreads =
254+
determineNumberOfThreads(NumThreadsClause, currTaskDescr->NThreads(),
255+
currTaskDescr->ThreadLimit());
282256

283-
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
284-
// On Volta and newer architectures we require that all lanes in
285-
// a warp participate in the parallel region. Round down to a
286-
// multiple of WARPSIZE since it is legal to do so in OpenMP.
287-
// CudaThreadsAvail is the number of workers available in this
288-
// kernel instance and is greater than or equal to
289-
// currTaskDescr->ThreadLimit().
290-
if (CudaThreadsForParallel < CudaThreadsAvail) {
291-
CudaThreadsForParallel =
292-
(CudaThreadsForParallel < WARPSIZE)
293-
? 1
294-
: CudaThreadsForParallel & ~((uint16_t)WARPSIZE - 1);
257+
if (NumThreadsClause != 0) {
258+
// Reset request to avoid propagating to successive #parallel
259+
NumThreadsClause = 0;
295260
}
296-
#endif
297261

298-
ASSERT(LT_FUSSY, CudaThreadsForParallel > 0,
299-
"bad thread request of %d threads", CudaThreadsForParallel);
262+
ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
263+
NumThreads);
300264
ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
301265
"only team master can create parallel");
302266

303-
// set number of threads on work descriptor
304-
// this is different from the number of cuda threads required for the parallel
305-
// region
267+
// Set number of threads on work descriptor.
306268
omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
307-
workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr,
308-
CudaThreadsForParallel / NumLanes);
269+
workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr, NumThreads);
309270
}
310271

311272
// All workers call this function. Deactivate those not needed.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
// RUN: %compile-run-and-check

#include <stdio.h>
#include <omp.h>

const int WarpSize = 32;
const int NumThreads1 = 1 * WarpSize;
const int NumThreads2 = 2 * WarpSize;
const int NumThreads3 = 3 * WarpSize;
const int MaxThreads = 1024;

// Validate one slot of a per-thread counter array: slots below `expected`
// must hold exactly `expected` (each participating thread added
// omp_get_num_threads() once), all later slots must be untouched (0).
// Prints an "invalid" diagnostic on any mismatch.
static void verifySlot(const char *name, int value, int index, int expected) {
  if (index < expected) {
    if (value != expected) {
      printf("invalid: %s[%d] should be %d, is %d\n", name, index, expected,
             value);
    }
  } else if (value != 0) {
    printf("invalid: %s[%d] should be 0, is %d\n", name, index, value);
  }
}

int main(int argc, char *argv[]) {
  int check1[MaxThreads];
  int check2[MaxThreads];
  int check3[MaxThreads];
  int check4[MaxThreads];
  for (int i = 0; i < MaxThreads; i++) {
    check1[i] = 0;
    check2[i] = 0;
    check3[i] = 0;
    check4[i] = 0;
  }

  int maxThreads1 = -1;
  int maxThreads2 = -1;
  int maxThreads3 = -1;

#pragma omp target map(check1[:], check2[:], check3[:], check4[:]) \
                   map(maxThreads1, maxThreads2, maxThreads3)
  {
    // Explicit num_threads() clause.
    #pragma omp parallel num_threads(NumThreads1)
    {
      check1[omp_get_thread_num()] += omp_get_num_threads();
    }

    // API method to set number of threads in parallel regions without
    // num_threads() clause.
    omp_set_num_threads(NumThreads2);
    maxThreads1 = omp_get_max_threads();
    #pragma omp parallel
    {
      check2[omp_get_thread_num()] += omp_get_num_threads();
    }

    maxThreads2 = omp_get_max_threads();

    // num_threads() clause should override nthreads-var ICV.
    #pragma omp parallel num_threads(NumThreads3)
    {
      check3[omp_get_thread_num()] += omp_get_num_threads();
    }

    maxThreads3 = omp_get_max_threads();

    // Effect from omp_set_num_threads() should still be visible.
    #pragma omp parallel
    {
      check4[omp_get_thread_num()] += omp_get_num_threads();
    }
  }

  // CHECK: maxThreads1 = 64
  printf("maxThreads1 = %d\n", maxThreads1);
  // CHECK: maxThreads2 = 64
  printf("maxThreads2 = %d\n", maxThreads2);
  // CHECK: maxThreads3 = 64
  printf("maxThreads3 = %d\n", maxThreads3);

  // CHECK-NOT: invalid
  // One pass over all slots; per-array checks run in the same order as
  // before so any diagnostics appear in identical order.
  for (int i = 0; i < MaxThreads; i++) {
    verifySlot("check1", check1[i], i, NumThreads1);
    verifySlot("check2", check2[i], i, NumThreads2);
    verifySlot("check3", check3[i], i, NumThreads3);
    verifySlot("check4", check4[i], i, NumThreads2);
  }

  return 0;
}

0 commit comments

Comments
 (0)
Please sign in to comment.