@@ -193,25 +193,38 @@ EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) {
// support for parallel that goes parallel (1 static level only)
////////////////////////////////////////////////////////////////////////////////

-// return number of cuda threads that participate to parallel
-// calculation has to consider simd implementation in nvptx
-// i.e. (num omp threads * num lanes)
-//
-// cudathreads =
-//    if(num_threads != 0) {
-//      if(thread_limit > 0) {
-//        min (num_threads*numLanes ; thread_limit*numLanes);
-//      } else {
-//        min (num_threads*numLanes; blockDim.x)
-//      }
-//    } else {
-//      if (thread_limit != 0) {
-//        min (thread_limit*numLanes; blockDim.x)
-//      } else { // no thread_limit, no num_threads, use all cuda threads
-//        blockDim.x;
-//      }
-//    }
-//
+static INLINE uint16_t determineNumberOfThreads(uint16_t NumThreadsClause,
+                                                uint16_t NThreadsICV,
+                                                uint16_t ThreadLimit) {
+  uint16_t ThreadsRequested = NThreadsICV;
+  if (NumThreadsClause != 0) {
+    ThreadsRequested = NumThreadsClause;
+  }
+
+  uint16_t ThreadsAvailable = GetNumberOfWorkersInTeam();
+  if (ThreadLimit != 0 && ThreadLimit < ThreadsAvailable) {
+    ThreadsAvailable = ThreadLimit;
+  }
+
+  uint16_t NumThreads = ThreadsAvailable;
+  if (ThreadsRequested != 0 && ThreadsRequested < NumThreads) {
+    NumThreads = ThreadsRequested;
+  }
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  // On Volta and newer architectures we require that all lanes in
+  // a warp participate in the parallel region. Round down to a
+  // multiple of WARPSIZE since it is legal to do so in OpenMP.
+  if (NumThreads < WARPSIZE) {
+    NumThreads = 1;
+  } else {
+    NumThreads = (NumThreads & ~((uint16_t)WARPSIZE - 1));
+  }
+#endif
+
+  return NumThreads;
+}
+
// This routine is always called by the team master..
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
                                           int16_t IsOMPRuntimeInitialized) {
@@ -234,78 +247,26 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
    return;
  }

-  uint16_t CudaThreadsForParallel = 0;
-  uint16_t NumThreadsClause =
+  uint16_t &NumThreadsClause =
      omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);

-  // we cannot have more than block size
-  uint16_t CudaThreadsAvail = GetNumberOfWorkersInTeam();
-
-  // currTaskDescr->ThreadLimit(): If non-zero, this is the limit as
-  // specified by the thread_limit clause on the target directive.
-  // GetNumberOfWorkersInTeam(): This is the number of workers available
-  // in this kernel instance.
-  //
-  // E.g: If thread_limit is 33, the kernel is launched with 33+32=65
-  // threads. The last warp is the master warp so in this case
-  // GetNumberOfWorkersInTeam() returns 64.
-
-  // this is different from ThreadAvail of OpenMP because we may be
-  // using some of the CUDA threads as SIMD lanes
-  int NumLanes = 1;
-  if (NumThreadsClause != 0) {
-    // reset request to avoid propagating to successive #parallel
-    omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
-        0;
-
-    // assume that thread_limit*numlanes is already <= CudaThreadsAvail
-    // because that is already checked on the host side (CUDA offloading rtl)
-    if (currTaskDescr->ThreadLimit() != 0)
-      CudaThreadsForParallel =
-          NumThreadsClause * NumLanes < currTaskDescr->ThreadLimit() * NumLanes
-              ? NumThreadsClause * NumLanes
-              : currTaskDescr->ThreadLimit() * NumLanes;
-    else {
-      CudaThreadsForParallel = (NumThreadsClause * NumLanes > CudaThreadsAvail)
-                                   ? CudaThreadsAvail
-                                   : NumThreadsClause * NumLanes;
-    }
-  } else {
-    if (currTaskDescr->ThreadLimit() != 0) {
-      CudaThreadsForParallel =
-          (currTaskDescr->ThreadLimit() * NumLanes > CudaThreadsAvail)
-              ? CudaThreadsAvail
-              : currTaskDescr->ThreadLimit() * NumLanes;
-    } else
-      CudaThreadsForParallel = CudaThreadsAvail;
-  }
+  uint16_t NumThreads =
+      determineNumberOfThreads(NumThreadsClause, currTaskDescr->NThreads(),
+                               currTaskDescr->ThreadLimit());

-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-  // On Volta and newer architectures we require that all lanes in
-  // a warp participate in the parallel region. Round down to a
-  // multiple of WARPSIZE since it is legal to do so in OpenMP.
-  // CudaThreadsAvail is the number of workers available in this
-  // kernel instance and is greater than or equal to
-  // currTaskDescr->ThreadLimit().
-  if (CudaThreadsForParallel < CudaThreadsAvail) {
-    CudaThreadsForParallel =
-        (CudaThreadsForParallel < WARPSIZE)
-            ? 1
-            : CudaThreadsForParallel & ~((uint16_t)WARPSIZE - 1);
+  if (NumThreadsClause != 0) {
+    // Reset request to avoid propagating to successive #parallel
+    NumThreadsClause = 0;
  }
-#endif

-  ASSERT(LT_FUSSY, CudaThreadsForParallel > 0,
-         "bad thread request of %d threads", CudaThreadsForParallel);
+  ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
+         NumThreads);
  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
          "only team master can create parallel");

-  // set number of threads on work descriptor
-  // this is different from the number of cuda threads required for the parallel
-  // region
+  // Set number of threads on work descriptor.
  omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
-  workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr,
-                                             CudaThreadsForParallel / NumLanes);
+  workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr, NumThreads);
}

// All workers call this function. Deactivate those not needed.
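For reference, the selection policy introduced by determineNumberOfThreads() can be exercised off-device. The sketch below is not part of the patch: it mirrors the same num_threads-clause / nthreads-var ICV / thread_limit logic in plain C++, with GetNumberOfWorkersInTeam() and WARPSIZE replaced by assumed stand-ins (64 workers, 32-lane warps) and the __CUDA_ARCH__ >= 700 guard replaced by a RoundToWarp flag.

// Host-side sketch of the thread-count selection above (illustrative only).
// kNumWorkers, kWarpSize and RoundToWarp are assumptions standing in for
// GetNumberOfWorkersInTeam(), WARPSIZE and the __CUDA_ARCH__ >= 700 path.
#include <cstdint>
#include <cstdio>

static const uint16_t kNumWorkers = 64; // assumed workers per team
static const uint16_t kWarpSize = 32;   // assumed warp width

static uint16_t determineNumberOfThreadsSketch(uint16_t NumThreadsClause,
                                               uint16_t NThreadsICV,
                                               uint16_t ThreadLimit,
                                               bool RoundToWarp) {
  // A num_threads clause overrides the nthreads-var ICV.
  uint16_t ThreadsRequested = NThreadsICV;
  if (NumThreadsClause != 0)
    ThreadsRequested = NumThreadsClause;

  // Available workers are further capped by thread_limit, if given.
  uint16_t ThreadsAvailable = kNumWorkers;
  if (ThreadLimit != 0 && ThreadLimit < ThreadsAvailable)
    ThreadsAvailable = ThreadLimit;

  // Grant the request, but never more than what is available.
  uint16_t NumThreads = ThreadsAvailable;
  if (ThreadsRequested != 0 && ThreadsRequested < NumThreads)
    NumThreads = ThreadsRequested;

  // sm_70+ path: round down to a whole number of warps, minimum one thread.
  if (RoundToWarp)
    NumThreads = (NumThreads < kWarpSize)
                     ? 1
                     : (uint16_t)(NumThreads & ~((uint16_t)kWarpSize - 1));
  return NumThreads;
}

int main() {
  // num_threads(40) with thread_limit(33): capped to 33, rounded down to 32.
  printf("%d\n", (int)determineNumberOfThreadsSketch(40, 0, 33, true));  // 32
  // No clause, no ICV, no limit: all 64 workers are used.
  printf("%d\n", (int)determineNumberOfThreadsSketch(0, 0, 0, false));   // 64
  // num_threads(20) on sm_70+: below a warp, so it collapses to 1 thread.
  printf("%d\n", (int)determineNumberOfThreadsSketch(20, 0, 0, true));   // 1
  return 0;
}

The last case illustrates the NumThreads < WARPSIZE branch: on Volta and newer, a sub-warp request is reduced to a single thread rather than a partial warp.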