Index: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu =================================================================== --- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu +++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu @@ -215,7 +215,8 @@ schedule <= kmp_sched_ordered_last; } - INLINE static void dispatch_init(kmp_sched_t schedule, T lb, T ub, ST st, + INLINE static void dispatch_init(kmp_Indent *loc, int32_t threadId, + kmp_sched_t schedule, T lb, T ub, ST st, ST chunk) { int tid = GetLogicalThreadIdInBlock(); omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid); @@ -282,18 +283,15 @@ "unknown schedule %d & chunk %lld\n", schedule, P64(chunk)); } - // save sched state - omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; - // init schedules if (schedule == kmp_sched_static_chunk) { ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); + // save sched state + omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; // save ub omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; // compute static chunk ST stride; - T threadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()); int lastiter = 0; ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); // save computed params @@ -301,8 +299,8 @@ omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; PRINT(LD_LOOP, - "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 "," - "next lower bound = %llu, stride = %llu\n", + "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 + ", next lower bound = %llu, stride = %llu\n", GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), @@ -310,11 +308,12 @@ } else if (schedule == 
kmp_sched_static_nochunk) { ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value"); + // save sched state + omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; // save ub omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; // compute static chunk ST stride; - T threadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()); int lastiter = 0; ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); // save computed params @@ -322,45 +321,50 @@ omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; PRINT(LD_LOOP, - "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64 "," - "next lower bound = %llu, stride = %llu\n", + "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64 + ", next lower bound = %llu, stride = %llu\n", GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), omptarget_nvptx_threadPrivateContext->Stride(tid)); } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) { - if (chunk < 1) - chunk = 1; - Counter eventNum = ((tripCount - 1) / chunk) + 1; // number of chunks - // but each thread (but one) must discover that it is last - eventNum += tnum; - omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; - omptarget_nvptx_threadPrivateContext->EventsNumber(tid) = eventNum; + if (isSPMDMode()) + __syncthreads(); + else + __kmpc_barrier(loc, threadId); + // save sched state + omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; + int teamId = GetOmpTeamId(); + if (GetThreadIdInBlock() == 0) { + if (chunk < 1) + chunk = 1; + omptarget_nvptx_threadPrivateContext->Chunk(teamId) = chunk; + omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId) = ub; + omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId) = lb; + } + if (isSPMDMode()) + __syncthreads(); 
+ else + __kmpc_barrier(loc, threadId); PRINT(LD_LOOP, - "dispatch init (dyn) : num threads = %d, ub = %" PRId64 ", chunk %" PRIu64 ", " - "events number = %llu\n", + "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64 + ", chunk %" PRIu64 "\n", GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - omptarget_nvptx_threadPrivateContext->Chunk(tid), - omptarget_nvptx_threadPrivateContext->EventsNumber(tid)); + omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId), + omptarget_nvptx_threadPrivateContext->Chunk(teamId)); } } //////////////////////////////////////////////////////////////////////////////// // Support for dispatch next - INLINE static int DynamicNextChunk(omptarget_nvptx_CounterGroup &cg, - Counter priv, T &lb, T &ub, - Counter &chunkId, Counter &currentEvent, - T chunkSize, T loopUpperBound) { - // get next event atomically - Counter nextEvent = cg.Next(); - // calculate chunk Id (priv was initialized upon entering the loop to - // 'start' == 'event') - chunkId = nextEvent - priv; + INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize, + Counter &loopLowerBound, + T loopUpperBound) { // calculate lower bound for all lanes in the warp - lb = chunkId * chunkSize; // this code assume normalization of LB + lb = atomicAdd(&loopLowerBound, (Counter)chunkSize); ub = lb + chunkSize - 1; // Clang uses i <= ub // 3 result cases: @@ -368,9 +372,8 @@ // b. lb < loopUpperBound and ub >= loopUpperBound: last chunk --> // NOT_FINISHED // c. lb and ub >= loopUpperBound: empty chunk --> FINISHED - currentEvent = nextEvent; // a. - if (ub <= loopUpperBound) { + if (lb <= loopUpperBound && ub < loopUpperBound) { PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", P64(lb), P64(ub), P64(loopUpperBound)); return NOT_FINISHED; @@ -383,7 +386,8 @@ return LAST_CHUNK; } // c. 
if we are here, we are in case 'c' - lb = loopUpperBound + 1; + lb = loopUpperBound + 2; + ub = loopUpperBound + 1; PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", P64(lb), P64(ub), P64(loopUpperBound)); return FINISHED; @@ -437,29 +441,18 @@ ASSERT0(LT_FUSSY, schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, "bad sched"); - omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor(); T myLb, myUb; - Counter chunkId; - // xxx current event is now local - omptarget_nvptx_CounterGroup &cg = teamDescr.WorkDescr().CounterGroup(); + int teamId = GetOmpTeamId(); int finished = DynamicNextChunk( - cg, omptarget_nvptx_threadPrivateContext->Priv(tid), myLb, myUb, - chunkId, omptarget_nvptx_threadPrivateContext->CurrentEvent(tid), - omptarget_nvptx_threadPrivateContext->Chunk(tid), - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid)); - - if (finished == FINISHED) { - cg.Complete(omptarget_nvptx_threadPrivateContext->Priv(tid), - omptarget_nvptx_threadPrivateContext->EventsNumber(tid)); - cg.Release(omptarget_nvptx_threadPrivateContext->Priv(tid), - omptarget_nvptx_threadPrivateContext->CurrentEvent(tid)); + myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(teamId), + omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId)); + if (finished == FINISHED) return DISPATCH_FINISHED; - } // not finished (either not finished or last chunk) - *plast = (int32_t)( - myUb == omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid)); + *plast = (int32_t)(finished == LAST_CHUNK); *plower = myLb; *pupper = myUb; *pstride = 1; @@ -491,7 +484,7 @@ int32_t st, int32_t chunk) { PRINT0(LD_IO, "call kmpc_dispatch_init_4\n"); omptarget_nvptx_LoopSupport::dispatch_init( - (kmp_sched_t)schedule, lb, ub, st, chunk); + loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); } EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t tid, @@ -499,7 +492,7 @@ int32_t st, int32_t chunk) { 
PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n"); omptarget_nvptx_LoopSupport::dispatch_init( - (kmp_sched_t)schedule, lb, ub, st, chunk); + loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); } EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t tid, @@ -507,7 +500,7 @@ int64_t st, int64_t chunk) { PRINT0(LD_IO, "call kmpc_dispatch_init_8\n"); omptarget_nvptx_LoopSupport::dispatch_init( - (kmp_sched_t)schedule, lb, ub, st, chunk); + loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); } EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t tid, @@ -515,7 +508,7 @@ int64_t st, int64_t chunk) { PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n"); omptarget_nvptx_LoopSupport::dispatch_init( - (kmp_sched_t)schedule, lb, ub, st, chunk); + loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); } // next