Changeset View
Changeset View
Standalone View
Standalone View
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/loop.cu
Show First 20 Lines • Show All 209 Lines • ▼ Show 20 Lines | public: | ||||
//////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////// | ||||
// Support for dispatch Init | // Support for dispatch Init | ||||
INLINE static int OrderedSchedule(kmp_sched_t schedule) { | INLINE static int OrderedSchedule(kmp_sched_t schedule) { | ||||
return schedule >= kmp_sched_ordered_first && | return schedule >= kmp_sched_ordered_first && | ||||
schedule <= kmp_sched_ordered_last; | schedule <= kmp_sched_ordered_last; | ||||
} | } | ||||
INLINE static void dispatch_init(kmp_sched_t schedule, T lb, T ub, ST st, | INLINE static void dispatch_init(kmp_Indent *loc, int32_t threadId, | ||||
kmp_sched_t schedule, T lb, T ub, ST st, | |||||
ST chunk) { | ST chunk) { | ||||
int tid = GetLogicalThreadIdInBlock(); | int tid = GetLogicalThreadIdInBlock(); | ||||
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid); | omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid); | ||||
T tnum = currTaskDescr->ThreadsInTeam(); | T tnum = currTaskDescr->ThreadsInTeam(); | ||||
T tripCount = ub - lb + 1; // +1 because ub is inclusive | T tripCount = ub - lb + 1; // +1 because ub is inclusive | ||||
ASSERT0( | ASSERT0( | ||||
LT_FUSSY, | LT_FUSSY, | ||||
GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()) < | GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()) < | ||||
▲ Show 20 Lines • Show All 50 Lines • ▼ Show 20 Lines | if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) { | ||||
P64(chunk)); | P64(chunk)); | ||||
} else { | } else { | ||||
PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", schedule, P64(chunk)); | PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", schedule, P64(chunk)); | ||||
ASSERT(LT_FUSSY, | ASSERT(LT_FUSSY, | ||||
schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, | schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, | ||||
"unknown schedule %d & chunk %lld\n", schedule, P64(chunk)); | "unknown schedule %d & chunk %lld\n", schedule, P64(chunk)); | ||||
} | } | ||||
// save sched state | |||||
omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; | |||||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; | |||||
// init schedules | // init schedules | ||||
if (schedule == kmp_sched_static_chunk) { | if (schedule == kmp_sched_static_chunk) { | ||||
ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); | ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); | ||||
// save sched state | |||||
omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; | |||||
// save ub | // save ub | ||||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; | omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; | ||||
// compute static chunk | // compute static chunk | ||||
ST stride; | ST stride; | ||||
T threadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()); | |||||
int lastiter = 0; | int lastiter = 0; | ||||
ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); | ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); | ||||
// save computed params | // save computed params | ||||
omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; | omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; | ||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; | omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; | ||||
omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; | omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; | ||||
PRINT(LD_LOOP, | PRINT(LD_LOOP, | ||||
"dispatch init (static chunk) : num threads = %d, ub = %" PRId64 "," | "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 | ||||
"next lower bound = %llu, stride = %llu\n", | ", next lower bound = %llu, stride = %llu\n", | ||||
GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), | GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), | ||||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), | omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), | ||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), | omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), | ||||
omptarget_nvptx_threadPrivateContext->Stride(tid)); | omptarget_nvptx_threadPrivateContext->Stride(tid)); | ||||
} else if (schedule == kmp_sched_static_nochunk) { | } else if (schedule == kmp_sched_static_nochunk) { | ||||
ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value"); | ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value"); | ||||
// save sched state | |||||
omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; | |||||
// save ub | // save ub | ||||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; | omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; | ||||
// compute static chunk | // compute static chunk | ||||
ST stride; | ST stride; | ||||
T threadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()); | |||||
int lastiter = 0; | int lastiter = 0; | ||||
ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); | ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); | ||||
// save computed params | // save computed params | ||||
omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; | omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; | ||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; | omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; | ||||
omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; | omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; | ||||
PRINT(LD_LOOP, | PRINT(LD_LOOP, | ||||
"dispatch init (static nochunk) : num threads = %d, ub = %" PRId64 "," | "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64 | ||||
"next lower bound = %llu, stride = %llu\n", | ", next lower bound = %llu, stride = %llu\n", | ||||
GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), | GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), | ||||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), | omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), | ||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), | omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), | ||||
omptarget_nvptx_threadPrivateContext->Stride(tid)); | omptarget_nvptx_threadPrivateContext->Stride(tid)); | ||||
} else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) { | } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) { | ||||
if (isSPMDMode()) | |||||
__syncthreads(); | |||||
else | |||||
__kmpc_barrier(loc, threadId); | |||||
// save sched state | |||||
omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; | |||||
if (GetThreadIdInBlock() == 0) { | |||||
if (chunk < 1) | if (chunk < 1) | ||||
chunk = 1; | chunk = 1; | ||||
Counter eventNum = ((tripCount - 1) / chunk) + 1; // number of chunks | int teamId = GetOmpTeamId(); | ||||
// but each thread (but one) must discover that it is last | omptarget_nvptx_threadPrivateContext->Chunk(teamId) = chunk; | ||||
eventNum += tnum; | omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId) = ub; | ||||
omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; | omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId) = lb; | ||||
omptarget_nvptx_threadPrivateContext->EventsNumber(tid) = eventNum; | } | ||||
if (isSPMDMode()) | |||||
__syncthreads(); | |||||
else | |||||
__kmpc_barrier(loc, threadId); | |||||
PRINT(LD_LOOP, | PRINT(LD_LOOP, | ||||
"dispatch init (dyn) : num threads = %d, ub = %" PRId64 ", chunk %" PRIu64 ", " | "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64 | ||||
"events number = %llu\n", | ", chunk %" PRIu64 "\n", | ||||
GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), | GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), | ||||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), | omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId), | ||||
omptarget_nvptx_threadPrivateContext->Chunk(tid), | omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId), | ||||
omptarget_nvptx_threadPrivateContext->EventsNumber(tid)); | omptarget_nvptx_threadPrivateContext->Chunk(teamId)); | ||||
} | } | ||||
} | } | ||||
//////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////// | ||||
// Support for dispatch next | // Support for dispatch next | ||||
INLINE static int DynamicNextChunk(omptarget_nvptx_CounterGroup &cg, | INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize, | ||||
Counter priv, T &lb, T &ub, | Counter &loopLowerBound, | ||||
Counter &chunkId, Counter &currentEvent, | T loopUpperBound) { | ||||
T chunkSize, T loopUpperBound) { | |||||
// get next event atomically | |||||
Counter nextEvent = cg.Next(); | |||||
// calculate chunk Id (priv was initialized upon entering the loop to | |||||
// 'start' == 'event') | |||||
chunkId = nextEvent - priv; | |||||
// calculate lower bound for all lanes in the warp | // calculate lower bound for all lanes in the warp | ||||
lb = chunkId * chunkSize; // this code assume normalization of LB | lb = atomicAdd(&loopLowerBound, (Counter)chunkSize); | ||||
ub = lb + chunkSize - 1; // Clang uses i <= ub | ub = lb + chunkSize - 1; // Clang uses i <= ub | ||||
// 3 result cases: | // 3 result cases: | ||||
// a. lb and ub < loopUpperBound --> NOT_FINISHED | // a. lb and ub < loopUpperBound --> NOT_FINISHED | ||||
// b. lb < loopUpperBound and ub >= loopUpperBound: last chunk --> | // b. lb < loopUpperBound and ub >= loopUpperBound: last chunk --> | ||||
// NOT_FINISHED | // NOT_FINISHED | ||||
// c. lb and ub >= loopUpperBound: empty chunk --> FINISHED | // c. lb and ub >= loopUpperBound: empty chunk --> FINISHED | ||||
currentEvent = nextEvent; | |||||
// a. | // a. | ||||
if (ub <= loopUpperBound) { | if (lb <= loopUpperBound && ub < loopUpperBound) { | ||||
PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", P64(lb), | PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", P64(lb), | ||||
P64(ub), P64(loopUpperBound)); | P64(ub), P64(loopUpperBound)); | ||||
return NOT_FINISHED; | return NOT_FINISHED; | ||||
} | } | ||||
// b. | // b. | ||||
if (lb <= loopUpperBound) { | if (lb <= loopUpperBound) { | ||||
PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n", | PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n", | ||||
P64(lb), P64(ub), P64(loopUpperBound)); | P64(lb), P64(ub), P64(loopUpperBound)); | ||||
ub = loopUpperBound; | ub = loopUpperBound; | ||||
return LAST_CHUNK; | return LAST_CHUNK; | ||||
} | } | ||||
// c. if we are here, we are in case 'c' | // c. if we are here, we are in case 'c' | ||||
lb = loopUpperBound + 1; | lb = loopUpperBound + 2; | ||||
ub = loopUpperBound + 1; | |||||
PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", P64(lb), | PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", P64(lb), | ||||
P64(ub), P64(loopUpperBound)); | P64(ub), P64(loopUpperBound)); | ||||
return FINISHED; | return FINISHED; | ||||
} | } | ||||
// On Pascal, with inlining of the runtime into the user application, | // On Pascal, with inlining of the runtime into the user application, | ||||
// this code deadlocks. This is probably because different threads | // this code deadlocks. This is probably because different threads | ||||
// in a warp cannot make independent progress. | // in a warp cannot make independent progress. | ||||
Show All 37 Lines | if (schedule == kmp_sched_static_chunk || | ||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = myLb + stride; | omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = myLb + stride; | ||||
PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n", | PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n", | ||||
P64(*plower), P64(*pupper)); | P64(*plower), P64(*pupper)); | ||||
return DISPATCH_NOTFINISHED; | return DISPATCH_NOTFINISHED; | ||||
} | } | ||||
ASSERT0(LT_FUSSY, | ASSERT0(LT_FUSSY, | ||||
schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, | schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, | ||||
"bad sched"); | "bad sched"); | ||||
omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor(); | |||||
T myLb, myUb; | T myLb, myUb; | ||||
Counter chunkId; | int teamId = GetOmpTeamId(); | ||||
// xxx current event is now local | |||||
omptarget_nvptx_CounterGroup &cg = teamDescr.WorkDescr().CounterGroup(); | |||||
int finished = DynamicNextChunk( | int finished = DynamicNextChunk( | ||||
cg, omptarget_nvptx_threadPrivateContext->Priv(tid), myLb, myUb, | myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(teamId), | ||||
chunkId, omptarget_nvptx_threadPrivateContext->CurrentEvent(tid), | omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId), | ||||
omptarget_nvptx_threadPrivateContext->Chunk(tid), | omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId)); | ||||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid)); | |||||
if (finished == FINISHED) { | |||||
cg.Complete(omptarget_nvptx_threadPrivateContext->Priv(tid), | |||||
omptarget_nvptx_threadPrivateContext->EventsNumber(tid)); | |||||
cg.Release(omptarget_nvptx_threadPrivateContext->Priv(tid), | |||||
omptarget_nvptx_threadPrivateContext->CurrentEvent(tid)); | |||||
if (finished == FINISHED) | |||||
return DISPATCH_FINISHED; | return DISPATCH_FINISHED; | ||||
} | |||||
// not finished (either not finished or last chunk) | // not finished (either not finished or last chunk) | ||||
*plast = (int32_t)( | *plast = (int32_t)(finished == LAST_CHUNK); | ||||
myUb == omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid)); | |||||
*plower = myLb; | *plower = myLb; | ||||
*pupper = myUb; | *pupper = myUb; | ||||
*pstride = 1; | *pstride = 1; | ||||
PRINT(LD_LOOP, | PRINT(LD_LOOP, | ||||
"Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld\n", | "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld\n", | ||||
GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), | GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), | ||||
GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper), | GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper), | ||||
Show All 15 Lines | |||||
//////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////// | ||||
// init | // init | ||||
EXTERN void __kmpc_dispatch_init_4(kmp_Indent *loc, int32_t tid, | EXTERN void __kmpc_dispatch_init_4(kmp_Indent *loc, int32_t tid, | ||||
int32_t schedule, int32_t lb, int32_t ub, | int32_t schedule, int32_t lb, int32_t ub, | ||||
int32_t st, int32_t chunk) { | int32_t st, int32_t chunk) { | ||||
PRINT0(LD_IO, "call kmpc_dispatch_init_4\n"); | PRINT0(LD_IO, "call kmpc_dispatch_init_4\n"); | ||||
omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init( | omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init( | ||||
(kmp_sched_t)schedule, lb, ub, st, chunk); | loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); | ||||
} | } | ||||
EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t tid, | EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t tid, | ||||
int32_t schedule, uint32_t lb, uint32_t ub, | int32_t schedule, uint32_t lb, uint32_t ub, | ||||
int32_t st, int32_t chunk) { | int32_t st, int32_t chunk) { | ||||
PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n"); | PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n"); | ||||
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init( | omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init( | ||||
(kmp_sched_t)schedule, lb, ub, st, chunk); | loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); | ||||
} | } | ||||
EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t tid, | EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t tid, | ||||
int32_t schedule, int64_t lb, int64_t ub, | int32_t schedule, int64_t lb, int64_t ub, | ||||
int64_t st, int64_t chunk) { | int64_t st, int64_t chunk) { | ||||
PRINT0(LD_IO, "call kmpc_dispatch_init_8\n"); | PRINT0(LD_IO, "call kmpc_dispatch_init_8\n"); | ||||
omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init( | omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init( | ||||
(kmp_sched_t)schedule, lb, ub, st, chunk); | loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); | ||||
} | } | ||||
EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t tid, | EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t tid, | ||||
int32_t schedule, uint64_t lb, uint64_t ub, | int32_t schedule, uint64_t lb, uint64_t ub, | ||||
int64_t st, int64_t chunk) { | int64_t st, int64_t chunk) { | ||||
PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n"); | PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n"); | ||||
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init( | omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init( | ||||
(kmp_sched_t)schedule, lb, ub, st, chunk); | loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); | ||||
} | } | ||||
// next | // next | ||||
EXTERN int __kmpc_dispatch_next_4(kmp_Indent *loc, int32_t tid, int32_t *p_last, | EXTERN int __kmpc_dispatch_next_4(kmp_Indent *loc, int32_t tid, int32_t *p_last, | ||||
int32_t *p_lb, int32_t *p_ub, int32_t *p_st) { | int32_t *p_lb, int32_t *p_ub, int32_t *p_st) { | ||||
PRINT0(LD_IO, "call kmpc_dispatch_next_4\n"); | PRINT0(LD_IO, "call kmpc_dispatch_next_4\n"); | ||||
return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next( | return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next( | ||||
p_last, p_lb, p_ub, p_st); | p_last, p_lb, p_ub, p_st); | ||||
▲ Show 20 Lines • Show All 246 Lines • Show Last 20 Lines |