Index: libomptarget/deviceRTLs/nvptx/src/cancel.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/cancel.cu +++ libomptarget/deviceRTLs/nvptx/src/cancel.cu @@ -15,14 +15,14 @@ EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid, int32_t cancelVal) { - PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", cancelVal); + PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", (int)cancelVal); // disabled return FALSE; } EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid, int32_t cancelVal) { - PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", cancelVal); + PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", (int)cancelVal); // disabled return FALSE; } Index: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -84,7 +84,7 @@ "Entering __kmpc_initialize_data_sharing_environment\n"); unsigned WID = getWarpId(); - DSPRINT(DSFLAG_INIT, "Warp ID: %d\n", WID); + DSPRINT(DSFLAG_INIT, "Warp ID: %u\n", WID); omptarget_nvptx_TeamDescr *teamDescr = &omptarget_nvptx_threadPrivateContext->TeamContext(); @@ -95,15 +95,16 @@ // We don't need to initialize the frame and active threads. 
- DSPRINT(DSFLAG_INIT, "Initial data size: %08x \n", InitialDataSize); - DSPRINT(DSFLAG_INIT, "Root slot at: %016llx \n", (long long)RootS); + DSPRINT(DSFLAG_INIT, "Initial data size: %08x \n", (unsigned)InitialDataSize); + DSPRINT(DSFLAG_INIT, "Root slot at: %016llx \n", (unsigned long long)RootS); DSPRINT(DSFLAG_INIT, "Root slot data-end at: %016llx \n", - (long long)RootS->DataEnd); - DSPRINT(DSFLAG_INIT, "Root slot next at: %016llx \n", (long long)RootS->Next); + (unsigned long long)RootS->DataEnd); + DSPRINT(DSFLAG_INIT, "Root slot next at: %016llx \n", + (unsigned long long)RootS->Next); DSPRINT(DSFLAG_INIT, "Shared slot ptr at: %016llx \n", - (long long)DataSharingState.SlotPtr[WID]); + (unsigned long long)DataSharingState.SlotPtr[WID]); DSPRINT(DSFLAG_INIT, "Shared stack ptr at: %016llx \n", - (long long)DataSharingState.StackPtr[WID]); + (unsigned long long)DataSharingState.StackPtr[WID]); DSPRINT0(DSFLAG_INIT, "Exiting __kmpc_initialize_data_sharing_environment\n"); } @@ -121,8 +122,9 @@ if (!IsOMPRuntimeInitialized) return (void *)&DataSharingState; - DSPRINT(DSFLAG, "Data Size %016llx\n", SharingDataSize); - DSPRINT(DSFLAG, "Default Data Size %016llx\n", SharingDefaultDataSize); + DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize); + DSPRINT(DSFLAG, "Default Data Size %016llx\n", + (unsigned long long)SharingDefaultDataSize); unsigned WID = getWarpId(); unsigned CurActiveThreads = getActiveThreadsMask(); @@ -139,11 +141,11 @@ *SavedSharedFrame = FrameP; *SavedActiveThreads = ActiveT; - DSPRINT(DSFLAG, "Warp ID: %d\n", WID); - DSPRINT(DSFLAG, "Saved slot ptr at: %016llx \n", (long long)SlotP); - DSPRINT(DSFLAG, "Saved stack ptr at: %016llx \n", (long long)StackP); + DSPRINT(DSFLAG, "Warp ID: %u\n", WID); + DSPRINT(DSFLAG, "Saved slot ptr at: %016llx \n", (unsigned long long)SlotP); + DSPRINT(DSFLAG, "Saved stack ptr at: %016llx \n", (unsigned long long)StackP); DSPRINT(DSFLAG, "Saved frame ptr at: %016llx \n", (long long)FrameP); 
- DSPRINT(DSFLAG, "Active threads: %08x \n", ActiveT); + DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT); // Only the warp active master needs to grow the stack. if (IsWarpMasterActiveThread()) { @@ -161,12 +163,16 @@ const uintptr_t RequiredEndAddress = CurrentStartAddress + (uintptr_t)SharingDataSize; - DSPRINT(DSFLAG, "Data Size %016llx\n", SharingDataSize); - DSPRINT(DSFLAG, "Default Data Size %016llx\n", SharingDefaultDataSize); - DSPRINT(DSFLAG, "Current Start Address %016llx\n", CurrentStartAddress); - DSPRINT(DSFLAG, "Current End Address %016llx\n", CurrentEndAddress); - DSPRINT(DSFLAG, "Required End Address %016llx\n", RequiredEndAddress); - DSPRINT(DSFLAG, "Active Threads %08x\n", ActiveT); + DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize); + DSPRINT(DSFLAG, "Default Data Size %016llx\n", + (unsigned long long)SharingDefaultDataSize); + DSPRINT(DSFLAG, "Current Start Address %016llx\n", + (unsigned long long)CurrentStartAddress); + DSPRINT(DSFLAG, "Current End Address %016llx\n", + (unsigned long long)CurrentEndAddress); + DSPRINT(DSFLAG, "Required End Address %016llx\n", + (unsigned long long)RequiredEndAddress); + DSPRINT(DSFLAG, "Active Threads %08x\n", (unsigned)ActiveT); // If we require a new slot, allocate it and initialize it (or attempt to // reuse one). 
Also, set the shared stack and slot pointers to the new @@ -184,11 +190,11 @@ (uintptr_t)(&ExistingSlot->Data[0]); if (ExistingSlotSize >= NewSize) { DSPRINT(DSFLAG, "Reusing stack slot %016llx\n", - (long long)ExistingSlot); + (unsigned long long)ExistingSlot); NewSlot = ExistingSlot; } else { DSPRINT(DSFLAG, "Cleaning up -failed reuse - %016llx\n", - (long long)SlotP->Next); + (unsigned long long)SlotP->Next); free(ExistingSlot); } } @@ -197,7 +203,7 @@ NewSlot = (__kmpc_data_sharing_slot *)malloc( sizeof(__kmpc_data_sharing_slot) + NewSize); DSPRINT(DSFLAG, "New slot allocated %016llx (data size=%016llx)\n", - (long long)NewSlot, NewSize); + (unsigned long long)NewSlot, NewSize); } NewSlot->Next = 0; @@ -213,7 +219,7 @@ // not eliminate them because that may be used to return data. if (SlotP->Next) { DSPRINT(DSFLAG, "Cleaning up - old not required - %016llx\n", - (long long)SlotP->Next); + (unsigned long long)SlotP->Next); free(SlotP->Next); SlotP->Next = 0; } @@ -275,8 +281,8 @@ // have other threads that will return after the current ones. ActiveT &= ~CurActive; - DSPRINT(DSFLAG, "Active threads: %08x; New mask: %08x\n", CurActive, - ActiveT); + DSPRINT(DSFLAG, "Active threads: %08x; New mask: %08x\n", + (unsigned)CurActive, (unsigned)ActiveT); if (!ActiveT) { // No other active threads? Great, lets restore the stack. 
@@ -290,10 +296,13 @@ FrameP = *SavedSharedFrame; ActiveT = *SavedActiveThreads; - DSPRINT(DSFLAG, "Restored slot ptr at: %016llx \n", (long long)SlotP); - DSPRINT(DSFLAG, "Restored stack ptr at: %016llx \n", (long long)StackP); - DSPRINT(DSFLAG, "Restored frame ptr at: %016llx \n", (long long)FrameP); - DSPRINT(DSFLAG, "Active threads: %08x \n", ActiveT); + DSPRINT(DSFLAG, "Restored slot ptr at: %016llx \n", + (unsigned long long)SlotP); + DSPRINT(DSFLAG, "Restored stack ptr at: %016llx \n", + (unsigned long long)StackP); + DSPRINT(DSFLAG, "Restored frame ptr at: %016llx \n", + (unsigned long long)FrameP); + DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT); } } @@ -319,7 +328,7 @@ unsigned SourceWID = SourceThreadID / WARPSIZE; - DSPRINT(DSFLAG, "Source warp: %d\n", SourceWID); + DSPRINT(DSFLAG, "Source warp: %u\n", SourceWID); void * volatile P = DataSharingState.FramePtr[SourceWID]; DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n"); Index: libomptarget/deviceRTLs/nvptx/src/debug.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/debug.h +++ libomptarget/deviceRTLs/nvptx/src/debug.h @@ -164,16 +164,18 @@ #define PRINT0(_flag, _str) \ { \ if (omptarget_device_environment.debug_level && DON(_flag)) { \ - printf(": " _str, blockIdx.x, threadIdx.x, \ - threadIdx.x / WARPSIZE, threadIdx.x & 0x1F); \ + printf(": " _str, (int)blockIdx.x, \ + (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE), \ + (int)(threadIdx.x & 0x1F)); \ } \ } #define PRINT(_flag, _str, _args...) 
\ { \ if (omptarget_device_environment.debug_level && DON(_flag)) { \ - printf(": " _str, blockIdx.x, threadIdx.x, \ - threadIdx.x / WARPSIZE, threadIdx.x & 0x1F, _args); \ + printf(": " _str, (int)blockIdx.x, \ + (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE), \ + (int)(threadIdx.x & 0x1F), _args); \ } \ } #else @@ -217,16 +219,18 @@ #define ASSERT0(_flag, _cond, _str) \ { \ if (TON(_flag) && !(_cond)) { \ - printf(" ASSERT: " _str "\n", blockIdx.x, \ - threadIdx.x, threadIdx.x / WARPSIZE, threadIdx.x & 0x1F); \ + printf(" ASSERT: " _str "\n", \ + (int)blockIdx.x, (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE), \ + (int)(threadIdx.x & 0x1F)); \ assert(_cond); \ } \ } #define ASSERT(_flag, _cond, _str, _args...) \ { \ if (TON(_flag) && !(_cond)) { \ - printf(" ASSERT: " _str "\n", blockIdx.x, \ - threadIdx.x, threadIdx.x / WARPSIZE, threadIdx.x & 0x1F, _args); \ + printf(" ASSERT: " _str "\n", \ + (int)blockIdx.x, (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE), \ + (int)(threadIdx.x & 0x1F), _args); \ assert(_cond); \ } \ } @@ -253,15 +257,17 @@ #define WARNING0(_flag, _str) \ { \ if (WON(_flag)) { \ - printf(" WARNING: " _str, blockIdx.x, \ - threadIdx.x, threadIdx.x / WARPSIZE, threadIdx.x & 0x1F); \ + printf(" WARNING: " _str, (int)blockIdx.x, \ + (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE), \ + (int)(threadIdx.x & 0x1F)); \ } \ } #define WARNING(_flag, _str, _args...) 
\ { \ if (WON(_flag)) { \ - printf(" WARNING: " _str, blockIdx.x, \ - threadIdx.x, threadIdx.x / WARPSIZE, threadIdx.x & 0x1F, _args); \ + printf(" WARNING: " _str, (int)blockIdx.x, \ + (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE), \ + (int)(threadIdx.x & 0x1F), _args); \ } \ } Index: libomptarget/deviceRTLs/nvptx/src/libcall.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/libcall.cu +++ libomptarget/deviceRTLs/nvptx/src/libcall.cu @@ -222,9 +222,11 @@ " chunk %" PRIu64 "; tid %d, tnum %d, nthreads %d\n", "ancestor", steps, (currTaskDescr->IsParallelConstruct() ? "par" : "task"), - currTaskDescr->InParallelRegion(), sched, - currTaskDescr->RuntimeChunkSize(), currTaskDescr->ThreadId(), - currTaskDescr->ThreadsInTeam(), currTaskDescr->NThreads()); + (int)currTaskDescr->InParallelRegion(), (int)sched, + currTaskDescr->RuntimeChunkSize(), + (int)currTaskDescr->ThreadId(), + (int)currTaskDescr->ThreadsInTeam(), + (int)currTaskDescr->NThreads()); } if (currTaskDescr->IsParallelConstruct()) { Index: libomptarget/deviceRTLs/nvptx/src/loop.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/loop.cu +++ libomptarget/deviceRTLs/nvptx/src/loop.cu @@ -113,7 +113,8 @@ PRINT(LD_LOOP, "OMP Thread %d: schedule type %d, chunk size = %lld, mytid " "%d, num tids %d\n", - gtid, schedtype, P64(chunk), gtid, numberOfActiveOMPThreads); + (int)gtid, (int)schedtype, (long long)chunk, (int)gtid, + (int)numberOfActiveOMPThreads); ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads, "current thread is not needed here; error"); @@ -173,9 +174,9 @@ break; } default: { - ASSERT(LT_FUSSY, FALSE, "unknown schedtype %d", schedtype); + ASSERT(LT_FUSSY, FALSE, "unknown schedtype %d", (int)schedtype); PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n", - schedtype); + (int)schedtype); ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, numberOfActiveOMPThreads); 
break; @@ -189,8 +190,9 @@ PRINT(LD_LOOP, "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld, last " "%d\n", - numberOfActiveOMPThreads, GetNumberOfWorkersInTeam(), P64(*plower), - P64(*pupper), P64(*pstride), lastiter); + (int)numberOfActiveOMPThreads, (int)GetNumberOfWorkersInTeam(), + (long long)(*plower), (long long)(*pupper), (long long)(*pstride), + (int)lastiter); } //////////////////////////////////////////////////////////////////////////////// @@ -229,7 +231,7 @@ __kmpc_barrier(loc, threadId); PRINT(LD_LOOP, "go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n", - (long)tnum, P64(tripCount), schedule); + (long)tnum, (long long)tripCount, (int)schedule); schedule = kmp_sched_static_chunk; chunk = tripCount; // one thread gets the whole loop } else if (schedule == kmp_sched_runtime) { @@ -255,18 +257,20 @@ break; } } - PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", schedule, - P64(chunk)); + PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", (int)schedule, + (long long)chunk); } else if (schedule == kmp_sched_auto) { schedule = kmp_sched_static_chunk; chunk = 1; - PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", schedule, - P64(chunk)); + PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", (int)schedule, + (long long)chunk); } else { - PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", schedule, P64(chunk)); + PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", (int)schedule, + (long long)chunk); ASSERT(LT_FUSSY, schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, - "unknown schedule %d & chunk %lld\n", schedule, P64(chunk)); + "unknown schedule %d & chunk %lld\n", (int)schedule, + (long long)chunk); } // init schedules @@ -287,9 +291,12 @@ PRINT(LD_LOOP, "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 ", next lower bound = %llu, stride = %llu\n", - tnum, omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - 
omptarget_nvptx_threadPrivateContext->Stride(tid)); + (int)tnum, + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), + (unsigned long long) + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), + (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride( + tid)); } else if (schedule == kmp_sched_static_balanced_chunk) { ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); // save sched state @@ -316,9 +323,12 @@ PRINT(LD_LOOP, "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 ", next lower bound = %llu, stride = %llu\n", - tnum, omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - omptarget_nvptx_threadPrivateContext->Stride(tid)); + (int)tnum, + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), + (unsigned long long) + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), + (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride( + tid)); } else if (schedule == kmp_sched_static_nochunk) { ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value"); // save sched state @@ -336,9 +346,12 @@ PRINT(LD_LOOP, "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64 ", next lower bound = %llu, stride = %llu\n", - tnum, omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - omptarget_nvptx_threadPrivateContext->Stride(tid)); + (int)tnum, + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), + (unsigned long long) + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), + (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride( + tid)); } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) { __kmpc_barrier(loc, threadId); @@ -356,7 +369,9 @@ PRINT(LD_LOOP, "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64 ", chunk %" PRIu64 "\n", - tnum, omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId), + (int)tnum, + (unsigned long 
long) + omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId), omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId), omptarget_nvptx_threadPrivateContext->Chunk(teamId)); } @@ -380,22 +395,22 @@ // c. lb and ub >= loopUpperBound: empty chunk --> FINISHED // a. if (lb <= loopUpperBound && ub < loopUpperBound) { - PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", P64(lb), - P64(ub), P64(loopUpperBound)); + PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", + (long long)lb, (long long)ub, (long long)loopUpperBound); return NOT_FINISHED; } // b. if (lb <= loopUpperBound) { PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n", - P64(lb), P64(ub), P64(loopUpperBound)); + (long long)lb, (long long)ub, (long long)loopUpperBound); ub = loopUpperBound; return LAST_CHUNK; } // c. if we are here, we are in case 'c' lb = loopUpperBound + 2; ub = loopUpperBound + 1; - PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", P64(lb), - P64(ub), P64(loopUpperBound)); + PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", (long long)lb, + (long long)ub, (long long)loopUpperBound); return FINISHED; } @@ -426,7 +441,7 @@ // finished? 
if (myLb > ub) { PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n", - P64(myLb), P64(ub)); + (long long)myLb, (long long)ub); return DISPATCH_FINISHED; } // not finished, save current bounds @@ -442,7 +457,7 @@ ST stride = omptarget_nvptx_threadPrivateContext->Stride(tid); omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = myLb + stride; PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n", - P64(*plower), P64(*pupper)); + (long long)*plower, (long long)*pupper); return DISPATCH_NOTFINISHED; } ASSERT0(LT_FUSSY, @@ -464,12 +479,13 @@ *pupper = myUb; *pstride = 1; - PRINT(LD_LOOP, - "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, " - "last %d\n", - GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), - GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper), P64(*pstride), - *plast); + PRINT( + LD_LOOP, + "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, " + "last %d\n", + (int)GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), + (int)GetNumberOfWorkersInTeam(), (long long)*plower, (long long)*pupper, + (long long)*pstride, (int)*plast); return DISPATCH_NOTFINISHED; } Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -150,7 +150,7 @@ PRINT(LD_PAR, "thread will execute parallel region with id %d in a team of " "%d threads\n", - newTaskDescr->ThreadId(), newTaskDescr->ThreadsInTeam()); + (int)newTaskDescr->ThreadId(), (int)newTaskDescr->ThreadsInTeam()); if (RequiresDataSharing && threadId % WARPSIZE == 0) { // Warp master innitializes data sharing environment. 
Index: libomptarget/deviceRTLs/nvptx/src/parallel.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -76,7 +76,7 @@ else *NumLanes = ConvergentSize; ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads", - *NumLanes); + (int)*NumLanes); // Set to true for lanes participating in the simd region. bool isActive = false; @@ -152,7 +152,7 @@ else NumThreads = ConvergentSize; ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads", - NumThreads); + (int)NumThreads); // Set to true for workers participating in the parallel region. bool isActive = false; @@ -260,7 +260,7 @@ } ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads", - NumThreads); + (int)NumThreads); ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), "only team master can create parallel"); @@ -307,7 +307,7 @@ PRINT(LD_PAR, "thread will execute parallel region with id %d in a team of " "%d threads\n", - newTaskDescr->ThreadId(), newTaskDescr->NThreads()); + (int)newTaskDescr->ThreadId(), (int)newTaskDescr->NThreads()); isActive = true; } @@ -438,7 +438,7 @@ EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t tid, int32_t simd_limit) { - PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", simd_limit); + PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", (int)simd_limit); ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); tid = GetLogicalThreadIdInBlock(); omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit; @@ -449,12 +449,12 @@ EXTERN void __kmpc_push_num_teams(kmp_Ident *loc, int32_t tid, int32_t num_teams, int32_t thread_limit) { - PRINT(LD_IO, "call kmpc_push_num_teams %d\n", num_teams); + PRINT(LD_IO, "call kmpc_push_num_teams %d\n", (int)num_teams); ASSERT0(LT_FUSSY, FALSE, "should never have anything with new teams on device"); } EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, 
uint32_t tid, int proc_bind) { - PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", proc_bind); + PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind); } Index: libomptarget/deviceRTLs/nvptx/src/reduction.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/reduction.cu +++ libomptarget/deviceRTLs/nvptx/src/reduction.cu @@ -76,12 +76,7 @@ } EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) { - int lo, hi; - asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); - hi = __SHFL_DOWN_SYNC(0xFFFFFFFF, hi, delta, size); - lo = __SHFL_DOWN_SYNC(0xFFFFFFFF, lo, delta, size); - asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi)); - return val; + return __SHFL_DOWN_SYNC(0xFFFFFFFF, val, delta, size); } static INLINE void gpu_regular_warp_reduce(void *reduce_data, Index: libomptarget/deviceRTLs/nvptx/src/supporti.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/supporti.h +++ libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -231,19 +231,20 @@ { // compute the necessary padding to satisfy alignment constraint ASSERT(LT_FUSSY, (alignment & (alignment - 1)) == 0, - "alignment %ld is not a power of 2\n", alignment); + "alignment %lu is not a power of 2\n", alignment); return (~(unsigned long)size + 1) & (alignment - 1); } INLINE void *SafeMalloc(size_t size, const char *msg) // check if success { void *ptr = malloc(size); - PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, P64(ptr)); + PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, + (unsigned long long)ptr); return ptr; } INLINE void *SafeFree(void *ptr, const char *msg) { - PRINT(LD_MEM, "free data ptr 0x%llx for %s\n", P64(ptr), msg); + PRINT(LD_MEM, "free data ptr 0x%llx for %s\n", (unsigned long long)ptr, msg); free(ptr); return NULL; } Index: libomptarget/deviceRTLs/nvptx/src/sync.cu 
=================================================================== --- libomptarget/deviceRTLs/nvptx/src/sync.cu +++ libomptarget/deviceRTLs/nvptx/src/sync.cu @@ -61,7 +61,7 @@ PRINT(LD_SYNC, "call kmpc_barrier with %d omp threads, sync parameter %d\n", - numberOfActiveOMPThreads, threads); + (int)numberOfActiveOMPThreads, (int)threads); // Barrier #1 is for synchronization among active threads. named_sync(L1_BARRIER, threads); } @@ -89,7 +89,7 @@ PRINT(LD_SYNC, "call kmpc_barrier_simple_generic with %d omp threads, sync parameter " "%d\n", - numberOfActiveOMPThreads, threads); + (int)numberOfActiveOMPThreads, (int)threads); // Barrier #1 is for synchronization among active threads. named_sync(L1_BARRIER, threads); PRINT0(LD_SYNC, "completed kmpc_barrier_simple_generic\n"); Index: libomptarget/deviceRTLs/nvptx/src/task.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/task.cu +++ libomptarget/deviceRTLs/nvptx/src/task.cu @@ -39,14 +39,15 @@ PRINT(LD_IO, "call __kmpc_omp_task_alloc(size priv&struct %lld, shared %lld, " "fct 0x%llx)\n", - P64(sizeOfTaskInclPrivate), P64(sizeOfSharedTable), P64(taskSub)); + (long long)sizeOfTaskInclPrivate, (long long)sizeOfSharedTable, + (unsigned long long)taskSub); // want task+priv to be a multiple of 8 bytes size_t padForTaskInclPriv = PadBytes(sizeOfTaskInclPrivate, sizeof(void *)); sizeOfTaskInclPrivate += padForTaskInclPriv; size_t kmpSize = sizeOfTaskInclPrivate + sizeOfSharedTable; ASSERT(LT_FUSSY, sizeof(omptarget_nvptx_TaskDescr) % sizeof(void *) == 0, "need task descr of size %d to be a multiple of %d\n", - sizeof(omptarget_nvptx_TaskDescr), sizeof(void *)); + (int)sizeof(omptarget_nvptx_TaskDescr), (int)sizeof(void *)); size_t totSize = sizeof(omptarget_nvptx_TaskDescr) + kmpSize; omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = (omptarget_nvptx_ExplicitTaskDescr *)SafeMalloc( @@ -63,7 +64,8 @@ newKmpTaskDescr->sub = taskSub; 
newKmpTaskDescr->destructors = NULL; PRINT(LD_TASK, "return with task descr kmp: 0x%llx, omptarget-nvptx 0x%llx\n", - P64(newKmpTaskDescr), P64(newExplicitTaskDescr)); + (unsigned long long)newKmpTaskDescr, + (unsigned long long)newExplicitTaskDescr); return newKmpTaskDescr; } @@ -102,10 +104,11 @@ // 3. call sub PRINT(LD_TASK, "call task sub 0x%llx(task descr 0x%llx)\n", - P64(newKmpTaskDescr->sub), P64(newKmpTaskDescr)); + (unsigned long long)newKmpTaskDescr->sub, + (unsigned long long)newKmpTaskDescr); newKmpTaskDescr->sub(0, newKmpTaskDescr); PRINT(LD_TASK, "return from call task sub 0x%llx()\n", - P64(newKmpTaskDescr->sub)); + (unsigned long long)newKmpTaskDescr->sub); // 4. pop context omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, @@ -118,7 +121,7 @@ EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid, kmp_TaskDescr *newKmpTaskDescr) { PRINT(LD_IO, "call to __kmpc_omp_task_begin_if0(task 0x%llx)\n", - P64(newKmpTaskDescr)); + (unsigned long long)newKmpTaskDescr); ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); // 1. get explict task descr from kmp task descr @@ -144,7 +147,7 @@ EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid, kmp_TaskDescr *newKmpTaskDescr) { PRINT(LD_IO, "call to __kmpc_omp_task_complete_if0(task 0x%llx)\n", - P64(newKmpTaskDescr)); + (unsigned long long)newKmpTaskDescr); ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); // 1. get explict task descr from kmp task descr