Index: libomptarget/deviceRTLs/nvptx/src/cancel.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/cancel.cu +++ libomptarget/deviceRTLs/nvptx/src/cancel.cu @@ -15,14 +15,14 @@ EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid, int32_t cancelVal) { - PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", cancelVal); + PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", (int)cancelVal); // disabled return FALSE; } EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid, int32_t cancelVal) { - PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", cancelVal); + PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", (int)cancelVal); // disabled return FALSE; } Index: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -84,7 +84,7 @@ "Entering __kmpc_initialize_data_sharing_environment\n"); unsigned WID = getWarpId(); - DSPRINT(DSFLAG_INIT, "Warp ID: %d\n", WID); + DSPRINT(DSFLAG_INIT, "Warp ID: %u\n", WID); omptarget_nvptx_TeamDescr *teamDescr = &omptarget_nvptx_threadPrivateContext->TeamContext(); @@ -95,15 +95,16 @@ // We don't need to initialize the frame and active threads. 
- DSPRINT(DSFLAG_INIT, "Initial data size: %08x \n", InitialDataSize); - DSPRINT(DSFLAG_INIT, "Root slot at: %016llx \n", (long long)RootS); + DSPRINT(DSFLAG_INIT, "Initial data size: %08x \n", (unsigned)InitialDataSize); + DSPRINT(DSFLAG_INIT, "Root slot at: %016llx \n", (unsigned long long)RootS); DSPRINT(DSFLAG_INIT, "Root slot data-end at: %016llx \n", - (long long)RootS->DataEnd); - DSPRINT(DSFLAG_INIT, "Root slot next at: %016llx \n", (long long)RootS->Next); + (unsigned long long)RootS->DataEnd); + DSPRINT(DSFLAG_INIT, "Root slot next at: %016llx \n", + (unsigned long long)RootS->Next); DSPRINT(DSFLAG_INIT, "Shared slot ptr at: %016llx \n", - (long long)DataSharingState.SlotPtr[WID]); + (unsigned long long)DataSharingState.SlotPtr[WID]); DSPRINT(DSFLAG_INIT, "Shared stack ptr at: %016llx \n", - (long long)DataSharingState.StackPtr[WID]); + (unsigned long long)DataSharingState.StackPtr[WID]); DSPRINT0(DSFLAG_INIT, "Exiting __kmpc_initialize_data_sharing_environment\n"); } @@ -121,8 +122,9 @@ if (!IsOMPRuntimeInitialized) return (void *)&DataSharingState; - DSPRINT(DSFLAG, "Data Size %016llx\n", SharingDataSize); - DSPRINT(DSFLAG, "Default Data Size %016llx\n", SharingDefaultDataSize); + DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize); + DSPRINT(DSFLAG, "Default Data Size %016llx\n", + (unsigned long long)SharingDefaultDataSize); unsigned WID = getWarpId(); unsigned CurActiveThreads = getActiveThreadsMask(); @@ -139,11 +141,11 @@ *SavedSharedFrame = FrameP; *SavedActiveThreads = ActiveT; - DSPRINT(DSFLAG, "Warp ID: %d\n", WID); - DSPRINT(DSFLAG, "Saved slot ptr at: %016llx \n", (long long)SlotP); - DSPRINT(DSFLAG, "Saved stack ptr at: %016llx \n", (long long)StackP); + DSPRINT(DSFLAG, "Warp ID: %u\n", WID); + DSPRINT(DSFLAG, "Saved slot ptr at: %016llx \n", (unsigned long long)SlotP); + DSPRINT(DSFLAG, "Saved stack ptr at: %016llx \n", (unsigned long long)StackP); DSPRINT(DSFLAG, "Saved frame ptr at: %016llx \n", (long long)FrameP); 
- DSPRINT(DSFLAG, "Active threads: %08x \n", ActiveT); + DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT); // Only the warp active master needs to grow the stack. if (IsWarpMasterActiveThread()) { @@ -161,12 +163,16 @@ const uintptr_t RequiredEndAddress = CurrentStartAddress + (uintptr_t)SharingDataSize; - DSPRINT(DSFLAG, "Data Size %016llx\n", SharingDataSize); - DSPRINT(DSFLAG, "Default Data Size %016llx\n", SharingDefaultDataSize); - DSPRINT(DSFLAG, "Current Start Address %016llx\n", CurrentStartAddress); - DSPRINT(DSFLAG, "Current End Address %016llx\n", CurrentEndAddress); - DSPRINT(DSFLAG, "Required End Address %016llx\n", RequiredEndAddress); - DSPRINT(DSFLAG, "Active Threads %08x\n", ActiveT); + DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize); + DSPRINT(DSFLAG, "Default Data Size %016llx\n", + (unsigned long long)SharingDefaultDataSize); + DSPRINT(DSFLAG, "Current Start Address %016llx\n", + (unsigned long long)CurrentStartAddress); + DSPRINT(DSFLAG, "Current End Address %016llx\n", + (unsigned long long)CurrentEndAddress); + DSPRINT(DSFLAG, "Required End Address %016llx\n", + (unsigned long long)RequiredEndAddress); + DSPRINT(DSFLAG, "Active Threads %08x\n", (unsigned)ActiveT); // If we require a new slot, allocate it and initialize it (or attempt to // reuse one). 
Also, set the shared stack and slot pointers to the new @@ -184,11 +190,11 @@ (uintptr_t)(&ExistingSlot->Data[0]); if (ExistingSlotSize >= NewSize) { DSPRINT(DSFLAG, "Reusing stack slot %016llx\n", - (long long)ExistingSlot); + (unsigned long long)ExistingSlot); NewSlot = ExistingSlot; } else { DSPRINT(DSFLAG, "Cleaning up -failed reuse - %016llx\n", - (long long)SlotP->Next); + (unsigned long long)SlotP->Next); free(ExistingSlot); } } @@ -197,7 +203,7 @@ NewSlot = (__kmpc_data_sharing_slot *)malloc( sizeof(__kmpc_data_sharing_slot) + NewSize); DSPRINT(DSFLAG, "New slot allocated %016llx (data size=%016llx)\n", - (long long)NewSlot, NewSize); + (unsigned long long)NewSlot, NewSize); } NewSlot->Next = 0; @@ -213,7 +219,7 @@ // not eliminate them because that may be used to return data. if (SlotP->Next) { DSPRINT(DSFLAG, "Cleaning up - old not required - %016llx\n", - (long long)SlotP->Next); + (unsigned long long)SlotP->Next); free(SlotP->Next); SlotP->Next = 0; } @@ -275,8 +281,8 @@ // have other threads that will return after the current ones. ActiveT &= ~CurActive; - DSPRINT(DSFLAG, "Active threads: %08x; New mask: %08x\n", CurActive, - ActiveT); + DSPRINT(DSFLAG, "Active threads: %08x; New mask: %08x\n", + (unsigned)CurActive, (unsigned)ActiveT); if (!ActiveT) { // No other active threads? Great, lets restore the stack. 
@@ -290,10 +296,13 @@ FrameP = *SavedSharedFrame; ActiveT = *SavedActiveThreads; - DSPRINT(DSFLAG, "Restored slot ptr at: %016llx \n", (long long)SlotP); - DSPRINT(DSFLAG, "Restored stack ptr at: %016llx \n", (long long)StackP); - DSPRINT(DSFLAG, "Restored frame ptr at: %016llx \n", (long long)FrameP); - DSPRINT(DSFLAG, "Active threads: %08x \n", ActiveT); + DSPRINT(DSFLAG, "Restored slot ptr at: %016llx \n", + (unsigned long long)SlotP); + DSPRINT(DSFLAG, "Restored stack ptr at: %016llx \n", + (unsigned long long)StackP); + DSPRINT(DSFLAG, "Restored frame ptr at: %016llx \n", + (unsigned long long)FrameP); + DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT); } } @@ -319,7 +328,7 @@ unsigned SourceWID = SourceThreadID / WARPSIZE; - DSPRINT(DSFLAG, "Source warp: %d\n", SourceWID); + DSPRINT(DSFLAG, "Source warp: %u\n", SourceWID); void * volatile P = DataSharingState.FramePtr[SourceWID]; DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n"); Index: libomptarget/deviceRTLs/nvptx/src/debug.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/debug.h +++ libomptarget/deviceRTLs/nvptx/src/debug.h @@ -164,16 +164,18 @@ #define PRINT0(_flag, _str) \ { \ if (omptarget_device_environment.debug_level && DON(_flag)) { \ - printf(": " _str, blockIdx.x, threadIdx.x, \ - threadIdx.x / WARPSIZE, threadIdx.x & 0x1F); \ + printf(": " _str, (int)blockIdx.x, \ + (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE), \ + (int)(threadIdx.x & 0x1F)); \ } \ } #define PRINT(_flag, _str, _args...) 
\ { \ if (omptarget_device_environment.debug_level && DON(_flag)) { \ - printf(": " _str, blockIdx.x, threadIdx.x, \ - threadIdx.x / WARPSIZE, threadIdx.x & 0x1F, _args); \ + printf(": " _str, (int)blockIdx.x, \ + (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE), \ + (int)(threadIdx.x & 0x1F), _args); \ } \ } #else @@ -217,16 +219,18 @@ #define ASSERT0(_flag, _cond, _str) \ { \ if (TON(_flag) && !(_cond)) { \ - printf(" ASSERT: " _str "\n", blockIdx.x, \ - threadIdx.x, threadIdx.x / WARPSIZE, threadIdx.x & 0x1F); \ + printf(" ASSERT: " _str "\n", \ + (int)blockIdx.x, (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE), \ + (int)(threadIdx.x & 0x1F)); \ assert(_cond); \ } \ } #define ASSERT(_flag, _cond, _str, _args...) \ { \ if (TON(_flag) && !(_cond)) { \ - printf(" ASSERT: " _str "\n", blockIdx.x, \ - threadIdx.x, threadIdx.x / WARPSIZE, threadIdx.x & 0x1F, _args); \ + printf(" ASSERT: " _str "\n", \ + (int)blockIdx.x, (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE), \ + (int)(threadIdx.x & 0x1F), _args); \ assert(_cond); \ } \ } @@ -253,15 +257,17 @@ #define WARNING0(_flag, _str) \ { \ if (WON(_flag)) { \ - printf(" WARNING: " _str, blockIdx.x, \ - threadIdx.x, threadIdx.x / WARPSIZE, threadIdx.x & 0x1F); \ + printf(" WARNING: " _str, (int)blockIdx.x, \ + (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE), \ + (int)(threadIdx.x & 0x1F)); \ } \ } #define WARNING(_flag, _str, _args...) 
\ { \ if (WON(_flag)) { \ - printf(" WARNING: " _str, blockIdx.x, \ - threadIdx.x, threadIdx.x / WARPSIZE, threadIdx.x & 0x1F, _args); \ + printf(" WARNING: " _str, (int)blockIdx.x, \ + (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE), \ + (int)(threadIdx.x & 0x1F), _args); \ } \ } Index: libomptarget/deviceRTLs/nvptx/src/libcall.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/libcall.cu +++ libomptarget/deviceRTLs/nvptx/src/libcall.cu @@ -222,9 +222,11 @@ " chunk %" PRIu64 "; tid %d, tnum %d, nthreads %d\n", "ancestor", steps, (currTaskDescr->IsParallelConstruct() ? "par" : "task"), - currTaskDescr->InParallelRegion(), sched, - currTaskDescr->RuntimeChunkSize(), currTaskDescr->ThreadId(), - currTaskDescr->ThreadsInTeam(), currTaskDescr->NThreads()); + (int)currTaskDescr->InParallelRegion(), (int)sched, + currTaskDescr->RuntimeChunkSize(), + (int)currTaskDescr->ThreadId(), + (int)currTaskDescr->ThreadsInTeam(), + (int)currTaskDescr->NThreads()); } if (currTaskDescr->IsParallelConstruct()) { Index: libomptarget/deviceRTLs/nvptx/src/loop.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/loop.cu +++ libomptarget/deviceRTLs/nvptx/src/loop.cu @@ -113,7 +113,8 @@ PRINT(LD_LOOP, "OMP Thread %d: schedule type %d, chunk size = %lld, mytid " "%d, num tids %d\n", - gtid, schedtype, P64(chunk), gtid, numberOfActiveOMPThreads); + (int)gtid, (int)schedtype, (long long)chunk, (int)gtid, + (int)numberOfActiveOMPThreads); ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads, "current thread is not needed here; error"); @@ -173,9 +174,9 @@ break; } default: { - ASSERT(LT_FUSSY, FALSE, "unknown schedtype %d", schedtype); + ASSERT(LT_FUSSY, FALSE, "unknown schedtype %d", (int)schedtype); PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n", - schedtype); + (int)schedtype); ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, numberOfActiveOMPThreads); 
break; @@ -189,8 +190,9 @@ PRINT(LD_LOOP, "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld, last " "%d\n", - numberOfActiveOMPThreads, GetNumberOfWorkersInTeam(), P64(*plower), - P64(*pupper), P64(*pstride), lastiter); + (int)numberOfActiveOMPThreads, (int)GetNumberOfWorkersInTeam(), + (long long)(*plower), (long long)(*pupper), (long long)(*pstride), + (int)lastiter); } //////////////////////////////////////////////////////////////////////////////// @@ -229,7 +231,7 @@ __kmpc_barrier(loc, threadId); PRINT(LD_LOOP, "go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n", - (long)tnum, P64(tripCount), schedule); + (long)tnum, (long long)tripCount, (int)schedule); schedule = kmp_sched_static_chunk; chunk = tripCount; // one thread gets the whole loop } else if (schedule == kmp_sched_runtime) { @@ -255,18 +257,20 @@ break; } } - PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", schedule, - P64(chunk)); + PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", (int)schedule, + (long long)chunk); } else if (schedule == kmp_sched_auto) { schedule = kmp_sched_static_chunk; chunk = 1; - PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", schedule, - P64(chunk)); + PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", (int)schedule, + (long long)chunk); } else { - PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", schedule, P64(chunk)); + PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", (int)schedule, + (long long)chunk); ASSERT(LT_FUSSY, schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, - "unknown schedule %d & chunk %lld\n", schedule, P64(chunk)); + "unknown schedule %d & chunk %lld\n", (int)schedule, + (long long)chunk); } // init schedules @@ -287,9 +291,12 @@ PRINT(LD_LOOP, "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 ", next lower bound = %llu, stride = %llu\n", - tnum, omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - 
omptarget_nvptx_threadPrivateContext->Stride(tid)); + (int)tnum, + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), + (unsigned long long) + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), + (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride( + tid)); } else if (schedule == kmp_sched_static_balanced_chunk) { ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); // save sched state @@ -316,9 +323,12 @@ PRINT(LD_LOOP, "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 ", next lower bound = %llu, stride = %llu\n", - tnum, omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - omptarget_nvptx_threadPrivateContext->Stride(tid)); + (int)tnum, + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), + (unsigned long long) + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), + (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride( + tid)); } else if (schedule == kmp_sched_static_nochunk) { ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value"); // save sched state @@ -336,9 +346,12 @@ PRINT(LD_LOOP, "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64 ", next lower bound = %llu, stride = %llu\n", - tnum, omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - omptarget_nvptx_threadPrivateContext->Stride(tid)); + (int)tnum, + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), + (unsigned long long) + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), + (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride( + tid)); } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) { __kmpc_barrier(loc, threadId); @@ -356,7 +369,9 @@ PRINT(LD_LOOP, "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64 ", chunk %" PRIu64 "\n", - tnum, omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId), + (int)tnum, + (unsigned long 
long) + omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId), omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId), omptarget_nvptx_threadPrivateContext->Chunk(teamId)); } @@ -380,22 +395,22 @@ // c. lb and ub >= loopUpperBound: empty chunk --> FINISHED // a. if (lb <= loopUpperBound && ub < loopUpperBound) { - PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", P64(lb), - P64(ub), P64(loopUpperBound)); + PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", + (long long)lb, (long long)ub, (long long)loopUpperBound); return NOT_FINISHED; } // b. if (lb <= loopUpperBound) { PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n", - P64(lb), P64(ub), P64(loopUpperBound)); + (long long)lb, (long long)ub, (long long)loopUpperBound); ub = loopUpperBound; return LAST_CHUNK; } // c. if we are here, we are in case 'c' lb = loopUpperBound + 2; ub = loopUpperBound + 1; - PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", P64(lb), - P64(ub), P64(loopUpperBound)); + PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", (long long)lb, + (long long)ub, (long long)loopUpperBound); return FINISHED; } @@ -426,7 +441,7 @@ // finished? 
if (myLb > ub) { PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n", - P64(myLb), P64(ub)); + (long long)myLb, (long long)ub); return DISPATCH_FINISHED; } // not finished, save current bounds @@ -442,7 +457,7 @@ ST stride = omptarget_nvptx_threadPrivateContext->Stride(tid); omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = myLb + stride; PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n", - P64(*plower), P64(*pupper)); + (long long)*plower, (long long)*pupper); return DISPATCH_NOTFINISHED; } ASSERT0(LT_FUSSY, @@ -464,12 +479,13 @@ *pupper = myUb; *pstride = 1; - PRINT(LD_LOOP, - "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, " - "last %d\n", - GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), - GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper), P64(*pstride), - *plast); + PRINT( + LD_LOOP, + "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, " + "last %d\n", + (int)GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), + (int)GetNumberOfWorkersInTeam(), (long long)*plower, (long long)*pupper, + (long long)*pstride, (int)*plast); return DISPATCH_NOTFINISHED; } Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -150,7 +150,7 @@ PRINT(LD_PAR, "thread will execute parallel region with id %d in a team of " "%d threads\n", - newTaskDescr->ThreadId(), newTaskDescr->ThreadsInTeam()); + (int)newTaskDescr->ThreadId(), (int)newTaskDescr->ThreadsInTeam()); if (RequiresDataSharing && threadId % WARPSIZE == 0) { // Warp master innitializes data sharing environment. 
Index: libomptarget/deviceRTLs/nvptx/src/parallel.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -76,7 +76,7 @@ else *NumLanes = ConvergentSize; ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads", - *NumLanes); + (int)*NumLanes); // Set to true for lanes participating in the simd region. bool isActive = false; @@ -152,7 +152,7 @@ else NumThreads = ConvergentSize; ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads", - NumThreads); + (int)NumThreads); // Set to true for workers participating in the parallel region. bool isActive = false; @@ -260,7 +260,7 @@ } ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads", - NumThreads); + (int)NumThreads); ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), "only team master can create parallel"); @@ -307,7 +307,7 @@ PRINT(LD_PAR, "thread will execute parallel region with id %d in a team of " "%d threads\n", - newTaskDescr->ThreadId(), newTaskDescr->NThreads()); + (int)newTaskDescr->ThreadId(), (int)newTaskDescr->NThreads()); isActive = true; } @@ -438,7 +438,7 @@ EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t tid, int32_t simd_limit) { - PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", simd_limit); + PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", (int)simd_limit); ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); tid = GetLogicalThreadIdInBlock(); omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit; @@ -449,12 +449,12 @@ EXTERN void __kmpc_push_num_teams(kmp_Ident *loc, int32_t tid, int32_t num_teams, int32_t thread_limit) { - PRINT(LD_IO, "call kmpc_push_num_teams %d\n", num_teams); + PRINT(LD_IO, "call kmpc_push_num_teams %d\n", (int)num_teams); ASSERT0(LT_FUSSY, FALSE, "should never have anything with new teams on device"); } EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, 
uint32_t tid, int proc_bind) { - PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", proc_bind); + PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind); } Index: libomptarget/deviceRTLs/nvptx/src/reduction.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/reduction.cu +++ libomptarget/deviceRTLs/nvptx/src/reduction.cu @@ -76,12 +76,7 @@ } EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) { - int lo, hi; - asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); - hi = __SHFL_DOWN_SYNC(0xFFFFFFFF, hi, delta, size); - lo = __SHFL_DOWN_SYNC(0xFFFFFFFF, lo, delta, size); - asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi)); - return val; + return __SHFL_DOWN_SYNC(0xFFFFFFFF, val, delta, size); } static INLINE void gpu_regular_warp_reduce(void *reduce_data, Index: libomptarget/deviceRTLs/nvptx/src/supporti.h =================================================================== --- libomptarget/deviceRTLs/nvptx/src/supporti.h +++ libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -231,19 +231,20 @@ { // compute the necessary padding to satisfy alignment constraint ASSERT(LT_FUSSY, (alignment & (alignment - 1)) == 0, - "alignment %ld is not a power of 2\n", alignment); + "alignment %lu is not a power of 2\n", alignment); return (~(unsigned long)size + 1) & (alignment - 1); } INLINE void *SafeMalloc(size_t size, const char *msg) // check if success { void *ptr = malloc(size); - PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, P64(ptr)); + PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, + (unsigned long long)ptr); return ptr; } INLINE void *SafeFree(void *ptr, const char *msg) { - PRINT(LD_MEM, "free data ptr 0x%llx for %s\n", P64(ptr), msg); + PRINT(LD_MEM, "free data ptr 0x%llx for %s\n", (unsigned long long)ptr, msg); free(ptr); return NULL; } Index: libomptarget/deviceRTLs/nvptx/src/sync.cu 
=================================================================== --- libomptarget/deviceRTLs/nvptx/src/sync.cu +++ libomptarget/deviceRTLs/nvptx/src/sync.cu @@ -61,7 +61,7 @@ PRINT(LD_SYNC, "call kmpc_barrier with %d omp threads, sync parameter %d\n", - numberOfActiveOMPThreads, threads); + (int)numberOfActiveOMPThreads, (int)threads); // Barrier #1 is for synchronization among active threads. named_sync(L1_BARRIER, threads); } @@ -89,7 +89,7 @@ PRINT(LD_SYNC, "call kmpc_barrier_simple_generic with %d omp threads, sync parameter " "%d\n", - numberOfActiveOMPThreads, threads); + (int)numberOfActiveOMPThreads, (int)threads); // Barrier #1 is for synchronization among active threads. named_sync(L1_BARRIER, threads); PRINT0(LD_SYNC, "completed kmpc_barrier_simple_generic\n"); Index: libomptarget/deviceRTLs/nvptx/src/task.cu =================================================================== --- libomptarget/deviceRTLs/nvptx/src/task.cu +++ libomptarget/deviceRTLs/nvptx/src/task.cu @@ -39,14 +39,15 @@ PRINT(LD_IO, "call __kmpc_omp_task_alloc(size priv&struct %lld, shared %lld, " "fct 0x%llx)\n", - P64(sizeOfTaskInclPrivate), P64(sizeOfSharedTable), P64(taskSub)); + (long long)sizeOfTaskInclPrivate, (long long)sizeOfSharedTable, + (unsigned long long)taskSub); // want task+priv to be a multiple of 8 bytes size_t padForTaskInclPriv = PadBytes(sizeOfTaskInclPrivate, sizeof(void *)); sizeOfTaskInclPrivate += padForTaskInclPriv; size_t kmpSize = sizeOfTaskInclPrivate + sizeOfSharedTable; ASSERT(LT_FUSSY, sizeof(omptarget_nvptx_TaskDescr) % sizeof(void *) == 0, "need task descr of size %d to be a multiple of %d\n", - sizeof(omptarget_nvptx_TaskDescr), sizeof(void *)); + (int)sizeof(omptarget_nvptx_TaskDescr), (int)sizeof(void *)); size_t totSize = sizeof(omptarget_nvptx_TaskDescr) + kmpSize; omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = (omptarget_nvptx_ExplicitTaskDescr *)SafeMalloc( @@ -63,7 +64,8 @@ newKmpTaskDescr->sub = taskSub; 
newKmpTaskDescr->destructors = NULL; PRINT(LD_TASK, "return with task descr kmp: 0x%llx, omptarget-nvptx 0x%llx\n", - P64(newKmpTaskDescr), P64(newExplicitTaskDescr)); + (unsigned long long)newKmpTaskDescr, + (unsigned long long)newExplicitTaskDescr); return newKmpTaskDescr; } @@ -102,10 +104,11 @@ // 3. call sub PRINT(LD_TASK, "call task sub 0x%llx(task descr 0x%llx)\n", - P64(newKmpTaskDescr->sub), P64(newKmpTaskDescr)); + (unsigned long long)newKmpTaskDescr->sub, + (unsigned long long)newKmpTaskDescr); newKmpTaskDescr->sub(0, newKmpTaskDescr); PRINT(LD_TASK, "return from call task sub 0x%llx()\n", - P64(newKmpTaskDescr->sub)); + (unsigned long long)newKmpTaskDescr->sub); // 4. pop context omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, @@ -118,7 +121,7 @@ EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid, kmp_TaskDescr *newKmpTaskDescr) { PRINT(LD_IO, "call to __kmpc_omp_task_begin_if0(task 0x%llx)\n", - P64(newKmpTaskDescr)); + (unsigned long long)newKmpTaskDescr); ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); // 1. get explict task descr from kmp task descr @@ -144,7 +147,7 @@ EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid, kmp_TaskDescr *newKmpTaskDescr) { PRINT(LD_IO, "call to __kmpc_omp_task_complete_if0(task 0x%llx)\n", - P64(newKmpTaskDescr)); + (unsigned long long)newKmpTaskDescr); ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); // 1. get explict task descr from kmp task descr